Remove Encode::Tcl::Extended, suggested by
[p5sagit/p5-mst-13.2.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9  * confused with the original package (see point 3 below).  Thanks, Henry!
10  */
11
12 /* Additional note: this code is very heavily munged from Henry's version
13  * in places.  In some spots I've traded clarity for efficiency, so don't
14  * blame Henry for some of the lack of readability.
15  */
16
17 /* The names of the functions have been changed from regcomp and
18  * regexec to  pregcomp and pregexec in order to avoid conflicts
19  * with the POSIX routines of the same names.
20 */
21
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 #  ifndef PERL_IN_XSUB_RE
25 #    define PERL_IN_XSUB_RE
26 #  endif
27 /* need access to debugger hooks */
28 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29 #    define DEBUGGING
30 #  endif
31 #endif
32
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 #  define Perl_pregcomp my_regcomp
36 #  define Perl_regdump my_regdump
37 #  define Perl_regprop my_regprop
38 #  define Perl_pregfree my_regfree
39 #  define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 #  define Perl_regnext my_regnext
42 #  define Perl_save_re_context my_save_re_context
43 #  define Perl_reginitcolors my_reginitcolors
44
45 #  define PERL_NO_GET_CONTEXT
46 #endif
47
48 /*SUPPRESS 112*/
49 /*
50  * pregcomp and pregexec -- regsub and regerror are not used in perl
51  *
52  *      Copyright (c) 1986 by University of Toronto.
53  *      Written by Henry Spencer.  Not derived from licensed software.
54  *
55  *      Permission is granted to anyone to use this software for any
56  *      purpose on any computer system, and to redistribute it freely,
57  *      subject to the following restrictions:
58  *
59  *      1. The author is not responsible for the consequences of use of
60  *              this software, no matter how awful, even if they arise
61  *              from defects in it.
62  *
63  *      2. The origin of this software must not be misrepresented, either
64  *              by explicit claim or by omission.
65  *
66  *      3. Altered versions must be plainly marked as such, and must not
67  *              be misrepresented as being the original software.
68  *
69  *
70  ****    Alterations to Henry's code are...
71  ****
72  ****    Copyright (c) 1991-2002, Larry Wall
73  ****
74  ****    You may distribute under the terms of either the GNU General Public
75  ****    License or the Artistic License, as specified in the README file.
76
77  *
78  * Beware that some of this code is subtly aware of the way operator
79  * precedence is structured in regular expressions.  Serious changes in
80  * regular-expression syntax might require a total rethink.
81  */
82 #include "EXTERN.h"
83 #define PERL_IN_REGCOMP_C
84 #include "perl.h"
85
86 #ifndef PERL_IN_XSUB_RE
87 #  include "INTERN.h"
88 #endif
89
90 #define REG_COMP_C
91 #include "regcomp.h"
92
93 #ifdef op
94 #undef op
95 #endif /* op */
96
97 #ifdef MSDOS
98 # if defined(BUGGY_MSC6)
99  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100  # pragma optimize("a",off)
101  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102  # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
104 #endif /* MSDOS */
105
106 #ifndef STATIC
107 #define STATIC  static
108 #endif
109
110 typedef struct RExC_state_t {
111     U16         flags16;                /* are we folding, multilining? */
112     char        *precomp;               /* uncompiled string. */
113     regexp      *rx;
114     char        *start;                 /* Start of input for compile */
115     char        *end;                   /* End of input for compile */
116     char        *parse;                 /* Input-scan pointer. */
117     I32         whilem_seen;            /* number of WHILEM in this expr */
118     regnode     *emit_start;            /* Start of emitted-code area */
119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
120     I32         naughty;                /* How bad is this pattern? */
121     I32         sawback;                /* Did we see \1, ...? */
122     U32         seen;
123     I32         size;                   /* Code size. */
124     I32         npar;                   /* () count. */
125     I32         extralen;
126     I32         seen_zerolen;
127     I32         seen_evals;
128     I32         utf8;
129 #if ADD_TO_REGEXEC
130     char        *starttry;              /* -Dr: where regtry was called. */
131 #define RExC_starttry   (pRExC_state->starttry)
132 #endif
133 } RExC_state_t;
134
135 #define RExC_flags16    (pRExC_state->flags16)
136 #define RExC_precomp    (pRExC_state->precomp)
137 #define RExC_rx         (pRExC_state->rx)
138 #define RExC_start      (pRExC_state->start)
139 #define RExC_end        (pRExC_state->end)
140 #define RExC_parse      (pRExC_state->parse)
141 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
142 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit       (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty    (pRExC_state->naughty)
146 #define RExC_sawback    (pRExC_state->sawback)
147 #define RExC_seen       (pRExC_state->seen)
148 #define RExC_size       (pRExC_state->size)
149 #define RExC_npar       (pRExC_state->npar)
150 #define RExC_extralen   (pRExC_state->extralen)
151 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8       (pRExC_state->utf8)
154
155 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157         ((*s) == '{' && regcurly(s)))
158
159 #ifdef SPSTART
160 #undef SPSTART          /* dratted cpp namespace... */
161 #endif
162 /*
163  * Flags to be passed up and down.
164  */
165 #define WORST           0       /* Worst case. */
166 #define HASWIDTH        0x1     /* Known to match non-null strings. */
167 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART         0x4     /* Starts with * or +. */
169 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
170
171 /* Length of a variant. */
172
173 typedef struct scan_data_t {
174     I32 len_min;
175     I32 len_delta;
176     I32 pos_min;
177     I32 pos_delta;
178     SV *last_found;
179     I32 last_end;                       /* min value, <0 unless valid. */
180     I32 last_start_min;
181     I32 last_start_max;
182     SV **longest;                       /* Either &l_fixed, or &l_float. */
183     SV *longest_fixed;
184     I32 offset_fixed;
185     SV *longest_float;
186     I32 offset_float_min;
187     I32 offset_float_max;
188     I32 flags;
189     I32 whilem_c;
190     I32 *last_closep;
191     struct regnode_charclass_class *start_class;
192 } scan_data_t;
193
194 /*
195  * Forward declarations for pregcomp()'s friends.
196  */
197
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199                                       0, 0, 0, 0, 0, 0};
200
201 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL          0x1
203 #define SF_BEFORE_MEOL          0x2
204 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
207 #ifdef NO_UNARY_PLUS
208 #  define SF_FIX_SHIFT_EOL      (0+2)
209 #  define SF_FL_SHIFT_EOL               (0+4)
210 #else
211 #  define SF_FIX_SHIFT_EOL      (+2)
212 #  define SF_FL_SHIFT_EOL               (+4)
213 #endif
214
215 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF               0x40
221 #define SF_HAS_PAR              0x80
222 #define SF_IN_PAR               0x100
223 #define SF_HAS_EVAL             0x200
224 #define SCF_DO_SUBSTR           0x400
225 #define SCF_DO_STCLASS_AND      0x0800
226 #define SCF_DO_STCLASS_OR       0x1000
227 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS  0x2000
229
230 #define UTF RExC_utf8
231 #define LOC (RExC_flags16 & PMf_LOCALE)
232 #define FOLD (RExC_flags16 & PMf_FOLD)
233
234 #define OOB_UNICODE             12345678
235 #define OOB_NAMEDCLASS          -1
236
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
240
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
243
244 /*
245  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247  * op/pragma/warn/regcomp.
248  */
249 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
251
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
253
254 /*
255  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256  * arg. Show regex, up to a maximum length. If it's too long, chop and add
257  * "...".
258  */
259 #define FAIL(msg)                                                             \
260     STMT_START {                                                             \
261         char *ellipses = "";                                                 \
262         IV len = RExC_end - RExC_precomp;                                \
263                                                                              \
264         if (!SIZE_ONLY)                                                      \
265             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
266                                                                              \
267         if (len > RegexLengthToShowInErrorMessages) {                        \
268             /* chop 10 shorter than the max, to ensure meaning of "..." */   \
269             len = RegexLengthToShowInErrorMessages - 10;                     \
270             ellipses = "...";                                                \
271         }                                                                    \
272         Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                            \
273                    msg, (int)len, RExC_precomp, ellipses);                  \
274     } STMT_END
275
276 /*
277  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278  * args. Show regex, up to a maximum length. If it's too long, chop and add
279  * "...".
280  */
281 #define FAIL2(pat,msg)                                                        \
282     STMT_START {                                                             \
283         char *ellipses = "";                                                 \
284         IV len = RExC_end - RExC_precomp;                                \
285                                                                              \
286         if (!SIZE_ONLY)                                                      \
287             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
288                                                                              \
289         if (len > RegexLengthToShowInErrorMessages) {                        \
290             /* chop 10 shorter than the max, to ensure meaning of "..." */   \
291             len = RegexLengthToShowInErrorMessages - 10;                     \
292             ellipses = "...";                                                \
293         }                                                                    \
294         S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                        \
295                     msg, (int)len, RExC_precomp, ellipses);                \
296     } STMT_END
297
298
299 /*
300  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301  */
302 #define Simple_vFAIL(m)                                                      \
303     STMT_START {                                                             \
304       IV offset = RExC_parse - RExC_precomp; \
305                                                                              \
306       Perl_croak(aTHX_ "%s" REPORT_LOCATION,               \
307                  m, (int)offset, RExC_precomp, RExC_precomp + offset);     \
308     } STMT_END
309
310 /*
311  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312  */
313 #define vFAIL(m)                                                             \
314     STMT_START {                                                             \
315       if (!SIZE_ONLY)                                                        \
316             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
317       Simple_vFAIL(m);                                                       \
318     } STMT_END
319
320 /*
321  * Like Simple_vFAIL(), but accepts two arguments.
322  */
323 #define Simple_vFAIL2(m,a1)                                                  \
324     STMT_START {                                                             \
325       IV offset = RExC_parse - RExC_precomp; \
326                                                                              \
327       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,       \
328                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
329     } STMT_END
330
331 /*
332  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333  */
334 #define vFAIL2(m,a1)                                                         \
335     STMT_START {                                                             \
336       if (!SIZE_ONLY)                                                        \
337             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
338       Simple_vFAIL2(m, a1);                                                  \
339     } STMT_END
340
341
342 /*
343  * Like Simple_vFAIL(), but accepts three arguments.
344  */
345 #define Simple_vFAIL3(m, a1, a2)                                             \
346     STMT_START {                                                             \
347       IV offset = RExC_parse - RExC_precomp; \
348                                                                              \
349       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,   \
350                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
351     } STMT_END
352
353 /*
354  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355  */
356 #define vFAIL3(m,a1,a2)                                                      \
357     STMT_START {                                                             \
358       if (!SIZE_ONLY)                                                        \
359             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
360       Simple_vFAIL3(m, a1, a2);                                              \
361     } STMT_END
362
363 /*
364  * Like Simple_vFAIL(), but accepts four arguments.
365  */
366 #define Simple_vFAIL4(m, a1, a2, a3)                                         \
367     STMT_START {                                                             \
368       IV offset = RExC_parse - RExC_precomp; \
369                                                                              \
370       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
371                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
372     } STMT_END
373
374 /*
375  * Like Simple_vFAIL(), but accepts five arguments.
376  */
377 #define Simple_vFAIL5(m, a1, a2, a3, a4)                                     \
378     STMT_START {                                                             \
379       IV offset = RExC_parse - RExC_precomp; \
380       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
381                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
382     } STMT_END
383
384
385 #define vWARN(loc,m)                                                         \
386     STMT_START {                                                             \
387         IV offset = loc - RExC_precomp;          \
388         Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\
389                  m, (int)offset, RExC_precomp, RExC_precomp + offset);          \
390     } STMT_END                                                               \
391
392 #define vWARNdep(loc,m)                                                         \
393     STMT_START {                                                             \
394         IV offset = loc - RExC_precomp;          \
395         Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\
396                  m, (int)offset, RExC_precomp, RExC_precomp + offset);          \
397     } STMT_END                                                               \
398
399
400 #define vWARN2(loc, m, a1)                                                   \
401     STMT_START {                                                             \
402         IV offset = loc - RExC_precomp;          \
403         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
404                  a1,                                                         \
405                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
406     } STMT_END
407
408 #define vWARN3(loc, m, a1, a2)                                               \
409     STMT_START {                                                             \
410       IV offset = loc - RExC_precomp;        \
411         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                    \
412                  a1, a2,                                                     \
413                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
414     } STMT_END
415
416 #define vWARN4(loc, m, a1, a2, a3)                                           \
417     STMT_START {                                                             \
418       IV offset = loc - RExC_precomp;            \
419         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
420                  a1, a2, a3,                                                 \
421                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
422     } STMT_END
423
424 /* used for the parse_flags section for (?c) -- japhy */
425 #define vWARN5(loc, m, a1, a2, a3, a4)                                       \
426   STMT_START {                                                   \
427       IV offset = loc - RExC_precomp;   \
428         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
429                  a1, a2, a3, a4,                                 \
430                  (int)offset, RExC_precomp, RExC_precomp + offset);  \
431     } STMT_END
432
433
434 /* Allow for side effects in s */
435 #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
436
437 /* Macros for recording node offsets.   20001227 mjd@plover.com 
438  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
439  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
440  * Element 0 holds the number n.
441  */
442
443 #define MJD_OFFSET_DEBUG(x)
444 /* #define MJD_OFFSET_DEBUG(x) fprintf x */
445
446
447 #  define Set_Node_Offset_To_R(node,byte)                           \
448    STMT_START {                                        \
449      if (! SIZE_ONLY) {                                  \
450        if((node) < 0) {                   \
451          Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
452        } else {                                                        \
453          RExC_offsets[2*(node)-1] = (byte);                               \
454        }                                                               \
455      }                                                                 \
456    } STMT_END
457
458 #  define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
459 #  define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
460
461 #  define Set_Node_Length_To_R(node,len)                            \
462    STMT_START {                                        \
463      if (! SIZE_ONLY) {                                  \
464        MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
465        if((node) < 0) {                   \
466          Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
467        } else {                                                        \
468          RExC_offsets[2*(node)] = (len);                               \
469        }                                                               \
470      }                                                                 \
471    } STMT_END
472
473 #  define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
474 #  define Set_Cur_Node_Length(len)  Set_Node_Length(RExC_emit, len)
475 #  define Set_Node_Cur_Length(node)   Set_Node_Length(node, RExC_parse - parse_start)
476
477 /* Get offsets and lengths */
478 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
479 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
480
481 static void clear_re(pTHX_ void *r);
482
483 /* Mark that we cannot extend a found fixed substring at this point.
484    Updata the longest found anchored substring and the longest found
485    floating substrings if needed. */
486
487 STATIC void
488 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
489 {
490     STRLEN l = CHR_SVLEN(data->last_found);
491     STRLEN old_l = CHR_SVLEN(*data->longest);
492
493     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
494         sv_setsv(*data->longest, data->last_found);
495         if (*data->longest == data->longest_fixed) {
496             data->offset_fixed = l ? data->last_start_min : data->pos_min;
497             if (data->flags & SF_BEFORE_EOL)
498                 data->flags
499                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
500             else
501                 data->flags &= ~SF_FIX_BEFORE_EOL;
502         }
503         else {
504             data->offset_float_min = l ? data->last_start_min : data->pos_min;
505             data->offset_float_max = (l
506                                       ? data->last_start_max
507                                       : data->pos_min + data->pos_delta);
508             if (data->flags & SF_BEFORE_EOL)
509                 data->flags
510                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
511             else
512                 data->flags &= ~SF_FL_BEFORE_EOL;
513         }
514     }
515     SvCUR_set(data->last_found, 0);
516     data->last_end = -1;
517     data->flags &= ~SF_BEFORE_EOL;
518 }
519
520 /* Can match anything (initialization) */
521 STATIC void
522 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
523 {
524     ANYOF_CLASS_ZERO(cl);
525     ANYOF_BITMAP_SETALL(cl);
526     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
527     if (LOC)
528         cl->flags |= ANYOF_LOCALE;
529 }
530
531 /* Can match anything (initialization) */
532 STATIC int
533 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
534 {
535     int value;
536
537     for (value = 0; value <= ANYOF_MAX; value += 2)
538         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
539             return 1;
540     if (!(cl->flags & ANYOF_UNICODE_ALL))
541         return 0;
542     if (!ANYOF_BITMAP_TESTALLSET(cl))
543         return 0;
544     return 1;
545 }
546
547 /* Can match anything (initialization) */
548 STATIC void
549 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
550 {
551     Zero(cl, 1, struct regnode_charclass_class);
552     cl->type = ANYOF;
553     cl_anything(pRExC_state, cl);
554 }
555
556 STATIC void
557 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
558 {
559     Zero(cl, 1, struct regnode_charclass_class);
560     cl->type = ANYOF;
561     cl_anything(pRExC_state, cl);
562     if (LOC)
563         cl->flags |= ANYOF_LOCALE;
564 }
565
566 /* 'And' a given class with another one.  Can create false positives */
567 /* We assume that cl is not inverted */
568 STATIC void
569 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570          struct regnode_charclass_class *and_with)
571 {
572     if (!(and_with->flags & ANYOF_CLASS)
573         && !(cl->flags & ANYOF_CLASS)
574         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575         && !(and_with->flags & ANYOF_FOLD)
576         && !(cl->flags & ANYOF_FOLD)) {
577         int i;
578
579         if (and_with->flags & ANYOF_INVERT)
580             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581                 cl->bitmap[i] &= ~and_with->bitmap[i];
582         else
583             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584                 cl->bitmap[i] &= and_with->bitmap[i];
585     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586     if (!(and_with->flags & ANYOF_EOS))
587         cl->flags &= ~ANYOF_EOS;
588
589     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
590         cl->flags &= ~ANYOF_UNICODE_ALL;
591         cl->flags |= ANYOF_UNICODE;
592         ARG_SET(cl, ARG(and_with));
593     }
594     if (!(and_with->flags & ANYOF_UNICODE_ALL))
595         cl->flags &= ~ANYOF_UNICODE_ALL;
596     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
597         cl->flags &= ~ANYOF_UNICODE;
598 }
599
600 /* 'OR' a given class with another one.  Can create false positives */
601 /* We assume that cl is not inverted */
602 STATIC void
603 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
604 {
605     if (or_with->flags & ANYOF_INVERT) {
606         /* We do not use
607          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
608          *   <= (B1 | !B2) | (CL1 | !CL2)
609          * which is wasteful if CL2 is small, but we ignore CL2:
610          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
611          * XXXX Can we handle case-fold?  Unclear:
612          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
613          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
614          */
615         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
616              && !(or_with->flags & ANYOF_FOLD)
617              && !(cl->flags & ANYOF_FOLD) ) {
618             int i;
619
620             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
621                 cl->bitmap[i] |= ~or_with->bitmap[i];
622         } /* XXXX: logic is complicated otherwise */
623         else {
624             cl_anything(pRExC_state, cl);
625         }
626     } else {
627         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
628         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
629              && (!(or_with->flags & ANYOF_FOLD)
630                  || (cl->flags & ANYOF_FOLD)) ) {
631             int i;
632
633             /* OR char bitmap and class bitmap separately */
634             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
635                 cl->bitmap[i] |= or_with->bitmap[i];
636             if (or_with->flags & ANYOF_CLASS) {
637                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
638                     cl->classflags[i] |= or_with->classflags[i];
639                 cl->flags |= ANYOF_CLASS;
640             }
641         }
642         else { /* XXXX: logic is complicated, leave it along for a moment. */
643             cl_anything(pRExC_state, cl);
644         }
645     }
646     if (or_with->flags & ANYOF_EOS)
647         cl->flags |= ANYOF_EOS;
648
649     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
650         ARG(cl) != ARG(or_with)) {
651         cl->flags |= ANYOF_UNICODE_ALL;
652         cl->flags &= ~ANYOF_UNICODE;
653     }
654     if (or_with->flags & ANYOF_UNICODE_ALL) {
655         cl->flags |= ANYOF_UNICODE_ALL;
656         cl->flags &= ~ANYOF_UNICODE;
657     }
658 }
659
660 /*
661  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
662  * These need to be revisited when a newer toolchain becomes available.
663  */
664 #if defined(__sparc64__) && defined(__GNUC__)
665 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
666 #       undef  SPARC64_GCC_WORKAROUND
667 #       define SPARC64_GCC_WORKAROUND 1
668 #   endif
669 #endif
670
671 /* REx optimizer.  Converts nodes into quickier variants "in place".
672    Finds fixed substrings.  */
673
674 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
675    to the position after last scanned or to NULL. */
676
677 STATIC I32
678 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
679                         /* scanp: Start here (read-write). */
680                         /* deltap: Write maxlen-minlen here. */
681                         /* last: Stop before this one. */
682 {
683     I32 min = 0, pars = 0, code;
684     regnode *scan = *scanp, *next;
685     I32 delta = 0;
686     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
687     int is_inf_internal = 0;            /* The studied chunk is infinite */
688     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
689     scan_data_t data_fake;
690     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
691
692     while (scan && OP(scan) != END && scan < last) {
693         /* Peephole optimizer: */
694
695         if (PL_regkind[(U8)OP(scan)] == EXACT) {
696             /* Merge several consecutive EXACTish nodes into one. */
697             regnode *n = regnext(scan);
698             U32 stringok = 1;
699 #ifdef DEBUGGING
700             regnode *stop = scan;
701 #endif
702
703             next = scan + NODE_SZ_STR(scan);
704             /* Skip NOTHING, merge EXACT*. */
705             while (n &&
706                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
707                      (stringok && (OP(n) == OP(scan))))
708                    && NEXT_OFF(n)
709                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
710                 if (OP(n) == TAIL || n > next)
711                     stringok = 0;
712                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
713                     NEXT_OFF(scan) += NEXT_OFF(n);
714                     next = n + NODE_STEP_REGNODE;
715 #ifdef DEBUGGING
716                     if (stringok)
717                         stop = n;
718 #endif
719                     n = regnext(n);
720                 }
721                 else if (stringok) {
722                     int oldl = STR_LEN(scan);
723                     regnode *nnext = regnext(n);
724
725                     if (oldl + STR_LEN(n) > U8_MAX)
726                         break;
727                     NEXT_OFF(scan) += NEXT_OFF(n);
728                     STR_LEN(scan) += STR_LEN(n);
729                     next = n + NODE_SZ_STR(n);
730                     /* Now we can overwrite *n : */
731                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
732 #ifdef DEBUGGING
733                     stop = next - 1;
734 #endif
735                     n = nnext;
736                 }
737             }
738
739             if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
740 /*
741   Two problematic code points in Unicode casefolding of EXACT nodes:
742
743    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
744    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
745
746    which casefold to
747
748    Unicode                      UTF-8
749
750    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
751    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
752
753    This means that in case-insensitive matching (or "loose matching",
754    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
755    length of the above casefolded versions) can match a target string
756    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
757    This would rather mess up the minimum length computation.
758
759    What we'll do is to look for the tail four bytes, and then peek
760    at the preceding two bytes to see whether we need to decrease
761    the minimum length by four (six minus two).
762
763    Thanks to the design of UTF-8, there cannot be false matches:
764    A sequence of valid UTF-8 bytes cannot be a subsequence of
765    another valid sequence of UTF-8 bytes.
766
767 */
768                  char *s0 = STRING(scan), *s, *t;
769                  char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
770                  char *t0 = "\xcc\x88\xcc\x81";
771                  char *t1 = t0 + 3;
772                  
773                  for (s = s0 + 2;
774                       s < s2 && (t = ninstr(s, s1, t0, t1));
775                       s = t + 4) {
776                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
777                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
778                            min -= 4;
779                  }
780             }
781
782 #ifdef DEBUGGING
783             /* Allow dumping */
784             n = scan + NODE_SZ_STR(scan);
785             while (n <= stop) {
786                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
787                     OP(n) = OPTIMIZED;
788                     NEXT_OFF(n) = 0;
789                 }
790                 n++;
791             }
792 #endif
793         }
794         /* Follow the next-chain of the current node and optimize
795            away all the NOTHINGs from it.  */
796         if (OP(scan) != CURLYX) {
797             int max = (reg_off_by_arg[OP(scan)]
798                        ? I32_MAX
799                        /* I32 may be smaller than U16 on CRAYs! */
800                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
801             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
802             int noff;
803             regnode *n = scan;
804         
805             /* Skip NOTHING and LONGJMP. */
806             while ((n = regnext(n))
807                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
808                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
809                    && off + noff < max)
810                 off += noff;
811             if (reg_off_by_arg[OP(scan)])
812                 ARG(scan) = off;
813             else
814                 NEXT_OFF(scan) = off;
815         }
816         /* The principal pseudo-switch.  Cannot be a switch, since we
817            look into several different things.  */
818         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
819                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
820             next = regnext(scan);
821             code = OP(scan);
822         
823             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
824                 I32 max1 = 0, min1 = I32_MAX, num = 0;
825                 struct regnode_charclass_class accum;
826                 
827                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
828                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
829                 if (flags & SCF_DO_STCLASS)
830                     cl_init_zero(pRExC_state, &accum);
831                 while (OP(scan) == code) {
832                     I32 deltanext, minnext, f = 0, fake;
833                     struct regnode_charclass_class this_class;
834
835                     num++;
836                     data_fake.flags = 0;
837                     if (data) {         
838                         data_fake.whilem_c = data->whilem_c;
839                         data_fake.last_closep = data->last_closep;
840                     }
841                     else
842                         data_fake.last_closep = &fake;
843                     next = regnext(scan);
844                     scan = NEXTOPER(scan);
845                     if (code != BRANCH)
846                         scan = NEXTOPER(scan);
847                     if (flags & SCF_DO_STCLASS) {
848                         cl_init(pRExC_state, &this_class);
849                         data_fake.start_class = &this_class;
850                         f = SCF_DO_STCLASS_AND;
851                     }           
852                     if (flags & SCF_WHILEM_VISITED_POS)
853                         f |= SCF_WHILEM_VISITED_POS;
854                     /* we suppose the run is continuous, last=next...*/
855                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
856                                           next, &data_fake, f);
857                     if (min1 > minnext)
858                         min1 = minnext;
859                     if (max1 < minnext + deltanext)
860                         max1 = minnext + deltanext;
861                     if (deltanext == I32_MAX)
862                         is_inf = is_inf_internal = 1;
863                     scan = next;
864                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
865                         pars++;
866                     if (data && (data_fake.flags & SF_HAS_EVAL))
867                         data->flags |= SF_HAS_EVAL;
868                     if (data)
869                         data->whilem_c = data_fake.whilem_c;
870                     if (flags & SCF_DO_STCLASS)
871                         cl_or(pRExC_state, &accum, &this_class);
872                     if (code == SUSPEND)
873                         break;
874                 }
875                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
876                     min1 = 0;
877                 if (flags & SCF_DO_SUBSTR) {
878                     data->pos_min += min1;
879                     data->pos_delta += max1 - min1;
880                     if (max1 != min1 || is_inf)
881                         data->longest = &(data->longest_float);
882                 }
883                 min += min1;
884                 delta += max1 - min1;
885                 if (flags & SCF_DO_STCLASS_OR) {
886                     cl_or(pRExC_state, data->start_class, &accum);
887                     if (min1) {
888                         cl_and(data->start_class, &and_with);
889                         flags &= ~SCF_DO_STCLASS;
890                     }
891                 }
892                 else if (flags & SCF_DO_STCLASS_AND) {
893                     if (min1) {
894                         cl_and(data->start_class, &accum);
895                         flags &= ~SCF_DO_STCLASS;
896                     }
897                     else {
898                         /* Switch to OR mode: cache the old value of
899                          * data->start_class */
900                         StructCopy(data->start_class, &and_with,
901                                    struct regnode_charclass_class);
902                         flags &= ~SCF_DO_STCLASS_AND;
903                         StructCopy(&accum, data->start_class,
904                                    struct regnode_charclass_class);
905                         flags |= SCF_DO_STCLASS_OR;
906                         data->start_class->flags |= ANYOF_EOS;
907                     }
908                 }
909             }
910             else if (code == BRANCHJ)   /* single branch is optimized. */
911                 scan = NEXTOPER(NEXTOPER(scan));
912             else                        /* single branch is optimized. */
913                 scan = NEXTOPER(scan);
914             continue;
915         }
916         else if (OP(scan) == EXACT) {
917             I32 l = STR_LEN(scan);
918             UV uc = *((U8*)STRING(scan));
919             if (UTF) {
920                 U8 *s = (U8*)STRING(scan);
921                 l = utf8_length(s, s + l);
922                 uc = utf8_to_uvchr(s, NULL);
923             }
924             min += l;
925             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
926                 /* The code below prefers earlier match for fixed
927                    offset, later match for variable offset.  */
928                 if (data->last_end == -1) { /* Update the start info. */
929                     data->last_start_min = data->pos_min;
930                     data->last_start_max = is_inf
931                         ? I32_MAX : data->pos_min + data->pos_delta;
932                 }
933                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
934                 data->last_end = data->pos_min + l;
935                 data->pos_min += l; /* As in the first entry. */
936                 data->flags &= ~SF_BEFORE_EOL;
937             }
938             if (flags & SCF_DO_STCLASS_AND) {
939                 /* Check whether it is compatible with what we know already! */
940                 int compat = 1;
941
942                 if (uc >= 0x100 ||
943                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
944                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
945                     && (!(data->start_class->flags & ANYOF_FOLD)
946                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
947                     )
948                     compat = 0;
949                 ANYOF_CLASS_ZERO(data->start_class);
950                 ANYOF_BITMAP_ZERO(data->start_class);
951                 if (compat)
952                     ANYOF_BITMAP_SET(data->start_class, uc);
953                 data->start_class->flags &= ~ANYOF_EOS;
954                 if (uc < 0x100)
955                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
956             }
957             else if (flags & SCF_DO_STCLASS_OR) {
958                 /* false positive possible if the class is case-folded */
959                 if (uc < 0x100)
960                     ANYOF_BITMAP_SET(data->start_class, uc);
961                 else
962                     data->start_class->flags |= ANYOF_UNICODE_ALL;
963                 data->start_class->flags &= ~ANYOF_EOS;
964                 cl_and(data->start_class, &and_with);
965             }
966             flags &= ~SCF_DO_STCLASS;
967         }
968         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
969             I32 l = STR_LEN(scan);
970             UV uc = *((U8*)STRING(scan));
971
972             /* Search for fixed substrings supports EXACT only. */
973             if (flags & SCF_DO_SUBSTR)
974                 scan_commit(pRExC_state, data);
975             if (UTF) {
976                 U8 *s = (U8 *)STRING(scan);
977                 l = utf8_length(s, s + l);
978                 uc = utf8_to_uvchr(s, NULL);
979             }
980             min += l;
981             if (data && (flags & SCF_DO_SUBSTR))
982                 data->pos_min += l;
983             if (flags & SCF_DO_STCLASS_AND) {
984                 /* Check whether it is compatible with what we know already! */
985                 int compat = 1;
986
987                 if (uc >= 0x100 ||
988                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
989                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
990                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
991                     compat = 0;
992                 ANYOF_CLASS_ZERO(data->start_class);
993                 ANYOF_BITMAP_ZERO(data->start_class);
994                 if (compat) {
995                     ANYOF_BITMAP_SET(data->start_class, uc);
996                     data->start_class->flags &= ~ANYOF_EOS;
997                     data->start_class->flags |= ANYOF_FOLD;
998                     if (OP(scan) == EXACTFL)
999                         data->start_class->flags |= ANYOF_LOCALE;
1000                 }
1001             }
1002             else if (flags & SCF_DO_STCLASS_OR) {
1003                 if (data->start_class->flags & ANYOF_FOLD) {
1004                     /* false positive possible if the class is case-folded.
1005                        Assume that the locale settings are the same... */
1006                     if (uc < 0x100)
1007                         ANYOF_BITMAP_SET(data->start_class, uc);
1008                     data->start_class->flags &= ~ANYOF_EOS;
1009                 }
1010                 cl_and(data->start_class, &and_with);
1011             }
1012             flags &= ~SCF_DO_STCLASS;
1013         }
1014         else if (strchr((char*)PL_varies,OP(scan))) {
1015             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1016             I32 f = flags, pos_before = 0;
1017             regnode *oscan = scan;
1018             struct regnode_charclass_class this_class;
1019             struct regnode_charclass_class *oclass = NULL;
1020             I32 next_is_eval = 0;
1021
1022             switch (PL_regkind[(U8)OP(scan)]) {
1023             case WHILEM:                /* End of (?:...)* . */
1024                 scan = NEXTOPER(scan);
1025                 goto finish;
1026             case PLUS:
1027                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1028                     next = NEXTOPER(scan);
1029                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1030                         mincount = 1;
1031                         maxcount = REG_INFTY;
1032                         next = regnext(scan);
1033                         scan = NEXTOPER(scan);
1034                         goto do_curly;
1035                     }
1036                 }
1037                 if (flags & SCF_DO_SUBSTR)
1038                     data->pos_min++;
1039                 min++;
1040                 /* Fall through. */
1041             case STAR:
1042                 if (flags & SCF_DO_STCLASS) {
1043                     mincount = 0;
1044                     maxcount = REG_INFTY;
1045                     next = regnext(scan);
1046                     scan = NEXTOPER(scan);
1047                     goto do_curly;
1048                 }
1049                 is_inf = is_inf_internal = 1;
1050                 scan = regnext(scan);
1051                 if (flags & SCF_DO_SUBSTR) {
1052                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1053                     data->longest = &(data->longest_float);
1054                 }
1055                 goto optimize_curly_tail;
1056             case CURLY:
1057                 mincount = ARG1(scan);
1058                 maxcount = ARG2(scan);
1059                 next = regnext(scan);
1060                 if (OP(scan) == CURLYX) {
1061                     I32 lp = (data ? *(data->last_closep) : 0);
1062
1063                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1064                 }
1065                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1066                 next_is_eval = (OP(scan) == EVAL);
1067               do_curly:
1068                 if (flags & SCF_DO_SUBSTR) {
1069                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1070                     pos_before = data->pos_min;
1071                 }
1072                 if (data) {
1073                     fl = data->flags;
1074                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1075                     if (is_inf)
1076                         data->flags |= SF_IS_INF;
1077                 }
1078                 if (flags & SCF_DO_STCLASS) {
1079                     cl_init(pRExC_state, &this_class);
1080                     oclass = data->start_class;
1081                     data->start_class = &this_class;
1082                     f |= SCF_DO_STCLASS_AND;
1083                     f &= ~SCF_DO_STCLASS_OR;
1084                 }
1085                 /* These are the cases when once a subexpression
1086                    fails at a particular position, it cannot succeed
1087                    even after backtracking at the enclosing scope.
1088                 
1089                    XXXX what if minimal match and we are at the
1090                         initial run of {n,m}? */
1091                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1092                     f &= ~SCF_WHILEM_VISITED_POS;
1093
1094                 /* This will finish on WHILEM, setting scan, or on NULL: */
1095                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1096                                       mincount == 0
1097                                         ? (f & ~SCF_DO_SUBSTR) : f);
1098
1099                 if (flags & SCF_DO_STCLASS)
1100                     data->start_class = oclass;
1101                 if (mincount == 0 || minnext == 0) {
1102                     if (flags & SCF_DO_STCLASS_OR) {
1103                         cl_or(pRExC_state, data->start_class, &this_class);
1104                     }
1105                     else if (flags & SCF_DO_STCLASS_AND) {
1106                         /* Switch to OR mode: cache the old value of
1107                          * data->start_class */
1108                         StructCopy(data->start_class, &and_with,
1109                                    struct regnode_charclass_class);
1110                         flags &= ~SCF_DO_STCLASS_AND;
1111                         StructCopy(&this_class, data->start_class,
1112                                    struct regnode_charclass_class);
1113                         flags |= SCF_DO_STCLASS_OR;
1114                         data->start_class->flags |= ANYOF_EOS;
1115                     }
1116                 } else {                /* Non-zero len */
1117                     if (flags & SCF_DO_STCLASS_OR) {
1118                         cl_or(pRExC_state, data->start_class, &this_class);
1119                         cl_and(data->start_class, &and_with);
1120                     }
1121                     else if (flags & SCF_DO_STCLASS_AND)
1122                         cl_and(data->start_class, &this_class);
1123                     flags &= ~SCF_DO_STCLASS;
1124                 }
1125                 if (!scan)              /* It was not CURLYX, but CURLY. */
1126                     scan = next;
1127                 if (ckWARN(WARN_REGEXP)
1128                        /* ? quantifier ok, except for (?{ ... }) */
1129                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
1130                     && (minnext == 0) && (deltanext == 0)
1131                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1132                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1133                 {
1134                     vWARN(RExC_parse,
1135                           "Quantifier unexpected on zero-length expression");
1136                 }
1137
1138                 min += minnext * mincount;
1139                 is_inf_internal |= ((maxcount == REG_INFTY
1140                                      && (minnext + deltanext) > 0)
1141                                     || deltanext == I32_MAX);
1142                 is_inf |= is_inf_internal;
1143                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1144
1145                 /* Try powerful optimization CURLYX => CURLYN. */
1146                 if (  OP(oscan) == CURLYX && data
1147                       && data->flags & SF_IN_PAR
1148                       && !(data->flags & SF_HAS_EVAL)
1149                       && !deltanext && minnext == 1 ) {
1150                     /* Try to optimize to CURLYN.  */
1151                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1152                     regnode *nxt1 = nxt;
1153 #ifdef DEBUGGING
1154                     regnode *nxt2;
1155 #endif
1156
1157                     /* Skip open. */
1158                     nxt = regnext(nxt);
1159                     if (!strchr((char*)PL_simple,OP(nxt))
1160                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1161                              && STR_LEN(nxt) == 1))
1162                         goto nogo;
1163 #ifdef DEBUGGING
1164                     nxt2 = nxt;
1165 #endif
1166                     nxt = regnext(nxt);
1167                     if (OP(nxt) != CLOSE)
1168                         goto nogo;
1169                     /* Now we know that nxt2 is the only contents: */
1170                     oscan->flags = ARG(nxt);
1171                     OP(oscan) = CURLYN;
1172                     OP(nxt1) = NOTHING; /* was OPEN. */
1173 #ifdef DEBUGGING
1174                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1175                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1176                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1177                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1178                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1179                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1180 #endif
1181                 }
1182               nogo:
1183
1184                 /* Try optimization CURLYX => CURLYM. */
1185                 if (  OP(oscan) == CURLYX && data
1186                       && !(data->flags & SF_HAS_PAR)
1187                       && !(data->flags & SF_HAS_EVAL)
1188                       && !deltanext  ) {
1189                     /* XXXX How to optimize if data == 0? */
1190                     /* Optimize to a simpler form.  */
1191                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1192                     regnode *nxt2;
1193
1194                     OP(oscan) = CURLYM;
1195                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1196                             && (OP(nxt2) != WHILEM))
1197                         nxt = nxt2;
1198                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1199                     /* Need to optimize away parenths. */
1200                     if (data->flags & SF_IN_PAR) {
1201                         /* Set the parenth number.  */
1202                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1203
1204                         if (OP(nxt) != CLOSE)
1205                             FAIL("Panic opt close");
1206                         oscan->flags = ARG(nxt);
1207                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1208                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1209 #ifdef DEBUGGING
1210                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1211                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1212                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1213                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1214 #endif
1215 #if 0
1216                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1217                             regnode *nnxt = regnext(nxt1);
1218                         
1219                             if (nnxt == nxt) {
1220                                 if (reg_off_by_arg[OP(nxt1)])
1221                                     ARG_SET(nxt1, nxt2 - nxt1);
1222                                 else if (nxt2 - nxt1 < U16_MAX)
1223                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1224                                 else
1225                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1226                             }
1227                             nxt1 = nnxt;
1228                         }
1229 #endif
1230                         /* Optimize again: */
1231                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1232                                     NULL, 0);
1233                     }
1234                     else
1235                         oscan->flags = 0;
1236                 }
1237                 else if ((OP(oscan) == CURLYX)
1238                          && (flags & SCF_WHILEM_VISITED_POS)
1239                          /* See the comment on a similar expression above.
1240                             However, this time it not a subexpression
1241                             we care about, but the expression itself. */
1242                          && (maxcount == REG_INFTY)
1243                          && data && ++data->whilem_c < 16) {
1244                     /* This stays as CURLYX, we can put the count/of pair. */
1245                     /* Find WHILEM (as in regexec.c) */
1246                     regnode *nxt = oscan + NEXT_OFF(oscan);
1247
1248                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1249                         nxt += ARG(nxt);
1250                     PREVOPER(nxt)->flags = data->whilem_c
1251                         | (RExC_whilem_seen << 4); /* On WHILEM */
1252                 }
1253                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1254                     pars++;
1255                 if (flags & SCF_DO_SUBSTR) {
1256                     SV *last_str = Nullsv;
1257                     int counted = mincount != 0;
1258
1259                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1260 #if defined(SPARC64_GCC_WORKAROUND)
1261                         I32 b = 0;
1262                         STRLEN l = 0;
1263                         char *s = NULL;
1264                         I32 old = 0;
1265
1266                         if (pos_before >= data->last_start_min)
1267                             b = pos_before;
1268                         else
1269                             b = data->last_start_min;
1270
1271                         l = 0;
1272                         s = SvPV(data->last_found, l);
1273                         old = b - data->last_start_min;
1274
1275 #else
1276                         I32 b = pos_before >= data->last_start_min
1277                             ? pos_before : data->last_start_min;
1278                         STRLEN l;
1279                         char *s = SvPV(data->last_found, l);
1280                         I32 old = b - data->last_start_min;
1281 #endif
1282
1283                         if (UTF)
1284                             old = utf8_hop((U8*)s, old) - (U8*)s;
1285                         
1286                         l -= old;
1287                         /* Get the added string: */
1288                         last_str = newSVpvn(s  + old, l);
1289                         if (deltanext == 0 && pos_before == b) {
1290                             /* What was added is a constant string */
1291                             if (mincount > 1) {
1292                                 SvGROW(last_str, (mincount * l) + 1);
1293                                 repeatcpy(SvPVX(last_str) + l,
1294                                           SvPVX(last_str), l, mincount - 1);
1295                                 SvCUR(last_str) *= mincount;
1296                                 /* Add additional parts. */
1297                                 SvCUR_set(data->last_found,
1298                                           SvCUR(data->last_found) - l);
1299                                 sv_catsv(data->last_found, last_str);
1300                                 data->last_end += l * (mincount - 1);
1301                             }
1302                         } else {
1303                             /* start offset must point into the last copy */
1304                             data->last_start_min += minnext * (mincount - 1);
1305                             data->last_start_max += is_inf ? 0 : (maxcount - 1)
1306                                 * (minnext + data->pos_delta);
1307                         }
1308                     }
1309                     /* It is counted once already... */
1310                     data->pos_min += minnext * (mincount - counted);
1311                     data->pos_delta += - counted * deltanext +
1312                         (minnext + deltanext) * maxcount - minnext * mincount;
1313                     if (mincount != maxcount) {
1314                          /* Cannot extend fixed substrings found inside
1315                             the group.  */
1316                         scan_commit(pRExC_state,data);
1317                         if (mincount && last_str) {
1318                             sv_setsv(data->last_found, last_str);
1319                             data->last_end = data->pos_min;
1320                             data->last_start_min =
1321                                 data->pos_min - CHR_SVLEN(last_str);
1322                             data->last_start_max = is_inf
1323                                 ? I32_MAX
1324                                 : data->pos_min + data->pos_delta
1325                                 - CHR_SVLEN(last_str);
1326                         }
1327                         data->longest = &(data->longest_float);
1328                     }
1329                     SvREFCNT_dec(last_str);
1330                 }
1331                 if (data && (fl & SF_HAS_EVAL))
1332                     data->flags |= SF_HAS_EVAL;
1333               optimize_curly_tail:
1334                 if (OP(oscan) != CURLYX) {
1335                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1336                            && NEXT_OFF(next))
1337                         NEXT_OFF(oscan) += NEXT_OFF(next);
1338                 }
1339                 continue;
1340             default:                    /* REF and CLUMP only? */
1341                 if (flags & SCF_DO_SUBSTR) {
1342                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1343                     data->longest = &(data->longest_float);
1344                 }
1345                 is_inf = is_inf_internal = 1;
1346                 if (flags & SCF_DO_STCLASS_OR)
1347                     cl_anything(pRExC_state, data->start_class);
1348                 flags &= ~SCF_DO_STCLASS;
1349                 break;
1350             }
1351         }
1352         else if (strchr((char*)PL_simple,OP(scan))) {
1353             int value = 0;
1354
1355             if (flags & SCF_DO_SUBSTR) {
1356                 scan_commit(pRExC_state,data);
1357                 data->pos_min++;
1358             }
1359             min++;
1360             if (flags & SCF_DO_STCLASS) {
1361                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1362
1363                 /* Some of the logic below assumes that switching
1364                    locale on will only add false positives. */
1365                 switch (PL_regkind[(U8)OP(scan)]) {
1366                 case SANY:
1367                 default:
1368                   do_default:
1369                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1370                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1371                         cl_anything(pRExC_state, data->start_class);
1372                     break;
1373                 case REG_ANY:
1374                     if (OP(scan) == SANY)
1375                         goto do_default;
1376                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1377                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1378                                  || (data->start_class->flags & ANYOF_CLASS));
1379                         cl_anything(pRExC_state, data->start_class);
1380                     }
1381                     if (flags & SCF_DO_STCLASS_AND || !value)
1382                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1383                     break;
1384                 case ANYOF:
1385                     if (flags & SCF_DO_STCLASS_AND)
1386                         cl_and(data->start_class,
1387                                (struct regnode_charclass_class*)scan);
1388                     else
1389                         cl_or(pRExC_state, data->start_class,
1390                               (struct regnode_charclass_class*)scan);
1391                     break;
1392                 case ALNUM:
1393                     if (flags & SCF_DO_STCLASS_AND) {
1394                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1395                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1396                             for (value = 0; value < 256; value++)
1397                                 if (!isALNUM(value))
1398                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1399                         }
1400                     }
1401                     else {
1402                         if (data->start_class->flags & ANYOF_LOCALE)
1403                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1404                         else {
1405                             for (value = 0; value < 256; value++)
1406                                 if (isALNUM(value))
1407                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1408                         }
1409                     }
1410                     break;
1411                 case ALNUML:
1412                     if (flags & SCF_DO_STCLASS_AND) {
1413                         if (data->start_class->flags & ANYOF_LOCALE)
1414                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1415                     }
1416                     else {
1417                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1418                         data->start_class->flags |= ANYOF_LOCALE;
1419                     }
1420                     break;
1421                 case NALNUM:
1422                     if (flags & SCF_DO_STCLASS_AND) {
1423                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1424                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1425                             for (value = 0; value < 256; value++)
1426                                 if (isALNUM(value))
1427                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1428                         }
1429                     }
1430                     else {
1431                         if (data->start_class->flags & ANYOF_LOCALE)
1432                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1433                         else {
1434                             for (value = 0; value < 256; value++)
1435                                 if (!isALNUM(value))
1436                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1437                         }
1438                     }
1439                     break;
1440                 case NALNUML:
1441                     if (flags & SCF_DO_STCLASS_AND) {
1442                         if (data->start_class->flags & ANYOF_LOCALE)
1443                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1444                     }
1445                     else {
1446                         data->start_class->flags |= ANYOF_LOCALE;
1447                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1448                     }
1449                     break;
1450                 case SPACE:
1451                     if (flags & SCF_DO_STCLASS_AND) {
1452                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1453                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1454                             for (value = 0; value < 256; value++)
1455                                 if (!isSPACE(value))
1456                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1457                         }
1458                     }
1459                     else {
1460                         if (data->start_class->flags & ANYOF_LOCALE)
1461                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1462                         else {
1463                             for (value = 0; value < 256; value++)
1464                                 if (isSPACE(value))
1465                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1466                         }
1467                     }
1468                     break;
1469                 case SPACEL:
1470                     if (flags & SCF_DO_STCLASS_AND) {
1471                         if (data->start_class->flags & ANYOF_LOCALE)
1472                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1473                     }
1474                     else {
1475                         data->start_class->flags |= ANYOF_LOCALE;
1476                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1477                     }
1478                     break;
1479                 case NSPACE:
1480                     if (flags & SCF_DO_STCLASS_AND) {
1481                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1482                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1483                             for (value = 0; value < 256; value++)
1484                                 if (isSPACE(value))
1485                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1486                         }
1487                     }
1488                     else {
1489                         if (data->start_class->flags & ANYOF_LOCALE)
1490                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1491                         else {
1492                             for (value = 0; value < 256; value++)
1493                                 if (!isSPACE(value))
1494                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1495                         }
1496                     }
1497                     break;
1498                 case NSPACEL:
1499                     if (flags & SCF_DO_STCLASS_AND) {
1500                         if (data->start_class->flags & ANYOF_LOCALE) {
1501                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1502                             for (value = 0; value < 256; value++)
1503                                 if (!isSPACE(value))
1504                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1505                         }
1506                     }
1507                     else {
1508                         data->start_class->flags |= ANYOF_LOCALE;
1509                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1510                     }
1511                     break;
1512                 case DIGIT:
1513                     if (flags & SCF_DO_STCLASS_AND) {
1514                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1515                         for (value = 0; value < 256; value++)
1516                             if (!isDIGIT(value))
1517                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1518                     }
1519                     else {
1520                         if (data->start_class->flags & ANYOF_LOCALE)
1521                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1522                         else {
1523                             for (value = 0; value < 256; value++)
1524                                 if (isDIGIT(value))
1525                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1526                         }
1527                     }
1528                     break;
1529                 case NDIGIT:
1530                     if (flags & SCF_DO_STCLASS_AND) {
1531                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1532                         for (value = 0; value < 256; value++)
1533                             if (isDIGIT(value))
1534                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1535                     }
1536                     else {
1537                         if (data->start_class->flags & ANYOF_LOCALE)
1538                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1539                         else {
1540                             for (value = 0; value < 256; value++)
1541                                 if (!isDIGIT(value))
1542                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1543                         }
1544                     }
1545                     break;
1546                 }
1547                 if (flags & SCF_DO_STCLASS_OR)
1548                     cl_and(data->start_class, &and_with);
1549                 flags &= ~SCF_DO_STCLASS;
1550             }
1551         }
1552         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1553             data->flags |= (OP(scan) == MEOL
1554                             ? SF_BEFORE_MEOL
1555                             : SF_BEFORE_SEOL);
1556         }
1557         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1558                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1559                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1560                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1561             /* Lookahead/lookbehind */
1562             I32 deltanext, minnext, fake = 0;
1563             regnode *nscan;
1564             struct regnode_charclass_class intrnl;
1565             int f = 0;
1566
1567             data_fake.flags = 0;
1568             if (data) {         
1569                 data_fake.whilem_c = data->whilem_c;
1570                 data_fake.last_closep = data->last_closep;
1571             }
1572             else
1573                 data_fake.last_closep = &fake;
1574             if ( flags & SCF_DO_STCLASS && !scan->flags
1575                  && OP(scan) == IFMATCH ) { /* Lookahead */
1576                 cl_init(pRExC_state, &intrnl);
1577                 data_fake.start_class = &intrnl;
1578                 f |= SCF_DO_STCLASS_AND;
1579             }
1580             if (flags & SCF_WHILEM_VISITED_POS)
1581                 f |= SCF_WHILEM_VISITED_POS;
1582             next = regnext(scan);
1583             nscan = NEXTOPER(NEXTOPER(scan));
1584             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1585             if (scan->flags) {
1586                 if (deltanext) {
1587                     vFAIL("Variable length lookbehind not implemented");
1588                 }
1589                 else if (minnext > U8_MAX) {
1590                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1591                 }
1592                 scan->flags = minnext;
1593             }
1594             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1595                 pars++;
1596             if (data && (data_fake.flags & SF_HAS_EVAL))
1597                 data->flags |= SF_HAS_EVAL;
1598             if (data)
1599                 data->whilem_c = data_fake.whilem_c;
1600             if (f & SCF_DO_STCLASS_AND) {
1601                 int was = (data->start_class->flags & ANYOF_EOS);
1602
1603                 cl_and(data->start_class, &intrnl);
1604                 if (was)
1605                     data->start_class->flags |= ANYOF_EOS;
1606             }
1607         }
1608         else if (OP(scan) == OPEN) {
1609             pars++;
1610         }
1611         else if (OP(scan) == CLOSE) {
1612             if (ARG(scan) == is_par) {
1613                 next = regnext(scan);
1614
1615                 if ( next && (OP(next) != WHILEM) && next < last)
1616                     is_par = 0;         /* Disable optimization */
1617             }
1618             if (data)
1619                 *(data->last_closep) = ARG(scan);
1620         }
1621         else if (OP(scan) == EVAL) {
1622                 if (data)
1623                     data->flags |= SF_HAS_EVAL;
1624         }
1625         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1626                 if (flags & SCF_DO_SUBSTR) {
1627                     scan_commit(pRExC_state,data);
1628                     data->longest = &(data->longest_float);
1629                 }
1630                 is_inf = is_inf_internal = 1;
1631                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1632                     cl_anything(pRExC_state, data->start_class);
1633                 flags &= ~SCF_DO_STCLASS;
1634         }
1635         /* Else: zero-length, ignore. */
1636         scan = regnext(scan);
1637     }
1638
1639   finish:
1640     *scanp = scan;
1641     *deltap = is_inf_internal ? I32_MAX : delta;
1642     if (flags & SCF_DO_SUBSTR && is_inf)
1643         data->pos_delta = I32_MAX - data->pos_min;
1644     if (is_par > U8_MAX)
1645         is_par = 0;
1646     if (is_par && pars==1 && data) {
1647         data->flags |= SF_IN_PAR;
1648         data->flags &= ~SF_HAS_PAR;
1649     }
1650     else if (pars && data) {
1651         data->flags |= SF_HAS_PAR;
1652         data->flags &= ~SF_IN_PAR;
1653     }
1654     if (flags & SCF_DO_STCLASS_OR)
1655         cl_and(data->start_class, &and_with);
1656     return min;
1657 }
1658
1659 STATIC I32
1660 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1661 {
1662     if (RExC_rx->data) {
1663         Renewc(RExC_rx->data,
1664                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1665                char, struct reg_data);
1666         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1667         RExC_rx->data->count += n;
1668     }
1669     else {
1670         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1671              char, struct reg_data);
1672         New(1208, RExC_rx->data->what, n, U8);
1673         RExC_rx->data->count = n;
1674     }
1675     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1676     return RExC_rx->data->count - n;
1677 }
1678
1679 void
1680 Perl_reginitcolors(pTHX)
1681 {
1682     int i = 0;
1683     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1684         
1685     if (s) {
1686         PL_colors[0] = s = savepv(s);
1687         while (++i < 6) {
1688             s = strchr(s, '\t');
1689             if (s) {
1690                 *s = '\0';
1691                 PL_colors[i] = ++s;
1692             }
1693             else
1694                 PL_colors[i] = s = "";
1695         }
1696     } else {
1697         while (i < 6)
1698             PL_colors[i++] = "";
1699     }
1700     PL_colorset = 1;
1701 }
1702
1703
1704 /*
1705  - pregcomp - compile a regular expression into internal code
1706  *
1707  * We can't allocate space until we know how big the compiled form will be,
1708  * but we can't compile it (and thus know how big it is) until we've got a
1709  * place to put the code.  So we cheat:  we compile it twice, once with code
1710  * generation turned off and size counting turned on, and once "for real".
1711  * This also means that we don't allocate space until we are sure that the
1712  * thing really will compile successfully, and we never have to move the
1713  * code and thus invalidate pointers into it.  (Note that it has to be in
1714  * one piece because free() must be able to free it all.) [NB: not true in perl]
1715  *
1716  * Beware that the optimization-preparation code in here knows about some
1717  * of the structure of the compiled regexp.  [I'll say.]
1718  */
1719 regexp *
1720 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1721 {
1722     register regexp *r;
1723     regnode *scan;
1724     regnode *first;
1725     I32 flags;
1726     I32 minlen = 0;
1727     I32 sawplus = 0;
1728     I32 sawopen = 0;
1729     scan_data_t data;
1730     RExC_state_t RExC_state;
1731     RExC_state_t *pRExC_state = &RExC_state;
1732
1733     if (exp == NULL)
1734         FAIL("NULL regexp argument");
1735
1736     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1737
1738     RExC_precomp = exp;
1739     DEBUG_r({
1740          if (!PL_colorset) reginitcolors();
1741          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1742                        PL_colors[4],PL_colors[5],PL_colors[0],
1743                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1744     });
1745     RExC_flags16 = pm->op_pmflags;
1746     RExC_sawback = 0;
1747
1748     RExC_seen = 0;
1749     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1750     RExC_seen_evals = 0;
1751     RExC_extralen = 0;
1752
1753     /* First pass: determine size, legality. */
1754     RExC_parse = exp;
1755     RExC_start = exp;
1756     RExC_end = xend;
1757     RExC_naughty = 0;
1758     RExC_npar = 1;
1759     RExC_size = 0L;
1760     RExC_emit = &PL_regdummy;
1761     RExC_whilem_seen = 0;
1762 #if 0 /* REGC() is (currently) a NOP at the first pass.
1763        * Clever compilers notice this and complain. --jhi */
1764     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1765 #endif
1766     if (reg(pRExC_state, 0, &flags) == NULL) {
1767         RExC_precomp = Nullch;
1768         return(NULL);
1769     }
1770     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1771
1772     /* Small enough for pointer-storage convention?
1773        If extralen==0, this means that we will not need long jumps. */
1774     if (RExC_size >= 0x10000L && RExC_extralen)
1775         RExC_size += RExC_extralen;
1776     else
1777         RExC_extralen = 0;
1778     if (RExC_whilem_seen > 15)
1779         RExC_whilem_seen = 15;
1780
1781     /* Allocate space and initialize. */
1782     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1783          char, regexp);
1784     if (r == NULL)
1785         FAIL("Regexp out of space");
1786
1787 #ifdef DEBUGGING
1788     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1789     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1790 #endif
1791     r->refcnt = 1;
1792     r->prelen = xend - exp;
1793     r->precomp = savepvn(RExC_precomp, r->prelen);
1794     r->subbeg = NULL;
1795     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1796     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1797
1798     r->substrs = 0;                     /* Useful during FAIL. */
1799     r->startp = 0;                      /* Useful during FAIL. */
1800     r->endp = 0;                        /* Useful during FAIL. */
1801
1802     Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1803     if (r->offsets) {
1804       r->offsets[0] = RExC_size; 
1805     }
1806     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1807                           "%s %"UVuf" bytes for offset annotations.\n", 
1808                           r->offsets ? "Got" : "Couldn't get", 
1809                           (UV)((2*RExC_size+1) * sizeof(U32))));
1810
1811     RExC_rx = r;
1812
1813     /* Second pass: emit code. */
1814     RExC_flags16 = pm->op_pmflags;      /* don't let top level (?i) bleed */
1815     RExC_parse = exp;
1816     RExC_end = xend;
1817     RExC_naughty = 0;
1818     RExC_npar = 1;
1819     RExC_emit_start = r->program;
1820     RExC_emit = r->program;
1821     /* Store the count of eval-groups for security checks: */
1822     RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1823     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1824     r->data = 0;
1825     if (reg(pRExC_state, 0, &flags) == NULL)
1826         return(NULL);
1827
1828     /* Dig out information for optimizations. */
1829     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1830     pm->op_pmflags = RExC_flags16;
1831     if (UTF)
1832         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1833     r->regstclass = NULL;
1834     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1835         r->reganch |= ROPT_NAUGHTY;
1836     scan = r->program + 1;              /* First BRANCH. */
1837
1838     /* XXXX To minimize changes to RE engine we always allocate
1839        3-units-long substrs field. */
1840     Newz(1004, r->substrs, 1, struct reg_substr_data);
1841
1842     StructCopy(&zero_scan_data, &data, scan_data_t);
1843     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1844     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1845         I32 fake;
1846         STRLEN longest_float_length, longest_fixed_length;
1847         struct regnode_charclass_class ch_class;
1848         int stclass_flag;
1849         I32 last_close = 0;
1850
1851         first = scan;
1852         /* Skip introductions and multiplicators >= 1. */
1853         while ((OP(first) == OPEN && (sawopen = 1)) ||
1854                /* An OR of *one* alternative - should not happen now. */
1855             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1856             (OP(first) == PLUS) ||
1857             (OP(first) == MINMOD) ||
1858                /* An {n,m} with n>0 */
1859             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1860                 if (OP(first) == PLUS)
1861                     sawplus = 1;
1862                 else
1863                     first += regarglen[(U8)OP(first)];
1864                 first = NEXTOPER(first);
1865         }
1866
1867         /* Starting-point info. */
1868       again:
1869         if (PL_regkind[(U8)OP(first)] == EXACT) {
1870             if (OP(first) == EXACT)
1871                 ;       /* Empty, get anchored substr later. */
1872             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1873                 r->regstclass = first;
1874         }
1875         else if (strchr((char*)PL_simple,OP(first)))
1876             r->regstclass = first;
1877         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1878                  PL_regkind[(U8)OP(first)] == NBOUND)
1879             r->regstclass = first;
1880         else if (PL_regkind[(U8)OP(first)] == BOL) {
1881             r->reganch |= (OP(first) == MBOL
1882                            ? ROPT_ANCH_MBOL
1883                            : (OP(first) == SBOL
1884                               ? ROPT_ANCH_SBOL
1885                               : ROPT_ANCH_BOL));
1886             first = NEXTOPER(first);
1887             goto again;
1888         }
1889         else if (OP(first) == GPOS) {
1890             r->reganch |= ROPT_ANCH_GPOS;
1891             first = NEXTOPER(first);
1892             goto again;
1893         }
1894         else if (!sawopen && (OP(first) == STAR &&
1895             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1896             !(r->reganch & ROPT_ANCH) )
1897         {
1898             /* turn .* into ^.* with an implied $*=1 */
1899             int type = OP(NEXTOPER(first));
1900
1901             if (type == REG_ANY)
1902                 type = ROPT_ANCH_MBOL;
1903             else
1904                 type = ROPT_ANCH_SBOL;
1905
1906             r->reganch |= type | ROPT_IMPLICIT;
1907             first = NEXTOPER(first);
1908             goto again;
1909         }
1910         if (sawplus && (!sawopen || !RExC_sawback)
1911             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1912             /* x+ must match at the 1st pos of run of x's */
1913             r->reganch |= ROPT_SKIP;
1914
1915         /* Scan is after the zeroth branch, first is atomic matcher. */
1916         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1917                               (IV)(first - scan + 1)));
1918         /*
1919         * If there's something expensive in the r.e., find the
1920         * longest literal string that must appear and make it the
1921         * regmust.  Resolve ties in favor of later strings, since
1922         * the regstart check works with the beginning of the r.e.
1923         * and avoiding duplication strengthens checking.  Not a
1924         * strong reason, but sufficient in the absence of others.
1925         * [Now we resolve ties in favor of the earlier string if
1926         * it happens that c_offset_min has been invalidated, since the
1927         * earlier string may buy us something the later one won't.]
1928         */
1929         minlen = 0;
1930
1931         data.longest_fixed = newSVpvn("",0);
1932         data.longest_float = newSVpvn("",0);
1933         data.last_found = newSVpvn("",0);
1934         data.longest = &(data.longest_fixed);
1935         first = scan;
1936         if (!r->regstclass) {
1937             cl_init(pRExC_state, &ch_class);
1938             data.start_class = &ch_class;
1939             stclass_flag = SCF_DO_STCLASS_AND;
1940         } else                          /* XXXX Check for BOUND? */
1941             stclass_flag = 0;
1942         data.last_closep = &last_close;
1943
1944         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1945                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1946         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1947              && data.last_start_min == 0 && data.last_end > 0
1948              && !RExC_seen_zerolen
1949              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1950             r->reganch |= ROPT_CHECK_ALL;
1951         scan_commit(pRExC_state, &data);
1952         SvREFCNT_dec(data.last_found);
1953
1954         longest_float_length = CHR_SVLEN(data.longest_float);
1955         if (longest_float_length
1956             || (data.flags & SF_FL_BEFORE_EOL
1957                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1958                     || (RExC_flags16 & PMf_MULTILINE)))) {
1959             int t;
1960
1961             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1962                 && data.offset_fixed == data.offset_float_min
1963                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1964                     goto remove_float;          /* As in (a)+. */
1965
1966             r->float_substr = data.longest_float;
1967             r->float_min_offset = data.offset_float_min;
1968             r->float_max_offset = data.offset_float_max;
1969             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1970                        && (!(data.flags & SF_FL_BEFORE_MEOL)
1971                            || (RExC_flags16 & PMf_MULTILINE)));
1972             fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
1973         }
1974         else {
1975           remove_float:
1976             r->float_substr = Nullsv;
1977             SvREFCNT_dec(data.longest_float);
1978             longest_float_length = 0;
1979         }
1980
1981         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1982         if (longest_fixed_length
1983             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1984                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1985                     || (RExC_flags16 & PMf_MULTILINE)))) {
1986             int t;
1987
1988             r->anchored_substr = data.longest_fixed;
1989             r->anchored_offset = data.offset_fixed;
1990             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1991                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
1992                      || (RExC_flags16 & PMf_MULTILINE)));
1993             fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
1994         }
1995         else {
1996             r->anchored_substr = Nullsv;
1997             SvREFCNT_dec(data.longest_fixed);
1998             longest_fixed_length = 0;
1999         }
2000         if (r->regstclass
2001             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2002             r->regstclass = NULL;
2003         if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
2004             && !(data.start_class->flags & ANYOF_EOS)
2005             && !cl_is_anything(data.start_class)) {
2006             I32 n = add_data(pRExC_state, 1, "f");
2007
2008             New(1006, RExC_rx->data->data[n], 1,
2009                 struct regnode_charclass_class);
2010             StructCopy(data.start_class,
2011                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2012                        struct regnode_charclass_class);
2013             r->regstclass = (regnode*)RExC_rx->data->data[n];
2014             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2015             PL_regdata = r->data; /* for regprop() */
2016             DEBUG_r({ SV *sv = sv_newmortal();
2017                       regprop(sv, (regnode*)data.start_class);
2018                       PerlIO_printf(Perl_debug_log,
2019                                     "synthetic stclass `%s'.\n",
2020                                     SvPVX(sv));});
2021         }
2022
2023         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2024         if (longest_fixed_length > longest_float_length) {
2025             r->check_substr = r->anchored_substr;
2026             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2027             if (r->reganch & ROPT_ANCH_SINGLE)
2028                 r->reganch |= ROPT_NOSCAN;
2029         }
2030         else {
2031             r->check_substr = r->float_substr;
2032             r->check_offset_min = data.offset_float_min;
2033             r->check_offset_max = data.offset_float_max;
2034         }
2035         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2036            This should be changed ASAP!  */
2037         if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
2038             r->reganch |= RE_USE_INTUIT;
2039             if (SvTAIL(r->check_substr))
2040                 r->reganch |= RE_INTUIT_TAIL;
2041         }
2042     }
2043     else {
2044         /* Several toplevels. Best we can is to set minlen. */
2045         I32 fake;
2046         struct regnode_charclass_class ch_class;
2047         I32 last_close = 0;
2048         
2049         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2050         scan = r->program + 1;
2051         cl_init(pRExC_state, &ch_class);
2052         data.start_class = &ch_class;
2053         data.last_closep = &last_close;
2054         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2055         r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
2056         if (!(data.start_class->flags & ANYOF_EOS)
2057             && !cl_is_anything(data.start_class)) {
2058             I32 n = add_data(pRExC_state, 1, "f");
2059
2060             New(1006, RExC_rx->data->data[n], 1,
2061                 struct regnode_charclass_class);
2062             StructCopy(data.start_class,
2063                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2064                        struct regnode_charclass_class);
2065             r->regstclass = (regnode*)RExC_rx->data->data[n];
2066             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2067             DEBUG_r({ SV* sv = sv_newmortal();
2068                       regprop(sv, (regnode*)data.start_class);
2069                       PerlIO_printf(Perl_debug_log,
2070                                     "synthetic stclass `%s'.\n",
2071                                     SvPVX(sv));});
2072         }
2073     }
2074
2075     r->minlen = minlen;
2076     if (RExC_seen & REG_SEEN_GPOS)
2077         r->reganch |= ROPT_GPOS_SEEN;
2078     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2079         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2080     if (RExC_seen & REG_SEEN_EVAL)
2081         r->reganch |= ROPT_EVAL_SEEN;
2082     if (RExC_seen & REG_SEEN_CANY)
2083         r->reganch |= ROPT_CANY_SEEN;
2084     Newz(1002, r->startp, RExC_npar, I32);
2085     Newz(1002, r->endp, RExC_npar, I32);
2086     PL_regdata = r->data; /* for regprop() */
2087     DEBUG_r(regdump(r));
2088     return(r);
2089 }
2090
2091 /*
2092  - reg - regular expression, i.e. main body or parenthesized thing
2093  *
2094  * Caller must absorb opening parenthesis.
2095  *
2096  * Combining parenthesis handling with the base level of regular expression
2097  * is a trifle forced, but the need to tie the tails of the branches to what
2098  * follows makes it hard to avoid.
2099  */
2100 STATIC regnode *
2101 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2102     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2103 {
2104     register regnode *ret;              /* Will be the head of the group. */
2105     register regnode *br;
2106     register regnode *lastbr;
2107     register regnode *ender = 0;
2108     register I32 parno = 0;
2109     I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
2110
2111     /* for (?g), (?gc), and (?o) warnings; warning
2112        about (?c) will warn about (?g) -- japhy    */
2113
2114     I32 wastedflags = 0x00,
2115         wasted_o    = 0x01,
2116         wasted_g    = 0x02,
2117         wasted_gc   = 0x02 | 0x04,
2118         wasted_c    = 0x04;
2119
2120     char * parse_start = RExC_parse; /* MJD */
2121     char *oregcomp_parse = RExC_parse;
2122     char c;
2123
2124     *flagp = 0;                         /* Tentatively. */
2125
2126
2127     /* Make an OPEN node, if parenthesized. */
2128     if (paren) {
2129         if (*RExC_parse == '?') { /* (?...) */
2130             U16 posflags = 0, negflags = 0;
2131             U16 *flagsp = &posflags;
2132             int logical = 0;
2133             char *seqstart = RExC_parse;
2134
2135             RExC_parse++;
2136             paren = *RExC_parse++;
2137             ret = NULL;                 /* For look-ahead/behind. */
2138             switch (paren) {
2139             case '<':           /* (?<...) */
2140                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2141                 if (*RExC_parse == '!')
2142                     paren = ',';
2143                 if (*RExC_parse != '=' && *RExC_parse != '!')
2144                     goto unknown;
2145                 RExC_parse++;
2146             case '=':           /* (?=...) */
2147             case '!':           /* (?!...) */
2148                 RExC_seen_zerolen++;
2149             case ':':           /* (?:...) */
2150             case '>':           /* (?>...) */
2151                 break;
2152             case '$':           /* (?$...) */
2153             case '@':           /* (?@...) */
2154                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2155                 break;
2156             case '#':           /* (?#...) */
2157                 while (*RExC_parse && *RExC_parse != ')')
2158                     RExC_parse++;
2159                 if (*RExC_parse != ')')
2160                     FAIL("Sequence (?#... not terminated");
2161                 nextchar(pRExC_state);
2162                 *flagp = TRYAGAIN;
2163                 return NULL;
2164             case 'p':           /* (?p...) */
2165                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2166                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2167                 /* FALL THROUGH*/
2168             case '?':           /* (??...) */
2169                 logical = 1;
2170                 if (*RExC_parse != '{')
2171                     goto unknown;
2172                 paren = *RExC_parse++;
2173                 /* FALL THROUGH */
2174             case '{':           /* (?{...}) */
2175             {
2176                 I32 count = 1, n = 0;
2177                 char c;
2178                 char *s = RExC_parse;
2179                 SV *sv;
2180                 OP_4tree *sop, *rop;
2181
2182                 RExC_seen_zerolen++;
2183                 RExC_seen |= REG_SEEN_EVAL;
2184                 while (count && (c = *RExC_parse)) {
2185                     if (c == '\\' && RExC_parse[1])
2186                         RExC_parse++;
2187                     else if (c == '{')
2188                         count++;
2189                     else if (c == '}')
2190                         count--;
2191                     RExC_parse++;
2192                 }
2193                 if (*RExC_parse != ')')
2194                 {
2195                     RExC_parse = s;             
2196                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2197                 }
2198                 if (!SIZE_ONLY) {
2199                     AV *av;
2200                 
2201                     if (RExC_parse - 1 - s)
2202                         sv = newSVpvn(s, RExC_parse - 1 - s);
2203                     else
2204                         sv = newSVpvn("", 0);
2205
2206                     ENTER;
2207                     Perl_save_re_context(aTHX);
2208                     rop = sv_compile_2op(sv, &sop, "re", &av);
2209                     sop->op_private |= OPpREFCOUNTED;
2210                     /* re_dup will OpREFCNT_inc */
2211                     OpREFCNT_set(sop, 1);
2212                     LEAVE;
2213
2214                     n = add_data(pRExC_state, 3, "nop");
2215                     RExC_rx->data->data[n] = (void*)rop;
2216                     RExC_rx->data->data[n+1] = (void*)sop;
2217                     RExC_rx->data->data[n+2] = (void*)av;
2218                     SvREFCNT_dec(sv);
2219                 }
2220                 else {                                          /* First pass */
2221                     if (PL_reginterp_cnt < ++RExC_seen_evals
2222                         && PL_curcop != &PL_compiling)
2223                         /* No compiled RE interpolated, has runtime
2224                            components ===> unsafe.  */
2225                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2226                     if (PL_tainting && PL_tainted)
2227                         FAIL("Eval-group in insecure regular expression");
2228                 }
2229                 
2230                 nextchar(pRExC_state);
2231                 if (logical) {
2232                     ret = reg_node(pRExC_state, LOGICAL);
2233                     if (!SIZE_ONLY)
2234                         ret->flags = 2;
2235                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2236                     /* deal with the length of this later - MJD */
2237                     return ret;
2238                 }
2239                 return reganode(pRExC_state, EVAL, n);
2240             }
2241             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2242             {
2243                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2244                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2245                         || RExC_parse[1] == '<'
2246                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2247                         I32 flag;
2248                         
2249                         ret = reg_node(pRExC_state, LOGICAL);
2250                         if (!SIZE_ONLY)
2251                             ret->flags = 1;
2252                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2253                         goto insert_if;
2254                     }
2255                 }
2256                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2257                     /* (?(1)...) */
2258                     parno = atoi(RExC_parse++);
2259
2260                     while (isDIGIT(*RExC_parse))
2261                         RExC_parse++;
2262                     ret = reganode(pRExC_state, GROUPP, parno);
2263                     
2264                     if ((c = *nextchar(pRExC_state)) != ')')
2265                         vFAIL("Switch condition not recognized");
2266                   insert_if:
2267                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2268                     br = regbranch(pRExC_state, &flags, 1);
2269                     if (br == NULL)
2270                         br = reganode(pRExC_state, LONGJMP, 0);
2271                     else
2272                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2273                     c = *nextchar(pRExC_state);
2274                     if (flags&HASWIDTH)
2275                         *flagp |= HASWIDTH;
2276                     if (c == '|') {
2277                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2278                         regbranch(pRExC_state, &flags, 1);
2279                         regtail(pRExC_state, ret, lastbr);
2280                         if (flags&HASWIDTH)
2281                             *flagp |= HASWIDTH;
2282                         c = *nextchar(pRExC_state);
2283                     }
2284                     else
2285                         lastbr = NULL;
2286                     if (c != ')')
2287                         vFAIL("Switch (?(condition)... contains too many branches");
2288                     ender = reg_node(pRExC_state, TAIL);
2289                     regtail(pRExC_state, br, ender);
2290                     if (lastbr) {
2291                         regtail(pRExC_state, lastbr, ender);
2292                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2293                     }
2294                     else
2295                         regtail(pRExC_state, ret, ender);
2296                     return ret;
2297                 }
2298                 else {
2299                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2300                 }
2301             }
2302             case 0:
2303                 RExC_parse--; /* for vFAIL to print correctly */
2304                 vFAIL("Sequence (? incomplete");
2305                 break;
2306             default:
2307                 --RExC_parse;
2308               parse_flags:      /* (?i) */
2309                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2310                     /* (?g), (?gc) and (?o) are useless here
2311                        and must be globally applied -- japhy */
2312
2313                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2314                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2315                             I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2316                             if (! (wastedflags & wflagbit) ) {
2317                                 wastedflags |= wflagbit;
2318                                 vWARN5(
2319                                     RExC_parse + 1,
2320                                     "Useless (%s%c) - %suse /%c modifier",
2321                                     flagsp == &negflags ? "?-" : "?",
2322                                     *RExC_parse,
2323                                     flagsp == &negflags ? "don't " : "",
2324                                     *RExC_parse
2325                                 );
2326                             }
2327                         }
2328                     }
2329                     else if (*RExC_parse == 'c') {
2330                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2331                             if (! (wastedflags & wasted_c) ) {
2332                                 wastedflags |= wasted_gc;
2333                                 vWARN3(
2334                                     RExC_parse + 1,
2335                                     "Useless (%sc) - %suse /gc modifier",
2336                                     flagsp == &negflags ? "?-" : "?",
2337                                     flagsp == &negflags ? "don't " : ""
2338                                 );
2339                             }
2340                         }
2341                     }
2342                     else { pmflag(flagsp, *RExC_parse); }
2343
2344                     ++RExC_parse;
2345                 }
2346                 if (*RExC_parse == '-') {
2347                     flagsp = &negflags;
2348                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2349                     ++RExC_parse;
2350                     goto parse_flags;
2351                 }
2352                 RExC_flags16 |= posflags;
2353                 RExC_flags16 &= ~negflags;
2354                 if (*RExC_parse == ':') {
2355                     RExC_parse++;
2356                     paren = ':';
2357                     break;
2358                 }               
2359               unknown:
2360                 if (*RExC_parse != ')') {
2361                     RExC_parse++;
2362                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2363                 }
2364                 nextchar(pRExC_state);
2365                 *flagp = TRYAGAIN;
2366                 return NULL;
2367             }
2368         }
2369         else {                  /* (...) */
2370             parno = RExC_npar;
2371             RExC_npar++;
2372             ret = reganode(pRExC_state, OPEN, parno);
2373             Set_Node_Length(ret, 1); /* MJD */
2374             Set_Node_Offset(ret, RExC_parse); /* MJD */
2375             open = 1;
2376         }
2377     }
2378     else                        /* ! paren */
2379         ret = NULL;
2380
2381     /* Pick up the branches, linking them together. */
2382     parse_start = RExC_parse;   /* MJD */
2383     br = regbranch(pRExC_state, &flags, 1);
2384     /*     branch_len = (paren != 0); */
2385     
2386     if (br == NULL)
2387         return(NULL);
2388     if (*RExC_parse == '|') {
2389         if (!SIZE_ONLY && RExC_extralen) {
2390             reginsert(pRExC_state, BRANCHJ, br);
2391         }
2392         else {                  /* MJD */
2393             reginsert(pRExC_state, BRANCH, br);
2394             Set_Node_Length(br, paren != 0);
2395             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2396         }
2397         have_branch = 1;
2398         if (SIZE_ONLY)
2399             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2400     }
2401     else if (paren == ':') {
2402         *flagp |= flags&SIMPLE;
2403     }
2404     if (open) {                         /* Starts with OPEN. */
2405         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2406     }
2407     else if (paren != '?')              /* Not Conditional */
2408         ret = br;
2409     *flagp |= flags & (SPSTART | HASWIDTH);
2410     lastbr = br;
2411     while (*RExC_parse == '|') {
2412         if (!SIZE_ONLY && RExC_extralen) {
2413             ender = reganode(pRExC_state, LONGJMP,0);
2414             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2415         }
2416         if (SIZE_ONLY)
2417             RExC_extralen += 2;         /* Account for LONGJMP. */
2418         nextchar(pRExC_state);
2419         br = regbranch(pRExC_state, &flags, 0);
2420         
2421         if (br == NULL)
2422             return(NULL);
2423         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2424         lastbr = br;
2425         if (flags&HASWIDTH)
2426             *flagp |= HASWIDTH;
2427         *flagp |= flags&SPSTART;
2428     }
2429
2430     if (have_branch || paren != ':') {
2431         /* Make a closing node, and hook it on the end. */
2432         switch (paren) {
2433         case ':':
2434             ender = reg_node(pRExC_state, TAIL);
2435             break;
2436         case 1:
2437             ender = reganode(pRExC_state, CLOSE, parno);
2438             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2439             Set_Node_Length(ender,1); /* MJD */
2440             break;
2441         case '<':
2442         case ',':
2443         case '=':
2444         case '!':
2445             *flagp &= ~HASWIDTH;
2446             /* FALL THROUGH */
2447         case '>':
2448             ender = reg_node(pRExC_state, SUCCEED);
2449             break;
2450         case 0:
2451             ender = reg_node(pRExC_state, END);
2452             break;
2453         }
2454         regtail(pRExC_state, lastbr, ender);
2455
2456         if (have_branch) {
2457             /* Hook the tails of the branches to the closing node. */
2458             for (br = ret; br != NULL; br = regnext(br)) {
2459                 regoptail(pRExC_state, br, ender);
2460             }
2461         }
2462     }
2463
2464     {
2465         char *p;
2466         static char parens[] = "=!<,>";
2467
2468         if (paren && (p = strchr(parens, paren))) {
2469             int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2470             int flag = (p - parens) > 1;
2471
2472             if (paren == '>')
2473                 node = SUSPEND, flag = 0;
2474             reginsert(pRExC_state, node,ret);
2475             ret->flags = flag;
2476             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2477         }
2478     }
2479
2480     /* Check for proper termination. */
2481     if (paren) {
2482         RExC_flags16 = oregflags;
2483         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2484             RExC_parse = oregcomp_parse;
2485             vFAIL("Unmatched (");
2486         }
2487     }
2488     else if (!paren && RExC_parse < RExC_end) {
2489         if (*RExC_parse == ')') {
2490             RExC_parse++;
2491             vFAIL("Unmatched )");
2492         }
2493         else
2494             FAIL("Junk on end of regexp");      /* "Can't happen". */
2495         /* NOTREACHED */
2496     }
2497
2498     return(ret);
2499 }
2500
2501 /*
2502  - regbranch - one alternative of an | operator
2503  *
2504  * Implements the concatenation operator.
2505  */
2506 STATIC regnode *
2507 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2508 {
2509     register regnode *ret;
2510     register regnode *chain = NULL;
2511     register regnode *latest;
2512     I32 flags = 0, c = 0;
2513
2514     if (first)
2515         ret = NULL;
2516     else {
2517         if (!SIZE_ONLY && RExC_extralen)
2518             ret = reganode(pRExC_state, BRANCHJ,0);
2519         else {
2520             ret = reg_node(pRExC_state, BRANCH);
2521             Set_Node_Length(ret, 1);
2522         }
2523     }
2524         
2525     if (!first && SIZE_ONLY)
2526         RExC_extralen += 1;                     /* BRANCHJ */
2527
2528     *flagp = WORST;                     /* Tentatively. */
2529
2530     RExC_parse--;
2531     nextchar(pRExC_state);
2532     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2533         flags &= ~TRYAGAIN;
2534         latest = regpiece(pRExC_state, &flags);
2535         if (latest == NULL) {
2536             if (flags & TRYAGAIN)
2537                 continue;
2538             return(NULL);
2539         }
2540         else if (ret == NULL)
2541             ret = latest;
2542         *flagp |= flags&HASWIDTH;
2543         if (chain == NULL)      /* First piece. */
2544             *flagp |= flags&SPSTART;
2545         else {
2546             RExC_naughty++;
2547             regtail(pRExC_state, chain, latest);
2548         }
2549         chain = latest;
2550         c++;
2551     }
2552     if (chain == NULL) {        /* Loop ran zero times. */
2553         chain = reg_node(pRExC_state, NOTHING);
2554         if (ret == NULL)
2555             ret = chain;
2556     }
2557     if (c == 1) {
2558         *flagp |= flags&SIMPLE;
2559     }
2560
2561     return(ret);
2562 }
2563
2564 /*
2565  - regpiece - something followed by possible [*+?]
2566  *
2567  * Note that the branching code sequences used for ? and the general cases
2568  * of * and + are somewhat optimized:  they use the same NOTHING node as
2569  * both the endmarker for their branch list and the body of the last branch.
2570  * It might seem that this node could be dispensed with entirely, but the
2571  * endmarker role is not redundant.
2572  */
2573 STATIC regnode *
2574 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2575 {
2576     register regnode *ret;
2577     register char op;
2578     register char *next;
2579     I32 flags;
2580     char *origparse = RExC_parse;
2581     char *maxpos;
2582     I32 min;
2583     I32 max = REG_INFTY;
2584     char *parse_start;
2585
2586     ret = regatom(pRExC_state, &flags);
2587     if (ret == NULL) {
2588         if (flags & TRYAGAIN)
2589             *flagp |= TRYAGAIN;
2590         return(NULL);
2591     }
2592
2593     op = *RExC_parse;
2594
2595     if (op == '{' && regcurly(RExC_parse)) {
2596         parse_start = RExC_parse; /* MJD */
2597         next = RExC_parse + 1;
2598         maxpos = Nullch;
2599         while (isDIGIT(*next) || *next == ',') {
2600             if (*next == ',') {
2601                 if (maxpos)
2602                     break;
2603                 else
2604                     maxpos = next;
2605             }
2606             next++;
2607         }
2608         if (*next == '}') {             /* got one */
2609             if (!maxpos)
2610                 maxpos = next;
2611             RExC_parse++;
2612             min = atoi(RExC_parse);
2613             if (*maxpos == ',')
2614                 maxpos++;
2615             else
2616                 maxpos = RExC_parse;
2617             max = atoi(maxpos);
2618             if (!max && *maxpos != '0')
2619                 max = REG_INFTY;                /* meaning "infinity" */
2620             else if (max >= REG_INFTY)
2621                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2622             RExC_parse = next;
2623             nextchar(pRExC_state);
2624
2625         do_curly:
2626             if ((flags&SIMPLE)) {
2627                 RExC_naughty += 2 + RExC_naughty / 2;
2628                 reginsert(pRExC_state, CURLY, ret);
2629                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2630                 Set_Node_Cur_Length(ret);
2631             }
2632             else {
2633                 regnode *w = reg_node(pRExC_state, WHILEM);
2634
2635                 w->flags = 0;
2636                 regtail(pRExC_state, ret, w);
2637                 if (!SIZE_ONLY && RExC_extralen) {
2638                     reginsert(pRExC_state, LONGJMP,ret);
2639                     reginsert(pRExC_state, NOTHING,ret);
2640                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2641                 }
2642                 reginsert(pRExC_state, CURLYX,ret);
2643                                 /* MJD hk */
2644                 Set_Node_Offset(ret, parse_start+1);
2645                 Set_Node_Length(ret, 
2646                                 op == '{' ? (RExC_parse - parse_start) : 1);
2647                 
2648                 if (!SIZE_ONLY && RExC_extralen)
2649                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2650                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2651                 if (SIZE_ONLY)
2652                     RExC_whilem_seen++, RExC_extralen += 3;
2653                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2654             }
2655             ret->flags = 0;
2656
2657             if (min > 0)
2658                 *flagp = WORST;
2659             if (max > 0)
2660                 *flagp |= HASWIDTH;
2661             if (max && max < min)
2662                 vFAIL("Can't do {n,m} with n > m");
2663             if (!SIZE_ONLY) {
2664                 ARG1_SET(ret, min);
2665                 ARG2_SET(ret, max);
2666             }
2667
2668             goto nest_check;
2669         }
2670     }
2671
2672     if (!ISMULT1(op)) {
2673         *flagp = flags;
2674         return(ret);
2675     }
2676
2677 #if 0                           /* Now runtime fix should be reliable. */
2678
2679     /* if this is reinstated, don't forget to put this back into perldiag:
2680
2681             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2682
2683            (F) The part of the regexp subject to either the * or + quantifier
2684            could match an empty string. The {#} shows in the regular
2685            expression about where the problem was discovered.
2686
2687     */
2688
2689     if (!(flags&HASWIDTH) && op != '?')
2690       vFAIL("Regexp *+ operand could be empty");
2691 #endif
2692
2693     parse_start = RExC_parse;
2694     nextchar(pRExC_state);
2695
2696     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2697
2698     if (op == '*' && (flags&SIMPLE)) {
2699         reginsert(pRExC_state, STAR, ret);
2700         ret->flags = 0;
2701         RExC_naughty += 4;
2702     }
2703     else if (op == '*') {
2704         min = 0;
2705         goto do_curly;
2706     }
2707     else if (op == '+' && (flags&SIMPLE)) {
2708         reginsert(pRExC_state, PLUS, ret);
2709         ret->flags = 0;
2710         RExC_naughty += 3;
2711     }
2712     else if (op == '+') {
2713         min = 1;
2714         goto do_curly;
2715     }
2716     else if (op == '?') {
2717         min = 0; max = 1;
2718         goto do_curly;
2719     }
2720   nest_check:
2721     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2722         vWARN3(RExC_parse,
2723                "%.*s matches null string many times",
2724                RExC_parse - origparse,
2725                origparse);
2726     }
2727
2728     if (*RExC_parse == '?') {
2729         nextchar(pRExC_state);
2730         reginsert(pRExC_state, MINMOD, ret);
2731         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2732     }
2733     if (ISMULT2(RExC_parse)) {
2734         RExC_parse++;
2735         vFAIL("Nested quantifiers");
2736     }
2737
2738     return(ret);
2739 }
2740
2741 /*
2742  - regatom - the lowest level
2743  *
2744  * Optimization:  gobbles an entire sequence of ordinary characters so that
2745  * it can turn them into a single node, which is smaller to store and
2746  * faster to run.  Backslashed characters are exceptions, each becoming a
2747  * separate node; the code is simpler that way and it's not worth fixing.
2748  *
2749  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2750 STATIC regnode *
2751 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2752 {
2753     register regnode *ret = 0;
2754     I32 flags;
2755     char *parse_start = 0;
2756
2757     *flagp = WORST;             /* Tentatively. */
2758
2759 tryagain:
2760     switch (*RExC_parse) {
2761     case '^':
2762         RExC_seen_zerolen++;
2763         nextchar(pRExC_state);
2764         if (RExC_flags16 & PMf_MULTILINE)
2765             ret = reg_node(pRExC_state, MBOL);
2766         else if (RExC_flags16 & PMf_SINGLELINE)
2767             ret = reg_node(pRExC_state, SBOL);
2768         else
2769             ret = reg_node(pRExC_state, BOL);
2770         Set_Node_Length(ret, 1); /* MJD */
2771         break;
2772     case '$':
2773         nextchar(pRExC_state);
2774         if (*RExC_parse)
2775             RExC_seen_zerolen++;
2776         if (RExC_flags16 & PMf_MULTILINE)
2777             ret = reg_node(pRExC_state, MEOL);
2778         else if (RExC_flags16 & PMf_SINGLELINE)
2779             ret = reg_node(pRExC_state, SEOL);
2780         else
2781             ret = reg_node(pRExC_state, EOL);
2782         Set_Node_Length(ret, 1); /* MJD */
2783         break;
2784     case '.':
2785         nextchar(pRExC_state);
2786         if (RExC_flags16 & PMf_SINGLELINE)
2787             ret = reg_node(pRExC_state, SANY);
2788         else
2789             ret = reg_node(pRExC_state, REG_ANY);
2790         *flagp |= HASWIDTH|SIMPLE;
2791         RExC_naughty++;
2792         Set_Node_Length(ret, 1); /* MJD */
2793         break;
2794     case '[':
2795     {
2796         char *oregcomp_parse = ++RExC_parse;
2797         ret = regclass(pRExC_state);
2798         if (*RExC_parse != ']') {
2799             RExC_parse = oregcomp_parse;
2800             vFAIL("Unmatched [");
2801         }
2802         nextchar(pRExC_state);
2803         *flagp |= HASWIDTH|SIMPLE;
2804         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2805         break;
2806     }
2807     case '(':
2808         nextchar(pRExC_state);
2809         ret = reg(pRExC_state, 1, &flags);
2810         if (ret == NULL) {
2811                 if (flags & TRYAGAIN) {
2812                     if (RExC_parse == RExC_end) {
2813                          /* Make parent create an empty node if needed. */
2814                         *flagp |= TRYAGAIN;
2815                         return(NULL);
2816                     }
2817                     goto tryagain;
2818                 }
2819                 return(NULL);
2820         }
2821         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2822         break;
2823     case '|':
2824     case ')':
2825         if (flags & TRYAGAIN) {
2826             *flagp |= TRYAGAIN;
2827             return NULL;
2828         }
2829         vFAIL("Internal urp");
2830                                 /* Supposed to be caught earlier. */
2831         break;
2832     case '{':
2833         if (!regcurly(RExC_parse)) {
2834             RExC_parse++;
2835             goto defchar;
2836         }
2837         /* FALL THROUGH */
2838     case '?':
2839     case '+':
2840     case '*':
2841         RExC_parse++;
2842         vFAIL("Quantifier follows nothing");
2843         break;
2844     case '\\':
2845         switch (*++RExC_parse) {
2846         case 'A':
2847             RExC_seen_zerolen++;
2848             ret = reg_node(pRExC_state, SBOL);
2849             *flagp |= SIMPLE;
2850             nextchar(pRExC_state);
2851             Set_Node_Length(ret, 2); /* MJD */
2852             break;
2853         case 'G':
2854             ret = reg_node(pRExC_state, GPOS);
2855             RExC_seen |= REG_SEEN_GPOS;
2856             *flagp |= SIMPLE;
2857             nextchar(pRExC_state);
2858             Set_Node_Length(ret, 2); /* MJD */
2859             break;
2860         case 'Z':
2861             ret = reg_node(pRExC_state, SEOL);
2862             *flagp |= SIMPLE;
2863             RExC_seen_zerolen++;                /* Do not optimize RE away */
2864             nextchar(pRExC_state);
2865             break;
2866         case 'z':
2867             ret = reg_node(pRExC_state, EOS);
2868             *flagp |= SIMPLE;
2869             RExC_seen_zerolen++;                /* Do not optimize RE away */
2870             nextchar(pRExC_state);
2871             Set_Node_Length(ret, 2); /* MJD */
2872             break;
2873         case 'C':
2874             ret = reg_node(pRExC_state, CANY);
2875             RExC_seen |= REG_SEEN_CANY;
2876             *flagp |= HASWIDTH|SIMPLE;
2877             nextchar(pRExC_state);
2878             Set_Node_Length(ret, 2); /* MJD */
2879             break;
2880         case 'X':
2881             ret = reg_node(pRExC_state, CLUMP);
2882             *flagp |= HASWIDTH;
2883             nextchar(pRExC_state);
2884             Set_Node_Length(ret, 2); /* MJD */
2885             break;
2886         case 'w':
2887             ret = reg_node(pRExC_state, LOC ? ALNUML     : ALNUM);
2888             *flagp |= HASWIDTH|SIMPLE;
2889             nextchar(pRExC_state);
2890             Set_Node_Length(ret, 2); /* MJD */
2891             break;
2892         case 'W':
2893             ret = reg_node(pRExC_state, LOC ? NALNUML     : NALNUM);
2894             *flagp |= HASWIDTH|SIMPLE;
2895             nextchar(pRExC_state);
2896             Set_Node_Length(ret, 2); /* MJD */
2897             break;
2898         case 'b':
2899             RExC_seen_zerolen++;
2900             RExC_seen |= REG_SEEN_LOOKBEHIND;
2901             ret = reg_node(pRExC_state, LOC ? BOUNDL     : BOUND);
2902             *flagp |= SIMPLE;
2903             nextchar(pRExC_state);
2904             Set_Node_Length(ret, 2); /* MJD */
2905             break;
2906         case 'B':
2907             RExC_seen_zerolen++;
2908             RExC_seen |= REG_SEEN_LOOKBEHIND;
2909             ret = reg_node(pRExC_state, LOC ? NBOUNDL     : NBOUND);
2910             *flagp |= SIMPLE;
2911             nextchar(pRExC_state);
2912             Set_Node_Length(ret, 2); /* MJD */
2913             break;
2914         case 's':
2915             ret = reg_node(pRExC_state, LOC ? SPACEL     : SPACE);
2916             *flagp |= HASWIDTH|SIMPLE;
2917             nextchar(pRExC_state);
2918             Set_Node_Length(ret, 2); /* MJD */
2919             break;
2920         case 'S':
2921             ret = reg_node(pRExC_state, LOC ? NSPACEL     : NSPACE);
2922             *flagp |= HASWIDTH|SIMPLE;
2923             nextchar(pRExC_state);
2924             Set_Node_Length(ret, 2); /* MJD */
2925             break;
2926         case 'd':
2927             ret = reg_node(pRExC_state, DIGIT);
2928             *flagp |= HASWIDTH|SIMPLE;
2929             nextchar(pRExC_state);
2930             Set_Node_Length(ret, 2); /* MJD */
2931             break;
2932         case 'D':
2933             ret = reg_node(pRExC_state, NDIGIT);
2934             *flagp |= HASWIDTH|SIMPLE;
2935             nextchar(pRExC_state);
2936             Set_Node_Length(ret, 2); /* MJD */
2937             break;
2938         case 'p':
2939         case 'P':
2940             {   
2941                 char* oldregxend = RExC_end;
2942                 char* parse_start = RExC_parse;
2943
2944                 if (RExC_parse[1] == '{') {
2945                   /* a lovely hack--pretend we saw [\pX] instead */
2946                     RExC_end = strchr(RExC_parse, '}');
2947                     if (!RExC_end) {
2948                         U8 c = (U8)*RExC_parse;
2949                         RExC_parse += 2;
2950                         RExC_end = oldregxend;
2951                         vFAIL2("Missing right brace on \\%c{}", c);
2952                     }
2953                     RExC_end++;
2954                 }
2955                 else
2956                     RExC_end = RExC_parse + 2;
2957                 RExC_parse--;
2958
2959                 ret = regclass(pRExC_state);
2960
2961                 RExC_end = oldregxend;
2962                 RExC_parse--;
2963                 Set_Node_Cur_Length(ret); /* MJD */
2964                 nextchar(pRExC_state);
2965                 *flagp |= HASWIDTH|SIMPLE;
2966             }
2967             break;
2968         case 'n':
2969         case 'r':
2970         case 't':
2971         case 'f':
2972         case 'e':
2973         case 'a':
2974         case 'x':
2975         case 'c':
2976         case '0':
2977             goto defchar;
2978         case '1': case '2': case '3': case '4':
2979         case '5': case '6': case '7': case '8': case '9':
2980             {
2981                 I32 num = atoi(RExC_parse);
2982
2983                 if (num > 9 && num >= RExC_npar)
2984                     goto defchar;
2985                 else {
2986                     char * parse_start = RExC_parse - 1; /* MJD */
2987                     while (isDIGIT(*RExC_parse))
2988                         RExC_parse++;
2989
2990                     if (!SIZE_ONLY && num > RExC_rx->nparens)
2991                         vFAIL("Reference to nonexistent group");
2992                     RExC_sawback = 1;
2993                     ret = reganode(pRExC_state, FOLD
2994                                    ? (LOC ? REFFL : REFF)
2995                                    : REF, num);
2996                     *flagp |= HASWIDTH;
2997                     
2998                     /* override incorrect value set in reganode MJD */
2999                     Set_Node_Offset(ret, parse_start+1); 
3000                     Set_Node_Cur_Length(ret); /* MJD */
3001                     RExC_parse--;
3002                     nextchar(pRExC_state);
3003                 }
3004             }
3005             break;
3006         case '\0':
3007             if (RExC_parse >= RExC_end)
3008                 FAIL("Trailing \\");
3009             /* FALL THROUGH */
3010         default:
3011             /* Do not generate `unrecognized' warnings here, we fall
3012                back into the quick-grab loop below */
3013             goto defchar;
3014         }
3015         break;
3016
3017     case '#':
3018         if (RExC_flags16 & PMf_EXTENDED) {
3019             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3020             if (RExC_parse < RExC_end)
3021                 goto tryagain;
3022         }
3023         /* FALL THROUGH */
3024
3025     default: {
3026             register STRLEN len;
3027             register UV ender;
3028             register char *p;
3029             char *oldp, *s;
3030             STRLEN numlen;
3031             STRLEN foldlen;
3032             U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3033
3034             parse_start = RExC_parse - 1;
3035
3036             RExC_parse++;
3037
3038         defchar:
3039             ender = 0;
3040             ret = reg_node(pRExC_state, FOLD
3041                           ? (LOC ? EXACTFL : EXACTF)
3042                           : EXACT);
3043             s = STRING(ret);
3044             for (len = 0, p = RExC_parse - 1;
3045               len < 127 && p < RExC_end;
3046               len++)
3047             {
3048                 oldp = p;
3049
3050                 if (RExC_flags16 & PMf_EXTENDED)
3051                     p = regwhite(p, RExC_end);
3052                 switch (*p) {
3053                 case '^':
3054                 case '$':
3055                 case '.':
3056                 case '[':
3057                 case '(':
3058                 case ')':
3059                 case '|':
3060                     goto loopdone;
3061                 case '\\':
3062                     switch (*++p) {
3063                     case 'A':
3064                     case 'C':
3065                     case 'X':
3066                     case 'G':
3067                     case 'Z':
3068                     case 'z':
3069                     case 'w':
3070                     case 'W':
3071                     case 'b':
3072                     case 'B':
3073                     case 's':
3074                     case 'S':
3075                     case 'd':
3076                     case 'D':
3077                     case 'p':
3078                     case 'P':
3079                         --p;
3080                         goto loopdone;
3081                     case 'n':
3082                         ender = '\n';
3083                         p++;
3084                         break;
3085                     case 'r':
3086                         ender = '\r';
3087                         p++;
3088                         break;
3089                     case 't':
3090                         ender = '\t';
3091                         p++;
3092                         break;
3093                     case 'f':
3094                         ender = '\f';
3095                         p++;
3096                         break;
3097                     case 'e':
3098                           ender = ASCII_TO_NATIVE('\033');
3099                         p++;
3100                         break;
3101                     case 'a':
3102                           ender = ASCII_TO_NATIVE('\007');
3103                         p++;
3104                         break;
3105                     case 'x':
3106                         if (*++p == '{') {
3107                             char* e = strchr(p, '}');
3108         
3109                             if (!e) {
3110                                 RExC_parse = p + 1;
3111                                 vFAIL("Missing right brace on \\x{}");
3112                             }
3113                             else {
3114                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3115                                     | PERL_SCAN_DISALLOW_PREFIX;
3116                                 numlen = e - p - 1;
3117                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3118                                 if (ender > 0xff)
3119                                     RExC_utf8 = 1;
3120                                 /* numlen is generous */
3121                                 if (numlen + len >= 127) {
3122                                     p--;
3123                                     goto loopdone;
3124                                 }
3125                                 p = e + 1;
3126                             }
3127                         }
3128                         else {
3129                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3130                             numlen = 2;
3131                             ender = grok_hex(p, &numlen, &flags, NULL);
3132                             p += numlen;
3133                         }
3134                         break;
3135                     case 'c':
3136                         p++;
3137                         ender = UCHARAT(p++);
3138                         ender = toCTRL(ender);
3139                         break;
3140                     case '0': case '1': case '2': case '3':case '4':
3141                     case '5': case '6': case '7': case '8':case '9':
3142                         if (*p == '0' ||
3143                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3144                             I32 flags = 0;
3145                             numlen = 3;
3146                             ender = grok_oct(p, &numlen, &flags, NULL);
3147                             p += numlen;
3148                         }
3149                         else {
3150                             --p;
3151                             goto loopdone;
3152                         }
3153                         break;
3154                     case '\0':
3155                         if (p >= RExC_end)
3156                             FAIL("Trailing \\");
3157                         /* FALL THROUGH */
3158                     default:
3159                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3160                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3161                         goto normal_default;
3162                     }
3163                     break;
3164                 default:
3165                   normal_default:
3166                     if (UTF8_IS_START(*p) && UTF) {
3167                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3168                                                &numlen, 0);
3169                         p += numlen;
3170                     }
3171                     else
3172                         ender = *p++;
3173                     break;
3174                 }
3175                 if (RExC_flags16 & PMf_EXTENDED)
3176                     p = regwhite(p, RExC_end);
3177                 if (UTF && FOLD) {
3178                     /* Prime the casefolded buffer. */
3179                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3180                 }
3181                 if (ISMULT2(p)) { /* Back off on ?+*. */
3182                     if (len)
3183                         p = oldp;
3184                     else if (UTF) {
3185                          STRLEN unilen;
3186
3187                          if (FOLD) {
3188                               /* Emit all the Unicode characters. */
3189                               for (foldbuf = tmpbuf;
3190                                    foldlen;
3191                                    foldlen -= numlen) {
3192                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3193                                    if (numlen > 0) {
3194                                         reguni(pRExC_state, ender, s, &unilen);
3195                                         s       += unilen;
3196                                         len     += unilen;
3197                                         /* In EBCDIC the numlen
3198                                          * and unilen can differ. */
3199                                         foldbuf += numlen;
3200                                         if (numlen >= foldlen)
3201                                              break;
3202                                    }
3203                                    else
3204                                         break; /* "Can't happen." */
3205                               }
3206                          }
3207                          else {
3208                               reguni(pRExC_state, ender, s, &unilen);
3209                               if (unilen > 0) {
3210                                    s   += unilen;
3211                                    len += unilen;
3212                               }
3213                          }
3214                     }
3215                     else {
3216                         len++;
3217                         REGC(ender, s++);
3218                     }
3219                     break;
3220                 }
3221                 if (UTF) {
3222                      STRLEN unilen;
3223
3224                      if (FOLD) {
3225                           /* Emit all the Unicode characters. */
3226                           for (foldbuf = tmpbuf;
3227                                foldlen;
3228                                foldlen -= numlen) {
3229                                ender = utf8_to_uvchr(foldbuf, &numlen);
3230                                if (numlen > 0) {
3231                                     reguni(pRExC_state, ender, s, &unilen);
3232                                     len     += unilen;
3233                                     s       += unilen;
3234                                     /* In EBCDIC the numlen
3235                                      * and unilen can differ. */
3236                                     foldbuf += numlen;
3237                                     if (numlen >= foldlen)
3238                                          break;
3239                                }
3240                                else
3241                                     break;
3242                           }
3243                      }
3244                      else {
3245                           reguni(pRExC_state, ender, s, &unilen);
3246                           if (unilen > 0) {
3247                                s   += unilen;
3248                                len += unilen;
3249                           }
3250                      }
3251                      len--;
3252                 }
3253                 else
3254                     REGC(ender, s++);
3255             }
3256         loopdone:
3257             RExC_parse = p - 1;
3258             Set_Node_Cur_Length(ret); /* MJD */
3259             nextchar(pRExC_state);
3260             {
3261                 /* len is STRLEN which is unsigned, need to copy to signed */
3262                 IV iv = len;
3263                 if (iv < 0)
3264                     vFAIL("Internal disaster");
3265             }
3266             if (len > 0)
3267                 *flagp |= HASWIDTH;
3268             if (len == 1)
3269                 *flagp |= SIMPLE;
3270             if (!SIZE_ONLY)
3271                 STR_LEN(ret) = len;
3272             if (SIZE_ONLY)
3273                 RExC_size += STR_SZ(len);
3274             else
3275                 RExC_emit += STR_SZ(len);
3276         }
3277         break;
3278     }
3279
3280     /* If the encoding pragma is in effect recode the text of
3281      * any EXACT-kind nodes. */
3282     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3283          STRLEN oldlen = STR_LEN(ret);
3284          SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3285
3286          if (RExC_utf8)
3287               SvUTF8_on(sv);
3288          if (sv_utf8_downgrade(sv, TRUE)) {
3289               char *s       = sv_recode_to_utf8(sv, PL_encoding);
3290               STRLEN newlen = SvCUR(sv);
3291          
3292               if (!SIZE_ONLY) {
3293                    DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3294                                          (int)oldlen, STRING(ret),
3295                                          (int)newlen, s));
3296                    Copy(s, STRING(ret), newlen, char);
3297                    STR_LEN(ret) += newlen - oldlen;
3298                    RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3299               } else
3300                    RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3301          }
3302     }
3303
3304     return(ret);
3305 }
3306
3307 STATIC char *
3308 S_regwhite(pTHX_ char *p, char *e)
3309 {
3310     while (p < e) {
3311         if (isSPACE(*p))
3312             ++p;
3313         else if (*p == '#') {
3314             do {
3315                 p++;
3316             } while (p < e && *p != '\n');
3317         }
3318         else
3319             break;
3320     }
3321     return p;
3322 }
3323
3324 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3325    Character classes ([:foo:]) can also be negated ([:^foo:]).
3326    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3327    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3328    but trigger failures because they are currently unimplemented. */
3329
3330 #define POSIXCC_DONE(c)   ((c) == ':')
3331 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3332 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3333
3334 STATIC I32
3335 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3336 {
3337     char *posixcc = 0;
3338     I32 namedclass = OOB_NAMEDCLASS;
3339
3340     if (value == '[' && RExC_parse + 1 < RExC_end &&
3341         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3342         POSIXCC(UCHARAT(RExC_parse))) {
3343         char  c = UCHARAT(RExC_parse);
3344         char* s = RExC_parse++;
3345         
3346         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3347             RExC_parse++;
3348         if (RExC_parse == RExC_end)
3349             /* Grandfather lone [:, [=, [. */
3350             RExC_parse = s;
3351         else {
3352             char* t = RExC_parse++; /* skip over the c */
3353
3354             if (UCHARAT(RExC_parse) == ']') {
3355                 RExC_parse++; /* skip over the ending ] */
3356                 posixcc = s + 1;
3357                 if (*s == ':') {
3358                     I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3359                     I32 skip = 5; /* the most common skip */
3360
3361                     switch (*posixcc) {
3362                     case 'a':
3363                         if (strnEQ(posixcc, "alnum", 5))
3364                             namedclass =
3365                                 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3366                         else if (strnEQ(posixcc, "alpha", 5))
3367                             namedclass =
3368                                 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3369                         else if (strnEQ(posixcc, "ascii", 5))
3370                             namedclass =
3371                                 complement ? ANYOF_NASCII : ANYOF_ASCII;
3372                         break;
3373                     case 'b':
3374                         if (strnEQ(posixcc, "blank", 5))
3375                             namedclass =
3376                                 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3377                         break;
3378                     case 'c':
3379                         if (strnEQ(posixcc, "cntrl", 5))
3380                             namedclass =
3381                                 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3382                         break;
3383                     case 'd':
3384                         if (strnEQ(posixcc, "digit", 5))
3385                             namedclass =
3386                                 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3387                         break;
3388                     case 'g':
3389                         if (strnEQ(posixcc, "graph", 5))
3390                             namedclass =
3391                                 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3392                         break;
3393                     case 'l':
3394                         if (strnEQ(posixcc, "lower", 5))
3395                             namedclass =
3396                                 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3397                         break;
3398                     case 'p':
3399                         if (strnEQ(posixcc, "print", 5))
3400                             namedclass =
3401                                 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3402                         else if (strnEQ(posixcc, "punct", 5))
3403                             namedclass =
3404                                 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3405                         break;
3406                     case 's':
3407                         if (strnEQ(posixcc, "space", 5))
3408                             namedclass =
3409                                 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3410                         break;
3411                     case 'u':
3412                         if (strnEQ(posixcc, "upper", 5))
3413                             namedclass =
3414                                 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3415                         break;
3416                     case 'w': /* this is not POSIX, this is the Perl \w */
3417                         if (strnEQ(posixcc, "word", 4)) {
3418                             namedclass =
3419                                 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3420                             skip = 4;
3421                         }
3422                         break;
3423                     case 'x':
3424                         if (strnEQ(posixcc, "xdigit", 6)) {
3425                             namedclass =
3426                                 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3427                             skip = 6;
3428                         }
3429                         break;
3430                     }
3431                     if (namedclass == OOB_NAMEDCLASS ||
3432                         posixcc[skip] != ':' ||
3433                         posixcc[skip+1] != ']')
3434                     {
3435                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3436                                       t - s - 1, s + 1);
3437                     }
3438                 } else if (!SIZE_ONLY) {
3439                     /* [[=foo=]] and [[.foo.]] are still future. */
3440
3441                     /* adjust RExC_parse so the warning shows after
3442                        the class closes */
3443                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3444                         RExC_parse++;
3445                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3446                 }
3447             } else {
3448                 /* Maternal grandfather:
3449                  * "[:" ending in ":" but not in ":]" */
3450                 RExC_parse = s;
3451             }
3452         }
3453     }
3454
3455     return namedclass;
3456 }
3457
3458 STATIC void
3459 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3460 {
3461     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3462         char *s = RExC_parse;
3463         char  c = *s++;
3464
3465         while(*s && isALNUM(*s))
3466             s++;
3467         if (*s && c == *s && s[1] == ']') {
3468             vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3469
3470             /* [[=foo=]] and [[.foo.]] are still future. */
3471             if (POSIXCC_NOTYET(c)) {
3472                 /* adjust RExC_parse so the error shows after
3473                    the class closes */
3474                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3475                     ;
3476                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3477             }
3478         }
3479     }
3480 }
3481
3482 STATIC regnode *
3483 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3484 {
3485     register UV value;
3486     register UV nextvalue;
3487     register IV prevvalue = OOB_UNICODE;
3488     register IV range = 0;
3489     register regnode *ret;
3490     STRLEN numlen;
3491     IV namedclass;
3492     char *rangebegin = 0;
3493     bool need_class = 0;
3494     SV *listsv = Nullsv;
3495     register char *e;
3496     UV n;
3497     bool optimize_invert   = TRUE;
3498     AV* unicode_alternate  = 0;
3499
3500     ret = reganode(pRExC_state, ANYOF, 0);
3501
3502     if (!SIZE_ONLY)
3503         ANYOF_FLAGS(ret) = 0;
3504
3505     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3506         RExC_naughty++;
3507         RExC_parse++;
3508         if (!SIZE_ONLY)
3509             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3510     }
3511
3512     if (SIZE_ONLY)
3513         RExC_size += ANYOF_SKIP;
3514     else {
3515         RExC_emit += ANYOF_SKIP;
3516         if (FOLD)
3517             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3518         if (LOC)
3519             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3520         ANYOF_BITMAP_ZERO(ret);
3521         listsv = newSVpvn("# comment\n", 10);
3522     }
3523
3524     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3525
3526     if (!SIZE_ONLY && POSIXCC(nextvalue))
3527         checkposixcc(pRExC_state);
3528
3529     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3530     if (UCHARAT(RExC_parse) == ']')
3531         goto charclassloop;
3532
3533     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3534
3535     charclassloop:
3536
3537         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3538
3539         if (!range)
3540             rangebegin = RExC_parse;
3541         if (UTF) {
3542             value = utf8n_to_uvchr((U8*)RExC_parse,
3543                                    RExC_end - RExC_parse,
3544                                    &numlen, 0);
3545             RExC_parse += numlen;
3546         }
3547         else
3548             value = UCHARAT(RExC_parse++);
3549         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3550         if (value == '[' && POSIXCC(nextvalue))
3551             namedclass = regpposixcc(pRExC_state, value);
3552         else if (value == '\\') {
3553             if (UTF) {
3554                 value = utf8n_to_uvchr((U8*)RExC_parse,
3555                                    RExC_end - RExC_parse,
3556                                    &numlen, 0);
3557                 RExC_parse += numlen;
3558             }
3559             else
3560                 value = UCHARAT(RExC_parse++);
3561             /* Some compilers cannot handle switching on 64-bit integer
3562              * values, therefore value cannot be an UV.  Yes, this will
3563              * be a problem later if we want switch on Unicode.
3564              * A similar issue a little bit later when switching on
3565              * namedclass. --jhi */
3566             switch ((I32)value) {
3567             case 'w':   namedclass = ANYOF_ALNUM;       break;
3568             case 'W':   namedclass = ANYOF_NALNUM;      break;
3569             case 's':   namedclass = ANYOF_SPACE;       break;
3570             case 'S':   namedclass = ANYOF_NSPACE;      break;
3571             case 'd':   namedclass = ANYOF_DIGIT;       break;
3572             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3573             case 'p':
3574             case 'P':
3575                 if (*RExC_parse == '{') {
3576                     U8 c = (U8)value;
3577                     e = strchr(RExC_parse++, '}');
3578                     if (!e)
3579                         vFAIL2("Missing right brace on \\%c{}", c);
3580                     while (isSPACE(UCHARAT(RExC_parse)))
3581                         RExC_parse++;
3582                     if (e == RExC_parse)
3583                         vFAIL2("Empty \\%c{}", c);
3584                     n = e - RExC_parse;
3585                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3586                         n--;
3587                 }
3588                 else {
3589                     e = RExC_parse;
3590                     n = 1;
3591                 }
3592                 if (!SIZE_ONLY) {
3593                     if (UCHARAT(RExC_parse) == '^') {
3594                          RExC_parse++;
3595                          n--;
3596                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3597                          while (isSPACE(UCHARAT(RExC_parse))) {
3598                               RExC_parse++;
3599                               n--;
3600                          }
3601                     }
3602                     if (value == 'p')
3603                          Perl_sv_catpvf(aTHX_ listsv,
3604                                         "+utf8::%.*s\n", (int)n, RExC_parse);
3605                     else
3606                          Perl_sv_catpvf(aTHX_ listsv,
3607                                         "!utf8::%.*s\n", (int)n, RExC_parse);
3608                 }
3609                 RExC_parse = e + 1;
3610                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3611                 continue;
3612             case 'n':   value = '\n';                   break;
3613             case 'r':   value = '\r';                   break;
3614             case 't':   value = '\t';                   break;
3615             case 'f':   value = '\f';                   break;
3616             case 'b':   value = '\b';                   break;
3617             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3618             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3619             case 'x':
3620                 if (*RExC_parse == '{') {
3621                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3622                         | PERL_SCAN_DISALLOW_PREFIX;
3623                     e = strchr(RExC_parse++, '}');
3624                     if (!e)
3625                         vFAIL("Missing right brace on \\x{}");
3626
3627                     numlen = e - RExC_parse;
3628                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3629                     RExC_parse = e + 1;
3630                 }
3631                 else {
3632                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3633                     numlen = 2;
3634                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3635                     RExC_parse += numlen;
3636                 }
3637                 break;
3638             case 'c':
3639                 value = UCHARAT(RExC_parse++);
3640                 value = toCTRL(value);
3641                 break;
3642             case '0': case '1': case '2': case '3': case '4':
3643             case '5': case '6': case '7': case '8': case '9':
3644             {
3645                 I32 flags = 0;
3646                 numlen = 3;
3647                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3648                 RExC_parse += numlen;
3649                 break;
3650             }
3651             default:
3652                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3653                     vWARN2(RExC_parse,
3654                            "Unrecognized escape \\%c in character class passed through",
3655                            (int)value);
3656                 break;
3657             }
3658         } /* end of \blah */
3659
3660         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3661
3662             if (!SIZE_ONLY && !need_class)
3663                 ANYOF_CLASS_ZERO(ret);
3664
3665             need_class = 1;
3666
3667             /* a bad range like a-\d, a-[:digit:] ? */
3668             if (range) {
3669                 if (!SIZE_ONLY) {
3670                     if (ckWARN(WARN_REGEXP))
3671                         vWARN4(RExC_parse,
3672                                "False [] range \"%*.*s\"",
3673                                RExC_parse - rangebegin,
3674                                RExC_parse - rangebegin,
3675                                rangebegin);
3676                     if (prevvalue < 256) {
3677                         ANYOF_BITMAP_SET(ret, prevvalue);
3678                         ANYOF_BITMAP_SET(ret, '-');
3679                     }
3680                     else {
3681                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3682                         Perl_sv_catpvf(aTHX_ listsv,
3683                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3684                     }
3685                 }
3686
3687                 range = 0; /* this was not a true range */
3688             }
3689
3690             if (!SIZE_ONLY) {
3691                 if (namedclass > OOB_NAMEDCLASS)
3692                     optimize_invert = FALSE;
3693                 /* Possible truncation here but in some 64-bit environments
3694                  * the compiler gets heartburn about switch on 64-bit values.
3695                  * A similar issue a little earlier when switching on value.
3696                  * --jhi */
3697                 switch ((I32)namedclass) {
3698                 case ANYOF_ALNUM:
3699                     if (LOC)
3700                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3701                     else {
3702                         for (value = 0; value < 256; value++)
3703                             if (isALNUM(value))
3704                                 ANYOF_BITMAP_SET(ret, value);
3705                     }
3706                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
3707                     break;
3708                 case ANYOF_NALNUM:
3709                     if (LOC)
3710                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3711                     else {
3712                         for (value = 0; value < 256; value++)
3713                             if (!isALNUM(value))
3714                                 ANYOF_BITMAP_SET(ret, value);
3715                     }
3716                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3717                     break;
3718                 case ANYOF_ALNUMC:
3719                     if (LOC)
3720                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3721                     else {
3722                         for (value = 0; value < 256; value++)
3723                             if (isALNUMC(value))
3724                                 ANYOF_BITMAP_SET(ret, value);
3725                     }
3726                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3727                     break;
3728                 case ANYOF_NALNUMC:
3729                     if (LOC)
3730                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3731                     else {
3732                         for (value = 0; value < 256; value++)
3733                             if (!isALNUMC(value))
3734                                 ANYOF_BITMAP_SET(ret, value);
3735                     }
3736                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3737                     break;
3738                 case ANYOF_ALPHA:
3739                     if (LOC)
3740                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3741                     else {
3742                         for (value = 0; value < 256; value++)
3743                             if (isALPHA(value))
3744                                 ANYOF_BITMAP_SET(ret, value);
3745                     }
3746                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3747                     break;
3748                 case ANYOF_NALPHA:
3749                     if (LOC)
3750                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3751                     else {
3752                         for (value = 0; value < 256; value++)
3753                             if (!isALPHA(value))
3754                                 ANYOF_BITMAP_SET(ret, value);
3755                     }
3756                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3757                     break;
3758                 case ANYOF_ASCII:
3759                     if (LOC)
3760                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3761                     else {
3762 #ifndef EBCDIC
3763                         for (value = 0; value < 128; value++)
3764                             ANYOF_BITMAP_SET(ret, value);
3765 #else  /* EBCDIC */
3766                         for (value = 0; value < 256; value++) {
3767                             if (isASCII(value))
3768                                 ANYOF_BITMAP_SET(ret, value);
3769                         }
3770 #endif /* EBCDIC */
3771                     }
3772                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3773                     break;
3774                 case ANYOF_NASCII:
3775                     if (LOC)
3776                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3777                     else {
3778 #ifndef EBCDIC
3779                         for (value = 128; value < 256; value++)
3780                             ANYOF_BITMAP_SET(ret, value);
3781 #else  /* EBCDIC */
3782                         for (value = 0; value < 256; value++) {
3783                             if (!isASCII(value))
3784                                 ANYOF_BITMAP_SET(ret, value);
3785                         }
3786 #endif /* EBCDIC */
3787                     }
3788                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3789                     break;
3790                 case ANYOF_BLANK:
3791                     if (LOC)
3792                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3793                     else {
3794                         for (value = 0; value < 256; value++)
3795                             if (isBLANK(value))
3796                                 ANYOF_BITMAP_SET(ret, value);
3797                     }
3798                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3799                     break;
3800                 case ANYOF_NBLANK:
3801                     if (LOC)
3802                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3803                     else {
3804                         for (value = 0; value < 256; value++)
3805                             if (!isBLANK(value))
3806                                 ANYOF_BITMAP_SET(ret, value);
3807                     }
3808                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3809                     break;
3810                 case ANYOF_CNTRL:
3811                     if (LOC)
3812                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3813                     else {
3814                         for (value = 0; value < 256; value++)
3815                             if (isCNTRL(value))
3816                                 ANYOF_BITMAP_SET(ret, value);
3817                     }
3818                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3819                     break;
3820                 case ANYOF_NCNTRL:
3821                     if (LOC)
3822                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3823                     else {
3824                         for (value = 0; value < 256; value++)
3825                             if (!isCNTRL(value))
3826                                 ANYOF_BITMAP_SET(ret, value);
3827                     }
3828                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3829                     break;
3830                 case ANYOF_DIGIT:
3831                     if (LOC)
3832                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3833                     else {
3834                         /* consecutive digits assumed */
3835                         for (value = '0'; value <= '9'; value++)
3836                             ANYOF_BITMAP_SET(ret, value);
3837                     }
3838                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3839                     break;
3840                 case ANYOF_NDIGIT:
3841                     if (LOC)
3842                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3843                     else {
3844                         /* consecutive digits assumed */
3845                         for (value = 0; value < '0'; value++)
3846                             ANYOF_BITMAP_SET(ret, value);
3847                         for (value = '9' + 1; value < 256; value++)
3848                             ANYOF_BITMAP_SET(ret, value);
3849                     }
3850                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3851                     break;
3852                 case ANYOF_GRAPH:
3853                     if (LOC)
3854                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3855                     else {
3856                         for (value = 0; value < 256; value++)
3857                             if (isGRAPH(value))
3858                                 ANYOF_BITMAP_SET(ret, value);
3859                     }
3860                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3861                     break;
3862                 case ANYOF_NGRAPH:
3863                     if (LOC)
3864                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3865                     else {
3866                         for (value = 0; value < 256; value++)
3867                             if (!isGRAPH(value))
3868                                 ANYOF_BITMAP_SET(ret, value);
3869                     }
3870                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3871                     break;
3872                 case ANYOF_LOWER:
3873                     if (LOC)
3874                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3875                     else {
3876                         for (value = 0; value < 256; value++)
3877                             if (isLOWER(value))
3878                                 ANYOF_BITMAP_SET(ret, value);
3879                     }
3880                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3881                     break;
3882                 case ANYOF_NLOWER:
3883                     if (LOC)
3884                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3885                     else {
3886                         for (value = 0; value < 256; value++)
3887                             if (!isLOWER(value))
3888                                 ANYOF_BITMAP_SET(ret, value);
3889                     }
3890                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3891                     break;
3892                 case ANYOF_PRINT:
3893                     if (LOC)
3894                         ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3895                     else {
3896                         for (value = 0; value < 256; value++)
3897                             if (isPRINT(value))
3898                                 ANYOF_BITMAP_SET(ret, value);
3899                     }
3900                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3901                     break;
3902                 case ANYOF_NPRINT:
3903                     if (LOC)
3904                         ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3905                     else {
3906                         for (value = 0; value < 256; value++)
3907                             if (!isPRINT(value))
3908                                 ANYOF_BITMAP_SET(ret, value);
3909                     }
3910                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3911                     break;
3912                 case ANYOF_PSXSPC:
3913                     if (LOC)
3914                         ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3915                     else {
3916                         for (value = 0; value < 256; value++)
3917                             if (isPSXSPC(value))
3918                                 ANYOF_BITMAP_SET(ret, value);
3919                     }
3920                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3921                     break;
3922                 case ANYOF_NPSXSPC:
3923                     if (LOC)
3924                         ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3925                     else {
3926                         for (value = 0; value < 256; value++)
3927                             if (!isPSXSPC(value))
3928                                 ANYOF_BITMAP_SET(ret, value);
3929                     }
3930                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3931                     break;
3932                 case ANYOF_PUNCT:
3933                     if (LOC)
3934                         ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3935                     else {
3936                         for (value = 0; value < 256; value++)
3937                             if (isPUNCT(value))
3938                                 ANYOF_BITMAP_SET(ret, value);
3939                     }
3940                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3941                     break;
3942                 case ANYOF_NPUNCT:
3943                     if (LOC)
3944                         ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3945                     else {
3946                         for (value = 0; value < 256; value++)
3947                             if (!isPUNCT(value))
3948                                 ANYOF_BITMAP_SET(ret, value);
3949                     }
3950                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3951                     break;
3952                 case ANYOF_SPACE:
3953                     if (LOC)
3954                         ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3955                     else {
3956                         for (value = 0; value < 256; value++)
3957                             if (isSPACE(value))
3958                                 ANYOF_BITMAP_SET(ret, value);
3959                     }
3960                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3961                     break;
3962                 case ANYOF_NSPACE:
3963                     if (LOC)
3964                         ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3965                     else {
3966                         for (value = 0; value < 256; value++)
3967                             if (!isSPACE(value))
3968                                 ANYOF_BITMAP_SET(ret, value);
3969                     }
3970                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
3971                     break;
3972                 case ANYOF_UPPER:
3973                     if (LOC)
3974                         ANYOF_CLASS_SET(ret, ANYOF_UPPER);
3975                     else {
3976                         for (value = 0; value < 256; value++)
3977                             if (isUPPER(value))
3978                                 ANYOF_BITMAP_SET(ret, value);
3979                     }
3980                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
3981                     break;
3982                 case ANYOF_NUPPER:
3983                     if (LOC)
3984                         ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
3985                     else {
3986                         for (value = 0; value < 256; value++)
3987                             if (!isUPPER(value))
3988                                 ANYOF_BITMAP_SET(ret, value);
3989                     }
3990                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
3991                     break;
3992                 case ANYOF_XDIGIT:
3993                     if (LOC)
3994                         ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
3995                     else {
3996                         for (value = 0; value < 256; value++)
3997                             if (isXDIGIT(value))
3998                                 ANYOF_BITMAP_SET(ret, value);
3999                     }
4000                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4001                     break;
4002                 case ANYOF_NXDIGIT:
4003                     if (LOC)
4004                         ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4005                     else {
4006                         for (value = 0; value < 256; value++)
4007                             if (!isXDIGIT(value))
4008                                 ANYOF_BITMAP_SET(ret, value);
4009                     }
4010                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4011                     break;
4012                 default:
4013                     vFAIL("Invalid [::] class");
4014                     break;
4015                 }
4016                 if (LOC)
4017                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4018                 continue;
4019             }
4020         } /* end of namedclass \blah */
4021
4022         if (range) {
4023             if (prevvalue > value) /* b-a */ {
4024                 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4025                               RExC_parse - rangebegin,
4026                               RExC_parse - rangebegin,
4027                               rangebegin);
4028                 range = 0; /* not a valid range */
4029             }
4030         }
4031         else {
4032             prevvalue = value; /* save the beginning of the range */
4033             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4034                 RExC_parse[1] != ']') {
4035                 RExC_parse++;
4036
4037                 /* a bad range like \w-, [:word:]- ? */
4038                 if (namedclass > OOB_NAMEDCLASS) {
4039                     if (ckWARN(WARN_REGEXP))
4040                         vWARN4(RExC_parse,
4041                                "False [] range \"%*.*s\"",
4042                                RExC_parse - rangebegin,
4043                                RExC_parse - rangebegin,
4044                                rangebegin);
4045                     if (!SIZE_ONLY)
4046                         ANYOF_BITMAP_SET(ret, '-');
4047                 } else
4048                     range = 1;  /* yeah, it's a range! */
4049                 continue;       /* but do it the next time */
4050             }
4051         }
4052
4053         /* now is the next time */
4054         if (!SIZE_ONLY) {
4055             IV i;
4056
4057             if (prevvalue < 256) {
4058                 IV ceilvalue = value < 256 ? value : 255;
4059
4060 #ifdef EBCDIC
4061                 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4062                     (isUPPER(prevvalue) && isUPPER(ceilvalue)))
4063                 {
4064                     if (isLOWER(prevvalue)) {
4065                         for (i = prevvalue; i <= ceilvalue; i++)
4066                             if (isLOWER(i))
4067                                 ANYOF_BITMAP_SET(ret, i);
4068                     } else {
4069                         for (i = prevvalue; i <= ceilvalue; i++)
4070                             if (isUPPER(i))
4071                                 ANYOF_BITMAP_SET(ret, i);
4072                     }
4073                 }
4074                 else
4075 #endif
4076                       for (i = prevvalue; i <= ceilvalue; i++)
4077                           ANYOF_BITMAP_SET(ret, i);
4078           }
4079           if (value > 255 || UTF) {
4080                 UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
4081                 UV natvalue      = NATIVE_TO_UNI(value);
4082
4083                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4084                 if (prevnatvalue < natvalue) { /* what about > ? */
4085                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4086                                    prevnatvalue, natvalue);
4087                 }
4088                 else if (prevnatvalue == natvalue) {
4089                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4090                     if (FOLD) {
4091                          U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4092                          STRLEN foldlen;
4093                          UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4094
4095                          /* If folding and foldable and a single
4096                           * character, insert also the folded version
4097                           * to the charclass. */
4098                          if (f != value) {
4099                               if (foldlen == UNISKIP(f))
4100                                   Perl_sv_catpvf(aTHX_ listsv,
4101                                                  "%04"UVxf"\n", f);
4102                               else {
4103                                   /* Any multicharacter foldings
4104                                    * require the following transform:
4105                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4106                                    * where E folds into "pq" and F folds
4107                                    * into "rst", all other characters
4108                                    * fold to single characters.  We save
4109                                    * away these multicharacter foldings,
4110                                    * to be later saved as part of the
4111                                    * additional "s" data. */
4112                                   SV *sv;
4113
4114                                   if (!unicode_alternate)
4115                                       unicode_alternate = newAV();
4116                                   sv = newSVpvn((char*)foldbuf, foldlen);
4117                                   SvUTF8_on(sv);
4118                                   av_push(unicode_alternate, sv);
4119                               }
4120                          }
4121
4122                          /* If folding and the value is one of the Greek
4123                           * sigmas insert a few more sigmas to make the
4124                           * folding rules of the sigmas to work right.
4125                           * Note that not all the possible combinations
4126                           * are handled here: some of them are handled
4127                           * by the standard folding rules, and some of
4128                           * them (literal or EXACTF cases) are handled
4129                           * during runtime in regexec.c:S_find_byclass(). */
4130                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4131                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4132                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4133                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4134                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4135                          }
4136                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4137                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4138                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4139                     }
4140                 }
4141             }
4142         }
4143
4144         range = 0; /* this range (if it was one) is done now */
4145     }
4146
4147     if (need_class) {
4148         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4149         if (SIZE_ONLY)
4150             RExC_size += ANYOF_CLASS_ADD_SKIP;
4151         else
4152             RExC_emit += ANYOF_CLASS_ADD_SKIP;
4153     }
4154
4155     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4156     if (!SIZE_ONLY &&
4157          /* If the only flag is folding (plus possibly inversion). */
4158         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4159        ) {
4160         for (value = 0; value < 256; ++value) {
4161             if (ANYOF_BITMAP_TEST(ret, value)) {
4162                 IV fold = PL_fold[value];
4163
4164                 if (fold != value)
4165                     ANYOF_BITMAP_SET(ret, fold);
4166             }
4167         }
4168         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4169     }
4170
4171     /* optimize inverted simple patterns (e.g. [^a-z]) */
4172     if (!SIZE_ONLY && optimize_invert &&
4173         /* If the only flag is inversion. */
4174         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4175         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4176             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4177         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4178     }
4179
4180     if (!SIZE_ONLY) {
4181         AV *av = newAV();
4182         SV *rv;
4183
4184         /* The 0th element stores the character class description
4185          * in its textual form: used later (regexec.c:Perl_regclass_swatch())
4186          * to initialize the appropriate swash (which gets stored in
4187          * the 1st element), and also useful for dumping the regnode.
4188          * The 2nd element stores the multicharacter foldings,
4189          * used later (regexec.c:s_reginclasslen()). */
4190         av_store(av, 0, listsv);
4191         av_store(av, 1, NULL);
4192         av_store(av, 2, (SV*)unicode_alternate);
4193         rv = newRV_noinc((SV*)av);
4194         n = add_data(pRExC_state, 1, "s");
4195         RExC_rx->data->data[n] = (void*)rv;
4196         ARG_SET(ret, n);
4197     }
4198
4199     return ret;
4200 }
4201
4202 STATIC char*
4203 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4204 {
4205     char* retval = RExC_parse++;
4206
4207     for (;;) {
4208         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4209                 RExC_parse[2] == '#') {
4210             while (*RExC_parse && *RExC_parse != ')')
4211                 RExC_parse++;
4212             RExC_parse++;
4213             continue;
4214         }
4215         if (RExC_flags16 & PMf_EXTENDED) {
4216             if (isSPACE(*RExC_parse)) {
4217                 RExC_parse++;
4218                 continue;
4219             }
4220             else if (*RExC_parse == '#') {
4221                 while (*RExC_parse && *RExC_parse != '\n')
4222                     RExC_parse++;
4223                 RExC_parse++;
4224                 continue;
4225             }
4226         }
4227         return retval;
4228     }
4229 }
4230
4231 /*
4232 - reg_node - emit a node
4233 */
4234 STATIC regnode *                        /* Location. */
4235 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4236 {
4237     register regnode *ret;
4238     register regnode *ptr;
4239
4240     ret = RExC_emit;
4241     if (SIZE_ONLY) {
4242         SIZE_ALIGN(RExC_size);
4243         RExC_size += 1;
4244         return(ret);
4245     }
4246
4247     NODE_ALIGN_FILL(ret);
4248     ptr = ret;
4249     FILL_ADVANCE_NODE(ptr, op);
4250     if (RExC_offsets) {         /* MJD */
4251       MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", 
4252               "reg_node", __LINE__, 
4253               reg_name[op],
4254               RExC_emit - RExC_emit_start > RExC_offsets[0] 
4255               ? "Overwriting end of array!\n" : "OK",
4256               RExC_emit - RExC_emit_start,
4257               RExC_parse - RExC_start,
4258               RExC_offsets[0])); 
4259       Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4260     }
4261             
4262     RExC_emit = ptr;
4263
4264     return(ret);
4265 }
4266
4267 /*
4268 - reganode - emit a node with an argument
4269 */
4270 STATIC regnode *                        /* Location. */
4271 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4272 {
4273     register regnode *ret;
4274     register regnode *ptr;
4275
4276     ret = RExC_emit;
4277     if (SIZE_ONLY) {
4278         SIZE_ALIGN(RExC_size);
4279         RExC_size += 2;
4280         return(ret);
4281     }
4282
4283     NODE_ALIGN_FILL(ret);
4284     ptr = ret;
4285     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4286     if (RExC_offsets) {         /* MJD */
4287       MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n", 
4288               "reganode",
4289               RExC_emit - RExC_emit_start > RExC_offsets[0] ? 
4290               "Overwriting end of array!\n" : "OK",
4291               RExC_emit - RExC_emit_start,
4292               RExC_parse - RExC_start,
4293               RExC_offsets[0])); 
4294       Set_Cur_Node_Offset;
4295     }
4296             
4297     RExC_emit = ptr;
4298
4299     return(ret);
4300 }
4301
4302 /*
4303 - reguni - emit (if appropriate) a Unicode character
4304 */
4305 STATIC void
4306 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4307 {
4308     *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4309 }
4310
4311 /*
4312 - reginsert - insert an operator in front of already-emitted operand
4313 *
4314 * Means relocating the operand.
4315 */
4316 STATIC void
4317 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4318 {
4319     register regnode *src;
4320     register regnode *dst;
4321     register regnode *place;
4322     register int offset = regarglen[(U8)op];
4323
4324 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4325
4326     if (SIZE_ONLY) {
4327         RExC_size += NODE_STEP_REGNODE + offset;
4328         return;
4329     }
4330
4331     src = RExC_emit;
4332     RExC_emit += NODE_STEP_REGNODE + offset;
4333     dst = RExC_emit;
4334     while (src > opnd) {
4335         StructCopy(--src, --dst, regnode);
4336         if (RExC_offsets) {     /* MJD 20010112 */
4337           MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n", 
4338                   "reg_insert",
4339                   dst - RExC_emit_start > RExC_offsets[0] 
4340                   ? "Overwriting end of array!\n" : "OK",
4341                   src - RExC_emit_start,
4342                   dst - RExC_emit_start,
4343                   RExC_offsets[0])); 
4344           Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4345           Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4346         }
4347     }
4348     
4349
4350     place = opnd;               /* Op node, where operand used to be. */
4351     if (RExC_offsets) {         /* MJD */
4352       MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n", 
4353               "reginsert",
4354               place - RExC_emit_start > RExC_offsets[0] 
4355               ? "Overwriting end of array!\n" : "OK",
4356               place - RExC_emit_start,
4357               RExC_parse - RExC_start,
4358               RExC_offsets[0])); 
4359       Set_Node_Offset(place, RExC_parse);
4360     }
4361     src = NEXTOPER(place);
4362     FILL_ADVANCE_NODE(place, op);
4363     Zero(src, offset, regnode);
4364 }
4365
4366 /*
4367 - regtail - set the next-pointer at the end of a node chain of p to val.
4368 */
4369 STATIC void
4370 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4371 {
4372     register regnode *scan;
4373     register regnode *temp;
4374
4375     if (SIZE_ONLY)
4376         return;
4377
4378     /* Find last node. */
4379     scan = p;
4380     for (;;) {
4381         temp = regnext(scan);
4382         if (temp == NULL)
4383             break;
4384         scan = temp;
4385     }
4386
4387     if (reg_off_by_arg[OP(scan)]) {
4388         ARG_SET(scan, val - scan);
4389     }
4390     else {
4391         NEXT_OFF(scan) = val - scan;
4392     }
4393 }
4394
4395 /*
4396 - regoptail - regtail on operand of first argument; nop if operandless
4397 */
4398 STATIC void
4399 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4400 {
4401     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4402     if (p == NULL || SIZE_ONLY)
4403         return;
4404     if (PL_regkind[(U8)OP(p)] == BRANCH) {
4405         regtail(pRExC_state, NEXTOPER(p), val);
4406     }
4407     else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4408         regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4409     }
4410     else
4411         return;
4412 }
4413
4414 /*
4415  - regcurly - a little FSA that accepts {\d+,?\d*}
4416  */
4417 STATIC I32
4418 S_regcurly(pTHX_ register char *s)
4419 {
4420     if (*s++ != '{')
4421         return FALSE;
4422     if (!isDIGIT(*s))
4423         return FALSE;
4424     while (isDIGIT(*s))
4425         s++;
4426     if (*s == ',')
4427         s++;
4428     while (isDIGIT(*s))
4429         s++;
4430     if (*s != '}')
4431         return FALSE;
4432     return TRUE;
4433 }
4434
4435
4436 #ifdef DEBUGGING
4437
4438 STATIC regnode *
4439 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4440 {
4441     register U8 op = EXACT;     /* Arbitrary non-END op. */
4442     register regnode *next;
4443
4444     while (op != END && (!last || node < last)) {
4445         /* While that wasn't END last time... */
4446
4447         NODE_ALIGN(node);
4448         op = OP(node);
4449         if (op == CLOSE)
4450             l--;        
4451         next = regnext(node);
4452         /* Where, what. */
4453         if (OP(node) == OPTIMIZED)
4454             goto after_print;
4455         regprop(sv, node);
4456         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4457                       (int)(2*l + 1), "", SvPVX(sv));
4458         if (next == NULL)               /* Next ptr. */
4459             PerlIO_printf(Perl_debug_log, "(0)");
4460         else
4461             PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4462         (void)PerlIO_putc(Perl_debug_log, '\n');
4463       after_print:
4464         if (PL_regkind[(U8)op] == BRANCHJ) {
4465             register regnode *nnode = (OP(next) == LONGJMP
4466                                        ? regnext(next)
4467                                        : next);
4468             if (last && nnode > last)
4469                 nnode = last;
4470             node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4471         }
4472         else if (PL_regkind[(U8)op] == BRANCH) {
4473             node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4474         }
4475         else if ( op == CURLY) {   /* `next' might be very big: optimizer */
4476             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4477                              NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4478         }
4479         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4480             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4481                              next, sv, l + 1);
4482         }
4483         else if ( op == PLUS || op == STAR) {
4484             node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4485         }
4486         else if (op == ANYOF) {
4487             /* arglen 1 + class block */
4488             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4489                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4490             node = NEXTOPER(node);
4491         }
4492         else if (PL_regkind[(U8)op] == EXACT) {
4493             /* Literal string, where present. */
4494             node += NODE_SZ_STR(node) - 1;
4495             node = NEXTOPER(node);
4496         }
4497         else {
4498             node = NEXTOPER(node);
4499             node += regarglen[(U8)op];
4500         }
4501         if (op == CURLYX || op == OPEN)
4502             l++;
4503         else if (op == WHILEM)
4504             l--;
4505     }
4506     return node;
4507 }
4508
4509 #endif  /* DEBUGGING */
4510
4511 /*
4512  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4513  */
4514 void
4515 Perl_regdump(pTHX_ regexp *r)
4516 {
4517 #ifdef DEBUGGING
4518     SV *sv = sv_newmortal();
4519
4520     (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4521
4522     /* Header fields of interest. */
4523     if (r->anchored_substr)
4524         PerlIO_printf(Perl_debug_log,
4525                       "anchored `%s%.*s%s'%s at %"IVdf" ",
4526                       PL_colors[0],
4527                       (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4528                       SvPVX(r->anchored_substr),
4529                       PL_colors[1],
4530                       SvTAIL(r->anchored_substr) ? "$" : "",
4531                       (IV)r->anchored_offset);
4532     if (r->float_substr)
4533         PerlIO_printf(Perl_debug_log,
4534                       "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4535                       PL_colors[0],
4536                       (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4537                       SvPVX(r->float_substr),
4538                       PL_colors[1],
4539                       SvTAIL(r->float_substr) ? "$" : "",
4540                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4541     if (r->check_substr)
4542         PerlIO_printf(Perl_debug_log,
4543                       r->check_substr == r->float_substr
4544                       ? "(checking floating" : "(checking anchored");
4545     if (r->reganch & ROPT_NOSCAN)
4546         PerlIO_printf(Perl_debug_log, " noscan");
4547     if (r->reganch & ROPT_CHECK_ALL)
4548         PerlIO_printf(Perl_debug_log, " isall");
4549     if (r->check_substr)
4550         PerlIO_printf(Perl_debug_log, ") ");
4551
4552     if (r->regstclass) {
4553         regprop(sv, r->regstclass);
4554         PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4555     }
4556     if (r->reganch & ROPT_ANCH) {
4557         PerlIO_printf(Perl_debug_log, "anchored");
4558         if (r->reganch & ROPT_ANCH_BOL)
4559             PerlIO_printf(Perl_debug_log, "(BOL)");
4560         if (r->reganch & ROPT_ANCH_MBOL)
4561             PerlIO_printf(Perl_debug_log, "(MBOL)");
4562         if (r->reganch & ROPT_ANCH_SBOL)
4563             PerlIO_printf(Perl_debug_log, "(SBOL)");
4564         if (r->reganch & ROPT_ANCH_GPOS)
4565             PerlIO_printf(Perl_debug_log, "(GPOS)");
4566         PerlIO_putc(Perl_debug_log, ' ');
4567     }
4568     if (r->reganch & ROPT_GPOS_SEEN)
4569         PerlIO_printf(Perl_debug_log, "GPOS ");
4570     if (r->reganch & ROPT_SKIP)
4571         PerlIO_printf(Perl_debug_log, "plus ");
4572     if (r->reganch & ROPT_IMPLICIT)
4573         PerlIO_printf(Perl_debug_log, "implicit ");
4574     PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4575     if (r->reganch & ROPT_EVAL_SEEN)
4576         PerlIO_printf(Perl_debug_log, "with eval ");
4577     PerlIO_printf(Perl_debug_log, "\n");
4578     if (r->offsets) {
4579       U32 i;
4580       U32 len = r->offsets[0];
4581       PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4582       for (i = 1; i <= len; i++)
4583         PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ", 
4584                       (UV)r->offsets[i*2-1], 
4585                       (UV)r->offsets[i*2]);
4586       PerlIO_printf(Perl_debug_log, "\n");
4587     }
4588 #endif  /* DEBUGGING */
4589 }
4590
4591 #ifdef DEBUGGING
4592
4593 STATIC void
4594 S_put_byte(pTHX_ SV *sv, int c)
4595 {
4596     if (isCNTRL(c) || c == 255 || !isPRINT(c))
4597         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4598     else if (c == '-' || c == ']' || c == '\\' || c == '^')
4599         Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4600     else
4601         Perl_sv_catpvf(aTHX_ sv, "%c", c);
4602 }
4603
4604 #endif  /* DEBUGGING */
4605
4606 /*
4607 - regprop - printable representation of opcode
4608 */
4609 void
4610 Perl_regprop(pTHX_ SV *sv, regnode *o)
4611 {
4612 #ifdef DEBUGGING
4613     register int k;
4614
4615     sv_setpvn(sv, "", 0);
4616     if (OP(o) >= reg_num)               /* regnode.type is unsigned */
4617         /* It would be nice to FAIL() here, but this may be called from
4618            regexec.c, and it would be hard to supply pRExC_state. */
4619         Perl_croak(aTHX_ "Corrupted regexp opcode");
4620     sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4621
4622     k = PL_regkind[(U8)OP(o)];
4623
4624     if (k == EXACT) {
4625         SV *dsv = sv_2mortal(newSVpvn("", 0));
4626         /* Using is_utf8_string() is a crude hack but it may
4627          * be the best for now since we have no flag "this EXACTish
4628          * node was UTF-8" --jhi */
4629         bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4630         char *s    = do_utf8 ?
4631           pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4632                          UNI_DISPLAY_REGEX) :
4633           STRING(o);
4634         int len = do_utf8 ?
4635           strlen(s) :
4636           STR_LEN(o);
4637         Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4638                        PL_colors[0],
4639                        len, s,
4640                        PL_colors[1]);
4641     }
4642     else if (k == CURLY) {
4643         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4644             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4645         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4646     }
4647     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
4648         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4649     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4650         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
4651     else if (k == LOGICAL)
4652         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
4653     else if (k == ANYOF) {
4654         int i, rangestart = -1;
4655         U8 flags = ANYOF_FLAGS(o);
4656         const char * const anyofs[] = { /* Should be syncronized with
4657                                          * ANYOF_ #xdefines in regcomp.h */
4658             "\\w",
4659             "\\W",
4660             "\\s",
4661             "\\S",
4662             "\\d",
4663             "\\D",
4664             "[:alnum:]",
4665             "[:^alnum:]",
4666             "[:alpha:]",
4667             "[:^alpha:]",
4668             "[:ascii:]",
4669             "[:^ascii:]",
4670             "[:ctrl:]",
4671             "[:^ctrl:]",
4672             "[:graph:]",
4673             "[:^graph:]",
4674             "[:lower:]",
4675             "[:^lower:]",
4676             "[:print:]",
4677             "[:^print:]",
4678             "[:punct:]",
4679             "[:^punct:]",
4680             "[:upper:]",
4681             "[:^upper:]",
4682             "[:xdigit:]",
4683             "[:^xdigit:]",
4684             "[:space:]",
4685             "[:^space:]",
4686             "[:blank:]",
4687             "[:^blank:]"
4688         };
4689
4690         if (flags & ANYOF_LOCALE)
4691             sv_catpv(sv, "{loc}");
4692         if (flags & ANYOF_FOLD)
4693             sv_catpv(sv, "{i}");
4694         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4695         if (flags & ANYOF_INVERT)
4696             sv_catpv(sv, "^");
4697         for (i = 0; i <= 256; i++) {
4698             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4699                 if (rangestart == -1)
4700                     rangestart = i;
4701             } else if (rangestart != -1) {
4702                 if (i <= rangestart + 3)
4703                     for (; rangestart < i; rangestart++)
4704                         put_byte(sv, rangestart);
4705                 else {
4706                     put_byte(sv, rangestart);
4707                     sv_catpv(sv, "-");
4708                     put_byte(sv, i - 1);
4709                 }
4710                 rangestart = -1;
4711             }
4712         }
4713
4714         if (o->flags & ANYOF_CLASS)
4715             for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4716                 if (ANYOF_CLASS_TEST(o,i))
4717                     sv_catpv(sv, anyofs[i]);
4718
4719         if (flags & ANYOF_UNICODE)
4720             sv_catpv(sv, "{unicode}");
4721         else if (flags & ANYOF_UNICODE_ALL)
4722             sv_catpv(sv, "{unicode_all}");
4723
4724         {
4725             SV *lv;
4726             SV *sw = regclass_swash(o, FALSE, &lv, 0);
4727         
4728             if (lv) {
4729                 if (sw) {
4730                     UV i;
4731                     U8 s[UTF8_MAXLEN+1];
4732                 
4733                     for (i = 0; i <= 256; i++) { /* just the first 256 */
4734                         U8 *e = uvchr_to_utf8(s, i);
4735                         
4736                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
4737                             if (rangestart == -1)
4738                                 rangestart = i;
4739                         } else if (rangestart != -1) {
4740                             U8 *p;
4741                         
4742                             if (i <= rangestart + 3)
4743                                 for (; rangestart < i; rangestart++) {
4744                                     for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4745                                         put_byte(sv, *p);
4746                                 }
4747                             else {
4748                                 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4749                                     put_byte(sv, *p);
4750                                 sv_catpv(sv, "-");
4751                                     for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4752                                         put_byte(sv, *p);
4753                                 }
4754                                 rangestart = -1;
4755                             }
4756                         }
4757                         
4758                     sv_catpv(sv, "..."); /* et cetera */
4759                 }
4760
4761                 {
4762                     char *s = savepv(SvPVX(lv));
4763                     char *origs = s;
4764                 
4765                     while(*s && *s != '\n') s++;
4766                 
4767                     if (*s == '\n') {
4768                         char *t = ++s;
4769                         
4770                         while (*s) {
4771                             if (*s == '\n')
4772                                 *s = ' ';
4773                             s++;
4774                         }
4775                         if (s[-1] == ' ')
4776                             s[-1] = 0;
4777                         
4778                         sv_catpv(sv, t);
4779                     }
4780                 
4781                     Safefree(origs);
4782                 }
4783             }
4784         }
4785
4786         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4787     }
4788     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4789         Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4790 #endif  /* DEBUGGING */
4791 }
4792
4793 SV *
4794 Perl_re_intuit_string(pTHX_ regexp *prog)
4795 {                               /* Assume that RE_INTUIT is set */
4796     DEBUG_r(
4797         {   STRLEN n_a;
4798             char *s = SvPV(prog->check_substr,n_a);
4799
4800             if (!PL_colorset) reginitcolors();
4801             PerlIO_printf(Perl_debug_log,
4802                       "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4803                       PL_colors[4],PL_colors[5],PL_colors[0],
4804                       s,
4805                       PL_colors[1],
4806                       (strlen(s) > 60 ? "..." : ""));
4807         } );
4808
4809     return prog->check_substr;
4810 }
4811
4812 void
4813 Perl_pregfree(pTHX_ struct regexp *r)
4814 {
4815 #ifdef DEBUGGING
4816     SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4817 #endif
4818
4819     if (!r || (--r->refcnt > 0))
4820         return;
4821     DEBUG_r({
4822          char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60,
4823                                   UNI_DISPLAY_REGEX);
4824          int len = SvCUR(dsv);
4825          if (!PL_colorset)
4826               reginitcolors();
4827          PerlIO_printf(Perl_debug_log,
4828                        "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4829                        PL_colors[4],PL_colors[5],PL_colors[0],
4830                        len, len, s,
4831                        PL_colors[1],
4832                        len > 60 ? "..." : "");
4833     });
4834
4835     if (r->precomp)
4836         Safefree(r->precomp);
4837     if (r->offsets)             /* 20010421 MJD */
4838         Safefree(r->offsets);
4839     if (RX_MATCH_COPIED(r))
4840         Safefree(r->subbeg);
4841     if (r->substrs) {
4842         if (r->anchored_substr)
4843             SvREFCNT_dec(r->anchored_substr);
4844         if (r->float_substr)
4845             SvREFCNT_dec(r->float_substr);
4846         Safefree(r->substrs);
4847     }
4848     if (r->data) {
4849         int n = r->data->count;
4850         AV* new_comppad = NULL;
4851         AV* old_comppad;
4852         SV** old_curpad;
4853
4854         while (--n >= 0) {
4855           /* If you add a ->what type here, update the comment in regcomp.h */
4856             switch (r->data->what[n]) {
4857             case 's':
4858                 SvREFCNT_dec((SV*)r->data->data[n]);
4859                 break;
4860             case 'f':
4861                 Safefree(r->data->data[n]);
4862                 break;
4863             case 'p':
4864                 new_comppad = (AV*)r->data->data[n];
4865                 break;
4866             case 'o':
4867                 if (new_comppad == NULL)
4868                     Perl_croak(aTHX_ "panic: pregfree comppad");
4869                 old_comppad = PL_comppad;
4870                 old_curpad = PL_curpad;
4871                 /* Watch out for global destruction's random ordering. */
4872                 if (SvTYPE(new_comppad) == SVt_PVAV) {
4873                     PL_comppad = new_comppad;
4874                     PL_curpad = AvARRAY(new_comppad);
4875                 }
4876                 else
4877                     PL_curpad = NULL;
4878
4879                 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4880                     op_free((OP_4tree*)r->data->data[n]);
4881                 }
4882
4883                 PL_comppad = old_comppad;
4884                 PL_curpad = old_curpad;
4885                 SvREFCNT_dec((SV*)new_comppad);
4886                 new_comppad = NULL;
4887                 break;
4888             case 'n':
4889                 break;
4890             default:
4891                 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4892             }
4893         }
4894         Safefree(r->data->what);
4895         Safefree(r->data);
4896     }
4897     Safefree(r->startp);
4898     Safefree(r->endp);
4899     Safefree(r);
4900 }
4901
4902 /*
4903  - regnext - dig the "next" pointer out of a node
4904  *
4905  * [Note, when REGALIGN is defined there are two places in regmatch()
4906  * that bypass this code for speed.]
4907  */
4908 regnode *
4909 Perl_regnext(pTHX_ register regnode *p)
4910 {
4911     register I32 offset;
4912
4913     if (p == &PL_regdummy)
4914         return(NULL);
4915
4916     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4917     if (offset == 0)
4918         return(NULL);
4919
4920     return(p+offset);
4921 }
4922
4923 STATIC void     
4924 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
4925 {
4926     va_list args;
4927     STRLEN l1 = strlen(pat1);
4928     STRLEN l2 = strlen(pat2);
4929     char buf[512];
4930     SV *msv;
4931     char *message;
4932
4933     if (l1 > 510)
4934         l1 = 510;
4935     if (l1 + l2 > 510)
4936         l2 = 510 - l1;
4937     Copy(pat1, buf, l1 , char);
4938     Copy(pat2, buf + l1, l2 , char);
4939     buf[l1 + l2] = '\n';
4940     buf[l1 + l2 + 1] = '\0';
4941 #ifdef I_STDARG
4942     /* ANSI variant takes additional second argument */
4943     va_start(args, pat2);
4944 #else
4945     va_start(args);
4946 #endif
4947     msv = vmess(buf, &args);
4948     va_end(args);
4949     message = SvPV(msv,l1);
4950     if (l1 > 512)
4951         l1 = 512;
4952     Copy(message, buf, l1 , char);
4953     buf[l1] = '\0';                     /* Overwrite \n */
4954     Perl_croak(aTHX_ "%s", buf);
4955 }
4956
4957 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
4958
4959 void
4960 Perl_save_re_context(pTHX)
4961 {
4962 #if 0
4963     SAVEPPTR(RExC_precomp);             /* uncompiled string. */
4964     SAVEI32(RExC_npar);         /* () count. */
4965     SAVEI32(RExC_size);         /* Code size. */
4966     SAVEI16(RExC_flags16);              /* are we folding, multilining? */
4967     SAVEVPTR(RExC_rx);          /* from regcomp.c */
4968     SAVEI32(RExC_seen);         /* from regcomp.c */
4969     SAVEI32(RExC_sawback);              /* Did we see \1, ...? */
4970     SAVEI32(RExC_naughty);              /* How bad is this pattern? */
4971     SAVEVPTR(RExC_emit);                /* Code-emit pointer; &regdummy = don't */
4972     SAVEPPTR(RExC_end);         /* End of input for compile */
4973     SAVEPPTR(RExC_parse);               /* Input-scan pointer. */
4974 #endif
4975
4976     SAVEI32(PL_reg_flags);              /* from regexec.c */
4977     SAVEPPTR(PL_bostr);
4978     SAVEPPTR(PL_reginput);              /* String-input pointer. */
4979     SAVEPPTR(PL_regbol);                /* Beginning of input, for ^ check. */
4980     SAVEPPTR(PL_regeol);                /* End of input, for $ check. */
4981     SAVEVPTR(PL_regstartp);             /* Pointer to startp array. */
4982     SAVEVPTR(PL_regendp);               /* Ditto for endp. */
4983     SAVEVPTR(PL_reglastparen);          /* Similarly for lastparen. */
4984     SAVEPPTR(PL_regtill);               /* How far we are required to go. */
4985     SAVEGENERICPV(PL_reg_start_tmp);            /* from regexec.c */
4986     PL_reg_start_tmp = 0;
4987     SAVEI32(PL_reg_start_tmpl);         /* from regexec.c */
4988     PL_reg_start_tmpl = 0;
4989     SAVEVPTR(PL_regdata);
4990     SAVEI32(PL_reg_eval_set);           /* from regexec.c */
4991     SAVEI32(PL_regnarrate);             /* from regexec.c */
4992     SAVEVPTR(PL_regprogram);            /* from regexec.c */
4993     SAVEINT(PL_regindent);              /* from regexec.c */
4994     SAVEVPTR(PL_regcc);                 /* from regexec.c */
4995     SAVEVPTR(PL_curcop);
4996     SAVEVPTR(PL_reg_call_cc);           /* from regexec.c */
4997     SAVEVPTR(PL_reg_re);                /* from regexec.c */
4998     SAVEPPTR(PL_reg_ganch);             /* from regexec.c */
4999     SAVESPTR(PL_reg_sv);                /* from regexec.c */
5000     SAVEI8(PL_reg_match_utf8);          /* from regexec.c */
5001     SAVEVPTR(PL_reg_magic);             /* from regexec.c */
5002     SAVEI32(PL_reg_oldpos);                     /* from regexec.c */
5003     SAVEVPTR(PL_reg_oldcurpm);          /* from regexec.c */
5004     SAVEVPTR(PL_reg_curpm);             /* from regexec.c */
5005     SAVEI32(PL_regnpar);                /* () count. */
5006     SAVEI32(PL_regsize);                /* from regexec.c */
5007 #ifdef DEBUGGING
5008     SAVEPPTR(PL_reg_starttry);          /* from regexec.c */
5009 #endif
5010 }
5011
5012 static void
5013 clear_re(pTHX_ void *r)
5014 {
5015     ReREFCNT_dec((regexp *)r);
5016 }
5017