43c46bb2bb03ed91b65f4bb8ae50bc401f7d9f75
[p5sagit/p5-mst-13.2.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9  * confused with the original package (see point 3 below).  Thanks, Henry!
10  */
11
12 /* Additional note: this code is very heavily munged from Henry's version
13  * in places.  In some spots I've traded clarity for efficiency, so don't
14  * blame Henry for some of the lack of readability.
15  */
16
17 /* The names of the functions have been changed from regcomp and
18  * regexec to  pregcomp and pregexec in order to avoid conflicts
19  * with the POSIX routines of the same names.
20 */
21
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 #  ifndef PERL_IN_XSUB_RE
25 #    define PERL_IN_XSUB_RE
26 #  endif
27 /* need access to debugger hooks */
28 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29 #    define DEBUGGING
30 #  endif
31 #endif
32
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 #  define Perl_pregcomp my_regcomp
36 #  define Perl_regdump my_regdump
37 #  define Perl_regprop my_regprop
38 #  define Perl_pregfree my_regfree
39 #  define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 #  define Perl_regnext my_regnext
42 #  define Perl_save_re_context my_save_re_context
43 #  define Perl_reginitcolors my_reginitcolors
44
45 #  define PERL_NO_GET_CONTEXT
46 #endif
47
48 /*SUPPRESS 112*/
49 /*
50  * pregcomp and pregexec -- regsub and regerror are not used in perl
51  *
52  *      Copyright (c) 1986 by University of Toronto.
53  *      Written by Henry Spencer.  Not derived from licensed software.
54  *
55  *      Permission is granted to anyone to use this software for any
56  *      purpose on any computer system, and to redistribute it freely,
57  *      subject to the following restrictions:
58  *
59  *      1. The author is not responsible for the consequences of use of
60  *              this software, no matter how awful, even if they arise
61  *              from defects in it.
62  *
63  *      2. The origin of this software must not be misrepresented, either
64  *              by explicit claim or by omission.
65  *
66  *      3. Altered versions must be plainly marked as such, and must not
67  *              be misrepresented as being the original software.
68  *
69  *
70  ****    Alterations to Henry's code are...
71  ****
72  ****    Copyright (c) 1991-2002, Larry Wall
73  ****
74  ****    You may distribute under the terms of either the GNU General Public
75  ****    License or the Artistic License, as specified in the README file.
76
77  *
78  * Beware that some of this code is subtly aware of the way operator
79  * precedence is structured in regular expressions.  Serious changes in
80  * regular-expression syntax might require a total rethink.
81  */
82 #include "EXTERN.h"
83 #define PERL_IN_REGCOMP_C
84 #include "perl.h"
85
86 #ifndef PERL_IN_XSUB_RE
87 #  include "INTERN.h"
88 #endif
89
90 #define REG_COMP_C
91 #include "regcomp.h"
92
93 #ifdef op
94 #undef op
95 #endif /* op */
96
97 #ifdef MSDOS
98 # if defined(BUGGY_MSC6)
99  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100  # pragma optimize("a",off)
101  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102  # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
104 #endif /* MSDOS */
105
106 #ifndef STATIC
107 #define STATIC  static
108 #endif
109
110 typedef struct RExC_state_t {
111     U16         flags16;                /* are we folding, multilining? */
112     char        *precomp;               /* uncompiled string. */
113     regexp      *rx;
114     char        *start;                 /* Start of input for compile */
115     char        *end;                   /* End of input for compile */
116     char        *parse;                 /* Input-scan pointer. */
117     I32         whilem_seen;            /* number of WHILEM in this expr */
118     regnode     *emit_start;            /* Start of emitted-code area */
119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
120     I32         naughty;                /* How bad is this pattern? */
121     I32         sawback;                /* Did we see \1, ...? */
122     U32         seen;
123     I32         size;                   /* Code size. */
124     I32         npar;                   /* () count. */
125     I32         extralen;
126     I32         seen_zerolen;
127     I32         seen_evals;
128     I32         utf8;
129 #if ADD_TO_REGEXEC
130     char        *starttry;              /* -Dr: where regtry was called. */
131 #define RExC_starttry   (pRExC_state->starttry)
132 #endif
133 } RExC_state_t;
134
135 #define RExC_flags16    (pRExC_state->flags16)
136 #define RExC_precomp    (pRExC_state->precomp)
137 #define RExC_rx         (pRExC_state->rx)
138 #define RExC_start      (pRExC_state->start)
139 #define RExC_end        (pRExC_state->end)
140 #define RExC_parse      (pRExC_state->parse)
141 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
142 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit       (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty    (pRExC_state->naughty)
146 #define RExC_sawback    (pRExC_state->sawback)
147 #define RExC_seen       (pRExC_state->seen)
148 #define RExC_size       (pRExC_state->size)
149 #define RExC_npar       (pRExC_state->npar)
150 #define RExC_extralen   (pRExC_state->extralen)
151 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8       (pRExC_state->utf8)
154
155 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157         ((*s) == '{' && regcurly(s)))
158
159 #ifdef SPSTART
160 #undef SPSTART          /* dratted cpp namespace... */
161 #endif
162 /*
163  * Flags to be passed up and down.
164  */
165 #define WORST           0       /* Worst case. */
166 #define HASWIDTH        0x1     /* Known to match non-null strings. */
167 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART         0x4     /* Starts with * or +. */
169 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
170
171 /* Length of a variant. */
172
173 typedef struct scan_data_t {
174     I32 len_min;
175     I32 len_delta;
176     I32 pos_min;
177     I32 pos_delta;
178     SV *last_found;
179     I32 last_end;                       /* min value, <0 unless valid. */
180     I32 last_start_min;
181     I32 last_start_max;
182     SV **longest;                       /* Either &l_fixed, or &l_float. */
183     SV *longest_fixed;
184     I32 offset_fixed;
185     SV *longest_float;
186     I32 offset_float_min;
187     I32 offset_float_max;
188     I32 flags;
189     I32 whilem_c;
190     I32 *last_closep;
191     struct regnode_charclass_class *start_class;
192 } scan_data_t;
193
194 /*
195  * Forward declarations for pregcomp()'s friends.
196  */
197
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199                                       0, 0, 0, 0, 0, 0};
200
201 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL          0x1
203 #define SF_BEFORE_MEOL          0x2
204 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
207 #ifdef NO_UNARY_PLUS
208 #  define SF_FIX_SHIFT_EOL      (0+2)
209 #  define SF_FL_SHIFT_EOL               (0+4)
210 #else
211 #  define SF_FIX_SHIFT_EOL      (+2)
212 #  define SF_FL_SHIFT_EOL               (+4)
213 #endif
214
215 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF               0x40
221 #define SF_HAS_PAR              0x80
222 #define SF_IN_PAR               0x100
223 #define SF_HAS_EVAL             0x200
224 #define SCF_DO_SUBSTR           0x400
225 #define SCF_DO_STCLASS_AND      0x0800
226 #define SCF_DO_STCLASS_OR       0x1000
227 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS  0x2000
229
230 #define UTF RExC_utf8
231 #define LOC (RExC_flags16 & PMf_LOCALE)
232 #define FOLD (RExC_flags16 & PMf_FOLD)
233
234 #define OOB_UNICODE             12345678
235 #define OOB_NAMEDCLASS          -1
236
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
240
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
243
244 /*
245  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247  * op/pragma/warn/regcomp.
248  */
249 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
251
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
253
254 /*
255  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256  * arg. Show regex, up to a maximum length. If it's too long, chop and add
257  * "...".
258  */
259 #define FAIL(msg)                                                             \
260     STMT_START {                                                             \
261         char *ellipses = "";                                                 \
262         IV len = RExC_end - RExC_precomp;                                \
263                                                                              \
264         if (!SIZE_ONLY)                                                      \
265             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
266                                                                              \
267         if (len > RegexLengthToShowInErrorMessages) {                        \
268             /* chop 10 shorter than the max, to ensure meaning of "..." */   \
269             len = RegexLengthToShowInErrorMessages - 10;                     \
270             ellipses = "...";                                                \
271         }                                                                    \
272         Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                            \
273                    msg, (int)len, RExC_precomp, ellipses);                  \
274     } STMT_END
275
276 /*
277  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278  * args. Show regex, up to a maximum length. If it's too long, chop and add
279  * "...".
280  */
281 #define FAIL2(pat,msg)                                                        \
282     STMT_START {                                                             \
283         char *ellipses = "";                                                 \
284         IV len = RExC_end - RExC_precomp;                                \
285                                                                              \
286         if (!SIZE_ONLY)                                                      \
287             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
288                                                                              \
289         if (len > RegexLengthToShowInErrorMessages) {                        \
290             /* chop 10 shorter than the max, to ensure meaning of "..." */   \
291             len = RegexLengthToShowInErrorMessages - 10;                     \
292             ellipses = "...";                                                \
293         }                                                                    \
294         S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                        \
295                     msg, (int)len, RExC_precomp, ellipses);                \
296     } STMT_END
297
298
299 /*
300  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301  */
302 #define Simple_vFAIL(m)                                                      \
303     STMT_START {                                                             \
304       IV offset = RExC_parse - RExC_precomp; \
305                                                                              \
306       Perl_croak(aTHX_ "%s" REPORT_LOCATION,               \
307                  m, (int)offset, RExC_precomp, RExC_precomp + offset);     \
308     } STMT_END
309
310 /*
311  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312  */
313 #define vFAIL(m)                                                             \
314     STMT_START {                                                             \
315       if (!SIZE_ONLY)                                                        \
316             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
317       Simple_vFAIL(m);                                                       \
318     } STMT_END
319
320 /*
321  * Like Simple_vFAIL(), but accepts two arguments.
322  */
323 #define Simple_vFAIL2(m,a1)                                                  \
324     STMT_START {                                                             \
325       IV offset = RExC_parse - RExC_precomp; \
326                                                                              \
327       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,       \
328                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
329     } STMT_END
330
331 /*
332  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333  */
334 #define vFAIL2(m,a1)                                                         \
335     STMT_START {                                                             \
336       if (!SIZE_ONLY)                                                        \
337             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
338       Simple_vFAIL2(m, a1);                                                  \
339     } STMT_END
340
341
342 /*
343  * Like Simple_vFAIL(), but accepts three arguments.
344  */
345 #define Simple_vFAIL3(m, a1, a2)                                             \
346     STMT_START {                                                             \
347       IV offset = RExC_parse - RExC_precomp; \
348                                                                              \
349       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,   \
350                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
351     } STMT_END
352
353 /*
354  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355  */
356 #define vFAIL3(m,a1,a2)                                                      \
357     STMT_START {                                                             \
358       if (!SIZE_ONLY)                                                        \
359             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
360       Simple_vFAIL3(m, a1, a2);                                              \
361     } STMT_END
362
363 /*
364  * Like Simple_vFAIL(), but accepts four arguments.
365  */
366 #define Simple_vFAIL4(m, a1, a2, a3)                                         \
367     STMT_START {                                                             \
368       IV offset = RExC_parse - RExC_precomp; \
369                                                                              \
370       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
371                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
372     } STMT_END
373
374 /*
375  * Like Simple_vFAIL(), but accepts five arguments.
376  */
377 #define Simple_vFAIL5(m, a1, a2, a3, a4)                                     \
378     STMT_START {                                                             \
379       IV offset = RExC_parse - RExC_precomp; \
380       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
381                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
382     } STMT_END
383
384
385 #define vWARN(loc,m)                                                         \
386     STMT_START {                                                             \
387         IV offset = loc - RExC_precomp;          \
388         Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\
389                  m, (int)offset, RExC_precomp, RExC_precomp + offset);          \
390     } STMT_END                                                               \
391
392 #define vWARNdep(loc,m)                                                         \
393     STMT_START {                                                             \
394         IV offset = loc - RExC_precomp;          \
395         Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\
396                  m, (int)offset, RExC_precomp, RExC_precomp + offset);          \
397     } STMT_END                                                               \
398
399
400 #define vWARN2(loc, m, a1)                                                   \
401     STMT_START {                                                             \
402         IV offset = loc - RExC_precomp;          \
403         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
404                  a1,                                                         \
405                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
406     } STMT_END
407
408 #define vWARN3(loc, m, a1, a2)                                               \
409     STMT_START {                                                             \
410       IV offset = loc - RExC_precomp;        \
411         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                    \
412                  a1, a2,                                                     \
413                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
414     } STMT_END
415
416 #define vWARN4(loc, m, a1, a2, a3)                                           \
417     STMT_START {                                                             \
418       IV offset = loc - RExC_precomp;            \
419         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
420                  a1, a2, a3,                                                 \
421                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
422     } STMT_END
423
424 /* used for the parse_flags section for (?c) -- japhy */
425 #define vWARN5(loc, m, a1, a2, a3, a4)                                       \
426   STMT_START {                                                   \
427       IV offset = loc - RExC_precomp;   \
428         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
429                  a1, a2, a3, a4,                                 \
430                  (int)offset, RExC_precomp, RExC_precomp + offset);  \
431     } STMT_END
432
433
434 /* Allow for side effects in s */
435 #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
436
437 /* Macros for recording node offsets.   20001227 mjd@plover.com 
438  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
439  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
440  * Element 0 holds the number n.
441  */
442
443 #define MJD_OFFSET_DEBUG(x)
444 /* #define MJD_OFFSET_DEBUG(x) fprintf x */
445
446
447 #  define Set_Node_Offset_To_R(node,byte)                           \
448    STMT_START {                                        \
449      if (! SIZE_ONLY) {                                  \
450        if((node) < 0) {                   \
451          Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
452        } else {                                                        \
453          RExC_offsets[2*(node)-1] = (byte);                               \
454        }                                                               \
455      }                                                                 \
456    } STMT_END
457
458 #  define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
459 #  define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
460
461 #  define Set_Node_Length_To_R(node,len)                            \
462    STMT_START {                                        \
463      if (! SIZE_ONLY) {                                  \
464        MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
465        if((node) < 0) {                   \
466          Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
467        } else {                                                        \
468          RExC_offsets[2*(node)] = (len);                               \
469        }                                                               \
470      }                                                                 \
471    } STMT_END
472
473 #  define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
474 #  define Set_Cur_Node_Length(len)  Set_Node_Length(RExC_emit, len)
475 #  define Set_Node_Cur_Length(node)   Set_Node_Length(node, RExC_parse - parse_start)
476
477 /* Get offsets and lengths */
478 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
479 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
480
481 static void clear_re(pTHX_ void *r);
482
483 /* Mark that we cannot extend a found fixed substring at this point.
484    Updata the longest found anchored substring and the longest found
485    floating substrings if needed. */
486
487 STATIC void
488 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
489 {
490     STRLEN l = CHR_SVLEN(data->last_found);
491     STRLEN old_l = CHR_SVLEN(*data->longest);
492
493     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
494         sv_setsv(*data->longest, data->last_found);
495         if (*data->longest == data->longest_fixed) {
496             data->offset_fixed = l ? data->last_start_min : data->pos_min;
497             if (data->flags & SF_BEFORE_EOL)
498                 data->flags
499                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
500             else
501                 data->flags &= ~SF_FIX_BEFORE_EOL;
502         }
503         else {
504             data->offset_float_min = l ? data->last_start_min : data->pos_min;
505             data->offset_float_max = (l
506                                       ? data->last_start_max
507                                       : data->pos_min + data->pos_delta);
508             if (data->flags & SF_BEFORE_EOL)
509                 data->flags
510                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
511             else
512                 data->flags &= ~SF_FL_BEFORE_EOL;
513         }
514     }
515     SvCUR_set(data->last_found, 0);
516     data->last_end = -1;
517     data->flags &= ~SF_BEFORE_EOL;
518 }
519
520 /* Can match anything (initialization) */
521 STATIC void
522 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
523 {
524     ANYOF_CLASS_ZERO(cl);
525     ANYOF_BITMAP_SETALL(cl);
526     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
527     if (LOC)
528         cl->flags |= ANYOF_LOCALE;
529 }
530
531 /* Can match anything (initialization) */
532 STATIC int
533 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
534 {
535     int value;
536
537     for (value = 0; value <= ANYOF_MAX; value += 2)
538         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
539             return 1;
540     if (!(cl->flags & ANYOF_UNICODE_ALL))
541         return 0;
542     if (!ANYOF_BITMAP_TESTALLSET(cl))
543         return 0;
544     return 1;
545 }
546
547 /* Can match anything (initialization) */
548 STATIC void
549 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
550 {
551     Zero(cl, 1, struct regnode_charclass_class);
552     cl->type = ANYOF;
553     cl_anything(pRExC_state, cl);
554 }
555
556 STATIC void
557 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
558 {
559     Zero(cl, 1, struct regnode_charclass_class);
560     cl->type = ANYOF;
561     cl_anything(pRExC_state, cl);
562     if (LOC)
563         cl->flags |= ANYOF_LOCALE;
564 }
565
566 /* 'And' a given class with another one.  Can create false positives */
567 /* We assume that cl is not inverted */
568 STATIC void
569 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570          struct regnode_charclass_class *and_with)
571 {
572     if (!(and_with->flags & ANYOF_CLASS)
573         && !(cl->flags & ANYOF_CLASS)
574         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575         && !(and_with->flags & ANYOF_FOLD)
576         && !(cl->flags & ANYOF_FOLD)) {
577         int i;
578
579         if (and_with->flags & ANYOF_INVERT)
580             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581                 cl->bitmap[i] &= ~and_with->bitmap[i];
582         else
583             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584                 cl->bitmap[i] &= and_with->bitmap[i];
585     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586     if (!(and_with->flags & ANYOF_EOS))
587         cl->flags &= ~ANYOF_EOS;
588
589     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
590         cl->flags &= ~ANYOF_UNICODE_ALL;
591         cl->flags |= ANYOF_UNICODE;
592         ARG_SET(cl, ARG(and_with));
593     }
594     if (!(and_with->flags & ANYOF_UNICODE_ALL))
595         cl->flags &= ~ANYOF_UNICODE_ALL;
596     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
597         cl->flags &= ~ANYOF_UNICODE;
598 }
599
600 /* 'OR' a given class with another one.  Can create false positives */
601 /* We assume that cl is not inverted */
602 STATIC void
603 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
604 {
605     if (or_with->flags & ANYOF_INVERT) {
606         /* We do not use
607          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
608          *   <= (B1 | !B2) | (CL1 | !CL2)
609          * which is wasteful if CL2 is small, but we ignore CL2:
610          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
611          * XXXX Can we handle case-fold?  Unclear:
612          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
613          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
614          */
615         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
616              && !(or_with->flags & ANYOF_FOLD)
617              && !(cl->flags & ANYOF_FOLD) ) {
618             int i;
619
620             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
621                 cl->bitmap[i] |= ~or_with->bitmap[i];
622         } /* XXXX: logic is complicated otherwise */
623         else {
624             cl_anything(pRExC_state, cl);
625         }
626     } else {
627         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
628         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
629              && (!(or_with->flags & ANYOF_FOLD)
630                  || (cl->flags & ANYOF_FOLD)) ) {
631             int i;
632
633             /* OR char bitmap and class bitmap separately */
634             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
635                 cl->bitmap[i] |= or_with->bitmap[i];
636             if (or_with->flags & ANYOF_CLASS) {
637                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
638                     cl->classflags[i] |= or_with->classflags[i];
639                 cl->flags |= ANYOF_CLASS;
640             }
641         }
642         else { /* XXXX: logic is complicated, leave it along for a moment. */
643             cl_anything(pRExC_state, cl);
644         }
645     }
646     if (or_with->flags & ANYOF_EOS)
647         cl->flags |= ANYOF_EOS;
648
649     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
650         ARG(cl) != ARG(or_with)) {
651         cl->flags |= ANYOF_UNICODE_ALL;
652         cl->flags &= ~ANYOF_UNICODE;
653     }
654     if (or_with->flags & ANYOF_UNICODE_ALL) {
655         cl->flags |= ANYOF_UNICODE_ALL;
656         cl->flags &= ~ANYOF_UNICODE;
657     }
658 }
659
660 /*
661  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
662  * These need to be revisited when a newer toolchain becomes available.
663  */
664 #if defined(__sparc64__) && defined(__GNUC__)
665 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
666 #       undef  SPARC64_GCC_WORKAROUND
667 #       define SPARC64_GCC_WORKAROUND 1
668 #   endif
669 #endif
670
671 /* REx optimizer.  Converts nodes into quickier variants "in place".
672    Finds fixed substrings.  */
673
674 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
675    to the position after last scanned or to NULL. */
676
677 STATIC I32
678 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
679                         /* scanp: Start here (read-write). */
680                         /* deltap: Write maxlen-minlen here. */
681                         /* last: Stop before this one. */
682 {
683     I32 min = 0, pars = 0, code;
684     regnode *scan = *scanp, *next;
685     I32 delta = 0;
686     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
687     int is_inf_internal = 0;            /* The studied chunk is infinite */
688     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
689     scan_data_t data_fake;
690     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
691
692     while (scan && OP(scan) != END && scan < last) {
693         /* Peephole optimizer: */
694
695         if (PL_regkind[(U8)OP(scan)] == EXACT) {
696             /* Merge several consecutive EXACTish nodes into one. */
697             regnode *n = regnext(scan);
698             U32 stringok = 1;
699 #ifdef DEBUGGING
700             regnode *stop = scan;
701 #endif
702
703             next = scan + NODE_SZ_STR(scan);
704             /* Skip NOTHING, merge EXACT*. */
705             while (n &&
706                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
707                      (stringok && (OP(n) == OP(scan))))
708                    && NEXT_OFF(n)
709                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
710                 if (OP(n) == TAIL || n > next)
711                     stringok = 0;
712                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
713                     NEXT_OFF(scan) += NEXT_OFF(n);
714                     next = n + NODE_STEP_REGNODE;
715 #ifdef DEBUGGING
716                     if (stringok)
717                         stop = n;
718 #endif
719                     n = regnext(n);
720                 }
721                 else if (stringok) {
722                     int oldl = STR_LEN(scan);
723                     regnode *nnext = regnext(n);
724
725                     if (oldl + STR_LEN(n) > U8_MAX)
726                         break;
727                     NEXT_OFF(scan) += NEXT_OFF(n);
728                     STR_LEN(scan) += STR_LEN(n);
729                     next = n + NODE_SZ_STR(n);
730                     /* Now we can overwrite *n : */
731                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
732 #ifdef DEBUGGING
733                     stop = next - 1;
734 #endif
735                     n = nnext;
736                 }
737             }
738
739             if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
740 /*
741   Two problematic code points in Unicode casefolding of EXACT nodes:
742
743    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
744    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
745
746    which casefold to
747
748    Unicode                      UTF-8
749
750    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
751    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
752
753    This means that in case-insensitive matching (or "loose matching",
754    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
755    length of the above casefolded versions) can match a target string
756    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
757    This would rather mess up the minimum length computation.
758
759    What we'll do is to look for the tail four bytes, and then peek
760    at the preceding two bytes to see whether we need to decrease
761    the minimum length by four (six minus two).
762
763    Thanks to the design of UTF-8, there cannot be false matches:
764    A sequence of valid UTF-8 bytes cannot be a subsequence of
765    another valid sequence of UTF-8 bytes.
766
767 */
768                  char *s0 = STRING(scan), *s, *t;
769                  char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
770                  char *t0 = "\xcc\x88\xcc\x81";
771                  char *t1 = t0 + 3;
772                  
773                  for (s = s0 + 2;
774                       s < s2 && (t = ninstr(s, s1, t0, t1));
775                       s = t + 4) {
776                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
777                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
778                            min -= 4;
779                  }
780             }
781
782 #ifdef DEBUGGING
783             /* Allow dumping */
784             n = scan + NODE_SZ_STR(scan);
785             while (n <= stop) {
786                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
787                     OP(n) = OPTIMIZED;
788                     NEXT_OFF(n) = 0;
789                 }
790                 n++;
791             }
792 #endif
793         }
794         /* Follow the next-chain of the current node and optimize
795            away all the NOTHINGs from it.  */
796         if (OP(scan) != CURLYX) {
797             int max = (reg_off_by_arg[OP(scan)]
798                        ? I32_MAX
799                        /* I32 may be smaller than U16 on CRAYs! */
800                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
801             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
802             int noff;
803             regnode *n = scan;
804         
805             /* Skip NOTHING and LONGJMP. */
806             while ((n = regnext(n))
807                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
808                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
809                    && off + noff < max)
810                 off += noff;
811             if (reg_off_by_arg[OP(scan)])
812                 ARG(scan) = off;
813             else
814                 NEXT_OFF(scan) = off;
815         }
816         /* The principal pseudo-switch.  Cannot be a switch, since we
817            look into several different things.  */
818         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
819                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
820             next = regnext(scan);
821             code = OP(scan);
822         
823             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
824                 I32 max1 = 0, min1 = I32_MAX, num = 0;
825                 struct regnode_charclass_class accum;
826                 
827                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
828                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
829                 if (flags & SCF_DO_STCLASS)
830                     cl_init_zero(pRExC_state, &accum);
831                 while (OP(scan) == code) {
832                     I32 deltanext, minnext, f = 0, fake;
833                     struct regnode_charclass_class this_class;
834
835                     num++;
836                     data_fake.flags = 0;
837                     if (data) {         
838                         data_fake.whilem_c = data->whilem_c;
839                         data_fake.last_closep = data->last_closep;
840                     }
841                     else
842                         data_fake.last_closep = &fake;
843                     next = regnext(scan);
844                     scan = NEXTOPER(scan);
845                     if (code != BRANCH)
846                         scan = NEXTOPER(scan);
847                     if (flags & SCF_DO_STCLASS) {
848                         cl_init(pRExC_state, &this_class);
849                         data_fake.start_class = &this_class;
850                         f = SCF_DO_STCLASS_AND;
851                     }           
852                     if (flags & SCF_WHILEM_VISITED_POS)
853                         f |= SCF_WHILEM_VISITED_POS;
854                     /* we suppose the run is continuous, last=next...*/
855                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
856                                           next, &data_fake, f);
857                     if (min1 > minnext)
858                         min1 = minnext;
859                     if (max1 < minnext + deltanext)
860                         max1 = minnext + deltanext;
861                     if (deltanext == I32_MAX)
862                         is_inf = is_inf_internal = 1;
863                     scan = next;
864                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
865                         pars++;
866                     if (data && (data_fake.flags & SF_HAS_EVAL))
867                         data->flags |= SF_HAS_EVAL;
868                     if (data)
869                         data->whilem_c = data_fake.whilem_c;
870                     if (flags & SCF_DO_STCLASS)
871                         cl_or(pRExC_state, &accum, &this_class);
872                     if (code == SUSPEND)
873                         break;
874                 }
875                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
876                     min1 = 0;
877                 if (flags & SCF_DO_SUBSTR) {
878                     data->pos_min += min1;
879                     data->pos_delta += max1 - min1;
880                     if (max1 != min1 || is_inf)
881                         data->longest = &(data->longest_float);
882                 }
883                 min += min1;
884                 delta += max1 - min1;
885                 if (flags & SCF_DO_STCLASS_OR) {
886                     cl_or(pRExC_state, data->start_class, &accum);
887                     if (min1) {
888                         cl_and(data->start_class, &and_with);
889                         flags &= ~SCF_DO_STCLASS;
890                     }
891                 }
892                 else if (flags & SCF_DO_STCLASS_AND) {
893                     if (min1) {
894                         cl_and(data->start_class, &accum);
895                         flags &= ~SCF_DO_STCLASS;
896                     }
897                     else {
898                         /* Switch to OR mode: cache the old value of
899                          * data->start_class */
900                         StructCopy(data->start_class, &and_with,
901                                    struct regnode_charclass_class);
902                         flags &= ~SCF_DO_STCLASS_AND;
903                         StructCopy(&accum, data->start_class,
904                                    struct regnode_charclass_class);
905                         flags |= SCF_DO_STCLASS_OR;
906                         data->start_class->flags |= ANYOF_EOS;
907                     }
908                 }
909             }
910             else if (code == BRANCHJ)   /* single branch is optimized. */
911                 scan = NEXTOPER(NEXTOPER(scan));
912             else                        /* single branch is optimized. */
913                 scan = NEXTOPER(scan);
914             continue;
915         }
916         else if (OP(scan) == EXACT) {
917             I32 l = STR_LEN(scan);
918             UV uc = *((U8*)STRING(scan));
919             if (UTF) {
920                 U8 *s = (U8*)STRING(scan);
921                 l = utf8_length(s, s + l);
922                 uc = utf8_to_uvchr(s, NULL);
923             }
924             min += l;
925             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
926                 /* The code below prefers earlier match for fixed
927                    offset, later match for variable offset.  */
928                 if (data->last_end == -1) { /* Update the start info. */
929                     data->last_start_min = data->pos_min;
930                     data->last_start_max = is_inf
931                         ? I32_MAX : data->pos_min + data->pos_delta;
932                 }
933                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
934                 if (UTF)
935                     SvUTF8_on(data->last_found);
936                 data->last_end = data->pos_min + l;
937                 data->pos_min += l; /* As in the first entry. */
938                 data->flags &= ~SF_BEFORE_EOL;
939             }
940             if (flags & SCF_DO_STCLASS_AND) {
941                 /* Check whether it is compatible with what we know already! */
942                 int compat = 1;
943
944                 if (uc >= 0x100 ||
945                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
946                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
947                     && (!(data->start_class->flags & ANYOF_FOLD)
948                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
949                     )
950                     compat = 0;
951                 ANYOF_CLASS_ZERO(data->start_class);
952                 ANYOF_BITMAP_ZERO(data->start_class);
953                 if (compat)
954                     ANYOF_BITMAP_SET(data->start_class, uc);
955                 data->start_class->flags &= ~ANYOF_EOS;
956                 if (uc < 0x100)
957                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
958             }
959             else if (flags & SCF_DO_STCLASS_OR) {
960                 /* false positive possible if the class is case-folded */
961                 if (uc < 0x100)
962                     ANYOF_BITMAP_SET(data->start_class, uc);
963                 else
964                     data->start_class->flags |= ANYOF_UNICODE_ALL;
965                 data->start_class->flags &= ~ANYOF_EOS;
966                 cl_and(data->start_class, &and_with);
967             }
968             flags &= ~SCF_DO_STCLASS;
969         }
970         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
971             I32 l = STR_LEN(scan);
972             UV uc = *((U8*)STRING(scan));
973
974             /* Search for fixed substrings supports EXACT only. */
975             if (flags & SCF_DO_SUBSTR)
976                 scan_commit(pRExC_state, data);
977             if (UTF) {
978                 U8 *s = (U8 *)STRING(scan);
979                 l = utf8_length(s, s + l);
980                 uc = utf8_to_uvchr(s, NULL);
981             }
982             min += l;
983             if (data && (flags & SCF_DO_SUBSTR))
984                 data->pos_min += l;
985             if (flags & SCF_DO_STCLASS_AND) {
986                 /* Check whether it is compatible with what we know already! */
987                 int compat = 1;
988
989                 if (uc >= 0x100 ||
990                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
991                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
992                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
993                     compat = 0;
994                 ANYOF_CLASS_ZERO(data->start_class);
995                 ANYOF_BITMAP_ZERO(data->start_class);
996                 if (compat) {
997                     ANYOF_BITMAP_SET(data->start_class, uc);
998                     data->start_class->flags &= ~ANYOF_EOS;
999                     data->start_class->flags |= ANYOF_FOLD;
1000                     if (OP(scan) == EXACTFL)
1001                         data->start_class->flags |= ANYOF_LOCALE;
1002                 }
1003             }
1004             else if (flags & SCF_DO_STCLASS_OR) {
1005                 if (data->start_class->flags & ANYOF_FOLD) {
1006                     /* false positive possible if the class is case-folded.
1007                        Assume that the locale settings are the same... */
1008                     if (uc < 0x100)
1009                         ANYOF_BITMAP_SET(data->start_class, uc);
1010                     data->start_class->flags &= ~ANYOF_EOS;
1011                 }
1012                 cl_and(data->start_class, &and_with);
1013             }
1014             flags &= ~SCF_DO_STCLASS;
1015         }
1016         else if (strchr((char*)PL_varies,OP(scan))) {
1017             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1018             I32 f = flags, pos_before = 0;
1019             regnode *oscan = scan;
1020             struct regnode_charclass_class this_class;
1021             struct regnode_charclass_class *oclass = NULL;
1022             I32 next_is_eval = 0;
1023
1024             switch (PL_regkind[(U8)OP(scan)]) {
1025             case WHILEM:                /* End of (?:...)* . */
1026                 scan = NEXTOPER(scan);
1027                 goto finish;
1028             case PLUS:
1029                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1030                     next = NEXTOPER(scan);
1031                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1032                         mincount = 1;
1033                         maxcount = REG_INFTY;
1034                         next = regnext(scan);
1035                         scan = NEXTOPER(scan);
1036                         goto do_curly;
1037                     }
1038                 }
1039                 if (flags & SCF_DO_SUBSTR)
1040                     data->pos_min++;
1041                 min++;
1042                 /* Fall through. */
1043             case STAR:
1044                 if (flags & SCF_DO_STCLASS) {
1045                     mincount = 0;
1046                     maxcount = REG_INFTY;
1047                     next = regnext(scan);
1048                     scan = NEXTOPER(scan);
1049                     goto do_curly;
1050                 }
1051                 is_inf = is_inf_internal = 1;
1052                 scan = regnext(scan);
1053                 if (flags & SCF_DO_SUBSTR) {
1054                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1055                     data->longest = &(data->longest_float);
1056                 }
1057                 goto optimize_curly_tail;
1058             case CURLY:
1059                 mincount = ARG1(scan);
1060                 maxcount = ARG2(scan);
1061                 next = regnext(scan);
1062                 if (OP(scan) == CURLYX) {
1063                     I32 lp = (data ? *(data->last_closep) : 0);
1064
1065                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1066                 }
1067                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1068                 next_is_eval = (OP(scan) == EVAL);
1069               do_curly:
1070                 if (flags & SCF_DO_SUBSTR) {
1071                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1072                     pos_before = data->pos_min;
1073                 }
1074                 if (data) {
1075                     fl = data->flags;
1076                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1077                     if (is_inf)
1078                         data->flags |= SF_IS_INF;
1079                 }
1080                 if (flags & SCF_DO_STCLASS) {
1081                     cl_init(pRExC_state, &this_class);
1082                     oclass = data->start_class;
1083                     data->start_class = &this_class;
1084                     f |= SCF_DO_STCLASS_AND;
1085                     f &= ~SCF_DO_STCLASS_OR;
1086                 }
1087                 /* These are the cases when once a subexpression
1088                    fails at a particular position, it cannot succeed
1089                    even after backtracking at the enclosing scope.
1090                 
1091                    XXXX what if minimal match and we are at the
1092                         initial run of {n,m}? */
1093                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1094                     f &= ~SCF_WHILEM_VISITED_POS;
1095
1096                 /* This will finish on WHILEM, setting scan, or on NULL: */
1097                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1098                                       mincount == 0
1099                                         ? (f & ~SCF_DO_SUBSTR) : f);
1100
1101                 if (flags & SCF_DO_STCLASS)
1102                     data->start_class = oclass;
1103                 if (mincount == 0 || minnext == 0) {
1104                     if (flags & SCF_DO_STCLASS_OR) {
1105                         cl_or(pRExC_state, data->start_class, &this_class);
1106                     }
1107                     else if (flags & SCF_DO_STCLASS_AND) {
1108                         /* Switch to OR mode: cache the old value of
1109                          * data->start_class */
1110                         StructCopy(data->start_class, &and_with,
1111                                    struct regnode_charclass_class);
1112                         flags &= ~SCF_DO_STCLASS_AND;
1113                         StructCopy(&this_class, data->start_class,
1114                                    struct regnode_charclass_class);
1115                         flags |= SCF_DO_STCLASS_OR;
1116                         data->start_class->flags |= ANYOF_EOS;
1117                     }
1118                 } else {                /* Non-zero len */
1119                     if (flags & SCF_DO_STCLASS_OR) {
1120                         cl_or(pRExC_state, data->start_class, &this_class);
1121                         cl_and(data->start_class, &and_with);
1122                     }
1123                     else if (flags & SCF_DO_STCLASS_AND)
1124                         cl_and(data->start_class, &this_class);
1125                     flags &= ~SCF_DO_STCLASS;
1126                 }
1127                 if (!scan)              /* It was not CURLYX, but CURLY. */
1128                     scan = next;
1129                 if (ckWARN(WARN_REGEXP)
1130                        /* ? quantifier ok, except for (?{ ... }) */
1131                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
1132                     && (minnext == 0) && (deltanext == 0)
1133                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1134                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1135                 {
1136                     vWARN(RExC_parse,
1137                           "Quantifier unexpected on zero-length expression");
1138                 }
1139
1140                 min += minnext * mincount;
1141                 is_inf_internal |= ((maxcount == REG_INFTY
1142                                      && (minnext + deltanext) > 0)
1143                                     || deltanext == I32_MAX);
1144                 is_inf |= is_inf_internal;
1145                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1146
1147                 /* Try powerful optimization CURLYX => CURLYN. */
1148                 if (  OP(oscan) == CURLYX && data
1149                       && data->flags & SF_IN_PAR
1150                       && !(data->flags & SF_HAS_EVAL)
1151                       && !deltanext && minnext == 1 ) {
1152                     /* Try to optimize to CURLYN.  */
1153                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1154                     regnode *nxt1 = nxt;
1155 #ifdef DEBUGGING
1156                     regnode *nxt2;
1157 #endif
1158
1159                     /* Skip open. */
1160                     nxt = regnext(nxt);
1161                     if (!strchr((char*)PL_simple,OP(nxt))
1162                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1163                              && STR_LEN(nxt) == 1))
1164                         goto nogo;
1165 #ifdef DEBUGGING
1166                     nxt2 = nxt;
1167 #endif
1168                     nxt = regnext(nxt);
1169                     if (OP(nxt) != CLOSE)
1170                         goto nogo;
1171                     /* Now we know that nxt2 is the only contents: */
1172                     oscan->flags = ARG(nxt);
1173                     OP(oscan) = CURLYN;
1174                     OP(nxt1) = NOTHING; /* was OPEN. */
1175 #ifdef DEBUGGING
1176                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1177                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1178                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1179                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1180                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1181                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1182 #endif
1183                 }
1184               nogo:
1185
1186                 /* Try optimization CURLYX => CURLYM. */
1187                 if (  OP(oscan) == CURLYX && data
1188                       && !(data->flags & SF_HAS_PAR)
1189                       && !(data->flags & SF_HAS_EVAL)
1190                       && !deltanext  ) {
1191                     /* XXXX How to optimize if data == 0? */
1192                     /* Optimize to a simpler form.  */
1193                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1194                     regnode *nxt2;
1195
1196                     OP(oscan) = CURLYM;
1197                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1198                             && (OP(nxt2) != WHILEM))
1199                         nxt = nxt2;
1200                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1201                     /* Need to optimize away parenths. */
1202                     if (data->flags & SF_IN_PAR) {
1203                         /* Set the parenth number.  */
1204                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1205
1206                         if (OP(nxt) != CLOSE)
1207                             FAIL("Panic opt close");
1208                         oscan->flags = ARG(nxt);
1209                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1210                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1211 #ifdef DEBUGGING
1212                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1213                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1214                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1215                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1216 #endif
1217 #if 0
1218                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1219                             regnode *nnxt = regnext(nxt1);
1220                         
1221                             if (nnxt == nxt) {
1222                                 if (reg_off_by_arg[OP(nxt1)])
1223                                     ARG_SET(nxt1, nxt2 - nxt1);
1224                                 else if (nxt2 - nxt1 < U16_MAX)
1225                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1226                                 else
1227                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1228                             }
1229                             nxt1 = nnxt;
1230                         }
1231 #endif
1232                         /* Optimize again: */
1233                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1234                                     NULL, 0);
1235                     }
1236                     else
1237                         oscan->flags = 0;
1238                 }
1239                 else if ((OP(oscan) == CURLYX)
1240                          && (flags & SCF_WHILEM_VISITED_POS)
1241                          /* See the comment on a similar expression above.
1242                             However, this time it not a subexpression
1243                             we care about, but the expression itself. */
1244                          && (maxcount == REG_INFTY)
1245                          && data && ++data->whilem_c < 16) {
1246                     /* This stays as CURLYX, we can put the count/of pair. */
1247                     /* Find WHILEM (as in regexec.c) */
1248                     regnode *nxt = oscan + NEXT_OFF(oscan);
1249
1250                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1251                         nxt += ARG(nxt);
1252                     PREVOPER(nxt)->flags = data->whilem_c
1253                         | (RExC_whilem_seen << 4); /* On WHILEM */
1254                 }
1255                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1256                     pars++;
1257                 if (flags & SCF_DO_SUBSTR) {
1258                     SV *last_str = Nullsv;
1259                     int counted = mincount != 0;
1260
1261                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1262 #if defined(SPARC64_GCC_WORKAROUND)
1263                         I32 b = 0;
1264                         STRLEN l = 0;
1265                         char *s = NULL;
1266                         I32 old = 0;
1267
1268                         if (pos_before >= data->last_start_min)
1269                             b = pos_before;
1270                         else
1271                             b = data->last_start_min;
1272
1273                         l = 0;
1274                         s = SvPV(data->last_found, l);
1275                         old = b - data->last_start_min;
1276
1277 #else
1278                         I32 b = pos_before >= data->last_start_min
1279                             ? pos_before : data->last_start_min;
1280                         STRLEN l;
1281                         char *s = SvPV(data->last_found, l);
1282                         I32 old = b - data->last_start_min;
1283 #endif
1284
1285                         if (UTF)
1286                             old = utf8_hop((U8*)s, old) - (U8*)s;
1287                         
1288                         l -= old;
1289                         /* Get the added string: */
1290                         last_str = newSVpvn(s  + old, l);
1291                         if (deltanext == 0 && pos_before == b) {
1292                             /* What was added is a constant string */
1293                             if (mincount > 1) {
1294                                 SvGROW(last_str, (mincount * l) + 1);
1295                                 repeatcpy(SvPVX(last_str) + l,
1296                                           SvPVX(last_str), l, mincount - 1);
1297                                 SvCUR(last_str) *= mincount;
1298                                 /* Add additional parts. */
1299                                 SvCUR_set(data->last_found,
1300                                           SvCUR(data->last_found) - l);
1301                                 sv_catsv(data->last_found, last_str);
1302                                 data->last_end += l * (mincount - 1);
1303                             }
1304                         } else {
1305                             /* start offset must point into the last copy */
1306                             data->last_start_min += minnext * (mincount - 1);
1307                             data->last_start_max += is_inf ? 0 : (maxcount - 1)
1308                                 * (minnext + data->pos_delta);
1309                         }
1310                     }
1311                     /* It is counted once already... */
1312                     data->pos_min += minnext * (mincount - counted);
1313                     data->pos_delta += - counted * deltanext +
1314                         (minnext + deltanext) * maxcount - minnext * mincount;
1315                     if (mincount != maxcount) {
1316                          /* Cannot extend fixed substrings found inside
1317                             the group.  */
1318                         scan_commit(pRExC_state,data);
1319                         if (mincount && last_str) {
1320                             sv_setsv(data->last_found, last_str);
1321                             data->last_end = data->pos_min;
1322                             data->last_start_min =
1323                                 data->pos_min - CHR_SVLEN(last_str);
1324                             data->last_start_max = is_inf
1325                                 ? I32_MAX
1326                                 : data->pos_min + data->pos_delta
1327                                 - CHR_SVLEN(last_str);
1328                         }
1329                         data->longest = &(data->longest_float);
1330                     }
1331                     SvREFCNT_dec(last_str);
1332                 }
1333                 if (data && (fl & SF_HAS_EVAL))
1334                     data->flags |= SF_HAS_EVAL;
1335               optimize_curly_tail:
1336                 if (OP(oscan) != CURLYX) {
1337                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1338                            && NEXT_OFF(next))
1339                         NEXT_OFF(oscan) += NEXT_OFF(next);
1340                 }
1341                 continue;
1342             default:                    /* REF and CLUMP only? */
1343                 if (flags & SCF_DO_SUBSTR) {
1344                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1345                     data->longest = &(data->longest_float);
1346                 }
1347                 is_inf = is_inf_internal = 1;
1348                 if (flags & SCF_DO_STCLASS_OR)
1349                     cl_anything(pRExC_state, data->start_class);
1350                 flags &= ~SCF_DO_STCLASS;
1351                 break;
1352             }
1353         }
1354         else if (strchr((char*)PL_simple,OP(scan))) {
1355             int value = 0;
1356
1357             if (flags & SCF_DO_SUBSTR) {
1358                 scan_commit(pRExC_state,data);
1359                 data->pos_min++;
1360             }
1361             min++;
1362             if (flags & SCF_DO_STCLASS) {
1363                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1364
1365                 /* Some of the logic below assumes that switching
1366                    locale on will only add false positives. */
1367                 switch (PL_regkind[(U8)OP(scan)]) {
1368                 case SANY:
1369                 default:
1370                   do_default:
1371                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1372                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1373                         cl_anything(pRExC_state, data->start_class);
1374                     break;
1375                 case REG_ANY:
1376                     if (OP(scan) == SANY)
1377                         goto do_default;
1378                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1379                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1380                                  || (data->start_class->flags & ANYOF_CLASS));
1381                         cl_anything(pRExC_state, data->start_class);
1382                     }
1383                     if (flags & SCF_DO_STCLASS_AND || !value)
1384                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1385                     break;
1386                 case ANYOF:
1387                     if (flags & SCF_DO_STCLASS_AND)
1388                         cl_and(data->start_class,
1389                                (struct regnode_charclass_class*)scan);
1390                     else
1391                         cl_or(pRExC_state, data->start_class,
1392                               (struct regnode_charclass_class*)scan);
1393                     break;
1394                 case ALNUM:
1395                     if (flags & SCF_DO_STCLASS_AND) {
1396                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1397                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1398                             for (value = 0; value < 256; value++)
1399                                 if (!isALNUM(value))
1400                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1401                         }
1402                     }
1403                     else {
1404                         if (data->start_class->flags & ANYOF_LOCALE)
1405                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1406                         else {
1407                             for (value = 0; value < 256; value++)
1408                                 if (isALNUM(value))
1409                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1410                         }
1411                     }
1412                     break;
1413                 case ALNUML:
1414                     if (flags & SCF_DO_STCLASS_AND) {
1415                         if (data->start_class->flags & ANYOF_LOCALE)
1416                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1417                     }
1418                     else {
1419                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1420                         data->start_class->flags |= ANYOF_LOCALE;
1421                     }
1422                     break;
1423                 case NALNUM:
1424                     if (flags & SCF_DO_STCLASS_AND) {
1425                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1426                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1427                             for (value = 0; value < 256; value++)
1428                                 if (isALNUM(value))
1429                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1430                         }
1431                     }
1432                     else {
1433                         if (data->start_class->flags & ANYOF_LOCALE)
1434                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1435                         else {
1436                             for (value = 0; value < 256; value++)
1437                                 if (!isALNUM(value))
1438                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1439                         }
1440                     }
1441                     break;
1442                 case NALNUML:
1443                     if (flags & SCF_DO_STCLASS_AND) {
1444                         if (data->start_class->flags & ANYOF_LOCALE)
1445                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1446                     }
1447                     else {
1448                         data->start_class->flags |= ANYOF_LOCALE;
1449                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1450                     }
1451                     break;
1452                 case SPACE:
1453                     if (flags & SCF_DO_STCLASS_AND) {
1454                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1455                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1456                             for (value = 0; value < 256; value++)
1457                                 if (!isSPACE(value))
1458                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1459                         }
1460                     }
1461                     else {
1462                         if (data->start_class->flags & ANYOF_LOCALE)
1463                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1464                         else {
1465                             for (value = 0; value < 256; value++)
1466                                 if (isSPACE(value))
1467                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1468                         }
1469                     }
1470                     break;
1471                 case SPACEL:
1472                     if (flags & SCF_DO_STCLASS_AND) {
1473                         if (data->start_class->flags & ANYOF_LOCALE)
1474                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1475                     }
1476                     else {
1477                         data->start_class->flags |= ANYOF_LOCALE;
1478                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1479                     }
1480                     break;
1481                 case NSPACE:
1482                     if (flags & SCF_DO_STCLASS_AND) {
1483                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1484                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1485                             for (value = 0; value < 256; value++)
1486                                 if (isSPACE(value))
1487                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1488                         }
1489                     }
1490                     else {
1491                         if (data->start_class->flags & ANYOF_LOCALE)
1492                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1493                         else {
1494                             for (value = 0; value < 256; value++)
1495                                 if (!isSPACE(value))
1496                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1497                         }
1498                     }
1499                     break;
1500                 case NSPACEL:
1501                     if (flags & SCF_DO_STCLASS_AND) {
1502                         if (data->start_class->flags & ANYOF_LOCALE) {
1503                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1504                             for (value = 0; value < 256; value++)
1505                                 if (!isSPACE(value))
1506                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1507                         }
1508                     }
1509                     else {
1510                         data->start_class->flags |= ANYOF_LOCALE;
1511                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1512                     }
1513                     break;
1514                 case DIGIT:
1515                     if (flags & SCF_DO_STCLASS_AND) {
1516                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1517                         for (value = 0; value < 256; value++)
1518                             if (!isDIGIT(value))
1519                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1520                     }
1521                     else {
1522                         if (data->start_class->flags & ANYOF_LOCALE)
1523                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1524                         else {
1525                             for (value = 0; value < 256; value++)
1526                                 if (isDIGIT(value))
1527                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1528                         }
1529                     }
1530                     break;
1531                 case NDIGIT:
1532                     if (flags & SCF_DO_STCLASS_AND) {
1533                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1534                         for (value = 0; value < 256; value++)
1535                             if (isDIGIT(value))
1536                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1537                     }
1538                     else {
1539                         if (data->start_class->flags & ANYOF_LOCALE)
1540                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1541                         else {
1542                             for (value = 0; value < 256; value++)
1543                                 if (!isDIGIT(value))
1544                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1545                         }
1546                     }
1547                     break;
1548                 }
1549                 if (flags & SCF_DO_STCLASS_OR)
1550                     cl_and(data->start_class, &and_with);
1551                 flags &= ~SCF_DO_STCLASS;
1552             }
1553         }
1554         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1555             data->flags |= (OP(scan) == MEOL
1556                             ? SF_BEFORE_MEOL
1557                             : SF_BEFORE_SEOL);
1558         }
1559         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1560                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1561                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1562                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1563             /* Lookahead/lookbehind */
1564             I32 deltanext, minnext, fake = 0;
1565             regnode *nscan;
1566             struct regnode_charclass_class intrnl;
1567             int f = 0;
1568
1569             data_fake.flags = 0;
1570             if (data) {         
1571                 data_fake.whilem_c = data->whilem_c;
1572                 data_fake.last_closep = data->last_closep;
1573             }
1574             else
1575                 data_fake.last_closep = &fake;
1576             if ( flags & SCF_DO_STCLASS && !scan->flags
1577                  && OP(scan) == IFMATCH ) { /* Lookahead */
1578                 cl_init(pRExC_state, &intrnl);
1579                 data_fake.start_class = &intrnl;
1580                 f |= SCF_DO_STCLASS_AND;
1581             }
1582             if (flags & SCF_WHILEM_VISITED_POS)
1583                 f |= SCF_WHILEM_VISITED_POS;
1584             next = regnext(scan);
1585             nscan = NEXTOPER(NEXTOPER(scan));
1586             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1587             if (scan->flags) {
1588                 if (deltanext) {
1589                     vFAIL("Variable length lookbehind not implemented");
1590                 }
1591                 else if (minnext > U8_MAX) {
1592                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1593                 }
1594                 scan->flags = minnext;
1595             }
1596             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1597                 pars++;
1598             if (data && (data_fake.flags & SF_HAS_EVAL))
1599                 data->flags |= SF_HAS_EVAL;
1600             if (data)
1601                 data->whilem_c = data_fake.whilem_c;
1602             if (f & SCF_DO_STCLASS_AND) {
1603                 int was = (data->start_class->flags & ANYOF_EOS);
1604
1605                 cl_and(data->start_class, &intrnl);
1606                 if (was)
1607                     data->start_class->flags |= ANYOF_EOS;
1608             }
1609         }
1610         else if (OP(scan) == OPEN) {
1611             pars++;
1612         }
1613         else if (OP(scan) == CLOSE) {
1614             if (ARG(scan) == is_par) {
1615                 next = regnext(scan);
1616
1617                 if ( next && (OP(next) != WHILEM) && next < last)
1618                     is_par = 0;         /* Disable optimization */
1619             }
1620             if (data)
1621                 *(data->last_closep) = ARG(scan);
1622         }
1623         else if (OP(scan) == EVAL) {
1624                 if (data)
1625                     data->flags |= SF_HAS_EVAL;
1626         }
1627         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1628                 if (flags & SCF_DO_SUBSTR) {
1629                     scan_commit(pRExC_state,data);
1630                     data->longest = &(data->longest_float);
1631                 }
1632                 is_inf = is_inf_internal = 1;
1633                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1634                     cl_anything(pRExC_state, data->start_class);
1635                 flags &= ~SCF_DO_STCLASS;
1636         }
1637         /* Else: zero-length, ignore. */
1638         scan = regnext(scan);
1639     }
1640
1641   finish:
1642     *scanp = scan;
1643     *deltap = is_inf_internal ? I32_MAX : delta;
1644     if (flags & SCF_DO_SUBSTR && is_inf)
1645         data->pos_delta = I32_MAX - data->pos_min;
1646     if (is_par > U8_MAX)
1647         is_par = 0;
1648     if (is_par && pars==1 && data) {
1649         data->flags |= SF_IN_PAR;
1650         data->flags &= ~SF_HAS_PAR;
1651     }
1652     else if (pars && data) {
1653         data->flags |= SF_HAS_PAR;
1654         data->flags &= ~SF_IN_PAR;
1655     }
1656     if (flags & SCF_DO_STCLASS_OR)
1657         cl_and(data->start_class, &and_with);
1658     return min;
1659 }
1660
1661 STATIC I32
1662 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1663 {
1664     if (RExC_rx->data) {
1665         Renewc(RExC_rx->data,
1666                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1667                char, struct reg_data);
1668         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1669         RExC_rx->data->count += n;
1670     }
1671     else {
1672         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1673              char, struct reg_data);
1674         New(1208, RExC_rx->data->what, n, U8);
1675         RExC_rx->data->count = n;
1676     }
1677     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1678     return RExC_rx->data->count - n;
1679 }
1680
1681 void
1682 Perl_reginitcolors(pTHX)
1683 {
1684     int i = 0;
1685     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1686         
1687     if (s) {
1688         PL_colors[0] = s = savepv(s);
1689         while (++i < 6) {
1690             s = strchr(s, '\t');
1691             if (s) {
1692                 *s = '\0';
1693                 PL_colors[i] = ++s;
1694             }
1695             else
1696                 PL_colors[i] = s = "";
1697         }
1698     } else {
1699         while (i < 6)
1700             PL_colors[i++] = "";
1701     }
1702     PL_colorset = 1;
1703 }
1704
1705
1706 /*
1707  - pregcomp - compile a regular expression into internal code
1708  *
1709  * We can't allocate space until we know how big the compiled form will be,
1710  * but we can't compile it (and thus know how big it is) until we've got a
1711  * place to put the code.  So we cheat:  we compile it twice, once with code
1712  * generation turned off and size counting turned on, and once "for real".
1713  * This also means that we don't allocate space until we are sure that the
1714  * thing really will compile successfully, and we never have to move the
1715  * code and thus invalidate pointers into it.  (Note that it has to be in
1716  * one piece because free() must be able to free it all.) [NB: not true in perl]
1717  *
1718  * Beware that the optimization-preparation code in here knows about some
1719  * of the structure of the compiled regexp.  [I'll say.]
1720  */
1721 regexp *
1722 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1723 {
1724     register regexp *r;
1725     regnode *scan;
1726     regnode *first;
1727     I32 flags;
1728     I32 minlen = 0;
1729     I32 sawplus = 0;
1730     I32 sawopen = 0;
1731     scan_data_t data;
1732     RExC_state_t RExC_state;
1733     RExC_state_t *pRExC_state = &RExC_state;
1734
1735     if (exp == NULL)
1736         FAIL("NULL regexp argument");
1737
1738     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1739
1740     RExC_precomp = exp;
1741     DEBUG_r({
1742          if (!PL_colorset) reginitcolors();
1743          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1744                        PL_colors[4],PL_colors[5],PL_colors[0],
1745                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1746     });
1747     RExC_flags16 = pm->op_pmflags;
1748     RExC_sawback = 0;
1749
1750     RExC_seen = 0;
1751     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1752     RExC_seen_evals = 0;
1753     RExC_extralen = 0;
1754
1755     /* First pass: determine size, legality. */
1756     RExC_parse = exp;
1757     RExC_start = exp;
1758     RExC_end = xend;
1759     RExC_naughty = 0;
1760     RExC_npar = 1;
1761     RExC_size = 0L;
1762     RExC_emit = &PL_regdummy;
1763     RExC_whilem_seen = 0;
1764 #if 0 /* REGC() is (currently) a NOP at the first pass.
1765        * Clever compilers notice this and complain. --jhi */
1766     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1767 #endif
1768     if (reg(pRExC_state, 0, &flags) == NULL) {
1769         RExC_precomp = Nullch;
1770         return(NULL);
1771     }
1772     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1773
1774     /* Small enough for pointer-storage convention?
1775        If extralen==0, this means that we will not need long jumps. */
1776     if (RExC_size >= 0x10000L && RExC_extralen)
1777         RExC_size += RExC_extralen;
1778     else
1779         RExC_extralen = 0;
1780     if (RExC_whilem_seen > 15)
1781         RExC_whilem_seen = 15;
1782
1783     /* Allocate space and initialize. */
1784     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1785          char, regexp);
1786     if (r == NULL)
1787         FAIL("Regexp out of space");
1788
1789 #ifdef DEBUGGING
1790     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1791     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1792 #endif
1793     r->refcnt = 1;
1794     r->prelen = xend - exp;
1795     r->precomp = savepvn(RExC_precomp, r->prelen);
1796     r->subbeg = NULL;
1797     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1798     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1799
1800     r->substrs = 0;                     /* Useful during FAIL. */
1801     r->startp = 0;                      /* Useful during FAIL. */
1802     r->endp = 0;                        /* Useful during FAIL. */
1803
1804     Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1805     if (r->offsets) {
1806       r->offsets[0] = RExC_size; 
1807     }
1808     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1809                           "%s %"UVuf" bytes for offset annotations.\n", 
1810                           r->offsets ? "Got" : "Couldn't get", 
1811                           (UV)((2*RExC_size+1) * sizeof(U32))));
1812
1813     RExC_rx = r;
1814
1815     /* Second pass: emit code. */
1816     RExC_flags16 = pm->op_pmflags;      /* don't let top level (?i) bleed */
1817     RExC_parse = exp;
1818     RExC_end = xend;
1819     RExC_naughty = 0;
1820     RExC_npar = 1;
1821     RExC_emit_start = r->program;
1822     RExC_emit = r->program;
1823     /* Store the count of eval-groups for security checks: */
1824     RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1825     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1826     r->data = 0;
1827     if (reg(pRExC_state, 0, &flags) == NULL)
1828         return(NULL);
1829
1830     /* Dig out information for optimizations. */
1831     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1832     pm->op_pmflags = RExC_flags16;
1833     if (UTF)
1834         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1835     r->regstclass = NULL;
1836     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1837         r->reganch |= ROPT_NAUGHTY;
1838     scan = r->program + 1;              /* First BRANCH. */
1839
1840     /* XXXX To minimize changes to RE engine we always allocate
1841        3-units-long substrs field. */
1842     Newz(1004, r->substrs, 1, struct reg_substr_data);
1843
1844     StructCopy(&zero_scan_data, &data, scan_data_t);
1845     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1846     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1847         I32 fake;
1848         STRLEN longest_float_length, longest_fixed_length;
1849         struct regnode_charclass_class ch_class;
1850         int stclass_flag;
1851         I32 last_close = 0;
1852
1853         first = scan;
1854         /* Skip introductions and multiplicators >= 1. */
1855         while ((OP(first) == OPEN && (sawopen = 1)) ||
1856                /* An OR of *one* alternative - should not happen now. */
1857             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1858             (OP(first) == PLUS) ||
1859             (OP(first) == MINMOD) ||
1860                /* An {n,m} with n>0 */
1861             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1862                 if (OP(first) == PLUS)
1863                     sawplus = 1;
1864                 else
1865                     first += regarglen[(U8)OP(first)];
1866                 first = NEXTOPER(first);
1867         }
1868
1869         /* Starting-point info. */
1870       again:
1871         if (PL_regkind[(U8)OP(first)] == EXACT) {
1872             if (OP(first) == EXACT)
1873                 ;       /* Empty, get anchored substr later. */
1874             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1875                 r->regstclass = first;
1876         }
1877         else if (strchr((char*)PL_simple,OP(first)))
1878             r->regstclass = first;
1879         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1880                  PL_regkind[(U8)OP(first)] == NBOUND)
1881             r->regstclass = first;
1882         else if (PL_regkind[(U8)OP(first)] == BOL) {
1883             r->reganch |= (OP(first) == MBOL
1884                            ? ROPT_ANCH_MBOL
1885                            : (OP(first) == SBOL
1886                               ? ROPT_ANCH_SBOL
1887                               : ROPT_ANCH_BOL));
1888             first = NEXTOPER(first);
1889             goto again;
1890         }
1891         else if (OP(first) == GPOS) {
1892             r->reganch |= ROPT_ANCH_GPOS;
1893             first = NEXTOPER(first);
1894             goto again;
1895         }
1896         else if (!sawopen && (OP(first) == STAR &&
1897             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1898             !(r->reganch & ROPT_ANCH) )
1899         {
1900             /* turn .* into ^.* with an implied $*=1 */
1901             int type = OP(NEXTOPER(first));
1902
1903             if (type == REG_ANY)
1904                 type = ROPT_ANCH_MBOL;
1905             else
1906                 type = ROPT_ANCH_SBOL;
1907
1908             r->reganch |= type | ROPT_IMPLICIT;
1909             first = NEXTOPER(first);
1910             goto again;
1911         }
1912         if (sawplus && (!sawopen || !RExC_sawback)
1913             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1914             /* x+ must match at the 1st pos of run of x's */
1915             r->reganch |= ROPT_SKIP;
1916
1917         /* Scan is after the zeroth branch, first is atomic matcher. */
1918         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1919                               (IV)(first - scan + 1)));
1920         /*
1921         * If there's something expensive in the r.e., find the
1922         * longest literal string that must appear and make it the
1923         * regmust.  Resolve ties in favor of later strings, since
1924         * the regstart check works with the beginning of the r.e.
1925         * and avoiding duplication strengthens checking.  Not a
1926         * strong reason, but sufficient in the absence of others.
1927         * [Now we resolve ties in favor of the earlier string if
1928         * it happens that c_offset_min has been invalidated, since the
1929         * earlier string may buy us something the later one won't.]
1930         */
1931         minlen = 0;
1932
1933         data.longest_fixed = newSVpvn("",0);
1934         data.longest_float = newSVpvn("",0);
1935         data.last_found = newSVpvn("",0);
1936         data.longest = &(data.longest_fixed);
1937         first = scan;
1938         if (!r->regstclass) {
1939             cl_init(pRExC_state, &ch_class);
1940             data.start_class = &ch_class;
1941             stclass_flag = SCF_DO_STCLASS_AND;
1942         } else                          /* XXXX Check for BOUND? */
1943             stclass_flag = 0;
1944         data.last_closep = &last_close;
1945
1946         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1947                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1948         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1949              && data.last_start_min == 0 && data.last_end > 0
1950              && !RExC_seen_zerolen
1951              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1952             r->reganch |= ROPT_CHECK_ALL;
1953         scan_commit(pRExC_state, &data);
1954         SvREFCNT_dec(data.last_found);
1955
1956         longest_float_length = CHR_SVLEN(data.longest_float);
1957         if (longest_float_length
1958             || (data.flags & SF_FL_BEFORE_EOL
1959                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1960                     || (RExC_flags16 & PMf_MULTILINE)))) {
1961             int t;
1962
1963             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1964                 && data.offset_fixed == data.offset_float_min
1965                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1966                     goto remove_float;          /* As in (a)+. */
1967
1968             if (SvUTF8(data.longest_float)) {
1969                 r->float_utf8 = data.longest_float;
1970                 r->float_substr = Nullsv;
1971             } else {
1972                 r->float_substr = data.longest_float;
1973                 r->float_utf8 = Nullsv;
1974             }
1975             r->float_min_offset = data.offset_float_min;
1976             r->float_max_offset = data.offset_float_max;
1977             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1978                        && (!(data.flags & SF_FL_BEFORE_MEOL)
1979                            || (RExC_flags16 & PMf_MULTILINE)));
1980             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1981         }
1982         else {
1983           remove_float:
1984             r->float_substr = r->float_utf8 = Nullsv;
1985             SvREFCNT_dec(data.longest_float);
1986             longest_float_length = 0;
1987         }
1988
1989         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1990         if (longest_fixed_length
1991             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1992                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1993                     || (RExC_flags16 & PMf_MULTILINE)))) {
1994             int t;
1995
1996             if (SvUTF8(data.longest_fixed)) {
1997                 r->anchored_utf8 = data.longest_fixed;
1998                 r->anchored_substr = Nullsv;
1999             } else {
2000                 r->anchored_substr = data.longest_fixed;
2001                 r->anchored_utf8 = Nullsv;
2002             }
2003             r->anchored_offset = data.offset_fixed;
2004             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2005                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
2006                      || (RExC_flags16 & PMf_MULTILINE)));
2007             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2008         }
2009         else {
2010             r->anchored_substr = r->anchored_utf8 = Nullsv;
2011             SvREFCNT_dec(data.longest_fixed);
2012             longest_fixed_length = 0;
2013         }
2014         if (r->regstclass
2015             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2016             r->regstclass = NULL;
2017         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2018             && stclass_flag
2019             && !(data.start_class->flags & ANYOF_EOS)
2020             && !cl_is_anything(data.start_class)) {
2021             I32 n = add_data(pRExC_state, 1, "f");
2022
2023             New(1006, RExC_rx->data->data[n], 1,
2024                 struct regnode_charclass_class);
2025             StructCopy(data.start_class,
2026                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2027                        struct regnode_charclass_class);
2028             r->regstclass = (regnode*)RExC_rx->data->data[n];
2029             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2030             PL_regdata = r->data; /* for regprop() */
2031             DEBUG_r({ SV *sv = sv_newmortal();
2032                       regprop(sv, (regnode*)data.start_class);
2033                       PerlIO_printf(Perl_debug_log,
2034                                     "synthetic stclass `%s'.\n",
2035                                     SvPVX(sv));});
2036         }
2037
2038         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2039         if (longest_fixed_length > longest_float_length) {
2040             r->check_substr = r->anchored_substr;
2041             r->check_utf8 = r->anchored_utf8;
2042             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2043             if (r->reganch & ROPT_ANCH_SINGLE)
2044                 r->reganch |= ROPT_NOSCAN;
2045         }
2046         else {
2047             r->check_substr = r->float_substr;
2048             r->check_utf8 = r->float_utf8;
2049             r->check_offset_min = data.offset_float_min;
2050             r->check_offset_max = data.offset_float_max;
2051         }
2052         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2053            This should be changed ASAP!  */
2054         if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2055             r->reganch |= RE_USE_INTUIT;
2056             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2057                 r->reganch |= RE_INTUIT_TAIL;
2058         }
2059     }
2060     else {
2061         /* Several toplevels. Best we can is to set minlen. */
2062         I32 fake;
2063         struct regnode_charclass_class ch_class;
2064         I32 last_close = 0;
2065         
2066         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2067         scan = r->program + 1;
2068         cl_init(pRExC_state, &ch_class);
2069         data.start_class = &ch_class;
2070         data.last_closep = &last_close;
2071         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2072         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2073                 = r->float_substr = r->float_utf8 = Nullsv;
2074         if (!(data.start_class->flags & ANYOF_EOS)
2075             && !cl_is_anything(data.start_class)) {
2076             I32 n = add_data(pRExC_state, 1, "f");
2077
2078             New(1006, RExC_rx->data->data[n], 1,
2079                 struct regnode_charclass_class);
2080             StructCopy(data.start_class,
2081                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2082                        struct regnode_charclass_class);
2083             r->regstclass = (regnode*)RExC_rx->data->data[n];
2084             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2085             DEBUG_r({ SV* sv = sv_newmortal();
2086                       regprop(sv, (regnode*)data.start_class);
2087                       PerlIO_printf(Perl_debug_log,
2088                                     "synthetic stclass `%s'.\n",
2089                                     SvPVX(sv));});
2090         }
2091     }
2092
2093     r->minlen = minlen;
2094     if (RExC_seen & REG_SEEN_GPOS)
2095         r->reganch |= ROPT_GPOS_SEEN;
2096     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2097         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2098     if (RExC_seen & REG_SEEN_EVAL)
2099         r->reganch |= ROPT_EVAL_SEEN;
2100     if (RExC_seen & REG_SEEN_CANY)
2101         r->reganch |= ROPT_CANY_SEEN;
2102     Newz(1002, r->startp, RExC_npar, I32);
2103     Newz(1002, r->endp, RExC_npar, I32);
2104     PL_regdata = r->data; /* for regprop() */
2105     DEBUG_r(regdump(r));
2106     return(r);
2107 }
2108
2109 /*
2110  - reg - regular expression, i.e. main body or parenthesized thing
2111  *
2112  * Caller must absorb opening parenthesis.
2113  *
2114  * Combining parenthesis handling with the base level of regular expression
2115  * is a trifle forced, but the need to tie the tails of the branches to what
2116  * follows makes it hard to avoid.
2117  */
2118 STATIC regnode *
2119 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2120     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2121 {
2122     register regnode *ret;              /* Will be the head of the group. */
2123     register regnode *br;
2124     register regnode *lastbr;
2125     register regnode *ender = 0;
2126     register I32 parno = 0;
2127     I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
2128
2129     /* for (?g), (?gc), and (?o) warnings; warning
2130        about (?c) will warn about (?g) -- japhy    */
2131
2132     I32 wastedflags = 0x00,
2133         wasted_o    = 0x01,
2134         wasted_g    = 0x02,
2135         wasted_gc   = 0x02 | 0x04,
2136         wasted_c    = 0x04;
2137
2138     char * parse_start = RExC_parse; /* MJD */
2139     char *oregcomp_parse = RExC_parse;
2140     char c;
2141
2142     *flagp = 0;                         /* Tentatively. */
2143
2144
2145     /* Make an OPEN node, if parenthesized. */
2146     if (paren) {
2147         if (*RExC_parse == '?') { /* (?...) */
2148             U16 posflags = 0, negflags = 0;
2149             U16 *flagsp = &posflags;
2150             int logical = 0;
2151             char *seqstart = RExC_parse;
2152
2153             RExC_parse++;
2154             paren = *RExC_parse++;
2155             ret = NULL;                 /* For look-ahead/behind. */
2156             switch (paren) {
2157             case '<':           /* (?<...) */
2158                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2159                 if (*RExC_parse == '!')
2160                     paren = ',';
2161                 if (*RExC_parse != '=' && *RExC_parse != '!')
2162                     goto unknown;
2163                 RExC_parse++;
2164             case '=':           /* (?=...) */
2165             case '!':           /* (?!...) */
2166                 RExC_seen_zerolen++;
2167             case ':':           /* (?:...) */
2168             case '>':           /* (?>...) */
2169                 break;
2170             case '$':           /* (?$...) */
2171             case '@':           /* (?@...) */
2172                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2173                 break;
2174             case '#':           /* (?#...) */
2175                 while (*RExC_parse && *RExC_parse != ')')
2176                     RExC_parse++;
2177                 if (*RExC_parse != ')')
2178                     FAIL("Sequence (?#... not terminated");
2179                 nextchar(pRExC_state);
2180                 *flagp = TRYAGAIN;
2181                 return NULL;
2182             case 'p':           /* (?p...) */
2183                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2184                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2185                 /* FALL THROUGH*/
2186             case '?':           /* (??...) */
2187                 logical = 1;
2188                 if (*RExC_parse != '{')
2189                     goto unknown;
2190                 paren = *RExC_parse++;
2191                 /* FALL THROUGH */
2192             case '{':           /* (?{...}) */
2193             {
2194                 I32 count = 1, n = 0;
2195                 char c;
2196                 char *s = RExC_parse;
2197                 SV *sv;
2198                 OP_4tree *sop, *rop;
2199
2200                 RExC_seen_zerolen++;
2201                 RExC_seen |= REG_SEEN_EVAL;
2202                 while (count && (c = *RExC_parse)) {
2203                     if (c == '\\' && RExC_parse[1])
2204                         RExC_parse++;
2205                     else if (c == '{')
2206                         count++;
2207                     else if (c == '}')
2208                         count--;
2209                     RExC_parse++;
2210                 }
2211                 if (*RExC_parse != ')')
2212                 {
2213                     RExC_parse = s;             
2214                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2215                 }
2216                 if (!SIZE_ONLY) {
2217                     AV *av;
2218                 
2219                     if (RExC_parse - 1 - s)
2220                         sv = newSVpvn(s, RExC_parse - 1 - s);
2221                     else
2222                         sv = newSVpvn("", 0);
2223
2224                     ENTER;
2225                     Perl_save_re_context(aTHX);
2226                     rop = sv_compile_2op(sv, &sop, "re", &av);
2227                     sop->op_private |= OPpREFCOUNTED;
2228                     /* re_dup will OpREFCNT_inc */
2229                     OpREFCNT_set(sop, 1);
2230                     LEAVE;
2231
2232                     n = add_data(pRExC_state, 3, "nop");
2233                     RExC_rx->data->data[n] = (void*)rop;
2234                     RExC_rx->data->data[n+1] = (void*)sop;
2235                     RExC_rx->data->data[n+2] = (void*)av;
2236                     SvREFCNT_dec(sv);
2237                 }
2238                 else {                                          /* First pass */
2239                     if (PL_reginterp_cnt < ++RExC_seen_evals
2240                         && PL_curcop != &PL_compiling)
2241                         /* No compiled RE interpolated, has runtime
2242                            components ===> unsafe.  */
2243                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2244                     if (PL_tainting && PL_tainted)
2245                         FAIL("Eval-group in insecure regular expression");
2246                 }
2247                 
2248                 nextchar(pRExC_state);
2249                 if (logical) {
2250                     ret = reg_node(pRExC_state, LOGICAL);
2251                     if (!SIZE_ONLY)
2252                         ret->flags = 2;
2253                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2254                     /* deal with the length of this later - MJD */
2255                     return ret;
2256                 }
2257                 return reganode(pRExC_state, EVAL, n);
2258             }
2259             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2260             {
2261                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2262                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2263                         || RExC_parse[1] == '<'
2264                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2265                         I32 flag;
2266                         
2267                         ret = reg_node(pRExC_state, LOGICAL);
2268                         if (!SIZE_ONLY)
2269                             ret->flags = 1;
2270                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2271                         goto insert_if;
2272                     }
2273                 }
2274                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2275                     /* (?(1)...) */
2276                     parno = atoi(RExC_parse++);
2277
2278                     while (isDIGIT(*RExC_parse))
2279                         RExC_parse++;
2280                     ret = reganode(pRExC_state, GROUPP, parno);
2281                     
2282                     if ((c = *nextchar(pRExC_state)) != ')')
2283                         vFAIL("Switch condition not recognized");
2284                   insert_if:
2285                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2286                     br = regbranch(pRExC_state, &flags, 1);
2287                     if (br == NULL)
2288                         br = reganode(pRExC_state, LONGJMP, 0);
2289                     else
2290                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2291                     c = *nextchar(pRExC_state);
2292                     if (flags&HASWIDTH)
2293                         *flagp |= HASWIDTH;
2294                     if (c == '|') {
2295                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2296                         regbranch(pRExC_state, &flags, 1);
2297                         regtail(pRExC_state, ret, lastbr);
2298                         if (flags&HASWIDTH)
2299                             *flagp |= HASWIDTH;
2300                         c = *nextchar(pRExC_state);
2301                     }
2302                     else
2303                         lastbr = NULL;
2304                     if (c != ')')
2305                         vFAIL("Switch (?(condition)... contains too many branches");
2306                     ender = reg_node(pRExC_state, TAIL);
2307                     regtail(pRExC_state, br, ender);
2308                     if (lastbr) {
2309                         regtail(pRExC_state, lastbr, ender);
2310                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2311                     }
2312                     else
2313                         regtail(pRExC_state, ret, ender);
2314                     return ret;
2315                 }
2316                 else {
2317                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2318                 }
2319             }
2320             case 0:
2321                 RExC_parse--; /* for vFAIL to print correctly */
2322                 vFAIL("Sequence (? incomplete");
2323                 break;
2324             default:
2325                 --RExC_parse;
2326               parse_flags:      /* (?i) */
2327                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2328                     /* (?g), (?gc) and (?o) are useless here
2329                        and must be globally applied -- japhy */
2330
2331                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2332                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2333                             I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2334                             if (! (wastedflags & wflagbit) ) {
2335                                 wastedflags |= wflagbit;
2336                                 vWARN5(
2337                                     RExC_parse + 1,
2338                                     "Useless (%s%c) - %suse /%c modifier",
2339                                     flagsp == &negflags ? "?-" : "?",
2340                                     *RExC_parse,
2341                                     flagsp == &negflags ? "don't " : "",
2342                                     *RExC_parse
2343                                 );
2344                             }
2345                         }
2346                     }
2347                     else if (*RExC_parse == 'c') {
2348                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2349                             if (! (wastedflags & wasted_c) ) {
2350                                 wastedflags |= wasted_gc;
2351                                 vWARN3(
2352                                     RExC_parse + 1,
2353                                     "Useless (%sc) - %suse /gc modifier",
2354                                     flagsp == &negflags ? "?-" : "?",
2355                                     flagsp == &negflags ? "don't " : ""
2356                                 );
2357                             }
2358                         }
2359                     }
2360                     else { pmflag(flagsp, *RExC_parse); }
2361
2362                     ++RExC_parse;
2363                 }
2364                 if (*RExC_parse == '-') {
2365                     flagsp = &negflags;
2366                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2367                     ++RExC_parse;
2368                     goto parse_flags;
2369                 }
2370                 RExC_flags16 |= posflags;
2371                 RExC_flags16 &= ~negflags;
2372                 if (*RExC_parse == ':') {
2373                     RExC_parse++;
2374                     paren = ':';
2375                     break;
2376                 }               
2377               unknown:
2378                 if (*RExC_parse != ')') {
2379                     RExC_parse++;
2380                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2381                 }
2382                 nextchar(pRExC_state);
2383                 *flagp = TRYAGAIN;
2384                 return NULL;
2385             }
2386         }
2387         else {                  /* (...) */
2388             parno = RExC_npar;
2389             RExC_npar++;
2390             ret = reganode(pRExC_state, OPEN, parno);
2391             Set_Node_Length(ret, 1); /* MJD */
2392             Set_Node_Offset(ret, RExC_parse); /* MJD */
2393             open = 1;
2394         }
2395     }
2396     else                        /* ! paren */
2397         ret = NULL;
2398
2399     /* Pick up the branches, linking them together. */
2400     parse_start = RExC_parse;   /* MJD */
2401     br = regbranch(pRExC_state, &flags, 1);
2402     /*     branch_len = (paren != 0); */
2403     
2404     if (br == NULL)
2405         return(NULL);
2406     if (*RExC_parse == '|') {
2407         if (!SIZE_ONLY && RExC_extralen) {
2408             reginsert(pRExC_state, BRANCHJ, br);
2409         }
2410         else {                  /* MJD */
2411             reginsert(pRExC_state, BRANCH, br);
2412             Set_Node_Length(br, paren != 0);
2413             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2414         }
2415         have_branch = 1;
2416         if (SIZE_ONLY)
2417             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2418     }
2419     else if (paren == ':') {
2420         *flagp |= flags&SIMPLE;
2421     }
2422     if (open) {                         /* Starts with OPEN. */
2423         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2424     }
2425     else if (paren != '?')              /* Not Conditional */
2426         ret = br;
2427     *flagp |= flags & (SPSTART | HASWIDTH);
2428     lastbr = br;
2429     while (*RExC_parse == '|') {
2430         if (!SIZE_ONLY && RExC_extralen) {
2431             ender = reganode(pRExC_state, LONGJMP,0);
2432             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2433         }
2434         if (SIZE_ONLY)
2435             RExC_extralen += 2;         /* Account for LONGJMP. */
2436         nextchar(pRExC_state);
2437         br = regbranch(pRExC_state, &flags, 0);
2438         
2439         if (br == NULL)
2440             return(NULL);
2441         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2442         lastbr = br;
2443         if (flags&HASWIDTH)
2444             *flagp |= HASWIDTH;
2445         *flagp |= flags&SPSTART;
2446     }
2447
2448     if (have_branch || paren != ':') {
2449         /* Make a closing node, and hook it on the end. */
2450         switch (paren) {
2451         case ':':
2452             ender = reg_node(pRExC_state, TAIL);
2453             break;
2454         case 1:
2455             ender = reganode(pRExC_state, CLOSE, parno);
2456             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2457             Set_Node_Length(ender,1); /* MJD */
2458             break;
2459         case '<':
2460         case ',':
2461         case '=':
2462         case '!':
2463             *flagp &= ~HASWIDTH;
2464             /* FALL THROUGH */
2465         case '>':
2466             ender = reg_node(pRExC_state, SUCCEED);
2467             break;
2468         case 0:
2469             ender = reg_node(pRExC_state, END);
2470             break;
2471         }
2472         regtail(pRExC_state, lastbr, ender);
2473
2474         if (have_branch) {
2475             /* Hook the tails of the branches to the closing node. */
2476             for (br = ret; br != NULL; br = regnext(br)) {
2477                 regoptail(pRExC_state, br, ender);
2478             }
2479         }
2480     }
2481
2482     {
2483         char *p;
2484         static char parens[] = "=!<,>";
2485
2486         if (paren && (p = strchr(parens, paren))) {
2487             int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2488             int flag = (p - parens) > 1;
2489
2490             if (paren == '>')
2491                 node = SUSPEND, flag = 0;
2492             reginsert(pRExC_state, node,ret);
2493             ret->flags = flag;
2494             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2495         }
2496     }
2497
2498     /* Check for proper termination. */
2499     if (paren) {
2500         RExC_flags16 = oregflags;
2501         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2502             RExC_parse = oregcomp_parse;
2503             vFAIL("Unmatched (");
2504         }
2505     }
2506     else if (!paren && RExC_parse < RExC_end) {
2507         if (*RExC_parse == ')') {
2508             RExC_parse++;
2509             vFAIL("Unmatched )");
2510         }
2511         else
2512             FAIL("Junk on end of regexp");      /* "Can't happen". */
2513         /* NOTREACHED */
2514     }
2515
2516     return(ret);
2517 }
2518
2519 /*
2520  - regbranch - one alternative of an | operator
2521  *
2522  * Implements the concatenation operator.
2523  */
2524 STATIC regnode *
2525 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2526 {
2527     register regnode *ret;
2528     register regnode *chain = NULL;
2529     register regnode *latest;
2530     I32 flags = 0, c = 0;
2531
2532     if (first)
2533         ret = NULL;
2534     else {
2535         if (!SIZE_ONLY && RExC_extralen)
2536             ret = reganode(pRExC_state, BRANCHJ,0);
2537         else {
2538             ret = reg_node(pRExC_state, BRANCH);
2539             Set_Node_Length(ret, 1);
2540         }
2541     }
2542         
2543     if (!first && SIZE_ONLY)
2544         RExC_extralen += 1;                     /* BRANCHJ */
2545
2546     *flagp = WORST;                     /* Tentatively. */
2547
2548     RExC_parse--;
2549     nextchar(pRExC_state);
2550     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2551         flags &= ~TRYAGAIN;
2552         latest = regpiece(pRExC_state, &flags);
2553         if (latest == NULL) {
2554             if (flags & TRYAGAIN)
2555                 continue;
2556             return(NULL);
2557         }
2558         else if (ret == NULL)
2559             ret = latest;
2560         *flagp |= flags&HASWIDTH;
2561         if (chain == NULL)      /* First piece. */
2562             *flagp |= flags&SPSTART;
2563         else {
2564             RExC_naughty++;
2565             regtail(pRExC_state, chain, latest);
2566         }
2567         chain = latest;
2568         c++;
2569     }
2570     if (chain == NULL) {        /* Loop ran zero times. */
2571         chain = reg_node(pRExC_state, NOTHING);
2572         if (ret == NULL)
2573             ret = chain;
2574     }
2575     if (c == 1) {
2576         *flagp |= flags&SIMPLE;
2577     }
2578
2579     return(ret);
2580 }
2581
2582 /*
2583  - regpiece - something followed by possible [*+?]
2584  *
2585  * Note that the branching code sequences used for ? and the general cases
2586  * of * and + are somewhat optimized:  they use the same NOTHING node as
2587  * both the endmarker for their branch list and the body of the last branch.
2588  * It might seem that this node could be dispensed with entirely, but the
2589  * endmarker role is not redundant.
2590  */
2591 STATIC regnode *
2592 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2593 {
2594     register regnode *ret;
2595     register char op;
2596     register char *next;
2597     I32 flags;
2598     char *origparse = RExC_parse;
2599     char *maxpos;
2600     I32 min;
2601     I32 max = REG_INFTY;
2602     char *parse_start;
2603
2604     ret = regatom(pRExC_state, &flags);
2605     if (ret == NULL) {
2606         if (flags & TRYAGAIN)
2607             *flagp |= TRYAGAIN;
2608         return(NULL);
2609     }
2610
2611     op = *RExC_parse;
2612
2613     if (op == '{' && regcurly(RExC_parse)) {
2614         parse_start = RExC_parse; /* MJD */
2615         next = RExC_parse + 1;
2616         maxpos = Nullch;
2617         while (isDIGIT(*next) || *next == ',') {
2618             if (*next == ',') {
2619                 if (maxpos)
2620                     break;
2621                 else
2622                     maxpos = next;
2623             }
2624             next++;
2625         }
2626         if (*next == '}') {             /* got one */
2627             if (!maxpos)
2628                 maxpos = next;
2629             RExC_parse++;
2630             min = atoi(RExC_parse);
2631             if (*maxpos == ',')
2632                 maxpos++;
2633             else
2634                 maxpos = RExC_parse;
2635             max = atoi(maxpos);
2636             if (!max && *maxpos != '0')
2637                 max = REG_INFTY;                /* meaning "infinity" */
2638             else if (max >= REG_INFTY)
2639                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2640             RExC_parse = next;
2641             nextchar(pRExC_state);
2642
2643         do_curly:
2644             if ((flags&SIMPLE)) {
2645                 RExC_naughty += 2 + RExC_naughty / 2;
2646                 reginsert(pRExC_state, CURLY, ret);
2647                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2648                 Set_Node_Cur_Length(ret);
2649             }
2650             else {
2651                 regnode *w = reg_node(pRExC_state, WHILEM);
2652
2653                 w->flags = 0;
2654                 regtail(pRExC_state, ret, w);
2655                 if (!SIZE_ONLY && RExC_extralen) {
2656                     reginsert(pRExC_state, LONGJMP,ret);
2657                     reginsert(pRExC_state, NOTHING,ret);
2658                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2659                 }
2660                 reginsert(pRExC_state, CURLYX,ret);
2661                                 /* MJD hk */
2662                 Set_Node_Offset(ret, parse_start+1);
2663                 Set_Node_Length(ret, 
2664                                 op == '{' ? (RExC_parse - parse_start) : 1);
2665                 
2666                 if (!SIZE_ONLY && RExC_extralen)
2667                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2668                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2669                 if (SIZE_ONLY)
2670                     RExC_whilem_seen++, RExC_extralen += 3;
2671                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2672             }
2673             ret->flags = 0;
2674
2675             if (min > 0)
2676                 *flagp = WORST;
2677             if (max > 0)
2678                 *flagp |= HASWIDTH;
2679             if (max && max < min)
2680                 vFAIL("Can't do {n,m} with n > m");
2681             if (!SIZE_ONLY) {
2682                 ARG1_SET(ret, min);
2683                 ARG2_SET(ret, max);
2684             }
2685
2686             goto nest_check;
2687         }
2688     }
2689
2690     if (!ISMULT1(op)) {
2691         *flagp = flags;
2692         return(ret);
2693     }
2694
2695 #if 0                           /* Now runtime fix should be reliable. */
2696
2697     /* if this is reinstated, don't forget to put this back into perldiag:
2698
2699             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2700
2701            (F) The part of the regexp subject to either the * or + quantifier
2702            could match an empty string. The {#} shows in the regular
2703            expression about where the problem was discovered.
2704
2705     */
2706
2707     if (!(flags&HASWIDTH) && op != '?')
2708       vFAIL("Regexp *+ operand could be empty");
2709 #endif
2710
2711     parse_start = RExC_parse;
2712     nextchar(pRExC_state);
2713
2714     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2715
2716     if (op == '*' && (flags&SIMPLE)) {
2717         reginsert(pRExC_state, STAR, ret);
2718         ret->flags = 0;
2719         RExC_naughty += 4;
2720     }
2721     else if (op == '*') {
2722         min = 0;
2723         goto do_curly;
2724     }
2725     else if (op == '+' && (flags&SIMPLE)) {
2726         reginsert(pRExC_state, PLUS, ret);
2727         ret->flags = 0;
2728         RExC_naughty += 3;
2729     }
2730     else if (op == '+') {
2731         min = 1;
2732         goto do_curly;
2733     }
2734     else if (op == '?') {
2735         min = 0; max = 1;
2736         goto do_curly;
2737     }
2738   nest_check:
2739     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2740         vWARN3(RExC_parse,
2741                "%.*s matches null string many times",
2742                RExC_parse - origparse,
2743                origparse);
2744     }
2745
2746     if (*RExC_parse == '?') {
2747         nextchar(pRExC_state);
2748         reginsert(pRExC_state, MINMOD, ret);
2749         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2750     }
2751     if (ISMULT2(RExC_parse)) {
2752         RExC_parse++;
2753         vFAIL("Nested quantifiers");
2754     }
2755
2756     return(ret);
2757 }
2758
2759 /*
2760  - regatom - the lowest level
2761  *
2762  * Optimization:  gobbles an entire sequence of ordinary characters so that
2763  * it can turn them into a single node, which is smaller to store and
2764  * faster to run.  Backslashed characters are exceptions, each becoming a
2765  * separate node; the code is simpler that way and it's not worth fixing.
2766  *
2767  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2768 STATIC regnode *
2769 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2770 {
2771     register regnode *ret = 0;
2772     I32 flags;
2773     char *parse_start = 0;
2774
2775     *flagp = WORST;             /* Tentatively. */
2776
2777 tryagain:
2778     switch (*RExC_parse) {
2779     case '^':
2780         RExC_seen_zerolen++;
2781         nextchar(pRExC_state);
2782         if (RExC_flags16 & PMf_MULTILINE)
2783             ret = reg_node(pRExC_state, MBOL);
2784         else if (RExC_flags16 & PMf_SINGLELINE)
2785             ret = reg_node(pRExC_state, SBOL);
2786         else
2787             ret = reg_node(pRExC_state, BOL);
2788         Set_Node_Length(ret, 1); /* MJD */
2789         break;
2790     case '$':
2791         nextchar(pRExC_state);
2792         if (*RExC_parse)
2793             RExC_seen_zerolen++;
2794         if (RExC_flags16 & PMf_MULTILINE)
2795             ret = reg_node(pRExC_state, MEOL);
2796         else if (RExC_flags16 & PMf_SINGLELINE)
2797             ret = reg_node(pRExC_state, SEOL);
2798         else
2799             ret = reg_node(pRExC_state, EOL);
2800         Set_Node_Length(ret, 1); /* MJD */
2801         break;
2802     case '.':
2803         nextchar(pRExC_state);
2804         if (RExC_flags16 & PMf_SINGLELINE)
2805             ret = reg_node(pRExC_state, SANY);
2806         else
2807             ret = reg_node(pRExC_state, REG_ANY);
2808         *flagp |= HASWIDTH|SIMPLE;
2809         RExC_naughty++;
2810         Set_Node_Length(ret, 1); /* MJD */
2811         break;
2812     case '[':
2813     {
2814         char *oregcomp_parse = ++RExC_parse;
2815         ret = regclass(pRExC_state);
2816         if (*RExC_parse != ']') {
2817             RExC_parse = oregcomp_parse;
2818             vFAIL("Unmatched [");
2819         }
2820         nextchar(pRExC_state);
2821         *flagp |= HASWIDTH|SIMPLE;
2822         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2823         break;
2824     }
2825     case '(':
2826         nextchar(pRExC_state);
2827         ret = reg(pRExC_state, 1, &flags);
2828         if (ret == NULL) {
2829                 if (flags & TRYAGAIN) {
2830                     if (RExC_parse == RExC_end) {
2831                          /* Make parent create an empty node if needed. */
2832                         *flagp |= TRYAGAIN;
2833                         return(NULL);
2834                     }
2835                     goto tryagain;
2836                 }
2837                 return(NULL);
2838         }
2839         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2840         break;
2841     case '|':
2842     case ')':
2843         if (flags & TRYAGAIN) {
2844             *flagp |= TRYAGAIN;
2845             return NULL;
2846         }
2847         vFAIL("Internal urp");
2848                                 /* Supposed to be caught earlier. */
2849         break;
2850     case '{':
2851         if (!regcurly(RExC_parse)) {
2852             RExC_parse++;
2853             goto defchar;
2854         }
2855         /* FALL THROUGH */
2856     case '?':
2857     case '+':
2858     case '*':
2859         RExC_parse++;
2860         vFAIL("Quantifier follows nothing");
2861         break;
2862     case '\\':
2863         switch (*++RExC_parse) {
2864         case 'A':
2865             RExC_seen_zerolen++;
2866             ret = reg_node(pRExC_state, SBOL);
2867             *flagp |= SIMPLE;
2868             nextchar(pRExC_state);
2869             Set_Node_Length(ret, 2); /* MJD */
2870             break;
2871         case 'G':
2872             ret = reg_node(pRExC_state, GPOS);
2873             RExC_seen |= REG_SEEN_GPOS;
2874             *flagp |= SIMPLE;
2875             nextchar(pRExC_state);
2876             Set_Node_Length(ret, 2); /* MJD */
2877             break;
2878         case 'Z':
2879             ret = reg_node(pRExC_state, SEOL);
2880             *flagp |= SIMPLE;
2881             RExC_seen_zerolen++;                /* Do not optimize RE away */
2882             nextchar(pRExC_state);
2883             break;
2884         case 'z':
2885             ret = reg_node(pRExC_state, EOS);
2886             *flagp |= SIMPLE;
2887             RExC_seen_zerolen++;                /* Do not optimize RE away */
2888             nextchar(pRExC_state);
2889             Set_Node_Length(ret, 2); /* MJD */
2890             break;
2891         case 'C':
2892             ret = reg_node(pRExC_state, CANY);
2893             RExC_seen |= REG_SEEN_CANY;
2894             *flagp |= HASWIDTH|SIMPLE;
2895             nextchar(pRExC_state);
2896             Set_Node_Length(ret, 2); /* MJD */
2897             break;
2898         case 'X':
2899             ret = reg_node(pRExC_state, CLUMP);
2900             *flagp |= HASWIDTH;
2901             nextchar(pRExC_state);
2902             Set_Node_Length(ret, 2); /* MJD */
2903             break;
2904         case 'w':
2905             ret = reg_node(pRExC_state, LOC ? ALNUML     : ALNUM);
2906             *flagp |= HASWIDTH|SIMPLE;
2907             nextchar(pRExC_state);
2908             Set_Node_Length(ret, 2); /* MJD */
2909             break;
2910         case 'W':
2911             ret = reg_node(pRExC_state, LOC ? NALNUML     : NALNUM);
2912             *flagp |= HASWIDTH|SIMPLE;
2913             nextchar(pRExC_state);
2914             Set_Node_Length(ret, 2); /* MJD */
2915             break;
2916         case 'b':
2917             RExC_seen_zerolen++;
2918             RExC_seen |= REG_SEEN_LOOKBEHIND;
2919             ret = reg_node(pRExC_state, LOC ? BOUNDL     : BOUND);
2920             *flagp |= SIMPLE;
2921             nextchar(pRExC_state);
2922             Set_Node_Length(ret, 2); /* MJD */
2923             break;
2924         case 'B':
2925             RExC_seen_zerolen++;
2926             RExC_seen |= REG_SEEN_LOOKBEHIND;
2927             ret = reg_node(pRExC_state, LOC ? NBOUNDL     : NBOUND);
2928             *flagp |= SIMPLE;
2929             nextchar(pRExC_state);
2930             Set_Node_Length(ret, 2); /* MJD */
2931             break;
2932         case 's':
2933             ret = reg_node(pRExC_state, LOC ? SPACEL     : SPACE);
2934             *flagp |= HASWIDTH|SIMPLE;
2935             nextchar(pRExC_state);
2936             Set_Node_Length(ret, 2); /* MJD */
2937             break;
2938         case 'S':
2939             ret = reg_node(pRExC_state, LOC ? NSPACEL     : NSPACE);
2940             *flagp |= HASWIDTH|SIMPLE;
2941             nextchar(pRExC_state);
2942             Set_Node_Length(ret, 2); /* MJD */
2943             break;
2944         case 'd':
2945             ret = reg_node(pRExC_state, DIGIT);
2946             *flagp |= HASWIDTH|SIMPLE;
2947             nextchar(pRExC_state);
2948             Set_Node_Length(ret, 2); /* MJD */
2949             break;
2950         case 'D':
2951             ret = reg_node(pRExC_state, NDIGIT);
2952             *flagp |= HASWIDTH|SIMPLE;
2953             nextchar(pRExC_state);
2954             Set_Node_Length(ret, 2); /* MJD */
2955             break;
2956         case 'p':
2957         case 'P':
2958             {   
2959                 char* oldregxend = RExC_end;
2960                 char* parse_start = RExC_parse;
2961
2962                 if (RExC_parse[1] == '{') {
2963                   /* a lovely hack--pretend we saw [\pX] instead */
2964                     RExC_end = strchr(RExC_parse, '}');
2965                     if (!RExC_end) {
2966                         U8 c = (U8)*RExC_parse;
2967                         RExC_parse += 2;
2968                         RExC_end = oldregxend;
2969                         vFAIL2("Missing right brace on \\%c{}", c);
2970                     }
2971                     RExC_end++;
2972                 }
2973                 else {
2974                     RExC_end = RExC_parse + 2;
2975                     if (RExC_end > oldregxend)
2976                         RExC_end = oldregxend;
2977                 }
2978                 RExC_parse--;
2979
2980                 ret = regclass(pRExC_state);
2981
2982                 RExC_end = oldregxend;
2983                 RExC_parse--;
2984                 Set_Node_Cur_Length(ret); /* MJD */
2985                 nextchar(pRExC_state);
2986                 *flagp |= HASWIDTH|SIMPLE;
2987             }
2988             break;
2989         case 'n':
2990         case 'r':
2991         case 't':
2992         case 'f':
2993         case 'e':
2994         case 'a':
2995         case 'x':
2996         case 'c':
2997         case '0':
2998             goto defchar;
2999         case '1': case '2': case '3': case '4':
3000         case '5': case '6': case '7': case '8': case '9':
3001             {
3002                 I32 num = atoi(RExC_parse);
3003
3004                 if (num > 9 && num >= RExC_npar)
3005                     goto defchar;
3006                 else {
3007                     char * parse_start = RExC_parse - 1; /* MJD */
3008                     while (isDIGIT(*RExC_parse))
3009                         RExC_parse++;
3010
3011                     if (!SIZE_ONLY && num > RExC_rx->nparens)
3012                         vFAIL("Reference to nonexistent group");
3013                     RExC_sawback = 1;
3014                     ret = reganode(pRExC_state, FOLD
3015                                    ? (LOC ? REFFL : REFF)
3016                                    : REF, num);
3017                     *flagp |= HASWIDTH;
3018                     
3019                     /* override incorrect value set in reganode MJD */
3020                     Set_Node_Offset(ret, parse_start+1); 
3021                     Set_Node_Cur_Length(ret); /* MJD */
3022                     RExC_parse--;
3023                     nextchar(pRExC_state);
3024                 }
3025             }
3026             break;
3027         case '\0':
3028             if (RExC_parse >= RExC_end)
3029                 FAIL("Trailing \\");
3030             /* FALL THROUGH */
3031         default:
3032             /* Do not generate `unrecognized' warnings here, we fall
3033                back into the quick-grab loop below */
3034             goto defchar;
3035         }
3036         break;
3037
3038     case '#':
3039         if (RExC_flags16 & PMf_EXTENDED) {
3040             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3041             if (RExC_parse < RExC_end)
3042                 goto tryagain;
3043         }
3044         /* FALL THROUGH */
3045
3046     default: {
3047             register STRLEN len;
3048             register UV ender;
3049             register char *p;
3050             char *oldp, *s;
3051             STRLEN numlen;
3052             STRLEN foldlen;
3053             U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3054
3055             parse_start = RExC_parse - 1;
3056
3057             RExC_parse++;
3058
3059         defchar:
3060             ender = 0;
3061             ret = reg_node(pRExC_state, FOLD
3062                           ? (LOC ? EXACTFL : EXACTF)
3063                           : EXACT);
3064             s = STRING(ret);
3065             for (len = 0, p = RExC_parse - 1;
3066               len < 127 && p < RExC_end;
3067               len++)
3068             {
3069                 oldp = p;
3070
3071                 if (RExC_flags16 & PMf_EXTENDED)
3072                     p = regwhite(p, RExC_end);
3073                 switch (*p) {
3074                 case '^':
3075                 case '$':
3076                 case '.':
3077                 case '[':
3078                 case '(':
3079                 case ')':
3080                 case '|':
3081                     goto loopdone;
3082                 case '\\':
3083                     switch (*++p) {
3084                     case 'A':
3085                     case 'C':
3086                     case 'X':
3087                     case 'G':
3088                     case 'Z':
3089                     case 'z':
3090                     case 'w':
3091                     case 'W':
3092                     case 'b':
3093                     case 'B':
3094                     case 's':
3095                     case 'S':
3096                     case 'd':
3097                     case 'D':
3098                     case 'p':
3099                     case 'P':
3100                         --p;
3101                         goto loopdone;
3102                     case 'n':
3103                         ender = '\n';
3104                         p++;
3105                         break;
3106                     case 'r':
3107                         ender = '\r';
3108                         p++;
3109                         break;
3110                     case 't':
3111                         ender = '\t';
3112                         p++;
3113                         break;
3114                     case 'f':
3115                         ender = '\f';
3116                         p++;
3117                         break;
3118                     case 'e':
3119                           ender = ASCII_TO_NATIVE('\033');
3120                         p++;
3121                         break;
3122                     case 'a':
3123                           ender = ASCII_TO_NATIVE('\007');
3124                         p++;
3125                         break;
3126                     case 'x':
3127                         if (*++p == '{') {
3128                             char* e = strchr(p, '}');
3129         
3130                             if (!e) {
3131                                 RExC_parse = p + 1;
3132                                 vFAIL("Missing right brace on \\x{}");
3133                             }
3134                             else {
3135                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3136                                     | PERL_SCAN_DISALLOW_PREFIX;
3137                                 numlen = e - p - 1;
3138                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3139                                 if (ender > 0xff)
3140                                     RExC_utf8 = 1;
3141                                 /* numlen is generous */
3142                                 if (numlen + len >= 127) {
3143                                     p--;
3144                                     goto loopdone;
3145                                 }
3146                                 p = e + 1;
3147                             }
3148                         }
3149                         else {
3150                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3151                             numlen = 2;
3152                             ender = grok_hex(p, &numlen, &flags, NULL);
3153                             p += numlen;
3154                         }
3155                         break;
3156                     case 'c':
3157                         p++;
3158                         ender = UCHARAT(p++);
3159                         ender = toCTRL(ender);
3160                         break;
3161                     case '0': case '1': case '2': case '3':case '4':
3162                     case '5': case '6': case '7': case '8':case '9':
3163                         if (*p == '0' ||
3164                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3165                             I32 flags = 0;
3166                             numlen = 3;
3167                             ender = grok_oct(p, &numlen, &flags, NULL);
3168                             p += numlen;
3169                         }
3170                         else {
3171                             --p;
3172                             goto loopdone;
3173                         }
3174                         break;
3175                     case '\0':
3176                         if (p >= RExC_end)
3177                             FAIL("Trailing \\");
3178                         /* FALL THROUGH */
3179                     default:
3180                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3181                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3182                         goto normal_default;
3183                     }
3184                     break;
3185                 default:
3186                   normal_default:
3187                     if (UTF8_IS_START(*p) && UTF) {
3188                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3189                                                &numlen, 0);
3190                         p += numlen;
3191                     }
3192                     else
3193                         ender = *p++;
3194                     break;
3195                 }
3196                 if (RExC_flags16 & PMf_EXTENDED)
3197                     p = regwhite(p, RExC_end);
3198                 if (UTF && FOLD) {
3199                     /* Prime the casefolded buffer. */
3200                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3201                 }
3202                 if (ISMULT2(p)) { /* Back off on ?+*. */
3203                     if (len)
3204                         p = oldp;
3205                     else if (UTF) {
3206                          STRLEN unilen;
3207
3208                          if (FOLD) {
3209                               /* Emit all the Unicode characters. */
3210                               for (foldbuf = tmpbuf;
3211                                    foldlen;
3212                                    foldlen -= numlen) {
3213                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3214                                    if (numlen > 0) {
3215                                         reguni(pRExC_state, ender, s, &unilen);
3216                                         s       += unilen;
3217                                         len     += unilen;
3218                                         /* In EBCDIC the numlen
3219                                          * and unilen can differ. */
3220                                         foldbuf += numlen;
3221                                         if (numlen >= foldlen)
3222                                              break;
3223                                    }
3224                                    else
3225                                         break; /* "Can't happen." */
3226                               }
3227                          }
3228                          else {
3229                               reguni(pRExC_state, ender, s, &unilen);
3230                               if (unilen > 0) {
3231                                    s   += unilen;
3232                                    len += unilen;
3233                               }
3234                          }
3235                     }
3236                     else {
3237                         len++;
3238                         REGC(ender, s++);
3239                     }
3240                     break;
3241                 }
3242                 if (UTF) {
3243                      STRLEN unilen;
3244
3245                      if (FOLD) {
3246                           /* Emit all the Unicode characters. */
3247                           for (foldbuf = tmpbuf;
3248                                foldlen;
3249                                foldlen -= numlen) {
3250                                ender = utf8_to_uvchr(foldbuf, &numlen);
3251                                if (numlen > 0) {
3252                                     reguni(pRExC_state, ender, s, &unilen);
3253                                     len     += unilen;
3254                                     s       += unilen;
3255                                     /* In EBCDIC the numlen
3256                                      * and unilen can differ. */
3257                                     foldbuf += numlen;
3258                                     if (numlen >= foldlen)
3259                                          break;
3260                                }
3261                                else
3262                                     break;
3263                           }
3264                      }
3265                      else {
3266                           reguni(pRExC_state, ender, s, &unilen);
3267                           if (unilen > 0) {
3268                                s   += unilen;
3269                                len += unilen;
3270                           }
3271                      }
3272                      len--;
3273                 }
3274                 else
3275                     REGC(ender, s++);
3276             }
3277         loopdone:
3278             RExC_parse = p - 1;
3279             Set_Node_Cur_Length(ret); /* MJD */
3280             nextchar(pRExC_state);
3281             {
3282                 /* len is STRLEN which is unsigned, need to copy to signed */
3283                 IV iv = len;
3284                 if (iv < 0)
3285                     vFAIL("Internal disaster");
3286             }
3287             if (len > 0)
3288                 *flagp |= HASWIDTH;
3289             if (len == 1)
3290                 *flagp |= SIMPLE;
3291             if (!SIZE_ONLY)
3292                 STR_LEN(ret) = len;
3293             if (SIZE_ONLY)
3294                 RExC_size += STR_SZ(len);
3295             else
3296                 RExC_emit += STR_SZ(len);
3297         }
3298         break;
3299     }
3300
3301     /* If the encoding pragma is in effect recode the text of
3302      * any EXACT-kind nodes. */
3303     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3304          STRLEN oldlen = STR_LEN(ret);
3305          SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3306
3307          if (RExC_utf8)
3308               SvUTF8_on(sv);
3309          if (sv_utf8_downgrade(sv, TRUE)) {
3310               char *s       = sv_recode_to_utf8(sv, PL_encoding);
3311               STRLEN newlen = SvCUR(sv);
3312          
3313               if (!SIZE_ONLY) {
3314                    DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3315                                          (int)oldlen, STRING(ret),
3316                                          (int)newlen, s));
3317                    Copy(s, STRING(ret), newlen, char);
3318                    STR_LEN(ret) += newlen - oldlen;
3319                    RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3320               } else
3321                    RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3322          }
3323     }
3324
3325     return(ret);
3326 }
3327
3328 STATIC char *
3329 S_regwhite(pTHX_ char *p, char *e)
3330 {
3331     while (p < e) {
3332         if (isSPACE(*p))
3333             ++p;
3334         else if (*p == '#') {
3335             do {
3336                 p++;
3337             } while (p < e && *p != '\n');
3338         }
3339         else
3340             break;
3341     }
3342     return p;
3343 }
3344
3345 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3346    Character classes ([:foo:]) can also be negated ([:^foo:]).
3347    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3348    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3349    but trigger failures because they are currently unimplemented. */
3350
3351 #define POSIXCC_DONE(c)   ((c) == ':')
3352 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3353 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3354
3355 STATIC I32
3356 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3357 {
3358     char *posixcc = 0;
3359     I32 namedclass = OOB_NAMEDCLASS;
3360
3361     if (value == '[' && RExC_parse + 1 < RExC_end &&
3362         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3363         POSIXCC(UCHARAT(RExC_parse))) {
3364         char  c = UCHARAT(RExC_parse);
3365         char* s = RExC_parse++;
3366         
3367         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3368             RExC_parse++;
3369         if (RExC_parse == RExC_end)
3370             /* Grandfather lone [:, [=, [. */
3371             RExC_parse = s;
3372         else {
3373             char* t = RExC_parse++; /* skip over the c */
3374
3375             if (UCHARAT(RExC_parse) == ']') {
3376                 RExC_parse++; /* skip over the ending ] */
3377                 posixcc = s + 1;
3378                 if (*s == ':') {
3379                     I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3380                     I32 skip = 5; /* the most common skip */
3381
3382                     switch (*posixcc) {
3383                     case 'a':
3384                         if (strnEQ(posixcc, "alnum", 5))
3385                             namedclass =
3386                                 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3387                         else if (strnEQ(posixcc, "alpha", 5))
3388                             namedclass =
3389                                 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3390                         else if (strnEQ(posixcc, "ascii", 5))
3391                             namedclass =
3392                                 complement ? ANYOF_NASCII : ANYOF_ASCII;
3393                         break;
3394                     case 'b':
3395                         if (strnEQ(posixcc, "blank", 5))
3396                             namedclass =
3397                                 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3398                         break;
3399                     case 'c':
3400                         if (strnEQ(posixcc, "cntrl", 5))
3401                             namedclass =
3402                                 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3403                         break;
3404                     case 'd':
3405                         if (strnEQ(posixcc, "digit", 5))
3406                             namedclass =
3407                                 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3408                         break;
3409                     case 'g':
3410                         if (strnEQ(posixcc, "graph", 5))
3411                             namedclass =
3412                                 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3413                         break;
3414                     case 'l':
3415                         if (strnEQ(posixcc, "lower", 5))
3416                             namedclass =
3417                                 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3418                         break;
3419                     case 'p':
3420                         if (strnEQ(posixcc, "print", 5))
3421                             namedclass =
3422                                 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3423                         else if (strnEQ(posixcc, "punct", 5))
3424                             namedclass =
3425                                 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3426                         break;
3427                     case 's':
3428                         if (strnEQ(posixcc, "space", 5))
3429                             namedclass =
3430                                 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3431                         break;
3432                     case 'u':
3433                         if (strnEQ(posixcc, "upper", 5))
3434                             namedclass =
3435                                 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3436                         break;
3437                     case 'w': /* this is not POSIX, this is the Perl \w */
3438                         if (strnEQ(posixcc, "word", 4)) {
3439                             namedclass =
3440                                 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3441                             skip = 4;
3442                         }
3443                         break;
3444                     case 'x':
3445                         if (strnEQ(posixcc, "xdigit", 6)) {
3446                             namedclass =
3447                                 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3448                             skip = 6;
3449                         }
3450                         break;
3451                     }
3452                     if (namedclass == OOB_NAMEDCLASS ||
3453                         posixcc[skip] != ':' ||
3454                         posixcc[skip+1] != ']')
3455                     {
3456                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3457                                       t - s - 1, s + 1);
3458                     }
3459                 } else if (!SIZE_ONLY) {
3460                     /* [[=foo=]] and [[.foo.]] are still future. */
3461
3462                     /* adjust RExC_parse so the warning shows after
3463                        the class closes */
3464                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3465                         RExC_parse++;
3466                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3467                 }
3468             } else {
3469                 /* Maternal grandfather:
3470                  * "[:" ending in ":" but not in ":]" */
3471                 RExC_parse = s;
3472             }
3473         }
3474     }
3475
3476     return namedclass;
3477 }
3478
3479 STATIC void
3480 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3481 {
3482     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3483         char *s = RExC_parse;
3484         char  c = *s++;
3485
3486         while(*s && isALNUM(*s))
3487             s++;
3488         if (*s && c == *s && s[1] == ']') {
3489             if (ckWARN(WARN_REGEXP))
3490                 vWARN3(s+2,
3491                         "POSIX syntax [%c %c] belongs inside character classes",
3492                         c, c);
3493
3494             /* [[=foo=]] and [[.foo.]] are still future. */
3495             if (POSIXCC_NOTYET(c)) {
3496                 /* adjust RExC_parse so the error shows after
3497                    the class closes */
3498                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3499                     ;
3500                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3501             }
3502         }
3503     }
3504 }
3505
3506 STATIC regnode *
3507 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3508 {
3509     register UV value;
3510     register UV nextvalue;
3511     register IV prevvalue = OOB_UNICODE;
3512     register IV range = 0;
3513     register regnode *ret;
3514     STRLEN numlen;
3515     IV namedclass;
3516     char *rangebegin = 0;
3517     bool need_class = 0;
3518     SV *listsv = Nullsv;
3519     register char *e;
3520     UV n;
3521     bool optimize_invert   = TRUE;
3522     AV* unicode_alternate  = 0;
3523
3524     ret = reganode(pRExC_state, ANYOF, 0);
3525
3526     if (!SIZE_ONLY)
3527         ANYOF_FLAGS(ret) = 0;
3528
3529     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3530         RExC_naughty++;
3531         RExC_parse++;
3532         if (!SIZE_ONLY)
3533             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3534     }
3535
3536     if (SIZE_ONLY)
3537         RExC_size += ANYOF_SKIP;
3538     else {
3539         RExC_emit += ANYOF_SKIP;
3540         if (FOLD)
3541             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3542         if (LOC)
3543             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3544         ANYOF_BITMAP_ZERO(ret);
3545         listsv = newSVpvn("# comment\n", 10);
3546     }
3547
3548     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3549
3550     if (!SIZE_ONLY && POSIXCC(nextvalue))
3551         checkposixcc(pRExC_state);
3552
3553     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3554     if (UCHARAT(RExC_parse) == ']')
3555         goto charclassloop;
3556
3557     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3558
3559     charclassloop:
3560
3561         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3562
3563         if (!range)
3564             rangebegin = RExC_parse;
3565         if (UTF) {
3566             value = utf8n_to_uvchr((U8*)RExC_parse,
3567                                    RExC_end - RExC_parse,
3568                                    &numlen, 0);
3569             RExC_parse += numlen;
3570         }
3571         else
3572             value = UCHARAT(RExC_parse++);
3573         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3574         if (value == '[' && POSIXCC(nextvalue))
3575             namedclass = regpposixcc(pRExC_state, value);
3576         else if (value == '\\') {
3577             if (UTF) {
3578                 value = utf8n_to_uvchr((U8*)RExC_parse,
3579                                    RExC_end - RExC_parse,
3580                                    &numlen, 0);
3581                 RExC_parse += numlen;
3582             }
3583             else
3584                 value = UCHARAT(RExC_parse++);
3585             /* Some compilers cannot handle switching on 64-bit integer
3586              * values, therefore value cannot be an UV.  Yes, this will
3587              * be a problem later if we want switch on Unicode.
3588              * A similar issue a little bit later when switching on
3589              * namedclass. --jhi */
3590             switch ((I32)value) {
3591             case 'w':   namedclass = ANYOF_ALNUM;       break;
3592             case 'W':   namedclass = ANYOF_NALNUM;      break;
3593             case 's':   namedclass = ANYOF_SPACE;       break;
3594             case 'S':   namedclass = ANYOF_NSPACE;      break;
3595             case 'd':   namedclass = ANYOF_DIGIT;       break;
3596             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3597             case 'p':
3598             case 'P':
3599                 if (RExC_parse >= RExC_end)
3600                     vFAIL2("Empty \\%c{}", (U8)value);
3601                 if (*RExC_parse == '{') {
3602                     U8 c = (U8)value;
3603                     e = strchr(RExC_parse++, '}');
3604                     if (!e)
3605                         vFAIL2("Missing right brace on \\%c{}", c);
3606                     while (isSPACE(UCHARAT(RExC_parse)))
3607                         RExC_parse++;
3608                     if (e == RExC_parse)
3609                         vFAIL2("Empty \\%c{}", c);
3610                     n = e - RExC_parse;
3611                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3612                         n--;
3613                 }
3614                 else {
3615                     e = RExC_parse;
3616                     n = 1;
3617                 }
3618                 if (!SIZE_ONLY) {
3619                     if (UCHARAT(RExC_parse) == '^') {
3620                          RExC_parse++;
3621                          n--;
3622                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3623                          while (isSPACE(UCHARAT(RExC_parse))) {
3624                               RExC_parse++;
3625                               n--;
3626                          }
3627                     }
3628                     if (value == 'p')
3629                          Perl_sv_catpvf(aTHX_ listsv,
3630                                         "+utf8::%.*s\n", (int)n, RExC_parse);
3631                     else
3632                          Perl_sv_catpvf(aTHX_ listsv,
3633                                         "!utf8::%.*s\n", (int)n, RExC_parse);
3634                 }
3635                 RExC_parse = e + 1;
3636                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3637                 continue;
3638             case 'n':   value = '\n';                   break;
3639             case 'r':   value = '\r';                   break;
3640             case 't':   value = '\t';                   break;
3641             case 'f':   value = '\f';                   break;
3642             case 'b':   value = '\b';                   break;
3643             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3644             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3645             case 'x':
3646                 if (*RExC_parse == '{') {
3647                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3648                         | PERL_SCAN_DISALLOW_PREFIX;
3649                     e = strchr(RExC_parse++, '}');
3650                     if (!e)
3651                         vFAIL("Missing right brace on \\x{}");
3652
3653                     numlen = e - RExC_parse;
3654                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3655                     RExC_parse = e + 1;
3656                 }
3657                 else {
3658                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3659                     numlen = 2;
3660                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3661                     RExC_parse += numlen;
3662                 }
3663                 break;
3664             case 'c':
3665                 value = UCHARAT(RExC_parse++);
3666                 value = toCTRL(value);
3667                 break;
3668             case '0': case '1': case '2': case '3': case '4':
3669             case '5': case '6': case '7': case '8': case '9':
3670             {
3671                 I32 flags = 0;
3672                 numlen = 3;
3673                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3674                 RExC_parse += numlen;
3675                 break;
3676             }
3677             default:
3678                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3679                     vWARN2(RExC_parse,
3680                            "Unrecognized escape \\%c in character class passed through",
3681                            (int)value);
3682                 break;
3683             }
3684         } /* end of \blah */
3685
3686         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3687
3688             if (!SIZE_ONLY && !need_class)
3689                 ANYOF_CLASS_ZERO(ret);
3690
3691             need_class = 1;
3692
3693             /* a bad range like a-\d, a-[:digit:] ? */
3694             if (range) {
3695                 if (!SIZE_ONLY) {
3696                     if (ckWARN(WARN_REGEXP))
3697                         vWARN4(RExC_parse,
3698                                "False [] range \"%*.*s\"",
3699                                RExC_parse - rangebegin,
3700                                RExC_parse - rangebegin,
3701                                rangebegin);
3702                     if (prevvalue < 256) {
3703                         ANYOF_BITMAP_SET(ret, prevvalue);
3704                         ANYOF_BITMAP_SET(ret, '-');
3705                     }
3706                     else {
3707                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3708                         Perl_sv_catpvf(aTHX_ listsv,
3709                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3710                     }
3711                 }
3712
3713                 range = 0; /* this was not a true range */
3714             }
3715
3716             if (!SIZE_ONLY) {
3717                 if (namedclass > OOB_NAMEDCLASS)
3718                     optimize_invert = FALSE;
3719                 /* Possible truncation here but in some 64-bit environments
3720                  * the compiler gets heartburn about switch on 64-bit values.
3721                  * A similar issue a little earlier when switching on value.
3722                  * --jhi */
3723                 switch ((I32)namedclass) {
3724                 case ANYOF_ALNUM:
3725                     if (LOC)
3726                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3727                     else {
3728                         for (value = 0; value < 256; value++)
3729                             if (isALNUM(value))
3730                                 ANYOF_BITMAP_SET(ret, value);
3731                     }
3732                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
3733                     break;
3734                 case ANYOF_NALNUM:
3735                     if (LOC)
3736                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3737                     else {
3738                         for (value = 0; value < 256; value++)
3739                             if (!isALNUM(value))
3740                                 ANYOF_BITMAP_SET(ret, value);
3741                     }
3742                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3743                     break;
3744                 case ANYOF_ALNUMC:
3745                     if (LOC)
3746                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3747                     else {
3748                         for (value = 0; value < 256; value++)
3749                             if (isALNUMC(value))
3750                                 ANYOF_BITMAP_SET(ret, value);
3751                     }
3752                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3753                     break;
3754                 case ANYOF_NALNUMC:
3755                     if (LOC)
3756                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3757                     else {
3758                         for (value = 0; value < 256; value++)
3759                             if (!isALNUMC(value))
3760                                 ANYOF_BITMAP_SET(ret, value);
3761                     }
3762                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3763                     break;
3764                 case ANYOF_ALPHA:
3765                     if (LOC)
3766                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3767                     else {
3768                         for (value = 0; value < 256; value++)
3769                             if (isALPHA(value))
3770                                 ANYOF_BITMAP_SET(ret, value);
3771                     }
3772                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3773                     break;
3774                 case ANYOF_NALPHA:
3775                     if (LOC)
3776                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3777                     else {
3778                         for (value = 0; value < 256; value++)
3779                             if (!isALPHA(value))
3780                                 ANYOF_BITMAP_SET(ret, value);
3781                     }
3782                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3783                     break;
3784                 case ANYOF_ASCII:
3785                     if (LOC)
3786                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3787                     else {
3788 #ifndef EBCDIC
3789                         for (value = 0; value < 128; value++)
3790                             ANYOF_BITMAP_SET(ret, value);
3791 #else  /* EBCDIC */
3792                         for (value = 0; value < 256; value++) {
3793                             if (isASCII(value))
3794                                 ANYOF_BITMAP_SET(ret, value);
3795                         }
3796 #endif /* EBCDIC */
3797                     }
3798                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3799                     break;
3800                 case ANYOF_NASCII:
3801                     if (LOC)
3802                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3803                     else {
3804 #ifndef EBCDIC
3805                         for (value = 128; value < 256; value++)
3806                             ANYOF_BITMAP_SET(ret, value);
3807 #else  /* EBCDIC */
3808                         for (value = 0; value < 256; value++) {
3809                             if (!isASCII(value))
3810                                 ANYOF_BITMAP_SET(ret, value);
3811                         }
3812 #endif /* EBCDIC */
3813                     }
3814                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3815                     break;
3816                 case ANYOF_BLANK:
3817                     if (LOC)
3818                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3819                     else {
3820                         for (value = 0; value < 256; value++)
3821                             if (isBLANK(value))
3822                                 ANYOF_BITMAP_SET(ret, value);
3823                     }
3824                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3825                     break;
3826                 case ANYOF_NBLANK:
3827                     if (LOC)
3828                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3829                     else {
3830                         for (value = 0; value < 256; value++)
3831                             if (!isBLANK(value))
3832                                 ANYOF_BITMAP_SET(ret, value);
3833                     }
3834                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3835                     break;
3836                 case ANYOF_CNTRL:
3837                     if (LOC)
3838                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3839                     else {
3840                         for (value = 0; value < 256; value++)
3841                             if (isCNTRL(value))
3842                                 ANYOF_BITMAP_SET(ret, value);
3843                     }
3844                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3845                     break;
3846                 case ANYOF_NCNTRL:
3847                     if (LOC)
3848                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3849                     else {
3850                         for (value = 0; value < 256; value++)
3851                             if (!isCNTRL(value))
3852                                 ANYOF_BITMAP_SET(ret, value);
3853                     }
3854                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3855                     break;
3856                 case ANYOF_DIGIT:
3857                     if (LOC)
3858                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3859                     else {
3860                         /* consecutive digits assumed */
3861                         for (value = '0'; value <= '9'; value++)
3862                             ANYOF_BITMAP_SET(ret, value);
3863                     }
3864                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3865                     break;
3866                 case ANYOF_NDIGIT:
3867                     if (LOC)
3868                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3869                     else {
3870                         /* consecutive digits assumed */
3871                         for (value = 0; value < '0'; value++)
3872                             ANYOF_BITMAP_SET(ret, value);
3873                         for (value = '9' + 1; value < 256; value++)
3874                             ANYOF_BITMAP_SET(ret, value);
3875                     }
3876                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3877                     break;
3878                 case ANYOF_GRAPH:
3879                     if (LOC)
3880                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3881                     else {
3882                         for (value = 0; value < 256; value++)
3883                             if (isGRAPH(value))
3884                                 ANYOF_BITMAP_SET(ret, value);
3885                     }
3886                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3887                     break;
3888                 case ANYOF_NGRAPH:
3889                     if (LOC)
3890                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3891                     else {
3892                         for (value = 0; value < 256; value++)
3893                             if (!isGRAPH(value))
3894                                 ANYOF_BITMAP_SET(ret, value);
3895                     }
3896                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3897                     break;
3898                 case ANYOF_LOWER:
3899                     if (LOC)
3900                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3901                     else {
3902                         for (value = 0; value < 256; value++)
3903                             if (isLOWER(value))
3904                                 ANYOF_BITMAP_SET(ret, value);
3905                     }
3906                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3907                     break;
3908                 case ANYOF_NLOWER:
3909                     if (LOC)
3910                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3911                     else {
3912                         for (value = 0; value < 256; value++)
3913                             if (!isLOWER(value))
3914                                 ANYOF_BITMAP_SET(ret, value);
3915                     }
3916                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3917                     break;
3918                 case ANYOF_PRINT:
3919                     if (LOC)
3920                         ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3921                     else {
3922                         for (value = 0; value < 256; value++)
3923                             if (isPRINT(value))
3924                                 ANYOF_BITMAP_SET(ret, value);
3925                     }
3926                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3927                     break;
3928                 case ANYOF_NPRINT:
3929                     if (LOC)
3930                         ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3931                     else {
3932                         for (value = 0; value < 256; value++)
3933                             if (!isPRINT(value))
3934                                 ANYOF_BITMAP_SET(ret, value);
3935                     }
3936                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3937                     break;
3938                 case ANYOF_PSXSPC:
3939                     if (LOC)
3940                         ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3941                     else {
3942                         for (value = 0; value < 256; value++)
3943                             if (isPSXSPC(value))
3944                                 ANYOF_BITMAP_SET(ret, value);
3945                     }
3946                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3947                     break;
3948                 case ANYOF_NPSXSPC:
3949                     if (LOC)
3950                         ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3951                     else {
3952                         for (value = 0; value < 256; value++)
3953                             if (!isPSXSPC(value))
3954                                 ANYOF_BITMAP_SET(ret, value);
3955                     }
3956                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3957                     break;
3958                 case ANYOF_PUNCT:
3959                     if (LOC)
3960                         ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3961                     else {
3962                         for (value = 0; value < 256; value++)
3963                             if (isPUNCT(value))
3964                                 ANYOF_BITMAP_SET(ret, value);
3965                     }
3966                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3967                     break;
3968                 case ANYOF_NPUNCT:
3969                     if (LOC)
3970                         ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3971                     else {
3972                         for (value = 0; value < 256; value++)
3973                             if (!isPUNCT(value))
3974                                 ANYOF_BITMAP_SET(ret, value);
3975                     }
3976                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3977                     break;
3978                 case ANYOF_SPACE:
3979                     if (LOC)
3980                         ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3981                     else {
3982                         for (value = 0; value < 256; value++)
3983                             if (isSPACE(value))
3984                                 ANYOF_BITMAP_SET(ret, value);
3985                     }
3986                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3987                     break;
3988                 case ANYOF_NSPACE:
3989                     if (LOC)
3990                         ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3991                     else {
3992                         for (value = 0; value < 256; value++)
3993                             if (!isSPACE(value))
3994                                 ANYOF_BITMAP_SET(ret, value);
3995                     }
3996                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
3997                     break;
3998                 case ANYOF_UPPER:
3999                     if (LOC)
4000                         ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4001                     else {
4002                         for (value = 0; value < 256; value++)
4003                             if (isUPPER(value))
4004                                 ANYOF_BITMAP_SET(ret, value);
4005                     }
4006                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4007                     break;
4008                 case ANYOF_NUPPER:
4009                     if (LOC)
4010                         ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4011                     else {
4012                         for (value = 0; value < 256; value++)
4013                             if (!isUPPER(value))
4014                                 ANYOF_BITMAP_SET(ret, value);
4015                     }
4016                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4017                     break;
4018                 case ANYOF_XDIGIT:
4019                     if (LOC)
4020                         ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4021                     else {
4022                         for (value = 0; value < 256; value++)
4023                             if (isXDIGIT(value))
4024                                 ANYOF_BITMAP_SET(ret, value);
4025                     }
4026                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4027                     break;
4028                 case ANYOF_NXDIGIT:
4029                     if (LOC)
4030                         ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4031                     else {
4032                         for (value = 0; value < 256; value++)
4033                             if (!isXDIGIT(value))
4034                                 ANYOF_BITMAP_SET(ret, value);
4035                     }
4036                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4037                     break;
4038                 default:
4039                     vFAIL("Invalid [::] class");
4040                     break;
4041                 }
4042                 if (LOC)
4043                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4044                 continue;
4045             }
4046         } /* end of namedclass \blah */
4047
4048         if (range) {
4049             if (prevvalue > value) /* b-a */ {
4050                 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4051                               RExC_parse - rangebegin,
4052                               RExC_parse - rangebegin,
4053                               rangebegin);
4054                 range = 0; /* not a valid range */
4055             }
4056         }
4057         else {
4058             prevvalue = value; /* save the beginning of the range */
4059             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4060                 RExC_parse[1] != ']') {
4061                 RExC_parse++;
4062
4063                 /* a bad range like \w-, [:word:]- ? */
4064                 if (namedclass > OOB_NAMEDCLASS) {
4065                     if (ckWARN(WARN_REGEXP))
4066                         vWARN4(RExC_parse,
4067                                "False [] range \"%*.*s\"",
4068                                RExC_parse - rangebegin,
4069                                RExC_parse - rangebegin,
4070                                rangebegin);
4071                     if (!SIZE_ONLY)
4072                         ANYOF_BITMAP_SET(ret, '-');
4073                 } else
4074                     range = 1;  /* yeah, it's a range! */
4075                 continue;       /* but do it the next time */
4076             }
4077         }
4078
4079         /* now is the next time */
4080         if (!SIZE_ONLY) {
4081             IV i;
4082
4083             if (prevvalue < 256) {
4084                 IV ceilvalue = value < 256 ? value : 255;
4085
4086 #ifdef EBCDIC
4087                 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4088                     (isUPPER(prevvalue) && isUPPER(ceilvalue)))
4089                 {
4090                     if (isLOWER(prevvalue)) {
4091                         for (i = prevvalue; i <= ceilvalue; i++)
4092                             if (isLOWER(i))
4093                                 ANYOF_BITMAP_SET(ret, i);
4094                     } else {
4095                         for (i = prevvalue; i <= ceilvalue; i++)
4096                             if (isUPPER(i))
4097                                 ANYOF_BITMAP_SET(ret, i);
4098                     }
4099                 }
4100                 else
4101 #endif
4102                       for (i = prevvalue; i <= ceilvalue; i++)
4103                           ANYOF_BITMAP_SET(ret, i);
4104           }
4105           if (value > 255 || UTF) {
4106                 UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
4107                 UV natvalue      = NATIVE_TO_UNI(value);
4108
4109                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4110                 if (prevnatvalue < natvalue) { /* what about > ? */
4111                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4112                                    prevnatvalue, natvalue);
4113                 }
4114                 else if (prevnatvalue == natvalue) {
4115                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4116                     if (FOLD) {
4117                          U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4118                          STRLEN foldlen;
4119                          UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4120
4121                          /* If folding and foldable and a single
4122                           * character, insert also the folded version
4123                           * to the charclass. */
4124                          if (f != value) {
4125                               if (foldlen == UNISKIP(f))
4126                                   Perl_sv_catpvf(aTHX_ listsv,
4127                                                  "%04"UVxf"\n", f);
4128                               else {
4129                                   /* Any multicharacter foldings
4130                                    * require the following transform:
4131                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4132                                    * where E folds into "pq" and F folds
4133                                    * into "rst", all other characters
4134                                    * fold to single characters.  We save
4135                                    * away these multicharacter foldings,
4136                                    * to be later saved as part of the
4137                                    * additional "s" data. */
4138                                   SV *sv;
4139
4140                                   if (!unicode_alternate)
4141                                       unicode_alternate = newAV();
4142                                   sv = newSVpvn((char*)foldbuf, foldlen);
4143                                   SvUTF8_on(sv);
4144                                   av_push(unicode_alternate, sv);
4145                               }
4146                          }
4147
4148                          /* If folding and the value is one of the Greek
4149                           * sigmas insert a few more sigmas to make the
4150                           * folding rules of the sigmas to work right.
4151                           * Note that not all the possible combinations
4152                           * are handled here: some of them are handled
4153                           * by the standard folding rules, and some of
4154                           * them (literal or EXACTF cases) are handled
4155                           * during runtime in regexec.c:S_find_byclass(). */
4156                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4157                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4158                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4159                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4160                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4161                          }
4162                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4163                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4164                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4165                     }
4166                 }
4167             }
4168         }
4169
4170         range = 0; /* this range (if it was one) is done now */
4171     }
4172
4173     if (need_class) {
4174         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4175         if (SIZE_ONLY)
4176             RExC_size += ANYOF_CLASS_ADD_SKIP;
4177         else
4178             RExC_emit += ANYOF_CLASS_ADD_SKIP;
4179     }
4180
4181     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4182     if (!SIZE_ONLY &&
4183          /* If the only flag is folding (plus possibly inversion). */
4184         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4185        ) {
4186         for (value = 0; value < 256; ++value) {
4187             if (ANYOF_BITMAP_TEST(ret, value)) {
4188                 IV fold = PL_fold[value];
4189
4190                 if (fold != value)
4191                     ANYOF_BITMAP_SET(ret, fold);
4192             }
4193         }
4194         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4195     }
4196
4197     /* optimize inverted simple patterns (e.g. [^a-z]) */
4198     if (!SIZE_ONLY && optimize_invert &&
4199         /* If the only flag is inversion. */
4200         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4201         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4202             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4203         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4204     }
4205
4206     if (!SIZE_ONLY) {
4207         AV *av = newAV();
4208         SV *rv;
4209
4210         /* The 0th element stores the character class description
4211          * in its textual form: used later (regexec.c:Perl_regclass_swatch())
4212          * to initialize the appropriate swash (which gets stored in
4213          * the 1st element), and also useful for dumping the regnode.
4214          * The 2nd element stores the multicharacter foldings,
4215          * used later (regexec.c:s_reginclasslen()). */
4216         av_store(av, 0, listsv);
4217         av_store(av, 1, NULL);
4218         av_store(av, 2, (SV*)unicode_alternate);
4219         rv = newRV_noinc((SV*)av);
4220         n = add_data(pRExC_state, 1, "s");
4221         RExC_rx->data->data[n] = (void*)rv;
4222         ARG_SET(ret, n);
4223     }
4224
4225     return ret;
4226 }
4227
4228 STATIC char*
4229 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4230 {
4231     char* retval = RExC_parse++;
4232
4233     for (;;) {
4234         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4235                 RExC_parse[2] == '#') {
4236             while (*RExC_parse && *RExC_parse != ')')
4237                 RExC_parse++;
4238             RExC_parse++;
4239             continue;
4240         }
4241         if (RExC_flags16 & PMf_EXTENDED) {
4242             if (isSPACE(*RExC_parse)) {
4243                 RExC_parse++;
4244                 continue;
4245             }
4246             else if (*RExC_parse == '#') {
4247                 while (*RExC_parse && *RExC_parse != '\n')
4248                     RExC_parse++;
4249                 RExC_parse++;
4250                 continue;
4251             }
4252         }
4253         return retval;
4254     }
4255 }
4256
4257 /*
4258 - reg_node - emit a node
4259 */
4260 STATIC regnode *                        /* Location. */
4261 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4262 {
4263     register regnode *ret;
4264     register regnode *ptr;
4265
4266     ret = RExC_emit;
4267     if (SIZE_ONLY) {
4268         SIZE_ALIGN(RExC_size);
4269         RExC_size += 1;
4270         return(ret);
4271     }
4272
4273     NODE_ALIGN_FILL(ret);
4274     ptr = ret;
4275     FILL_ADVANCE_NODE(ptr, op);
4276     if (RExC_offsets) {         /* MJD */
4277       MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", 
4278               "reg_node", __LINE__, 
4279               reg_name[op],
4280               RExC_emit - RExC_emit_start > RExC_offsets[0] 
4281               ? "Overwriting end of array!\n" : "OK",
4282               RExC_emit - RExC_emit_start,
4283               RExC_parse - RExC_start,
4284               RExC_offsets[0])); 
4285       Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4286     }
4287             
4288     RExC_emit = ptr;
4289
4290     return(ret);
4291 }
4292
4293 /*
4294 - reganode - emit a node with an argument
4295 */
4296 STATIC regnode *                        /* Location. */
4297 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4298 {
4299     register regnode *ret;
4300     register regnode *ptr;
4301
4302     ret = RExC_emit;
4303     if (SIZE_ONLY) {
4304         SIZE_ALIGN(RExC_size);
4305         RExC_size += 2;
4306         return(ret);
4307     }
4308
4309     NODE_ALIGN_FILL(ret);
4310     ptr = ret;
4311     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4312     if (RExC_offsets) {         /* MJD */
4313       MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n", 
4314               "reganode",
4315               RExC_emit - RExC_emit_start > RExC_offsets[0] ? 
4316               "Overwriting end of array!\n" : "OK",
4317               RExC_emit - RExC_emit_start,
4318               RExC_parse - RExC_start,
4319               RExC_offsets[0])); 
4320       Set_Cur_Node_Offset;
4321     }
4322             
4323     RExC_emit = ptr;
4324
4325     return(ret);
4326 }
4327
4328 /*
4329 - reguni - emit (if appropriate) a Unicode character
4330 */
4331 STATIC void
4332 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4333 {
4334     *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4335 }
4336
4337 /*
4338 - reginsert - insert an operator in front of already-emitted operand
4339 *
4340 * Means relocating the operand.
4341 */
4342 STATIC void
4343 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4344 {
4345     register regnode *src;
4346     register regnode *dst;
4347     register regnode *place;
4348     register int offset = regarglen[(U8)op];
4349
4350 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4351
4352     if (SIZE_ONLY) {
4353         RExC_size += NODE_STEP_REGNODE + offset;
4354         return;
4355     }
4356
4357     src = RExC_emit;
4358     RExC_emit += NODE_STEP_REGNODE + offset;
4359     dst = RExC_emit;
4360     while (src > opnd) {
4361         StructCopy(--src, --dst, regnode);
4362         if (RExC_offsets) {     /* MJD 20010112 */
4363           MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n", 
4364                   "reg_insert",
4365                   dst - RExC_emit_start > RExC_offsets[0] 
4366                   ? "Overwriting end of array!\n" : "OK",
4367                   src - RExC_emit_start,
4368                   dst - RExC_emit_start,
4369                   RExC_offsets[0])); 
4370           Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4371           Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4372         }
4373     }
4374     
4375
4376     place = opnd;               /* Op node, where operand used to be. */
4377     if (RExC_offsets) {         /* MJD */
4378       MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n", 
4379               "reginsert",
4380               place - RExC_emit_start > RExC_offsets[0] 
4381               ? "Overwriting end of array!\n" : "OK",
4382               place - RExC_emit_start,
4383               RExC_parse - RExC_start,
4384               RExC_offsets[0])); 
4385       Set_Node_Offset(place, RExC_parse);
4386     }
4387     src = NEXTOPER(place);
4388     FILL_ADVANCE_NODE(place, op);
4389     Zero(src, offset, regnode);
4390 }
4391
4392 /*
4393 - regtail - set the next-pointer at the end of a node chain of p to val.
4394 */
4395 STATIC void
4396 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4397 {
4398     register regnode *scan;
4399     register regnode *temp;
4400
4401     if (SIZE_ONLY)
4402         return;
4403
4404     /* Find last node. */
4405     scan = p;
4406     for (;;) {
4407         temp = regnext(scan);
4408         if (temp == NULL)
4409             break;
4410         scan = temp;
4411     }
4412
4413     if (reg_off_by_arg[OP(scan)]) {
4414         ARG_SET(scan, val - scan);
4415     }
4416     else {
4417         NEXT_OFF(scan) = val - scan;
4418     }
4419 }
4420
4421 /*
4422 - regoptail - regtail on operand of first argument; nop if operandless
4423 */
4424 STATIC void
4425 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4426 {
4427     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4428     if (p == NULL || SIZE_ONLY)
4429         return;
4430     if (PL_regkind[(U8)OP(p)] == BRANCH) {
4431         regtail(pRExC_state, NEXTOPER(p), val);
4432     }
4433     else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4434         regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4435     }
4436     else
4437         return;
4438 }
4439
4440 /*
4441  - regcurly - a little FSA that accepts {\d+,?\d*}
4442  */
4443 STATIC I32
4444 S_regcurly(pTHX_ register char *s)
4445 {
4446     if (*s++ != '{')
4447         return FALSE;
4448     if (!isDIGIT(*s))
4449         return FALSE;
4450     while (isDIGIT(*s))
4451         s++;
4452     if (*s == ',')
4453         s++;
4454     while (isDIGIT(*s))
4455         s++;
4456     if (*s != '}')
4457         return FALSE;
4458     return TRUE;
4459 }
4460
4461
4462 #ifdef DEBUGGING
4463
4464 STATIC regnode *
4465 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4466 {
4467     register U8 op = EXACT;     /* Arbitrary non-END op. */
4468     register regnode *next;
4469
4470     while (op != END && (!last || node < last)) {
4471         /* While that wasn't END last time... */
4472
4473         NODE_ALIGN(node);
4474         op = OP(node);
4475         if (op == CLOSE)
4476             l--;        
4477         next = regnext(node);
4478         /* Where, what. */
4479         if (OP(node) == OPTIMIZED)
4480             goto after_print;
4481         regprop(sv, node);
4482         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4483                       (int)(2*l + 1), "", SvPVX(sv));
4484         if (next == NULL)               /* Next ptr. */
4485             PerlIO_printf(Perl_debug_log, "(0)");
4486         else
4487             PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4488         (void)PerlIO_putc(Perl_debug_log, '\n');
4489       after_print:
4490         if (PL_regkind[(U8)op] == BRANCHJ) {
4491             register regnode *nnode = (OP(next) == LONGJMP
4492                                        ? regnext(next)
4493                                        : next);
4494             if (last && nnode > last)
4495                 nnode = last;
4496             node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4497         }
4498         else if (PL_regkind[(U8)op] == BRANCH) {
4499             node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4500         }
4501         else if ( op == CURLY) {   /* `next' might be very big: optimizer */
4502             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4503                              NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4504         }
4505         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4506             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4507                              next, sv, l + 1);
4508         }
4509         else if ( op == PLUS || op == STAR) {
4510             node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4511         }
4512         else if (op == ANYOF) {
4513             /* arglen 1 + class block */
4514             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4515                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4516             node = NEXTOPER(node);
4517         }
4518         else if (PL_regkind[(U8)op] == EXACT) {
4519             /* Literal string, where present. */
4520             node += NODE_SZ_STR(node) - 1;
4521             node = NEXTOPER(node);
4522         }
4523         else {
4524             node = NEXTOPER(node);
4525             node += regarglen[(U8)op];
4526         }
4527         if (op == CURLYX || op == OPEN)
4528             l++;
4529         else if (op == WHILEM)
4530             l--;
4531     }
4532     return node;
4533 }
4534
4535 #endif  /* DEBUGGING */
4536
4537 /*
4538  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4539  */
4540 void
4541 Perl_regdump(pTHX_ regexp *r)
4542 {
4543 #ifdef DEBUGGING
4544     SV *sv = sv_newmortal();
4545
4546     (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4547
4548     /* Header fields of interest. */
4549     if (r->anchored_substr)
4550         PerlIO_printf(Perl_debug_log,
4551                       "anchored `%s%.*s%s'%s at %"IVdf" ",
4552                       PL_colors[0],
4553                       (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4554                       SvPVX(r->anchored_substr),
4555                       PL_colors[1],
4556                       SvTAIL(r->anchored_substr) ? "$" : "",
4557                       (IV)r->anchored_offset);
4558     else if (r->anchored_utf8)
4559         PerlIO_printf(Perl_debug_log,
4560                       "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4561                       PL_colors[0],
4562                       (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4563                       SvPVX(r->anchored_utf8),
4564                       PL_colors[1],
4565                       SvTAIL(r->anchored_utf8) ? "$" : "",
4566                       (IV)r->anchored_offset);
4567     if (r->float_substr)
4568         PerlIO_printf(Perl_debug_log,
4569                       "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4570                       PL_colors[0],
4571                       (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4572                       SvPVX(r->float_substr),
4573                       PL_colors[1],
4574                       SvTAIL(r->float_substr) ? "$" : "",
4575                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4576     else if (r->float_utf8)
4577         PerlIO_printf(Perl_debug_log,
4578                       "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4579                       PL_colors[0],
4580                       (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4581                       SvPVX(r->float_utf8),
4582                       PL_colors[1],
4583                       SvTAIL(r->float_utf8) ? "$" : "",
4584                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4585     if (r->check_substr || r->check_utf8)
4586         PerlIO_printf(Perl_debug_log,
4587                       r->check_substr == r->float_substr
4588                       && r->check_utf8 == r->float_utf8
4589                       ? "(checking floating" : "(checking anchored");
4590     if (r->reganch & ROPT_NOSCAN)
4591         PerlIO_printf(Perl_debug_log, " noscan");
4592     if (r->reganch & ROPT_CHECK_ALL)
4593         PerlIO_printf(Perl_debug_log, " isall");
4594     if (r->check_substr || r->check_utf8)
4595         PerlIO_printf(Perl_debug_log, ") ");
4596
4597     if (r->regstclass) {
4598         regprop(sv, r->regstclass);
4599         PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4600     }
4601     if (r->reganch & ROPT_ANCH) {
4602         PerlIO_printf(Perl_debug_log, "anchored");
4603         if (r->reganch & ROPT_ANCH_BOL)
4604             PerlIO_printf(Perl_debug_log, "(BOL)");
4605         if (r->reganch & ROPT_ANCH_MBOL)
4606             PerlIO_printf(Perl_debug_log, "(MBOL)");
4607         if (r->reganch & ROPT_ANCH_SBOL)
4608             PerlIO_printf(Perl_debug_log, "(SBOL)");
4609         if (r->reganch & ROPT_ANCH_GPOS)
4610             PerlIO_printf(Perl_debug_log, "(GPOS)");
4611         PerlIO_putc(Perl_debug_log, ' ');
4612     }
4613     if (r->reganch & ROPT_GPOS_SEEN)
4614         PerlIO_printf(Perl_debug_log, "GPOS ");
4615     if (r->reganch & ROPT_SKIP)
4616         PerlIO_printf(Perl_debug_log, "plus ");
4617     if (r->reganch & ROPT_IMPLICIT)
4618         PerlIO_printf(Perl_debug_log, "implicit ");
4619     PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4620     if (r->reganch & ROPT_EVAL_SEEN)
4621         PerlIO_printf(Perl_debug_log, "with eval ");
4622     PerlIO_printf(Perl_debug_log, "\n");
4623     if (r->offsets) {
4624       U32 i;
4625       U32 len = r->offsets[0];
4626       PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4627       for (i = 1; i <= len; i++)
4628         PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ", 
4629                       (UV)r->offsets[i*2-1], 
4630                       (UV)r->offsets[i*2]);
4631       PerlIO_printf(Perl_debug_log, "\n");
4632     }
4633 #endif  /* DEBUGGING */
4634 }
4635
4636 #ifdef DEBUGGING
4637
4638 STATIC void
4639 S_put_byte(pTHX_ SV *sv, int c)
4640 {
4641     if (isCNTRL(c) || c == 255 || !isPRINT(c))
4642         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4643     else if (c == '-' || c == ']' || c == '\\' || c == '^')
4644         Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4645     else
4646         Perl_sv_catpvf(aTHX_ sv, "%c", c);
4647 }
4648
4649 #endif  /* DEBUGGING */
4650
4651 /*
4652 - regprop - printable representation of opcode
4653 */
4654 void
4655 Perl_regprop(pTHX_ SV *sv, regnode *o)
4656 {
4657 #ifdef DEBUGGING
4658     register int k;
4659
4660     sv_setpvn(sv, "", 0);
4661     if (OP(o) >= reg_num)               /* regnode.type is unsigned */
4662         /* It would be nice to FAIL() here, but this may be called from
4663            regexec.c, and it would be hard to supply pRExC_state. */
4664         Perl_croak(aTHX_ "Corrupted regexp opcode");
4665     sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4666
4667     k = PL_regkind[(U8)OP(o)];
4668
4669     if (k == EXACT) {
4670         SV *dsv = sv_2mortal(newSVpvn("", 0));
4671         /* Using is_utf8_string() is a crude hack but it may
4672          * be the best for now since we have no flag "this EXACTish
4673          * node was UTF-8" --jhi */
4674         bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4675         char *s    = do_utf8 ?
4676           pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4677                          UNI_DISPLAY_REGEX) :
4678           STRING(o);
4679         int len = do_utf8 ?
4680           strlen(s) :
4681           STR_LEN(o);
4682         Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4683                        PL_colors[0],
4684                        len, s,
4685                        PL_colors[1]);
4686     }
4687     else if (k == CURLY) {
4688         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4689             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4690         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4691     }
4692     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
4693         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4694     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4695         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
4696     else if (k == LOGICAL)
4697         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
4698     else if (k == ANYOF) {
4699         int i, rangestart = -1;
4700         U8 flags = ANYOF_FLAGS(o);
4701         const char * const anyofs[] = { /* Should be syncronized with
4702                                          * ANYOF_ #xdefines in regcomp.h */
4703             "\\w",
4704             "\\W",
4705             "\\s",
4706             "\\S",
4707             "\\d",
4708             "\\D",
4709             "[:alnum:]",
4710             "[:^alnum:]",
4711             "[:alpha:]",
4712             "[:^alpha:]",
4713             "[:ascii:]",
4714             "[:^ascii:]",
4715             "[:ctrl:]",
4716             "[:^ctrl:]",
4717             "[:graph:]",
4718             "[:^graph:]",
4719             "[:lower:]",
4720             "[:^lower:]",
4721             "[:print:]",
4722             "[:^print:]",
4723             "[:punct:]",
4724             "[:^punct:]",
4725             "[:upper:]",
4726             "[:^upper:]",
4727             "[:xdigit:]",
4728             "[:^xdigit:]",
4729             "[:space:]",
4730             "[:^space:]",
4731             "[:blank:]",
4732             "[:^blank:]"
4733         };
4734
4735         if (flags & ANYOF_LOCALE)
4736             sv_catpv(sv, "{loc}");
4737         if (flags & ANYOF_FOLD)
4738             sv_catpv(sv, "{i}");
4739         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4740         if (flags & ANYOF_INVERT)
4741             sv_catpv(sv, "^");
4742         for (i = 0; i <= 256; i++) {
4743             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4744                 if (rangestart == -1)
4745                     rangestart = i;
4746             } else if (rangestart != -1) {
4747                 if (i <= rangestart + 3)
4748                     for (; rangestart < i; rangestart++)
4749                         put_byte(sv, rangestart);
4750                 else {
4751                     put_byte(sv, rangestart);
4752                     sv_catpv(sv, "-");
4753                     put_byte(sv, i - 1);
4754                 }
4755                 rangestart = -1;
4756             }
4757         }
4758
4759         if (o->flags & ANYOF_CLASS)
4760             for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4761                 if (ANYOF_CLASS_TEST(o,i))
4762                     sv_catpv(sv, anyofs[i]);
4763
4764         if (flags & ANYOF_UNICODE)
4765             sv_catpv(sv, "{unicode}");
4766         else if (flags & ANYOF_UNICODE_ALL)
4767             sv_catpv(sv, "{unicode_all}");
4768
4769         {
4770             SV *lv;
4771             SV *sw = regclass_swash(o, FALSE, &lv, 0);
4772         
4773             if (lv) {
4774                 if (sw) {
4775                     UV i;
4776                     U8 s[UTF8_MAXLEN+1];
4777                 
4778                     for (i = 0; i <= 256; i++) { /* just the first 256 */
4779                         U8 *e = uvchr_to_utf8(s, i);
4780                         
4781                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
4782                             if (rangestart == -1)
4783                                 rangestart = i;
4784                         } else if (rangestart != -1) {
4785                             U8 *p;
4786                         
4787                             if (i <= rangestart + 3)
4788                                 for (; rangestart < i; rangestart++) {
4789                                     for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4790                                         put_byte(sv, *p);
4791                                 }
4792                             else {
4793                                 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4794                                     put_byte(sv, *p);
4795                                 sv_catpv(sv, "-");
4796                                     for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4797                                         put_byte(sv, *p);
4798                                 }
4799                                 rangestart = -1;
4800                             }
4801                         }
4802                         
4803                     sv_catpv(sv, "..."); /* et cetera */
4804                 }
4805
4806                 {
4807                     char *s = savepv(SvPVX(lv));
4808                     char *origs = s;
4809                 
4810                     while(*s && *s != '\n') s++;
4811                 
4812                     if (*s == '\n') {
4813                         char *t = ++s;
4814                         
4815                         while (*s) {
4816                             if (*s == '\n')
4817                                 *s = ' ';
4818                             s++;
4819                         }
4820                         if (s[-1] == ' ')
4821                             s[-1] = 0;
4822                         
4823                         sv_catpv(sv, t);
4824                     }
4825                 
4826                     Safefree(origs);
4827                 }
4828             }
4829         }
4830
4831         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4832     }
4833     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4834         Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4835 #endif  /* DEBUGGING */
4836 }
4837
4838 SV *
4839 Perl_re_intuit_string(pTHX_ regexp *prog)
4840 {                               /* Assume that RE_INTUIT is set */
4841     DEBUG_r(
4842         {   STRLEN n_a;
4843             char *s = SvPV(prog->check_substr
4844                       ? prog->check_substr : prog->check_utf8, n_a);
4845
4846             if (!PL_colorset) reginitcolors();
4847             PerlIO_printf(Perl_debug_log,
4848                       "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4849                       PL_colors[4],
4850                       prog->check_substr ? "" : "utf8 ",
4851                       PL_colors[5],PL_colors[0],
4852                       s,
4853                       PL_colors[1],
4854                       (strlen(s) > 60 ? "..." : ""));
4855         } );
4856
4857     return prog->check_substr ? prog->check_substr : prog->check_utf8;
4858 }
4859
4860 void
4861 Perl_pregfree(pTHX_ struct regexp *r)
4862 {
4863 #ifdef DEBUGGING
4864     SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4865 #endif
4866
4867     if (!r || (--r->refcnt > 0))
4868         return;
4869     DEBUG_r({
4870          char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60,
4871                                   UNI_DISPLAY_REGEX);
4872          int len = SvCUR(dsv);
4873          if (!PL_colorset)
4874               reginitcolors();
4875          PerlIO_printf(Perl_debug_log,
4876                        "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4877                        PL_colors[4],PL_colors[5],PL_colors[0],
4878                        len, len, s,
4879                        PL_colors[1],
4880                        len > 60 ? "..." : "");
4881     });
4882
4883     if (r->precomp)
4884         Safefree(r->precomp);
4885     if (r->offsets)             /* 20010421 MJD */
4886         Safefree(r->offsets);
4887     if (RX_MATCH_COPIED(r))
4888         Safefree(r->subbeg);
4889     if (r->substrs) {
4890         if (r->anchored_substr)
4891             SvREFCNT_dec(r->anchored_substr);
4892         if (r->anchored_utf8)
4893             SvREFCNT_dec(r->anchored_utf8);
4894         if (r->float_substr)
4895             SvREFCNT_dec(r->float_substr);
4896         if (r->float_utf8)
4897             SvREFCNT_dec(r->float_utf8);
4898         Safefree(r->substrs);
4899     }
4900     if (r->data) {
4901         int n = r->data->count;
4902         AV* new_comppad = NULL;
4903         AV* old_comppad;
4904         SV** old_curpad;
4905
4906         while (--n >= 0) {
4907           /* If you add a ->what type here, update the comment in regcomp.h */
4908             switch (r->data->what[n]) {
4909             case 's':
4910                 SvREFCNT_dec((SV*)r->data->data[n]);
4911                 break;
4912             case 'f':
4913                 Safefree(r->data->data[n]);
4914                 break;
4915             case 'p':
4916                 new_comppad = (AV*)r->data->data[n];
4917                 break;
4918             case 'o':
4919                 if (new_comppad == NULL)
4920                     Perl_croak(aTHX_ "panic: pregfree comppad");
4921                 old_comppad = PL_comppad;
4922                 old_curpad = PL_curpad;
4923                 /* Watch out for global destruction's random ordering. */
4924                 if (SvTYPE(new_comppad) == SVt_PVAV) {
4925                     PL_comppad = new_comppad;
4926                     PL_curpad = AvARRAY(new_comppad);
4927                 }
4928                 else
4929                     PL_curpad = NULL;
4930
4931                 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4932                     op_free((OP_4tree*)r->data->data[n]);
4933                 }
4934
4935                 PL_comppad = old_comppad;
4936                 PL_curpad = old_curpad;
4937                 SvREFCNT_dec((SV*)new_comppad);
4938                 new_comppad = NULL;
4939                 break;
4940             case 'n':
4941                 break;
4942             default:
4943                 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4944             }
4945         }
4946         Safefree(r->data->what);
4947         Safefree(r->data);
4948     }
4949     Safefree(r->startp);
4950     Safefree(r->endp);
4951     Safefree(r);
4952 }
4953
4954 /*
4955  - regnext - dig the "next" pointer out of a node
4956  *
4957  * [Note, when REGALIGN is defined there are two places in regmatch()
4958  * that bypass this code for speed.]
4959  */
4960 regnode *
4961 Perl_regnext(pTHX_ register regnode *p)
4962 {
4963     register I32 offset;
4964
4965     if (p == &PL_regdummy)
4966         return(NULL);
4967
4968     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4969     if (offset == 0)
4970         return(NULL);
4971
4972     return(p+offset);
4973 }
4974
4975 STATIC void     
4976 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
4977 {
4978     va_list args;
4979     STRLEN l1 = strlen(pat1);
4980     STRLEN l2 = strlen(pat2);
4981     char buf[512];
4982     SV *msv;
4983     char *message;
4984
4985     if (l1 > 510)
4986         l1 = 510;
4987     if (l1 + l2 > 510)
4988         l2 = 510 - l1;
4989     Copy(pat1, buf, l1 , char);
4990     Copy(pat2, buf + l1, l2 , char);
4991     buf[l1 + l2] = '\n';
4992     buf[l1 + l2 + 1] = '\0';
4993 #ifdef I_STDARG
4994     /* ANSI variant takes additional second argument */
4995     va_start(args, pat2);
4996 #else
4997     va_start(args);
4998 #endif
4999     msv = vmess(buf, &args);
5000     va_end(args);
5001     message = SvPV(msv,l1);
5002     if (l1 > 512)
5003         l1 = 512;
5004     Copy(message, buf, l1 , char);
5005     buf[l1] = '\0';                     /* Overwrite \n */
5006     Perl_croak(aTHX_ "%s", buf);
5007 }
5008
5009 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
5010
5011 void
5012 Perl_save_re_context(pTHX)
5013 {
5014 #if 0
5015     SAVEPPTR(RExC_precomp);             /* uncompiled string. */
5016     SAVEI32(RExC_npar);         /* () count. */
5017     SAVEI32(RExC_size);         /* Code size. */
5018     SAVEI16(RExC_flags16);              /* are we folding, multilining? */
5019     SAVEVPTR(RExC_rx);          /* from regcomp.c */
5020     SAVEI32(RExC_seen);         /* from regcomp.c */
5021     SAVEI32(RExC_sawback);              /* Did we see \1, ...? */
5022     SAVEI32(RExC_naughty);              /* How bad is this pattern? */
5023     SAVEVPTR(RExC_emit);                /* Code-emit pointer; &regdummy = don't */
5024     SAVEPPTR(RExC_end);         /* End of input for compile */
5025     SAVEPPTR(RExC_parse);               /* Input-scan pointer. */
5026 #endif
5027
5028     SAVEI32(PL_reg_flags);              /* from regexec.c */
5029     SAVEPPTR(PL_bostr);
5030     SAVEPPTR(PL_reginput);              /* String-input pointer. */
5031     SAVEPPTR(PL_regbol);                /* Beginning of input, for ^ check. */
5032     SAVEPPTR(PL_regeol);                /* End of input, for $ check. */
5033     SAVEVPTR(PL_regstartp);             /* Pointer to startp array. */
5034     SAVEVPTR(PL_regendp);               /* Ditto for endp. */
5035     SAVEVPTR(PL_reglastparen);          /* Similarly for lastparen. */
5036     SAVEPPTR(PL_regtill);               /* How far we are required to go. */
5037     SAVEGENERICPV(PL_reg_start_tmp);            /* from regexec.c */
5038     PL_reg_start_tmp = 0;
5039     SAVEI32(PL_reg_start_tmpl);         /* from regexec.c */
5040     PL_reg_start_tmpl = 0;
5041     SAVEVPTR(PL_regdata);
5042     SAVEI32(PL_reg_eval_set);           /* from regexec.c */
5043     SAVEI32(PL_regnarrate);             /* from regexec.c */
5044     SAVEVPTR(PL_regprogram);            /* from regexec.c */
5045     SAVEINT(PL_regindent);              /* from regexec.c */
5046     SAVEVPTR(PL_regcc);                 /* from regexec.c */
5047     SAVEVPTR(PL_curcop);
5048     SAVEVPTR(PL_reg_call_cc);           /* from regexec.c */
5049     SAVEVPTR(PL_reg_re);                /* from regexec.c */
5050     SAVEPPTR(PL_reg_ganch);             /* from regexec.c */
5051     SAVESPTR(PL_reg_sv);                /* from regexec.c */
5052     SAVEI8(PL_reg_match_utf8);          /* from regexec.c */
5053     SAVEVPTR(PL_reg_magic);             /* from regexec.c */
5054     SAVEI32(PL_reg_oldpos);                     /* from regexec.c */
5055     SAVEVPTR(PL_reg_oldcurpm);          /* from regexec.c */
5056     SAVEVPTR(PL_reg_curpm);             /* from regexec.c */
5057     SAVEI32(PL_regnpar);                /* () count. */
5058     SAVEI32(PL_regsize);                /* from regexec.c */
5059 #ifdef DEBUGGING
5060     SAVEPPTR(PL_reg_starttry);          /* from regexec.c */
5061 #endif
5062 }
5063
5064 static void
5065 clear_re(pTHX_ void *r)
5066 {
5067     ReREFCNT_dec((regexp *)r);
5068 }
5069