[perl #30509] use encoding and "eq" cause memory leak
[p5sagit/p5-mst-13.2.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9  * confused with the original package (see point 3 below).  Thanks, Henry!
10  */
11
12 /* Additional note: this code is very heavily munged from Henry's version
13  * in places.  In some spots I've traded clarity for efficiency, so don't
14  * blame Henry for some of the lack of readability.
15  */
16
17 /* The names of the functions have been changed from regcomp and
18  * regexec to  pregcomp and pregexec in order to avoid conflicts
19  * with the POSIX routines of the same names.
20 */
21
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 #  ifndef PERL_IN_XSUB_RE
25 #    define PERL_IN_XSUB_RE
26 #  endif
27 /* need access to debugger hooks */
28 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29 #    define DEBUGGING
30 #  endif
31 #endif
32
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 #  define Perl_pregcomp my_regcomp
36 #  define Perl_regdump my_regdump
37 #  define Perl_regprop my_regprop
38 #  define Perl_pregfree my_regfree
39 #  define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 #  define Perl_regnext my_regnext
42 #  define Perl_save_re_context my_save_re_context
43 #  define Perl_reginitcolors my_reginitcolors
44
45 #  define PERL_NO_GET_CONTEXT
46 #endif
47
48 /*SUPPRESS 112*/
49 /*
50  * pregcomp and pregexec -- regsub and regerror are not used in perl
51  *
52  *      Copyright (c) 1986 by University of Toronto.
53  *      Written by Henry Spencer.  Not derived from licensed software.
54  *
55  *      Permission is granted to anyone to use this software for any
56  *      purpose on any computer system, and to redistribute it freely,
57  *      subject to the following restrictions:
58  *
59  *      1. The author is not responsible for the consequences of use of
60  *              this software, no matter how awful, even if they arise
61  *              from defects in it.
62  *
63  *      2. The origin of this software must not be misrepresented, either
64  *              by explicit claim or by omission.
65  *
66  *      3. Altered versions must be plainly marked as such, and must not
67  *              be misrepresented as being the original software.
68  *
69  *
70  ****    Alterations to Henry's code are...
71  ****
72  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73  ****    2000, 2001, 2002, 2003, by Larry Wall and others
74  ****
75  ****    You may distribute under the terms of either the GNU General Public
76  ****    License or the Artistic License, as specified in the README file.
77
78  *
79  * Beware that some of this code is subtly aware of the way operator
80  * precedence is structured in regular expressions.  Serious changes in
81  * regular-expression syntax might require a total rethink.
82  */
83 #include "EXTERN.h"
84 #define PERL_IN_REGCOMP_C
85 #include "perl.h"
86
87 #ifndef PERL_IN_XSUB_RE
88 #  include "INTERN.h"
89 #endif
90
91 #define REG_COMP_C
92 #include "regcomp.h"
93
94 #ifdef op
95 #undef op
96 #endif /* op */
97
98 #ifdef MSDOS
99 #  if defined(BUGGY_MSC6)
100  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
101 #    pragma optimize("a",off)
102  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
103 #    pragma optimize("w",on )
104 #  endif /* BUGGY_MSC6 */
105 #endif /* MSDOS */
106
107 #ifndef STATIC
108 #define STATIC  static
109 #endif
110
111 typedef struct RExC_state_t {
112     U32         flags;                  /* are we folding, multilining? */
113     char        *precomp;               /* uncompiled string. */
114     regexp      *rx;
115     char        *start;                 /* Start of input for compile */
116     char        *end;                   /* End of input for compile */
117     char        *parse;                 /* Input-scan pointer. */
118     I32         whilem_seen;            /* number of WHILEM in this expr */
119     regnode     *emit_start;            /* Start of emitted-code area */
120     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
121     I32         naughty;                /* How bad is this pattern? */
122     I32         sawback;                /* Did we see \1, ...? */
123     U32         seen;
124     I32         size;                   /* Code size. */
125     I32         npar;                   /* () count. */
126     I32         extralen;
127     I32         seen_zerolen;
128     I32         seen_evals;
129     I32         utf8;
130 #if ADD_TO_REGEXEC
131     char        *starttry;              /* -Dr: where regtry was called. */
132 #define RExC_starttry   (pRExC_state->starttry)
133 #endif
134 } RExC_state_t;
135
136 #define RExC_flags      (pRExC_state->flags)
137 #define RExC_precomp    (pRExC_state->precomp)
138 #define RExC_rx         (pRExC_state->rx)
139 #define RExC_start      (pRExC_state->start)
140 #define RExC_end        (pRExC_state->end)
141 #define RExC_parse      (pRExC_state->parse)
142 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
143 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
144 #define RExC_emit       (pRExC_state->emit)
145 #define RExC_emit_start (pRExC_state->emit_start)
146 #define RExC_naughty    (pRExC_state->naughty)
147 #define RExC_sawback    (pRExC_state->sawback)
148 #define RExC_seen       (pRExC_state->seen)
149 #define RExC_size       (pRExC_state->size)
150 #define RExC_npar       (pRExC_state->npar)
151 #define RExC_extralen   (pRExC_state->extralen)
152 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
153 #define RExC_seen_evals (pRExC_state->seen_evals)
154 #define RExC_utf8       (pRExC_state->utf8)
155
156 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
157 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
158         ((*s) == '{' && regcurly(s)))
159
160 #ifdef SPSTART
161 #undef SPSTART          /* dratted cpp namespace... */
162 #endif
163 /*
164  * Flags to be passed up and down.
165  */
166 #define WORST           0       /* Worst case. */
167 #define HASWIDTH        0x1     /* Known to match non-null strings. */
168 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
169 #define SPSTART         0x4     /* Starts with * or +. */
170 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
171
172 /* Length of a variant. */
173
174 typedef struct scan_data_t {
175     I32 len_min;
176     I32 len_delta;
177     I32 pos_min;
178     I32 pos_delta;
179     SV *last_found;
180     I32 last_end;                       /* min value, <0 unless valid. */
181     I32 last_start_min;
182     I32 last_start_max;
183     SV **longest;                       /* Either &l_fixed, or &l_float. */
184     SV *longest_fixed;
185     I32 offset_fixed;
186     SV *longest_float;
187     I32 offset_float_min;
188     I32 offset_float_max;
189     I32 flags;
190     I32 whilem_c;
191     I32 *last_closep;
192     struct regnode_charclass_class *start_class;
193 } scan_data_t;
194
195 /*
196  * Forward declarations for pregcomp()'s friends.
197  */
198
199 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
200                                       0, 0, 0, 0, 0, 0};
201
202 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
203 #define SF_BEFORE_SEOL          0x1
204 #define SF_BEFORE_MEOL          0x2
205 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
206 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
207
208 #ifdef NO_UNARY_PLUS
209 #  define SF_FIX_SHIFT_EOL      (0+2)
210 #  define SF_FL_SHIFT_EOL               (0+4)
211 #else
212 #  define SF_FIX_SHIFT_EOL      (+2)
213 #  define SF_FL_SHIFT_EOL               (+4)
214 #endif
215
216 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
217 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
218
219 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
220 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
221 #define SF_IS_INF               0x40
222 #define SF_HAS_PAR              0x80
223 #define SF_IN_PAR               0x100
224 #define SF_HAS_EVAL             0x200
225 #define SCF_DO_SUBSTR           0x400
226 #define SCF_DO_STCLASS_AND      0x0800
227 #define SCF_DO_STCLASS_OR       0x1000
228 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
229 #define SCF_WHILEM_VISITED_POS  0x2000
230
231 #define UTF (RExC_utf8 != 0)
232 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
233 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
234
235 #define OOB_UNICODE             12345678
236 #define OOB_NAMEDCLASS          -1
237
238 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
239 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
240
241
242 /* length of regex to show in messages that don't mark a position within */
243 #define RegexLengthToShowInErrorMessages 127
244
245 /*
246  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
247  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
248  * op/pragma/warn/regcomp.
249  */
250 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
251 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
252
253 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
254
255 /*
256  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
257  * arg. Show regex, up to a maximum length. If it's too long, chop and add
258  * "...".
259  */
260 #define FAIL(msg) STMT_START {                                          \
261     char *ellipses = "";                                                \
262     IV len = RExC_end - RExC_precomp;                                   \
263                                                                         \
264     if (!SIZE_ONLY)                                                     \
265         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
266     if (len > RegexLengthToShowInErrorMessages) {                       \
267         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
268         len = RegexLengthToShowInErrorMessages - 10;                    \
269         ellipses = "...";                                               \
270     }                                                                   \
271     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                           \
272             msg, (int)len, RExC_precomp, ellipses);                     \
273 } STMT_END
274
275 /*
276  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
277  * args. Show regex, up to a maximum length. If it's too long, chop and add
278  * "...".
279  */
280 #define FAIL2(pat,msg) STMT_START {                                     \
281     char *ellipses = "";                                                \
282     IV len = RExC_end - RExC_precomp;                                   \
283                                                                         \
284     if (!SIZE_ONLY)                                                     \
285         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
286     if (len > RegexLengthToShowInErrorMessages) {                       \
287         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
288         len = RegexLengthToShowInErrorMessages - 10;                    \
289         ellipses = "...";                                               \
290     }                                                                   \
291     S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                       \
292             msg, (int)len, RExC_precomp, ellipses);                     \
293 } STMT_END
294
295
296 /*
297  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
298  */
299 #define Simple_vFAIL(m) STMT_START {                                    \
300     IV offset = RExC_parse - RExC_precomp;                              \
301     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
302             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
303 } STMT_END
304
305 /*
306  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
307  */
308 #define vFAIL(m) STMT_START {                           \
309     if (!SIZE_ONLY)                                     \
310         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
311     Simple_vFAIL(m);                                    \
312 } STMT_END
313
314 /*
315  * Like Simple_vFAIL(), but accepts two arguments.
316  */
317 #define Simple_vFAIL2(m,a1) STMT_START {                        \
318     IV offset = RExC_parse - RExC_precomp;                      \
319     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
320             (int)offset, RExC_precomp, RExC_precomp + offset);  \
321 } STMT_END
322
323 /*
324  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
325  */
326 #define vFAIL2(m,a1) STMT_START {                       \
327     if (!SIZE_ONLY)                                     \
328         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
329     Simple_vFAIL2(m, a1);                               \
330 } STMT_END
331
332
333 /*
334  * Like Simple_vFAIL(), but accepts three arguments.
335  */
336 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
337     IV offset = RExC_parse - RExC_precomp;                      \
338     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
339             (int)offset, RExC_precomp, RExC_precomp + offset);  \
340 } STMT_END
341
342 /*
343  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
344  */
345 #define vFAIL3(m,a1,a2) STMT_START {                    \
346     if (!SIZE_ONLY)                                     \
347         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
348     Simple_vFAIL3(m, a1, a2);                           \
349 } STMT_END
350
351 /*
352  * Like Simple_vFAIL(), but accepts four arguments.
353  */
354 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
355     IV offset = RExC_parse - RExC_precomp;                      \
356     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
357             (int)offset, RExC_precomp, RExC_precomp + offset);  \
358 } STMT_END
359
360 /*
361  * Like Simple_vFAIL(), but accepts five arguments.
362  */
363 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START {           \
364     IV offset = RExC_parse - RExC_precomp;                      \
365     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,       \
366             (int)offset, RExC_precomp, RExC_precomp + offset);  \
367 } STMT_END
368
369
370 #define vWARN(loc,m) STMT_START {                                       \
371     IV offset = loc - RExC_precomp;                                     \
372     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
373             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
374 } STMT_END
375
376 #define vWARNdep(loc,m) STMT_START {                                    \
377     IV offset = loc - RExC_precomp;                                     \
378     Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),          \
379             "%s" REPORT_LOCATION,                                       \
380             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
381 } STMT_END
382
383
384 #define vWARN2(loc, m, a1) STMT_START {                                 \
385     IV offset = loc - RExC_precomp;                                     \
386     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
387             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
388 } STMT_END
389
390 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
391     IV offset = loc - RExC_precomp;                                     \
392     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
393             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
394 } STMT_END
395
396 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
397     IV offset = loc - RExC_precomp;                                     \
398     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
399             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
400 } STMT_END
401
402 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
403     IV offset = loc - RExC_precomp;                                     \
404     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
405             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
406 } STMT_END
407
408
409 /* Allow for side effects in s */
410 #define REGC(c,s) STMT_START {                  \
411     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
412 } STMT_END
413
414 /* Macros for recording node offsets.   20001227 mjd@plover.com 
415  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
416  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
417  * Element 0 holds the number n.
418  */
419
420 #define MJD_OFFSET_DEBUG(x)
421 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
422
423
424 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
425     if (! SIZE_ONLY) {                                                  \
426         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
427                 __LINE__, (node), (byte)));                             \
428         if((node) < 0) {                                                \
429             Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
430         } else {                                                        \
431             RExC_offsets[2*(node)-1] = (byte);                          \
432         }                                                               \
433     }                                                                   \
434 } STMT_END
435
436 #define Set_Node_Offset(node,byte) \
437     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
438 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
439
440 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
441     if (! SIZE_ONLY) {                                                  \
442         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
443                 __LINE__, (node), (len)));                              \
444         if((node) < 0) {                                                \
445             Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
446         } else {                                                        \
447             RExC_offsets[2*(node)] = (len);                             \
448         }                                                               \
449     }                                                                   \
450 } STMT_END
451
452 #define Set_Node_Length(node,len) \
453     Set_Node_Length_To_R((node)-RExC_emit_start, len)
454 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
455 #define Set_Node_Cur_Length(node) \
456     Set_Node_Length(node, RExC_parse - parse_start)
457
458 /* Get offsets and lengths */
459 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
460 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
461
462 static void clear_re(pTHX_ void *r);
463
464 /* Mark that we cannot extend a found fixed substring at this point.
465    Updata the longest found anchored substring and the longest found
466    floating substrings if needed. */
467
468 STATIC void
469 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
470 {
471     STRLEN l = CHR_SVLEN(data->last_found);
472     STRLEN old_l = CHR_SVLEN(*data->longest);
473
474     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
475         SvSetMagicSV(*data->longest, data->last_found);
476         if (*data->longest == data->longest_fixed) {
477             data->offset_fixed = l ? data->last_start_min : data->pos_min;
478             if (data->flags & SF_BEFORE_EOL)
479                 data->flags
480                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
481             else
482                 data->flags &= ~SF_FIX_BEFORE_EOL;
483         }
484         else {
485             data->offset_float_min = l ? data->last_start_min : data->pos_min;
486             data->offset_float_max = (l
487                                       ? data->last_start_max
488                                       : data->pos_min + data->pos_delta);
489             if ((U32)data->offset_float_max > (U32)I32_MAX)
490                 data->offset_float_max = I32_MAX;
491             if (data->flags & SF_BEFORE_EOL)
492                 data->flags
493                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
494             else
495                 data->flags &= ~SF_FL_BEFORE_EOL;
496         }
497     }
498     SvCUR_set(data->last_found, 0);
499     {
500         SV * sv = data->last_found;
501         MAGIC *mg =
502             SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
503         if (mg && mg->mg_len > 0)
504             mg->mg_len = 0;
505     }
506     data->last_end = -1;
507     data->flags &= ~SF_BEFORE_EOL;
508 }
509
510 /* Can match anything (initialization) */
511 STATIC void
512 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
513 {
514     ANYOF_CLASS_ZERO(cl);
515     ANYOF_BITMAP_SETALL(cl);
516     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
517     if (LOC)
518         cl->flags |= ANYOF_LOCALE;
519 }
520
521 /* Can match anything (initialization) */
522 STATIC int
523 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
524 {
525     int value;
526
527     for (value = 0; value <= ANYOF_MAX; value += 2)
528         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
529             return 1;
530     if (!(cl->flags & ANYOF_UNICODE_ALL))
531         return 0;
532     if (!ANYOF_BITMAP_TESTALLSET(cl))
533         return 0;
534     return 1;
535 }
536
537 /* Can match anything (initialization) */
538 STATIC void
539 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
540 {
541     Zero(cl, 1, struct regnode_charclass_class);
542     cl->type = ANYOF;
543     cl_anything(pRExC_state, cl);
544 }
545
546 STATIC void
547 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
548 {
549     Zero(cl, 1, struct regnode_charclass_class);
550     cl->type = ANYOF;
551     cl_anything(pRExC_state, cl);
552     if (LOC)
553         cl->flags |= ANYOF_LOCALE;
554 }
555
556 /* 'And' a given class with another one.  Can create false positives */
557 /* We assume that cl is not inverted */
558 STATIC void
559 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
560          struct regnode_charclass_class *and_with)
561 {
562     if (!(and_with->flags & ANYOF_CLASS)
563         && !(cl->flags & ANYOF_CLASS)
564         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
565         && !(and_with->flags & ANYOF_FOLD)
566         && !(cl->flags & ANYOF_FOLD)) {
567         int i;
568
569         if (and_with->flags & ANYOF_INVERT)
570             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
571                 cl->bitmap[i] &= ~and_with->bitmap[i];
572         else
573             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
574                 cl->bitmap[i] &= and_with->bitmap[i];
575     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
576     if (!(and_with->flags & ANYOF_EOS))
577         cl->flags &= ~ANYOF_EOS;
578
579     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
580         !(and_with->flags & ANYOF_INVERT)) {
581         cl->flags &= ~ANYOF_UNICODE_ALL;
582         cl->flags |= ANYOF_UNICODE;
583         ARG_SET(cl, ARG(and_with));
584     }
585     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
586         !(and_with->flags & ANYOF_INVERT))
587         cl->flags &= ~ANYOF_UNICODE_ALL;
588     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
589         !(and_with->flags & ANYOF_INVERT))
590         cl->flags &= ~ANYOF_UNICODE;
591 }
592
593 /* 'OR' a given class with another one.  Can create false positives */
594 /* We assume that cl is not inverted */
595 STATIC void
596 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
597 {
598     if (or_with->flags & ANYOF_INVERT) {
599         /* We do not use
600          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
601          *   <= (B1 | !B2) | (CL1 | !CL2)
602          * which is wasteful if CL2 is small, but we ignore CL2:
603          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
604          * XXXX Can we handle case-fold?  Unclear:
605          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
606          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
607          */
608         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
609              && !(or_with->flags & ANYOF_FOLD)
610              && !(cl->flags & ANYOF_FOLD) ) {
611             int i;
612
613             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
614                 cl->bitmap[i] |= ~or_with->bitmap[i];
615         } /* XXXX: logic is complicated otherwise */
616         else {
617             cl_anything(pRExC_state, cl);
618         }
619     } else {
620         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
621         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
622              && (!(or_with->flags & ANYOF_FOLD)
623                  || (cl->flags & ANYOF_FOLD)) ) {
624             int i;
625
626             /* OR char bitmap and class bitmap separately */
627             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
628                 cl->bitmap[i] |= or_with->bitmap[i];
629             if (or_with->flags & ANYOF_CLASS) {
630                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
631                     cl->classflags[i] |= or_with->classflags[i];
632                 cl->flags |= ANYOF_CLASS;
633             }
634         }
635         else { /* XXXX: logic is complicated, leave it along for a moment. */
636             cl_anything(pRExC_state, cl);
637         }
638     }
639     if (or_with->flags & ANYOF_EOS)
640         cl->flags |= ANYOF_EOS;
641
642     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
643         ARG(cl) != ARG(or_with)) {
644         cl->flags |= ANYOF_UNICODE_ALL;
645         cl->flags &= ~ANYOF_UNICODE;
646     }
647     if (or_with->flags & ANYOF_UNICODE_ALL) {
648         cl->flags |= ANYOF_UNICODE_ALL;
649         cl->flags &= ~ANYOF_UNICODE;
650     }
651 }
652
653 /*
654  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
655  * These need to be revisited when a newer toolchain becomes available.
656  */
657 #if defined(__sparc64__) && defined(__GNUC__)
658 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
659 #       undef  SPARC64_GCC_WORKAROUND
660 #       define SPARC64_GCC_WORKAROUND 1
661 #   endif
662 #endif
663
664 /* REx optimizer.  Converts nodes into quickier variants "in place".
665    Finds fixed substrings.  */
666
667 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
668    to the position after last scanned or to NULL. */
669
670 STATIC I32
671 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
672                         /* scanp: Start here (read-write). */
673                         /* deltap: Write maxlen-minlen here. */
674                         /* last: Stop before this one. */
675 {
676     I32 min = 0, pars = 0, code;
677     regnode *scan = *scanp, *next;
678     I32 delta = 0;
679     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
680     int is_inf_internal = 0;            /* The studied chunk is infinite */
681     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
682     scan_data_t data_fake;
683     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
684
685     while (scan && OP(scan) != END && scan < last) {
686         /* Peephole optimizer: */
687
688         if (PL_regkind[(U8)OP(scan)] == EXACT) {
689             /* Merge several consecutive EXACTish nodes into one. */
690             regnode *n = regnext(scan);
691             U32 stringok = 1;
692 #ifdef DEBUGGING
693             regnode *stop = scan;
694 #endif
695
696             next = scan + NODE_SZ_STR(scan);
697             /* Skip NOTHING, merge EXACT*. */
698             while (n &&
699                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
700                      (stringok && (OP(n) == OP(scan))))
701                    && NEXT_OFF(n)
702                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
703                 if (OP(n) == TAIL || n > next)
704                     stringok = 0;
705                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
706                     NEXT_OFF(scan) += NEXT_OFF(n);
707                     next = n + NODE_STEP_REGNODE;
708 #ifdef DEBUGGING
709                     if (stringok)
710                         stop = n;
711 #endif
712                     n = regnext(n);
713                 }
714                 else if (stringok) {
715                     int oldl = STR_LEN(scan);
716                     regnode *nnext = regnext(n);
717
718                     if (oldl + STR_LEN(n) > U8_MAX)
719                         break;
720                     NEXT_OFF(scan) += NEXT_OFF(n);
721                     STR_LEN(scan) += STR_LEN(n);
722                     next = n + NODE_SZ_STR(n);
723                     /* Now we can overwrite *n : */
724                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
725 #ifdef DEBUGGING
726                     stop = next - 1;
727 #endif
728                     n = nnext;
729                 }
730             }
731
732             if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
733 /*
734   Two problematic code points in Unicode casefolding of EXACT nodes:
735
736    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
737    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
738
739    which casefold to
740
741    Unicode                      UTF-8
742
743    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
744    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
745
746    This means that in case-insensitive matching (or "loose matching",
747    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
748    length of the above casefolded versions) can match a target string
749    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
750    This would rather mess up the minimum length computation.
751
752    What we'll do is to look for the tail four bytes, and then peek
753    at the preceding two bytes to see whether we need to decrease
754    the minimum length by four (six minus two).
755
756    Thanks to the design of UTF-8, there cannot be false matches:
757    A sequence of valid UTF-8 bytes cannot be a subsequence of
758    another valid sequence of UTF-8 bytes.
759
760 */
761                  char *s0 = STRING(scan), *s, *t;
762                  char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
763                  char *t0 = "\xcc\x88\xcc\x81";
764                  char *t1 = t0 + 3;
765                  
766                  for (s = s0 + 2;
767                       s < s2 && (t = ninstr(s, s1, t0, t1));
768                       s = t + 4) {
769                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
770                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
771                            min -= 4;
772                  }
773             }
774
775 #ifdef DEBUGGING
776             /* Allow dumping */
777             n = scan + NODE_SZ_STR(scan);
778             while (n <= stop) {
779                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
780                     OP(n) = OPTIMIZED;
781                     NEXT_OFF(n) = 0;
782                 }
783                 n++;
784             }
785 #endif
786         }
787         /* Follow the next-chain of the current node and optimize
788            away all the NOTHINGs from it.  */
789         if (OP(scan) != CURLYX) {
790             int max = (reg_off_by_arg[OP(scan)]
791                        ? I32_MAX
792                        /* I32 may be smaller than U16 on CRAYs! */
793                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
794             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
795             int noff;
796             regnode *n = scan;
797         
798             /* Skip NOTHING and LONGJMP. */
799             while ((n = regnext(n))
800                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
801                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
802                    && off + noff < max)
803                 off += noff;
804             if (reg_off_by_arg[OP(scan)])
805                 ARG(scan) = off;
806             else
807                 NEXT_OFF(scan) = off;
808         }
809         /* The principal pseudo-switch.  Cannot be a switch, since we
810            look into several different things.  */
811         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
812                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
813             next = regnext(scan);
814             code = OP(scan);
815         
816             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
817                 I32 max1 = 0, min1 = I32_MAX, num = 0;
818                 struct regnode_charclass_class accum;
819                 
820                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
821                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
822                 if (flags & SCF_DO_STCLASS)
823                     cl_init_zero(pRExC_state, &accum);
824                 while (OP(scan) == code) {
825                     I32 deltanext, minnext, f = 0, fake;
826                     struct regnode_charclass_class this_class;
827
828                     num++;
829                     data_fake.flags = 0;
830                     if (data) {         
831                         data_fake.whilem_c = data->whilem_c;
832                         data_fake.last_closep = data->last_closep;
833                     }
834                     else
835                         data_fake.last_closep = &fake;
836                     next = regnext(scan);
837                     scan = NEXTOPER(scan);
838                     if (code != BRANCH)
839                         scan = NEXTOPER(scan);
840                     if (flags & SCF_DO_STCLASS) {
841                         cl_init(pRExC_state, &this_class);
842                         data_fake.start_class = &this_class;
843                         f = SCF_DO_STCLASS_AND;
844                     }           
845                     if (flags & SCF_WHILEM_VISITED_POS)
846                         f |= SCF_WHILEM_VISITED_POS;
847                     /* we suppose the run is continuous, last=next...*/
848                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
849                                           next, &data_fake, f);
850                     if (min1 > minnext)
851                         min1 = minnext;
852                     if (max1 < minnext + deltanext)
853                         max1 = minnext + deltanext;
854                     if (deltanext == I32_MAX)
855                         is_inf = is_inf_internal = 1;
856                     scan = next;
857                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
858                         pars++;
859                     if (data && (data_fake.flags & SF_HAS_EVAL))
860                         data->flags |= SF_HAS_EVAL;
861                     if (data)
862                         data->whilem_c = data_fake.whilem_c;
863                     if (flags & SCF_DO_STCLASS)
864                         cl_or(pRExC_state, &accum, &this_class);
865                     if (code == SUSPEND)
866                         break;
867                 }
868                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
869                     min1 = 0;
870                 if (flags & SCF_DO_SUBSTR) {
871                     data->pos_min += min1;
872                     data->pos_delta += max1 - min1;
873                     if (max1 != min1 || is_inf)
874                         data->longest = &(data->longest_float);
875                 }
876                 min += min1;
877                 delta += max1 - min1;
878                 if (flags & SCF_DO_STCLASS_OR) {
879                     cl_or(pRExC_state, data->start_class, &accum);
880                     if (min1) {
881                         cl_and(data->start_class, &and_with);
882                         flags &= ~SCF_DO_STCLASS;
883                     }
884                 }
885                 else if (flags & SCF_DO_STCLASS_AND) {
886                     if (min1) {
887                         cl_and(data->start_class, &accum);
888                         flags &= ~SCF_DO_STCLASS;
889                     }
890                     else {
891                         /* Switch to OR mode: cache the old value of
892                          * data->start_class */
893                         StructCopy(data->start_class, &and_with,
894                                    struct regnode_charclass_class);
895                         flags &= ~SCF_DO_STCLASS_AND;
896                         StructCopy(&accum, data->start_class,
897                                    struct regnode_charclass_class);
898                         flags |= SCF_DO_STCLASS_OR;
899                         data->start_class->flags |= ANYOF_EOS;
900                     }
901                 }
902             }
903             else if (code == BRANCHJ)   /* single branch is optimized. */
904                 scan = NEXTOPER(NEXTOPER(scan));
905             else                        /* single branch is optimized. */
906                 scan = NEXTOPER(scan);
907             continue;
908         }
909         else if (OP(scan) == EXACT) {
910             I32 l = STR_LEN(scan);
911             UV uc = *((U8*)STRING(scan));
912             if (UTF) {
913                 U8 *s = (U8*)STRING(scan);
914                 l = utf8_length(s, s + l);
915                 uc = utf8_to_uvchr(s, NULL);
916             }
917             min += l;
918             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
919                 /* The code below prefers earlier match for fixed
920                    offset, later match for variable offset.  */
921                 if (data->last_end == -1) { /* Update the start info. */
922                     data->last_start_min = data->pos_min;
923                     data->last_start_max = is_inf
924                         ? I32_MAX : data->pos_min + data->pos_delta;
925                 }
926                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
927                 {
928                     SV * sv = data->last_found;
929                     MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
930                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
931                     if (mg && mg->mg_len >= 0)
932                         mg->mg_len += utf8_length((U8*)STRING(scan),
933                                                   (U8*)STRING(scan)+STR_LEN(scan));
934                 }
935                 if (UTF)
936                     SvUTF8_on(data->last_found);
937                 data->last_end = data->pos_min + l;
938                 data->pos_min += l; /* As in the first entry. */
939                 data->flags &= ~SF_BEFORE_EOL;
940             }
941             if (flags & SCF_DO_STCLASS_AND) {
942                 /* Check whether it is compatible with what we know already! */
943                 int compat = 1;
944
945                 if (uc >= 0x100 ||
946                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
947                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
948                     && (!(data->start_class->flags & ANYOF_FOLD)
949                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
950                     )
951                     compat = 0;
952                 ANYOF_CLASS_ZERO(data->start_class);
953                 ANYOF_BITMAP_ZERO(data->start_class);
954                 if (compat)
955                     ANYOF_BITMAP_SET(data->start_class, uc);
956                 data->start_class->flags &= ~ANYOF_EOS;
957                 if (uc < 0x100)
958                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
959             }
960             else if (flags & SCF_DO_STCLASS_OR) {
961                 /* false positive possible if the class is case-folded */
962                 if (uc < 0x100)
963                     ANYOF_BITMAP_SET(data->start_class, uc);
964                 else
965                     data->start_class->flags |= ANYOF_UNICODE_ALL;
966                 data->start_class->flags &= ~ANYOF_EOS;
967                 cl_and(data->start_class, &and_with);
968             }
969             flags &= ~SCF_DO_STCLASS;
970         }
971         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
972             I32 l = STR_LEN(scan);
973             UV uc = *((U8*)STRING(scan));
974
975             /* Search for fixed substrings supports EXACT only. */
976             if (flags & SCF_DO_SUBSTR)
977                 scan_commit(pRExC_state, data);
978             if (UTF) {
979                 U8 *s = (U8 *)STRING(scan);
980                 l = utf8_length(s, s + l);
981                 uc = utf8_to_uvchr(s, NULL);
982             }
983             min += l;
984             if (data && (flags & SCF_DO_SUBSTR))
985                 data->pos_min += l;
986             if (flags & SCF_DO_STCLASS_AND) {
987                 /* Check whether it is compatible with what we know already! */
988                 int compat = 1;
989
990                 if (uc >= 0x100 ||
991                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
992                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
993                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
994                     compat = 0;
995                 ANYOF_CLASS_ZERO(data->start_class);
996                 ANYOF_BITMAP_ZERO(data->start_class);
997                 if (compat) {
998                     ANYOF_BITMAP_SET(data->start_class, uc);
999                     data->start_class->flags &= ~ANYOF_EOS;
1000                     data->start_class->flags |= ANYOF_FOLD;
1001                     if (OP(scan) == EXACTFL)
1002                         data->start_class->flags |= ANYOF_LOCALE;
1003                 }
1004             }
1005             else if (flags & SCF_DO_STCLASS_OR) {
1006                 if (data->start_class->flags & ANYOF_FOLD) {
1007                     /* false positive possible if the class is case-folded.
1008                        Assume that the locale settings are the same... */
1009                     if (uc < 0x100)
1010                         ANYOF_BITMAP_SET(data->start_class, uc);
1011                     data->start_class->flags &= ~ANYOF_EOS;
1012                 }
1013                 cl_and(data->start_class, &and_with);
1014             }
1015             flags &= ~SCF_DO_STCLASS;
1016         }
1017         else if (strchr((char*)PL_varies,OP(scan))) {
1018             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1019             I32 f = flags, pos_before = 0;
1020             regnode *oscan = scan;
1021             struct regnode_charclass_class this_class;
1022             struct regnode_charclass_class *oclass = NULL;
1023             I32 next_is_eval = 0;
1024
1025             switch (PL_regkind[(U8)OP(scan)]) {
1026             case WHILEM:                /* End of (?:...)* . */
1027                 scan = NEXTOPER(scan);
1028                 goto finish;
1029             case PLUS:
1030                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1031                     next = NEXTOPER(scan);
1032                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1033                         mincount = 1;
1034                         maxcount = REG_INFTY;
1035                         next = regnext(scan);
1036                         scan = NEXTOPER(scan);
1037                         goto do_curly;
1038                     }
1039                 }
1040                 if (flags & SCF_DO_SUBSTR)
1041                     data->pos_min++;
1042                 min++;
1043                 /* Fall through. */
1044             case STAR:
1045                 if (flags & SCF_DO_STCLASS) {
1046                     mincount = 0;
1047                     maxcount = REG_INFTY;
1048                     next = regnext(scan);
1049                     scan = NEXTOPER(scan);
1050                     goto do_curly;
1051                 }
1052                 is_inf = is_inf_internal = 1;
1053                 scan = regnext(scan);
1054                 if (flags & SCF_DO_SUBSTR) {
1055                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1056                     data->longest = &(data->longest_float);
1057                 }
1058                 goto optimize_curly_tail;
1059             case CURLY:
1060                 mincount = ARG1(scan);
1061                 maxcount = ARG2(scan);
1062                 next = regnext(scan);
1063                 if (OP(scan) == CURLYX) {
1064                     I32 lp = (data ? *(data->last_closep) : 0);
1065
1066                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1067                 }
1068                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1069                 next_is_eval = (OP(scan) == EVAL);
1070               do_curly:
1071                 if (flags & SCF_DO_SUBSTR) {
1072                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1073                     pos_before = data->pos_min;
1074                 }
1075                 if (data) {
1076                     fl = data->flags;
1077                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1078                     if (is_inf)
1079                         data->flags |= SF_IS_INF;
1080                 }
1081                 if (flags & SCF_DO_STCLASS) {
1082                     cl_init(pRExC_state, &this_class);
1083                     oclass = data->start_class;
1084                     data->start_class = &this_class;
1085                     f |= SCF_DO_STCLASS_AND;
1086                     f &= ~SCF_DO_STCLASS_OR;
1087                 }
1088                 /* These are the cases when once a subexpression
1089                    fails at a particular position, it cannot succeed
1090                    even after backtracking at the enclosing scope.
1091                 
1092                    XXXX what if minimal match and we are at the
1093                         initial run of {n,m}? */
1094                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1095                     f &= ~SCF_WHILEM_VISITED_POS;
1096
1097                 /* This will finish on WHILEM, setting scan, or on NULL: */
1098                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1099                                       mincount == 0
1100                                         ? (f & ~SCF_DO_SUBSTR) : f);
1101
1102                 if (flags & SCF_DO_STCLASS)
1103                     data->start_class = oclass;
1104                 if (mincount == 0 || minnext == 0) {
1105                     if (flags & SCF_DO_STCLASS_OR) {
1106                         cl_or(pRExC_state, data->start_class, &this_class);
1107                     }
1108                     else if (flags & SCF_DO_STCLASS_AND) {
1109                         /* Switch to OR mode: cache the old value of
1110                          * data->start_class */
1111                         StructCopy(data->start_class, &and_with,
1112                                    struct regnode_charclass_class);
1113                         flags &= ~SCF_DO_STCLASS_AND;
1114                         StructCopy(&this_class, data->start_class,
1115                                    struct regnode_charclass_class);
1116                         flags |= SCF_DO_STCLASS_OR;
1117                         data->start_class->flags |= ANYOF_EOS;
1118                     }
1119                 } else {                /* Non-zero len */
1120                     if (flags & SCF_DO_STCLASS_OR) {
1121                         cl_or(pRExC_state, data->start_class, &this_class);
1122                         cl_and(data->start_class, &and_with);
1123                     }
1124                     else if (flags & SCF_DO_STCLASS_AND)
1125                         cl_and(data->start_class, &this_class);
1126                     flags &= ~SCF_DO_STCLASS;
1127                 }
1128                 if (!scan)              /* It was not CURLYX, but CURLY. */
1129                     scan = next;
1130                 if (ckWARN(WARN_REGEXP)
1131                        /* ? quantifier ok, except for (?{ ... }) */
1132                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
1133                     && (minnext == 0) && (deltanext == 0)
1134                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1135                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1136                 {
1137                     vWARN(RExC_parse,
1138                           "Quantifier unexpected on zero-length expression");
1139                 }
1140
1141                 min += minnext * mincount;
1142                 is_inf_internal |= ((maxcount == REG_INFTY
1143                                      && (minnext + deltanext) > 0)
1144                                     || deltanext == I32_MAX);
1145                 is_inf |= is_inf_internal;
1146                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1147
1148                 /* Try powerful optimization CURLYX => CURLYN. */
1149                 if (  OP(oscan) == CURLYX && data
1150                       && data->flags & SF_IN_PAR
1151                       && !(data->flags & SF_HAS_EVAL)
1152                       && !deltanext && minnext == 1 ) {
1153                     /* Try to optimize to CURLYN.  */
1154                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1155                     regnode *nxt1 = nxt;
1156 #ifdef DEBUGGING
1157                     regnode *nxt2;
1158 #endif
1159
1160                     /* Skip open. */
1161                     nxt = regnext(nxt);
1162                     if (!strchr((char*)PL_simple,OP(nxt))
1163                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1164                              && STR_LEN(nxt) == 1))
1165                         goto nogo;
1166 #ifdef DEBUGGING
1167                     nxt2 = nxt;
1168 #endif
1169                     nxt = regnext(nxt);
1170                     if (OP(nxt) != CLOSE)
1171                         goto nogo;
1172                     /* Now we know that nxt2 is the only contents: */
1173                     oscan->flags = (U8)ARG(nxt);
1174                     OP(oscan) = CURLYN;
1175                     OP(nxt1) = NOTHING; /* was OPEN. */
1176 #ifdef DEBUGGING
1177                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1178                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1179                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1180                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1181                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1182                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1183 #endif
1184                 }
1185               nogo:
1186
1187                 /* Try optimization CURLYX => CURLYM. */
1188                 if (  OP(oscan) == CURLYX && data
1189                       && !(data->flags & SF_HAS_PAR)
1190                       && !(data->flags & SF_HAS_EVAL)
1191                       && !deltanext     /* atom is fixed width */
1192                       && minnext != 0   /* CURLYM can't handle zero width */
1193                 ) {
1194                     /* XXXX How to optimize if data == 0? */
1195                     /* Optimize to a simpler form.  */
1196                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1197                     regnode *nxt2;
1198
1199                     OP(oscan) = CURLYM;
1200                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1201                             && (OP(nxt2) != WHILEM))
1202                         nxt = nxt2;
1203                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1204                     /* Need to optimize away parenths. */
1205                     if (data->flags & SF_IN_PAR) {
1206                         /* Set the parenth number.  */
1207                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1208
1209                         if (OP(nxt) != CLOSE)
1210                             FAIL("Panic opt close");
1211                         oscan->flags = (U8)ARG(nxt);
1212                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1213                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1214 #ifdef DEBUGGING
1215                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1216                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1217                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1218                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1219 #endif
1220 #if 0
1221                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1222                             regnode *nnxt = regnext(nxt1);
1223                         
1224                             if (nnxt == nxt) {
1225                                 if (reg_off_by_arg[OP(nxt1)])
1226                                     ARG_SET(nxt1, nxt2 - nxt1);
1227                                 else if (nxt2 - nxt1 < U16_MAX)
1228                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1229                                 else
1230                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1231                             }
1232                             nxt1 = nnxt;
1233                         }
1234 #endif
1235                         /* Optimize again: */
1236                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1237                                     NULL, 0);
1238                     }
1239                     else
1240                         oscan->flags = 0;
1241                 }
1242                 else if ((OP(oscan) == CURLYX)
1243                          && (flags & SCF_WHILEM_VISITED_POS)
1244                          /* See the comment on a similar expression above.
1245                             However, this time it not a subexpression
1246                             we care about, but the expression itself. */
1247                          && (maxcount == REG_INFTY)
1248                          && data && ++data->whilem_c < 16) {
1249                     /* This stays as CURLYX, we can put the count/of pair. */
1250                     /* Find WHILEM (as in regexec.c) */
1251                     regnode *nxt = oscan + NEXT_OFF(oscan);
1252
1253                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1254                         nxt += ARG(nxt);
1255                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
1256                         | (RExC_whilem_seen << 4)); /* On WHILEM */
1257                 }
1258                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1259                     pars++;
1260                 if (flags & SCF_DO_SUBSTR) {
1261                     SV *last_str = Nullsv;
1262                     int counted = mincount != 0;
1263
1264                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1265 #if defined(SPARC64_GCC_WORKAROUND)
1266                         I32 b = 0;
1267                         STRLEN l = 0;
1268                         char *s = NULL;
1269                         I32 old = 0;
1270
1271                         if (pos_before >= data->last_start_min)
1272                             b = pos_before;
1273                         else
1274                             b = data->last_start_min;
1275
1276                         l = 0;
1277                         s = SvPV(data->last_found, l);
1278                         old = b - data->last_start_min;
1279
1280 #else
1281                         I32 b = pos_before >= data->last_start_min
1282                             ? pos_before : data->last_start_min;
1283                         STRLEN l;
1284                         char *s = SvPV(data->last_found, l);
1285                         I32 old = b - data->last_start_min;
1286 #endif
1287
1288                         if (UTF)
1289                             old = utf8_hop((U8*)s, old) - (U8*)s;
1290                         
1291                         l -= old;
1292                         /* Get the added string: */
1293                         last_str = newSVpvn(s  + old, l);
1294                         if (UTF)
1295                             SvUTF8_on(last_str);
1296                         if (deltanext == 0 && pos_before == b) {
1297                             /* What was added is a constant string */
1298                             if (mincount > 1) {
1299                                 SvGROW(last_str, (mincount * l) + 1);
1300                                 repeatcpy(SvPVX(last_str) + l,
1301                                           SvPVX(last_str), l, mincount - 1);
1302                                 SvCUR(last_str) *= mincount;
1303                                 /* Add additional parts. */
1304                                 SvCUR_set(data->last_found,
1305                                           SvCUR(data->last_found) - l);
1306                                 sv_catsv(data->last_found, last_str);
1307                                 {
1308                                     SV * sv = data->last_found;
1309                                     MAGIC *mg =
1310                                         SvUTF8(sv) && SvMAGICAL(sv) ?
1311                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
1312                                     if (mg && mg->mg_len >= 0)
1313                                         mg->mg_len += CHR_SVLEN(last_str);
1314                                 }
1315                                 data->last_end += l * (mincount - 1);
1316                             }
1317                         } else {
1318                             /* start offset must point into the last copy */
1319                             data->last_start_min += minnext * (mincount - 1);
1320                             data->last_start_max += is_inf ? I32_MAX
1321                                 : (maxcount - 1) * (minnext + data->pos_delta);
1322                         }
1323                     }
1324                     /* It is counted once already... */
1325                     data->pos_min += minnext * (mincount - counted);
1326                     data->pos_delta += - counted * deltanext +
1327                         (minnext + deltanext) * maxcount - minnext * mincount;
1328                     if (mincount != maxcount) {
1329                          /* Cannot extend fixed substrings found inside
1330                             the group.  */
1331                         scan_commit(pRExC_state,data);
1332                         if (mincount && last_str) {
1333                             sv_setsv(data->last_found, last_str);
1334                             data->last_end = data->pos_min;
1335                             data->last_start_min =
1336                                 data->pos_min - CHR_SVLEN(last_str);
1337                             data->last_start_max = is_inf
1338                                 ? I32_MAX
1339                                 : data->pos_min + data->pos_delta
1340                                 - CHR_SVLEN(last_str);
1341                         }
1342                         data->longest = &(data->longest_float);
1343                     }
1344                     SvREFCNT_dec(last_str);
1345                 }
1346                 if (data && (fl & SF_HAS_EVAL))
1347                     data->flags |= SF_HAS_EVAL;
1348               optimize_curly_tail:
1349                 if (OP(oscan) != CURLYX) {
1350                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1351                            && NEXT_OFF(next))
1352                         NEXT_OFF(oscan) += NEXT_OFF(next);
1353                 }
1354                 continue;
1355             default:                    /* REF and CLUMP only? */
1356                 if (flags & SCF_DO_SUBSTR) {
1357                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1358                     data->longest = &(data->longest_float);
1359                 }
1360                 is_inf = is_inf_internal = 1;
1361                 if (flags & SCF_DO_STCLASS_OR)
1362                     cl_anything(pRExC_state, data->start_class);
1363                 flags &= ~SCF_DO_STCLASS;
1364                 break;
1365             }
1366         }
1367         else if (strchr((char*)PL_simple,OP(scan))) {
1368             int value = 0;
1369
1370             if (flags & SCF_DO_SUBSTR) {
1371                 scan_commit(pRExC_state,data);
1372                 data->pos_min++;
1373             }
1374             min++;
1375             if (flags & SCF_DO_STCLASS) {
1376                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1377
1378                 /* Some of the logic below assumes that switching
1379                    locale on will only add false positives. */
1380                 switch (PL_regkind[(U8)OP(scan)]) {
1381                 case SANY:
1382                 default:
1383                   do_default:
1384                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1385                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1386                         cl_anything(pRExC_state, data->start_class);
1387                     break;
1388                 case REG_ANY:
1389                     if (OP(scan) == SANY)
1390                         goto do_default;
1391                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1392                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1393                                  || (data->start_class->flags & ANYOF_CLASS));
1394                         cl_anything(pRExC_state, data->start_class);
1395                     }
1396                     if (flags & SCF_DO_STCLASS_AND || !value)
1397                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1398                     break;
1399                 case ANYOF:
1400                     if (flags & SCF_DO_STCLASS_AND)
1401                         cl_and(data->start_class,
1402                                (struct regnode_charclass_class*)scan);
1403                     else
1404                         cl_or(pRExC_state, data->start_class,
1405                               (struct regnode_charclass_class*)scan);
1406                     break;
1407                 case ALNUM:
1408                     if (flags & SCF_DO_STCLASS_AND) {
1409                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1410                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1411                             for (value = 0; value < 256; value++)
1412                                 if (!isALNUM(value))
1413                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1414                         }
1415                     }
1416                     else {
1417                         if (data->start_class->flags & ANYOF_LOCALE)
1418                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1419                         else {
1420                             for (value = 0; value < 256; value++)
1421                                 if (isALNUM(value))
1422                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1423                         }
1424                     }
1425                     break;
1426                 case ALNUML:
1427                     if (flags & SCF_DO_STCLASS_AND) {
1428                         if (data->start_class->flags & ANYOF_LOCALE)
1429                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1430                     }
1431                     else {
1432                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1433                         data->start_class->flags |= ANYOF_LOCALE;
1434                     }
1435                     break;
1436                 case NALNUM:
1437                     if (flags & SCF_DO_STCLASS_AND) {
1438                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1439                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1440                             for (value = 0; value < 256; value++)
1441                                 if (isALNUM(value))
1442                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1443                         }
1444                     }
1445                     else {
1446                         if (data->start_class->flags & ANYOF_LOCALE)
1447                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1448                         else {
1449                             for (value = 0; value < 256; value++)
1450                                 if (!isALNUM(value))
1451                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1452                         }
1453                     }
1454                     break;
1455                 case NALNUML:
1456                     if (flags & SCF_DO_STCLASS_AND) {
1457                         if (data->start_class->flags & ANYOF_LOCALE)
1458                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1459                     }
1460                     else {
1461                         data->start_class->flags |= ANYOF_LOCALE;
1462                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1463                     }
1464                     break;
1465                 case SPACE:
1466                     if (flags & SCF_DO_STCLASS_AND) {
1467                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1468                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1469                             for (value = 0; value < 256; value++)
1470                                 if (!isSPACE(value))
1471                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1472                         }
1473                     }
1474                     else {
1475                         if (data->start_class->flags & ANYOF_LOCALE)
1476                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1477                         else {
1478                             for (value = 0; value < 256; value++)
1479                                 if (isSPACE(value))
1480                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1481                         }
1482                     }
1483                     break;
1484                 case SPACEL:
1485                     if (flags & SCF_DO_STCLASS_AND) {
1486                         if (data->start_class->flags & ANYOF_LOCALE)
1487                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1488                     }
1489                     else {
1490                         data->start_class->flags |= ANYOF_LOCALE;
1491                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1492                     }
1493                     break;
1494                 case NSPACE:
1495                     if (flags & SCF_DO_STCLASS_AND) {
1496                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1497                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1498                             for (value = 0; value < 256; value++)
1499                                 if (isSPACE(value))
1500                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1501                         }
1502                     }
1503                     else {
1504                         if (data->start_class->flags & ANYOF_LOCALE)
1505                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1506                         else {
1507                             for (value = 0; value < 256; value++)
1508                                 if (!isSPACE(value))
1509                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1510                         }
1511                     }
1512                     break;
1513                 case NSPACEL:
1514                     if (flags & SCF_DO_STCLASS_AND) {
1515                         if (data->start_class->flags & ANYOF_LOCALE) {
1516                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1517                             for (value = 0; value < 256; value++)
1518                                 if (!isSPACE(value))
1519                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1520                         }
1521                     }
1522                     else {
1523                         data->start_class->flags |= ANYOF_LOCALE;
1524                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1525                     }
1526                     break;
1527                 case DIGIT:
1528                     if (flags & SCF_DO_STCLASS_AND) {
1529                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1530                         for (value = 0; value < 256; value++)
1531                             if (!isDIGIT(value))
1532                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1533                     }
1534                     else {
1535                         if (data->start_class->flags & ANYOF_LOCALE)
1536                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1537                         else {
1538                             for (value = 0; value < 256; value++)
1539                                 if (isDIGIT(value))
1540                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1541                         }
1542                     }
1543                     break;
1544                 case NDIGIT:
1545                     if (flags & SCF_DO_STCLASS_AND) {
1546                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1547                         for (value = 0; value < 256; value++)
1548                             if (isDIGIT(value))
1549                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1550                     }
1551                     else {
1552                         if (data->start_class->flags & ANYOF_LOCALE)
1553                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1554                         else {
1555                             for (value = 0; value < 256; value++)
1556                                 if (!isDIGIT(value))
1557                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1558                         }
1559                     }
1560                     break;
1561                 }
1562                 if (flags & SCF_DO_STCLASS_OR)
1563                     cl_and(data->start_class, &and_with);
1564                 flags &= ~SCF_DO_STCLASS;
1565             }
1566         }
1567         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1568             data->flags |= (OP(scan) == MEOL
1569                             ? SF_BEFORE_MEOL
1570                             : SF_BEFORE_SEOL);
1571         }
1572         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1573                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1574                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1575                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1576             /* Lookahead/lookbehind */
1577             I32 deltanext, minnext, fake = 0;
1578             regnode *nscan;
1579             struct regnode_charclass_class intrnl;
1580             int f = 0;
1581
1582             data_fake.flags = 0;
1583             if (data) {         
1584                 data_fake.whilem_c = data->whilem_c;
1585                 data_fake.last_closep = data->last_closep;
1586             }
1587             else
1588                 data_fake.last_closep = &fake;
1589             if ( flags & SCF_DO_STCLASS && !scan->flags
1590                  && OP(scan) == IFMATCH ) { /* Lookahead */
1591                 cl_init(pRExC_state, &intrnl);
1592                 data_fake.start_class = &intrnl;
1593                 f |= SCF_DO_STCLASS_AND;
1594             }
1595             if (flags & SCF_WHILEM_VISITED_POS)
1596                 f |= SCF_WHILEM_VISITED_POS;
1597             next = regnext(scan);
1598             nscan = NEXTOPER(NEXTOPER(scan));
1599             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1600             if (scan->flags) {
1601                 if (deltanext) {
1602                     vFAIL("Variable length lookbehind not implemented");
1603                 }
1604                 else if (minnext > U8_MAX) {
1605                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1606                 }
1607                 scan->flags = (U8)minnext;
1608             }
1609             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1610                 pars++;
1611             if (data && (data_fake.flags & SF_HAS_EVAL))
1612                 data->flags |= SF_HAS_EVAL;
1613             if (data)
1614                 data->whilem_c = data_fake.whilem_c;
1615             if (f & SCF_DO_STCLASS_AND) {
1616                 int was = (data->start_class->flags & ANYOF_EOS);
1617
1618                 cl_and(data->start_class, &intrnl);
1619                 if (was)
1620                     data->start_class->flags |= ANYOF_EOS;
1621             }
1622         }
1623         else if (OP(scan) == OPEN) {
1624             pars++;
1625         }
1626         else if (OP(scan) == CLOSE) {
1627             if ((I32)ARG(scan) == is_par) {
1628                 next = regnext(scan);
1629
1630                 if ( next && (OP(next) != WHILEM) && next < last)
1631                     is_par = 0;         /* Disable optimization */
1632             }
1633             if (data)
1634                 *(data->last_closep) = ARG(scan);
1635         }
1636         else if (OP(scan) == EVAL) {
1637                 if (data)
1638                     data->flags |= SF_HAS_EVAL;
1639         }
1640         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1641                 if (flags & SCF_DO_SUBSTR) {
1642                     scan_commit(pRExC_state,data);
1643                     data->longest = &(data->longest_float);
1644                 }
1645                 is_inf = is_inf_internal = 1;
1646                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1647                     cl_anything(pRExC_state, data->start_class);
1648                 flags &= ~SCF_DO_STCLASS;
1649         }
1650         /* Else: zero-length, ignore. */
1651         scan = regnext(scan);
1652     }
1653
1654   finish:
1655     *scanp = scan;
1656     *deltap = is_inf_internal ? I32_MAX : delta;
1657     if (flags & SCF_DO_SUBSTR && is_inf)
1658         data->pos_delta = I32_MAX - data->pos_min;
1659     if (is_par > U8_MAX)
1660         is_par = 0;
1661     if (is_par && pars==1 && data) {
1662         data->flags |= SF_IN_PAR;
1663         data->flags &= ~SF_HAS_PAR;
1664     }
1665     else if (pars && data) {
1666         data->flags |= SF_HAS_PAR;
1667         data->flags &= ~SF_IN_PAR;
1668     }
1669     if (flags & SCF_DO_STCLASS_OR)
1670         cl_and(data->start_class, &and_with);
1671     return min;
1672 }
1673
1674 STATIC I32
1675 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1676 {
1677     if (RExC_rx->data) {
1678         Renewc(RExC_rx->data,
1679                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1680                char, struct reg_data);
1681         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1682         RExC_rx->data->count += n;
1683     }
1684     else {
1685         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1686              char, struct reg_data);
1687         New(1208, RExC_rx->data->what, n, U8);
1688         RExC_rx->data->count = n;
1689     }
1690     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1691     return RExC_rx->data->count - n;
1692 }
1693
1694 void
1695 Perl_reginitcolors(pTHX)
1696 {
1697     int i = 0;
1698     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1699         
1700     if (s) {
1701         PL_colors[0] = s = savepv(s);
1702         while (++i < 6) {
1703             s = strchr(s, '\t');
1704             if (s) {
1705                 *s = '\0';
1706                 PL_colors[i] = ++s;
1707             }
1708             else
1709                 PL_colors[i] = s = "";
1710         }
1711     } else {
1712         while (i < 6)
1713             PL_colors[i++] = "";
1714     }
1715     PL_colorset = 1;
1716 }
1717
1718
1719 /*
1720  - pregcomp - compile a regular expression into internal code
1721  *
1722  * We can't allocate space until we know how big the compiled form will be,
1723  * but we can't compile it (and thus know how big it is) until we've got a
1724  * place to put the code.  So we cheat:  we compile it twice, once with code
1725  * generation turned off and size counting turned on, and once "for real".
1726  * This also means that we don't allocate space until we are sure that the
1727  * thing really will compile successfully, and we never have to move the
1728  * code and thus invalidate pointers into it.  (Note that it has to be in
1729  * one piece because free() must be able to free it all.) [NB: not true in perl]
1730  *
1731  * Beware that the optimization-preparation code in here knows about some
1732  * of the structure of the compiled regexp.  [I'll say.]
1733  */
1734 regexp *
1735 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1736 {
1737     register regexp *r;
1738     regnode *scan;
1739     regnode *first;
1740     I32 flags;
1741     I32 minlen = 0;
1742     I32 sawplus = 0;
1743     I32 sawopen = 0;
1744     scan_data_t data;
1745     RExC_state_t RExC_state;
1746     RExC_state_t *pRExC_state = &RExC_state;
1747
1748     if (exp == NULL)
1749         FAIL("NULL regexp argument");
1750
1751     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1752
1753     RExC_precomp = exp;
1754     DEBUG_r({
1755          if (!PL_colorset) reginitcolors();
1756          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1757                        PL_colors[4],PL_colors[5],PL_colors[0],
1758                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1759     });
1760     RExC_flags = pm->op_pmflags;
1761     RExC_sawback = 0;
1762
1763     RExC_seen = 0;
1764     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1765     RExC_seen_evals = 0;
1766     RExC_extralen = 0;
1767
1768     /* First pass: determine size, legality. */
1769     RExC_parse = exp;
1770     RExC_start = exp;
1771     RExC_end = xend;
1772     RExC_naughty = 0;
1773     RExC_npar = 1;
1774     RExC_size = 0L;
1775     RExC_emit = &PL_regdummy;
1776     RExC_whilem_seen = 0;
1777 #if 0 /* REGC() is (currently) a NOP at the first pass.
1778        * Clever compilers notice this and complain. --jhi */
1779     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1780 #endif
1781     if (reg(pRExC_state, 0, &flags) == NULL) {
1782         RExC_precomp = Nullch;
1783         return(NULL);
1784     }
1785     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1786
1787     /* Small enough for pointer-storage convention?
1788        If extralen==0, this means that we will not need long jumps. */
1789     if (RExC_size >= 0x10000L && RExC_extralen)
1790         RExC_size += RExC_extralen;
1791     else
1792         RExC_extralen = 0;
1793     if (RExC_whilem_seen > 15)
1794         RExC_whilem_seen = 15;
1795
1796     /* Allocate space and initialize. */
1797     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1798          char, regexp);
1799     if (r == NULL)
1800         FAIL("Regexp out of space");
1801
1802 #ifdef DEBUGGING
1803     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1804     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1805 #endif
1806     r->refcnt = 1;
1807     r->prelen = xend - exp;
1808     r->precomp = savepvn(RExC_precomp, r->prelen);
1809     r->subbeg = NULL;
1810 #ifdef PERL_COPY_ON_WRITE
1811     r->saved_copy = Nullsv;
1812 #endif
1813     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1814     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1815
1816     r->substrs = 0;                     /* Useful during FAIL. */
1817     r->startp = 0;                      /* Useful during FAIL. */
1818     r->endp = 0;                        /* Useful during FAIL. */
1819
1820     Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1821     if (r->offsets) {
1822       r->offsets[0] = RExC_size; 
1823     }
1824     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1825                           "%s %"UVuf" bytes for offset annotations.\n", 
1826                           r->offsets ? "Got" : "Couldn't get", 
1827                           (UV)((2*RExC_size+1) * sizeof(U32))));
1828
1829     RExC_rx = r;
1830
1831     /* Second pass: emit code. */
1832     RExC_flags = pm->op_pmflags;        /* don't let top level (?i) bleed */
1833     RExC_parse = exp;
1834     RExC_end = xend;
1835     RExC_naughty = 0;
1836     RExC_npar = 1;
1837     RExC_emit_start = r->program;
1838     RExC_emit = r->program;
1839     /* Store the count of eval-groups for security checks: */
1840     RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1841     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1842     r->data = 0;
1843     if (reg(pRExC_state, 0, &flags) == NULL)
1844         return(NULL);
1845
1846     /* Dig out information for optimizations. */
1847     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1848     pm->op_pmflags = RExC_flags;
1849     if (UTF)
1850         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1851     r->regstclass = NULL;
1852     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1853         r->reganch |= ROPT_NAUGHTY;
1854     scan = r->program + 1;              /* First BRANCH. */
1855
1856     /* XXXX To minimize changes to RE engine we always allocate
1857        3-units-long substrs field. */
1858     Newz(1004, r->substrs, 1, struct reg_substr_data);
1859
1860     StructCopy(&zero_scan_data, &data, scan_data_t);
1861     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1862     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1863         I32 fake;
1864         STRLEN longest_float_length, longest_fixed_length;
1865         struct regnode_charclass_class ch_class;
1866         int stclass_flag;
1867         I32 last_close = 0;
1868
1869         first = scan;
1870         /* Skip introductions and multiplicators >= 1. */
1871         while ((OP(first) == OPEN && (sawopen = 1)) ||
1872                /* An OR of *one* alternative - should not happen now. */
1873             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1874             (OP(first) == PLUS) ||
1875             (OP(first) == MINMOD) ||
1876                /* An {n,m} with n>0 */
1877             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1878                 if (OP(first) == PLUS)
1879                     sawplus = 1;
1880                 else
1881                     first += regarglen[(U8)OP(first)];
1882                 first = NEXTOPER(first);
1883         }
1884
1885         /* Starting-point info. */
1886       again:
1887         if (PL_regkind[(U8)OP(first)] == EXACT) {
1888             if (OP(first) == EXACT)
1889                 ;       /* Empty, get anchored substr later. */
1890             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1891                 r->regstclass = first;
1892         }
1893         else if (strchr((char*)PL_simple,OP(first)))
1894             r->regstclass = first;
1895         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1896                  PL_regkind[(U8)OP(first)] == NBOUND)
1897             r->regstclass = first;
1898         else if (PL_regkind[(U8)OP(first)] == BOL) {
1899             r->reganch |= (OP(first) == MBOL
1900                            ? ROPT_ANCH_MBOL
1901                            : (OP(first) == SBOL
1902                               ? ROPT_ANCH_SBOL
1903                               : ROPT_ANCH_BOL));
1904             first = NEXTOPER(first);
1905             goto again;
1906         }
1907         else if (OP(first) == GPOS) {
1908             r->reganch |= ROPT_ANCH_GPOS;
1909             first = NEXTOPER(first);
1910             goto again;
1911         }
1912         else if (!sawopen && (OP(first) == STAR &&
1913             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1914             !(r->reganch & ROPT_ANCH) )
1915         {
1916             /* turn .* into ^.* with an implied $*=1 */
1917             int type = OP(NEXTOPER(first));
1918
1919             if (type == REG_ANY)
1920                 type = ROPT_ANCH_MBOL;
1921             else
1922                 type = ROPT_ANCH_SBOL;
1923
1924             r->reganch |= type | ROPT_IMPLICIT;
1925             first = NEXTOPER(first);
1926             goto again;
1927         }
1928         if (sawplus && (!sawopen || !RExC_sawback)
1929             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1930             /* x+ must match at the 1st pos of run of x's */
1931             r->reganch |= ROPT_SKIP;
1932
1933         /* Scan is after the zeroth branch, first is atomic matcher. */
1934         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1935                               (IV)(first - scan + 1)));
1936         /*
1937         * If there's something expensive in the r.e., find the
1938         * longest literal string that must appear and make it the
1939         * regmust.  Resolve ties in favor of later strings, since
1940         * the regstart check works with the beginning of the r.e.
1941         * and avoiding duplication strengthens checking.  Not a
1942         * strong reason, but sufficient in the absence of others.
1943         * [Now we resolve ties in favor of the earlier string if
1944         * it happens that c_offset_min has been invalidated, since the
1945         * earlier string may buy us something the later one won't.]
1946         */
1947         minlen = 0;
1948
1949         data.longest_fixed = newSVpvn("",0);
1950         data.longest_float = newSVpvn("",0);
1951         data.last_found = newSVpvn("",0);
1952         data.longest = &(data.longest_fixed);
1953         first = scan;
1954         if (!r->regstclass) {
1955             cl_init(pRExC_state, &ch_class);
1956             data.start_class = &ch_class;
1957             stclass_flag = SCF_DO_STCLASS_AND;
1958         } else                          /* XXXX Check for BOUND? */
1959             stclass_flag = 0;
1960         data.last_closep = &last_close;
1961
1962         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1963                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1964         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1965              && data.last_start_min == 0 && data.last_end > 0
1966              && !RExC_seen_zerolen
1967              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1968             r->reganch |= ROPT_CHECK_ALL;
1969         scan_commit(pRExC_state, &data);
1970         SvREFCNT_dec(data.last_found);
1971
1972         longest_float_length = CHR_SVLEN(data.longest_float);
1973         if (longest_float_length
1974             || (data.flags & SF_FL_BEFORE_EOL
1975                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1976                     || (RExC_flags & PMf_MULTILINE)))) {
1977             int t;
1978
1979             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1980                 && data.offset_fixed == data.offset_float_min
1981                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1982                     goto remove_float;          /* As in (a)+. */
1983
1984             if (SvUTF8(data.longest_float)) {
1985                 r->float_utf8 = data.longest_float;
1986                 r->float_substr = Nullsv;
1987             } else {
1988                 r->float_substr = data.longest_float;
1989                 r->float_utf8 = Nullsv;
1990             }
1991             r->float_min_offset = data.offset_float_min;
1992             r->float_max_offset = data.offset_float_max;
1993             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1994                        && (!(data.flags & SF_FL_BEFORE_MEOL)
1995                            || (RExC_flags & PMf_MULTILINE)));
1996             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1997         }
1998         else {
1999           remove_float:
2000             r->float_substr = r->float_utf8 = Nullsv;
2001             SvREFCNT_dec(data.longest_float);
2002             longest_float_length = 0;
2003         }
2004
2005         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2006         if (longest_fixed_length
2007             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2008                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2009                     || (RExC_flags & PMf_MULTILINE)))) {
2010             int t;
2011
2012             if (SvUTF8(data.longest_fixed)) {
2013                 r->anchored_utf8 = data.longest_fixed;
2014                 r->anchored_substr = Nullsv;
2015             } else {
2016                 r->anchored_substr = data.longest_fixed;
2017                 r->anchored_utf8 = Nullsv;
2018             }
2019             r->anchored_offset = data.offset_fixed;
2020             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2021                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
2022                      || (RExC_flags & PMf_MULTILINE)));
2023             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2024         }
2025         else {
2026             r->anchored_substr = r->anchored_utf8 = Nullsv;
2027             SvREFCNT_dec(data.longest_fixed);
2028             longest_fixed_length = 0;
2029         }
2030         if (r->regstclass
2031             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2032             r->regstclass = NULL;
2033         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2034             && stclass_flag
2035             && !(data.start_class->flags & ANYOF_EOS)
2036             && !cl_is_anything(data.start_class))
2037         {
2038             I32 n = add_data(pRExC_state, 1, "f");
2039
2040             New(1006, RExC_rx->data->data[n], 1,
2041                 struct regnode_charclass_class);
2042             StructCopy(data.start_class,
2043                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2044                        struct regnode_charclass_class);
2045             r->regstclass = (regnode*)RExC_rx->data->data[n];
2046             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2047             PL_regdata = r->data; /* for regprop() */
2048             DEBUG_r({ SV *sv = sv_newmortal();
2049                       regprop(sv, (regnode*)data.start_class);
2050                       PerlIO_printf(Perl_debug_log,
2051                                     "synthetic stclass `%s'.\n",
2052                                     SvPVX(sv));});
2053         }
2054
2055         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2056         if (longest_fixed_length > longest_float_length) {
2057             r->check_substr = r->anchored_substr;
2058             r->check_utf8 = r->anchored_utf8;
2059             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2060             if (r->reganch & ROPT_ANCH_SINGLE)
2061                 r->reganch |= ROPT_NOSCAN;
2062         }
2063         else {
2064             r->check_substr = r->float_substr;
2065             r->check_utf8 = r->float_utf8;
2066             r->check_offset_min = data.offset_float_min;
2067             r->check_offset_max = data.offset_float_max;
2068         }
2069         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2070            This should be changed ASAP!  */
2071         if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2072             r->reganch |= RE_USE_INTUIT;
2073             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2074                 r->reganch |= RE_INTUIT_TAIL;
2075         }
2076     }
2077     else {
2078         /* Several toplevels. Best we can is to set minlen. */
2079         I32 fake;
2080         struct regnode_charclass_class ch_class;
2081         I32 last_close = 0;
2082         
2083         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2084         scan = r->program + 1;
2085         cl_init(pRExC_state, &ch_class);
2086         data.start_class = &ch_class;
2087         data.last_closep = &last_close;
2088         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2089         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2090                 = r->float_substr = r->float_utf8 = Nullsv;
2091         if (!(data.start_class->flags & ANYOF_EOS)
2092             && !cl_is_anything(data.start_class))
2093         {
2094             I32 n = add_data(pRExC_state, 1, "f");
2095
2096             New(1006, RExC_rx->data->data[n], 1,
2097                 struct regnode_charclass_class);
2098             StructCopy(data.start_class,
2099                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2100                        struct regnode_charclass_class);
2101             r->regstclass = (regnode*)RExC_rx->data->data[n];
2102             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2103             DEBUG_r({ SV* sv = sv_newmortal();
2104                       regprop(sv, (regnode*)data.start_class);
2105                       PerlIO_printf(Perl_debug_log,
2106                                     "synthetic stclass `%s'.\n",
2107                                     SvPVX(sv));});
2108         }
2109     }
2110
2111     r->minlen = minlen;
2112     if (RExC_seen & REG_SEEN_GPOS)
2113         r->reganch |= ROPT_GPOS_SEEN;
2114     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2115         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2116     if (RExC_seen & REG_SEEN_EVAL)
2117         r->reganch |= ROPT_EVAL_SEEN;
2118     if (RExC_seen & REG_SEEN_CANY)
2119         r->reganch |= ROPT_CANY_SEEN;
2120     Newz(1002, r->startp, RExC_npar, I32);
2121     Newz(1002, r->endp, RExC_npar, I32);
2122     PL_regdata = r->data; /* for regprop() */
2123     DEBUG_r(regdump(r));
2124     return(r);
2125 }
2126
2127 /*
2128  - reg - regular expression, i.e. main body or parenthesized thing
2129  *
2130  * Caller must absorb opening parenthesis.
2131  *
2132  * Combining parenthesis handling with the base level of regular expression
2133  * is a trifle forced, but the need to tie the tails of the branches to what
2134  * follows makes it hard to avoid.
2135  */
2136 STATIC regnode *
2137 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2138     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2139 {
2140     register regnode *ret;              /* Will be the head of the group. */
2141     register regnode *br;
2142     register regnode *lastbr;
2143     register regnode *ender = 0;
2144     register I32 parno = 0;
2145     I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2146
2147     /* for (?g), (?gc), and (?o) warnings; warning
2148        about (?c) will warn about (?g) -- japhy    */
2149
2150     I32 wastedflags = 0x00,
2151         wasted_o    = 0x01,
2152         wasted_g    = 0x02,
2153         wasted_gc   = 0x02 | 0x04,
2154         wasted_c    = 0x04;
2155
2156     char * parse_start = RExC_parse; /* MJD */
2157     char *oregcomp_parse = RExC_parse;
2158     char c;
2159
2160     *flagp = 0;                         /* Tentatively. */
2161
2162
2163     /* Make an OPEN node, if parenthesized. */
2164     if (paren) {
2165         if (*RExC_parse == '?') { /* (?...) */
2166             U32 posflags = 0, negflags = 0;
2167             U32 *flagsp = &posflags;
2168             int logical = 0;
2169             char *seqstart = RExC_parse;
2170
2171             RExC_parse++;
2172             paren = *RExC_parse++;
2173             ret = NULL;                 /* For look-ahead/behind. */
2174             switch (paren) {
2175             case '<':           /* (?<...) */
2176                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2177                 if (*RExC_parse == '!')
2178                     paren = ',';
2179                 if (*RExC_parse != '=' && *RExC_parse != '!')
2180                     goto unknown;
2181                 RExC_parse++;
2182             case '=':           /* (?=...) */
2183             case '!':           /* (?!...) */
2184                 RExC_seen_zerolen++;
2185             case ':':           /* (?:...) */
2186             case '>':           /* (?>...) */
2187                 break;
2188             case '$':           /* (?$...) */
2189             case '@':           /* (?@...) */
2190                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2191                 break;
2192             case '#':           /* (?#...) */
2193                 while (*RExC_parse && *RExC_parse != ')')
2194                     RExC_parse++;
2195                 if (*RExC_parse != ')')
2196                     FAIL("Sequence (?#... not terminated");
2197                 nextchar(pRExC_state);
2198                 *flagp = TRYAGAIN;
2199                 return NULL;
2200             case 'p':           /* (?p...) */
2201                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2202                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2203                 /* FALL THROUGH*/
2204             case '?':           /* (??...) */
2205                 logical = 1;
2206                 if (*RExC_parse != '{')
2207                     goto unknown;
2208                 paren = *RExC_parse++;
2209                 /* FALL THROUGH */
2210             case '{':           /* (?{...}) */
2211             {
2212                 I32 count = 1, n = 0;
2213                 char c;
2214                 char *s = RExC_parse;
2215                 SV *sv;
2216                 OP_4tree *sop, *rop;
2217
2218                 RExC_seen_zerolen++;
2219                 RExC_seen |= REG_SEEN_EVAL;
2220                 while (count && (c = *RExC_parse)) {
2221                     if (c == '\\' && RExC_parse[1])
2222                         RExC_parse++;
2223                     else if (c == '{')
2224                         count++;
2225                     else if (c == '}')
2226                         count--;
2227                     RExC_parse++;
2228                 }
2229                 if (*RExC_parse != ')')
2230                 {
2231                     RExC_parse = s;             
2232                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2233                 }
2234                 if (!SIZE_ONLY) {
2235                     PAD *pad;
2236                 
2237                     if (RExC_parse - 1 - s)
2238                         sv = newSVpvn(s, RExC_parse - 1 - s);
2239                     else
2240                         sv = newSVpvn("", 0);
2241
2242                     ENTER;
2243                     Perl_save_re_context(aTHX);
2244                     rop = sv_compile_2op(sv, &sop, "re", &pad);
2245                     sop->op_private |= OPpREFCOUNTED;
2246                     /* re_dup will OpREFCNT_inc */
2247                     OpREFCNT_set(sop, 1);
2248                     LEAVE;
2249
2250                     n = add_data(pRExC_state, 3, "nop");
2251                     RExC_rx->data->data[n] = (void*)rop;
2252                     RExC_rx->data->data[n+1] = (void*)sop;
2253                     RExC_rx->data->data[n+2] = (void*)pad;
2254                     SvREFCNT_dec(sv);
2255                 }
2256                 else {                                          /* First pass */
2257                     if (PL_reginterp_cnt < ++RExC_seen_evals
2258                         && IN_PERL_RUNTIME)
2259                         /* No compiled RE interpolated, has runtime
2260                            components ===> unsafe.  */
2261                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2262                     if (PL_tainting && PL_tainted)
2263                         FAIL("Eval-group in insecure regular expression");
2264                     if (IN_PERL_COMPILETIME)
2265                         PL_cv_has_eval = 1;
2266                 }
2267
2268                 nextchar(pRExC_state);
2269                 if (logical) {
2270                     ret = reg_node(pRExC_state, LOGICAL);
2271                     if (!SIZE_ONLY)
2272                         ret->flags = 2;
2273                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2274                     /* deal with the length of this later - MJD */
2275                     return ret;
2276                 }
2277                 ret = reganode(pRExC_state, EVAL, n);
2278                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2279                 Set_Node_Offset(ret, parse_start);
2280                 return ret;
2281             }
2282             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2283             {
2284                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2285                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2286                         || RExC_parse[1] == '<'
2287                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2288                         I32 flag;
2289                         
2290                         ret = reg_node(pRExC_state, LOGICAL);
2291                         if (!SIZE_ONLY)
2292                             ret->flags = 1;
2293                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2294                         goto insert_if;
2295                     }
2296                 }
2297                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2298                     /* (?(1)...) */
2299                     parno = atoi(RExC_parse++);
2300
2301                     while (isDIGIT(*RExC_parse))
2302                         RExC_parse++;
2303                     ret = reganode(pRExC_state, GROUPP, parno);
2304                     
2305                     if ((c = *nextchar(pRExC_state)) != ')')
2306                         vFAIL("Switch condition not recognized");
2307                   insert_if:
2308                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2309                     br = regbranch(pRExC_state, &flags, 1);
2310                     if (br == NULL)
2311                         br = reganode(pRExC_state, LONGJMP, 0);
2312                     else
2313                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2314                     c = *nextchar(pRExC_state);
2315                     if (flags&HASWIDTH)
2316                         *flagp |= HASWIDTH;
2317                     if (c == '|') {
2318                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2319                         regbranch(pRExC_state, &flags, 1);
2320                         regtail(pRExC_state, ret, lastbr);
2321                         if (flags&HASWIDTH)
2322                             *flagp |= HASWIDTH;
2323                         c = *nextchar(pRExC_state);
2324                     }
2325                     else
2326                         lastbr = NULL;
2327                     if (c != ')')
2328                         vFAIL("Switch (?(condition)... contains too many branches");
2329                     ender = reg_node(pRExC_state, TAIL);
2330                     regtail(pRExC_state, br, ender);
2331                     if (lastbr) {
2332                         regtail(pRExC_state, lastbr, ender);
2333                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2334                     }
2335                     else
2336                         regtail(pRExC_state, ret, ender);
2337                     return ret;
2338                 }
2339                 else {
2340                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2341                 }
2342             }
2343             case 0:
2344                 RExC_parse--; /* for vFAIL to print correctly */
2345                 vFAIL("Sequence (? incomplete");
2346                 break;
2347             default:
2348                 --RExC_parse;
2349               parse_flags:      /* (?i) */
2350                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2351                     /* (?g), (?gc) and (?o) are useless here
2352                        and must be globally applied -- japhy */
2353
2354                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2355                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2356                             I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2357                             if (! (wastedflags & wflagbit) ) {
2358                                 wastedflags |= wflagbit;
2359                                 vWARN5(
2360                                     RExC_parse + 1,
2361                                     "Useless (%s%c) - %suse /%c modifier",
2362                                     flagsp == &negflags ? "?-" : "?",
2363                                     *RExC_parse,
2364                                     flagsp == &negflags ? "don't " : "",
2365                                     *RExC_parse
2366                                 );
2367                             }
2368                         }
2369                     }
2370                     else if (*RExC_parse == 'c') {
2371                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2372                             if (! (wastedflags & wasted_c) ) {
2373                                 wastedflags |= wasted_gc;
2374                                 vWARN3(
2375                                     RExC_parse + 1,
2376                                     "Useless (%sc) - %suse /gc modifier",
2377                                     flagsp == &negflags ? "?-" : "?",
2378                                     flagsp == &negflags ? "don't " : ""
2379                                 );
2380                             }
2381                         }
2382                     }
2383                     else { pmflag(flagsp, *RExC_parse); }
2384
2385                     ++RExC_parse;
2386                 }
2387                 if (*RExC_parse == '-') {
2388                     flagsp = &negflags;
2389                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2390                     ++RExC_parse;
2391                     goto parse_flags;
2392                 }
2393                 RExC_flags |= posflags;
2394                 RExC_flags &= ~negflags;
2395                 if (*RExC_parse == ':') {
2396                     RExC_parse++;
2397                     paren = ':';
2398                     break;
2399                 }               
2400               unknown:
2401                 if (*RExC_parse != ')') {
2402                     RExC_parse++;
2403                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2404                 }
2405                 nextchar(pRExC_state);
2406                 *flagp = TRYAGAIN;
2407                 return NULL;
2408             }
2409         }
2410         else {                  /* (...) */
2411             parno = RExC_npar;
2412             RExC_npar++;
2413             ret = reganode(pRExC_state, OPEN, parno);
2414             Set_Node_Length(ret, 1); /* MJD */
2415             Set_Node_Offset(ret, RExC_parse); /* MJD */
2416             open = 1;
2417         }
2418     }
2419     else                        /* ! paren */
2420         ret = NULL;
2421
2422     /* Pick up the branches, linking them together. */
2423     parse_start = RExC_parse;   /* MJD */
2424     br = regbranch(pRExC_state, &flags, 1);
2425     /*     branch_len = (paren != 0); */
2426     
2427     if (br == NULL)
2428         return(NULL);
2429     if (*RExC_parse == '|') {
2430         if (!SIZE_ONLY && RExC_extralen) {
2431             reginsert(pRExC_state, BRANCHJ, br);
2432         }
2433         else {                  /* MJD */
2434             reginsert(pRExC_state, BRANCH, br);
2435             Set_Node_Length(br, paren != 0);
2436             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2437         }
2438         have_branch = 1;
2439         if (SIZE_ONLY)
2440             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2441     }
2442     else if (paren == ':') {
2443         *flagp |= flags&SIMPLE;
2444     }
2445     if (open) {                         /* Starts with OPEN. */
2446         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2447     }
2448     else if (paren != '?')              /* Not Conditional */
2449         ret = br;
2450     *flagp |= flags & (SPSTART | HASWIDTH);
2451     lastbr = br;
2452     while (*RExC_parse == '|') {
2453         if (!SIZE_ONLY && RExC_extralen) {
2454             ender = reganode(pRExC_state, LONGJMP,0);
2455             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2456         }
2457         if (SIZE_ONLY)
2458             RExC_extralen += 2;         /* Account for LONGJMP. */
2459         nextchar(pRExC_state);
2460         br = regbranch(pRExC_state, &flags, 0);
2461         
2462         if (br == NULL)
2463             return(NULL);
2464         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2465         lastbr = br;
2466         if (flags&HASWIDTH)
2467             *flagp |= HASWIDTH;
2468         *flagp |= flags&SPSTART;
2469     }
2470
2471     if (have_branch || paren != ':') {
2472         /* Make a closing node, and hook it on the end. */
2473         switch (paren) {
2474         case ':':
2475             ender = reg_node(pRExC_state, TAIL);
2476             break;
2477         case 1:
2478             ender = reganode(pRExC_state, CLOSE, parno);
2479             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2480             Set_Node_Length(ender,1); /* MJD */
2481             break;
2482         case '<':
2483         case ',':
2484         case '=':
2485         case '!':
2486             *flagp &= ~HASWIDTH;
2487             /* FALL THROUGH */
2488         case '>':
2489             ender = reg_node(pRExC_state, SUCCEED);
2490             break;
2491         case 0:
2492             ender = reg_node(pRExC_state, END);
2493             break;
2494         }
2495         regtail(pRExC_state, lastbr, ender);
2496
2497         if (have_branch) {
2498             /* Hook the tails of the branches to the closing node. */
2499             for (br = ret; br != NULL; br = regnext(br)) {
2500                 regoptail(pRExC_state, br, ender);
2501             }
2502         }
2503     }
2504
2505     {
2506         char *p;
2507         static char parens[] = "=!<,>";
2508
2509         if (paren && (p = strchr(parens, paren))) {
2510             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2511             int flag = (p - parens) > 1;
2512
2513             if (paren == '>')
2514                 node = SUSPEND, flag = 0;
2515             reginsert(pRExC_state, node,ret);
2516             Set_Node_Cur_Length(ret);
2517             Set_Node_Offset(ret, parse_start + 1);
2518             ret->flags = flag;
2519             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2520         }
2521     }
2522
2523     /* Check for proper termination. */
2524     if (paren) {
2525         RExC_flags = oregflags;
2526         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2527             RExC_parse = oregcomp_parse;
2528             vFAIL("Unmatched (");
2529         }
2530     }
2531     else if (!paren && RExC_parse < RExC_end) {
2532         if (*RExC_parse == ')') {
2533             RExC_parse++;
2534             vFAIL("Unmatched )");
2535         }
2536         else
2537             FAIL("Junk on end of regexp");      /* "Can't happen". */
2538         /* NOTREACHED */
2539     }
2540
2541     return(ret);
2542 }
2543
2544 /*
2545  - regbranch - one alternative of an | operator
2546  *
2547  * Implements the concatenation operator.
2548  */
2549 STATIC regnode *
2550 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2551 {
2552     register regnode *ret;
2553     register regnode *chain = NULL;
2554     register regnode *latest;
2555     I32 flags = 0, c = 0;
2556
2557     if (first)
2558         ret = NULL;
2559     else {
2560         if (!SIZE_ONLY && RExC_extralen)
2561             ret = reganode(pRExC_state, BRANCHJ,0);
2562         else {
2563             ret = reg_node(pRExC_state, BRANCH);
2564             Set_Node_Length(ret, 1);
2565         }
2566     }
2567         
2568     if (!first && SIZE_ONLY)
2569         RExC_extralen += 1;                     /* BRANCHJ */
2570
2571     *flagp = WORST;                     /* Tentatively. */
2572
2573     RExC_parse--;
2574     nextchar(pRExC_state);
2575     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2576         flags &= ~TRYAGAIN;
2577         latest = regpiece(pRExC_state, &flags);
2578         if (latest == NULL) {
2579             if (flags & TRYAGAIN)
2580                 continue;
2581             return(NULL);
2582         }
2583         else if (ret == NULL)
2584             ret = latest;
2585         *flagp |= flags&HASWIDTH;
2586         if (chain == NULL)      /* First piece. */
2587             *flagp |= flags&SPSTART;
2588         else {
2589             RExC_naughty++;
2590             regtail(pRExC_state, chain, latest);
2591         }
2592         chain = latest;
2593         c++;
2594     }
2595     if (chain == NULL) {        /* Loop ran zero times. */
2596         chain = reg_node(pRExC_state, NOTHING);
2597         if (ret == NULL)
2598             ret = chain;
2599     }
2600     if (c == 1) {
2601         *flagp |= flags&SIMPLE;
2602     }
2603
2604     return(ret);
2605 }
2606
2607 /*
2608  - regpiece - something followed by possible [*+?]
2609  *
2610  * Note that the branching code sequences used for ? and the general cases
2611  * of * and + are somewhat optimized:  they use the same NOTHING node as
2612  * both the endmarker for their branch list and the body of the last branch.
2613  * It might seem that this node could be dispensed with entirely, but the
2614  * endmarker role is not redundant.
2615  */
2616 STATIC regnode *
2617 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2618 {
2619     register regnode *ret;
2620     register char op;
2621     register char *next;
2622     I32 flags;
2623     char *origparse = RExC_parse;
2624     char *maxpos;
2625     I32 min;
2626     I32 max = REG_INFTY;
2627     char *parse_start;
2628
2629     ret = regatom(pRExC_state, &flags);
2630     if (ret == NULL) {
2631         if (flags & TRYAGAIN)
2632             *flagp |= TRYAGAIN;
2633         return(NULL);
2634     }
2635
2636     op = *RExC_parse;
2637
2638     if (op == '{' && regcurly(RExC_parse)) {
2639         parse_start = RExC_parse; /* MJD */
2640         next = RExC_parse + 1;
2641         maxpos = Nullch;
2642         while (isDIGIT(*next) || *next == ',') {
2643             if (*next == ',') {
2644                 if (maxpos)
2645                     break;
2646                 else
2647                     maxpos = next;
2648             }
2649             next++;
2650         }
2651         if (*next == '}') {             /* got one */
2652             if (!maxpos)
2653                 maxpos = next;
2654             RExC_parse++;
2655             min = atoi(RExC_parse);
2656             if (*maxpos == ',')
2657                 maxpos++;
2658             else
2659                 maxpos = RExC_parse;
2660             max = atoi(maxpos);
2661             if (!max && *maxpos != '0')
2662                 max = REG_INFTY;                /* meaning "infinity" */
2663             else if (max >= REG_INFTY)
2664                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2665             RExC_parse = next;
2666             nextchar(pRExC_state);
2667
2668         do_curly:
2669             if ((flags&SIMPLE)) {
2670                 RExC_naughty += 2 + RExC_naughty / 2;
2671                 reginsert(pRExC_state, CURLY, ret);
2672                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2673                 Set_Node_Cur_Length(ret);
2674             }
2675             else {
2676                 regnode *w = reg_node(pRExC_state, WHILEM);
2677
2678                 w->flags = 0;
2679                 regtail(pRExC_state, ret, w);
2680                 if (!SIZE_ONLY && RExC_extralen) {
2681                     reginsert(pRExC_state, LONGJMP,ret);
2682                     reginsert(pRExC_state, NOTHING,ret);
2683                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2684                 }
2685                 reginsert(pRExC_state, CURLYX,ret);
2686                                 /* MJD hk */
2687                 Set_Node_Offset(ret, parse_start+1);
2688                 Set_Node_Length(ret, 
2689                                 op == '{' ? (RExC_parse - parse_start) : 1);
2690                 
2691                 if (!SIZE_ONLY && RExC_extralen)
2692                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2693                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2694                 if (SIZE_ONLY)
2695                     RExC_whilem_seen++, RExC_extralen += 3;
2696                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2697             }
2698             ret->flags = 0;
2699
2700             if (min > 0)
2701                 *flagp = WORST;
2702             if (max > 0)
2703                 *flagp |= HASWIDTH;
2704             if (max && max < min)
2705                 vFAIL("Can't do {n,m} with n > m");
2706             if (!SIZE_ONLY) {
2707                 ARG1_SET(ret, (U16)min);
2708                 ARG2_SET(ret, (U16)max);
2709             }
2710
2711             goto nest_check;
2712         }
2713     }
2714
2715     if (!ISMULT1(op)) {
2716         *flagp = flags;
2717         return(ret);
2718     }
2719
2720 #if 0                           /* Now runtime fix should be reliable. */
2721
2722     /* if this is reinstated, don't forget to put this back into perldiag:
2723
2724             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2725
2726            (F) The part of the regexp subject to either the * or + quantifier
2727            could match an empty string. The {#} shows in the regular
2728            expression about where the problem was discovered.
2729
2730     */
2731
2732     if (!(flags&HASWIDTH) && op != '?')
2733       vFAIL("Regexp *+ operand could be empty");
2734 #endif
2735
2736     parse_start = RExC_parse;
2737     nextchar(pRExC_state);
2738
2739     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2740
2741     if (op == '*' && (flags&SIMPLE)) {
2742         reginsert(pRExC_state, STAR, ret);
2743         ret->flags = 0;
2744         RExC_naughty += 4;
2745     }
2746     else if (op == '*') {
2747         min = 0;
2748         goto do_curly;
2749     }
2750     else if (op == '+' && (flags&SIMPLE)) {
2751         reginsert(pRExC_state, PLUS, ret);
2752         ret->flags = 0;
2753         RExC_naughty += 3;
2754     }
2755     else if (op == '+') {
2756         min = 1;
2757         goto do_curly;
2758     }
2759     else if (op == '?') {
2760         min = 0; max = 1;
2761         goto do_curly;
2762     }
2763   nest_check:
2764     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2765         vWARN3(RExC_parse,
2766                "%.*s matches null string many times",
2767                RExC_parse - origparse,
2768                origparse);
2769     }
2770
2771     if (*RExC_parse == '?') {
2772         nextchar(pRExC_state);
2773         reginsert(pRExC_state, MINMOD, ret);
2774         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2775     }
2776     if (ISMULT2(RExC_parse)) {
2777         RExC_parse++;
2778         vFAIL("Nested quantifiers");
2779     }
2780
2781     return(ret);
2782 }
2783
2784 /*
2785  - regatom - the lowest level
2786  *
2787  * Optimization:  gobbles an entire sequence of ordinary characters so that
2788  * it can turn them into a single node, which is smaller to store and
2789  * faster to run.  Backslashed characters are exceptions, each becoming a
2790  * separate node; the code is simpler that way and it's not worth fixing.
2791  *
2792  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2793 STATIC regnode *
2794 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2795 {
2796     register regnode *ret = 0;
2797     I32 flags;
2798     char *parse_start = RExC_parse;
2799
2800     *flagp = WORST;             /* Tentatively. */
2801
2802 tryagain:
2803     switch (*RExC_parse) {
2804     case '^':
2805         RExC_seen_zerolen++;
2806         nextchar(pRExC_state);
2807         if (RExC_flags & PMf_MULTILINE)
2808             ret = reg_node(pRExC_state, MBOL);
2809         else if (RExC_flags & PMf_SINGLELINE)
2810             ret = reg_node(pRExC_state, SBOL);
2811         else
2812             ret = reg_node(pRExC_state, BOL);
2813         Set_Node_Length(ret, 1); /* MJD */
2814         break;
2815     case '$':
2816         nextchar(pRExC_state);
2817         if (*RExC_parse)
2818             RExC_seen_zerolen++;
2819         if (RExC_flags & PMf_MULTILINE)
2820             ret = reg_node(pRExC_state, MEOL);
2821         else if (RExC_flags & PMf_SINGLELINE)
2822             ret = reg_node(pRExC_state, SEOL);
2823         else
2824             ret = reg_node(pRExC_state, EOL);
2825         Set_Node_Length(ret, 1); /* MJD */
2826         break;
2827     case '.':
2828         nextchar(pRExC_state);
2829         if (RExC_flags & PMf_SINGLELINE)
2830             ret = reg_node(pRExC_state, SANY);
2831         else
2832             ret = reg_node(pRExC_state, REG_ANY);
2833         *flagp |= HASWIDTH|SIMPLE;
2834         RExC_naughty++;
2835         Set_Node_Length(ret, 1); /* MJD */
2836         break;
2837     case '[':
2838     {
2839         char *oregcomp_parse = ++RExC_parse;
2840         ret = regclass(pRExC_state);
2841         if (*RExC_parse != ']') {
2842             RExC_parse = oregcomp_parse;
2843             vFAIL("Unmatched [");
2844         }
2845         nextchar(pRExC_state);
2846         *flagp |= HASWIDTH|SIMPLE;
2847         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2848         break;
2849     }
2850     case '(':
2851         nextchar(pRExC_state);
2852         ret = reg(pRExC_state, 1, &flags);
2853         if (ret == NULL) {
2854                 if (flags & TRYAGAIN) {
2855                     if (RExC_parse == RExC_end) {
2856                          /* Make parent create an empty node if needed. */
2857                         *flagp |= TRYAGAIN;
2858                         return(NULL);
2859                     }
2860                     goto tryagain;
2861                 }
2862                 return(NULL);
2863         }
2864         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2865         break;
2866     case '|':
2867     case ')':
2868         if (flags & TRYAGAIN) {
2869             *flagp |= TRYAGAIN;
2870             return NULL;
2871         }
2872         vFAIL("Internal urp");
2873                                 /* Supposed to be caught earlier. */
2874         break;
2875     case '{':
2876         if (!regcurly(RExC_parse)) {
2877             RExC_parse++;
2878             goto defchar;
2879         }
2880         /* FALL THROUGH */
2881     case '?':
2882     case '+':
2883     case '*':
2884         RExC_parse++;
2885         vFAIL("Quantifier follows nothing");
2886         break;
2887     case '\\':
2888         switch (*++RExC_parse) {
2889         case 'A':
2890             RExC_seen_zerolen++;
2891             ret = reg_node(pRExC_state, SBOL);
2892             *flagp |= SIMPLE;
2893             nextchar(pRExC_state);
2894             Set_Node_Length(ret, 2); /* MJD */
2895             break;
2896         case 'G':
2897             ret = reg_node(pRExC_state, GPOS);
2898             RExC_seen |= REG_SEEN_GPOS;
2899             *flagp |= SIMPLE;
2900             nextchar(pRExC_state);
2901             Set_Node_Length(ret, 2); /* MJD */
2902             break;
2903         case 'Z':
2904             ret = reg_node(pRExC_state, SEOL);
2905             *flagp |= SIMPLE;
2906             RExC_seen_zerolen++;                /* Do not optimize RE away */
2907             nextchar(pRExC_state);
2908             break;
2909         case 'z':
2910             ret = reg_node(pRExC_state, EOS);
2911             *flagp |= SIMPLE;
2912             RExC_seen_zerolen++;                /* Do not optimize RE away */
2913             nextchar(pRExC_state);
2914             Set_Node_Length(ret, 2); /* MJD */
2915             break;
2916         case 'C':
2917             ret = reg_node(pRExC_state, CANY);
2918             RExC_seen |= REG_SEEN_CANY;
2919             *flagp |= HASWIDTH|SIMPLE;
2920             nextchar(pRExC_state);
2921             Set_Node_Length(ret, 2); /* MJD */
2922             break;
2923         case 'X':
2924             ret = reg_node(pRExC_state, CLUMP);
2925             *flagp |= HASWIDTH;
2926             nextchar(pRExC_state);
2927             Set_Node_Length(ret, 2); /* MJD */
2928             break;
2929         case 'w':
2930             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
2931             *flagp |= HASWIDTH|SIMPLE;
2932             nextchar(pRExC_state);
2933             Set_Node_Length(ret, 2); /* MJD */
2934             break;
2935         case 'W':
2936             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
2937             *flagp |= HASWIDTH|SIMPLE;
2938             nextchar(pRExC_state);
2939             Set_Node_Length(ret, 2); /* MJD */
2940             break;
2941         case 'b':
2942             RExC_seen_zerolen++;
2943             RExC_seen |= REG_SEEN_LOOKBEHIND;
2944             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
2945             *flagp |= SIMPLE;
2946             nextchar(pRExC_state);
2947             Set_Node_Length(ret, 2); /* MJD */
2948             break;
2949         case 'B':
2950             RExC_seen_zerolen++;
2951             RExC_seen |= REG_SEEN_LOOKBEHIND;
2952             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
2953             *flagp |= SIMPLE;
2954             nextchar(pRExC_state);
2955             Set_Node_Length(ret, 2); /* MJD */
2956             break;
2957         case 's':
2958             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
2959             *flagp |= HASWIDTH|SIMPLE;
2960             nextchar(pRExC_state);
2961             Set_Node_Length(ret, 2); /* MJD */
2962             break;
2963         case 'S':
2964             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
2965             *flagp |= HASWIDTH|SIMPLE;
2966             nextchar(pRExC_state);
2967             Set_Node_Length(ret, 2); /* MJD */
2968             break;
2969         case 'd':
2970             ret = reg_node(pRExC_state, DIGIT);
2971             *flagp |= HASWIDTH|SIMPLE;
2972             nextchar(pRExC_state);
2973             Set_Node_Length(ret, 2); /* MJD */
2974             break;
2975         case 'D':
2976             ret = reg_node(pRExC_state, NDIGIT);
2977             *flagp |= HASWIDTH|SIMPLE;
2978             nextchar(pRExC_state);
2979             Set_Node_Length(ret, 2); /* MJD */
2980             break;
2981         case 'p':
2982         case 'P':
2983             {   
2984                 char* oldregxend = RExC_end;
2985                 char* parse_start = RExC_parse - 2;
2986
2987                 if (RExC_parse[1] == '{') {
2988                   /* a lovely hack--pretend we saw [\pX] instead */
2989                     RExC_end = strchr(RExC_parse, '}');
2990                     if (!RExC_end) {
2991                         U8 c = (U8)*RExC_parse;
2992                         RExC_parse += 2;
2993                         RExC_end = oldregxend;
2994                         vFAIL2("Missing right brace on \\%c{}", c);
2995                     }
2996                     RExC_end++;
2997                 }
2998                 else {
2999                     RExC_end = RExC_parse + 2;
3000                     if (RExC_end > oldregxend)
3001                         RExC_end = oldregxend;
3002                 }
3003                 RExC_parse--;
3004
3005                 ret = regclass(pRExC_state);
3006
3007                 RExC_end = oldregxend;
3008                 RExC_parse--;
3009
3010                 Set_Node_Offset(ret, parse_start + 2);
3011                 Set_Node_Cur_Length(ret);
3012                 nextchar(pRExC_state);
3013                 *flagp |= HASWIDTH|SIMPLE;
3014             }
3015             break;
3016         case 'n':
3017         case 'r':
3018         case 't':
3019         case 'f':
3020         case 'e':
3021         case 'a':
3022         case 'x':
3023         case 'c':
3024         case '0':
3025             goto defchar;
3026         case '1': case '2': case '3': case '4':
3027         case '5': case '6': case '7': case '8': case '9':
3028             {
3029                 I32 num = atoi(RExC_parse);
3030
3031                 if (num > 9 && num >= RExC_npar)
3032                     goto defchar;
3033                 else {
3034                     char * parse_start = RExC_parse - 1; /* MJD */
3035                     while (isDIGIT(*RExC_parse))
3036                         RExC_parse++;
3037
3038                     if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3039                         vFAIL("Reference to nonexistent group");
3040                     RExC_sawback = 1;
3041                     ret = reganode(pRExC_state,
3042                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3043                                    num);
3044                     *flagp |= HASWIDTH;
3045                     
3046                     /* override incorrect value set in reganode MJD */
3047                     Set_Node_Offset(ret, parse_start+1); 
3048                     Set_Node_Cur_Length(ret); /* MJD */
3049                     RExC_parse--;
3050                     nextchar(pRExC_state);
3051                 }
3052             }
3053             break;
3054         case '\0':
3055             if (RExC_parse >= RExC_end)
3056                 FAIL("Trailing \\");
3057             /* FALL THROUGH */
3058         default:
3059             /* Do not generate `unrecognized' warnings here, we fall
3060                back into the quick-grab loop below */
3061             parse_start--;
3062             goto defchar;
3063         }
3064         break;
3065
3066     case '#':
3067         if (RExC_flags & PMf_EXTENDED) {
3068             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3069             if (RExC_parse < RExC_end)
3070                 goto tryagain;
3071         }
3072         /* FALL THROUGH */
3073
3074     default: {
3075             register STRLEN len;
3076             register UV ender;
3077             register char *p;
3078             char *oldp, *s;
3079             STRLEN numlen;
3080             STRLEN foldlen;
3081             U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3082
3083             parse_start = RExC_parse - 1;
3084
3085             RExC_parse++;
3086
3087         defchar:
3088             ender = 0;
3089             ret = reg_node(pRExC_state,
3090                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3091             s = STRING(ret);
3092             for (len = 0, p = RExC_parse - 1;
3093               len < 127 && p < RExC_end;
3094               len++)
3095             {
3096                 oldp = p;
3097
3098                 if (RExC_flags & PMf_EXTENDED)
3099                     p = regwhite(p, RExC_end);
3100                 switch (*p) {
3101                 case '^':
3102                 case '$':
3103                 case '.':
3104                 case '[':
3105                 case '(':
3106                 case ')':
3107                 case '|':
3108                     goto loopdone;
3109                 case '\\':
3110                     switch (*++p) {
3111                     case 'A':
3112                     case 'C':
3113                     case 'X':
3114                     case 'G':
3115                     case 'Z':
3116                     case 'z':
3117                     case 'w':
3118                     case 'W':
3119                     case 'b':
3120                     case 'B':
3121                     case 's':
3122                     case 'S':
3123                     case 'd':
3124                     case 'D':
3125                     case 'p':
3126                     case 'P':
3127                         --p;
3128                         goto loopdone;
3129                     case 'n':
3130                         ender = '\n';
3131                         p++;
3132                         break;
3133                     case 'r':
3134                         ender = '\r';
3135                         p++;
3136                         break;
3137                     case 't':
3138                         ender = '\t';
3139                         p++;
3140                         break;
3141                     case 'f':
3142                         ender = '\f';
3143                         p++;
3144                         break;
3145                     case 'e':
3146                           ender = ASCII_TO_NATIVE('\033');
3147                         p++;
3148                         break;
3149                     case 'a':
3150                           ender = ASCII_TO_NATIVE('\007');
3151                         p++;
3152                         break;
3153                     case 'x':
3154                         if (*++p == '{') {
3155                             char* e = strchr(p, '}');
3156         
3157                             if (!e) {
3158                                 RExC_parse = p + 1;
3159                                 vFAIL("Missing right brace on \\x{}");
3160                             }
3161                             else {
3162                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3163                                     | PERL_SCAN_DISALLOW_PREFIX;
3164                                 numlen = e - p - 1;
3165                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3166                                 if (ender > 0xff)
3167                                     RExC_utf8 = 1;
3168                                 p = e + 1;
3169                             }
3170                         }
3171                         else {
3172                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3173                             numlen = 2;
3174                             ender = grok_hex(p, &numlen, &flags, NULL);
3175                             p += numlen;
3176                         }
3177                         break;
3178                     case 'c':
3179                         p++;
3180                         ender = UCHARAT(p++);
3181                         ender = toCTRL(ender);
3182                         break;
3183                     case '0': case '1': case '2': case '3':case '4':
3184                     case '5': case '6': case '7': case '8':case '9':
3185                         if (*p == '0' ||
3186                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3187                             I32 flags = 0;
3188                             numlen = 3;
3189                             ender = grok_oct(p, &numlen, &flags, NULL);
3190                             p += numlen;
3191                         }
3192                         else {
3193                             --p;
3194                             goto loopdone;
3195                         }
3196                         break;
3197                     case '\0':
3198                         if (p >= RExC_end)
3199                             FAIL("Trailing \\");
3200                         /* FALL THROUGH */
3201                     default:
3202                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3203                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3204                         goto normal_default;
3205                     }
3206                     break;
3207                 default:
3208                   normal_default:
3209                     if (UTF8_IS_START(*p) && UTF) {
3210                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3211                                                &numlen, 0);
3212                         p += numlen;
3213                     }
3214                     else
3215                         ender = *p++;
3216                     break;
3217                 }
3218                 if (RExC_flags & PMf_EXTENDED)
3219                     p = regwhite(p, RExC_end);
3220                 if (UTF && FOLD) {
3221                     /* Prime the casefolded buffer. */
3222                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3223                 }
3224                 if (ISMULT2(p)) { /* Back off on ?+*. */
3225                     if (len)
3226                         p = oldp;
3227                     else if (UTF) {
3228                          STRLEN unilen;
3229
3230                          if (FOLD) {
3231                               /* Emit all the Unicode characters. */
3232                               for (foldbuf = tmpbuf;
3233                                    foldlen;
3234                                    foldlen -= numlen) {
3235                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3236                                    if (numlen > 0) {
3237                                         reguni(pRExC_state, ender, s, &unilen);
3238                                         s       += unilen;
3239                                         len     += unilen;
3240                                         /* In EBCDIC the numlen
3241                                          * and unilen can differ. */
3242                                         foldbuf += numlen;
3243                                         if (numlen >= foldlen)
3244                                              break;
3245                                    }
3246                                    else
3247                                         break; /* "Can't happen." */
3248                               }
3249                          }
3250                          else {
3251                               reguni(pRExC_state, ender, s, &unilen);
3252                               if (unilen > 0) {
3253                                    s   += unilen;
3254                                    len += unilen;
3255                               }
3256                          }
3257                     }
3258                     else {
3259                         len++;
3260                         REGC((char)ender, s++);
3261                     }
3262                     break;
3263                 }
3264                 if (UTF) {
3265                      STRLEN unilen;
3266
3267                      if (FOLD) {
3268                           /* Emit all the Unicode characters. */
3269                           for (foldbuf = tmpbuf;
3270                                foldlen;
3271                                foldlen -= numlen) {
3272                                ender = utf8_to_uvchr(foldbuf, &numlen);
3273                                if (numlen > 0) {
3274                                     reguni(pRExC_state, ender, s, &unilen);
3275                                     len     += unilen;
3276                                     s       += unilen;
3277                                     /* In EBCDIC the numlen
3278                                      * and unilen can differ. */
3279                                     foldbuf += numlen;
3280                                     if (numlen >= foldlen)
3281                                          break;
3282                                }
3283                                else
3284                                     break;
3285                           }
3286                      }
3287                      else {
3288                           reguni(pRExC_state, ender, s, &unilen);
3289                           if (unilen > 0) {
3290                                s   += unilen;
3291                                len += unilen;
3292                           }
3293                      }
3294                      len--;
3295                 }
3296                 else
3297                     REGC((char)ender, s++);
3298             }
3299         loopdone:
3300             RExC_parse = p - 1;
3301             Set_Node_Cur_Length(ret); /* MJD */
3302             nextchar(pRExC_state);
3303             {
3304                 /* len is STRLEN which is unsigned, need to copy to signed */
3305                 IV iv = len;
3306                 if (iv < 0)
3307                     vFAIL("Internal disaster");
3308             }
3309             if (len > 0)
3310                 *flagp |= HASWIDTH;
3311             if (len == 1 && UNI_IS_INVARIANT(ender))
3312                 *flagp |= SIMPLE;
3313             if (!SIZE_ONLY)
3314                 STR_LEN(ret) = len;
3315             if (SIZE_ONLY)
3316                 RExC_size += STR_SZ(len);
3317             else
3318                 RExC_emit += STR_SZ(len);
3319         }
3320         break;
3321     }
3322
3323     /* If the encoding pragma is in effect recode the text of
3324      * any EXACT-kind nodes. */
3325     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3326         STRLEN oldlen = STR_LEN(ret);
3327         SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3328
3329         if (RExC_utf8)
3330             SvUTF8_on(sv);
3331         if (sv_utf8_downgrade(sv, TRUE)) {
3332             char *s       = sv_recode_to_utf8(sv, PL_encoding);
3333             STRLEN newlen = SvCUR(sv);
3334
3335             if (SvUTF8(sv))
3336                 RExC_utf8 = 1;
3337             if (!SIZE_ONLY) {
3338                 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3339                                       (int)oldlen, STRING(ret),
3340                                       (int)newlen, s));
3341                 Copy(s, STRING(ret), newlen, char);
3342                 STR_LEN(ret) += newlen - oldlen;
3343                 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3344             } else
3345                 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3346         }
3347     }
3348
3349     return(ret);
3350 }
3351
3352 STATIC char *
3353 S_regwhite(pTHX_ char *p, char *e)
3354 {
3355     while (p < e) {
3356         if (isSPACE(*p))
3357             ++p;
3358         else if (*p == '#') {
3359             do {
3360                 p++;
3361             } while (p < e && *p != '\n');
3362         }
3363         else
3364             break;
3365     }
3366     return p;
3367 }
3368
3369 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3370    Character classes ([:foo:]) can also be negated ([:^foo:]).
3371    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3372    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3373    but trigger failures because they are currently unimplemented. */
3374
3375 #define POSIXCC_DONE(c)   ((c) == ':')
3376 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3377 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3378
3379 STATIC I32
3380 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3381 {
3382     char *posixcc = 0;
3383     I32 namedclass = OOB_NAMEDCLASS;
3384
3385     if (value == '[' && RExC_parse + 1 < RExC_end &&
3386         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3387         POSIXCC(UCHARAT(RExC_parse))) {
3388         char  c = UCHARAT(RExC_parse);
3389         char* s = RExC_parse++;
3390         
3391         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3392             RExC_parse++;
3393         if (RExC_parse == RExC_end)
3394             /* Grandfather lone [:, [=, [. */
3395             RExC_parse = s;
3396         else {
3397             char* t = RExC_parse++; /* skip over the c */
3398
3399             if (UCHARAT(RExC_parse) == ']') {
3400                 RExC_parse++; /* skip over the ending ] */
3401                 posixcc = s + 1;
3402                 if (*s == ':') {
3403                     I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3404                     I32 skip = 5; /* the most common skip */
3405
3406                     switch (*posixcc) {
3407                     case 'a':
3408                         if (strnEQ(posixcc, "alnum", 5))
3409                             namedclass =
3410                                 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3411                         else if (strnEQ(posixcc, "alpha", 5))
3412                             namedclass =
3413                                 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3414                         else if (strnEQ(posixcc, "ascii", 5))
3415                             namedclass =
3416                                 complement ? ANYOF_NASCII : ANYOF_ASCII;
3417                         break;
3418                     case 'b':
3419                         if (strnEQ(posixcc, "blank", 5))
3420                             namedclass =
3421                                 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3422                         break;
3423                     case 'c':
3424                         if (strnEQ(posixcc, "cntrl", 5))
3425                             namedclass =
3426                                 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3427                         break;
3428                     case 'd':
3429                         if (strnEQ(posixcc, "digit", 5))
3430                             namedclass =
3431                                 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3432                         break;
3433                     case 'g':
3434                         if (strnEQ(posixcc, "graph", 5))
3435                             namedclass =
3436                                 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3437                         break;
3438                     case 'l':
3439                         if (strnEQ(posixcc, "lower", 5))
3440                             namedclass =
3441                                 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3442                         break;
3443                     case 'p':
3444                         if (strnEQ(posixcc, "print", 5))
3445                             namedclass =
3446                                 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3447                         else if (strnEQ(posixcc, "punct", 5))
3448                             namedclass =
3449                                 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3450                         break;
3451                     case 's':
3452                         if (strnEQ(posixcc, "space", 5))
3453                             namedclass =
3454                                 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3455                         break;
3456                     case 'u':
3457                         if (strnEQ(posixcc, "upper", 5))
3458                             namedclass =
3459                                 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3460                         break;
3461                     case 'w': /* this is not POSIX, this is the Perl \w */
3462                         if (strnEQ(posixcc, "word", 4)) {
3463                             namedclass =
3464                                 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3465                             skip = 4;
3466                         }
3467                         break;
3468                     case 'x':
3469                         if (strnEQ(posixcc, "xdigit", 6)) {
3470                             namedclass =
3471                                 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3472                             skip = 6;
3473                         }
3474                         break;
3475                     }
3476                     if (namedclass == OOB_NAMEDCLASS ||
3477                         posixcc[skip] != ':' ||
3478                         posixcc[skip+1] != ']')
3479                     {
3480                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3481                                       t - s - 1, s + 1);
3482                     }
3483                 } else if (!SIZE_ONLY) {
3484                     /* [[=foo=]] and [[.foo.]] are still future. */
3485
3486                     /* adjust RExC_parse so the warning shows after
3487                        the class closes */
3488                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3489                         RExC_parse++;
3490                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3491                 }
3492             } else {
3493                 /* Maternal grandfather:
3494                  * "[:" ending in ":" but not in ":]" */
3495                 RExC_parse = s;
3496             }
3497         }
3498     }
3499
3500     return namedclass;
3501 }
3502
3503 STATIC void
3504 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3505 {
3506     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3507         char *s = RExC_parse;
3508         char  c = *s++;
3509
3510         while(*s && isALNUM(*s))
3511             s++;
3512         if (*s && c == *s && s[1] == ']') {
3513             if (ckWARN(WARN_REGEXP))
3514                 vWARN3(s+2,
3515                         "POSIX syntax [%c %c] belongs inside character classes",
3516                         c, c);
3517
3518             /* [[=foo=]] and [[.foo.]] are still future. */
3519             if (POSIXCC_NOTYET(c)) {
3520                 /* adjust RExC_parse so the error shows after
3521                    the class closes */
3522                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3523                     ;
3524                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3525             }
3526         }
3527     }
3528 }
3529
3530 STATIC regnode *
3531 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3532 {
3533     register UV value;
3534     register UV nextvalue;
3535     register IV prevvalue = OOB_UNICODE;
3536     register IV range = 0;
3537     register regnode *ret;
3538     STRLEN numlen;
3539     IV namedclass;
3540     char *rangebegin = 0;
3541     bool need_class = 0;
3542     SV *listsv = Nullsv;
3543     register char *e;
3544     UV n;
3545     bool optimize_invert   = TRUE;
3546     AV* unicode_alternate  = 0;
3547 #ifdef EBCDIC
3548     UV literal_endpoint = 0;
3549 #endif
3550
3551     ret = reganode(pRExC_state, ANYOF, 0);
3552
3553     if (!SIZE_ONLY)
3554         ANYOF_FLAGS(ret) = 0;
3555
3556     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3557         RExC_naughty++;
3558         RExC_parse++;
3559         if (!SIZE_ONLY)
3560             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3561     }
3562
3563     if (SIZE_ONLY)
3564         RExC_size += ANYOF_SKIP;
3565     else {
3566         RExC_emit += ANYOF_SKIP;
3567         if (FOLD)
3568             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3569         if (LOC)
3570             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3571         ANYOF_BITMAP_ZERO(ret);
3572         listsv = newSVpvn("# comment\n", 10);
3573     }
3574
3575     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3576
3577     if (!SIZE_ONLY && POSIXCC(nextvalue))
3578         checkposixcc(pRExC_state);
3579
3580     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3581     if (UCHARAT(RExC_parse) == ']')
3582         goto charclassloop;
3583
3584     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3585
3586     charclassloop:
3587
3588         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3589
3590         if (!range)
3591             rangebegin = RExC_parse;
3592         if (UTF) {
3593             value = utf8n_to_uvchr((U8*)RExC_parse,
3594                                    RExC_end - RExC_parse,
3595                                    &numlen, 0);
3596             RExC_parse += numlen;
3597         }
3598         else
3599             value = UCHARAT(RExC_parse++);
3600         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3601         if (value == '[' && POSIXCC(nextvalue))
3602             namedclass = regpposixcc(pRExC_state, value);
3603         else if (value == '\\') {
3604             if (UTF) {
3605                 value = utf8n_to_uvchr((U8*)RExC_parse,
3606                                    RExC_end - RExC_parse,
3607                                    &numlen, 0);
3608                 RExC_parse += numlen;
3609             }
3610             else
3611                 value = UCHARAT(RExC_parse++);
3612             /* Some compilers cannot handle switching on 64-bit integer
3613              * values, therefore value cannot be an UV.  Yes, this will
3614              * be a problem later if we want switch on Unicode.
3615              * A similar issue a little bit later when switching on
3616              * namedclass. --jhi */
3617             switch ((I32)value) {
3618             case 'w':   namedclass = ANYOF_ALNUM;       break;
3619             case 'W':   namedclass = ANYOF_NALNUM;      break;
3620             case 's':   namedclass = ANYOF_SPACE;       break;
3621             case 'S':   namedclass = ANYOF_NSPACE;      break;
3622             case 'd':   namedclass = ANYOF_DIGIT;       break;
3623             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3624             case 'p':
3625             case 'P':
3626                 if (RExC_parse >= RExC_end)
3627                     vFAIL2("Empty \\%c{}", (U8)value);
3628                 if (*RExC_parse == '{') {
3629                     U8 c = (U8)value;
3630                     e = strchr(RExC_parse++, '}');
3631                     if (!e)
3632                         vFAIL2("Missing right brace on \\%c{}", c);
3633                     while (isSPACE(UCHARAT(RExC_parse)))
3634                         RExC_parse++;
3635                     if (e == RExC_parse)
3636                         vFAIL2("Empty \\%c{}", c);
3637                     n = e - RExC_parse;
3638                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3639                         n--;
3640                 }
3641                 else {
3642                     e = RExC_parse;
3643                     n = 1;
3644                 }
3645                 if (!SIZE_ONLY) {
3646                     if (UCHARAT(RExC_parse) == '^') {
3647                          RExC_parse++;
3648                          n--;
3649                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3650                          while (isSPACE(UCHARAT(RExC_parse))) {
3651                               RExC_parse++;
3652                               n--;
3653                          }
3654                     }
3655                     if (value == 'p')
3656                          Perl_sv_catpvf(aTHX_ listsv,
3657                                         "+utf8::%.*s\n", (int)n, RExC_parse);
3658                     else
3659                          Perl_sv_catpvf(aTHX_ listsv,
3660                                         "!utf8::%.*s\n", (int)n, RExC_parse);
3661                 }
3662                 RExC_parse = e + 1;
3663                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3664                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
3665                 break;
3666             case 'n':   value = '\n';                   break;
3667             case 'r':   value = '\r';                   break;
3668             case 't':   value = '\t';                   break;
3669             case 'f':   value = '\f';                   break;
3670             case 'b':   value = '\b';                   break;
3671             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3672             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3673             case 'x':
3674                 if (*RExC_parse == '{') {
3675                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3676                         | PERL_SCAN_DISALLOW_PREFIX;
3677                     e = strchr(RExC_parse++, '}');
3678                     if (!e)
3679                         vFAIL("Missing right brace on \\x{}");
3680
3681                     numlen = e - RExC_parse;
3682                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3683                     RExC_parse = e + 1;
3684                 }
3685                 else {
3686                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3687                     numlen = 2;
3688                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3689                     RExC_parse += numlen;
3690                 }
3691                 break;
3692             case 'c':
3693                 value = UCHARAT(RExC_parse++);
3694                 value = toCTRL(value);
3695                 break;
3696             case '0': case '1': case '2': case '3': case '4':
3697             case '5': case '6': case '7': case '8': case '9':
3698             {
3699                 I32 flags = 0;
3700                 numlen = 3;
3701                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3702                 RExC_parse += numlen;
3703                 break;
3704             }
3705             default:
3706                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3707                     vWARN2(RExC_parse,
3708                            "Unrecognized escape \\%c in character class passed through",
3709                            (int)value);
3710                 break;
3711             }
3712         } /* end of \blah */
3713 #ifdef EBCDIC
3714         else
3715             literal_endpoint++;
3716 #endif
3717
3718         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3719
3720             if (!SIZE_ONLY && !need_class)
3721                 ANYOF_CLASS_ZERO(ret);
3722
3723             need_class = 1;
3724
3725             /* a bad range like a-\d, a-[:digit:] ? */
3726             if (range) {
3727                 if (!SIZE_ONLY) {
3728                     if (ckWARN(WARN_REGEXP))
3729                         vWARN4(RExC_parse,
3730                                "False [] range \"%*.*s\"",
3731                                RExC_parse - rangebegin,
3732                                RExC_parse - rangebegin,
3733                                rangebegin);
3734                     if (prevvalue < 256) {
3735                         ANYOF_BITMAP_SET(ret, prevvalue);
3736                         ANYOF_BITMAP_SET(ret, '-');
3737                     }
3738                     else {
3739                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3740                         Perl_sv_catpvf(aTHX_ listsv,
3741                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3742                     }
3743                 }
3744
3745                 range = 0; /* this was not a true range */
3746             }
3747
3748             if (!SIZE_ONLY) {
3749                 if (namedclass > OOB_NAMEDCLASS)
3750                     optimize_invert = FALSE;
3751                 /* Possible truncation here but in some 64-bit environments
3752                  * the compiler gets heartburn about switch on 64-bit values.
3753                  * A similar issue a little earlier when switching on value.
3754                  * --jhi */
3755                 switch ((I32)namedclass) {
3756                 case ANYOF_ALNUM:
3757                     if (LOC)
3758                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3759                     else {
3760                         for (value = 0; value < 256; value++)
3761                             if (isALNUM(value))
3762                                 ANYOF_BITMAP_SET(ret, value);
3763                     }
3764                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
3765                     break;
3766                 case ANYOF_NALNUM:
3767                     if (LOC)
3768                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3769                     else {
3770                         for (value = 0; value < 256; value++)
3771                             if (!isALNUM(value))
3772                                 ANYOF_BITMAP_SET(ret, value);
3773                     }
3774                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3775                     break;
3776                 case ANYOF_ALNUMC:
3777                     if (LOC)
3778                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3779                     else {
3780                         for (value = 0; value < 256; value++)
3781                             if (isALNUMC(value))
3782                                 ANYOF_BITMAP_SET(ret, value);
3783                     }
3784                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3785                     break;
3786                 case ANYOF_NALNUMC:
3787                     if (LOC)
3788                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3789                     else {
3790                         for (value = 0; value < 256; value++)
3791                             if (!isALNUMC(value))
3792                                 ANYOF_BITMAP_SET(ret, value);
3793                     }
3794                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3795                     break;
3796                 case ANYOF_ALPHA:
3797                     if (LOC)
3798                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3799                     else {
3800                         for (value = 0; value < 256; value++)
3801                             if (isALPHA(value))
3802                                 ANYOF_BITMAP_SET(ret, value);
3803                     }
3804                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3805                     break;
3806                 case ANYOF_NALPHA:
3807                     if (LOC)
3808                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3809                     else {
3810                         for (value = 0; value < 256; value++)
3811                             if (!isALPHA(value))
3812                                 ANYOF_BITMAP_SET(ret, value);
3813                     }
3814                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3815                     break;
3816                 case ANYOF_ASCII:
3817                     if (LOC)
3818                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3819                     else {
3820 #ifndef EBCDIC
3821                         for (value = 0; value < 128; value++)
3822                             ANYOF_BITMAP_SET(ret, value);
3823 #else  /* EBCDIC */
3824                         for (value = 0; value < 256; value++) {
3825                             if (isASCII(value))
3826                                 ANYOF_BITMAP_SET(ret, value);
3827                         }
3828 #endif /* EBCDIC */
3829                     }
3830                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3831                     break;
3832                 case ANYOF_NASCII:
3833                     if (LOC)
3834                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3835                     else {
3836 #ifndef EBCDIC
3837                         for (value = 128; value < 256; value++)
3838                             ANYOF_BITMAP_SET(ret, value);
3839 #else  /* EBCDIC */
3840                         for (value = 0; value < 256; value++) {
3841                             if (!isASCII(value))
3842                                 ANYOF_BITMAP_SET(ret, value);
3843                         }
3844 #endif /* EBCDIC */
3845                     }
3846                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3847                     break;
3848                 case ANYOF_BLANK:
3849                     if (LOC)
3850                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3851                     else {
3852                         for (value = 0; value < 256; value++)
3853                             if (isBLANK(value))
3854                                 ANYOF_BITMAP_SET(ret, value);
3855                     }
3856                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3857                     break;
3858                 case ANYOF_NBLANK:
3859                     if (LOC)
3860                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3861                     else {
3862                         for (value = 0; value < 256; value++)
3863                             if (!isBLANK(value))
3864                                 ANYOF_BITMAP_SET(ret, value);
3865                     }
3866                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3867                     break;
3868                 case ANYOF_CNTRL:
3869                     if (LOC)
3870                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3871                     else {
3872                         for (value = 0; value < 256; value++)
3873                             if (isCNTRL(value))
3874                                 ANYOF_BITMAP_SET(ret, value);
3875                     }
3876                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3877                     break;
3878                 case ANYOF_NCNTRL:
3879                     if (LOC)
3880                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3881                     else {
3882                         for (value = 0; value < 256; value++)
3883                             if (!isCNTRL(value))
3884                                 ANYOF_BITMAP_SET(ret, value);
3885                     }
3886                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3887                     break;
3888                 case ANYOF_DIGIT:
3889                     if (LOC)
3890                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3891                     else {
3892                         /* consecutive digits assumed */
3893                         for (value = '0'; value <= '9'; value++)
3894                             ANYOF_BITMAP_SET(ret, value);
3895                     }
3896                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3897                     break;
3898                 case ANYOF_NDIGIT:
3899                     if (LOC)
3900                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3901                     else {
3902                         /* consecutive digits assumed */
3903                         for (value = 0; value < '0'; value++)
3904                             ANYOF_BITMAP_SET(ret, value);
3905                         for (value = '9' + 1; value < 256; value++)
3906                             ANYOF_BITMAP_SET(ret, value);
3907                     }
3908                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3909                     break;
3910                 case ANYOF_GRAPH:
3911                     if (LOC)
3912                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3913                     else {
3914                         for (value = 0; value < 256; value++)
3915                             if (isGRAPH(value))
3916                                 ANYOF_BITMAP_SET(ret, value);
3917                     }
3918                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3919                     break;
3920                 case ANYOF_NGRAPH:
3921                     if (LOC)
3922                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3923                     else {
3924                         for (value = 0; value < 256; value++)
3925                             if (!isGRAPH(value))
3926                                 ANYOF_BITMAP_SET(ret, value);
3927                     }
3928                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3929                     break;
3930                 case ANYOF_LOWER:
3931                     if (LOC)
3932                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3933                     else {
3934                         for (value = 0; value < 256; value++)
3935                             if (isLOWER(value))
3936                                 ANYOF_BITMAP_SET(ret, value);
3937                     }
3938                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3939                     break;
3940                 case ANYOF_NLOWER:
3941                     if (LOC)
3942                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3943                     else {
3944                         for (value = 0; value < 256; value++)
3945                             if (!isLOWER(value))
3946                                 ANYOF_BITMAP_SET(ret, value);
3947                     }
3948                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3949                     break;
3950                 case ANYOF_PRINT:
3951                     if (LOC)
3952                         ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3953                     else {
3954                         for (value = 0; value < 256; value++)
3955                             if (isPRINT(value))
3956                                 ANYOF_BITMAP_SET(ret, value);
3957                     }
3958                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3959                     break;
3960                 case ANYOF_NPRINT:
3961                     if (LOC)
3962                         ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3963                     else {
3964                         for (value = 0; value < 256; value++)
3965                             if (!isPRINT(value))
3966                                 ANYOF_BITMAP_SET(ret, value);
3967                     }
3968                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3969                     break;
3970                 case ANYOF_PSXSPC:
3971                     if (LOC)
3972                         ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3973                     else {
3974                         for (value = 0; value < 256; value++)
3975                             if (isPSXSPC(value))
3976                                 ANYOF_BITMAP_SET(ret, value);
3977                     }
3978                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3979                     break;
3980                 case ANYOF_NPSXSPC:
3981                     if (LOC)
3982                         ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3983                     else {
3984                         for (value = 0; value < 256; value++)
3985                             if (!isPSXSPC(value))
3986                                 ANYOF_BITMAP_SET(ret, value);
3987                     }
3988                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3989                     break;
3990                 case ANYOF_PUNCT:
3991                     if (LOC)
3992                         ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3993                     else {
3994                         for (value = 0; value < 256; value++)
3995                             if (isPUNCT(value))
3996                                 ANYOF_BITMAP_SET(ret, value);
3997                     }
3998                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3999                     break;
4000                 case ANYOF_NPUNCT:
4001                     if (LOC)
4002                         ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
4003                     else {
4004                         for (value = 0; value < 256; value++)
4005                             if (!isPUNCT(value))
4006                                 ANYOF_BITMAP_SET(ret, value);
4007                     }
4008                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
4009                     break;
4010                 case ANYOF_SPACE:
4011                     if (LOC)
4012                         ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4013                     else {
4014                         for (value = 0; value < 256; value++)
4015                             if (isSPACE(value))
4016                                 ANYOF_BITMAP_SET(ret, value);
4017                     }
4018                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
4019                     break;
4020                 case ANYOF_NSPACE:
4021                     if (LOC)
4022                         ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4023                     else {
4024                         for (value = 0; value < 256; value++)
4025                             if (!isSPACE(value))
4026                                 ANYOF_BITMAP_SET(ret, value);
4027                     }
4028                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4029                     break;
4030                 case ANYOF_UPPER:
4031                     if (LOC)
4032                         ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4033                     else {
4034                         for (value = 0; value < 256; value++)
4035                             if (isUPPER(value))
4036                                 ANYOF_BITMAP_SET(ret, value);
4037                     }
4038                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4039                     break;
4040                 case ANYOF_NUPPER:
4041                     if (LOC)
4042                         ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4043                     else {
4044                         for (value = 0; value < 256; value++)
4045                             if (!isUPPER(value))
4046                                 ANYOF_BITMAP_SET(ret, value);
4047                     }
4048                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4049                     break;
4050                 case ANYOF_XDIGIT:
4051                     if (LOC)
4052                         ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4053                     else {
4054                         for (value = 0; value < 256; value++)
4055                             if (isXDIGIT(value))
4056                                 ANYOF_BITMAP_SET(ret, value);
4057                     }
4058                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4059                     break;
4060                 case ANYOF_NXDIGIT:
4061                     if (LOC)
4062                         ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4063                     else {
4064                         for (value = 0; value < 256; value++)
4065                             if (!isXDIGIT(value))
4066                                 ANYOF_BITMAP_SET(ret, value);
4067                     }
4068                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4069                     break;
4070                 case ANYOF_MAX:
4071                     /* this is to handle \p and \P */
4072                     break;
4073                 default:
4074                     vFAIL("Invalid [::] class");
4075                     break;
4076                 }
4077                 if (LOC)
4078                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4079                 continue;
4080             }
4081         } /* end of namedclass \blah */
4082
4083         if (range) {
4084             if (prevvalue > (IV)value) /* b-a */ {
4085                 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4086                               RExC_parse - rangebegin,
4087                               RExC_parse - rangebegin,
4088                               rangebegin);
4089                 range = 0; /* not a valid range */
4090             }
4091         }
4092         else {
4093             prevvalue = value; /* save the beginning of the range */
4094             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4095                 RExC_parse[1] != ']') {
4096                 RExC_parse++;
4097
4098                 /* a bad range like \w-, [:word:]- ? */
4099                 if (namedclass > OOB_NAMEDCLASS) {
4100                     if (ckWARN(WARN_REGEXP))
4101                         vWARN4(RExC_parse,
4102                                "False [] range \"%*.*s\"",
4103                                RExC_parse - rangebegin,
4104                                RExC_parse - rangebegin,
4105                                rangebegin);
4106                     if (!SIZE_ONLY)
4107                         ANYOF_BITMAP_SET(ret, '-');
4108                 } else
4109                     range = 1;  /* yeah, it's a range! */
4110                 continue;       /* but do it the next time */
4111             }
4112         }
4113
4114         /* now is the next time */
4115         if (!SIZE_ONLY) {
4116             IV i;
4117
4118             if (prevvalue < 256) {
4119                 IV ceilvalue = value < 256 ? value : 255;
4120
4121 #ifdef EBCDIC
4122                 /* In EBCDIC [\x89-\x91] should include
4123                  * the \x8e but [i-j] should not. */
4124                 if (literal_endpoint == 2 &&
4125                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4126                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4127                 {
4128                     if (isLOWER(prevvalue)) {
4129                         for (i = prevvalue; i <= ceilvalue; i++)
4130                             if (isLOWER(i))
4131                                 ANYOF_BITMAP_SET(ret, i);
4132                     } else {
4133                         for (i = prevvalue; i <= ceilvalue; i++)
4134                             if (isUPPER(i))
4135                                 ANYOF_BITMAP_SET(ret, i);
4136                     }
4137                 }
4138                 else
4139 #endif
4140                       for (i = prevvalue; i <= ceilvalue; i++)
4141                           ANYOF_BITMAP_SET(ret, i);
4142           }
4143           if (value > 255 || UTF) {
4144                 UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
4145                 UV natvalue      = NATIVE_TO_UNI(value);
4146
4147                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4148                 if (prevnatvalue < natvalue) { /* what about > ? */
4149                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4150                                    prevnatvalue, natvalue);
4151                 }
4152                 else if (prevnatvalue == natvalue) {
4153                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4154                     if (FOLD) {
4155                          U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4156                          STRLEN foldlen;
4157                          UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4158
4159                          /* If folding and foldable and a single
4160                           * character, insert also the folded version
4161                           * to the charclass. */
4162                          if (f != value) {
4163                               if (foldlen == (STRLEN)UNISKIP(f))
4164                                   Perl_sv_catpvf(aTHX_ listsv,
4165                                                  "%04"UVxf"\n", f);
4166                               else {
4167                                   /* Any multicharacter foldings
4168                                    * require the following transform:
4169                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4170                                    * where E folds into "pq" and F folds
4171                                    * into "rst", all other characters
4172                                    * fold to single characters.  We save
4173                                    * away these multicharacter foldings,
4174                                    * to be later saved as part of the
4175                                    * additional "s" data. */
4176                                   SV *sv;
4177
4178                                   if (!unicode_alternate)
4179                                       unicode_alternate = newAV();
4180                                   sv = newSVpvn((char*)foldbuf, foldlen);
4181                                   SvUTF8_on(sv);
4182                                   av_push(unicode_alternate, sv);
4183                               }
4184                          }
4185
4186                          /* If folding and the value is one of the Greek
4187                           * sigmas insert a few more sigmas to make the
4188                           * folding rules of the sigmas to work right.
4189                           * Note that not all the possible combinations
4190                           * are handled here: some of them are handled
4191                           * by the standard folding rules, and some of
4192                           * them (literal or EXACTF cases) are handled
4193                           * during runtime in regexec.c:S_find_byclass(). */
4194                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4195                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4196                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4197                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4198                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4199                          }
4200                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4201                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4202                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4203                     }
4204                 }
4205             }
4206 #ifdef EBCDIC
4207             literal_endpoint = 0;
4208 #endif
4209         }
4210
4211         range = 0; /* this range (if it was one) is done now */
4212     }
4213
4214     if (need_class) {
4215         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4216         if (SIZE_ONLY)
4217             RExC_size += ANYOF_CLASS_ADD_SKIP;
4218         else
4219             RExC_emit += ANYOF_CLASS_ADD_SKIP;
4220     }
4221
4222     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4223     if (!SIZE_ONLY &&
4224          /* If the only flag is folding (plus possibly inversion). */
4225         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4226        ) {
4227         for (value = 0; value < 256; ++value) {
4228             if (ANYOF_BITMAP_TEST(ret, value)) {
4229                 UV fold = PL_fold[value];
4230
4231                 if (fold != value)
4232                     ANYOF_BITMAP_SET(ret, fold);
4233             }
4234         }
4235         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4236     }
4237
4238     /* optimize inverted simple patterns (e.g. [^a-z]) */
4239     if (!SIZE_ONLY && optimize_invert &&
4240         /* If the only flag is inversion. */
4241         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4242         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4243             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4244         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4245     }
4246
4247     if (!SIZE_ONLY) {
4248         AV *av = newAV();
4249         SV *rv;
4250
4251         /* The 0th element stores the character class description
4252          * in its textual form: used later (regexec.c:Perl_regclass_swash())
4253          * to initialize the appropriate swash (which gets stored in
4254          * the 1st element), and also useful for dumping the regnode.
4255          * The 2nd element stores the multicharacter foldings,
4256          * used later (regexec.c:S_reginclass()). */
4257         av_store(av, 0, listsv);
4258         av_store(av, 1, NULL);
4259         av_store(av, 2, (SV*)unicode_alternate);
4260         rv = newRV_noinc((SV*)av);
4261         n = add_data(pRExC_state, 1, "s");
4262         RExC_rx->data->data[n] = (void*)rv;
4263         ARG_SET(ret, n);
4264     }
4265
4266     return ret;
4267 }
4268
4269 STATIC char*
4270 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4271 {
4272     char* retval = RExC_parse++;
4273
4274     for (;;) {
4275         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4276                 RExC_parse[2] == '#') {
4277             while (*RExC_parse != ')') {
4278                 if (RExC_parse == RExC_end)
4279                     FAIL("Sequence (?#... not terminated");
4280                 RExC_parse++;
4281             }
4282             RExC_parse++;
4283             continue;
4284         }
4285         if (RExC_flags & PMf_EXTENDED) {
4286             if (isSPACE(*RExC_parse)) {
4287                 RExC_parse++;
4288                 continue;
4289             }
4290             else if (*RExC_parse == '#') {
4291                 while (RExC_parse < RExC_end)
4292                     if (*RExC_parse++ == '\n') break;
4293                 continue;
4294             }
4295         }
4296         return retval;
4297     }
4298 }
4299
4300 /*
4301 - reg_node - emit a node
4302 */
4303 STATIC regnode *                        /* Location. */
4304 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4305 {
4306     register regnode *ret;
4307     register regnode *ptr;
4308
4309     ret = RExC_emit;
4310     if (SIZE_ONLY) {
4311         SIZE_ALIGN(RExC_size);
4312         RExC_size += 1;
4313         return(ret);
4314     }
4315
4316     NODE_ALIGN_FILL(ret);
4317     ptr = ret;
4318     FILL_ADVANCE_NODE(ptr, op);
4319     if (RExC_offsets) {         /* MJD */
4320         MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", 
4321               "reg_node", __LINE__, 
4322               reg_name[op],
4323               RExC_emit - RExC_emit_start > RExC_offsets[0] 
4324               ? "Overwriting end of array!\n" : "OK",
4325               RExC_emit - RExC_emit_start,
4326               RExC_parse - RExC_start,
4327               RExC_offsets[0])); 
4328         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4329     }
4330             
4331     RExC_emit = ptr;
4332
4333     return(ret);
4334 }
4335
4336 /*
4337 - reganode - emit a node with an argument
4338 */
4339 STATIC regnode *                        /* Location. */
4340 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4341 {
4342     register regnode *ret;
4343     register regnode *ptr;
4344
4345     ret = RExC_emit;
4346     if (SIZE_ONLY) {
4347         SIZE_ALIGN(RExC_size);
4348         RExC_size += 2;
4349         return(ret);
4350     }
4351
4352     NODE_ALIGN_FILL(ret);
4353     ptr = ret;
4354     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4355     if (RExC_offsets) {         /* MJD */
4356         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4357               "reganode",
4358               __LINE__,
4359               reg_name[op],
4360               RExC_emit - RExC_emit_start > RExC_offsets[0] ? 
4361               "Overwriting end of array!\n" : "OK",
4362               RExC_emit - RExC_emit_start,
4363               RExC_parse - RExC_start,
4364               RExC_offsets[0])); 
4365         Set_Cur_Node_Offset;
4366     }
4367             
4368     RExC_emit = ptr;
4369
4370     return(ret);
4371 }
4372
4373 /*
4374 - reguni - emit (if appropriate) a Unicode character
4375 */
4376 STATIC void
4377 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4378 {
4379     *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4380 }
4381
4382 /*
4383 - reginsert - insert an operator in front of already-emitted operand
4384 *
4385 * Means relocating the operand.
4386 */
4387 STATIC void
4388 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4389 {
4390     register regnode *src;
4391     register regnode *dst;
4392     register regnode *place;
4393     register int offset = regarglen[(U8)op];
4394
4395 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4396
4397     if (SIZE_ONLY) {
4398         RExC_size += NODE_STEP_REGNODE + offset;
4399         return;
4400     }
4401
4402     src = RExC_emit;
4403     RExC_emit += NODE_STEP_REGNODE + offset;
4404     dst = RExC_emit;
4405     while (src > opnd) {
4406         StructCopy(--src, --dst, regnode);
4407         if (RExC_offsets) {     /* MJD 20010112 */
4408             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4409                   "reg_insert",
4410                   __LINE__,
4411                   reg_name[op],
4412                   dst - RExC_emit_start > RExC_offsets[0] 
4413                   ? "Overwriting end of array!\n" : "OK",
4414                   src - RExC_emit_start,
4415                   dst - RExC_emit_start,
4416                   RExC_offsets[0])); 
4417             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4418             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4419         }
4420     }
4421     
4422
4423     place = opnd;               /* Op node, where operand used to be. */
4424     if (RExC_offsets) {         /* MJD */
4425         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4426               "reginsert",
4427               __LINE__,
4428               reg_name[op],
4429               place - RExC_emit_start > RExC_offsets[0] 
4430               ? "Overwriting end of array!\n" : "OK",
4431               place - RExC_emit_start,
4432               RExC_parse - RExC_start,
4433               RExC_offsets[0])); 
4434         Set_Node_Offset(place, RExC_parse);
4435         Set_Node_Length(place, 1);
4436     }
4437     src = NEXTOPER(place);
4438     FILL_ADVANCE_NODE(place, op);
4439     Zero(src, offset, regnode);
4440 }
4441
4442 /*
4443 - regtail - set the next-pointer at the end of a node chain of p to val.
4444 */
4445 STATIC void
4446 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4447 {
4448     register regnode *scan;
4449     register regnode *temp;
4450
4451     if (SIZE_ONLY)
4452         return;
4453
4454     /* Find last node. */
4455     scan = p;
4456     for (;;) {
4457         temp = regnext(scan);
4458         if (temp == NULL)
4459             break;
4460         scan = temp;
4461     }
4462
4463     if (reg_off_by_arg[OP(scan)]) {
4464         ARG_SET(scan, val - scan);
4465     }
4466     else {
4467         NEXT_OFF(scan) = val - scan;
4468     }
4469 }
4470
4471 /*
4472 - regoptail - regtail on operand of first argument; nop if operandless
4473 */
4474 STATIC void
4475 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4476 {
4477     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4478     if (p == NULL || SIZE_ONLY)
4479         return;
4480     if (PL_regkind[(U8)OP(p)] == BRANCH) {
4481         regtail(pRExC_state, NEXTOPER(p), val);
4482     }
4483     else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4484         regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4485     }
4486     else
4487         return;
4488 }
4489
4490 /*
4491  - regcurly - a little FSA that accepts {\d+,?\d*}
4492  */
4493 STATIC I32
4494 S_regcurly(pTHX_ register char *s)
4495 {
4496     if (*s++ != '{')
4497         return FALSE;
4498     if (!isDIGIT(*s))
4499         return FALSE;
4500     while (isDIGIT(*s))
4501         s++;
4502     if (*s == ',')
4503         s++;
4504     while (isDIGIT(*s))
4505         s++;
4506     if (*s != '}')
4507         return FALSE;
4508     return TRUE;
4509 }
4510
4511
4512 #ifdef DEBUGGING
4513
4514 STATIC regnode *
4515 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4516 {
4517     register U8 op = EXACT;     /* Arbitrary non-END op. */
4518     register regnode *next;
4519
4520     while (op != END && (!last || node < last)) {
4521         /* While that wasn't END last time... */
4522
4523         NODE_ALIGN(node);
4524         op = OP(node);
4525         if (op == CLOSE)
4526             l--;        
4527         next = regnext(node);
4528         /* Where, what. */
4529         if (OP(node) == OPTIMIZED)
4530             goto after_print;
4531         regprop(sv, node);
4532         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4533                       (int)(2*l + 1), "", SvPVX(sv));
4534         if (next == NULL)               /* Next ptr. */
4535             PerlIO_printf(Perl_debug_log, "(0)");
4536         else
4537             PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4538         (void)PerlIO_putc(Perl_debug_log, '\n');
4539       after_print:
4540         if (PL_regkind[(U8)op] == BRANCHJ) {
4541             register regnode *nnode = (OP(next) == LONGJMP
4542                                        ? regnext(next)
4543                                        : next);
4544             if (last && nnode > last)
4545                 nnode = last;
4546             node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4547         }
4548         else if (PL_regkind[(U8)op] == BRANCH) {
4549             node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4550         }
4551         else if ( op == CURLY) {   /* `next' might be very big: optimizer */
4552             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4553                              NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4554         }
4555         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4556             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4557                              next, sv, l + 1);
4558         }
4559         else if ( op == PLUS || op == STAR) {
4560             node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4561         }
4562         else if (op == ANYOF) {
4563             /* arglen 1 + class block */
4564             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4565                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4566             node = NEXTOPER(node);
4567         }
4568         else if (PL_regkind[(U8)op] == EXACT) {
4569             /* Literal string, where present. */
4570             node += NODE_SZ_STR(node) - 1;
4571             node = NEXTOPER(node);
4572         }
4573         else {
4574             node = NEXTOPER(node);
4575             node += regarglen[(U8)op];
4576         }
4577         if (op == CURLYX || op == OPEN)
4578             l++;
4579         else if (op == WHILEM)
4580             l--;
4581     }
4582     return node;
4583 }
4584
4585 #endif  /* DEBUGGING */
4586
4587 /*
4588  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4589  */
4590 void
4591 Perl_regdump(pTHX_ regexp *r)
4592 {
4593 #ifdef DEBUGGING
4594     SV *sv = sv_newmortal();
4595
4596     (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4597
4598     /* Header fields of interest. */
4599     if (r->anchored_substr)
4600         PerlIO_printf(Perl_debug_log,
4601                       "anchored `%s%.*s%s'%s at %"IVdf" ",
4602                       PL_colors[0],
4603                       (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4604                       SvPVX(r->anchored_substr),
4605                       PL_colors[1],
4606                       SvTAIL(r->anchored_substr) ? "$" : "",
4607                       (IV)r->anchored_offset);
4608     else if (r->anchored_utf8)
4609         PerlIO_printf(Perl_debug_log,
4610                       "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4611                       PL_colors[0],
4612                       (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4613                       SvPVX(r->anchored_utf8),
4614                       PL_colors[1],
4615                       SvTAIL(r->anchored_utf8) ? "$" : "",
4616                       (IV)r->anchored_offset);
4617     if (r->float_substr)
4618         PerlIO_printf(Perl_debug_log,
4619                       "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4620                       PL_colors[0],
4621                       (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4622                       SvPVX(r->float_substr),
4623                       PL_colors[1],
4624                       SvTAIL(r->float_substr) ? "$" : "",
4625                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4626     else if (r->float_utf8)
4627         PerlIO_printf(Perl_debug_log,
4628                       "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4629                       PL_colors[0],
4630                       (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4631                       SvPVX(r->float_utf8),
4632                       PL_colors[1],
4633                       SvTAIL(r->float_utf8) ? "$" : "",
4634                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4635     if (r->check_substr || r->check_utf8)
4636         PerlIO_printf(Perl_debug_log,
4637                       r->check_substr == r->float_substr
4638                       && r->check_utf8 == r->float_utf8
4639                       ? "(checking floating" : "(checking anchored");
4640     if (r->reganch & ROPT_NOSCAN)
4641         PerlIO_printf(Perl_debug_log, " noscan");
4642     if (r->reganch & ROPT_CHECK_ALL)
4643         PerlIO_printf(Perl_debug_log, " isall");
4644     if (r->check_substr || r->check_utf8)
4645         PerlIO_printf(Perl_debug_log, ") ");
4646
4647     if (r->regstclass) {
4648         regprop(sv, r->regstclass);
4649         PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4650     }
4651     if (r->reganch & ROPT_ANCH) {
4652         PerlIO_printf(Perl_debug_log, "anchored");
4653         if (r->reganch & ROPT_ANCH_BOL)
4654             PerlIO_printf(Perl_debug_log, "(BOL)");
4655         if (r->reganch & ROPT_ANCH_MBOL)
4656             PerlIO_printf(Perl_debug_log, "(MBOL)");
4657         if (r->reganch & ROPT_ANCH_SBOL)
4658             PerlIO_printf(Perl_debug_log, "(SBOL)");
4659         if (r->reganch & ROPT_ANCH_GPOS)
4660             PerlIO_printf(Perl_debug_log, "(GPOS)");
4661         PerlIO_putc(Perl_debug_log, ' ');
4662     }
4663     if (r->reganch & ROPT_GPOS_SEEN)
4664         PerlIO_printf(Perl_debug_log, "GPOS ");
4665     if (r->reganch & ROPT_SKIP)
4666         PerlIO_printf(Perl_debug_log, "plus ");
4667     if (r->reganch & ROPT_IMPLICIT)
4668         PerlIO_printf(Perl_debug_log, "implicit ");
4669     PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4670     if (r->reganch & ROPT_EVAL_SEEN)
4671         PerlIO_printf(Perl_debug_log, "with eval ");
4672     PerlIO_printf(Perl_debug_log, "\n");
4673     if (r->offsets) {
4674       U32 i;
4675       U32 len = r->offsets[0];
4676       PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4677       for (i = 1; i <= len; i++)
4678         PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ", 
4679                       (UV)r->offsets[i*2-1], 
4680                       (UV)r->offsets[i*2]);
4681       PerlIO_printf(Perl_debug_log, "\n");
4682     }
4683 #endif  /* DEBUGGING */
4684 }
4685
4686 #ifdef DEBUGGING
4687
4688 STATIC void
4689 S_put_byte(pTHX_ SV *sv, int c)
4690 {
4691     if (isCNTRL(c) || c == 255 || !isPRINT(c))
4692         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4693     else if (c == '-' || c == ']' || c == '\\' || c == '^')
4694         Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4695     else
4696         Perl_sv_catpvf(aTHX_ sv, "%c", c);
4697 }
4698
4699 #endif  /* DEBUGGING */
4700
4701 /*
4702 - regprop - printable representation of opcode
4703 */
4704 void
4705 Perl_regprop(pTHX_ SV *sv, regnode *o)
4706 {
4707 #ifdef DEBUGGING
4708     register int k;
4709
4710     sv_setpvn(sv, "", 0);
4711     if (OP(o) >= reg_num)               /* regnode.type is unsigned */
4712         /* It would be nice to FAIL() here, but this may be called from
4713            regexec.c, and it would be hard to supply pRExC_state. */
4714         Perl_croak(aTHX_ "Corrupted regexp opcode");
4715     sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4716
4717     k = PL_regkind[(U8)OP(o)];
4718
4719     if (k == EXACT) {
4720         SV *dsv = sv_2mortal(newSVpvn("", 0));
4721         /* Using is_utf8_string() is a crude hack but it may
4722          * be the best for now since we have no flag "this EXACTish
4723          * node was UTF-8" --jhi */
4724         bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4725         char *s    = do_utf8 ?
4726           pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4727                          UNI_DISPLAY_REGEX) :
4728           STRING(o);
4729         int len = do_utf8 ?
4730           strlen(s) :
4731           STR_LEN(o);
4732         Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4733                        PL_colors[0],
4734                        len, s,
4735                        PL_colors[1]);
4736     }
4737     else if (k == CURLY) {
4738         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4739             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4740         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4741     }
4742     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
4743         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4744     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4745         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
4746     else if (k == LOGICAL)
4747         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
4748     else if (k == ANYOF) {
4749         int i, rangestart = -1;
4750         U8 flags = ANYOF_FLAGS(o);
4751         const char * const anyofs[] = { /* Should be synchronized with
4752                                          * ANYOF_ #xdefines in regcomp.h */
4753             "\\w",
4754             "\\W",
4755             "\\s",
4756             "\\S",
4757             "\\d",
4758             "\\D",
4759             "[:alnum:]",
4760             "[:^alnum:]",
4761             "[:alpha:]",
4762             "[:^alpha:]",
4763             "[:ascii:]",
4764             "[:^ascii:]",
4765             "[:ctrl:]",
4766             "[:^ctrl:]",
4767             "[:graph:]",
4768             "[:^graph:]",
4769             "[:lower:]",
4770             "[:^lower:]",
4771             "[:print:]",
4772             "[:^print:]",
4773             "[:punct:]",
4774             "[:^punct:]",
4775             "[:upper:]",
4776             "[:^upper:]",
4777             "[:xdigit:]",
4778             "[:^xdigit:]",
4779             "[:space:]",
4780             "[:^space:]",
4781             "[:blank:]",
4782             "[:^blank:]"
4783         };
4784
4785         if (flags & ANYOF_LOCALE)
4786             sv_catpv(sv, "{loc}");
4787         if (flags & ANYOF_FOLD)
4788             sv_catpv(sv, "{i}");
4789         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4790         if (flags & ANYOF_INVERT)
4791             sv_catpv(sv, "^");
4792         for (i = 0; i <= 256; i++) {
4793             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4794                 if (rangestart == -1)
4795                     rangestart = i;
4796             } else if (rangestart != -1) {
4797                 if (i <= rangestart + 3)
4798                     for (; rangestart < i; rangestart++)
4799                         put_byte(sv, rangestart);
4800                 else {
4801                     put_byte(sv, rangestart);
4802                     sv_catpv(sv, "-");
4803                     put_byte(sv, i - 1);
4804                 }
4805                 rangestart = -1;
4806             }
4807         }
4808
4809         if (o->flags & ANYOF_CLASS)
4810             for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4811                 if (ANYOF_CLASS_TEST(o,i))
4812                     sv_catpv(sv, anyofs[i]);
4813
4814         if (flags & ANYOF_UNICODE)
4815             sv_catpv(sv, "{unicode}");
4816         else if (flags & ANYOF_UNICODE_ALL)
4817             sv_catpv(sv, "{unicode_all}");
4818
4819         {
4820             SV *lv;
4821             SV *sw = regclass_swash(o, FALSE, &lv, 0);
4822         
4823             if (lv) {
4824                 if (sw) {
4825                     U8 s[UTF8_MAXLEN+1];
4826                 
4827                     for (i = 0; i <= 256; i++) { /* just the first 256 */
4828                         U8 *e = uvchr_to_utf8(s, i);
4829                         
4830                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
4831                             if (rangestart == -1)
4832                                 rangestart = i;
4833                         } else if (rangestart != -1) {
4834                             U8 *p;
4835                         
4836                             if (i <= rangestart + 3)
4837                                 for (; rangestart < i; rangestart++) {
4838                                     for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4839                                         put_byte(sv, *p);
4840                                 }
4841                             else {
4842                                 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4843                                     put_byte(sv, *p);
4844                                 sv_catpv(sv, "-");
4845                                     for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4846                                         put_byte(sv, *p);
4847                                 }
4848                                 rangestart = -1;
4849                             }
4850                         }
4851                         
4852                     sv_catpv(sv, "..."); /* et cetera */
4853                 }
4854
4855                 {
4856                     char *s = savepv(SvPVX(lv));
4857                     char *origs = s;
4858                 
4859                     while(*s && *s != '\n') s++;
4860                 
4861                     if (*s == '\n') {
4862                         char *t = ++s;
4863                         
4864                         while (*s) {
4865                             if (*s == '\n')
4866                                 *s = ' ';
4867                             s++;
4868                         }
4869                         if (s[-1] == ' ')
4870                             s[-1] = 0;
4871                         
4872                         sv_catpv(sv, t);
4873                     }
4874                 
4875                     Safefree(origs);
4876                 }
4877             }
4878         }
4879
4880         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4881     }
4882     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4883         Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4884 #endif  /* DEBUGGING */
4885 }
4886
4887 SV *
4888 Perl_re_intuit_string(pTHX_ regexp *prog)
4889 {                               /* Assume that RE_INTUIT is set */
4890     DEBUG_r(
4891         {   STRLEN n_a;
4892             char *s = SvPV(prog->check_substr
4893                       ? prog->check_substr : prog->check_utf8, n_a);
4894
4895             if (!PL_colorset) reginitcolors();
4896             PerlIO_printf(Perl_debug_log,
4897                       "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4898                       PL_colors[4],
4899                       prog->check_substr ? "" : "utf8 ",
4900                       PL_colors[5],PL_colors[0],
4901                       s,
4902                       PL_colors[1],
4903                       (strlen(s) > 60 ? "..." : ""));
4904         } );
4905
4906     return prog->check_substr ? prog->check_substr : prog->check_utf8;
4907 }
4908
4909 void
4910 Perl_pregfree(pTHX_ struct regexp *r)
4911 {
4912 #ifdef DEBUGGING
4913     SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4914 #endif
4915
4916     if (!r || (--r->refcnt > 0))
4917         return;
4918     DEBUG_r({
4919          int len;
4920          char *s;
4921
4922          s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4923                 r->prelen, 60, UNI_DISPLAY_REGEX)
4924             : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4925          len = SvCUR(dsv);
4926          if (!PL_colorset)
4927               reginitcolors();
4928          PerlIO_printf(Perl_debug_log,
4929                        "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4930                        PL_colors[4],PL_colors[5],PL_colors[0],
4931                        len, len, s,
4932                        PL_colors[1],
4933                        len > 60 ? "..." : "");
4934     });
4935
4936     if (r->precomp)
4937         Safefree(r->precomp);
4938     if (r->offsets)             /* 20010421 MJD */
4939         Safefree(r->offsets);
4940     RX_MATCH_COPY_FREE(r);
4941 #ifdef PERL_COPY_ON_WRITE
4942     if (r->saved_copy)
4943         SvREFCNT_dec(r->saved_copy);
4944 #endif
4945     if (r->substrs) {
4946         if (r->anchored_substr)
4947             SvREFCNT_dec(r->anchored_substr);
4948         if (r->anchored_utf8)
4949             SvREFCNT_dec(r->anchored_utf8);
4950         if (r->float_substr)
4951             SvREFCNT_dec(r->float_substr);
4952         if (r->float_utf8)
4953             SvREFCNT_dec(r->float_utf8);
4954         Safefree(r->substrs);
4955     }
4956     if (r->data) {
4957         int n = r->data->count;
4958         PAD* new_comppad = NULL;
4959         PAD* old_comppad;
4960
4961         while (--n >= 0) {
4962           /* If you add a ->what type here, update the comment in regcomp.h */
4963             switch (r->data->what[n]) {
4964             case 's':
4965                 SvREFCNT_dec((SV*)r->data->data[n]);
4966                 break;
4967             case 'f':
4968                 Safefree(r->data->data[n]);
4969                 break;
4970             case 'p':
4971                 new_comppad = (AV*)r->data->data[n];
4972                 break;
4973             case 'o':
4974                 if (new_comppad == NULL)
4975                     Perl_croak(aTHX_ "panic: pregfree comppad");
4976                 PAD_SAVE_LOCAL(old_comppad,
4977                     /* Watch out for global destruction's random ordering. */
4978                     (SvTYPE(new_comppad) == SVt_PVAV) ?
4979                                 new_comppad : Null(PAD *)
4980                 );
4981                 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4982                     op_free((OP_4tree*)r->data->data[n]);
4983                 }
4984
4985                 PAD_RESTORE_LOCAL(old_comppad);
4986                 SvREFCNT_dec((SV*)new_comppad);
4987                 new_comppad = NULL;
4988                 break;
4989             case 'n':
4990                 break;
4991             default:
4992                 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4993             }
4994         }
4995         Safefree(r->data->what);
4996         Safefree(r->data);
4997     }
4998     Safefree(r->startp);
4999     Safefree(r->endp);
5000     Safefree(r);
5001 }
5002
5003 /*
5004  - regnext - dig the "next" pointer out of a node
5005  *
5006  * [Note, when REGALIGN is defined there are two places in regmatch()
5007  * that bypass this code for speed.]
5008  */
5009 regnode *
5010 Perl_regnext(pTHX_ register regnode *p)
5011 {
5012     register I32 offset;
5013
5014     if (p == &PL_regdummy)
5015         return(NULL);
5016
5017     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5018     if (offset == 0)
5019         return(NULL);
5020
5021     return(p+offset);
5022 }
5023
5024 STATIC void     
5025 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
5026 {
5027     va_list args;
5028     STRLEN l1 = strlen(pat1);
5029     STRLEN l2 = strlen(pat2);
5030     char buf[512];
5031     SV *msv;
5032     char *message;
5033
5034     if (l1 > 510)
5035         l1 = 510;
5036     if (l1 + l2 > 510)
5037         l2 = 510 - l1;
5038     Copy(pat1, buf, l1 , char);
5039     Copy(pat2, buf + l1, l2 , char);
5040     buf[l1 + l2] = '\n';
5041     buf[l1 + l2 + 1] = '\0';
5042 #ifdef I_STDARG
5043     /* ANSI variant takes additional second argument */
5044     va_start(args, pat2);
5045 #else
5046     va_start(args);
5047 #endif
5048     msv = vmess(buf, &args);
5049     va_end(args);
5050     message = SvPV(msv,l1);
5051     if (l1 > 512)
5052         l1 = 512;
5053     Copy(message, buf, l1 , char);
5054     buf[l1-1] = '\0';                   /* Overwrite \n */
5055     Perl_croak(aTHX_ "%s", buf);
5056 }
5057
5058 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
5059
5060 void
5061 Perl_save_re_context(pTHX)
5062 {
5063     SAVEI32(PL_reg_flags);              /* from regexec.c */
5064     SAVEPPTR(PL_bostr);
5065     SAVEPPTR(PL_reginput);              /* String-input pointer. */
5066     SAVEPPTR(PL_regbol);                /* Beginning of input, for ^ check. */
5067     SAVEPPTR(PL_regeol);                /* End of input, for $ check. */
5068     SAVEVPTR(PL_regstartp);             /* Pointer to startp array. */
5069     SAVEVPTR(PL_regendp);               /* Ditto for endp. */
5070     SAVEVPTR(PL_reglastparen);          /* Similarly for lastparen. */
5071     SAVEVPTR(PL_reglastcloseparen);     /* Similarly for lastcloseparen. */
5072     SAVEPPTR(PL_regtill);               /* How far we are required to go. */
5073     SAVEGENERICPV(PL_reg_start_tmp);            /* from regexec.c */
5074     PL_reg_start_tmp = 0;
5075     SAVEI32(PL_reg_start_tmpl);         /* from regexec.c */
5076     PL_reg_start_tmpl = 0;
5077     SAVEVPTR(PL_regdata);
5078     SAVEI32(PL_reg_eval_set);           /* from regexec.c */
5079     SAVEI32(PL_regnarrate);             /* from regexec.c */
5080     SAVEVPTR(PL_regprogram);            /* from regexec.c */
5081     SAVEINT(PL_regindent);              /* from regexec.c */
5082     SAVEVPTR(PL_regcc);                 /* from regexec.c */
5083     SAVEVPTR(PL_curcop);
5084     SAVEVPTR(PL_reg_call_cc);           /* from regexec.c */
5085     SAVEVPTR(PL_reg_re);                /* from regexec.c */
5086     SAVEPPTR(PL_reg_ganch);             /* from regexec.c */
5087     SAVESPTR(PL_reg_sv);                /* from regexec.c */
5088     SAVEBOOL(PL_reg_match_utf8);        /* from regexec.c */
5089     SAVEVPTR(PL_reg_magic);             /* from regexec.c */
5090     SAVEI32(PL_reg_oldpos);                     /* from regexec.c */
5091     SAVEVPTR(PL_reg_oldcurpm);          /* from regexec.c */
5092     SAVEVPTR(PL_reg_curpm);             /* from regexec.c */
5093     SAVEPPTR(PL_reg_oldsaved);          /* old saved substr during match */
5094     PL_reg_oldsaved = Nullch;
5095     SAVEI32(PL_reg_oldsavedlen);        /* old length of saved substr during match */
5096     PL_reg_oldsavedlen = 0;
5097 #ifdef PERL_COPY_ON_WRITE
5098     SAVESPTR(PL_nrs);
5099     PL_nrs = Nullsv;
5100 #endif
5101     SAVEI32(PL_reg_maxiter);            /* max wait until caching pos */
5102     PL_reg_maxiter = 0;
5103     SAVEI32(PL_reg_leftiter);           /* wait until caching pos */
5104     PL_reg_leftiter = 0;
5105     SAVEGENERICPV(PL_reg_poscache);     /* cache of pos of WHILEM */
5106     PL_reg_poscache = Nullch;
5107     SAVEI32(PL_reg_poscache_size);      /* size of pos cache of WHILEM */
5108     PL_reg_poscache_size = 0;
5109     SAVEPPTR(PL_regprecomp);            /* uncompiled string. */
5110     SAVEI32(PL_regnpar);                /* () count. */
5111     SAVEI32(PL_regsize);                /* from regexec.c */
5112
5113     {
5114         /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5115         U32 i;
5116         GV *mgv;
5117         REGEXP *rx;
5118         char digits[16];
5119
5120         if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5121             for (i = 1; i <= rx->nparens; i++) {
5122                 sprintf(digits, "%lu", (long)i);
5123                 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5124                     save_scalar(mgv);
5125             }
5126         }
5127     }
5128
5129 #ifdef DEBUGGING
5130     SAVEPPTR(PL_reg_starttry);          /* from regexec.c */
5131 #endif
5132 }
5133
5134 static void
5135 clear_re(pTHX_ void *r)
5136 {
5137     ReREFCNT_dec((regexp *)r);
5138 }
5139