c1ec5f905e3c7ea672e273d1aff84e0688aa6398
[p5sagit/p5-mst-13.2.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9  * confused with the original package (see point 3 below).  Thanks, Henry!
10  */
11
12 /* Additional note: this code is very heavily munged from Henry's version
13  * in places.  In some spots I've traded clarity for efficiency, so don't
14  * blame Henry for some of the lack of readability.
15  */
16
17 /* The names of the functions have been changed from regcomp and
18  * regexec to  pregcomp and pregexec in order to avoid conflicts
19  * with the POSIX routines of the same names.
20 */
21
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 #  ifndef PERL_IN_XSUB_RE
25 #    define PERL_IN_XSUB_RE
26 #  endif
27 /* need access to debugger hooks */
28 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29 #    define DEBUGGING
30 #  endif
31 #endif
32
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 #  define Perl_pregcomp my_regcomp
36 #  define Perl_regdump my_regdump
37 #  define Perl_regprop my_regprop
38 #  define Perl_pregfree my_regfree
39 #  define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 #  define Perl_regnext my_regnext
42 #  define Perl_save_re_context my_save_re_context
43 #  define Perl_reginitcolors my_reginitcolors
44
45 #  define PERL_NO_GET_CONTEXT
46 #endif
47
48 /*SUPPRESS 112*/
49 /*
50  * pregcomp and pregexec -- regsub and regerror are not used in perl
51  *
52  *      Copyright (c) 1986 by University of Toronto.
53  *      Written by Henry Spencer.  Not derived from licensed software.
54  *
55  *      Permission is granted to anyone to use this software for any
56  *      purpose on any computer system, and to redistribute it freely,
57  *      subject to the following restrictions:
58  *
59  *      1. The author is not responsible for the consequences of use of
60  *              this software, no matter how awful, even if they arise
61  *              from defects in it.
62  *
63  *      2. The origin of this software must not be misrepresented, either
64  *              by explicit claim or by omission.
65  *
66  *      3. Altered versions must be plainly marked as such, and must not
67  *              be misrepresented as being the original software.
68  *
69  *
70  ****    Alterations to Henry's code are...
71  ****
72  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73  ****    2000, 2001, 2002, 2003, by Larry Wall and others
74  ****
75  ****    You may distribute under the terms of either the GNU General Public
76  ****    License or the Artistic License, as specified in the README file.
77
78  *
79  * Beware that some of this code is subtly aware of the way operator
80  * precedence is structured in regular expressions.  Serious changes in
81  * regular-expression syntax might require a total rethink.
82  */
83 #include "EXTERN.h"
84 #define PERL_IN_REGCOMP_C
85 #include "perl.h"
86
87 #ifndef PERL_IN_XSUB_RE
88 #  include "INTERN.h"
89 #endif
90
91 #define REG_COMP_C
92 #include "regcomp.h"
93
94 #ifdef op
95 #undef op
96 #endif /* op */
97
98 #ifdef MSDOS
99 # if defined(BUGGY_MSC6)
100  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
101  # pragma optimize("a",off)
102  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
103  # pragma optimize("w",on )
104 # endif /* BUGGY_MSC6 */
105 #endif /* MSDOS */
106
107 #ifndef STATIC
108 #define STATIC  static
109 #endif
110
111 typedef struct RExC_state_t {
112     U32         flags;                  /* are we folding, multilining? */
113     char        *precomp;               /* uncompiled string. */
114     regexp      *rx;
115     char        *start;                 /* Start of input for compile */
116     char        *end;                   /* End of input for compile */
117     char        *parse;                 /* Input-scan pointer. */
118     I32         whilem_seen;            /* number of WHILEM in this expr */
119     regnode     *emit_start;            /* Start of emitted-code area */
120     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
121     I32         naughty;                /* How bad is this pattern? */
122     I32         sawback;                /* Did we see \1, ...? */
123     U32         seen;
124     I32         size;                   /* Code size. */
125     I32         npar;                   /* () count. */
126     I32         extralen;
127     I32         seen_zerolen;
128     I32         seen_evals;
129     I32         utf8;
130 #if ADD_TO_REGEXEC
131     char        *starttry;              /* -Dr: where regtry was called. */
132 #define RExC_starttry   (pRExC_state->starttry)
133 #endif
134 } RExC_state_t;
135
136 #define RExC_flags      (pRExC_state->flags)
137 #define RExC_precomp    (pRExC_state->precomp)
138 #define RExC_rx         (pRExC_state->rx)
139 #define RExC_start      (pRExC_state->start)
140 #define RExC_end        (pRExC_state->end)
141 #define RExC_parse      (pRExC_state->parse)
142 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
143 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
144 #define RExC_emit       (pRExC_state->emit)
145 #define RExC_emit_start (pRExC_state->emit_start)
146 #define RExC_naughty    (pRExC_state->naughty)
147 #define RExC_sawback    (pRExC_state->sawback)
148 #define RExC_seen       (pRExC_state->seen)
149 #define RExC_size       (pRExC_state->size)
150 #define RExC_npar       (pRExC_state->npar)
151 #define RExC_extralen   (pRExC_state->extralen)
152 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
153 #define RExC_seen_evals (pRExC_state->seen_evals)
154 #define RExC_utf8       (pRExC_state->utf8)
155
156 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
157 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
158         ((*s) == '{' && regcurly(s)))
159
160 #ifdef SPSTART
161 #undef SPSTART          /* dratted cpp namespace... */
162 #endif
163 /*
164  * Flags to be passed up and down.
165  */
166 #define WORST           0       /* Worst case. */
167 #define HASWIDTH        0x1     /* Known to match non-null strings. */
168 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
169 #define SPSTART         0x4     /* Starts with * or +. */
170 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
171
172 /* Length of a variant. */
173
174 typedef struct scan_data_t {
175     I32 len_min;
176     I32 len_delta;
177     I32 pos_min;
178     I32 pos_delta;
179     SV *last_found;
180     I32 last_end;                       /* min value, <0 unless valid. */
181     I32 last_start_min;
182     I32 last_start_max;
183     SV **longest;                       /* Either &l_fixed, or &l_float. */
184     SV *longest_fixed;
185     I32 offset_fixed;
186     SV *longest_float;
187     I32 offset_float_min;
188     I32 offset_float_max;
189     I32 flags;
190     I32 whilem_c;
191     I32 *last_closep;
192     struct regnode_charclass_class *start_class;
193 } scan_data_t;
194
195 /*
196  * Forward declarations for pregcomp()'s friends.
197  */
198
199 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
200                                       0, 0, 0, 0, 0, 0};
201
202 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
203 #define SF_BEFORE_SEOL          0x1
204 #define SF_BEFORE_MEOL          0x2
205 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
206 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
207
208 #ifdef NO_UNARY_PLUS
209 #  define SF_FIX_SHIFT_EOL      (0+2)
210 #  define SF_FL_SHIFT_EOL               (0+4)
211 #else
212 #  define SF_FIX_SHIFT_EOL      (+2)
213 #  define SF_FL_SHIFT_EOL               (+4)
214 #endif
215
216 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
217 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
218
219 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
220 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
221 #define SF_IS_INF               0x40
222 #define SF_HAS_PAR              0x80
223 #define SF_IN_PAR               0x100
224 #define SF_HAS_EVAL             0x200
225 #define SCF_DO_SUBSTR           0x400
226 #define SCF_DO_STCLASS_AND      0x0800
227 #define SCF_DO_STCLASS_OR       0x1000
228 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
229 #define SCF_WHILEM_VISITED_POS  0x2000
230
231 #define UTF (RExC_utf8 != 0)
232 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
233 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
234
235 #define OOB_UNICODE             12345678
236 #define OOB_NAMEDCLASS          -1
237
238 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
239 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
240
241
242 /* length of regex to show in messages that don't mark a position within */
243 #define RegexLengthToShowInErrorMessages 127
244
245 /*
246  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
247  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
248  * op/pragma/warn/regcomp.
249  */
250 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
251 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
252
253 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
254
255 /*
256  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
257  * arg. Show regex, up to a maximum length. If it's too long, chop and add
258  * "...".
259  */
260 #define FAIL(msg) STMT_START {                                          \
261     char *ellipses = "";                                                \
262     IV len = RExC_end - RExC_precomp;                                   \
263                                                                         \
264     if (!SIZE_ONLY)                                                     \
265         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
266     if (len > RegexLengthToShowInErrorMessages) {                       \
267         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
268         len = RegexLengthToShowInErrorMessages - 10;                    \
269         ellipses = "...";                                               \
270     }                                                                   \
271     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                           \
272             msg, (int)len, RExC_precomp, ellipses);                     \
273 } STMT_END
274
275 /*
276  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
277  * args. Show regex, up to a maximum length. If it's too long, chop and add
278  * "...".
279  */
280 #define FAIL2(pat,msg) STMT_START {                                     \
281     char *ellipses = "";                                                \
282     IV len = RExC_end - RExC_precomp;                                   \
283                                                                         \
284     if (!SIZE_ONLY)                                                     \
285         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
286     if (len > RegexLengthToShowInErrorMessages) {                       \
287         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
288         len = RegexLengthToShowInErrorMessages - 10;                    \
289         ellipses = "...";                                               \
290     }                                                                   \
291     S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                       \
292             msg, (int)len, RExC_precomp, ellipses);                     \
293 } STMT_END
294
295
296 /*
297  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
298  */
299 #define Simple_vFAIL(m) STMT_START {                                    \
300     IV offset = RExC_parse - RExC_precomp;                              \
301     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
302             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
303 } STMT_END
304
305 /*
306  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
307  */
308 #define vFAIL(m) STMT_START {                           \
309     if (!SIZE_ONLY)                                     \
310         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
311     Simple_vFAIL(m);                                    \
312 } STMT_END
313
314 /*
315  * Like Simple_vFAIL(), but accepts two arguments.
316  */
317 #define Simple_vFAIL2(m,a1) STMT_START {                        \
318     IV offset = RExC_parse - RExC_precomp;                      \
319     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
320             (int)offset, RExC_precomp, RExC_precomp + offset);  \
321 } STMT_END
322
323 /*
324  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
325  */
326 #define vFAIL2(m,a1) STMT_START {                       \
327     if (!SIZE_ONLY)                                     \
328         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
329     Simple_vFAIL2(m, a1);                               \
330 } STMT_END
331
332
333 /*
334  * Like Simple_vFAIL(), but accepts three arguments.
335  */
336 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
337     IV offset = RExC_parse - RExC_precomp;                      \
338     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
339             (int)offset, RExC_precomp, RExC_precomp + offset);  \
340 } STMT_END
341
342 /*
343  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
344  */
345 #define vFAIL3(m,a1,a2) STMT_START {                    \
346     if (!SIZE_ONLY)                                     \
347         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
348     Simple_vFAIL3(m, a1, a2);                           \
349 } STMT_END
350
351 /*
352  * Like Simple_vFAIL(), but accepts four arguments.
353  */
354 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
355     IV offset = RExC_parse - RExC_precomp;                      \
356     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
357             (int)offset, RExC_precomp, RExC_precomp + offset);  \
358 } STMT_END
359
360 /*
361  * Like Simple_vFAIL(), but accepts five arguments.
362  */
363 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START {           \
364     IV offset = RExC_parse - RExC_precomp;                      \
365     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,       \
366             (int)offset, RExC_precomp, RExC_precomp + offset);  \
367 } STMT_END
368
369
370 #define vWARN(loc,m) STMT_START {                                       \
371     IV offset = loc - RExC_precomp;                                     \
372     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
373             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
374 } STMT_END
375
376 #define vWARNdep(loc,m) STMT_START {                                    \
377     IV offset = loc - RExC_precomp;                                     \
378     Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),          \
379             "%s" REPORT_LOCATION,                                       \
380             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
381 } STMT_END
382
383
384 #define vWARN2(loc, m, a1) STMT_START {                                 \
385     IV offset = loc - RExC_precomp;                                     \
386     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
387             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
388 } STMT_END
389
390 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
391     IV offset = loc - RExC_precomp;                                     \
392     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
393             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
394 } STMT_END
395
396 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
397     IV offset = loc - RExC_precomp;                                     \
398     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
399             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
400 } STMT_END
401
402 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
403     IV offset = loc - RExC_precomp;                                     \
404     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
405             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
406 } STMT_END
407
408
409 /* Allow for side effects in s */
410 #define REGC(c,s) STMT_START {                  \
411     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
412 } STMT_END
413
414 /* Macros for recording node offsets.   20001227 mjd@plover.com 
415  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
416  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
417  * Element 0 holds the number n.
418  */
419
420 #define MJD_OFFSET_DEBUG(x)
421 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
422
423
424 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
425     if (! SIZE_ONLY) {                                                  \
426         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
427                 __LINE__, (node), (byte)));                             \
428         if((node) < 0) {                                                \
429             Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
430         } else {                                                        \
431             RExC_offsets[2*(node)-1] = (byte);                          \
432         }                                                               \
433     }                                                                   \
434 } STMT_END
435
436 #define Set_Node_Offset(node,byte) \
437     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
438 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
439
440 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
441     if (! SIZE_ONLY) {                                                  \
442         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
443                 __LINE__, (node), (len)));                              \
444         if((node) < 0) {                                                \
445             Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
446         } else {                                                        \
447             RExC_offsets[2*(node)] = (len);                             \
448         }                                                               \
449     }                                                                   \
450 } STMT_END
451
452 #define Set_Node_Length(node,len) \
453     Set_Node_Length_To_R((node)-RExC_emit_start, len)
454 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
455 #define Set_Node_Cur_Length(node) \
456     Set_Node_Length(node, RExC_parse - parse_start)
457
458 /* Get offsets and lengths */
459 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
460 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
461
462 static void clear_re(pTHX_ void *r);
463
464 /* Mark that we cannot extend a found fixed substring at this point.
465    Updata the longest found anchored substring and the longest found
466    floating substrings if needed. */
467
468 STATIC void
469 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
470 {
471     STRLEN l = CHR_SVLEN(data->last_found);
472     STRLEN old_l = CHR_SVLEN(*data->longest);
473
474     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
475         sv_setsv(*data->longest, data->last_found);
476         if (*data->longest == data->longest_fixed) {
477             data->offset_fixed = l ? data->last_start_min : data->pos_min;
478             if (data->flags & SF_BEFORE_EOL)
479                 data->flags
480                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
481             else
482                 data->flags &= ~SF_FIX_BEFORE_EOL;
483         }
484         else {
485             data->offset_float_min = l ? data->last_start_min : data->pos_min;
486             data->offset_float_max = (l
487                                       ? data->last_start_max
488                                       : data->pos_min + data->pos_delta);
489             if ((U32)data->offset_float_max > (U32)I32_MAX)
490                 data->offset_float_max = I32_MAX;
491             if (data->flags & SF_BEFORE_EOL)
492                 data->flags
493                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
494             else
495                 data->flags &= ~SF_FL_BEFORE_EOL;
496         }
497     }
498     SvCUR_set(data->last_found, 0);
499     {
500         SV * sv = data->last_found;
501         MAGIC *mg =
502             SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
503         if (mg && mg->mg_len > 0)
504             mg->mg_len = 0;
505     }
506     data->last_end = -1;
507     data->flags &= ~SF_BEFORE_EOL;
508 }
509
510 /* Can match anything (initialization) */
511 STATIC void
512 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
513 {
514     ANYOF_CLASS_ZERO(cl);
515     ANYOF_BITMAP_SETALL(cl);
516     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
517     if (LOC)
518         cl->flags |= ANYOF_LOCALE;
519 }
520
521 /* Can match anything (initialization) */
522 STATIC int
523 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
524 {
525     int value;
526
527     for (value = 0; value <= ANYOF_MAX; value += 2)
528         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
529             return 1;
530     if (!(cl->flags & ANYOF_UNICODE_ALL))
531         return 0;
532     if (!ANYOF_BITMAP_TESTALLSET(cl))
533         return 0;
534     return 1;
535 }
536
537 /* Can match anything (initialization) */
538 STATIC void
539 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
540 {
541     Zero(cl, 1, struct regnode_charclass_class);
542     cl->type = ANYOF;
543     cl_anything(pRExC_state, cl);
544 }
545
546 STATIC void
547 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
548 {
549     Zero(cl, 1, struct regnode_charclass_class);
550     cl->type = ANYOF;
551     cl_anything(pRExC_state, cl);
552     if (LOC)
553         cl->flags |= ANYOF_LOCALE;
554 }
555
556 /* 'And' a given class with another one.  Can create false positives */
557 /* We assume that cl is not inverted */
558 STATIC void
559 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
560          struct regnode_charclass_class *and_with)
561 {
562     if (!(and_with->flags & ANYOF_CLASS)
563         && !(cl->flags & ANYOF_CLASS)
564         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
565         && !(and_with->flags & ANYOF_FOLD)
566         && !(cl->flags & ANYOF_FOLD)) {
567         int i;
568
569         if (and_with->flags & ANYOF_INVERT)
570             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
571                 cl->bitmap[i] &= ~and_with->bitmap[i];
572         else
573             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
574                 cl->bitmap[i] &= and_with->bitmap[i];
575     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
576     if (!(and_with->flags & ANYOF_EOS))
577         cl->flags &= ~ANYOF_EOS;
578
579     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
580         cl->flags &= ~ANYOF_UNICODE_ALL;
581         cl->flags |= ANYOF_UNICODE;
582         ARG_SET(cl, ARG(and_with));
583     }
584     if (!(and_with->flags & ANYOF_UNICODE_ALL))
585         cl->flags &= ~ANYOF_UNICODE_ALL;
586     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
587         cl->flags &= ~ANYOF_UNICODE;
588 }
589
590 /* 'OR' a given class with another one.  Can create false positives */
591 /* We assume that cl is not inverted */
592 STATIC void
593 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
594 {
595     if (or_with->flags & ANYOF_INVERT) {
596         /* We do not use
597          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
598          *   <= (B1 | !B2) | (CL1 | !CL2)
599          * which is wasteful if CL2 is small, but we ignore CL2:
600          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
601          * XXXX Can we handle case-fold?  Unclear:
602          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
603          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
604          */
605         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
606              && !(or_with->flags & ANYOF_FOLD)
607              && !(cl->flags & ANYOF_FOLD) ) {
608             int i;
609
610             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
611                 cl->bitmap[i] |= ~or_with->bitmap[i];
612         } /* XXXX: logic is complicated otherwise */
613         else {
614             cl_anything(pRExC_state, cl);
615         }
616     } else {
617         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
618         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
619              && (!(or_with->flags & ANYOF_FOLD)
620                  || (cl->flags & ANYOF_FOLD)) ) {
621             int i;
622
623             /* OR char bitmap and class bitmap separately */
624             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
625                 cl->bitmap[i] |= or_with->bitmap[i];
626             if (or_with->flags & ANYOF_CLASS) {
627                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
628                     cl->classflags[i] |= or_with->classflags[i];
629                 cl->flags |= ANYOF_CLASS;
630             }
631         }
632         else { /* XXXX: logic is complicated, leave it along for a moment. */
633             cl_anything(pRExC_state, cl);
634         }
635     }
636     if (or_with->flags & ANYOF_EOS)
637         cl->flags |= ANYOF_EOS;
638
639     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
640         ARG(cl) != ARG(or_with)) {
641         cl->flags |= ANYOF_UNICODE_ALL;
642         cl->flags &= ~ANYOF_UNICODE;
643     }
644     if (or_with->flags & ANYOF_UNICODE_ALL) {
645         cl->flags |= ANYOF_UNICODE_ALL;
646         cl->flags &= ~ANYOF_UNICODE;
647     }
648 }
649
650 /*
651  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
652  * These need to be revisited when a newer toolchain becomes available.
653  */
654 #if defined(__sparc64__) && defined(__GNUC__)
655 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
656 #       undef  SPARC64_GCC_WORKAROUND
657 #       define SPARC64_GCC_WORKAROUND 1
658 #   endif
659 #endif
660
661 /* REx optimizer.  Converts nodes into quickier variants "in place".
662    Finds fixed substrings.  */
663
664 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
665    to the position after last scanned or to NULL. */
666
667 STATIC I32
668 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
669                         /* scanp: Start here (read-write). */
670                         /* deltap: Write maxlen-minlen here. */
671                         /* last: Stop before this one. */
672 {
673     I32 min = 0, pars = 0, code;
674     regnode *scan = *scanp, *next;
675     I32 delta = 0;
676     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
677     int is_inf_internal = 0;            /* The studied chunk is infinite */
678     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
679     scan_data_t data_fake;
680     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
681
682     while (scan && OP(scan) != END && scan < last) {
683         /* Peephole optimizer: */
684
685         if (PL_regkind[(U8)OP(scan)] == EXACT) {
686             /* Merge several consecutive EXACTish nodes into one. */
687             regnode *n = regnext(scan);
688             U32 stringok = 1;
689 #ifdef DEBUGGING
690             regnode *stop = scan;
691 #endif
692
693             next = scan + NODE_SZ_STR(scan);
694             /* Skip NOTHING, merge EXACT*. */
695             while (n &&
696                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
697                      (stringok && (OP(n) == OP(scan))))
698                    && NEXT_OFF(n)
699                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
700                 if (OP(n) == TAIL || n > next)
701                     stringok = 0;
702                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
703                     NEXT_OFF(scan) += NEXT_OFF(n);
704                     next = n + NODE_STEP_REGNODE;
705 #ifdef DEBUGGING
706                     if (stringok)
707                         stop = n;
708 #endif
709                     n = regnext(n);
710                 }
711                 else if (stringok) {
712                     int oldl = STR_LEN(scan);
713                     regnode *nnext = regnext(n);
714
715                     if (oldl + STR_LEN(n) > U8_MAX)
716                         break;
717                     NEXT_OFF(scan) += NEXT_OFF(n);
718                     STR_LEN(scan) += STR_LEN(n);
719                     next = n + NODE_SZ_STR(n);
720                     /* Now we can overwrite *n : */
721                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
722 #ifdef DEBUGGING
723                     stop = next - 1;
724 #endif
725                     n = nnext;
726                 }
727             }
728
729             if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
730 /*
731   Two problematic code points in Unicode casefolding of EXACT nodes:
732
733    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
734    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
735
736    which casefold to
737
738    Unicode                      UTF-8
739
740    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
741    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
742
743    This means that in case-insensitive matching (or "loose matching",
744    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
745    length of the above casefolded versions) can match a target string
746    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
747    This would rather mess up the minimum length computation.
748
749    What we'll do is to look for the tail four bytes, and then peek
750    at the preceding two bytes to see whether we need to decrease
751    the minimum length by four (six minus two).
752
753    Thanks to the design of UTF-8, there cannot be false matches:
754    A sequence of valid UTF-8 bytes cannot be a subsequence of
755    another valid sequence of UTF-8 bytes.
756
757 */
758                  char *s0 = STRING(scan), *s, *t;
759                  char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
760                  char *t0 = "\xcc\x88\xcc\x81";
761                  char *t1 = t0 + 3;
762                  
763                  for (s = s0 + 2;
764                       s < s2 && (t = ninstr(s, s1, t0, t1));
765                       s = t + 4) {
766                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
767                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
768                            min -= 4;
769                  }
770             }
771
772 #ifdef DEBUGGING
773             /* Allow dumping */
774             n = scan + NODE_SZ_STR(scan);
775             while (n <= stop) {
776                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
777                     OP(n) = OPTIMIZED;
778                     NEXT_OFF(n) = 0;
779                 }
780                 n++;
781             }
782 #endif
783         }
784         /* Follow the next-chain of the current node and optimize
785            away all the NOTHINGs from it.  */
786         if (OP(scan) != CURLYX) {
787             int max = (reg_off_by_arg[OP(scan)]
788                        ? I32_MAX
789                        /* I32 may be smaller than U16 on CRAYs! */
790                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
791             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
792             int noff;
793             regnode *n = scan;
794         
795             /* Skip NOTHING and LONGJMP. */
796             while ((n = regnext(n))
797                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
798                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
799                    && off + noff < max)
800                 off += noff;
801             if (reg_off_by_arg[OP(scan)])
802                 ARG(scan) = off;
803             else
804                 NEXT_OFF(scan) = off;
805         }
806         /* The principal pseudo-switch.  Cannot be a switch, since we
807            look into several different things.  */
808         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
809                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
810             next = regnext(scan);
811             code = OP(scan);
812         
813             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
814                 I32 max1 = 0, min1 = I32_MAX, num = 0;
815                 struct regnode_charclass_class accum;
816                 
817                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
818                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
819                 if (flags & SCF_DO_STCLASS)
820                     cl_init_zero(pRExC_state, &accum);
821                 while (OP(scan) == code) {
822                     I32 deltanext, minnext, f = 0, fake;
823                     struct regnode_charclass_class this_class;
824
825                     num++;
826                     data_fake.flags = 0;
827                     if (data) {         
828                         data_fake.whilem_c = data->whilem_c;
829                         data_fake.last_closep = data->last_closep;
830                     }
831                     else
832                         data_fake.last_closep = &fake;
833                     next = regnext(scan);
834                     scan = NEXTOPER(scan);
835                     if (code != BRANCH)
836                         scan = NEXTOPER(scan);
837                     if (flags & SCF_DO_STCLASS) {
838                         cl_init(pRExC_state, &this_class);
839                         data_fake.start_class = &this_class;
840                         f = SCF_DO_STCLASS_AND;
841                     }           
842                     if (flags & SCF_WHILEM_VISITED_POS)
843                         f |= SCF_WHILEM_VISITED_POS;
844                     /* we suppose the run is continuous, last=next...*/
845                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
846                                           next, &data_fake, f);
847                     if (min1 > minnext)
848                         min1 = minnext;
849                     if (max1 < minnext + deltanext)
850                         max1 = minnext + deltanext;
851                     if (deltanext == I32_MAX)
852                         is_inf = is_inf_internal = 1;
853                     scan = next;
854                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
855                         pars++;
856                     if (data && (data_fake.flags & SF_HAS_EVAL))
857                         data->flags |= SF_HAS_EVAL;
858                     if (data)
859                         data->whilem_c = data_fake.whilem_c;
860                     if (flags & SCF_DO_STCLASS)
861                         cl_or(pRExC_state, &accum, &this_class);
862                     if (code == SUSPEND)
863                         break;
864                 }
865                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
866                     min1 = 0;
867                 if (flags & SCF_DO_SUBSTR) {
868                     data->pos_min += min1;
869                     data->pos_delta += max1 - min1;
870                     if (max1 != min1 || is_inf)
871                         data->longest = &(data->longest_float);
872                 }
873                 min += min1;
874                 delta += max1 - min1;
875                 if (flags & SCF_DO_STCLASS_OR) {
876                     cl_or(pRExC_state, data->start_class, &accum);
877                     if (min1) {
878                         cl_and(data->start_class, &and_with);
879                         flags &= ~SCF_DO_STCLASS;
880                     }
881                 }
882                 else if (flags & SCF_DO_STCLASS_AND) {
883                     if (min1) {
884                         cl_and(data->start_class, &accum);
885                         flags &= ~SCF_DO_STCLASS;
886                     }
887                     else {
888                         /* Switch to OR mode: cache the old value of
889                          * data->start_class */
890                         StructCopy(data->start_class, &and_with,
891                                    struct regnode_charclass_class);
892                         flags &= ~SCF_DO_STCLASS_AND;
893                         StructCopy(&accum, data->start_class,
894                                    struct regnode_charclass_class);
895                         flags |= SCF_DO_STCLASS_OR;
896                         data->start_class->flags |= ANYOF_EOS;
897                     }
898                 }
899             }
900             else if (code == BRANCHJ)   /* single branch is optimized. */
901                 scan = NEXTOPER(NEXTOPER(scan));
902             else                        /* single branch is optimized. */
903                 scan = NEXTOPER(scan);
904             continue;
905         }
906         else if (OP(scan) == EXACT) {
907             I32 l = STR_LEN(scan);
908             UV uc = *((U8*)STRING(scan));
909             if (UTF) {
910                 U8 *s = (U8*)STRING(scan);
911                 l = utf8_length(s, s + l);
912                 uc = utf8_to_uvchr(s, NULL);
913             }
914             min += l;
915             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
916                 /* The code below prefers earlier match for fixed
917                    offset, later match for variable offset.  */
918                 if (data->last_end == -1) { /* Update the start info. */
919                     data->last_start_min = data->pos_min;
920                     data->last_start_max = is_inf
921                         ? I32_MAX : data->pos_min + data->pos_delta;
922                 }
923                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
924                 {
925                     SV * sv = data->last_found;
926                     MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
927                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
928                     if (mg && mg->mg_len >= 0)
929                         mg->mg_len += utf8_length((U8*)STRING(scan),
930                                                   (U8*)STRING(scan)+STR_LEN(scan));
931                 }
932                 if (UTF)
933                     SvUTF8_on(data->last_found);
934                 data->last_end = data->pos_min + l;
935                 data->pos_min += l; /* As in the first entry. */
936                 data->flags &= ~SF_BEFORE_EOL;
937             }
938             if (flags & SCF_DO_STCLASS_AND) {
939                 /* Check whether it is compatible with what we know already! */
940                 int compat = 1;
941
942                 if (uc >= 0x100 ||
943                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
944                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
945                     && (!(data->start_class->flags & ANYOF_FOLD)
946                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
947                     )
948                     compat = 0;
949                 ANYOF_CLASS_ZERO(data->start_class);
950                 ANYOF_BITMAP_ZERO(data->start_class);
951                 if (compat)
952                     ANYOF_BITMAP_SET(data->start_class, uc);
953                 data->start_class->flags &= ~ANYOF_EOS;
954                 if (uc < 0x100)
955                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
956             }
957             else if (flags & SCF_DO_STCLASS_OR) {
958                 /* false positive possible if the class is case-folded */
959                 if (uc < 0x100)
960                     ANYOF_BITMAP_SET(data->start_class, uc);
961                 else
962                     data->start_class->flags |= ANYOF_UNICODE_ALL;
963                 data->start_class->flags &= ~ANYOF_EOS;
964                 cl_and(data->start_class, &and_with);
965             }
966             flags &= ~SCF_DO_STCLASS;
967         }
968         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
969             I32 l = STR_LEN(scan);
970             UV uc = *((U8*)STRING(scan));
971
972             /* Search for fixed substrings supports EXACT only. */
973             if (flags & SCF_DO_SUBSTR)
974                 scan_commit(pRExC_state, data);
975             if (UTF) {
976                 U8 *s = (U8 *)STRING(scan);
977                 l = utf8_length(s, s + l);
978                 uc = utf8_to_uvchr(s, NULL);
979             }
980             min += l;
981             if (data && (flags & SCF_DO_SUBSTR))
982                 data->pos_min += l;
983             if (flags & SCF_DO_STCLASS_AND) {
984                 /* Check whether it is compatible with what we know already! */
985                 int compat = 1;
986
987                 if (uc >= 0x100 ||
988                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
989                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
990                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
991                     compat = 0;
992                 ANYOF_CLASS_ZERO(data->start_class);
993                 ANYOF_BITMAP_ZERO(data->start_class);
994                 if (compat) {
995                     ANYOF_BITMAP_SET(data->start_class, uc);
996                     data->start_class->flags &= ~ANYOF_EOS;
997                     data->start_class->flags |= ANYOF_FOLD;
998                     if (OP(scan) == EXACTFL)
999                         data->start_class->flags |= ANYOF_LOCALE;
1000                 }
1001             }
1002             else if (flags & SCF_DO_STCLASS_OR) {
1003                 if (data->start_class->flags & ANYOF_FOLD) {
1004                     /* false positive possible if the class is case-folded.
1005                        Assume that the locale settings are the same... */
1006                     if (uc < 0x100)
1007                         ANYOF_BITMAP_SET(data->start_class, uc);
1008                     data->start_class->flags &= ~ANYOF_EOS;
1009                 }
1010                 cl_and(data->start_class, &and_with);
1011             }
1012             flags &= ~SCF_DO_STCLASS;
1013         }
1014         else if (strchr((char*)PL_varies,OP(scan))) {
1015             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1016             I32 f = flags, pos_before = 0;
1017             regnode *oscan = scan;
1018             struct regnode_charclass_class this_class;
1019             struct regnode_charclass_class *oclass = NULL;
1020             I32 next_is_eval = 0;
1021
1022             switch (PL_regkind[(U8)OP(scan)]) {
1023             case WHILEM:                /* End of (?:...)* . */
1024                 scan = NEXTOPER(scan);
1025                 goto finish;
1026             case PLUS:
1027                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1028                     next = NEXTOPER(scan);
1029                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1030                         mincount = 1;
1031                         maxcount = REG_INFTY;
1032                         next = regnext(scan);
1033                         scan = NEXTOPER(scan);
1034                         goto do_curly;
1035                     }
1036                 }
1037                 if (flags & SCF_DO_SUBSTR)
1038                     data->pos_min++;
1039                 min++;
1040                 /* Fall through. */
1041             case STAR:
1042                 if (flags & SCF_DO_STCLASS) {
1043                     mincount = 0;
1044                     maxcount = REG_INFTY;
1045                     next = regnext(scan);
1046                     scan = NEXTOPER(scan);
1047                     goto do_curly;
1048                 }
1049                 is_inf = is_inf_internal = 1;
1050                 scan = regnext(scan);
1051                 if (flags & SCF_DO_SUBSTR) {
1052                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1053                     data->longest = &(data->longest_float);
1054                 }
1055                 goto optimize_curly_tail;
1056             case CURLY:
1057                 mincount = ARG1(scan);
1058                 maxcount = ARG2(scan);
1059                 next = regnext(scan);
1060                 if (OP(scan) == CURLYX) {
1061                     I32 lp = (data ? *(data->last_closep) : 0);
1062
1063                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1064                 }
1065                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1066                 next_is_eval = (OP(scan) == EVAL);
1067               do_curly:
1068                 if (flags & SCF_DO_SUBSTR) {
1069                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1070                     pos_before = data->pos_min;
1071                 }
1072                 if (data) {
1073                     fl = data->flags;
1074                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1075                     if (is_inf)
1076                         data->flags |= SF_IS_INF;
1077                 }
1078                 if (flags & SCF_DO_STCLASS) {
1079                     cl_init(pRExC_state, &this_class);
1080                     oclass = data->start_class;
1081                     data->start_class = &this_class;
1082                     f |= SCF_DO_STCLASS_AND;
1083                     f &= ~SCF_DO_STCLASS_OR;
1084                 }
1085                 /* These are the cases when once a subexpression
1086                    fails at a particular position, it cannot succeed
1087                    even after backtracking at the enclosing scope.
1088                 
1089                    XXXX what if minimal match and we are at the
1090                         initial run of {n,m}? */
1091                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1092                     f &= ~SCF_WHILEM_VISITED_POS;
1093
1094                 /* This will finish on WHILEM, setting scan, or on NULL: */
1095                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1096                                       mincount == 0
1097                                         ? (f & ~SCF_DO_SUBSTR) : f);
1098
1099                 if (flags & SCF_DO_STCLASS)
1100                     data->start_class = oclass;
1101                 if (mincount == 0 || minnext == 0) {
1102                     if (flags & SCF_DO_STCLASS_OR) {
1103                         cl_or(pRExC_state, data->start_class, &this_class);
1104                     }
1105                     else if (flags & SCF_DO_STCLASS_AND) {
1106                         /* Switch to OR mode: cache the old value of
1107                          * data->start_class */
1108                         StructCopy(data->start_class, &and_with,
1109                                    struct regnode_charclass_class);
1110                         flags &= ~SCF_DO_STCLASS_AND;
1111                         StructCopy(&this_class, data->start_class,
1112                                    struct regnode_charclass_class);
1113                         flags |= SCF_DO_STCLASS_OR;
1114                         data->start_class->flags |= ANYOF_EOS;
1115                     }
1116                 } else {                /* Non-zero len */
1117                     if (flags & SCF_DO_STCLASS_OR) {
1118                         cl_or(pRExC_state, data->start_class, &this_class);
1119                         cl_and(data->start_class, &and_with);
1120                     }
1121                     else if (flags & SCF_DO_STCLASS_AND)
1122                         cl_and(data->start_class, &this_class);
1123                     flags &= ~SCF_DO_STCLASS;
1124                 }
1125                 if (!scan)              /* It was not CURLYX, but CURLY. */
1126                     scan = next;
1127                 if (ckWARN(WARN_REGEXP)
1128                        /* ? quantifier ok, except for (?{ ... }) */
1129                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
1130                     && (minnext == 0) && (deltanext == 0)
1131                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1132                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1133                 {
1134                     vWARN(RExC_parse,
1135                           "Quantifier unexpected on zero-length expression");
1136                 }
1137
1138                 min += minnext * mincount;
1139                 is_inf_internal |= ((maxcount == REG_INFTY
1140                                      && (minnext + deltanext) > 0)
1141                                     || deltanext == I32_MAX);
1142                 is_inf |= is_inf_internal;
1143                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1144
1145                 /* Try powerful optimization CURLYX => CURLYN. */
1146                 if (  OP(oscan) == CURLYX && data
1147                       && data->flags & SF_IN_PAR
1148                       && !(data->flags & SF_HAS_EVAL)
1149                       && !deltanext && minnext == 1 ) {
1150                     /* Try to optimize to CURLYN.  */
1151                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1152                     regnode *nxt1 = nxt;
1153 #ifdef DEBUGGING
1154                     regnode *nxt2;
1155 #endif
1156
1157                     /* Skip open. */
1158                     nxt = regnext(nxt);
1159                     if (!strchr((char*)PL_simple,OP(nxt))
1160                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1161                              && STR_LEN(nxt) == 1))
1162                         goto nogo;
1163 #ifdef DEBUGGING
1164                     nxt2 = nxt;
1165 #endif
1166                     nxt = regnext(nxt);
1167                     if (OP(nxt) != CLOSE)
1168                         goto nogo;
1169                     /* Now we know that nxt2 is the only contents: */
1170                     oscan->flags = (U8)ARG(nxt);
1171                     OP(oscan) = CURLYN;
1172                     OP(nxt1) = NOTHING; /* was OPEN. */
1173 #ifdef DEBUGGING
1174                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1175                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1176                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1177                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1178                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1179                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1180 #endif
1181                 }
1182               nogo:
1183
1184                 /* Try optimization CURLYX => CURLYM. */
1185                 if (  OP(oscan) == CURLYX && data
1186                       && !(data->flags & SF_HAS_PAR)
1187                       && !(data->flags & SF_HAS_EVAL)
1188                       && !deltanext  ) {
1189                     /* XXXX How to optimize if data == 0? */
1190                     /* Optimize to a simpler form.  */
1191                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1192                     regnode *nxt2;
1193
1194                     OP(oscan) = CURLYM;
1195                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1196                             && (OP(nxt2) != WHILEM))
1197                         nxt = nxt2;
1198                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1199                     /* Need to optimize away parenths. */
1200                     if (data->flags & SF_IN_PAR) {
1201                         /* Set the parenth number.  */
1202                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1203
1204                         if (OP(nxt) != CLOSE)
1205                             FAIL("Panic opt close");
1206                         oscan->flags = (U8)ARG(nxt);
1207                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1208                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1209 #ifdef DEBUGGING
1210                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1211                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1212                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1213                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1214 #endif
1215 #if 0
1216                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1217                             regnode *nnxt = regnext(nxt1);
1218                         
1219                             if (nnxt == nxt) {
1220                                 if (reg_off_by_arg[OP(nxt1)])
1221                                     ARG_SET(nxt1, nxt2 - nxt1);
1222                                 else if (nxt2 - nxt1 < U16_MAX)
1223                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1224                                 else
1225                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1226                             }
1227                             nxt1 = nnxt;
1228                         }
1229 #endif
1230                         /* Optimize again: */
1231                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1232                                     NULL, 0);
1233                     }
1234                     else
1235                         oscan->flags = 0;
1236                 }
1237                 else if ((OP(oscan) == CURLYX)
1238                          && (flags & SCF_WHILEM_VISITED_POS)
1239                          /* See the comment on a similar expression above.
1240                             However, this time it not a subexpression
1241                             we care about, but the expression itself. */
1242                          && (maxcount == REG_INFTY)
1243                          && data && ++data->whilem_c < 16) {
1244                     /* This stays as CURLYX, we can put the count/of pair. */
1245                     /* Find WHILEM (as in regexec.c) */
1246                     regnode *nxt = oscan + NEXT_OFF(oscan);
1247
1248                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1249                         nxt += ARG(nxt);
1250                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
1251                         | (RExC_whilem_seen << 4)); /* On WHILEM */
1252                 }
1253                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1254                     pars++;
1255                 if (flags & SCF_DO_SUBSTR) {
1256                     SV *last_str = Nullsv;
1257                     int counted = mincount != 0;
1258
1259                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1260 #if defined(SPARC64_GCC_WORKAROUND)
1261                         I32 b = 0;
1262                         STRLEN l = 0;
1263                         char *s = NULL;
1264                         I32 old = 0;
1265
1266                         if (pos_before >= data->last_start_min)
1267                             b = pos_before;
1268                         else
1269                             b = data->last_start_min;
1270
1271                         l = 0;
1272                         s = SvPV(data->last_found, l);
1273                         old = b - data->last_start_min;
1274
1275 #else
1276                         I32 b = pos_before >= data->last_start_min
1277                             ? pos_before : data->last_start_min;
1278                         STRLEN l;
1279                         char *s = SvPV(data->last_found, l);
1280                         I32 old = b - data->last_start_min;
1281 #endif
1282
1283                         if (UTF)
1284                             old = utf8_hop((U8*)s, old) - (U8*)s;
1285                         
1286                         l -= old;
1287                         /* Get the added string: */
1288                         last_str = newSVpvn(s  + old, l);
1289                         if (UTF)
1290                             SvUTF8_on(last_str);
1291                         if (deltanext == 0 && pos_before == b) {
1292                             /* What was added is a constant string */
1293                             if (mincount > 1) {
1294                                 SvGROW(last_str, (mincount * l) + 1);
1295                                 repeatcpy(SvPVX(last_str) + l,
1296                                           SvPVX(last_str), l, mincount - 1);
1297                                 SvCUR(last_str) *= mincount;
1298                                 /* Add additional parts. */
1299                                 SvCUR_set(data->last_found,
1300                                           SvCUR(data->last_found) - l);
1301                                 sv_catsv(data->last_found, last_str);
1302                                 {
1303                                     SV * sv = data->last_found;
1304                                     MAGIC *mg =
1305                                         SvUTF8(sv) && SvMAGICAL(sv) ?
1306                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
1307                                     if (mg && mg->mg_len >= 0)
1308                                         mg->mg_len += CHR_SVLEN(last_str);
1309                                 }
1310                                 data->last_end += l * (mincount - 1);
1311                             }
1312                         } else {
1313                             /* start offset must point into the last copy */
1314                             data->last_start_min += minnext * (mincount - 1);
1315                             data->last_start_max += is_inf ? I32_MAX
1316                                 : (maxcount - 1) * (minnext + data->pos_delta);
1317                         }
1318                     }
1319                     /* It is counted once already... */
1320                     data->pos_min += minnext * (mincount - counted);
1321                     data->pos_delta += - counted * deltanext +
1322                         (minnext + deltanext) * maxcount - minnext * mincount;
1323                     if (mincount != maxcount) {
1324                          /* Cannot extend fixed substrings found inside
1325                             the group.  */
1326                         scan_commit(pRExC_state,data);
1327                         if (mincount && last_str) {
1328                             sv_setsv(data->last_found, last_str);
1329                             data->last_end = data->pos_min;
1330                             data->last_start_min =
1331                                 data->pos_min - CHR_SVLEN(last_str);
1332                             data->last_start_max = is_inf
1333                                 ? I32_MAX
1334                                 : data->pos_min + data->pos_delta
1335                                 - CHR_SVLEN(last_str);
1336                         }
1337                         data->longest = &(data->longest_float);
1338                     }
1339                     SvREFCNT_dec(last_str);
1340                 }
1341                 if (data && (fl & SF_HAS_EVAL))
1342                     data->flags |= SF_HAS_EVAL;
1343               optimize_curly_tail:
1344                 if (OP(oscan) != CURLYX) {
1345                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1346                            && NEXT_OFF(next))
1347                         NEXT_OFF(oscan) += NEXT_OFF(next);
1348                 }
1349                 continue;
1350             default:                    /* REF and CLUMP only? */
1351                 if (flags & SCF_DO_SUBSTR) {
1352                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1353                     data->longest = &(data->longest_float);
1354                 }
1355                 is_inf = is_inf_internal = 1;
1356                 if (flags & SCF_DO_STCLASS_OR)
1357                     cl_anything(pRExC_state, data->start_class);
1358                 flags &= ~SCF_DO_STCLASS;
1359                 break;
1360             }
1361         }
1362         else if (strchr((char*)PL_simple,OP(scan))) {
1363             int value = 0;
1364
1365             if (flags & SCF_DO_SUBSTR) {
1366                 scan_commit(pRExC_state,data);
1367                 data->pos_min++;
1368             }
1369             min++;
1370             if (flags & SCF_DO_STCLASS) {
1371                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1372
1373                 /* Some of the logic below assumes that switching
1374                    locale on will only add false positives. */
1375                 switch (PL_regkind[(U8)OP(scan)]) {
1376                 case SANY:
1377                 default:
1378                   do_default:
1379                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1380                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1381                         cl_anything(pRExC_state, data->start_class);
1382                     break;
1383                 case REG_ANY:
1384                     if (OP(scan) == SANY)
1385                         goto do_default;
1386                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1387                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1388                                  || (data->start_class->flags & ANYOF_CLASS));
1389                         cl_anything(pRExC_state, data->start_class);
1390                     }
1391                     if (flags & SCF_DO_STCLASS_AND || !value)
1392                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1393                     break;
1394                 case ANYOF:
1395                     if (flags & SCF_DO_STCLASS_AND)
1396                         cl_and(data->start_class,
1397                                (struct regnode_charclass_class*)scan);
1398                     else
1399                         cl_or(pRExC_state, data->start_class,
1400                               (struct regnode_charclass_class*)scan);
1401                     break;
1402                 case ALNUM:
1403                     if (flags & SCF_DO_STCLASS_AND) {
1404                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1405                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1406                             for (value = 0; value < 256; value++)
1407                                 if (!isALNUM(value))
1408                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1409                         }
1410                     }
1411                     else {
1412                         if (data->start_class->flags & ANYOF_LOCALE)
1413                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1414                         else {
1415                             for (value = 0; value < 256; value++)
1416                                 if (isALNUM(value))
1417                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1418                         }
1419                     }
1420                     break;
1421                 case ALNUML:
1422                     if (flags & SCF_DO_STCLASS_AND) {
1423                         if (data->start_class->flags & ANYOF_LOCALE)
1424                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1425                     }
1426                     else {
1427                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1428                         data->start_class->flags |= ANYOF_LOCALE;
1429                     }
1430                     break;
1431                 case NALNUM:
1432                     if (flags & SCF_DO_STCLASS_AND) {
1433                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1434                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1435                             for (value = 0; value < 256; value++)
1436                                 if (isALNUM(value))
1437                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1438                         }
1439                     }
1440                     else {
1441                         if (data->start_class->flags & ANYOF_LOCALE)
1442                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1443                         else {
1444                             for (value = 0; value < 256; value++)
1445                                 if (!isALNUM(value))
1446                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1447                         }
1448                     }
1449                     break;
1450                 case NALNUML:
1451                     if (flags & SCF_DO_STCLASS_AND) {
1452                         if (data->start_class->flags & ANYOF_LOCALE)
1453                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1454                     }
1455                     else {
1456                         data->start_class->flags |= ANYOF_LOCALE;
1457                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1458                     }
1459                     break;
1460                 case SPACE:
1461                     if (flags & SCF_DO_STCLASS_AND) {
1462                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1463                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1464                             for (value = 0; value < 256; value++)
1465                                 if (!isSPACE(value))
1466                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1467                         }
1468                     }
1469                     else {
1470                         if (data->start_class->flags & ANYOF_LOCALE)
1471                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1472                         else {
1473                             for (value = 0; value < 256; value++)
1474                                 if (isSPACE(value))
1475                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1476                         }
1477                     }
1478                     break;
1479                 case SPACEL:
1480                     if (flags & SCF_DO_STCLASS_AND) {
1481                         if (data->start_class->flags & ANYOF_LOCALE)
1482                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1483                     }
1484                     else {
1485                         data->start_class->flags |= ANYOF_LOCALE;
1486                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1487                     }
1488                     break;
1489                 case NSPACE:
1490                     if (flags & SCF_DO_STCLASS_AND) {
1491                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1492                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1493                             for (value = 0; value < 256; value++)
1494                                 if (isSPACE(value))
1495                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1496                         }
1497                     }
1498                     else {
1499                         if (data->start_class->flags & ANYOF_LOCALE)
1500                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1501                         else {
1502                             for (value = 0; value < 256; value++)
1503                                 if (!isSPACE(value))
1504                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1505                         }
1506                     }
1507                     break;
1508                 case NSPACEL:
1509                     if (flags & SCF_DO_STCLASS_AND) {
1510                         if (data->start_class->flags & ANYOF_LOCALE) {
1511                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1512                             for (value = 0; value < 256; value++)
1513                                 if (!isSPACE(value))
1514                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1515                         }
1516                     }
1517                     else {
1518                         data->start_class->flags |= ANYOF_LOCALE;
1519                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1520                     }
1521                     break;
1522                 case DIGIT:
1523                     if (flags & SCF_DO_STCLASS_AND) {
1524                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1525                         for (value = 0; value < 256; value++)
1526                             if (!isDIGIT(value))
1527                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1528                     }
1529                     else {
1530                         if (data->start_class->flags & ANYOF_LOCALE)
1531                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1532                         else {
1533                             for (value = 0; value < 256; value++)
1534                                 if (isDIGIT(value))
1535                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1536                         }
1537                     }
1538                     break;
1539                 case NDIGIT:
1540                     if (flags & SCF_DO_STCLASS_AND) {
1541                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1542                         for (value = 0; value < 256; value++)
1543                             if (isDIGIT(value))
1544                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1545                     }
1546                     else {
1547                         if (data->start_class->flags & ANYOF_LOCALE)
1548                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1549                         else {
1550                             for (value = 0; value < 256; value++)
1551                                 if (!isDIGIT(value))
1552                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1553                         }
1554                     }
1555                     break;
1556                 }
1557                 if (flags & SCF_DO_STCLASS_OR)
1558                     cl_and(data->start_class, &and_with);
1559                 flags &= ~SCF_DO_STCLASS;
1560             }
1561         }
1562         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1563             data->flags |= (OP(scan) == MEOL
1564                             ? SF_BEFORE_MEOL
1565                             : SF_BEFORE_SEOL);
1566         }
1567         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1568                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1569                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1570                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1571             /* Lookahead/lookbehind */
1572             I32 deltanext, minnext, fake = 0;
1573             regnode *nscan;
1574             struct regnode_charclass_class intrnl;
1575             int f = 0;
1576
1577             data_fake.flags = 0;
1578             if (data) {         
1579                 data_fake.whilem_c = data->whilem_c;
1580                 data_fake.last_closep = data->last_closep;
1581             }
1582             else
1583                 data_fake.last_closep = &fake;
1584             if ( flags & SCF_DO_STCLASS && !scan->flags
1585                  && OP(scan) == IFMATCH ) { /* Lookahead */
1586                 cl_init(pRExC_state, &intrnl);
1587                 data_fake.start_class = &intrnl;
1588                 f |= SCF_DO_STCLASS_AND;
1589             }
1590             if (flags & SCF_WHILEM_VISITED_POS)
1591                 f |= SCF_WHILEM_VISITED_POS;
1592             next = regnext(scan);
1593             nscan = NEXTOPER(NEXTOPER(scan));
1594             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1595             if (scan->flags) {
1596                 if (deltanext) {
1597                     vFAIL("Variable length lookbehind not implemented");
1598                 }
1599                 else if (minnext > U8_MAX) {
1600                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1601                 }
1602                 scan->flags = (U8)minnext;
1603             }
1604             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1605                 pars++;
1606             if (data && (data_fake.flags & SF_HAS_EVAL))
1607                 data->flags |= SF_HAS_EVAL;
1608             if (data)
1609                 data->whilem_c = data_fake.whilem_c;
1610             if (f & SCF_DO_STCLASS_AND) {
1611                 int was = (data->start_class->flags & ANYOF_EOS);
1612
1613                 cl_and(data->start_class, &intrnl);
1614                 if (was)
1615                     data->start_class->flags |= ANYOF_EOS;
1616             }
1617         }
1618         else if (OP(scan) == OPEN) {
1619             pars++;
1620         }
1621         else if (OP(scan) == CLOSE) {
1622             if ((I32)ARG(scan) == is_par) {
1623                 next = regnext(scan);
1624
1625                 if ( next && (OP(next) != WHILEM) && next < last)
1626                     is_par = 0;         /* Disable optimization */
1627             }
1628             if (data)
1629                 *(data->last_closep) = ARG(scan);
1630         }
1631         else if (OP(scan) == EVAL) {
1632                 if (data)
1633                     data->flags |= SF_HAS_EVAL;
1634         }
1635         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1636                 if (flags & SCF_DO_SUBSTR) {
1637                     scan_commit(pRExC_state,data);
1638                     data->longest = &(data->longest_float);
1639                 }
1640                 is_inf = is_inf_internal = 1;
1641                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1642                     cl_anything(pRExC_state, data->start_class);
1643                 flags &= ~SCF_DO_STCLASS;
1644         }
1645         /* Else: zero-length, ignore. */
1646         scan = regnext(scan);
1647     }
1648
1649   finish:
1650     *scanp = scan;
1651     *deltap = is_inf_internal ? I32_MAX : delta;
1652     if (flags & SCF_DO_SUBSTR && is_inf)
1653         data->pos_delta = I32_MAX - data->pos_min;
1654     if (is_par > U8_MAX)
1655         is_par = 0;
1656     if (is_par && pars==1 && data) {
1657         data->flags |= SF_IN_PAR;
1658         data->flags &= ~SF_HAS_PAR;
1659     }
1660     else if (pars && data) {
1661         data->flags |= SF_HAS_PAR;
1662         data->flags &= ~SF_IN_PAR;
1663     }
1664     if (flags & SCF_DO_STCLASS_OR)
1665         cl_and(data->start_class, &and_with);
1666     return min;
1667 }
1668
1669 STATIC I32
1670 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1671 {
1672     if (RExC_rx->data) {
1673         Renewc(RExC_rx->data,
1674                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1675                char, struct reg_data);
1676         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1677         RExC_rx->data->count += n;
1678     }
1679     else {
1680         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1681              char, struct reg_data);
1682         New(1208, RExC_rx->data->what, n, U8);
1683         RExC_rx->data->count = n;
1684     }
1685     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1686     return RExC_rx->data->count - n;
1687 }
1688
1689 void
1690 Perl_reginitcolors(pTHX)
1691 {
1692     int i = 0;
1693     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1694         
1695     if (s) {
1696         PL_colors[0] = s = savepv(s);
1697         while (++i < 6) {
1698             s = strchr(s, '\t');
1699             if (s) {
1700                 *s = '\0';
1701                 PL_colors[i] = ++s;
1702             }
1703             else
1704                 PL_colors[i] = s = "";
1705         }
1706     } else {
1707         while (i < 6)
1708             PL_colors[i++] = "";
1709     }
1710     PL_colorset = 1;
1711 }
1712
1713
1714 /*
1715  - pregcomp - compile a regular expression into internal code
1716  *
1717  * We can't allocate space until we know how big the compiled form will be,
1718  * but we can't compile it (and thus know how big it is) until we've got a
1719  * place to put the code.  So we cheat:  we compile it twice, once with code
1720  * generation turned off and size counting turned on, and once "for real".
1721  * This also means that we don't allocate space until we are sure that the
1722  * thing really will compile successfully, and we never have to move the
1723  * code and thus invalidate pointers into it.  (Note that it has to be in
1724  * one piece because free() must be able to free it all.) [NB: not true in perl]
1725  *
1726  * Beware that the optimization-preparation code in here knows about some
1727  * of the structure of the compiled regexp.  [I'll say.]
1728  */
1729 regexp *
1730 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1731 {
1732     register regexp *r;
1733     regnode *scan;
1734     regnode *first;
1735     I32 flags;
1736     I32 minlen = 0;
1737     I32 sawplus = 0;
1738     I32 sawopen = 0;
1739     scan_data_t data;
1740     RExC_state_t RExC_state;
1741     RExC_state_t *pRExC_state = &RExC_state;
1742
1743     if (exp == NULL)
1744         FAIL("NULL regexp argument");
1745
1746     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1747
1748     RExC_precomp = exp;
1749     DEBUG_r({
1750          if (!PL_colorset) reginitcolors();
1751          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1752                        PL_colors[4],PL_colors[5],PL_colors[0],
1753                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1754     });
1755     RExC_flags = pm->op_pmflags;
1756     RExC_sawback = 0;
1757
1758     RExC_seen = 0;
1759     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1760     RExC_seen_evals = 0;
1761     RExC_extralen = 0;
1762
1763     /* First pass: determine size, legality. */
1764     RExC_parse = exp;
1765     RExC_start = exp;
1766     RExC_end = xend;
1767     RExC_naughty = 0;
1768     RExC_npar = 1;
1769     RExC_size = 0L;
1770     RExC_emit = &PL_regdummy;
1771     RExC_whilem_seen = 0;
1772 #if 0 /* REGC() is (currently) a NOP at the first pass.
1773        * Clever compilers notice this and complain. --jhi */
1774     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1775 #endif
1776     if (reg(pRExC_state, 0, &flags) == NULL) {
1777         RExC_precomp = Nullch;
1778         return(NULL);
1779     }
1780     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1781
1782     /* Small enough for pointer-storage convention?
1783        If extralen==0, this means that we will not need long jumps. */
1784     if (RExC_size >= 0x10000L && RExC_extralen)
1785         RExC_size += RExC_extralen;
1786     else
1787         RExC_extralen = 0;
1788     if (RExC_whilem_seen > 15)
1789         RExC_whilem_seen = 15;
1790
1791     /* Allocate space and initialize. */
1792     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1793          char, regexp);
1794     if (r == NULL)
1795         FAIL("Regexp out of space");
1796
1797 #ifdef DEBUGGING
1798     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1799     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1800 #endif
1801     r->refcnt = 1;
1802     r->prelen = xend - exp;
1803     r->precomp = savepvn(RExC_precomp, r->prelen);
1804     r->subbeg = NULL;
1805 #ifdef PERL_COPY_ON_WRITE
1806     r->saved_copy = Nullsv;
1807 #endif
1808     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1809     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1810
1811     r->substrs = 0;                     /* Useful during FAIL. */
1812     r->startp = 0;                      /* Useful during FAIL. */
1813     r->endp = 0;                        /* Useful during FAIL. */
1814
1815     Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1816     if (r->offsets) {
1817       r->offsets[0] = RExC_size; 
1818     }
1819     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1820                           "%s %"UVuf" bytes for offset annotations.\n", 
1821                           r->offsets ? "Got" : "Couldn't get", 
1822                           (UV)((2*RExC_size+1) * sizeof(U32))));
1823
1824     RExC_rx = r;
1825
1826     /* Second pass: emit code. */
1827     RExC_flags = pm->op_pmflags;        /* don't let top level (?i) bleed */
1828     RExC_parse = exp;
1829     RExC_end = xend;
1830     RExC_naughty = 0;
1831     RExC_npar = 1;
1832     RExC_emit_start = r->program;
1833     RExC_emit = r->program;
1834     /* Store the count of eval-groups for security checks: */
1835     RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1836     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1837     r->data = 0;
1838     if (reg(pRExC_state, 0, &flags) == NULL)
1839         return(NULL);
1840
1841     /* Dig out information for optimizations. */
1842     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1843     pm->op_pmflags = RExC_flags;
1844     if (UTF)
1845         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1846     r->regstclass = NULL;
1847     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1848         r->reganch |= ROPT_NAUGHTY;
1849     scan = r->program + 1;              /* First BRANCH. */
1850
1851     /* XXXX To minimize changes to RE engine we always allocate
1852        3-units-long substrs field. */
1853     Newz(1004, r->substrs, 1, struct reg_substr_data);
1854
1855     StructCopy(&zero_scan_data, &data, scan_data_t);
1856     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1857     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1858         I32 fake;
1859         STRLEN longest_float_length, longest_fixed_length;
1860         struct regnode_charclass_class ch_class;
1861         int stclass_flag;
1862         I32 last_close = 0;
1863
1864         first = scan;
1865         /* Skip introductions and multiplicators >= 1. */
1866         while ((OP(first) == OPEN && (sawopen = 1)) ||
1867                /* An OR of *one* alternative - should not happen now. */
1868             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1869             (OP(first) == PLUS) ||
1870             (OP(first) == MINMOD) ||
1871                /* An {n,m} with n>0 */
1872             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1873                 if (OP(first) == PLUS)
1874                     sawplus = 1;
1875                 else
1876                     first += regarglen[(U8)OP(first)];
1877                 first = NEXTOPER(first);
1878         }
1879
1880         /* Starting-point info. */
1881       again:
1882         if (PL_regkind[(U8)OP(first)] == EXACT) {
1883             if (OP(first) == EXACT)
1884                 ;       /* Empty, get anchored substr later. */
1885             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1886                 r->regstclass = first;
1887         }
1888         else if (strchr((char*)PL_simple,OP(first)))
1889             r->regstclass = first;
1890         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1891                  PL_regkind[(U8)OP(first)] == NBOUND)
1892             r->regstclass = first;
1893         else if (PL_regkind[(U8)OP(first)] == BOL) {
1894             r->reganch |= (OP(first) == MBOL
1895                            ? ROPT_ANCH_MBOL
1896                            : (OP(first) == SBOL
1897                               ? ROPT_ANCH_SBOL
1898                               : ROPT_ANCH_BOL));
1899             first = NEXTOPER(first);
1900             goto again;
1901         }
1902         else if (OP(first) == GPOS) {
1903             r->reganch |= ROPT_ANCH_GPOS;
1904             first = NEXTOPER(first);
1905             goto again;
1906         }
1907         else if (!sawopen && (OP(first) == STAR &&
1908             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1909             !(r->reganch & ROPT_ANCH) )
1910         {
1911             /* turn .* into ^.* with an implied $*=1 */
1912             int type = OP(NEXTOPER(first));
1913
1914             if (type == REG_ANY)
1915                 type = ROPT_ANCH_MBOL;
1916             else
1917                 type = ROPT_ANCH_SBOL;
1918
1919             r->reganch |= type | ROPT_IMPLICIT;
1920             first = NEXTOPER(first);
1921             goto again;
1922         }
1923         if (sawplus && (!sawopen || !RExC_sawback)
1924             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1925             /* x+ must match at the 1st pos of run of x's */
1926             r->reganch |= ROPT_SKIP;
1927
1928         /* Scan is after the zeroth branch, first is atomic matcher. */
1929         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1930                               (IV)(first - scan + 1)));
1931         /*
1932         * If there's something expensive in the r.e., find the
1933         * longest literal string that must appear and make it the
1934         * regmust.  Resolve ties in favor of later strings, since
1935         * the regstart check works with the beginning of the r.e.
1936         * and avoiding duplication strengthens checking.  Not a
1937         * strong reason, but sufficient in the absence of others.
1938         * [Now we resolve ties in favor of the earlier string if
1939         * it happens that c_offset_min has been invalidated, since the
1940         * earlier string may buy us something the later one won't.]
1941         */
1942         minlen = 0;
1943
1944         data.longest_fixed = newSVpvn("",0);
1945         data.longest_float = newSVpvn("",0);
1946         data.last_found = newSVpvn("",0);
1947         data.longest = &(data.longest_fixed);
1948         first = scan;
1949         if (!r->regstclass) {
1950             cl_init(pRExC_state, &ch_class);
1951             data.start_class = &ch_class;
1952             stclass_flag = SCF_DO_STCLASS_AND;
1953         } else                          /* XXXX Check for BOUND? */
1954             stclass_flag = 0;
1955         data.last_closep = &last_close;
1956
1957         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1958                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1959         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1960              && data.last_start_min == 0 && data.last_end > 0
1961              && !RExC_seen_zerolen
1962              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1963             r->reganch |= ROPT_CHECK_ALL;
1964         scan_commit(pRExC_state, &data);
1965         SvREFCNT_dec(data.last_found);
1966
1967         longest_float_length = CHR_SVLEN(data.longest_float);
1968         if (longest_float_length
1969             || (data.flags & SF_FL_BEFORE_EOL
1970                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1971                     || (RExC_flags & PMf_MULTILINE)))) {
1972             int t;
1973
1974             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1975                 && data.offset_fixed == data.offset_float_min
1976                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1977                     goto remove_float;          /* As in (a)+. */
1978
1979             if (SvUTF8(data.longest_float)) {
1980                 r->float_utf8 = data.longest_float;
1981                 r->float_substr = Nullsv;
1982             } else {
1983                 r->float_substr = data.longest_float;
1984                 r->float_utf8 = Nullsv;
1985             }
1986             r->float_min_offset = data.offset_float_min;
1987             r->float_max_offset = data.offset_float_max;
1988             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1989                        && (!(data.flags & SF_FL_BEFORE_MEOL)
1990                            || (RExC_flags & PMf_MULTILINE)));
1991             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1992         }
1993         else {
1994           remove_float:
1995             r->float_substr = r->float_utf8 = Nullsv;
1996             SvREFCNT_dec(data.longest_float);
1997             longest_float_length = 0;
1998         }
1999
2000         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2001         if (longest_fixed_length
2002             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2003                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2004                     || (RExC_flags & PMf_MULTILINE)))) {
2005             int t;
2006
2007             if (SvUTF8(data.longest_fixed)) {
2008                 r->anchored_utf8 = data.longest_fixed;
2009                 r->anchored_substr = Nullsv;
2010             } else {
2011                 r->anchored_substr = data.longest_fixed;
2012                 r->anchored_utf8 = Nullsv;
2013             }
2014             r->anchored_offset = data.offset_fixed;
2015             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2016                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
2017                      || (RExC_flags & PMf_MULTILINE)));
2018             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2019         }
2020         else {
2021             r->anchored_substr = r->anchored_utf8 = Nullsv;
2022             SvREFCNT_dec(data.longest_fixed);
2023             longest_fixed_length = 0;
2024         }
2025         if (r->regstclass
2026             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2027             r->regstclass = NULL;
2028         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2029             && stclass_flag
2030             && !(data.start_class->flags & ANYOF_EOS)
2031             && !cl_is_anything(data.start_class))
2032         {
2033             I32 n = add_data(pRExC_state, 1, "f");
2034
2035             New(1006, RExC_rx->data->data[n], 1,
2036                 struct regnode_charclass_class);
2037             StructCopy(data.start_class,
2038                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2039                        struct regnode_charclass_class);
2040             r->regstclass = (regnode*)RExC_rx->data->data[n];
2041             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2042             PL_regdata = r->data; /* for regprop() */
2043             DEBUG_r({ SV *sv = sv_newmortal();
2044                       regprop(sv, (regnode*)data.start_class);
2045                       PerlIO_printf(Perl_debug_log,
2046                                     "synthetic stclass `%s'.\n",
2047                                     SvPVX(sv));});
2048         }
2049
2050         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2051         if (longest_fixed_length > longest_float_length) {
2052             r->check_substr = r->anchored_substr;
2053             r->check_utf8 = r->anchored_utf8;
2054             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2055             if (r->reganch & ROPT_ANCH_SINGLE)
2056                 r->reganch |= ROPT_NOSCAN;
2057         }
2058         else {
2059             r->check_substr = r->float_substr;
2060             r->check_utf8 = r->float_utf8;
2061             r->check_offset_min = data.offset_float_min;
2062             r->check_offset_max = data.offset_float_max;
2063         }
2064         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2065            This should be changed ASAP!  */
2066         if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2067             r->reganch |= RE_USE_INTUIT;
2068             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2069                 r->reganch |= RE_INTUIT_TAIL;
2070         }
2071     }
2072     else {
2073         /* Several toplevels. Best we can is to set minlen. */
2074         I32 fake;
2075         struct regnode_charclass_class ch_class;
2076         I32 last_close = 0;
2077         
2078         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2079         scan = r->program + 1;
2080         cl_init(pRExC_state, &ch_class);
2081         data.start_class = &ch_class;
2082         data.last_closep = &last_close;
2083         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2084         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2085                 = r->float_substr = r->float_utf8 = Nullsv;
2086         if (!(data.start_class->flags & ANYOF_EOS)
2087             && !cl_is_anything(data.start_class))
2088         {
2089             I32 n = add_data(pRExC_state, 1, "f");
2090
2091             New(1006, RExC_rx->data->data[n], 1,
2092                 struct regnode_charclass_class);
2093             StructCopy(data.start_class,
2094                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2095                        struct regnode_charclass_class);
2096             r->regstclass = (regnode*)RExC_rx->data->data[n];
2097             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2098             DEBUG_r({ SV* sv = sv_newmortal();
2099                       regprop(sv, (regnode*)data.start_class);
2100                       PerlIO_printf(Perl_debug_log,
2101                                     "synthetic stclass `%s'.\n",
2102                                     SvPVX(sv));});
2103         }
2104     }
2105
2106     r->minlen = minlen;
2107     if (RExC_seen & REG_SEEN_GPOS)
2108         r->reganch |= ROPT_GPOS_SEEN;
2109     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2110         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2111     if (RExC_seen & REG_SEEN_EVAL)
2112         r->reganch |= ROPT_EVAL_SEEN;
2113     if (RExC_seen & REG_SEEN_CANY)
2114         r->reganch |= ROPT_CANY_SEEN;
2115     Newz(1002, r->startp, RExC_npar, I32);
2116     Newz(1002, r->endp, RExC_npar, I32);
2117     PL_regdata = r->data; /* for regprop() */
2118     DEBUG_r(regdump(r));
2119     return(r);
2120 }
2121
2122 /*
2123  - reg - regular expression, i.e. main body or parenthesized thing
2124  *
2125  * Caller must absorb opening parenthesis.
2126  *
2127  * Combining parenthesis handling with the base level of regular expression
2128  * is a trifle forced, but the need to tie the tails of the branches to what
2129  * follows makes it hard to avoid.
2130  */
2131 STATIC regnode *
2132 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2133     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2134 {
2135     register regnode *ret;              /* Will be the head of the group. */
2136     register regnode *br;
2137     register regnode *lastbr;
2138     register regnode *ender = 0;
2139     register I32 parno = 0;
2140     I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2141
2142     /* for (?g), (?gc), and (?o) warnings; warning
2143        about (?c) will warn about (?g) -- japhy    */
2144
2145     I32 wastedflags = 0x00,
2146         wasted_o    = 0x01,
2147         wasted_g    = 0x02,
2148         wasted_gc   = 0x02 | 0x04,
2149         wasted_c    = 0x04;
2150
2151     char * parse_start = RExC_parse; /* MJD */
2152     char *oregcomp_parse = RExC_parse;
2153     char c;
2154
2155     *flagp = 0;                         /* Tentatively. */
2156
2157
2158     /* Make an OPEN node, if parenthesized. */
2159     if (paren) {
2160         if (*RExC_parse == '?') { /* (?...) */
2161             U32 posflags = 0, negflags = 0;
2162             U32 *flagsp = &posflags;
2163             int logical = 0;
2164             char *seqstart = RExC_parse;
2165
2166             RExC_parse++;
2167             paren = *RExC_parse++;
2168             ret = NULL;                 /* For look-ahead/behind. */
2169             switch (paren) {
2170             case '<':           /* (?<...) */
2171                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2172                 if (*RExC_parse == '!')
2173                     paren = ',';
2174                 if (*RExC_parse != '=' && *RExC_parse != '!')
2175                     goto unknown;
2176                 RExC_parse++;
2177             case '=':           /* (?=...) */
2178             case '!':           /* (?!...) */
2179                 RExC_seen_zerolen++;
2180             case ':':           /* (?:...) */
2181             case '>':           /* (?>...) */
2182                 break;
2183             case '$':           /* (?$...) */
2184             case '@':           /* (?@...) */
2185                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2186                 break;
2187             case '#':           /* (?#...) */
2188                 while (*RExC_parse && *RExC_parse != ')')
2189                     RExC_parse++;
2190                 if (*RExC_parse != ')')
2191                     FAIL("Sequence (?#... not terminated");
2192                 nextchar(pRExC_state);
2193                 *flagp = TRYAGAIN;
2194                 return NULL;
2195             case 'p':           /* (?p...) */
2196                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2197                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2198                 /* FALL THROUGH*/
2199             case '?':           /* (??...) */
2200                 logical = 1;
2201                 if (*RExC_parse != '{')
2202                     goto unknown;
2203                 paren = *RExC_parse++;
2204                 /* FALL THROUGH */
2205             case '{':           /* (?{...}) */
2206             {
2207                 I32 count = 1, n = 0;
2208                 char c;
2209                 char *s = RExC_parse;
2210                 SV *sv;
2211                 OP_4tree *sop, *rop;
2212
2213                 RExC_seen_zerolen++;
2214                 RExC_seen |= REG_SEEN_EVAL;
2215                 while (count && (c = *RExC_parse)) {
2216                     if (c == '\\' && RExC_parse[1])
2217                         RExC_parse++;
2218                     else if (c == '{')
2219                         count++;
2220                     else if (c == '}')
2221                         count--;
2222                     RExC_parse++;
2223                 }
2224                 if (*RExC_parse != ')')
2225                 {
2226                     RExC_parse = s;             
2227                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2228                 }
2229                 if (!SIZE_ONLY) {
2230                     PAD *pad;
2231                 
2232                     if (RExC_parse - 1 - s)
2233                         sv = newSVpvn(s, RExC_parse - 1 - s);
2234                     else
2235                         sv = newSVpvn("", 0);
2236
2237                     ENTER;
2238                     Perl_save_re_context(aTHX);
2239                     rop = sv_compile_2op(sv, &sop, "re", &pad);
2240                     sop->op_private |= OPpREFCOUNTED;
2241                     /* re_dup will OpREFCNT_inc */
2242                     OpREFCNT_set(sop, 1);
2243                     LEAVE;
2244
2245                     n = add_data(pRExC_state, 3, "nop");
2246                     RExC_rx->data->data[n] = (void*)rop;
2247                     RExC_rx->data->data[n+1] = (void*)sop;
2248                     RExC_rx->data->data[n+2] = (void*)pad;
2249                     SvREFCNT_dec(sv);
2250                 }
2251                 else {                                          /* First pass */
2252                     if (PL_reginterp_cnt < ++RExC_seen_evals
2253                         && PL_curcop != &PL_compiling)
2254                         /* No compiled RE interpolated, has runtime
2255                            components ===> unsafe.  */
2256                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2257                     if (PL_tainting && PL_tainted)
2258                         FAIL("Eval-group in insecure regular expression");
2259                 }
2260                 
2261                 nextchar(pRExC_state);
2262                 if (logical) {
2263                     ret = reg_node(pRExC_state, LOGICAL);
2264                     if (!SIZE_ONLY)
2265                         ret->flags = 2;
2266                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2267                     /* deal with the length of this later - MJD */
2268                     return ret;
2269                 }
2270                 ret = reganode(pRExC_state, EVAL, n);
2271                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2272                 Set_Node_Offset(ret, parse_start);
2273                 return ret;
2274             }
2275             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2276             {
2277                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2278                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2279                         || RExC_parse[1] == '<'
2280                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2281                         I32 flag;
2282                         
2283                         ret = reg_node(pRExC_state, LOGICAL);
2284                         if (!SIZE_ONLY)
2285                             ret->flags = 1;
2286                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2287                         goto insert_if;
2288                     }
2289                 }
2290                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2291                     /* (?(1)...) */
2292                     parno = atoi(RExC_parse++);
2293
2294                     while (isDIGIT(*RExC_parse))
2295                         RExC_parse++;
2296                     ret = reganode(pRExC_state, GROUPP, parno);
2297                     
2298                     if ((c = *nextchar(pRExC_state)) != ')')
2299                         vFAIL("Switch condition not recognized");
2300                   insert_if:
2301                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2302                     br = regbranch(pRExC_state, &flags, 1);
2303                     if (br == NULL)
2304                         br = reganode(pRExC_state, LONGJMP, 0);
2305                     else
2306                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2307                     c = *nextchar(pRExC_state);
2308                     if (flags&HASWIDTH)
2309                         *flagp |= HASWIDTH;
2310                     if (c == '|') {
2311                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2312                         regbranch(pRExC_state, &flags, 1);
2313                         regtail(pRExC_state, ret, lastbr);
2314                         if (flags&HASWIDTH)
2315                             *flagp |= HASWIDTH;
2316                         c = *nextchar(pRExC_state);
2317                     }
2318                     else
2319                         lastbr = NULL;
2320                     if (c != ')')
2321                         vFAIL("Switch (?(condition)... contains too many branches");
2322                     ender = reg_node(pRExC_state, TAIL);
2323                     regtail(pRExC_state, br, ender);
2324                     if (lastbr) {
2325                         regtail(pRExC_state, lastbr, ender);
2326                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2327                     }
2328                     else
2329                         regtail(pRExC_state, ret, ender);
2330                     return ret;
2331                 }
2332                 else {
2333                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2334                 }
2335             }
2336             case 0:
2337                 RExC_parse--; /* for vFAIL to print correctly */
2338                 vFAIL("Sequence (? incomplete");
2339                 break;
2340             default:
2341                 --RExC_parse;
2342               parse_flags:      /* (?i) */
2343                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2344                     /* (?g), (?gc) and (?o) are useless here
2345                        and must be globally applied -- japhy */
2346
2347                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2348                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2349                             I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2350                             if (! (wastedflags & wflagbit) ) {
2351                                 wastedflags |= wflagbit;
2352                                 vWARN5(
2353                                     RExC_parse + 1,
2354                                     "Useless (%s%c) - %suse /%c modifier",
2355                                     flagsp == &negflags ? "?-" : "?",
2356                                     *RExC_parse,
2357                                     flagsp == &negflags ? "don't " : "",
2358                                     *RExC_parse
2359                                 );
2360                             }
2361                         }
2362                     }
2363                     else if (*RExC_parse == 'c') {
2364                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2365                             if (! (wastedflags & wasted_c) ) {
2366                                 wastedflags |= wasted_gc;
2367                                 vWARN3(
2368                                     RExC_parse + 1,
2369                                     "Useless (%sc) - %suse /gc modifier",
2370                                     flagsp == &negflags ? "?-" : "?",
2371                                     flagsp == &negflags ? "don't " : ""
2372                                 );
2373                             }
2374                         }
2375                     }
2376                     else { pmflag(flagsp, *RExC_parse); }
2377
2378                     ++RExC_parse;
2379                 }
2380                 if (*RExC_parse == '-') {
2381                     flagsp = &negflags;
2382                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2383                     ++RExC_parse;
2384                     goto parse_flags;
2385                 }
2386                 RExC_flags |= posflags;
2387                 RExC_flags &= ~negflags;
2388                 if (*RExC_parse == ':') {
2389                     RExC_parse++;
2390                     paren = ':';
2391                     break;
2392                 }               
2393               unknown:
2394                 if (*RExC_parse != ')') {
2395                     RExC_parse++;
2396                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2397                 }
2398                 nextchar(pRExC_state);
2399                 *flagp = TRYAGAIN;
2400                 return NULL;
2401             }
2402         }
2403         else {                  /* (...) */
2404             parno = RExC_npar;
2405             RExC_npar++;
2406             ret = reganode(pRExC_state, OPEN, parno);
2407             Set_Node_Length(ret, 1); /* MJD */
2408             Set_Node_Offset(ret, RExC_parse); /* MJD */
2409             open = 1;
2410         }
2411     }
2412     else                        /* ! paren */
2413         ret = NULL;
2414
2415     /* Pick up the branches, linking them together. */
2416     parse_start = RExC_parse;   /* MJD */
2417     br = regbranch(pRExC_state, &flags, 1);
2418     /*     branch_len = (paren != 0); */
2419     
2420     if (br == NULL)
2421         return(NULL);
2422     if (*RExC_parse == '|') {
2423         if (!SIZE_ONLY && RExC_extralen) {
2424             reginsert(pRExC_state, BRANCHJ, br);
2425         }
2426         else {                  /* MJD */
2427             reginsert(pRExC_state, BRANCH, br);
2428             Set_Node_Length(br, paren != 0);
2429             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2430         }
2431         have_branch = 1;
2432         if (SIZE_ONLY)
2433             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2434     }
2435     else if (paren == ':') {
2436         *flagp |= flags&SIMPLE;
2437     }
2438     if (open) {                         /* Starts with OPEN. */
2439         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2440     }
2441     else if (paren != '?')              /* Not Conditional */
2442         ret = br;
2443     *flagp |= flags & (SPSTART | HASWIDTH);
2444     lastbr = br;
2445     while (*RExC_parse == '|') {
2446         if (!SIZE_ONLY && RExC_extralen) {
2447             ender = reganode(pRExC_state, LONGJMP,0);
2448             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2449         }
2450         if (SIZE_ONLY)
2451             RExC_extralen += 2;         /* Account for LONGJMP. */
2452         nextchar(pRExC_state);
2453         br = regbranch(pRExC_state, &flags, 0);
2454         
2455         if (br == NULL)
2456             return(NULL);
2457         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2458         lastbr = br;
2459         if (flags&HASWIDTH)
2460             *flagp |= HASWIDTH;
2461         *flagp |= flags&SPSTART;
2462     }
2463
2464     if (have_branch || paren != ':') {
2465         /* Make a closing node, and hook it on the end. */
2466         switch (paren) {
2467         case ':':
2468             ender = reg_node(pRExC_state, TAIL);
2469             break;
2470         case 1:
2471             ender = reganode(pRExC_state, CLOSE, parno);
2472             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2473             Set_Node_Length(ender,1); /* MJD */
2474             break;
2475         case '<':
2476         case ',':
2477         case '=':
2478         case '!':
2479             *flagp &= ~HASWIDTH;
2480             /* FALL THROUGH */
2481         case '>':
2482             ender = reg_node(pRExC_state, SUCCEED);
2483             break;
2484         case 0:
2485             ender = reg_node(pRExC_state, END);
2486             break;
2487         }
2488         regtail(pRExC_state, lastbr, ender);
2489
2490         if (have_branch) {
2491             /* Hook the tails of the branches to the closing node. */
2492             for (br = ret; br != NULL; br = regnext(br)) {
2493                 regoptail(pRExC_state, br, ender);
2494             }
2495         }
2496     }
2497
2498     {
2499         char *p;
2500         static char parens[] = "=!<,>";
2501
2502         if (paren && (p = strchr(parens, paren))) {
2503             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2504             int flag = (p - parens) > 1;
2505
2506             if (paren == '>')
2507                 node = SUSPEND, flag = 0;
2508             reginsert(pRExC_state, node,ret);
2509             Set_Node_Offset(ret, oregcomp_parse);
2510             Set_Node_Length(ret,  RExC_parse - oregcomp_parse + 2);
2511             ret->flags = flag;
2512             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2513         }
2514     }
2515
2516     /* Check for proper termination. */
2517     if (paren) {
2518         RExC_flags = oregflags;
2519         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2520             RExC_parse = oregcomp_parse;
2521             vFAIL("Unmatched (");
2522         }
2523     }
2524     else if (!paren && RExC_parse < RExC_end) {
2525         if (*RExC_parse == ')') {
2526             RExC_parse++;
2527             vFAIL("Unmatched )");
2528         }
2529         else
2530             FAIL("Junk on end of regexp");      /* "Can't happen". */
2531         /* NOTREACHED */
2532     }
2533
2534     return(ret);
2535 }
2536
2537 /*
2538  - regbranch - one alternative of an | operator
2539  *
2540  * Implements the concatenation operator.
2541  */
2542 STATIC regnode *
2543 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2544 {
2545     register regnode *ret;
2546     register regnode *chain = NULL;
2547     register regnode *latest;
2548     I32 flags = 0, c = 0;
2549
2550     if (first)
2551         ret = NULL;
2552     else {
2553         if (!SIZE_ONLY && RExC_extralen)
2554             ret = reganode(pRExC_state, BRANCHJ,0);
2555         else {
2556             ret = reg_node(pRExC_state, BRANCH);
2557             Set_Node_Length(ret, 1);
2558         }
2559     }
2560         
2561     if (!first && SIZE_ONLY)
2562         RExC_extralen += 1;                     /* BRANCHJ */
2563
2564     *flagp = WORST;                     /* Tentatively. */
2565
2566     RExC_parse--;
2567     nextchar(pRExC_state);
2568     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2569         flags &= ~TRYAGAIN;
2570         latest = regpiece(pRExC_state, &flags);
2571         if (latest == NULL) {
2572             if (flags & TRYAGAIN)
2573                 continue;
2574             return(NULL);
2575         }
2576         else if (ret == NULL)
2577             ret = latest;
2578         *flagp |= flags&HASWIDTH;
2579         if (chain == NULL)      /* First piece. */
2580             *flagp |= flags&SPSTART;
2581         else {
2582             RExC_naughty++;
2583             regtail(pRExC_state, chain, latest);
2584         }
2585         chain = latest;
2586         c++;
2587     }
2588     if (chain == NULL) {        /* Loop ran zero times. */
2589         chain = reg_node(pRExC_state, NOTHING);
2590         if (ret == NULL)
2591             ret = chain;
2592     }
2593     if (c == 1) {
2594         *flagp |= flags&SIMPLE;
2595     }
2596
2597     return(ret);
2598 }
2599
2600 /*
2601  - regpiece - something followed by possible [*+?]
2602  *
2603  * Note that the branching code sequences used for ? and the general cases
2604  * of * and + are somewhat optimized:  they use the same NOTHING node as
2605  * both the endmarker for their branch list and the body of the last branch.
2606  * It might seem that this node could be dispensed with entirely, but the
2607  * endmarker role is not redundant.
2608  */
2609 STATIC regnode *
2610 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2611 {
2612     register regnode *ret;
2613     register char op;
2614     register char *next;
2615     I32 flags;
2616     char *origparse = RExC_parse;
2617     char *maxpos;
2618     I32 min;
2619     I32 max = REG_INFTY;
2620     char *parse_start;
2621
2622     ret = regatom(pRExC_state, &flags);
2623     if (ret == NULL) {
2624         if (flags & TRYAGAIN)
2625             *flagp |= TRYAGAIN;
2626         return(NULL);
2627     }
2628
2629     op = *RExC_parse;
2630
2631     if (op == '{' && regcurly(RExC_parse)) {
2632         parse_start = RExC_parse; /* MJD */
2633         next = RExC_parse + 1;
2634         maxpos = Nullch;
2635         while (isDIGIT(*next) || *next == ',') {
2636             if (*next == ',') {
2637                 if (maxpos)
2638                     break;
2639                 else
2640                     maxpos = next;
2641             }
2642             next++;
2643         }
2644         if (*next == '}') {             /* got one */
2645             if (!maxpos)
2646                 maxpos = next;
2647             RExC_parse++;
2648             min = atoi(RExC_parse);
2649             if (*maxpos == ',')
2650                 maxpos++;
2651             else
2652                 maxpos = RExC_parse;
2653             max = atoi(maxpos);
2654             if (!max && *maxpos != '0')
2655                 max = REG_INFTY;                /* meaning "infinity" */
2656             else if (max >= REG_INFTY)
2657                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2658             RExC_parse = next;
2659             nextchar(pRExC_state);
2660
2661         do_curly:
2662             if ((flags&SIMPLE)) {
2663                 RExC_naughty += 2 + RExC_naughty / 2;
2664                 reginsert(pRExC_state, CURLY, ret);
2665                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2666                 Set_Node_Cur_Length(ret);
2667             }
2668             else {
2669                 regnode *w = reg_node(pRExC_state, WHILEM);
2670
2671                 w->flags = 0;
2672                 regtail(pRExC_state, ret, w);
2673                 if (!SIZE_ONLY && RExC_extralen) {
2674                     reginsert(pRExC_state, LONGJMP,ret);
2675                     reginsert(pRExC_state, NOTHING,ret);
2676                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2677                 }
2678                 reginsert(pRExC_state, CURLYX,ret);
2679                                 /* MJD hk */
2680                 Set_Node_Offset(ret, parse_start+1);
2681                 Set_Node_Length(ret, 
2682                                 op == '{' ? (RExC_parse - parse_start) : 1);
2683                 
2684                 if (!SIZE_ONLY && RExC_extralen)
2685                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2686                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2687                 if (SIZE_ONLY)
2688                     RExC_whilem_seen++, RExC_extralen += 3;
2689                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2690             }
2691             ret->flags = 0;
2692
2693             if (min > 0)
2694                 *flagp = WORST;
2695             if (max > 0)
2696                 *flagp |= HASWIDTH;
2697             if (max && max < min)
2698                 vFAIL("Can't do {n,m} with n > m");
2699             if (!SIZE_ONLY) {
2700                 ARG1_SET(ret, (U16)min);
2701                 ARG2_SET(ret, (U16)max);
2702             }
2703
2704             goto nest_check;
2705         }
2706     }
2707
2708     if (!ISMULT1(op)) {
2709         *flagp = flags;
2710         return(ret);
2711     }
2712
2713 #if 0                           /* Now runtime fix should be reliable. */
2714
2715     /* if this is reinstated, don't forget to put this back into perldiag:
2716
2717             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2718
2719            (F) The part of the regexp subject to either the * or + quantifier
2720            could match an empty string. The {#} shows in the regular
2721            expression about where the problem was discovered.
2722
2723     */
2724
2725     if (!(flags&HASWIDTH) && op != '?')
2726       vFAIL("Regexp *+ operand could be empty");
2727 #endif
2728
2729     parse_start = RExC_parse;
2730     nextchar(pRExC_state);
2731
2732     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2733
2734     if (op == '*' && (flags&SIMPLE)) {
2735         reginsert(pRExC_state, STAR, ret);
2736         ret->flags = 0;
2737         RExC_naughty += 4;
2738     }
2739     else if (op == '*') {
2740         min = 0;
2741         goto do_curly;
2742     }
2743     else if (op == '+' && (flags&SIMPLE)) {
2744         reginsert(pRExC_state, PLUS, ret);
2745         ret->flags = 0;
2746         RExC_naughty += 3;
2747     }
2748     else if (op == '+') {
2749         min = 1;
2750         goto do_curly;
2751     }
2752     else if (op == '?') {
2753         min = 0; max = 1;
2754         goto do_curly;
2755     }
2756   nest_check:
2757     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2758         vWARN3(RExC_parse,
2759                "%.*s matches null string many times",
2760                RExC_parse - origparse,
2761                origparse);
2762     }
2763
2764     if (*RExC_parse == '?') {
2765         nextchar(pRExC_state);
2766         reginsert(pRExC_state, MINMOD, ret);
2767         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2768     }
2769     if (ISMULT2(RExC_parse)) {
2770         RExC_parse++;
2771         vFAIL("Nested quantifiers");
2772     }
2773
2774     return(ret);
2775 }
2776
2777 /*
2778  - regatom - the lowest level
2779  *
2780  * Optimization:  gobbles an entire sequence of ordinary characters so that
2781  * it can turn them into a single node, which is smaller to store and
2782  * faster to run.  Backslashed characters are exceptions, each becoming a
2783  * separate node; the code is simpler that way and it's not worth fixing.
2784  *
2785  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2786 STATIC regnode *
2787 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2788 {
2789     register regnode *ret = 0;
2790     I32 flags;
2791     char *parse_start = 0;
2792
2793     *flagp = WORST;             /* Tentatively. */
2794
2795 tryagain:
2796     switch (*RExC_parse) {
2797     case '^':
2798         RExC_seen_zerolen++;
2799         nextchar(pRExC_state);
2800         if (RExC_flags & PMf_MULTILINE)
2801             ret = reg_node(pRExC_state, MBOL);
2802         else if (RExC_flags & PMf_SINGLELINE)
2803             ret = reg_node(pRExC_state, SBOL);
2804         else
2805             ret = reg_node(pRExC_state, BOL);
2806         Set_Node_Length(ret, 1); /* MJD */
2807         break;
2808     case '$':
2809         nextchar(pRExC_state);
2810         if (*RExC_parse)
2811             RExC_seen_zerolen++;
2812         if (RExC_flags & PMf_MULTILINE)
2813             ret = reg_node(pRExC_state, MEOL);
2814         else if (RExC_flags & PMf_SINGLELINE)
2815             ret = reg_node(pRExC_state, SEOL);
2816         else
2817             ret = reg_node(pRExC_state, EOL);
2818         Set_Node_Length(ret, 1); /* MJD */
2819         break;
2820     case '.':
2821         nextchar(pRExC_state);
2822         if (RExC_flags & PMf_SINGLELINE)
2823             ret = reg_node(pRExC_state, SANY);
2824         else
2825             ret = reg_node(pRExC_state, REG_ANY);
2826         *flagp |= HASWIDTH|SIMPLE;
2827         RExC_naughty++;
2828         Set_Node_Length(ret, 1); /* MJD */
2829         break;
2830     case '[':
2831     {
2832         char *oregcomp_parse = ++RExC_parse;
2833         ret = regclass(pRExC_state);
2834         if (*RExC_parse != ']') {
2835             RExC_parse = oregcomp_parse;
2836             vFAIL("Unmatched [");
2837         }
2838         nextchar(pRExC_state);
2839         *flagp |= HASWIDTH|SIMPLE;
2840         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2841         break;
2842     }
2843     case '(':
2844         nextchar(pRExC_state);
2845         ret = reg(pRExC_state, 1, &flags);
2846         if (ret == NULL) {
2847                 if (flags & TRYAGAIN) {
2848                     if (RExC_parse == RExC_end) {
2849                          /* Make parent create an empty node if needed. */
2850                         *flagp |= TRYAGAIN;
2851                         return(NULL);
2852                     }
2853                     goto tryagain;
2854                 }
2855                 return(NULL);
2856         }
2857         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2858         break;
2859     case '|':
2860     case ')':
2861         if (flags & TRYAGAIN) {
2862             *flagp |= TRYAGAIN;
2863             return NULL;
2864         }
2865         vFAIL("Internal urp");
2866                                 /* Supposed to be caught earlier. */
2867         break;
2868     case '{':
2869         if (!regcurly(RExC_parse)) {
2870             RExC_parse++;
2871             goto defchar;
2872         }
2873         /* FALL THROUGH */
2874     case '?':
2875     case '+':
2876     case '*':
2877         RExC_parse++;
2878         vFAIL("Quantifier follows nothing");
2879         break;
2880     case '\\':
2881         switch (*++RExC_parse) {
2882         case 'A':
2883             RExC_seen_zerolen++;
2884             ret = reg_node(pRExC_state, SBOL);
2885             *flagp |= SIMPLE;
2886             nextchar(pRExC_state);
2887             Set_Node_Length(ret, 2); /* MJD */
2888             break;
2889         case 'G':
2890             ret = reg_node(pRExC_state, GPOS);
2891             RExC_seen |= REG_SEEN_GPOS;
2892             *flagp |= SIMPLE;
2893             nextchar(pRExC_state);
2894             Set_Node_Length(ret, 2); /* MJD */
2895             break;
2896         case 'Z':
2897             ret = reg_node(pRExC_state, SEOL);
2898             *flagp |= SIMPLE;
2899             RExC_seen_zerolen++;                /* Do not optimize RE away */
2900             nextchar(pRExC_state);
2901             break;
2902         case 'z':
2903             ret = reg_node(pRExC_state, EOS);
2904             *flagp |= SIMPLE;
2905             RExC_seen_zerolen++;                /* Do not optimize RE away */
2906             nextchar(pRExC_state);
2907             Set_Node_Length(ret, 2); /* MJD */
2908             break;
2909         case 'C':
2910             ret = reg_node(pRExC_state, CANY);
2911             RExC_seen |= REG_SEEN_CANY;
2912             *flagp |= HASWIDTH|SIMPLE;
2913             nextchar(pRExC_state);
2914             Set_Node_Length(ret, 2); /* MJD */
2915             break;
2916         case 'X':
2917             ret = reg_node(pRExC_state, CLUMP);
2918             *flagp |= HASWIDTH;
2919             nextchar(pRExC_state);
2920             Set_Node_Length(ret, 2); /* MJD */
2921             break;
2922         case 'w':
2923             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
2924             *flagp |= HASWIDTH|SIMPLE;
2925             nextchar(pRExC_state);
2926             Set_Node_Length(ret, 2); /* MJD */
2927             break;
2928         case 'W':
2929             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
2930             *flagp |= HASWIDTH|SIMPLE;
2931             nextchar(pRExC_state);
2932             Set_Node_Length(ret, 2); /* MJD */
2933             break;
2934         case 'b':
2935             RExC_seen_zerolen++;
2936             RExC_seen |= REG_SEEN_LOOKBEHIND;
2937             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
2938             *flagp |= SIMPLE;
2939             nextchar(pRExC_state);
2940             Set_Node_Length(ret, 2); /* MJD */
2941             break;
2942         case 'B':
2943             RExC_seen_zerolen++;
2944             RExC_seen |= REG_SEEN_LOOKBEHIND;
2945             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
2946             *flagp |= SIMPLE;
2947             nextchar(pRExC_state);
2948             Set_Node_Length(ret, 2); /* MJD */
2949             break;
2950         case 's':
2951             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
2952             *flagp |= HASWIDTH|SIMPLE;
2953             nextchar(pRExC_state);
2954             Set_Node_Length(ret, 2); /* MJD */
2955             break;
2956         case 'S':
2957             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
2958             *flagp |= HASWIDTH|SIMPLE;
2959             nextchar(pRExC_state);
2960             Set_Node_Length(ret, 2); /* MJD */
2961             break;
2962         case 'd':
2963             ret = reg_node(pRExC_state, DIGIT);
2964             *flagp |= HASWIDTH|SIMPLE;
2965             nextchar(pRExC_state);
2966             Set_Node_Length(ret, 2); /* MJD */
2967             break;
2968         case 'D':
2969             ret = reg_node(pRExC_state, NDIGIT);
2970             *flagp |= HASWIDTH|SIMPLE;
2971             nextchar(pRExC_state);
2972             Set_Node_Length(ret, 2); /* MJD */
2973             break;
2974         case 'p':
2975         case 'P':
2976             {   
2977                 char* oldregxend = RExC_end;
2978                 char* parse_start = RExC_parse - 2;
2979
2980                 if (RExC_parse[1] == '{') {
2981                   /* a lovely hack--pretend we saw [\pX] instead */
2982                     RExC_end = strchr(RExC_parse, '}');
2983                     if (!RExC_end) {
2984                         U8 c = (U8)*RExC_parse;
2985                         RExC_parse += 2;
2986                         RExC_end = oldregxend;
2987                         vFAIL2("Missing right brace on \\%c{}", c);
2988                     }
2989                     RExC_end++;
2990                 }
2991                 else {
2992                     RExC_end = RExC_parse + 2;
2993                     if (RExC_end > oldregxend)
2994                         RExC_end = oldregxend;
2995                 }
2996                 RExC_parse--;
2997
2998                 ret = regclass(pRExC_state);
2999
3000                 RExC_end = oldregxend;
3001                 RExC_parse--;
3002
3003                 Set_Node_Offset(ret, parse_start + 2);
3004                 Set_Node_Cur_Length(ret);
3005                 nextchar(pRExC_state);
3006                 *flagp |= HASWIDTH|SIMPLE;
3007             }
3008             break;
3009         case 'n':
3010         case 'r':
3011         case 't':
3012         case 'f':
3013         case 'e':
3014         case 'a':
3015         case 'x':
3016         case 'c':
3017         case '0':
3018             goto defchar;
3019         case '1': case '2': case '3': case '4':
3020         case '5': case '6': case '7': case '8': case '9':
3021             {
3022                 I32 num = atoi(RExC_parse);
3023
3024                 if (num > 9 && num >= RExC_npar)
3025                     goto defchar;
3026                 else {
3027                     char * parse_start = RExC_parse - 1; /* MJD */
3028                     while (isDIGIT(*RExC_parse))
3029                         RExC_parse++;
3030
3031                     if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3032                         vFAIL("Reference to nonexistent group");
3033                     RExC_sawback = 1;
3034                     ret = reganode(pRExC_state,
3035                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3036                                    num);
3037                     *flagp |= HASWIDTH;
3038                     
3039                     /* override incorrect value set in reganode MJD */
3040                     Set_Node_Offset(ret, parse_start+1); 
3041                     Set_Node_Cur_Length(ret); /* MJD */
3042                     RExC_parse--;
3043                     nextchar(pRExC_state);
3044                 }
3045             }
3046             break;
3047         case '\0':
3048             if (RExC_parse >= RExC_end)
3049                 FAIL("Trailing \\");
3050             /* FALL THROUGH */
3051         default:
3052             /* Do not generate `unrecognized' warnings here, we fall
3053                back into the quick-grab loop below */
3054             goto defchar;
3055         }
3056         break;
3057
3058     case '#':
3059         if (RExC_flags & PMf_EXTENDED) {
3060             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3061             if (RExC_parse < RExC_end)
3062                 goto tryagain;
3063         }
3064         /* FALL THROUGH */
3065
3066     default: {
3067             register STRLEN len;
3068             register UV ender;
3069             register char *p;
3070             char *oldp, *s;
3071             STRLEN numlen;
3072             STRLEN foldlen;
3073             U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3074
3075             parse_start = RExC_parse - 1;
3076
3077             RExC_parse++;
3078
3079         defchar:
3080             ender = 0;
3081             ret = reg_node(pRExC_state,
3082                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3083             s = STRING(ret);
3084             for (len = 0, p = RExC_parse - 1;
3085               len < 127 && p < RExC_end;
3086               len++)
3087             {
3088                 oldp = p;
3089
3090                 if (RExC_flags & PMf_EXTENDED)
3091                     p = regwhite(p, RExC_end);
3092                 switch (*p) {
3093                 case '^':
3094                 case '$':
3095                 case '.':
3096                 case '[':
3097                 case '(':
3098                 case ')':
3099                 case '|':
3100                     goto loopdone;
3101                 case '\\':
3102                     switch (*++p) {
3103                     case 'A':
3104                     case 'C':
3105                     case 'X':
3106                     case 'G':
3107                     case 'Z':
3108                     case 'z':
3109                     case 'w':
3110                     case 'W':
3111                     case 'b':
3112                     case 'B':
3113                     case 's':
3114                     case 'S':
3115                     case 'd':
3116                     case 'D':
3117                     case 'p':
3118                     case 'P':
3119                         --p;
3120                         goto loopdone;
3121                     case 'n':
3122                         ender = '\n';
3123                         p++;
3124                         break;
3125                     case 'r':
3126                         ender = '\r';
3127                         p++;
3128                         break;
3129                     case 't':
3130                         ender = '\t';
3131                         p++;
3132                         break;
3133                     case 'f':
3134                         ender = '\f';
3135                         p++;
3136                         break;
3137                     case 'e':
3138                           ender = ASCII_TO_NATIVE('\033');
3139                         p++;
3140                         break;
3141                     case 'a':
3142                           ender = ASCII_TO_NATIVE('\007');
3143                         p++;
3144                         break;
3145                     case 'x':
3146                         if (*++p == '{') {
3147                             char* e = strchr(p, '}');
3148         
3149                             if (!e) {
3150                                 RExC_parse = p + 1;
3151                                 vFAIL("Missing right brace on \\x{}");
3152                             }
3153                             else {
3154                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3155                                     | PERL_SCAN_DISALLOW_PREFIX;
3156                                 numlen = e - p - 1;
3157                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3158                                 if (ender > 0xff)
3159                                     RExC_utf8 = 1;
3160                                 /* numlen is generous */
3161                                 if (numlen + len >= 127) {
3162                                     p--;
3163                                     goto loopdone;
3164                                 }
3165                                 p = e + 1;
3166                             }
3167                         }
3168                         else {
3169                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3170                             numlen = 2;
3171                             ender = grok_hex(p, &numlen, &flags, NULL);
3172                             p += numlen;
3173                         }
3174                         break;
3175                     case 'c':
3176                         p++;
3177                         ender = UCHARAT(p++);
3178                         ender = toCTRL(ender);
3179                         break;
3180                     case '0': case '1': case '2': case '3':case '4':
3181                     case '5': case '6': case '7': case '8':case '9':
3182                         if (*p == '0' ||
3183                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3184                             I32 flags = 0;
3185                             numlen = 3;
3186                             ender = grok_oct(p, &numlen, &flags, NULL);
3187                             p += numlen;
3188                         }
3189                         else {
3190                             --p;
3191                             goto loopdone;
3192                         }
3193                         break;
3194                     case '\0':
3195                         if (p >= RExC_end)
3196                             FAIL("Trailing \\");
3197                         /* FALL THROUGH */
3198                     default:
3199                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3200                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3201                         goto normal_default;
3202                     }
3203                     break;
3204                 default:
3205                   normal_default:
3206                     if (UTF8_IS_START(*p) && UTF) {
3207                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3208                                                &numlen, 0);
3209                         p += numlen;
3210                     }
3211                     else
3212                         ender = *p++;
3213                     break;
3214                 }
3215                 if (RExC_flags & PMf_EXTENDED)
3216                     p = regwhite(p, RExC_end);
3217                 if (UTF && FOLD) {
3218                     /* Prime the casefolded buffer. */
3219                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3220                 }
3221                 if (ISMULT2(p)) { /* Back off on ?+*. */
3222                     if (len)
3223                         p = oldp;
3224                     else if (UTF) {
3225                          STRLEN unilen;
3226
3227                          if (FOLD) {
3228                               /* Emit all the Unicode characters. */
3229                               for (foldbuf = tmpbuf;
3230                                    foldlen;
3231                                    foldlen -= numlen) {
3232                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3233                                    if (numlen > 0) {
3234                                         reguni(pRExC_state, ender, s, &unilen);
3235                                         s       += unilen;
3236                                         len     += unilen;
3237                                         /* In EBCDIC the numlen
3238                                          * and unilen can differ. */
3239                                         foldbuf += numlen;
3240                                         if (numlen >= foldlen)
3241                                              break;
3242                                    }
3243                                    else
3244                                         break; /* "Can't happen." */
3245                               }
3246                          }
3247                          else {
3248                               reguni(pRExC_state, ender, s, &unilen);
3249                               if (unilen > 0) {
3250                                    s   += unilen;
3251                                    len += unilen;
3252                               }
3253                          }
3254                     }
3255                     else {
3256                         len++;
3257                         REGC((char)ender, s++);
3258                     }
3259                     break;
3260                 }
3261                 if (UTF) {
3262                      STRLEN unilen;
3263
3264                      if (FOLD) {
3265                           /* Emit all the Unicode characters. */
3266                           for (foldbuf = tmpbuf;
3267                                foldlen;
3268                                foldlen -= numlen) {
3269                                ender = utf8_to_uvchr(foldbuf, &numlen);
3270                                if (numlen > 0) {
3271                                     reguni(pRExC_state, ender, s, &unilen);
3272                                     len     += unilen;
3273                                     s       += unilen;
3274                                     /* In EBCDIC the numlen
3275                                      * and unilen can differ. */
3276                                     foldbuf += numlen;
3277                                     if (numlen >= foldlen)
3278                                          break;
3279                                }
3280                                else
3281                                     break;
3282                           }
3283                      }
3284                      else {
3285                           reguni(pRExC_state, ender, s, &unilen);
3286                           if (unilen > 0) {
3287                                s   += unilen;
3288                                len += unilen;
3289                           }
3290                      }
3291                      len--;
3292                 }
3293                 else
3294                     REGC((char)ender, s++);
3295             }
3296         loopdone:
3297             RExC_parse = p - 1;
3298             Set_Node_Cur_Length(ret); /* MJD */
3299             nextchar(pRExC_state);
3300             {
3301                 /* len is STRLEN which is unsigned, need to copy to signed */
3302                 IV iv = len;
3303                 if (iv < 0)
3304                     vFAIL("Internal disaster");
3305             }
3306             if (len > 0)
3307                 *flagp |= HASWIDTH;
3308             if (len == 1)
3309                 *flagp |= SIMPLE;
3310             if (!SIZE_ONLY)
3311                 STR_LEN(ret) = len;
3312             if (SIZE_ONLY)
3313                 RExC_size += STR_SZ(len);
3314             else
3315                 RExC_emit += STR_SZ(len);
3316         }
3317         break;
3318     }
3319
3320     /* If the encoding pragma is in effect recode the text of
3321      * any EXACT-kind nodes. */
3322     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3323         STRLEN oldlen = STR_LEN(ret);
3324         SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3325
3326         if (RExC_utf8)
3327             SvUTF8_on(sv);
3328         if (sv_utf8_downgrade(sv, TRUE)) {
3329             char *s       = sv_recode_to_utf8(sv, PL_encoding);
3330             STRLEN newlen = SvCUR(sv);
3331
3332             if (SvUTF8(sv))
3333                 RExC_utf8 = 1;
3334             if (!SIZE_ONLY) {
3335                 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3336                                       (int)oldlen, STRING(ret),
3337                                       (int)newlen, s));
3338                 Copy(s, STRING(ret), newlen, char);
3339                 STR_LEN(ret) += newlen - oldlen;
3340                 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3341             } else
3342                 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3343         }
3344     }
3345
3346     return(ret);
3347 }
3348
3349 STATIC char *
3350 S_regwhite(pTHX_ char *p, char *e)
3351 {
3352     while (p < e) {
3353         if (isSPACE(*p))
3354             ++p;
3355         else if (*p == '#') {
3356             do {
3357                 p++;
3358             } while (p < e && *p != '\n');
3359         }
3360         else
3361             break;
3362     }
3363     return p;
3364 }
3365
3366 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3367    Character classes ([:foo:]) can also be negated ([:^foo:]).
3368    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3369    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3370    but trigger failures because they are currently unimplemented. */
3371
3372 #define POSIXCC_DONE(c)   ((c) == ':')
3373 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3374 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3375
3376 STATIC I32
3377 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3378 {
3379     char *posixcc = 0;
3380     I32 namedclass = OOB_NAMEDCLASS;
3381
3382     if (value == '[' && RExC_parse + 1 < RExC_end &&
3383         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3384         POSIXCC(UCHARAT(RExC_parse))) {
3385         char  c = UCHARAT(RExC_parse);
3386         char* s = RExC_parse++;
3387         
3388         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3389             RExC_parse++;
3390         if (RExC_parse == RExC_end)
3391             /* Grandfather lone [:, [=, [. */
3392             RExC_parse = s;
3393         else {
3394             char* t = RExC_parse++; /* skip over the c */
3395
3396             if (UCHARAT(RExC_parse) == ']') {
3397                 RExC_parse++; /* skip over the ending ] */
3398                 posixcc = s + 1;
3399                 if (*s == ':') {
3400                     I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3401                     I32 skip = 5; /* the most common skip */
3402
3403                     switch (*posixcc) {
3404                     case 'a':
3405                         if (strnEQ(posixcc, "alnum", 5))
3406                             namedclass =
3407                                 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3408                         else if (strnEQ(posixcc, "alpha", 5))
3409                             namedclass =
3410                                 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3411                         else if (strnEQ(posixcc, "ascii", 5))
3412                             namedclass =
3413                                 complement ? ANYOF_NASCII : ANYOF_ASCII;
3414                         break;
3415                     case 'b':
3416                         if (strnEQ(posixcc, "blank", 5))
3417                             namedclass =
3418                                 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3419                         break;
3420                     case 'c':
3421                         if (strnEQ(posixcc, "cntrl", 5))
3422                             namedclass =
3423                                 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3424                         break;
3425                     case 'd':
3426                         if (strnEQ(posixcc, "digit", 5))
3427                             namedclass =
3428                                 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3429                         break;
3430                     case 'g':
3431                         if (strnEQ(posixcc, "graph", 5))
3432                             namedclass =
3433                                 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3434                         break;
3435                     case 'l':
3436                         if (strnEQ(posixcc, "lower", 5))
3437                             namedclass =
3438                                 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3439                         break;
3440                     case 'p':
3441                         if (strnEQ(posixcc, "print", 5))
3442                             namedclass =
3443                                 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3444                         else if (strnEQ(posixcc, "punct", 5))
3445                             namedclass =
3446                                 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3447                         break;
3448                     case 's':
3449                         if (strnEQ(posixcc, "space", 5))
3450                             namedclass =
3451                                 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3452                         break;
3453                     case 'u':
3454                         if (strnEQ(posixcc, "upper", 5))
3455                             namedclass =
3456                                 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3457                         break;
3458                     case 'w': /* this is not POSIX, this is the Perl \w */
3459                         if (strnEQ(posixcc, "word", 4)) {
3460                             namedclass =
3461                                 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3462                             skip = 4;
3463                         }
3464                         break;
3465                     case 'x':
3466                         if (strnEQ(posixcc, "xdigit", 6)) {
3467                             namedclass =
3468                                 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3469                             skip = 6;
3470                         }
3471                         break;
3472                     }
3473                     if (namedclass == OOB_NAMEDCLASS ||
3474                         posixcc[skip] != ':' ||
3475                         posixcc[skip+1] != ']')
3476                     {
3477                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3478                                       t - s - 1, s + 1);
3479                     }
3480                 } else if (!SIZE_ONLY) {
3481                     /* [[=foo=]] and [[.foo.]] are still future. */
3482
3483                     /* adjust RExC_parse so the warning shows after
3484                        the class closes */
3485                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3486                         RExC_parse++;
3487                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3488                 }
3489             } else {
3490                 /* Maternal grandfather:
3491                  * "[:" ending in ":" but not in ":]" */
3492                 RExC_parse = s;
3493             }
3494         }
3495     }
3496
3497     return namedclass;
3498 }
3499
3500 STATIC void
3501 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3502 {
3503     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3504         char *s = RExC_parse;
3505         char  c = *s++;
3506
3507         while(*s && isALNUM(*s))
3508             s++;
3509         if (*s && c == *s && s[1] == ']') {
3510             if (ckWARN(WARN_REGEXP))
3511                 vWARN3(s+2,
3512                         "POSIX syntax [%c %c] belongs inside character classes",
3513                         c, c);
3514
3515             /* [[=foo=]] and [[.foo.]] are still future. */
3516             if (POSIXCC_NOTYET(c)) {
3517                 /* adjust RExC_parse so the error shows after
3518                    the class closes */
3519                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3520                     ;
3521                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3522             }
3523         }
3524     }
3525 }
3526
3527 STATIC regnode *
3528 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3529 {
3530     register UV value;
3531     register UV nextvalue;
3532     register IV prevvalue = OOB_UNICODE;
3533     register IV range = 0;
3534     register regnode *ret;
3535     STRLEN numlen;
3536     IV namedclass;
3537     char *rangebegin = 0;
3538     bool need_class = 0;
3539     SV *listsv = Nullsv;
3540     register char *e;
3541     UV n;
3542     bool optimize_invert   = TRUE;
3543     AV* unicode_alternate  = 0;
3544 #ifdef EBCDIC
3545     UV literal_endpoint = 0;
3546 #endif
3547
3548     ret = reganode(pRExC_state, ANYOF, 0);
3549
3550     if (!SIZE_ONLY)
3551         ANYOF_FLAGS(ret) = 0;
3552
3553     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3554         RExC_naughty++;
3555         RExC_parse++;
3556         if (!SIZE_ONLY)
3557             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3558     }
3559
3560     if (SIZE_ONLY)
3561         RExC_size += ANYOF_SKIP;
3562     else {
3563         RExC_emit += ANYOF_SKIP;
3564         if (FOLD)
3565             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3566         if (LOC)
3567             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3568         ANYOF_BITMAP_ZERO(ret);
3569         listsv = newSVpvn("# comment\n", 10);
3570     }
3571
3572     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3573
3574     if (!SIZE_ONLY && POSIXCC(nextvalue))
3575         checkposixcc(pRExC_state);
3576
3577     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3578     if (UCHARAT(RExC_parse) == ']')
3579         goto charclassloop;
3580
3581     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3582
3583     charclassloop:
3584
3585         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3586
3587         if (!range)
3588             rangebegin = RExC_parse;
3589         if (UTF) {
3590             value = utf8n_to_uvchr((U8*)RExC_parse,
3591                                    RExC_end - RExC_parse,
3592                                    &numlen, 0);
3593             RExC_parse += numlen;
3594         }
3595         else
3596             value = UCHARAT(RExC_parse++);
3597         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3598         if (value == '[' && POSIXCC(nextvalue))
3599             namedclass = regpposixcc(pRExC_state, value);
3600         else if (value == '\\') {
3601             if (UTF) {
3602                 value = utf8n_to_uvchr((U8*)RExC_parse,
3603                                    RExC_end - RExC_parse,
3604                                    &numlen, 0);
3605                 RExC_parse += numlen;
3606             }
3607             else
3608                 value = UCHARAT(RExC_parse++);
3609             /* Some compilers cannot handle switching on 64-bit integer
3610              * values, therefore value cannot be an UV.  Yes, this will
3611              * be a problem later if we want switch on Unicode.
3612              * A similar issue a little bit later when switching on
3613              * namedclass. --jhi */
3614             switch ((I32)value) {
3615             case 'w':   namedclass = ANYOF_ALNUM;       break;
3616             case 'W':   namedclass = ANYOF_NALNUM;      break;
3617             case 's':   namedclass = ANYOF_SPACE;       break;
3618             case 'S':   namedclass = ANYOF_NSPACE;      break;
3619             case 'd':   namedclass = ANYOF_DIGIT;       break;
3620             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3621             case 'p':
3622             case 'P':
3623                 if (RExC_parse >= RExC_end)
3624                     vFAIL2("Empty \\%c{}", (U8)value);
3625                 if (*RExC_parse == '{') {
3626                     U8 c = (U8)value;
3627                     e = strchr(RExC_parse++, '}');
3628                     if (!e)
3629                         vFAIL2("Missing right brace on \\%c{}", c);
3630                     while (isSPACE(UCHARAT(RExC_parse)))
3631                         RExC_parse++;
3632                     if (e == RExC_parse)
3633                         vFAIL2("Empty \\%c{}", c);
3634                     n = e - RExC_parse;
3635                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3636                         n--;
3637                 }
3638                 else {
3639                     e = RExC_parse;
3640                     n = 1;
3641                 }
3642                 if (!SIZE_ONLY) {
3643                     if (UCHARAT(RExC_parse) == '^') {
3644                          RExC_parse++;
3645                          n--;
3646                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3647                          while (isSPACE(UCHARAT(RExC_parse))) {
3648                               RExC_parse++;
3649                               n--;
3650                          }
3651                     }
3652                     if (value == 'p')
3653                          Perl_sv_catpvf(aTHX_ listsv,
3654                                         "+utf8::%.*s\n", (int)n, RExC_parse);
3655                     else
3656                          Perl_sv_catpvf(aTHX_ listsv,
3657                                         "!utf8::%.*s\n", (int)n, RExC_parse);
3658                 }
3659                 RExC_parse = e + 1;
3660                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3661                 continue;
3662             case 'n':   value = '\n';                   break;
3663             case 'r':   value = '\r';                   break;
3664             case 't':   value = '\t';                   break;
3665             case 'f':   value = '\f';                   break;
3666             case 'b':   value = '\b';                   break;
3667             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3668             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3669             case 'x':
3670                 if (*RExC_parse == '{') {
3671                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3672                         | PERL_SCAN_DISALLOW_PREFIX;
3673                     e = strchr(RExC_parse++, '}');
3674                     if (!e)
3675                         vFAIL("Missing right brace on \\x{}");
3676
3677                     numlen = e - RExC_parse;
3678                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3679                     RExC_parse = e + 1;
3680                 }
3681                 else {
3682                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3683                     numlen = 2;
3684                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3685                     RExC_parse += numlen;
3686                 }
3687                 break;
3688             case 'c':
3689                 value = UCHARAT(RExC_parse++);
3690                 value = toCTRL(value);
3691                 break;
3692             case '0': case '1': case '2': case '3': case '4':
3693             case '5': case '6': case '7': case '8': case '9':
3694             {
3695                 I32 flags = 0;
3696                 numlen = 3;
3697                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3698                 RExC_parse += numlen;
3699                 break;
3700             }
3701             default:
3702                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3703                     vWARN2(RExC_parse,
3704                            "Unrecognized escape \\%c in character class passed through",
3705                            (int)value);
3706                 break;
3707             }
3708         } /* end of \blah */
3709 #ifdef EBCDIC
3710         else
3711             literal_endpoint++;
3712 #endif
3713
3714         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3715
3716             if (!SIZE_ONLY && !need_class)
3717                 ANYOF_CLASS_ZERO(ret);
3718
3719             need_class = 1;
3720
3721             /* a bad range like a-\d, a-[:digit:] ? */
3722             if (range) {
3723                 if (!SIZE_ONLY) {
3724                     if (ckWARN(WARN_REGEXP))
3725                         vWARN4(RExC_parse,
3726                                "False [] range \"%*.*s\"",
3727                                RExC_parse - rangebegin,
3728                                RExC_parse - rangebegin,
3729                                rangebegin);
3730                     if (prevvalue < 256) {
3731                         ANYOF_BITMAP_SET(ret, prevvalue);
3732                         ANYOF_BITMAP_SET(ret, '-');
3733                     }
3734                     else {
3735                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3736                         Perl_sv_catpvf(aTHX_ listsv,
3737                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3738                     }
3739                 }
3740
3741                 range = 0; /* this was not a true range */
3742             }
3743
3744             if (!SIZE_ONLY) {
3745                 if (namedclass > OOB_NAMEDCLASS)
3746                     optimize_invert = FALSE;
3747                 /* Possible truncation here but in some 64-bit environments
3748                  * the compiler gets heartburn about switch on 64-bit values.
3749                  * A similar issue a little earlier when switching on value.
3750                  * --jhi */
3751                 switch ((I32)namedclass) {
3752                 case ANYOF_ALNUM:
3753                     if (LOC)
3754                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3755                     else {
3756                         for (value = 0; value < 256; value++)
3757                             if (isALNUM(value))
3758                                 ANYOF_BITMAP_SET(ret, value);
3759                     }
3760                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
3761                     break;
3762                 case ANYOF_NALNUM:
3763                     if (LOC)
3764                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3765                     else {
3766                         for (value = 0; value < 256; value++)
3767                             if (!isALNUM(value))
3768                                 ANYOF_BITMAP_SET(ret, value);
3769                     }
3770                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3771                     break;
3772                 case ANYOF_ALNUMC:
3773                     if (LOC)
3774                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3775                     else {
3776                         for (value = 0; value < 256; value++)
3777                             if (isALNUMC(value))
3778                                 ANYOF_BITMAP_SET(ret, value);
3779                     }
3780                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3781                     break;
3782                 case ANYOF_NALNUMC:
3783                     if (LOC)
3784                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3785                     else {
3786                         for (value = 0; value < 256; value++)
3787                             if (!isALNUMC(value))
3788                                 ANYOF_BITMAP_SET(ret, value);
3789                     }
3790                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3791                     break;
3792                 case ANYOF_ALPHA:
3793                     if (LOC)
3794                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3795                     else {
3796                         for (value = 0; value < 256; value++)
3797                             if (isALPHA(value))
3798                                 ANYOF_BITMAP_SET(ret, value);
3799                     }
3800                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3801                     break;
3802                 case ANYOF_NALPHA:
3803                     if (LOC)
3804                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3805                     else {
3806                         for (value = 0; value < 256; value++)
3807                             if (!isALPHA(value))
3808                                 ANYOF_BITMAP_SET(ret, value);
3809                     }
3810                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3811                     break;
3812                 case ANYOF_ASCII:
3813                     if (LOC)
3814                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3815                     else {
3816 #ifndef EBCDIC
3817                         for (value = 0; value < 128; value++)
3818                             ANYOF_BITMAP_SET(ret, value);
3819 #else  /* EBCDIC */
3820                         for (value = 0; value < 256; value++) {
3821                             if (isASCII(value))
3822                                 ANYOF_BITMAP_SET(ret, value);
3823                         }
3824 #endif /* EBCDIC */
3825                     }
3826                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3827                     break;
3828                 case ANYOF_NASCII:
3829                     if (LOC)
3830                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3831                     else {
3832 #ifndef EBCDIC
3833                         for (value = 128; value < 256; value++)
3834                             ANYOF_BITMAP_SET(ret, value);
3835 #else  /* EBCDIC */
3836                         for (value = 0; value < 256; value++) {
3837                             if (!isASCII(value))
3838                                 ANYOF_BITMAP_SET(ret, value);
3839                         }
3840 #endif /* EBCDIC */
3841                     }
3842                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3843                     break;
3844                 case ANYOF_BLANK:
3845                     if (LOC)
3846                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3847                     else {
3848                         for (value = 0; value < 256; value++)
3849                             if (isBLANK(value))
3850                                 ANYOF_BITMAP_SET(ret, value);
3851                     }
3852                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3853                     break;
3854                 case ANYOF_NBLANK:
3855                     if (LOC)
3856                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3857                     else {
3858                         for (value = 0; value < 256; value++)
3859                             if (!isBLANK(value))
3860                                 ANYOF_BITMAP_SET(ret, value);
3861                     }
3862                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3863                     break;
3864                 case ANYOF_CNTRL:
3865                     if (LOC)
3866                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3867                     else {
3868                         for (value = 0; value < 256; value++)
3869                             if (isCNTRL(value))
3870                                 ANYOF_BITMAP_SET(ret, value);
3871                     }
3872                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3873                     break;
3874                 case ANYOF_NCNTRL:
3875                     if (LOC)
3876                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3877                     else {
3878                         for (value = 0; value < 256; value++)
3879                             if (!isCNTRL(value))
3880                                 ANYOF_BITMAP_SET(ret, value);
3881                     }
3882                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3883                     break;
3884                 case ANYOF_DIGIT:
3885                     if (LOC)
3886                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3887                     else {
3888                         /* consecutive digits assumed */
3889                         for (value = '0'; value <= '9'; value++)
3890                             ANYOF_BITMAP_SET(ret, value);
3891                     }
3892                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3893                     break;
3894                 case ANYOF_NDIGIT:
3895                     if (LOC)
3896                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3897                     else {
3898                         /* consecutive digits assumed */
3899                         for (value = 0; value < '0'; value++)
3900                             ANYOF_BITMAP_SET(ret, value);
3901                         for (value = '9' + 1; value < 256; value++)
3902                             ANYOF_BITMAP_SET(ret, value);
3903                     }
3904                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3905                     break;
3906                 case ANYOF_GRAPH:
3907                     if (LOC)
3908                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3909                     else {
3910                         for (value = 0; value < 256; value++)
3911                             if (isGRAPH(value))
3912                                 ANYOF_BITMAP_SET(ret, value);
3913                     }
3914                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3915                     break;
3916                 case ANYOF_NGRAPH:
3917                     if (LOC)
3918                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3919                     else {
3920                         for (value = 0; value < 256; value++)
3921                             if (!isGRAPH(value))
3922                                 ANYOF_BITMAP_SET(ret, value);
3923                     }
3924                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3925                     break;
3926                 case ANYOF_LOWER:
3927                     if (LOC)
3928                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3929                     else {
3930                         for (value = 0; value < 256; value++)
3931                             if (isLOWER(value))
3932                                 ANYOF_BITMAP_SET(ret, value);
3933                     }
3934                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3935                     break;
3936                 case ANYOF_NLOWER:
3937                     if (LOC)
3938                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3939                     else {
3940                         for (value = 0; value < 256; value++)
3941                             if (!isLOWER(value))
3942                                 ANYOF_BITMAP_SET(ret, value);
3943                     }
3944                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3945                     break;
3946                 case ANYOF_PRINT:
3947                     if (LOC)
3948                         ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3949                     else {
3950                         for (value = 0; value < 256; value++)
3951                             if (isPRINT(value))
3952                                 ANYOF_BITMAP_SET(ret, value);
3953                     }
3954                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3955                     break;
3956                 case ANYOF_NPRINT:
3957                     if (LOC)
3958                         ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3959                     else {
3960                         for (value = 0; value < 256; value++)
3961                             if (!isPRINT(value))
3962                                 ANYOF_BITMAP_SET(ret, value);
3963                     }
3964                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3965                     break;
3966                 case ANYOF_PSXSPC:
3967                     if (LOC)
3968                         ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3969                     else {
3970                         for (value = 0; value < 256; value++)
3971                             if (isPSXSPC(value))
3972                                 ANYOF_BITMAP_SET(ret, value);
3973                     }
3974                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3975                     break;
3976                 case ANYOF_NPSXSPC:
3977                     if (LOC)
3978                         ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3979                     else {
3980                         for (value = 0; value < 256; value++)
3981                             if (!isPSXSPC(value))
3982                                 ANYOF_BITMAP_SET(ret, value);
3983                     }
3984                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3985                     break;
3986                 case ANYOF_PUNCT:
3987                     if (LOC)
3988                         ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3989                     else {
3990                         for (value = 0; value < 256; value++)
3991                             if (isPUNCT(value))
3992                                 ANYOF_BITMAP_SET(ret, value);
3993                     }
3994                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3995                     break;
3996                 case ANYOF_NPUNCT:
3997                     if (LOC)
3998                         ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3999                     else {
4000                         for (value = 0; value < 256; value++)
4001                             if (!isPUNCT(value))
4002                                 ANYOF_BITMAP_SET(ret, value);
4003                     }
4004                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
4005                     break;
4006                 case ANYOF_SPACE:
4007                     if (LOC)
4008                         ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4009                     else {
4010                         for (value = 0; value < 256; value++)
4011                             if (isSPACE(value))
4012                                 ANYOF_BITMAP_SET(ret, value);
4013                     }
4014                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
4015                     break;
4016                 case ANYOF_NSPACE:
4017                     if (LOC)
4018                         ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4019                     else {
4020                         for (value = 0; value < 256; value++)
4021                             if (!isSPACE(value))
4022                                 ANYOF_BITMAP_SET(ret, value);
4023                     }
4024                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4025                     break;
4026                 case ANYOF_UPPER:
4027                     if (LOC)
4028                         ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4029                     else {
4030                         for (value = 0; value < 256; value++)
4031                             if (isUPPER(value))
4032                                 ANYOF_BITMAP_SET(ret, value);
4033                     }
4034                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4035                     break;
4036                 case ANYOF_NUPPER:
4037                     if (LOC)
4038                         ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4039                     else {
4040                         for (value = 0; value < 256; value++)
4041                             if (!isUPPER(value))
4042                                 ANYOF_BITMAP_SET(ret, value);
4043                     }
4044                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4045                     break;
4046                 case ANYOF_XDIGIT:
4047                     if (LOC)
4048                         ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4049                     else {
4050                         for (value = 0; value < 256; value++)
4051                             if (isXDIGIT(value))
4052                                 ANYOF_BITMAP_SET(ret, value);
4053                     }
4054                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4055                     break;
4056                 case ANYOF_NXDIGIT:
4057                     if (LOC)
4058                         ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4059                     else {
4060                         for (value = 0; value < 256; value++)
4061                             if (!isXDIGIT(value))
4062                                 ANYOF_BITMAP_SET(ret, value);
4063                     }
4064                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4065                     break;
4066                 default:
4067                     vFAIL("Invalid [::] class");
4068                     break;
4069                 }
4070                 if (LOC)
4071                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4072                 continue;
4073             }
4074         } /* end of namedclass \blah */
4075
4076         if (range) {
4077             if (prevvalue > (IV)value) /* b-a */ {
4078                 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4079                               RExC_parse - rangebegin,
4080                               RExC_parse - rangebegin,
4081                               rangebegin);
4082                 range = 0; /* not a valid range */
4083             }
4084         }
4085         else {
4086             prevvalue = value; /* save the beginning of the range */
4087             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4088                 RExC_parse[1] != ']') {
4089                 RExC_parse++;
4090
4091                 /* a bad range like \w-, [:word:]- ? */
4092                 if (namedclass > OOB_NAMEDCLASS) {
4093                     if (ckWARN(WARN_REGEXP))
4094                         vWARN4(RExC_parse,
4095                                "False [] range \"%*.*s\"",
4096                                RExC_parse - rangebegin,
4097                                RExC_parse - rangebegin,
4098                                rangebegin);
4099                     if (!SIZE_ONLY)
4100                         ANYOF_BITMAP_SET(ret, '-');
4101                 } else
4102                     range = 1;  /* yeah, it's a range! */
4103                 continue;       /* but do it the next time */
4104             }
4105         }
4106
4107         /* now is the next time */
4108         if (!SIZE_ONLY) {
4109             IV i;
4110
4111             if (prevvalue < 256) {
4112                 IV ceilvalue = value < 256 ? value : 255;
4113
4114 #ifdef EBCDIC
4115                 /* In EBCDIC [\x89-\x91] should include
4116                  * the \x8e but [i-j] should not. */
4117                 if (literal_endpoint == 2 &&
4118                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4119                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4120                 {
4121                     if (isLOWER(prevvalue)) {
4122                         for (i = prevvalue; i <= ceilvalue; i++)
4123                             if (isLOWER(i))
4124                                 ANYOF_BITMAP_SET(ret, i);
4125                     } else {
4126                         for (i = prevvalue; i <= ceilvalue; i++)
4127                             if (isUPPER(i))
4128                                 ANYOF_BITMAP_SET(ret, i);
4129                     }
4130                 }
4131                 else
4132 #endif
4133                       for (i = prevvalue; i <= ceilvalue; i++)
4134                           ANYOF_BITMAP_SET(ret, i);
4135           }
4136           if (value > 255 || UTF) {
4137                 UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
4138                 UV natvalue      = NATIVE_TO_UNI(value);
4139
4140                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4141                 if (prevnatvalue < natvalue) { /* what about > ? */
4142                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4143                                    prevnatvalue, natvalue);
4144                 }
4145                 else if (prevnatvalue == natvalue) {
4146                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4147                     if (FOLD) {
4148                          U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4149                          STRLEN foldlen;
4150                          UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4151
4152                          /* If folding and foldable and a single
4153                           * character, insert also the folded version
4154                           * to the charclass. */
4155                          if (f != value) {
4156                               if (foldlen == (STRLEN)UNISKIP(f))
4157                                   Perl_sv_catpvf(aTHX_ listsv,
4158                                                  "%04"UVxf"\n", f);
4159                               else {
4160                                   /* Any multicharacter foldings
4161                                    * require the following transform:
4162                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4163                                    * where E folds into "pq" and F folds
4164                                    * into "rst", all other characters
4165                                    * fold to single characters.  We save
4166                                    * away these multicharacter foldings,
4167                                    * to be later saved as part of the
4168                                    * additional "s" data. */
4169                                   SV *sv;
4170
4171                                   if (!unicode_alternate)
4172                                       unicode_alternate = newAV();
4173                                   sv = newSVpvn((char*)foldbuf, foldlen);
4174                                   SvUTF8_on(sv);
4175                                   av_push(unicode_alternate, sv);
4176                               }
4177                          }
4178
4179                          /* If folding and the value is one of the Greek
4180                           * sigmas insert a few more sigmas to make the
4181                           * folding rules of the sigmas to work right.
4182                           * Note that not all the possible combinations
4183                           * are handled here: some of them are handled
4184                           * by the standard folding rules, and some of
4185                           * them (literal or EXACTF cases) are handled
4186                           * during runtime in regexec.c:S_find_byclass(). */
4187                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4188                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4189                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4190                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4191                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4192                          }
4193                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4194                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4195                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4196                     }
4197                 }
4198             }
4199 #ifdef EBCDIC
4200             literal_endpoint = 0;
4201 #endif
4202         }
4203
4204         range = 0; /* this range (if it was one) is done now */
4205     }
4206
4207     if (need_class) {
4208         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4209         if (SIZE_ONLY)
4210             RExC_size += ANYOF_CLASS_ADD_SKIP;
4211         else
4212             RExC_emit += ANYOF_CLASS_ADD_SKIP;
4213     }
4214
4215     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4216     if (!SIZE_ONLY &&
4217          /* If the only flag is folding (plus possibly inversion). */
4218         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4219        ) {
4220         for (value = 0; value < 256; ++value) {
4221             if (ANYOF_BITMAP_TEST(ret, value)) {
4222                 UV fold = PL_fold[value];
4223
4224                 if (fold != value)
4225                     ANYOF_BITMAP_SET(ret, fold);
4226             }
4227         }
4228         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4229     }
4230
4231     /* optimize inverted simple patterns (e.g. [^a-z]) */
4232     if (!SIZE_ONLY && optimize_invert &&
4233         /* If the only flag is inversion. */
4234         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4235         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4236             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4237         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4238     }
4239
4240     if (!SIZE_ONLY) {
4241         AV *av = newAV();
4242         SV *rv;
4243
4244         /* The 0th element stores the character class description
4245          * in its textual form: used later (regexec.c:Perl_regclass_swash())
4246          * to initialize the appropriate swash (which gets stored in
4247          * the 1st element), and also useful for dumping the regnode.
4248          * The 2nd element stores the multicharacter foldings,
4249          * used later (regexec.c:S_reginclass()). */
4250         av_store(av, 0, listsv);
4251         av_store(av, 1, NULL);
4252         av_store(av, 2, (SV*)unicode_alternate);
4253         rv = newRV_noinc((SV*)av);
4254         n = add_data(pRExC_state, 1, "s");
4255         RExC_rx->data->data[n] = (void*)rv;
4256         ARG_SET(ret, n);
4257     }
4258
4259     return ret;
4260 }
4261
4262 STATIC char*
4263 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4264 {
4265     char* retval = RExC_parse++;
4266
4267     for (;;) {
4268         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4269                 RExC_parse[2] == '#') {
4270             while (*RExC_parse && *RExC_parse != ')')
4271                 RExC_parse++;
4272             RExC_parse++;
4273             continue;
4274         }
4275         if (RExC_flags & PMf_EXTENDED) {
4276             if (isSPACE(*RExC_parse)) {
4277                 RExC_parse++;
4278                 continue;
4279             }
4280             else if (*RExC_parse == '#') {
4281                 while (*RExC_parse && *RExC_parse != '\n')
4282                     RExC_parse++;
4283                 RExC_parse++;
4284                 continue;
4285             }
4286         }
4287         return retval;
4288     }
4289 }
4290
4291 /*
4292 - reg_node - emit a node
4293 */
4294 STATIC regnode *                        /* Location. */
4295 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4296 {
4297     register regnode *ret;
4298     register regnode *ptr;
4299
4300     ret = RExC_emit;
4301     if (SIZE_ONLY) {
4302         SIZE_ALIGN(RExC_size);
4303         RExC_size += 1;
4304         return(ret);
4305     }
4306
4307     NODE_ALIGN_FILL(ret);
4308     ptr = ret;
4309     FILL_ADVANCE_NODE(ptr, op);
4310     if (RExC_offsets) {         /* MJD */
4311         MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", 
4312               "reg_node", __LINE__, 
4313               reg_name[op],
4314               RExC_emit - RExC_emit_start > RExC_offsets[0] 
4315               ? "Overwriting end of array!\n" : "OK",
4316               RExC_emit - RExC_emit_start,
4317               RExC_parse - RExC_start,
4318               RExC_offsets[0])); 
4319         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4320     }
4321             
4322     RExC_emit = ptr;
4323
4324     return(ret);
4325 }
4326
4327 /*
4328 - reganode - emit a node with an argument
4329 */
4330 STATIC regnode *                        /* Location. */
4331 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4332 {
4333     register regnode *ret;
4334     register regnode *ptr;
4335
4336     ret = RExC_emit;
4337     if (SIZE_ONLY) {
4338         SIZE_ALIGN(RExC_size);
4339         RExC_size += 2;
4340         return(ret);
4341     }
4342
4343     NODE_ALIGN_FILL(ret);
4344     ptr = ret;
4345     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4346     if (RExC_offsets) {         /* MJD */
4347         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4348               "reganode",
4349               __LINE__,
4350               reg_name[op],
4351               RExC_emit - RExC_emit_start > RExC_offsets[0] ? 
4352               "Overwriting end of array!\n" : "OK",
4353               RExC_emit - RExC_emit_start,
4354               RExC_parse - RExC_start,
4355               RExC_offsets[0])); 
4356         Set_Cur_Node_Offset;
4357     }
4358             
4359     RExC_emit = ptr;
4360
4361     return(ret);
4362 }
4363
4364 /*
4365 - reguni - emit (if appropriate) a Unicode character
4366 */
4367 STATIC void
4368 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4369 {
4370     *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4371 }
4372
4373 /*
4374 - reginsert - insert an operator in front of already-emitted operand
4375 *
4376 * Means relocating the operand.
4377 */
4378 STATIC void
4379 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4380 {
4381     register regnode *src;
4382     register regnode *dst;
4383     register regnode *place;
4384     register int offset = regarglen[(U8)op];
4385
4386 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4387
4388     if (SIZE_ONLY) {
4389         RExC_size += NODE_STEP_REGNODE + offset;
4390         return;
4391     }
4392
4393     src = RExC_emit;
4394     RExC_emit += NODE_STEP_REGNODE + offset;
4395     dst = RExC_emit;
4396     while (src > opnd) {
4397         StructCopy(--src, --dst, regnode);
4398         if (RExC_offsets) {     /* MJD 20010112 */
4399             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4400                   "reg_insert",
4401                   __LINE__,
4402                   reg_name[op],
4403                   dst - RExC_emit_start > RExC_offsets[0] 
4404                   ? "Overwriting end of array!\n" : "OK",
4405                   src - RExC_emit_start,
4406                   dst - RExC_emit_start,
4407                   RExC_offsets[0])); 
4408             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4409             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4410         }
4411     }
4412     
4413
4414     place = opnd;               /* Op node, where operand used to be. */
4415     if (RExC_offsets) {         /* MJD */
4416         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4417               "reginsert",
4418               __LINE__,
4419               reg_name[op],
4420               place - RExC_emit_start > RExC_offsets[0] 
4421               ? "Overwriting end of array!\n" : "OK",
4422               place - RExC_emit_start,
4423               RExC_parse - RExC_start,
4424               RExC_offsets[0])); 
4425         Set_Node_Offset(place, RExC_parse);
4426     }
4427     src = NEXTOPER(place);
4428     FILL_ADVANCE_NODE(place, op);
4429     Zero(src, offset, regnode);
4430 }
4431
4432 /*
4433 - regtail - set the next-pointer at the end of a node chain of p to val.
4434 */
4435 STATIC void
4436 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4437 {
4438     register regnode *scan;
4439     register regnode *temp;
4440
4441     if (SIZE_ONLY)
4442         return;
4443
4444     /* Find last node. */
4445     scan = p;
4446     for (;;) {
4447         temp = regnext(scan);
4448         if (temp == NULL)
4449             break;
4450         scan = temp;
4451     }
4452
4453     if (reg_off_by_arg[OP(scan)]) {
4454         ARG_SET(scan, val - scan);
4455     }
4456     else {
4457         NEXT_OFF(scan) = val - scan;
4458     }
4459 }
4460
4461 /*
4462 - regoptail - regtail on operand of first argument; nop if operandless
4463 */
4464 STATIC void
4465 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4466 {
4467     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4468     if (p == NULL || SIZE_ONLY)
4469         return;
4470     if (PL_regkind[(U8)OP(p)] == BRANCH) {
4471         regtail(pRExC_state, NEXTOPER(p), val);
4472     }
4473     else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4474         regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4475     }
4476     else
4477         return;
4478 }
4479
4480 /*
4481  - regcurly - a little FSA that accepts {\d+,?\d*}
4482  */
4483 STATIC I32
4484 S_regcurly(pTHX_ register char *s)
4485 {
4486     if (*s++ != '{')
4487         return FALSE;
4488     if (!isDIGIT(*s))
4489         return FALSE;
4490     while (isDIGIT(*s))
4491         s++;
4492     if (*s == ',')
4493         s++;
4494     while (isDIGIT(*s))
4495         s++;
4496     if (*s != '}')
4497         return FALSE;
4498     return TRUE;
4499 }
4500
4501
4502 #ifdef DEBUGGING
4503
4504 STATIC regnode *
4505 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4506 {
4507     register U8 op = EXACT;     /* Arbitrary non-END op. */
4508     register regnode *next;
4509
4510     while (op != END && (!last || node < last)) {
4511         /* While that wasn't END last time... */
4512
4513         NODE_ALIGN(node);
4514         op = OP(node);
4515         if (op == CLOSE)
4516             l--;        
4517         next = regnext(node);
4518         /* Where, what. */
4519         if (OP(node) == OPTIMIZED)
4520             goto after_print;
4521         regprop(sv, node);
4522         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4523                       (int)(2*l + 1), "", SvPVX(sv));
4524         if (next == NULL)               /* Next ptr. */
4525             PerlIO_printf(Perl_debug_log, "(0)");
4526         else
4527             PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4528         (void)PerlIO_putc(Perl_debug_log, '\n');
4529       after_print:
4530         if (PL_regkind[(U8)op] == BRANCHJ) {
4531             register regnode *nnode = (OP(next) == LONGJMP
4532                                        ? regnext(next)
4533                                        : next);
4534             if (last && nnode > last)
4535                 nnode = last;
4536             node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4537         }
4538         else if (PL_regkind[(U8)op] == BRANCH) {
4539             node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4540         }
4541         else if ( op == CURLY) {   /* `next' might be very big: optimizer */
4542             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4543                              NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4544         }
4545         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4546             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4547                              next, sv, l + 1);
4548         }
4549         else if ( op == PLUS || op == STAR) {
4550             node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4551         }
4552         else if (op == ANYOF) {
4553             /* arglen 1 + class block */
4554             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4555                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4556             node = NEXTOPER(node);
4557         }
4558         else if (PL_regkind[(U8)op] == EXACT) {
4559             /* Literal string, where present. */
4560             node += NODE_SZ_STR(node) - 1;
4561             node = NEXTOPER(node);
4562         }
4563         else {
4564             node = NEXTOPER(node);
4565             node += regarglen[(U8)op];
4566         }
4567         if (op == CURLYX || op == OPEN)
4568             l++;
4569         else if (op == WHILEM)
4570             l--;
4571     }
4572     return node;
4573 }
4574
4575 #endif  /* DEBUGGING */
4576
4577 /*
4578  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4579  */
4580 void
4581 Perl_regdump(pTHX_ regexp *r)
4582 {
4583 #ifdef DEBUGGING
4584     SV *sv = sv_newmortal();
4585
4586     (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4587
4588     /* Header fields of interest. */
4589     if (r->anchored_substr)
4590         PerlIO_printf(Perl_debug_log,
4591                       "anchored `%s%.*s%s'%s at %"IVdf" ",
4592                       PL_colors[0],
4593                       (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4594                       SvPVX(r->anchored_substr),
4595                       PL_colors[1],
4596                       SvTAIL(r->anchored_substr) ? "$" : "",
4597                       (IV)r->anchored_offset);
4598     else if (r->anchored_utf8)
4599         PerlIO_printf(Perl_debug_log,
4600                       "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4601                       PL_colors[0],
4602                       (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4603                       SvPVX(r->anchored_utf8),
4604                       PL_colors[1],
4605                       SvTAIL(r->anchored_utf8) ? "$" : "",
4606                       (IV)r->anchored_offset);
4607     if (r->float_substr)
4608         PerlIO_printf(Perl_debug_log,
4609                       "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4610                       PL_colors[0],
4611                       (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4612                       SvPVX(r->float_substr),
4613                       PL_colors[1],
4614                       SvTAIL(r->float_substr) ? "$" : "",
4615                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4616     else if (r->float_utf8)
4617         PerlIO_printf(Perl_debug_log,
4618                       "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4619                       PL_colors[0],
4620                       (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4621                       SvPVX(r->float_utf8),
4622                       PL_colors[1],
4623                       SvTAIL(r->float_utf8) ? "$" : "",
4624                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4625     if (r->check_substr || r->check_utf8)
4626         PerlIO_printf(Perl_debug_log,
4627                       r->check_substr == r->float_substr
4628                       && r->check_utf8 == r->float_utf8
4629                       ? "(checking floating" : "(checking anchored");
4630     if (r->reganch & ROPT_NOSCAN)
4631         PerlIO_printf(Perl_debug_log, " noscan");
4632     if (r->reganch & ROPT_CHECK_ALL)
4633         PerlIO_printf(Perl_debug_log, " isall");
4634     if (r->check_substr || r->check_utf8)
4635         PerlIO_printf(Perl_debug_log, ") ");
4636
4637     if (r->regstclass) {
4638         regprop(sv, r->regstclass);
4639         PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4640     }
4641     if (r->reganch & ROPT_ANCH) {
4642         PerlIO_printf(Perl_debug_log, "anchored");
4643         if (r->reganch & ROPT_ANCH_BOL)
4644             PerlIO_printf(Perl_debug_log, "(BOL)");
4645         if (r->reganch & ROPT_ANCH_MBOL)
4646             PerlIO_printf(Perl_debug_log, "(MBOL)");
4647         if (r->reganch & ROPT_ANCH_SBOL)
4648             PerlIO_printf(Perl_debug_log, "(SBOL)");
4649         if (r->reganch & ROPT_ANCH_GPOS)
4650             PerlIO_printf(Perl_debug_log, "(GPOS)");
4651         PerlIO_putc(Perl_debug_log, ' ');
4652     }
4653     if (r->reganch & ROPT_GPOS_SEEN)
4654         PerlIO_printf(Perl_debug_log, "GPOS ");
4655     if (r->reganch & ROPT_SKIP)
4656         PerlIO_printf(Perl_debug_log, "plus ");
4657     if (r->reganch & ROPT_IMPLICIT)
4658         PerlIO_printf(Perl_debug_log, "implicit ");
4659     PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4660     if (r->reganch & ROPT_EVAL_SEEN)
4661         PerlIO_printf(Perl_debug_log, "with eval ");
4662     PerlIO_printf(Perl_debug_log, "\n");
4663     if (r->offsets) {
4664       U32 i;
4665       U32 len = r->offsets[0];
4666       PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4667       for (i = 1; i <= len; i++)
4668         PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ", 
4669                       (UV)r->offsets[i*2-1], 
4670                       (UV)r->offsets[i*2]);
4671       PerlIO_printf(Perl_debug_log, "\n");
4672     }
4673 #endif  /* DEBUGGING */
4674 }
4675
4676 #ifdef DEBUGGING
4677
4678 STATIC void
4679 S_put_byte(pTHX_ SV *sv, int c)
4680 {
4681     if (isCNTRL(c) || c == 255 || !isPRINT(c))
4682         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4683     else if (c == '-' || c == ']' || c == '\\' || c == '^')
4684         Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4685     else
4686         Perl_sv_catpvf(aTHX_ sv, "%c", c);
4687 }
4688
4689 #endif  /* DEBUGGING */
4690
4691 /*
4692 - regprop - printable representation of opcode
4693 */
4694 void
4695 Perl_regprop(pTHX_ SV *sv, regnode *o)
4696 {
4697 #ifdef DEBUGGING
4698     register int k;
4699
4700     sv_setpvn(sv, "", 0);
4701     if (OP(o) >= reg_num)               /* regnode.type is unsigned */
4702         /* It would be nice to FAIL() here, but this may be called from
4703            regexec.c, and it would be hard to supply pRExC_state. */
4704         Perl_croak(aTHX_ "Corrupted regexp opcode");
4705     sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4706
4707     k = PL_regkind[(U8)OP(o)];
4708
4709     if (k == EXACT) {
4710         SV *dsv = sv_2mortal(newSVpvn("", 0));
4711         /* Using is_utf8_string() is a crude hack but it may
4712          * be the best for now since we have no flag "this EXACTish
4713          * node was UTF-8" --jhi */
4714         bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4715         char *s    = do_utf8 ?
4716           pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4717                          UNI_DISPLAY_REGEX) :
4718           STRING(o);
4719         int len = do_utf8 ?
4720           strlen(s) :
4721           STR_LEN(o);
4722         Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4723                        PL_colors[0],
4724                        len, s,
4725                        PL_colors[1]);
4726     }
4727     else if (k == CURLY) {
4728         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4729             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4730         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4731     }
4732     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
4733         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4734     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4735         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
4736     else if (k == LOGICAL)
4737         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
4738     else if (k == ANYOF) {
4739         int i, rangestart = -1;
4740         U8 flags = ANYOF_FLAGS(o);
4741         const char * const anyofs[] = { /* Should be synchronized with
4742                                          * ANYOF_ #xdefines in regcomp.h */
4743             "\\w",
4744             "\\W",
4745             "\\s",
4746             "\\S",
4747             "\\d",
4748             "\\D",
4749             "[:alnum:]",
4750             "[:^alnum:]",
4751             "[:alpha:]",
4752             "[:^alpha:]",
4753             "[:ascii:]",
4754             "[:^ascii:]",
4755             "[:ctrl:]",
4756             "[:^ctrl:]",
4757             "[:graph:]",
4758             "[:^graph:]",
4759             "[:lower:]",
4760             "[:^lower:]",
4761             "[:print:]",
4762             "[:^print:]",
4763             "[:punct:]",
4764             "[:^punct:]",
4765             "[:upper:]",
4766             "[:^upper:]",
4767             "[:xdigit:]",
4768             "[:^xdigit:]",
4769             "[:space:]",
4770             "[:^space:]",
4771             "[:blank:]",
4772             "[:^blank:]"
4773         };
4774
4775         if (flags & ANYOF_LOCALE)
4776             sv_catpv(sv, "{loc}");
4777         if (flags & ANYOF_FOLD)
4778             sv_catpv(sv, "{i}");
4779         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4780         if (flags & ANYOF_INVERT)
4781             sv_catpv(sv, "^");
4782         for (i = 0; i <= 256; i++) {
4783             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4784                 if (rangestart == -1)
4785                     rangestart = i;
4786             } else if (rangestart != -1) {
4787                 if (i <= rangestart + 3)
4788                     for (; rangestart < i; rangestart++)
4789                         put_byte(sv, rangestart);
4790                 else {
4791                     put_byte(sv, rangestart);
4792                     sv_catpv(sv, "-");
4793                     put_byte(sv, i - 1);
4794                 }
4795                 rangestart = -1;
4796             }
4797         }
4798
4799         if (o->flags & ANYOF_CLASS)
4800             for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4801                 if (ANYOF_CLASS_TEST(o,i))
4802                     sv_catpv(sv, anyofs[i]);
4803
4804         if (flags & ANYOF_UNICODE)
4805             sv_catpv(sv, "{unicode}");
4806         else if (flags & ANYOF_UNICODE_ALL)
4807             sv_catpv(sv, "{unicode_all}");
4808
4809         {
4810             SV *lv;
4811             SV *sw = regclass_swash(o, FALSE, &lv, 0);
4812         
4813             if (lv) {
4814                 if (sw) {
4815                     U8 s[UTF8_MAXLEN+1];
4816                 
4817                     for (i = 0; i <= 256; i++) { /* just the first 256 */
4818                         U8 *e = uvchr_to_utf8(s, i);
4819                         
4820                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
4821                             if (rangestart == -1)
4822                                 rangestart = i;
4823                         } else if (rangestart != -1) {
4824                             U8 *p;
4825                         
4826                             if (i <= rangestart + 3)
4827                                 for (; rangestart < i; rangestart++) {
4828                                     for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4829                                         put_byte(sv, *p);
4830                                 }
4831                             else {
4832                                 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4833                                     put_byte(sv, *p);
4834                                 sv_catpv(sv, "-");
4835                                     for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4836                                         put_byte(sv, *p);
4837                                 }
4838                                 rangestart = -1;
4839                             }
4840                         }
4841                         
4842                     sv_catpv(sv, "..."); /* et cetera */
4843                 }
4844
4845                 {
4846                     char *s = savepv(SvPVX(lv));
4847                     char *origs = s;
4848                 
4849                     while(*s && *s != '\n') s++;
4850                 
4851                     if (*s == '\n') {
4852                         char *t = ++s;
4853                         
4854                         while (*s) {
4855                             if (*s == '\n')
4856                                 *s = ' ';
4857                             s++;
4858                         }
4859                         if (s[-1] == ' ')
4860                             s[-1] = 0;
4861                         
4862                         sv_catpv(sv, t);
4863                     }
4864                 
4865                     Safefree(origs);
4866                 }
4867             }
4868         }
4869
4870         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4871     }
4872     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4873         Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4874 #endif  /* DEBUGGING */
4875 }
4876
4877 SV *
4878 Perl_re_intuit_string(pTHX_ regexp *prog)
4879 {                               /* Assume that RE_INTUIT is set */
4880     DEBUG_r(
4881         {   STRLEN n_a;
4882             char *s = SvPV(prog->check_substr
4883                       ? prog->check_substr : prog->check_utf8, n_a);
4884
4885             if (!PL_colorset) reginitcolors();
4886             PerlIO_printf(Perl_debug_log,
4887                       "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4888                       PL_colors[4],
4889                       prog->check_substr ? "" : "utf8 ",
4890                       PL_colors[5],PL_colors[0],
4891                       s,
4892                       PL_colors[1],
4893                       (strlen(s) > 60 ? "..." : ""));
4894         } );
4895
4896     return prog->check_substr ? prog->check_substr : prog->check_utf8;
4897 }
4898
4899 void
4900 Perl_pregfree(pTHX_ struct regexp *r)
4901 {
4902 #ifdef DEBUGGING
4903     SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4904 #endif
4905
4906     if (!r || (--r->refcnt > 0))
4907         return;
4908     DEBUG_r({
4909          int len;
4910          char *s;
4911
4912          s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4913                 r->prelen, 60, UNI_DISPLAY_REGEX)
4914             : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4915          len = SvCUR(dsv);
4916          if (!PL_colorset)
4917               reginitcolors();
4918          PerlIO_printf(Perl_debug_log,
4919                        "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4920                        PL_colors[4],PL_colors[5],PL_colors[0],
4921                        len, len, s,
4922                        PL_colors[1],
4923                        len > 60 ? "..." : "");
4924     });
4925
4926     if (r->precomp)
4927         Safefree(r->precomp);
4928     if (r->offsets)             /* 20010421 MJD */
4929         Safefree(r->offsets);
4930     RX_MATCH_COPY_FREE(r);
4931 #ifdef PERL_COPY_ON_WRITE
4932     if (r->saved_copy)
4933         SvREFCNT_dec(r->saved_copy);
4934 #endif
4935     if (r->substrs) {
4936         if (r->anchored_substr)
4937             SvREFCNT_dec(r->anchored_substr);
4938         if (r->anchored_utf8)
4939             SvREFCNT_dec(r->anchored_utf8);
4940         if (r->float_substr)
4941             SvREFCNT_dec(r->float_substr);
4942         if (r->float_utf8)
4943             SvREFCNT_dec(r->float_utf8);
4944         Safefree(r->substrs);
4945     }
4946     if (r->data) {
4947         int n = r->data->count;
4948         PAD* new_comppad = NULL;
4949         PAD* old_comppad;
4950
4951         while (--n >= 0) {
4952           /* If you add a ->what type here, update the comment in regcomp.h */
4953             switch (r->data->what[n]) {
4954             case 's':
4955                 SvREFCNT_dec((SV*)r->data->data[n]);
4956                 break;
4957             case 'f':
4958                 Safefree(r->data->data[n]);
4959                 break;
4960             case 'p':
4961                 new_comppad = (AV*)r->data->data[n];
4962                 break;
4963             case 'o':
4964                 if (new_comppad == NULL)
4965                     Perl_croak(aTHX_ "panic: pregfree comppad");
4966                 PAD_SAVE_LOCAL(old_comppad,
4967                     /* Watch out for global destruction's random ordering. */
4968                     (SvTYPE(new_comppad) == SVt_PVAV) ?
4969                                 new_comppad : Null(PAD *)
4970                 );
4971                 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4972                     op_free((OP_4tree*)r->data->data[n]);
4973                 }
4974
4975                 PAD_RESTORE_LOCAL(old_comppad);
4976                 SvREFCNT_dec((SV*)new_comppad);
4977                 new_comppad = NULL;
4978                 break;
4979             case 'n':
4980                 break;
4981             default:
4982                 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4983             }
4984         }
4985         Safefree(r->data->what);
4986         Safefree(r->data);
4987     }
4988     Safefree(r->startp);
4989     Safefree(r->endp);
4990     Safefree(r);
4991 }
4992
4993 /*
4994  - regnext - dig the "next" pointer out of a node
4995  *
4996  * [Note, when REGALIGN is defined there are two places in regmatch()
4997  * that bypass this code for speed.]
4998  */
4999 regnode *
5000 Perl_regnext(pTHX_ register regnode *p)
5001 {
5002     register I32 offset;
5003
5004     if (p == &PL_regdummy)
5005         return(NULL);
5006
5007     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5008     if (offset == 0)
5009         return(NULL);
5010
5011     return(p+offset);
5012 }
5013
5014 STATIC void     
5015 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
5016 {
5017     va_list args;
5018     STRLEN l1 = strlen(pat1);
5019     STRLEN l2 = strlen(pat2);
5020     char buf[512];
5021     SV *msv;
5022     char *message;
5023
5024     if (l1 > 510)
5025         l1 = 510;
5026     if (l1 + l2 > 510)
5027         l2 = 510 - l1;
5028     Copy(pat1, buf, l1 , char);
5029     Copy(pat2, buf + l1, l2 , char);
5030     buf[l1 + l2] = '\n';
5031     buf[l1 + l2 + 1] = '\0';
5032 #ifdef I_STDARG
5033     /* ANSI variant takes additional second argument */
5034     va_start(args, pat2);
5035 #else
5036     va_start(args);
5037 #endif
5038     msv = vmess(buf, &args);
5039     va_end(args);
5040     message = SvPV(msv,l1);
5041     if (l1 > 512)
5042         l1 = 512;
5043     Copy(message, buf, l1 , char);
5044     buf[l1] = '\0';                     /* Overwrite \n */
5045     Perl_croak(aTHX_ "%s", buf);
5046 }
5047
5048 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
5049
5050 void
5051 Perl_save_re_context(pTHX)
5052 {
5053     SAVEI32(PL_reg_flags);              /* from regexec.c */
5054     SAVEPPTR(PL_bostr);
5055     SAVEPPTR(PL_reginput);              /* String-input pointer. */
5056     SAVEPPTR(PL_regbol);                /* Beginning of input, for ^ check. */
5057     SAVEPPTR(PL_regeol);                /* End of input, for $ check. */
5058     SAVEVPTR(PL_regstartp);             /* Pointer to startp array. */
5059     SAVEVPTR(PL_regendp);               /* Ditto for endp. */
5060     SAVEVPTR(PL_reglastparen);          /* Similarly for lastparen. */
5061     SAVEVPTR(PL_reglastcloseparen);     /* Similarly for lastcloseparen. */
5062     SAVEPPTR(PL_regtill);               /* How far we are required to go. */
5063     SAVEGENERICPV(PL_reg_start_tmp);            /* from regexec.c */
5064     PL_reg_start_tmp = 0;
5065     SAVEI32(PL_reg_start_tmpl);         /* from regexec.c */
5066     PL_reg_start_tmpl = 0;
5067     SAVEVPTR(PL_regdata);
5068     SAVEI32(PL_reg_eval_set);           /* from regexec.c */
5069     SAVEI32(PL_regnarrate);             /* from regexec.c */
5070     SAVEVPTR(PL_regprogram);            /* from regexec.c */
5071     SAVEINT(PL_regindent);              /* from regexec.c */
5072     SAVEVPTR(PL_regcc);                 /* from regexec.c */
5073     SAVEVPTR(PL_curcop);
5074     SAVEVPTR(PL_reg_call_cc);           /* from regexec.c */
5075     SAVEVPTR(PL_reg_re);                /* from regexec.c */
5076     SAVEPPTR(PL_reg_ganch);             /* from regexec.c */
5077     SAVESPTR(PL_reg_sv);                /* from regexec.c */
5078     SAVEBOOL(PL_reg_match_utf8);        /* from regexec.c */
5079     SAVEVPTR(PL_reg_magic);             /* from regexec.c */
5080     SAVEI32(PL_reg_oldpos);                     /* from regexec.c */
5081     SAVEVPTR(PL_reg_oldcurpm);          /* from regexec.c */
5082     SAVEVPTR(PL_reg_curpm);             /* from regexec.c */
5083     SAVEPPTR(PL_reg_oldsaved);          /* old saved substr during match */
5084     PL_reg_oldsaved = Nullch;
5085     SAVEI32(PL_reg_oldsavedlen);        /* old length of saved substr during match */
5086     PL_reg_oldsavedlen = 0;
5087 #ifdef PERL_COPY_ON_WRITE
5088     SAVESPTR(PL_nrs);
5089     PL_nrs = Nullsv;
5090 #endif
5091     SAVEI32(PL_reg_maxiter);            /* max wait until caching pos */
5092     PL_reg_maxiter = 0;
5093     SAVEI32(PL_reg_leftiter);           /* wait until caching pos */
5094     PL_reg_leftiter = 0;
5095     SAVEGENERICPV(PL_reg_poscache);     /* cache of pos of WHILEM */
5096     PL_reg_poscache = Nullch;
5097     SAVEI32(PL_reg_poscache_size);      /* size of pos cache of WHILEM */
5098     PL_reg_poscache_size = 0;
5099     SAVEPPTR(PL_regprecomp);            /* uncompiled string. */
5100     SAVEI32(PL_regnpar);                /* () count. */
5101     SAVEI32(PL_regsize);                /* from regexec.c */
5102
5103     {
5104         /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5105         U32 i;
5106         GV *mgv;
5107         REGEXP *rx;
5108         char digits[16];
5109
5110         if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5111             for (i = 1; i <= rx->nparens; i++) {
5112                 sprintf(digits, "%lu", (long)i);
5113                 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5114                     save_scalar(mgv);
5115             }
5116         }
5117     }
5118
5119 #ifdef DEBUGGING
5120     SAVEPPTR(PL_reg_starttry);          /* from regexec.c */
5121 #endif
5122 }
5123
5124 static void
5125 clear_re(pTHX_ void *r)
5126 {
5127     ReREFCNT_dec((regexp *)r);
5128 }
5129