5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_pregcomp my_regcomp
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_pregfree my_regfree
39 # define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 # define Perl_regnext my_regnext
42 # define Perl_save_re_context my_save_re_context
43 # define Perl_reginitcolors my_reginitcolors
45 # define PERL_NO_GET_CONTEXT
50 * pregcomp and pregexec -- regsub and regerror are not used in perl
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (c) 1991-2002, Larry Wall
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
83 #define PERL_IN_REGCOMP_C
86 #ifndef PERL_IN_XSUB_RE
98 # if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
107 #define STATIC static
110 typedef struct RExC_state_t {
111 U16 flags16; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
114 char *start; /* Start of input for compile */
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
118 regnode *emit_start; /* Start of emitted-code area */
119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
130 char *starttry; /* -Dr: where regtry was called. */
131 #define RExC_starttry (pRExC_state->starttry)
135 #define RExC_flags16 (pRExC_state->flags16)
136 #define RExC_precomp (pRExC_state->precomp)
137 #define RExC_rx (pRExC_state->rx)
138 #define RExC_start (pRExC_state->start)
139 #define RExC_end (pRExC_state->end)
140 #define RExC_parse (pRExC_state->parse)
141 #define RExC_whilem_seen (pRExC_state->whilem_seen)
142 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty (pRExC_state->naughty)
146 #define RExC_sawback (pRExC_state->sawback)
147 #define RExC_seen (pRExC_state->seen)
148 #define RExC_size (pRExC_state->size)
149 #define RExC_npar (pRExC_state->npar)
150 #define RExC_extralen (pRExC_state->extralen)
151 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8 (pRExC_state->utf8)
155 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
160 #undef SPSTART /* dratted cpp namespace... */
163 * Flags to be passed up and down.
165 #define WORST 0 /* Worst case. */
166 #define HASWIDTH 0x1 /* Known to match non-null strings. */
167 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART 0x4 /* Starts with * or +. */
169 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
171 /* Length of a variant. */
173 typedef struct scan_data_t {
179 I32 last_end; /* min value, <0 unless valid. */
182 SV **longest; /* Either &l_fixed, or &l_float. */
186 I32 offset_float_min;
187 I32 offset_float_max;
191 struct regnode_charclass_class *start_class;
195 * Forward declarations for pregcomp()'s friends.
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL 0x1
203 #define SF_BEFORE_MEOL 0x2
204 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
208 # define SF_FIX_SHIFT_EOL (0+2)
209 # define SF_FL_SHIFT_EOL (0+4)
211 # define SF_FIX_SHIFT_EOL (+2)
212 # define SF_FL_SHIFT_EOL (+4)
215 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
218 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF 0x40
221 #define SF_HAS_PAR 0x80
222 #define SF_IN_PAR 0x100
223 #define SF_HAS_EVAL 0x200
224 #define SCF_DO_SUBSTR 0x400
225 #define SCF_DO_STCLASS_AND 0x0800
226 #define SCF_DO_STCLASS_OR 0x1000
227 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS 0x2000
230 #define UTF RExC_utf8
231 #define LOC (RExC_flags16 & PMf_LOCALE)
232 #define FOLD (RExC_flags16 & PMf_FOLD)
234 #define OOB_UNICODE 12345678
235 #define OOB_NAMEDCLASS -1
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
249 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
261 char *ellipses = ""; \
262 IV len = RExC_end - RExC_precomp; \
265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
273 msg, (int)len, RExC_precomp, ellipses); \
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
281 #define FAIL2(pat,msg) \
283 char *ellipses = ""; \
284 IV len = RExC_end - RExC_precomp; \
287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
295 msg, (int)len, RExC_precomp, ellipses); \
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
302 #define Simple_vFAIL(m) \
304 IV offset = RExC_parse - RExC_precomp; \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
321 * Like Simple_vFAIL(), but accepts two arguments.
323 #define Simple_vFAIL2(m,a1) \
325 IV offset = RExC_parse - RExC_precomp; \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
328 (int)offset, RExC_precomp, RExC_precomp + offset); \
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
334 #define vFAIL2(m,a1) \
337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
338 Simple_vFAIL2(m, a1); \
343 * Like Simple_vFAIL(), but accepts three arguments.
345 #define Simple_vFAIL3(m, a1, a2) \
347 IV offset = RExC_parse - RExC_precomp; \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
350 (int)offset, RExC_precomp, RExC_precomp + offset); \
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
356 #define vFAIL3(m,a1,a2) \
359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
360 Simple_vFAIL3(m, a1, a2); \
364 * Like Simple_vFAIL(), but accepts four arguments.
366 #define Simple_vFAIL4(m, a1, a2, a3) \
368 IV offset = RExC_parse - RExC_precomp; \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
371 (int)offset, RExC_precomp, RExC_precomp + offset); \
375 * Like Simple_vFAIL(), but accepts five arguments.
377 #define Simple_vFAIL5(m, a1, a2, a3, a4) \
379 IV offset = RExC_parse - RExC_precomp; \
380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
381 (int)offset, RExC_precomp, RExC_precomp + offset); \
385 #define vWARN(loc,m) \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\
389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
392 #define vWARNdep(loc,m) \
394 IV offset = loc - RExC_precomp; \
395 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\
396 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
400 #define vWARN2(loc, m, a1) \
402 IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
405 (int)offset, RExC_precomp, RExC_precomp + offset); \
408 #define vWARN3(loc, m, a1, a2) \
410 IV offset = loc - RExC_precomp; \
411 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
413 (int)offset, RExC_precomp, RExC_precomp + offset); \
416 #define vWARN4(loc, m, a1, a2, a3) \
418 IV offset = loc - RExC_precomp; \
419 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
421 (int)offset, RExC_precomp, RExC_precomp + offset); \
424 /* used for the parse_flags section for (?c) -- japhy */
425 #define vWARN5(loc, m, a1, a2, a3, a4) \
427 IV offset = loc - RExC_precomp; \
428 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
430 (int)offset, RExC_precomp, RExC_precomp + offset); \
434 /* Allow for side effects in s */
435 #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
437 /* Macros for recording node offsets. 20001227 mjd@plover.com
438 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
439 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
440 * Element 0 holds the number n.
443 #define MJD_OFFSET_DEBUG(x)
444 /* #define MJD_OFFSET_DEBUG(x) fprintf x */
447 # define Set_Node_Offset_To_R(node,byte) \
451 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
453 RExC_offsets[2*(node)-1] = (byte); \
458 # define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
459 # define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
461 # define Set_Node_Length_To_R(node,len) \
464 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
466 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
468 RExC_offsets[2*(node)] = (len); \
473 # define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
474 # define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
475 # define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
477 /* Get offsets and lengths */
478 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
479 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
481 static void clear_re(pTHX_ void *r);
483 /* Mark that we cannot extend a found fixed substring at this point.
484 Updata the longest found anchored substring and the longest found
485 floating substrings if needed. */
488 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
490 STRLEN l = CHR_SVLEN(data->last_found);
491 STRLEN old_l = CHR_SVLEN(*data->longest);
493 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
494 sv_setsv(*data->longest, data->last_found);
495 if (*data->longest == data->longest_fixed) {
496 data->offset_fixed = l ? data->last_start_min : data->pos_min;
497 if (data->flags & SF_BEFORE_EOL)
499 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
501 data->flags &= ~SF_FIX_BEFORE_EOL;
504 data->offset_float_min = l ? data->last_start_min : data->pos_min;
505 data->offset_float_max = (l
506 ? data->last_start_max
507 : data->pos_min + data->pos_delta);
508 if (data->flags & SF_BEFORE_EOL)
510 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
512 data->flags &= ~SF_FL_BEFORE_EOL;
515 SvCUR_set(data->last_found, 0);
517 data->flags &= ~SF_BEFORE_EOL;
520 /* Can match anything (initialization) */
522 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 ANYOF_CLASS_ZERO(cl);
525 ANYOF_BITMAP_SETALL(cl);
526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
528 cl->flags |= ANYOF_LOCALE;
531 /* Can match anything (initialization) */
533 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
537 for (value = 0; value <= ANYOF_MAX; value += 2)
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
547 /* Can match anything (initialization) */
549 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
551 Zero(cl, 1, struct regnode_charclass_class);
553 cl_anything(pRExC_state, cl);
557 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
559 Zero(cl, 1, struct regnode_charclass_class);
561 cl_anything(pRExC_state, cl);
563 cl->flags |= ANYOF_LOCALE;
566 /* 'And' a given class with another one. Can create false positives */
567 /* We assume that cl is not inverted */
569 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
590 cl->flags &= ~ANYOF_UNICODE_ALL;
591 cl->flags |= ANYOF_UNICODE;
592 ARG_SET(cl, ARG(and_with));
594 if (!(and_with->flags & ANYOF_UNICODE_ALL))
595 cl->flags &= ~ANYOF_UNICODE_ALL;
596 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
597 cl->flags &= ~ANYOF_UNICODE;
600 /* 'OR' a given class with another one. Can create false positives */
601 /* We assume that cl is not inverted */
603 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
605 if (or_with->flags & ANYOF_INVERT) {
607 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
608 * <= (B1 | !B2) | (CL1 | !CL2)
609 * which is wasteful if CL2 is small, but we ignore CL2:
610 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
611 * XXXX Can we handle case-fold? Unclear:
612 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
613 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
615 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
616 && !(or_with->flags & ANYOF_FOLD)
617 && !(cl->flags & ANYOF_FOLD) ) {
620 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
621 cl->bitmap[i] |= ~or_with->bitmap[i];
622 } /* XXXX: logic is complicated otherwise */
624 cl_anything(pRExC_state, cl);
627 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
628 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
629 && (!(or_with->flags & ANYOF_FOLD)
630 || (cl->flags & ANYOF_FOLD)) ) {
633 /* OR char bitmap and class bitmap separately */
634 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
635 cl->bitmap[i] |= or_with->bitmap[i];
636 if (or_with->flags & ANYOF_CLASS) {
637 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
638 cl->classflags[i] |= or_with->classflags[i];
639 cl->flags |= ANYOF_CLASS;
642 else { /* XXXX: logic is complicated, leave it along for a moment. */
643 cl_anything(pRExC_state, cl);
646 if (or_with->flags & ANYOF_EOS)
647 cl->flags |= ANYOF_EOS;
649 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
650 ARG(cl) != ARG(or_with)) {
651 cl->flags |= ANYOF_UNICODE_ALL;
652 cl->flags &= ~ANYOF_UNICODE;
654 if (or_with->flags & ANYOF_UNICODE_ALL) {
655 cl->flags |= ANYOF_UNICODE_ALL;
656 cl->flags &= ~ANYOF_UNICODE;
661 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
662 * These need to be revisited when a newer toolchain becomes available.
664 #if defined(__sparc64__) && defined(__GNUC__)
665 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
666 # undef SPARC64_GCC_WORKAROUND
667 # define SPARC64_GCC_WORKAROUND 1
671 /* REx optimizer. Converts nodes into quickier variants "in place".
672 Finds fixed substrings. */
674 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
675 to the position after last scanned or to NULL. */
678 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
679 /* scanp: Start here (read-write). */
680 /* deltap: Write maxlen-minlen here. */
681 /* last: Stop before this one. */
683 I32 min = 0, pars = 0, code;
684 regnode *scan = *scanp, *next;
686 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
687 int is_inf_internal = 0; /* The studied chunk is infinite */
688 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
689 scan_data_t data_fake;
690 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
692 while (scan && OP(scan) != END && scan < last) {
693 /* Peephole optimizer: */
695 if (PL_regkind[(U8)OP(scan)] == EXACT) {
696 /* Merge several consecutive EXACTish nodes into one. */
697 regnode *n = regnext(scan);
700 regnode *stop = scan;
703 next = scan + NODE_SZ_STR(scan);
704 /* Skip NOTHING, merge EXACT*. */
706 ( PL_regkind[(U8)OP(n)] == NOTHING ||
707 (stringok && (OP(n) == OP(scan))))
709 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
710 if (OP(n) == TAIL || n > next)
712 if (PL_regkind[(U8)OP(n)] == NOTHING) {
713 NEXT_OFF(scan) += NEXT_OFF(n);
714 next = n + NODE_STEP_REGNODE;
722 int oldl = STR_LEN(scan);
723 regnode *nnext = regnext(n);
725 if (oldl + STR_LEN(n) > U8_MAX)
727 NEXT_OFF(scan) += NEXT_OFF(n);
728 STR_LEN(scan) += STR_LEN(n);
729 next = n + NODE_SZ_STR(n);
730 /* Now we can overwrite *n : */
731 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
739 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
741 Two problematic code points in Unicode casefolding of EXACT nodes:
743 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
744 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
750 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
751 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
753 This means that in case-insensitive matching (or "loose matching",
754 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
755 length of the above casefolded versions) can match a target string
756 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
757 This would rather mess up the minimum length computation.
759 What we'll do is to look for the tail four bytes, and then peek
760 at the preceding two bytes to see whether we need to decrease
761 the minimum length by four (six minus two).
763 Thanks to the design of UTF-8, there cannot be false matches:
764 A sequence of valid UTF-8 bytes cannot be a subsequence of
765 another valid sequence of UTF-8 bytes.
768 char *s0 = STRING(scan), *s, *t;
769 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
770 char *t0 = "\xcc\x88\xcc\x81";
774 s < s2 && (t = ninstr(s, s1, t0, t1));
776 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
777 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
784 n = scan + NODE_SZ_STR(scan);
786 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
794 /* Follow the next-chain of the current node and optimize
795 away all the NOTHINGs from it. */
796 if (OP(scan) != CURLYX) {
797 int max = (reg_off_by_arg[OP(scan)]
799 /* I32 may be smaller than U16 on CRAYs! */
800 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
801 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
805 /* Skip NOTHING and LONGJMP. */
806 while ((n = regnext(n))
807 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
808 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
811 if (reg_off_by_arg[OP(scan)])
814 NEXT_OFF(scan) = off;
816 /* The principal pseudo-switch. Cannot be a switch, since we
817 look into several different things. */
818 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
819 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
820 next = regnext(scan);
823 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
824 I32 max1 = 0, min1 = I32_MAX, num = 0;
825 struct regnode_charclass_class accum;
827 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
828 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
829 if (flags & SCF_DO_STCLASS)
830 cl_init_zero(pRExC_state, &accum);
831 while (OP(scan) == code) {
832 I32 deltanext, minnext, f = 0, fake;
833 struct regnode_charclass_class this_class;
838 data_fake.whilem_c = data->whilem_c;
839 data_fake.last_closep = data->last_closep;
842 data_fake.last_closep = &fake;
843 next = regnext(scan);
844 scan = NEXTOPER(scan);
846 scan = NEXTOPER(scan);
847 if (flags & SCF_DO_STCLASS) {
848 cl_init(pRExC_state, &this_class);
849 data_fake.start_class = &this_class;
850 f = SCF_DO_STCLASS_AND;
852 if (flags & SCF_WHILEM_VISITED_POS)
853 f |= SCF_WHILEM_VISITED_POS;
854 /* we suppose the run is continuous, last=next...*/
855 minnext = study_chunk(pRExC_state, &scan, &deltanext,
856 next, &data_fake, f);
859 if (max1 < minnext + deltanext)
860 max1 = minnext + deltanext;
861 if (deltanext == I32_MAX)
862 is_inf = is_inf_internal = 1;
864 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
866 if (data && (data_fake.flags & SF_HAS_EVAL))
867 data->flags |= SF_HAS_EVAL;
869 data->whilem_c = data_fake.whilem_c;
870 if (flags & SCF_DO_STCLASS)
871 cl_or(pRExC_state, &accum, &this_class);
875 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
877 if (flags & SCF_DO_SUBSTR) {
878 data->pos_min += min1;
879 data->pos_delta += max1 - min1;
880 if (max1 != min1 || is_inf)
881 data->longest = &(data->longest_float);
884 delta += max1 - min1;
885 if (flags & SCF_DO_STCLASS_OR) {
886 cl_or(pRExC_state, data->start_class, &accum);
888 cl_and(data->start_class, &and_with);
889 flags &= ~SCF_DO_STCLASS;
892 else if (flags & SCF_DO_STCLASS_AND) {
894 cl_and(data->start_class, &accum);
895 flags &= ~SCF_DO_STCLASS;
898 /* Switch to OR mode: cache the old value of
899 * data->start_class */
900 StructCopy(data->start_class, &and_with,
901 struct regnode_charclass_class);
902 flags &= ~SCF_DO_STCLASS_AND;
903 StructCopy(&accum, data->start_class,
904 struct regnode_charclass_class);
905 flags |= SCF_DO_STCLASS_OR;
906 data->start_class->flags |= ANYOF_EOS;
910 else if (code == BRANCHJ) /* single branch is optimized. */
911 scan = NEXTOPER(NEXTOPER(scan));
912 else /* single branch is optimized. */
913 scan = NEXTOPER(scan);
916 else if (OP(scan) == EXACT) {
917 I32 l = STR_LEN(scan);
918 UV uc = *((U8*)STRING(scan));
920 U8 *s = (U8*)STRING(scan);
921 l = utf8_length(s, s + l);
922 uc = utf8_to_uvchr(s, NULL);
925 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
926 /* The code below prefers earlier match for fixed
927 offset, later match for variable offset. */
928 if (data->last_end == -1) { /* Update the start info. */
929 data->last_start_min = data->pos_min;
930 data->last_start_max = is_inf
931 ? I32_MAX : data->pos_min + data->pos_delta;
933 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
934 data->last_end = data->pos_min + l;
935 data->pos_min += l; /* As in the first entry. */
936 data->flags &= ~SF_BEFORE_EOL;
938 if (flags & SCF_DO_STCLASS_AND) {
939 /* Check whether it is compatible with what we know already! */
943 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
944 && !ANYOF_BITMAP_TEST(data->start_class, uc)
945 && (!(data->start_class->flags & ANYOF_FOLD)
946 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
949 ANYOF_CLASS_ZERO(data->start_class);
950 ANYOF_BITMAP_ZERO(data->start_class);
952 ANYOF_BITMAP_SET(data->start_class, uc);
953 data->start_class->flags &= ~ANYOF_EOS;
955 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
957 else if (flags & SCF_DO_STCLASS_OR) {
958 /* false positive possible if the class is case-folded */
960 ANYOF_BITMAP_SET(data->start_class, uc);
962 data->start_class->flags |= ANYOF_UNICODE_ALL;
963 data->start_class->flags &= ~ANYOF_EOS;
964 cl_and(data->start_class, &and_with);
966 flags &= ~SCF_DO_STCLASS;
968 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
969 I32 l = STR_LEN(scan);
970 UV uc = *((U8*)STRING(scan));
972 /* Search for fixed substrings supports EXACT only. */
973 if (flags & SCF_DO_SUBSTR)
974 scan_commit(pRExC_state, data);
976 U8 *s = (U8 *)STRING(scan);
977 l = utf8_length(s, s + l);
978 uc = utf8_to_uvchr(s, NULL);
981 if (data && (flags & SCF_DO_SUBSTR))
983 if (flags & SCF_DO_STCLASS_AND) {
984 /* Check whether it is compatible with what we know already! */
988 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
989 && !ANYOF_BITMAP_TEST(data->start_class, uc)
990 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
992 ANYOF_CLASS_ZERO(data->start_class);
993 ANYOF_BITMAP_ZERO(data->start_class);
995 ANYOF_BITMAP_SET(data->start_class, uc);
996 data->start_class->flags &= ~ANYOF_EOS;
997 data->start_class->flags |= ANYOF_FOLD;
998 if (OP(scan) == EXACTFL)
999 data->start_class->flags |= ANYOF_LOCALE;
1002 else if (flags & SCF_DO_STCLASS_OR) {
1003 if (data->start_class->flags & ANYOF_FOLD) {
1004 /* false positive possible if the class is case-folded.
1005 Assume that the locale settings are the same... */
1007 ANYOF_BITMAP_SET(data->start_class, uc);
1008 data->start_class->flags &= ~ANYOF_EOS;
1010 cl_and(data->start_class, &and_with);
1012 flags &= ~SCF_DO_STCLASS;
1014 else if (strchr((char*)PL_varies,OP(scan))) {
1015 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1016 I32 f = flags, pos_before = 0;
1017 regnode *oscan = scan;
1018 struct regnode_charclass_class this_class;
1019 struct regnode_charclass_class *oclass = NULL;
1020 I32 next_is_eval = 0;
1022 switch (PL_regkind[(U8)OP(scan)]) {
1023 case WHILEM: /* End of (?:...)* . */
1024 scan = NEXTOPER(scan);
1027 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1028 next = NEXTOPER(scan);
1029 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1031 maxcount = REG_INFTY;
1032 next = regnext(scan);
1033 scan = NEXTOPER(scan);
1037 if (flags & SCF_DO_SUBSTR)
1042 if (flags & SCF_DO_STCLASS) {
1044 maxcount = REG_INFTY;
1045 next = regnext(scan);
1046 scan = NEXTOPER(scan);
1049 is_inf = is_inf_internal = 1;
1050 scan = regnext(scan);
1051 if (flags & SCF_DO_SUBSTR) {
1052 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1053 data->longest = &(data->longest_float);
1055 goto optimize_curly_tail;
1057 mincount = ARG1(scan);
1058 maxcount = ARG2(scan);
1059 next = regnext(scan);
1060 if (OP(scan) == CURLYX) {
1061 I32 lp = (data ? *(data->last_closep) : 0);
1063 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1065 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1066 next_is_eval = (OP(scan) == EVAL);
1068 if (flags & SCF_DO_SUBSTR) {
1069 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1070 pos_before = data->pos_min;
1074 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1076 data->flags |= SF_IS_INF;
1078 if (flags & SCF_DO_STCLASS) {
1079 cl_init(pRExC_state, &this_class);
1080 oclass = data->start_class;
1081 data->start_class = &this_class;
1082 f |= SCF_DO_STCLASS_AND;
1083 f &= ~SCF_DO_STCLASS_OR;
1085 /* These are the cases when once a subexpression
1086 fails at a particular position, it cannot succeed
1087 even after backtracking at the enclosing scope.
1089 XXXX what if minimal match and we are at the
1090 initial run of {n,m}? */
1091 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1092 f &= ~SCF_WHILEM_VISITED_POS;
1094 /* This will finish on WHILEM, setting scan, or on NULL: */
1095 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1097 ? (f & ~SCF_DO_SUBSTR) : f);
1099 if (flags & SCF_DO_STCLASS)
1100 data->start_class = oclass;
1101 if (mincount == 0 || minnext == 0) {
1102 if (flags & SCF_DO_STCLASS_OR) {
1103 cl_or(pRExC_state, data->start_class, &this_class);
1105 else if (flags & SCF_DO_STCLASS_AND) {
1106 /* Switch to OR mode: cache the old value of
1107 * data->start_class */
1108 StructCopy(data->start_class, &and_with,
1109 struct regnode_charclass_class);
1110 flags &= ~SCF_DO_STCLASS_AND;
1111 StructCopy(&this_class, data->start_class,
1112 struct regnode_charclass_class);
1113 flags |= SCF_DO_STCLASS_OR;
1114 data->start_class->flags |= ANYOF_EOS;
1116 } else { /* Non-zero len */
1117 if (flags & SCF_DO_STCLASS_OR) {
1118 cl_or(pRExC_state, data->start_class, &this_class);
1119 cl_and(data->start_class, &and_with);
1121 else if (flags & SCF_DO_STCLASS_AND)
1122 cl_and(data->start_class, &this_class);
1123 flags &= ~SCF_DO_STCLASS;
1125 if (!scan) /* It was not CURLYX, but CURLY. */
1127 if (ckWARN(WARN_REGEXP)
1128 /* ? quantifier ok, except for (?{ ... }) */
1129 && (next_is_eval || !(mincount == 0 && maxcount == 1))
1130 && (minnext == 0) && (deltanext == 0)
1131 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1132 && maxcount <= REG_INFTY/3) /* Complement check for big count */
1135 "Quantifier unexpected on zero-length expression");
1138 min += minnext * mincount;
1139 is_inf_internal |= ((maxcount == REG_INFTY
1140 && (minnext + deltanext) > 0)
1141 || deltanext == I32_MAX);
1142 is_inf |= is_inf_internal;
1143 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1145 /* Try powerful optimization CURLYX => CURLYN. */
1146 if ( OP(oscan) == CURLYX && data
1147 && data->flags & SF_IN_PAR
1148 && !(data->flags & SF_HAS_EVAL)
1149 && !deltanext && minnext == 1 ) {
1150 /* Try to optimize to CURLYN. */
1151 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1152 regnode *nxt1 = nxt;
1159 if (!strchr((char*)PL_simple,OP(nxt))
1160 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1161 && STR_LEN(nxt) == 1))
1167 if (OP(nxt) != CLOSE)
1169 /* Now we know that nxt2 is the only contents: */
1170 oscan->flags = ARG(nxt);
1172 OP(nxt1) = NOTHING; /* was OPEN. */
1174 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1175 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1176 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1177 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1178 OP(nxt + 1) = OPTIMIZED; /* was count. */
1179 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1184 /* Try optimization CURLYX => CURLYM. */
1185 if ( OP(oscan) == CURLYX && data
1186 && !(data->flags & SF_HAS_PAR)
1187 && !(data->flags & SF_HAS_EVAL)
1189 /* XXXX How to optimize if data == 0? */
1190 /* Optimize to a simpler form. */
1191 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1195 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1196 && (OP(nxt2) != WHILEM))
1198 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1199 /* Need to optimize away parenths. */
1200 if (data->flags & SF_IN_PAR) {
1201 /* Set the parenth number. */
1202 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1204 if (OP(nxt) != CLOSE)
1205 FAIL("Panic opt close");
1206 oscan->flags = ARG(nxt);
1207 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1208 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1210 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1211 OP(nxt + 1) = OPTIMIZED; /* was count. */
1212 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1213 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1216 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1217 regnode *nnxt = regnext(nxt1);
1220 if (reg_off_by_arg[OP(nxt1)])
1221 ARG_SET(nxt1, nxt2 - nxt1);
1222 else if (nxt2 - nxt1 < U16_MAX)
1223 NEXT_OFF(nxt1) = nxt2 - nxt1;
1225 OP(nxt) = NOTHING; /* Cannot beautify */
1230 /* Optimize again: */
1231 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1237 else if ((OP(oscan) == CURLYX)
1238 && (flags & SCF_WHILEM_VISITED_POS)
1239 /* See the comment on a similar expression above.
1240 However, this time it not a subexpression
1241 we care about, but the expression itself. */
1242 && (maxcount == REG_INFTY)
1243 && data && ++data->whilem_c < 16) {
1244 /* This stays as CURLYX, we can put the count/of pair. */
1245 /* Find WHILEM (as in regexec.c) */
1246 regnode *nxt = oscan + NEXT_OFF(oscan);
1248 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1250 PREVOPER(nxt)->flags = data->whilem_c
1251 | (RExC_whilem_seen << 4); /* On WHILEM */
1253 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1255 if (flags & SCF_DO_SUBSTR) {
1256 SV *last_str = Nullsv;
1257 int counted = mincount != 0;
1259 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1260 #if defined(SPARC64_GCC_WORKAROUND)
1266 if (pos_before >= data->last_start_min)
1269 b = data->last_start_min;
1272 s = SvPV(data->last_found, l);
1273 old = b - data->last_start_min;
1276 I32 b = pos_before >= data->last_start_min
1277 ? pos_before : data->last_start_min;
1279 char *s = SvPV(data->last_found, l);
1280 I32 old = b - data->last_start_min;
1284 old = utf8_hop((U8*)s, old) - (U8*)s;
1287 /* Get the added string: */
1288 last_str = newSVpvn(s + old, l);
1289 if (deltanext == 0 && pos_before == b) {
1290 /* What was added is a constant string */
1292 SvGROW(last_str, (mincount * l) + 1);
1293 repeatcpy(SvPVX(last_str) + l,
1294 SvPVX(last_str), l, mincount - 1);
1295 SvCUR(last_str) *= mincount;
1296 /* Add additional parts. */
1297 SvCUR_set(data->last_found,
1298 SvCUR(data->last_found) - l);
1299 sv_catsv(data->last_found, last_str);
1300 data->last_end += l * (mincount - 1);
1303 /* start offset must point into the last copy */
1304 data->last_start_min += minnext * (mincount - 1);
1305 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1306 * (minnext + data->pos_delta);
1309 /* It is counted once already... */
1310 data->pos_min += minnext * (mincount - counted);
1311 data->pos_delta += - counted * deltanext +
1312 (minnext + deltanext) * maxcount - minnext * mincount;
1313 if (mincount != maxcount) {
1314 /* Cannot extend fixed substrings found inside
1316 scan_commit(pRExC_state,data);
1317 if (mincount && last_str) {
1318 sv_setsv(data->last_found, last_str);
1319 data->last_end = data->pos_min;
1320 data->last_start_min =
1321 data->pos_min - CHR_SVLEN(last_str);
1322 data->last_start_max = is_inf
1324 : data->pos_min + data->pos_delta
1325 - CHR_SVLEN(last_str);
1327 data->longest = &(data->longest_float);
1329 SvREFCNT_dec(last_str);
1331 if (data && (fl & SF_HAS_EVAL))
1332 data->flags |= SF_HAS_EVAL;
1333 optimize_curly_tail:
1334 if (OP(oscan) != CURLYX) {
1335 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1337 NEXT_OFF(oscan) += NEXT_OFF(next);
1340 default: /* REF and CLUMP only? */
1341 if (flags & SCF_DO_SUBSTR) {
1342 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1343 data->longest = &(data->longest_float);
1345 is_inf = is_inf_internal = 1;
1346 if (flags & SCF_DO_STCLASS_OR)
1347 cl_anything(pRExC_state, data->start_class);
1348 flags &= ~SCF_DO_STCLASS;
1352 else if (strchr((char*)PL_simple,OP(scan))) {
1355 if (flags & SCF_DO_SUBSTR) {
1356 scan_commit(pRExC_state,data);
1360 if (flags & SCF_DO_STCLASS) {
1361 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1363 /* Some of the logic below assumes that switching
1364 locale on will only add false positives. */
1365 switch (PL_regkind[(U8)OP(scan)]) {
1369 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1370 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1371 cl_anything(pRExC_state, data->start_class);
1374 if (OP(scan) == SANY)
1376 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1377 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1378 || (data->start_class->flags & ANYOF_CLASS));
1379 cl_anything(pRExC_state, data->start_class);
1381 if (flags & SCF_DO_STCLASS_AND || !value)
1382 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1385 if (flags & SCF_DO_STCLASS_AND)
1386 cl_and(data->start_class,
1387 (struct regnode_charclass_class*)scan);
1389 cl_or(pRExC_state, data->start_class,
1390 (struct regnode_charclass_class*)scan);
1393 if (flags & SCF_DO_STCLASS_AND) {
1394 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1395 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1396 for (value = 0; value < 256; value++)
1397 if (!isALNUM(value))
1398 ANYOF_BITMAP_CLEAR(data->start_class, value);
1402 if (data->start_class->flags & ANYOF_LOCALE)
1403 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1405 for (value = 0; value < 256; value++)
1407 ANYOF_BITMAP_SET(data->start_class, value);
1412 if (flags & SCF_DO_STCLASS_AND) {
1413 if (data->start_class->flags & ANYOF_LOCALE)
1414 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1417 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1418 data->start_class->flags |= ANYOF_LOCALE;
1422 if (flags & SCF_DO_STCLASS_AND) {
1423 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1424 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1425 for (value = 0; value < 256; value++)
1427 ANYOF_BITMAP_CLEAR(data->start_class, value);
1431 if (data->start_class->flags & ANYOF_LOCALE)
1432 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1434 for (value = 0; value < 256; value++)
1435 if (!isALNUM(value))
1436 ANYOF_BITMAP_SET(data->start_class, value);
1441 if (flags & SCF_DO_STCLASS_AND) {
1442 if (data->start_class->flags & ANYOF_LOCALE)
1443 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1446 data->start_class->flags |= ANYOF_LOCALE;
1447 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1451 if (flags & SCF_DO_STCLASS_AND) {
1452 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1453 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1454 for (value = 0; value < 256; value++)
1455 if (!isSPACE(value))
1456 ANYOF_BITMAP_CLEAR(data->start_class, value);
1460 if (data->start_class->flags & ANYOF_LOCALE)
1461 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1463 for (value = 0; value < 256; value++)
1465 ANYOF_BITMAP_SET(data->start_class, value);
1470 if (flags & SCF_DO_STCLASS_AND) {
1471 if (data->start_class->flags & ANYOF_LOCALE)
1472 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1475 data->start_class->flags |= ANYOF_LOCALE;
1476 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1480 if (flags & SCF_DO_STCLASS_AND) {
1481 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1482 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1483 for (value = 0; value < 256; value++)
1485 ANYOF_BITMAP_CLEAR(data->start_class, value);
1489 if (data->start_class->flags & ANYOF_LOCALE)
1490 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1492 for (value = 0; value < 256; value++)
1493 if (!isSPACE(value))
1494 ANYOF_BITMAP_SET(data->start_class, value);
1499 if (flags & SCF_DO_STCLASS_AND) {
1500 if (data->start_class->flags & ANYOF_LOCALE) {
1501 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1502 for (value = 0; value < 256; value++)
1503 if (!isSPACE(value))
1504 ANYOF_BITMAP_CLEAR(data->start_class, value);
1508 data->start_class->flags |= ANYOF_LOCALE;
1509 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1513 if (flags & SCF_DO_STCLASS_AND) {
1514 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1515 for (value = 0; value < 256; value++)
1516 if (!isDIGIT(value))
1517 ANYOF_BITMAP_CLEAR(data->start_class, value);
1520 if (data->start_class->flags & ANYOF_LOCALE)
1521 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1523 for (value = 0; value < 256; value++)
1525 ANYOF_BITMAP_SET(data->start_class, value);
1530 if (flags & SCF_DO_STCLASS_AND) {
1531 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1532 for (value = 0; value < 256; value++)
1534 ANYOF_BITMAP_CLEAR(data->start_class, value);
1537 if (data->start_class->flags & ANYOF_LOCALE)
1538 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1540 for (value = 0; value < 256; value++)
1541 if (!isDIGIT(value))
1542 ANYOF_BITMAP_SET(data->start_class, value);
1547 if (flags & SCF_DO_STCLASS_OR)
1548 cl_and(data->start_class, &and_with);
1549 flags &= ~SCF_DO_STCLASS;
1552 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1553 data->flags |= (OP(scan) == MEOL
1557 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1558 /* Lookbehind, or need to calculate parens/evals/stclass: */
1559 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1560 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1561 /* Lookahead/lookbehind */
1562 I32 deltanext, minnext, fake = 0;
1564 struct regnode_charclass_class intrnl;
1567 data_fake.flags = 0;
1569 data_fake.whilem_c = data->whilem_c;
1570 data_fake.last_closep = data->last_closep;
1573 data_fake.last_closep = &fake;
1574 if ( flags & SCF_DO_STCLASS && !scan->flags
1575 && OP(scan) == IFMATCH ) { /* Lookahead */
1576 cl_init(pRExC_state, &intrnl);
1577 data_fake.start_class = &intrnl;
1578 f |= SCF_DO_STCLASS_AND;
1580 if (flags & SCF_WHILEM_VISITED_POS)
1581 f |= SCF_WHILEM_VISITED_POS;
1582 next = regnext(scan);
1583 nscan = NEXTOPER(NEXTOPER(scan));
1584 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1587 vFAIL("Variable length lookbehind not implemented");
1589 else if (minnext > U8_MAX) {
1590 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1592 scan->flags = minnext;
1594 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1596 if (data && (data_fake.flags & SF_HAS_EVAL))
1597 data->flags |= SF_HAS_EVAL;
1599 data->whilem_c = data_fake.whilem_c;
1600 if (f & SCF_DO_STCLASS_AND) {
1601 int was = (data->start_class->flags & ANYOF_EOS);
1603 cl_and(data->start_class, &intrnl);
1605 data->start_class->flags |= ANYOF_EOS;
1608 else if (OP(scan) == OPEN) {
1611 else if (OP(scan) == CLOSE) {
1612 if (ARG(scan) == is_par) {
1613 next = regnext(scan);
1615 if ( next && (OP(next) != WHILEM) && next < last)
1616 is_par = 0; /* Disable optimization */
1619 *(data->last_closep) = ARG(scan);
1621 else if (OP(scan) == EVAL) {
1623 data->flags |= SF_HAS_EVAL;
1625 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1626 if (flags & SCF_DO_SUBSTR) {
1627 scan_commit(pRExC_state,data);
1628 data->longest = &(data->longest_float);
1630 is_inf = is_inf_internal = 1;
1631 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1632 cl_anything(pRExC_state, data->start_class);
1633 flags &= ~SCF_DO_STCLASS;
1635 /* Else: zero-length, ignore. */
1636 scan = regnext(scan);
1641 *deltap = is_inf_internal ? I32_MAX : delta;
1642 if (flags & SCF_DO_SUBSTR && is_inf)
1643 data->pos_delta = I32_MAX - data->pos_min;
1644 if (is_par > U8_MAX)
1646 if (is_par && pars==1 && data) {
1647 data->flags |= SF_IN_PAR;
1648 data->flags &= ~SF_HAS_PAR;
1650 else if (pars && data) {
1651 data->flags |= SF_HAS_PAR;
1652 data->flags &= ~SF_IN_PAR;
1654 if (flags & SCF_DO_STCLASS_OR)
1655 cl_and(data->start_class, &and_with);
1660 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1662 if (RExC_rx->data) {
1663 Renewc(RExC_rx->data,
1664 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1665 char, struct reg_data);
1666 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1667 RExC_rx->data->count += n;
1670 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1671 char, struct reg_data);
1672 New(1208, RExC_rx->data->what, n, U8);
1673 RExC_rx->data->count = n;
1675 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1676 return RExC_rx->data->count - n;
1680 Perl_reginitcolors(pTHX)
1683 char *s = PerlEnv_getenv("PERL_RE_COLORS");
1686 PL_colors[0] = s = savepv(s);
1688 s = strchr(s, '\t');
1694 PL_colors[i] = s = "";
1698 PL_colors[i++] = "";
1705 - pregcomp - compile a regular expression into internal code
1707 * We can't allocate space until we know how big the compiled form will be,
1708 * but we can't compile it (and thus know how big it is) until we've got a
1709 * place to put the code. So we cheat: we compile it twice, once with code
1710 * generation turned off and size counting turned on, and once "for real".
1711 * This also means that we don't allocate space until we are sure that the
1712 * thing really will compile successfully, and we never have to move the
1713 * code and thus invalidate pointers into it. (Note that it has to be in
1714 * one piece because free() must be able to free it all.) [NB: not true in perl]
1716 * Beware that the optimization-preparation code in here knows about some
1717 * of the structure of the compiled regexp. [I'll say.]
1720 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1730 RExC_state_t RExC_state;
1731 RExC_state_t *pRExC_state = &RExC_state;
1734 FAIL("NULL regexp argument");
1736 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1740 if (!PL_colorset) reginitcolors();
1741 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1742 PL_colors[4],PL_colors[5],PL_colors[0],
1743 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1745 RExC_flags16 = pm->op_pmflags;
1749 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1750 RExC_seen_evals = 0;
1753 /* First pass: determine size, legality. */
1760 RExC_emit = &PL_regdummy;
1761 RExC_whilem_seen = 0;
1762 #if 0 /* REGC() is (currently) a NOP at the first pass.
1763 * Clever compilers notice this and complain. --jhi */
1764 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1766 if (reg(pRExC_state, 0, &flags) == NULL) {
1767 RExC_precomp = Nullch;
1770 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1772 /* Small enough for pointer-storage convention?
1773 If extralen==0, this means that we will not need long jumps. */
1774 if (RExC_size >= 0x10000L && RExC_extralen)
1775 RExC_size += RExC_extralen;
1778 if (RExC_whilem_seen > 15)
1779 RExC_whilem_seen = 15;
1781 /* Allocate space and initialize. */
1782 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1785 FAIL("Regexp out of space");
1788 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1789 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1792 r->prelen = xend - exp;
1793 r->precomp = savepvn(RExC_precomp, r->prelen);
1795 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1796 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1798 r->substrs = 0; /* Useful during FAIL. */
1799 r->startp = 0; /* Useful during FAIL. */
1800 r->endp = 0; /* Useful during FAIL. */
1802 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1804 r->offsets[0] = RExC_size;
1806 DEBUG_r(PerlIO_printf(Perl_debug_log,
1807 "%s %"UVuf" bytes for offset annotations.\n",
1808 r->offsets ? "Got" : "Couldn't get",
1809 (UV)((2*RExC_size+1) * sizeof(U32))));
1813 /* Second pass: emit code. */
1814 RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
1819 RExC_emit_start = r->program;
1820 RExC_emit = r->program;
1821 /* Store the count of eval-groups for security checks: */
1822 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1823 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1825 if (reg(pRExC_state, 0, &flags) == NULL)
1828 /* Dig out information for optimizations. */
1829 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1830 pm->op_pmflags = RExC_flags16;
1832 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1833 r->regstclass = NULL;
1834 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1835 r->reganch |= ROPT_NAUGHTY;
1836 scan = r->program + 1; /* First BRANCH. */
1838 /* XXXX To minimize changes to RE engine we always allocate
1839 3-units-long substrs field. */
1840 Newz(1004, r->substrs, 1, struct reg_substr_data);
1842 StructCopy(&zero_scan_data, &data, scan_data_t);
1843 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1844 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1846 STRLEN longest_float_length, longest_fixed_length;
1847 struct regnode_charclass_class ch_class;
1852 /* Skip introductions and multiplicators >= 1. */
1853 while ((OP(first) == OPEN && (sawopen = 1)) ||
1854 /* An OR of *one* alternative - should not happen now. */
1855 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1856 (OP(first) == PLUS) ||
1857 (OP(first) == MINMOD) ||
1858 /* An {n,m} with n>0 */
1859 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1860 if (OP(first) == PLUS)
1863 first += regarglen[(U8)OP(first)];
1864 first = NEXTOPER(first);
1867 /* Starting-point info. */
1869 if (PL_regkind[(U8)OP(first)] == EXACT) {
1870 if (OP(first) == EXACT)
1871 ; /* Empty, get anchored substr later. */
1872 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1873 r->regstclass = first;
1875 else if (strchr((char*)PL_simple,OP(first)))
1876 r->regstclass = first;
1877 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1878 PL_regkind[(U8)OP(first)] == NBOUND)
1879 r->regstclass = first;
1880 else if (PL_regkind[(U8)OP(first)] == BOL) {
1881 r->reganch |= (OP(first) == MBOL
1883 : (OP(first) == SBOL
1886 first = NEXTOPER(first);
1889 else if (OP(first) == GPOS) {
1890 r->reganch |= ROPT_ANCH_GPOS;
1891 first = NEXTOPER(first);
1894 else if (!sawopen && (OP(first) == STAR &&
1895 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1896 !(r->reganch & ROPT_ANCH) )
1898 /* turn .* into ^.* with an implied $*=1 */
1899 int type = OP(NEXTOPER(first));
1901 if (type == REG_ANY)
1902 type = ROPT_ANCH_MBOL;
1904 type = ROPT_ANCH_SBOL;
1906 r->reganch |= type | ROPT_IMPLICIT;
1907 first = NEXTOPER(first);
1910 if (sawplus && (!sawopen || !RExC_sawback)
1911 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1912 /* x+ must match at the 1st pos of run of x's */
1913 r->reganch |= ROPT_SKIP;
1915 /* Scan is after the zeroth branch, first is atomic matcher. */
1916 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1917 (IV)(first - scan + 1)));
1919 * If there's something expensive in the r.e., find the
1920 * longest literal string that must appear and make it the
1921 * regmust. Resolve ties in favor of later strings, since
1922 * the regstart check works with the beginning of the r.e.
1923 * and avoiding duplication strengthens checking. Not a
1924 * strong reason, but sufficient in the absence of others.
1925 * [Now we resolve ties in favor of the earlier string if
1926 * it happens that c_offset_min has been invalidated, since the
1927 * earlier string may buy us something the later one won't.]
1931 data.longest_fixed = newSVpvn("",0);
1932 data.longest_float = newSVpvn("",0);
1933 data.last_found = newSVpvn("",0);
1934 data.longest = &(data.longest_fixed);
1936 if (!r->regstclass) {
1937 cl_init(pRExC_state, &ch_class);
1938 data.start_class = &ch_class;
1939 stclass_flag = SCF_DO_STCLASS_AND;
1940 } else /* XXXX Check for BOUND? */
1942 data.last_closep = &last_close;
1944 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1945 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1946 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1947 && data.last_start_min == 0 && data.last_end > 0
1948 && !RExC_seen_zerolen
1949 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1950 r->reganch |= ROPT_CHECK_ALL;
1951 scan_commit(pRExC_state, &data);
1952 SvREFCNT_dec(data.last_found);
1954 longest_float_length = CHR_SVLEN(data.longest_float);
1955 if (longest_float_length
1956 || (data.flags & SF_FL_BEFORE_EOL
1957 && (!(data.flags & SF_FL_BEFORE_MEOL)
1958 || (RExC_flags16 & PMf_MULTILINE)))) {
1961 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1962 && data.offset_fixed == data.offset_float_min
1963 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1964 goto remove_float; /* As in (a)+. */
1966 r->float_substr = data.longest_float;
1967 r->float_min_offset = data.offset_float_min;
1968 r->float_max_offset = data.offset_float_max;
1969 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1970 && (!(data.flags & SF_FL_BEFORE_MEOL)
1971 || (RExC_flags16 & PMf_MULTILINE)));
1972 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
1976 r->float_substr = Nullsv;
1977 SvREFCNT_dec(data.longest_float);
1978 longest_float_length = 0;
1981 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1982 if (longest_fixed_length
1983 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1984 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1985 || (RExC_flags16 & PMf_MULTILINE)))) {
1988 r->anchored_substr = data.longest_fixed;
1989 r->anchored_offset = data.offset_fixed;
1990 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1991 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1992 || (RExC_flags16 & PMf_MULTILINE)));
1993 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
1996 r->anchored_substr = Nullsv;
1997 SvREFCNT_dec(data.longest_fixed);
1998 longest_fixed_length = 0;
2001 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2002 r->regstclass = NULL;
2003 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
2004 && !(data.start_class->flags & ANYOF_EOS)
2005 && !cl_is_anything(data.start_class)) {
2006 I32 n = add_data(pRExC_state, 1, "f");
2008 New(1006, RExC_rx->data->data[n], 1,
2009 struct regnode_charclass_class);
2010 StructCopy(data.start_class,
2011 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2012 struct regnode_charclass_class);
2013 r->regstclass = (regnode*)RExC_rx->data->data[n];
2014 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2015 PL_regdata = r->data; /* for regprop() */
2016 DEBUG_r({ SV *sv = sv_newmortal();
2017 regprop(sv, (regnode*)data.start_class);
2018 PerlIO_printf(Perl_debug_log,
2019 "synthetic stclass `%s'.\n",
2023 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2024 if (longest_fixed_length > longest_float_length) {
2025 r->check_substr = r->anchored_substr;
2026 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2027 if (r->reganch & ROPT_ANCH_SINGLE)
2028 r->reganch |= ROPT_NOSCAN;
2031 r->check_substr = r->float_substr;
2032 r->check_offset_min = data.offset_float_min;
2033 r->check_offset_max = data.offset_float_max;
2035 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2036 This should be changed ASAP! */
2037 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
2038 r->reganch |= RE_USE_INTUIT;
2039 if (SvTAIL(r->check_substr))
2040 r->reganch |= RE_INTUIT_TAIL;
2044 /* Several toplevels. Best we can is to set minlen. */
2046 struct regnode_charclass_class ch_class;
2049 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2050 scan = r->program + 1;
2051 cl_init(pRExC_state, &ch_class);
2052 data.start_class = &ch_class;
2053 data.last_closep = &last_close;
2054 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2055 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
2056 if (!(data.start_class->flags & ANYOF_EOS)
2057 && !cl_is_anything(data.start_class)) {
2058 I32 n = add_data(pRExC_state, 1, "f");
2060 New(1006, RExC_rx->data->data[n], 1,
2061 struct regnode_charclass_class);
2062 StructCopy(data.start_class,
2063 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2064 struct regnode_charclass_class);
2065 r->regstclass = (regnode*)RExC_rx->data->data[n];
2066 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2067 DEBUG_r({ SV* sv = sv_newmortal();
2068 regprop(sv, (regnode*)data.start_class);
2069 PerlIO_printf(Perl_debug_log,
2070 "synthetic stclass `%s'.\n",
2076 if (RExC_seen & REG_SEEN_GPOS)
2077 r->reganch |= ROPT_GPOS_SEEN;
2078 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2079 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2080 if (RExC_seen & REG_SEEN_EVAL)
2081 r->reganch |= ROPT_EVAL_SEEN;
2082 if (RExC_seen & REG_SEEN_CANY)
2083 r->reganch |= ROPT_CANY_SEEN;
2084 Newz(1002, r->startp, RExC_npar, I32);
2085 Newz(1002, r->endp, RExC_npar, I32);
2086 PL_regdata = r->data; /* for regprop() */
2087 DEBUG_r(regdump(r));
2092 - reg - regular expression, i.e. main body or parenthesized thing
2094 * Caller must absorb opening parenthesis.
2096 * Combining parenthesis handling with the base level of regular expression
2097 * is a trifle forced, but the need to tie the tails of the branches to what
2098 * follows makes it hard to avoid.
2101 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2102 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2104 register regnode *ret; /* Will be the head of the group. */
2105 register regnode *br;
2106 register regnode *lastbr;
2107 register regnode *ender = 0;
2108 register I32 parno = 0;
2109 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
2111 /* for (?g), (?gc), and (?o) warnings; warning
2112 about (?c) will warn about (?g) -- japhy */
2114 I32 wastedflags = 0x00,
2117 wasted_gc = 0x02 | 0x04,
2120 char * parse_start = RExC_parse; /* MJD */
2121 char *oregcomp_parse = RExC_parse;
2124 *flagp = 0; /* Tentatively. */
2127 /* Make an OPEN node, if parenthesized. */
2129 if (*RExC_parse == '?') { /* (?...) */
2130 U16 posflags = 0, negflags = 0;
2131 U16 *flagsp = &posflags;
2133 char *seqstart = RExC_parse;
2136 paren = *RExC_parse++;
2137 ret = NULL; /* For look-ahead/behind. */
2139 case '<': /* (?<...) */
2140 RExC_seen |= REG_SEEN_LOOKBEHIND;
2141 if (*RExC_parse == '!')
2143 if (*RExC_parse != '=' && *RExC_parse != '!')
2146 case '=': /* (?=...) */
2147 case '!': /* (?!...) */
2148 RExC_seen_zerolen++;
2149 case ':': /* (?:...) */
2150 case '>': /* (?>...) */
2152 case '$': /* (?$...) */
2153 case '@': /* (?@...) */
2154 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2156 case '#': /* (?#...) */
2157 while (*RExC_parse && *RExC_parse != ')')
2159 if (*RExC_parse != ')')
2160 FAIL("Sequence (?#... not terminated");
2161 nextchar(pRExC_state);
2164 case 'p': /* (?p...) */
2165 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2166 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2168 case '?': /* (??...) */
2170 if (*RExC_parse != '{')
2172 paren = *RExC_parse++;
2174 case '{': /* (?{...}) */
2176 I32 count = 1, n = 0;
2178 char *s = RExC_parse;
2180 OP_4tree *sop, *rop;
2182 RExC_seen_zerolen++;
2183 RExC_seen |= REG_SEEN_EVAL;
2184 while (count && (c = *RExC_parse)) {
2185 if (c == '\\' && RExC_parse[1])
2193 if (*RExC_parse != ')')
2196 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2201 if (RExC_parse - 1 - s)
2202 sv = newSVpvn(s, RExC_parse - 1 - s);
2204 sv = newSVpvn("", 0);
2207 Perl_save_re_context(aTHX);
2208 rop = sv_compile_2op(sv, &sop, "re", &av);
2209 sop->op_private |= OPpREFCOUNTED;
2210 /* re_dup will OpREFCNT_inc */
2211 OpREFCNT_set(sop, 1);
2214 n = add_data(pRExC_state, 3, "nop");
2215 RExC_rx->data->data[n] = (void*)rop;
2216 RExC_rx->data->data[n+1] = (void*)sop;
2217 RExC_rx->data->data[n+2] = (void*)av;
2220 else { /* First pass */
2221 if (PL_reginterp_cnt < ++RExC_seen_evals
2222 && PL_curcop != &PL_compiling)
2223 /* No compiled RE interpolated, has runtime
2224 components ===> unsafe. */
2225 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2226 if (PL_tainting && PL_tainted)
2227 FAIL("Eval-group in insecure regular expression");
2230 nextchar(pRExC_state);
2232 ret = reg_node(pRExC_state, LOGICAL);
2235 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2236 /* deal with the length of this later - MJD */
2239 return reganode(pRExC_state, EVAL, n);
2241 case '(': /* (?(?{...})...) and (?(?=...)...) */
2243 if (RExC_parse[0] == '?') { /* (?(?...)) */
2244 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2245 || RExC_parse[1] == '<'
2246 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2249 ret = reg_node(pRExC_state, LOGICAL);
2252 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2256 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2258 parno = atoi(RExC_parse++);
2260 while (isDIGIT(*RExC_parse))
2262 ret = reganode(pRExC_state, GROUPP, parno);
2264 if ((c = *nextchar(pRExC_state)) != ')')
2265 vFAIL("Switch condition not recognized");
2267 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2268 br = regbranch(pRExC_state, &flags, 1);
2270 br = reganode(pRExC_state, LONGJMP, 0);
2272 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2273 c = *nextchar(pRExC_state);
2277 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2278 regbranch(pRExC_state, &flags, 1);
2279 regtail(pRExC_state, ret, lastbr);
2282 c = *nextchar(pRExC_state);
2287 vFAIL("Switch (?(condition)... contains too many branches");
2288 ender = reg_node(pRExC_state, TAIL);
2289 regtail(pRExC_state, br, ender);
2291 regtail(pRExC_state, lastbr, ender);
2292 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2295 regtail(pRExC_state, ret, ender);
2299 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2303 RExC_parse--; /* for vFAIL to print correctly */
2304 vFAIL("Sequence (? incomplete");
2308 parse_flags: /* (?i) */
2309 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2310 /* (?g), (?gc) and (?o) are useless here
2311 and must be globally applied -- japhy */
2313 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2314 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2315 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2316 if (! (wastedflags & wflagbit) ) {
2317 wastedflags |= wflagbit;
2320 "Useless (%s%c) - %suse /%c modifier",
2321 flagsp == &negflags ? "?-" : "?",
2323 flagsp == &negflags ? "don't " : "",
2329 else if (*RExC_parse == 'c') {
2330 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2331 if (! (wastedflags & wasted_c) ) {
2332 wastedflags |= wasted_gc;
2335 "Useless (%sc) - %suse /gc modifier",
2336 flagsp == &negflags ? "?-" : "?",
2337 flagsp == &negflags ? "don't " : ""
2342 else { pmflag(flagsp, *RExC_parse); }
2346 if (*RExC_parse == '-') {
2348 wastedflags = 0; /* reset so (?g-c) warns twice */
2352 RExC_flags16 |= posflags;
2353 RExC_flags16 &= ~negflags;
2354 if (*RExC_parse == ':') {
2360 if (*RExC_parse != ')') {
2362 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2364 nextchar(pRExC_state);
2372 ret = reganode(pRExC_state, OPEN, parno);
2373 Set_Node_Length(ret, 1); /* MJD */
2374 Set_Node_Offset(ret, RExC_parse); /* MJD */
2381 /* Pick up the branches, linking them together. */
2382 parse_start = RExC_parse; /* MJD */
2383 br = regbranch(pRExC_state, &flags, 1);
2384 /* branch_len = (paren != 0); */
2388 if (*RExC_parse == '|') {
2389 if (!SIZE_ONLY && RExC_extralen) {
2390 reginsert(pRExC_state, BRANCHJ, br);
2393 reginsert(pRExC_state, BRANCH, br);
2394 Set_Node_Length(br, paren != 0);
2395 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2399 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2401 else if (paren == ':') {
2402 *flagp |= flags&SIMPLE;
2404 if (open) { /* Starts with OPEN. */
2405 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2407 else if (paren != '?') /* Not Conditional */
2409 *flagp |= flags & (SPSTART | HASWIDTH);
2411 while (*RExC_parse == '|') {
2412 if (!SIZE_ONLY && RExC_extralen) {
2413 ender = reganode(pRExC_state, LONGJMP,0);
2414 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2417 RExC_extralen += 2; /* Account for LONGJMP. */
2418 nextchar(pRExC_state);
2419 br = regbranch(pRExC_state, &flags, 0);
2423 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2427 *flagp |= flags&SPSTART;
2430 if (have_branch || paren != ':') {
2431 /* Make a closing node, and hook it on the end. */
2434 ender = reg_node(pRExC_state, TAIL);
2437 ender = reganode(pRExC_state, CLOSE, parno);
2438 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2439 Set_Node_Length(ender,1); /* MJD */
2445 *flagp &= ~HASWIDTH;
2448 ender = reg_node(pRExC_state, SUCCEED);
2451 ender = reg_node(pRExC_state, END);
2454 regtail(pRExC_state, lastbr, ender);
2457 /* Hook the tails of the branches to the closing node. */
2458 for (br = ret; br != NULL; br = regnext(br)) {
2459 regoptail(pRExC_state, br, ender);
2466 static char parens[] = "=!<,>";
2468 if (paren && (p = strchr(parens, paren))) {
2469 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2470 int flag = (p - parens) > 1;
2473 node = SUSPEND, flag = 0;
2474 reginsert(pRExC_state, node,ret);
2476 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2480 /* Check for proper termination. */
2482 RExC_flags16 = oregflags;
2483 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2484 RExC_parse = oregcomp_parse;
2485 vFAIL("Unmatched (");
2488 else if (!paren && RExC_parse < RExC_end) {
2489 if (*RExC_parse == ')') {
2491 vFAIL("Unmatched )");
2494 FAIL("Junk on end of regexp"); /* "Can't happen". */
2502 - regbranch - one alternative of an | operator
2504 * Implements the concatenation operator.
2507 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2509 register regnode *ret;
2510 register regnode *chain = NULL;
2511 register regnode *latest;
2512 I32 flags = 0, c = 0;
2517 if (!SIZE_ONLY && RExC_extralen)
2518 ret = reganode(pRExC_state, BRANCHJ,0);
2520 ret = reg_node(pRExC_state, BRANCH);
2521 Set_Node_Length(ret, 1);
2525 if (!first && SIZE_ONLY)
2526 RExC_extralen += 1; /* BRANCHJ */
2528 *flagp = WORST; /* Tentatively. */
2531 nextchar(pRExC_state);
2532 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2534 latest = regpiece(pRExC_state, &flags);
2535 if (latest == NULL) {
2536 if (flags & TRYAGAIN)
2540 else if (ret == NULL)
2542 *flagp |= flags&HASWIDTH;
2543 if (chain == NULL) /* First piece. */
2544 *flagp |= flags&SPSTART;
2547 regtail(pRExC_state, chain, latest);
2552 if (chain == NULL) { /* Loop ran zero times. */
2553 chain = reg_node(pRExC_state, NOTHING);
2558 *flagp |= flags&SIMPLE;
2565 - regpiece - something followed by possible [*+?]
2567 * Note that the branching code sequences used for ? and the general cases
2568 * of * and + are somewhat optimized: they use the same NOTHING node as
2569 * both the endmarker for their branch list and the body of the last branch.
2570 * It might seem that this node could be dispensed with entirely, but the
2571 * endmarker role is not redundant.
2574 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2576 register regnode *ret;
2578 register char *next;
2580 char *origparse = RExC_parse;
2583 I32 max = REG_INFTY;
2586 ret = regatom(pRExC_state, &flags);
2588 if (flags & TRYAGAIN)
2595 if (op == '{' && regcurly(RExC_parse)) {
2596 parse_start = RExC_parse; /* MJD */
2597 next = RExC_parse + 1;
2599 while (isDIGIT(*next) || *next == ',') {
2608 if (*next == '}') { /* got one */
2612 min = atoi(RExC_parse);
2616 maxpos = RExC_parse;
2618 if (!max && *maxpos != '0')
2619 max = REG_INFTY; /* meaning "infinity" */
2620 else if (max >= REG_INFTY)
2621 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2623 nextchar(pRExC_state);
2626 if ((flags&SIMPLE)) {
2627 RExC_naughty += 2 + RExC_naughty / 2;
2628 reginsert(pRExC_state, CURLY, ret);
2629 Set_Node_Offset(ret, parse_start+1); /* MJD */
2630 Set_Node_Cur_Length(ret);
2633 regnode *w = reg_node(pRExC_state, WHILEM);
2636 regtail(pRExC_state, ret, w);
2637 if (!SIZE_ONLY && RExC_extralen) {
2638 reginsert(pRExC_state, LONGJMP,ret);
2639 reginsert(pRExC_state, NOTHING,ret);
2640 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2642 reginsert(pRExC_state, CURLYX,ret);
2644 Set_Node_Offset(ret, parse_start+1);
2645 Set_Node_Length(ret,
2646 op == '{' ? (RExC_parse - parse_start) : 1);
2648 if (!SIZE_ONLY && RExC_extralen)
2649 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2650 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2652 RExC_whilem_seen++, RExC_extralen += 3;
2653 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2661 if (max && max < min)
2662 vFAIL("Can't do {n,m} with n > m");
2677 #if 0 /* Now runtime fix should be reliable. */
2679 /* if this is reinstated, don't forget to put this back into perldiag:
2681 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2683 (F) The part of the regexp subject to either the * or + quantifier
2684 could match an empty string. The {#} shows in the regular
2685 expression about where the problem was discovered.
2689 if (!(flags&HASWIDTH) && op != '?')
2690 vFAIL("Regexp *+ operand could be empty");
2693 parse_start = RExC_parse;
2694 nextchar(pRExC_state);
2696 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2698 if (op == '*' && (flags&SIMPLE)) {
2699 reginsert(pRExC_state, STAR, ret);
2703 else if (op == '*') {
2707 else if (op == '+' && (flags&SIMPLE)) {
2708 reginsert(pRExC_state, PLUS, ret);
2712 else if (op == '+') {
2716 else if (op == '?') {
2721 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2723 "%.*s matches null string many times",
2724 RExC_parse - origparse,
2728 if (*RExC_parse == '?') {
2729 nextchar(pRExC_state);
2730 reginsert(pRExC_state, MINMOD, ret);
2731 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2733 if (ISMULT2(RExC_parse)) {
2735 vFAIL("Nested quantifiers");
2742 - regatom - the lowest level
2744 * Optimization: gobbles an entire sequence of ordinary characters so that
2745 * it can turn them into a single node, which is smaller to store and
2746 * faster to run. Backslashed characters are exceptions, each becoming a
2747 * separate node; the code is simpler that way and it's not worth fixing.
2749 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2751 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2753 register regnode *ret = 0;
2755 char *parse_start = 0;
2757 *flagp = WORST; /* Tentatively. */
2760 switch (*RExC_parse) {
2762 RExC_seen_zerolen++;
2763 nextchar(pRExC_state);
2764 if (RExC_flags16 & PMf_MULTILINE)
2765 ret = reg_node(pRExC_state, MBOL);
2766 else if (RExC_flags16 & PMf_SINGLELINE)
2767 ret = reg_node(pRExC_state, SBOL);
2769 ret = reg_node(pRExC_state, BOL);
2770 Set_Node_Length(ret, 1); /* MJD */
2773 nextchar(pRExC_state);
2775 RExC_seen_zerolen++;
2776 if (RExC_flags16 & PMf_MULTILINE)
2777 ret = reg_node(pRExC_state, MEOL);
2778 else if (RExC_flags16 & PMf_SINGLELINE)
2779 ret = reg_node(pRExC_state, SEOL);
2781 ret = reg_node(pRExC_state, EOL);
2782 Set_Node_Length(ret, 1); /* MJD */
2785 nextchar(pRExC_state);
2786 if (RExC_flags16 & PMf_SINGLELINE)
2787 ret = reg_node(pRExC_state, SANY);
2789 ret = reg_node(pRExC_state, REG_ANY);
2790 *flagp |= HASWIDTH|SIMPLE;
2792 Set_Node_Length(ret, 1); /* MJD */
2796 char *oregcomp_parse = ++RExC_parse;
2797 ret = regclass(pRExC_state);
2798 if (*RExC_parse != ']') {
2799 RExC_parse = oregcomp_parse;
2800 vFAIL("Unmatched [");
2802 nextchar(pRExC_state);
2803 *flagp |= HASWIDTH|SIMPLE;
2804 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2808 nextchar(pRExC_state);
2809 ret = reg(pRExC_state, 1, &flags);
2811 if (flags & TRYAGAIN) {
2812 if (RExC_parse == RExC_end) {
2813 /* Make parent create an empty node if needed. */
2821 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2825 if (flags & TRYAGAIN) {
2829 vFAIL("Internal urp");
2830 /* Supposed to be caught earlier. */
2833 if (!regcurly(RExC_parse)) {
2842 vFAIL("Quantifier follows nothing");
2845 switch (*++RExC_parse) {
2847 RExC_seen_zerolen++;
2848 ret = reg_node(pRExC_state, SBOL);
2850 nextchar(pRExC_state);
2851 Set_Node_Length(ret, 2); /* MJD */
2854 ret = reg_node(pRExC_state, GPOS);
2855 RExC_seen |= REG_SEEN_GPOS;
2857 nextchar(pRExC_state);
2858 Set_Node_Length(ret, 2); /* MJD */
2861 ret = reg_node(pRExC_state, SEOL);
2863 RExC_seen_zerolen++; /* Do not optimize RE away */
2864 nextchar(pRExC_state);
2867 ret = reg_node(pRExC_state, EOS);
2869 RExC_seen_zerolen++; /* Do not optimize RE away */
2870 nextchar(pRExC_state);
2871 Set_Node_Length(ret, 2); /* MJD */
2874 ret = reg_node(pRExC_state, CANY);
2875 RExC_seen |= REG_SEEN_CANY;
2876 *flagp |= HASWIDTH|SIMPLE;
2877 nextchar(pRExC_state);
2878 Set_Node_Length(ret, 2); /* MJD */
2881 ret = reg_node(pRExC_state, CLUMP);
2883 nextchar(pRExC_state);
2884 Set_Node_Length(ret, 2); /* MJD */
2887 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
2888 *flagp |= HASWIDTH|SIMPLE;
2889 nextchar(pRExC_state);
2890 Set_Node_Length(ret, 2); /* MJD */
2893 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
2894 *flagp |= HASWIDTH|SIMPLE;
2895 nextchar(pRExC_state);
2896 Set_Node_Length(ret, 2); /* MJD */
2899 RExC_seen_zerolen++;
2900 RExC_seen |= REG_SEEN_LOOKBEHIND;
2901 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
2903 nextchar(pRExC_state);
2904 Set_Node_Length(ret, 2); /* MJD */
2907 RExC_seen_zerolen++;
2908 RExC_seen |= REG_SEEN_LOOKBEHIND;
2909 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
2911 nextchar(pRExC_state);
2912 Set_Node_Length(ret, 2); /* MJD */
2915 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
2916 *flagp |= HASWIDTH|SIMPLE;
2917 nextchar(pRExC_state);
2918 Set_Node_Length(ret, 2); /* MJD */
2921 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
2922 *flagp |= HASWIDTH|SIMPLE;
2923 nextchar(pRExC_state);
2924 Set_Node_Length(ret, 2); /* MJD */
2927 ret = reg_node(pRExC_state, DIGIT);
2928 *flagp |= HASWIDTH|SIMPLE;
2929 nextchar(pRExC_state);
2930 Set_Node_Length(ret, 2); /* MJD */
2933 ret = reg_node(pRExC_state, NDIGIT);
2934 *flagp |= HASWIDTH|SIMPLE;
2935 nextchar(pRExC_state);
2936 Set_Node_Length(ret, 2); /* MJD */
2941 char* oldregxend = RExC_end;
2942 char* parse_start = RExC_parse;
2944 if (RExC_parse[1] == '{') {
2945 /* a lovely hack--pretend we saw [\pX] instead */
2946 RExC_end = strchr(RExC_parse, '}');
2948 U8 c = (U8)*RExC_parse;
2950 RExC_end = oldregxend;
2951 vFAIL2("Missing right brace on \\%c{}", c);
2956 RExC_end = RExC_parse + 2;
2959 ret = regclass(pRExC_state);
2961 RExC_end = oldregxend;
2963 Set_Node_Cur_Length(ret); /* MJD */
2964 nextchar(pRExC_state);
2965 *flagp |= HASWIDTH|SIMPLE;
2978 case '1': case '2': case '3': case '4':
2979 case '5': case '6': case '7': case '8': case '9':
2981 I32 num = atoi(RExC_parse);
2983 if (num > 9 && num >= RExC_npar)
2986 char * parse_start = RExC_parse - 1; /* MJD */
2987 while (isDIGIT(*RExC_parse))
2990 if (!SIZE_ONLY && num > RExC_rx->nparens)
2991 vFAIL("Reference to nonexistent group");
2993 ret = reganode(pRExC_state, FOLD
2994 ? (LOC ? REFFL : REFF)
2998 /* override incorrect value set in reganode MJD */
2999 Set_Node_Offset(ret, parse_start+1);
3000 Set_Node_Cur_Length(ret); /* MJD */
3002 nextchar(pRExC_state);
3007 if (RExC_parse >= RExC_end)
3008 FAIL("Trailing \\");
3011 /* Do not generate `unrecognized' warnings here, we fall
3012 back into the quick-grab loop below */
3018 if (RExC_flags16 & PMf_EXTENDED) {
3019 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3020 if (RExC_parse < RExC_end)
3026 register STRLEN len;
3032 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3034 parse_start = RExC_parse - 1;
3039 ret = reg_node(pRExC_state, FOLD
3040 ? (LOC ? EXACTFL : EXACTF)
3043 for (len = 0, p = RExC_parse - 1;
3044 len < 127 && p < RExC_end;
3049 if (RExC_flags16 & PMf_EXTENDED)
3050 p = regwhite(p, RExC_end);
3097 ender = ASCII_TO_NATIVE('\033');
3101 ender = ASCII_TO_NATIVE('\007');
3106 char* e = strchr(p, '}');
3110 vFAIL("Missing right brace on \\x{}");
3113 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3114 | PERL_SCAN_DISALLOW_PREFIX;
3116 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3119 /* numlen is generous */
3120 if (numlen + len >= 127) {
3128 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3130 ender = grok_hex(p, &numlen, &flags, NULL);
3136 ender = UCHARAT(p++);
3137 ender = toCTRL(ender);
3139 case '0': case '1': case '2': case '3':case '4':
3140 case '5': case '6': case '7': case '8':case '9':
3142 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3145 ender = grok_oct(p, &numlen, &flags, NULL);
3155 FAIL("Trailing \\");
3158 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3159 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3160 goto normal_default;
3165 if (UTF8_IS_START(*p) && UTF) {
3166 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3174 if (RExC_flags16 & PMf_EXTENDED)
3175 p = regwhite(p, RExC_end);
3177 /* Prime the casefolded buffer. */
3178 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3180 if (ISMULT2(p)) { /* Back off on ?+*. */
3187 /* Emit all the Unicode characters. */
3188 for (foldbuf = tmpbuf;
3190 foldlen -= numlen) {
3191 ender = utf8_to_uvchr(foldbuf, &numlen);
3193 reguni(pRExC_state, ender, s, &unilen);
3196 /* In EBCDIC the numlen
3197 * and unilen can differ. */
3199 if (numlen >= foldlen)
3203 break; /* "Can't happen." */
3207 reguni(pRExC_state, ender, s, &unilen);
3224 /* Emit all the Unicode characters. */
3225 for (foldbuf = tmpbuf;
3227 foldlen -= numlen) {
3228 ender = utf8_to_uvchr(foldbuf, &numlen);
3230 reguni(pRExC_state, ender, s, &unilen);
3233 /* In EBCDIC the numlen
3234 * and unilen can differ. */
3236 if (numlen >= foldlen)
3244 reguni(pRExC_state, ender, s, &unilen);
3257 Set_Node_Cur_Length(ret); /* MJD */
3258 nextchar(pRExC_state);
3260 /* len is STRLEN which is unsigned, need to copy to signed */
3263 vFAIL("Internal disaster");
3272 RExC_size += STR_SZ(len);
3274 RExC_emit += STR_SZ(len);
3279 /* If the encoding pragma is in effect recode the text of
3280 * any EXACT-kind nodes. */
3281 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3282 STRLEN oldlen = STR_LEN(ret);
3283 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3287 if (sv_utf8_downgrade(sv, TRUE)) {
3288 char *s = sv_recode_to_utf8(sv, PL_encoding);
3289 STRLEN newlen = SvCUR(sv);
3292 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3293 (int)oldlen, STRING(ret),
3295 Copy(s, STRING(ret), newlen, char);
3296 STR_LEN(ret) += newlen - oldlen;
3297 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3299 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3307 S_regwhite(pTHX_ char *p, char *e)
3312 else if (*p == '#') {
3315 } while (p < e && *p != '\n');
3323 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3324 Character classes ([:foo:]) can also be negated ([:^foo:]).
3325 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3326 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3327 but trigger failures because they are currently unimplemented. */
3329 #define POSIXCC_DONE(c) ((c) == ':')
3330 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3331 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3334 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3337 I32 namedclass = OOB_NAMEDCLASS;
3339 if (value == '[' && RExC_parse + 1 < RExC_end &&
3340 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3341 POSIXCC(UCHARAT(RExC_parse))) {
3342 char c = UCHARAT(RExC_parse);
3343 char* s = RExC_parse++;
3345 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3347 if (RExC_parse == RExC_end)
3348 /* Grandfather lone [:, [=, [. */
3351 char* t = RExC_parse++; /* skip over the c */
3353 if (UCHARAT(RExC_parse) == ']') {
3354 RExC_parse++; /* skip over the ending ] */
3357 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3358 I32 skip = 5; /* the most common skip */
3362 if (strnEQ(posixcc, "alnum", 5))
3364 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3365 else if (strnEQ(posixcc, "alpha", 5))
3367 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3368 else if (strnEQ(posixcc, "ascii", 5))
3370 complement ? ANYOF_NASCII : ANYOF_ASCII;
3373 if (strnEQ(posixcc, "blank", 5))
3375 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3378 if (strnEQ(posixcc, "cntrl", 5))
3380 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3383 if (strnEQ(posixcc, "digit", 5))
3385 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3388 if (strnEQ(posixcc, "graph", 5))
3390 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3393 if (strnEQ(posixcc, "lower", 5))
3395 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3398 if (strnEQ(posixcc, "print", 5))
3400 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3401 else if (strnEQ(posixcc, "punct", 5))
3403 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3406 if (strnEQ(posixcc, "space", 5))
3408 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3411 if (strnEQ(posixcc, "upper", 5))
3413 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3415 case 'w': /* this is not POSIX, this is the Perl \w */
3416 if (strnEQ(posixcc, "word", 4)) {
3418 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3423 if (strnEQ(posixcc, "xdigit", 6)) {
3425 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3430 if (namedclass == OOB_NAMEDCLASS ||
3431 posixcc[skip] != ':' ||
3432 posixcc[skip+1] != ']')
3434 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3437 } else if (!SIZE_ONLY) {
3438 /* [[=foo=]] and [[.foo.]] are still future. */
3440 /* adjust RExC_parse so the warning shows after
3442 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3444 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3447 /* Maternal grandfather:
3448 * "[:" ending in ":" but not in ":]" */
3458 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3460 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3461 char *s = RExC_parse;
3464 while(*s && isALNUM(*s))
3466 if (*s && c == *s && s[1] == ']') {
3467 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3469 /* [[=foo=]] and [[.foo.]] are still future. */
3470 if (POSIXCC_NOTYET(c)) {
3471 /* adjust RExC_parse so the error shows after
3473 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3475 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3482 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3485 register UV nextvalue;
3486 register IV prevvalue = OOB_UNICODE;
3487 register IV range = 0;
3488 register regnode *ret;
3491 char *rangebegin = 0;
3492 bool need_class = 0;
3493 SV *listsv = Nullsv;
3496 bool optimize_invert = TRUE;
3497 AV* unicode_alternate = 0;
3499 ret = reganode(pRExC_state, ANYOF, 0);
3502 ANYOF_FLAGS(ret) = 0;
3504 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3508 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3512 RExC_size += ANYOF_SKIP;
3514 RExC_emit += ANYOF_SKIP;
3516 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3518 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3519 ANYOF_BITMAP_ZERO(ret);
3520 listsv = newSVpvn("# comment\n", 10);
3523 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3525 if (!SIZE_ONLY && POSIXCC(nextvalue))
3526 checkposixcc(pRExC_state);
3528 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3529 if (UCHARAT(RExC_parse) == ']')
3532 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3536 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3539 rangebegin = RExC_parse;
3541 value = utf8n_to_uvchr((U8*)RExC_parse,
3542 RExC_end - RExC_parse,
3544 RExC_parse += numlen;
3547 value = UCHARAT(RExC_parse++);
3548 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3549 if (value == '[' && POSIXCC(nextvalue))
3550 namedclass = regpposixcc(pRExC_state, value);
3551 else if (value == '\\') {
3553 value = utf8n_to_uvchr((U8*)RExC_parse,
3554 RExC_end - RExC_parse,
3556 RExC_parse += numlen;
3559 value = UCHARAT(RExC_parse++);
3560 /* Some compilers cannot handle switching on 64-bit integer
3561 * values, therefore value cannot be an UV. Yes, this will
3562 * be a problem later if we want switch on Unicode.
3563 * A similar issue a little bit later when switching on
3564 * namedclass. --jhi */
3565 switch ((I32)value) {
3566 case 'w': namedclass = ANYOF_ALNUM; break;
3567 case 'W': namedclass = ANYOF_NALNUM; break;
3568 case 's': namedclass = ANYOF_SPACE; break;
3569 case 'S': namedclass = ANYOF_NSPACE; break;
3570 case 'd': namedclass = ANYOF_DIGIT; break;
3571 case 'D': namedclass = ANYOF_NDIGIT; break;
3574 if (*RExC_parse == '{') {
3576 e = strchr(RExC_parse++, '}');
3578 vFAIL2("Missing right brace on \\%c{}", c);
3579 while (isSPACE(UCHARAT(RExC_parse)))
3581 if (e == RExC_parse)
3582 vFAIL2("Empty \\%c{}", c);
3584 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3592 if (UCHARAT(RExC_parse) == '^') {
3595 value = value == 'p' ? 'P' : 'p'; /* toggle */
3596 while (isSPACE(UCHARAT(RExC_parse))) {
3602 Perl_sv_catpvf(aTHX_ listsv,
3603 "+utf8::%.*s\n", (int)n, RExC_parse);
3605 Perl_sv_catpvf(aTHX_ listsv,
3606 "!utf8::%.*s\n", (int)n, RExC_parse);
3609 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3611 case 'n': value = '\n'; break;
3612 case 'r': value = '\r'; break;
3613 case 't': value = '\t'; break;
3614 case 'f': value = '\f'; break;
3615 case 'b': value = '\b'; break;
3616 case 'e': value = ASCII_TO_NATIVE('\033');break;
3617 case 'a': value = ASCII_TO_NATIVE('\007');break;
3619 if (*RExC_parse == '{') {
3620 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3621 | PERL_SCAN_DISALLOW_PREFIX;
3622 e = strchr(RExC_parse++, '}');
3624 vFAIL("Missing right brace on \\x{}");
3626 numlen = e - RExC_parse;
3627 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3631 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3633 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3634 RExC_parse += numlen;
3638 value = UCHARAT(RExC_parse++);
3639 value = toCTRL(value);
3641 case '0': case '1': case '2': case '3': case '4':
3642 case '5': case '6': case '7': case '8': case '9':
3646 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3647 RExC_parse += numlen;
3651 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3653 "Unrecognized escape \\%c in character class passed through",
3657 } /* end of \blah */
3659 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3661 if (!SIZE_ONLY && !need_class)
3662 ANYOF_CLASS_ZERO(ret);
3666 /* a bad range like a-\d, a-[:digit:] ? */
3669 if (ckWARN(WARN_REGEXP))
3671 "False [] range \"%*.*s\"",
3672 RExC_parse - rangebegin,
3673 RExC_parse - rangebegin,
3675 if (prevvalue < 256) {
3676 ANYOF_BITMAP_SET(ret, prevvalue);
3677 ANYOF_BITMAP_SET(ret, '-');
3680 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3681 Perl_sv_catpvf(aTHX_ listsv,
3682 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3686 range = 0; /* this was not a true range */
3690 if (namedclass > OOB_NAMEDCLASS)
3691 optimize_invert = FALSE;
3692 /* Possible truncation here but in some 64-bit environments
3693 * the compiler gets heartburn about switch on 64-bit values.
3694 * A similar issue a little earlier when switching on value.
3696 switch ((I32)namedclass) {
3699 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3701 for (value = 0; value < 256; value++)
3703 ANYOF_BITMAP_SET(ret, value);
3705 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
3709 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3711 for (value = 0; value < 256; value++)
3712 if (!isALNUM(value))
3713 ANYOF_BITMAP_SET(ret, value);
3715 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3719 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3721 for (value = 0; value < 256; value++)
3722 if (isALNUMC(value))
3723 ANYOF_BITMAP_SET(ret, value);
3725 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3729 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3731 for (value = 0; value < 256; value++)
3732 if (!isALNUMC(value))
3733 ANYOF_BITMAP_SET(ret, value);
3735 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3739 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3741 for (value = 0; value < 256; value++)
3743 ANYOF_BITMAP_SET(ret, value);
3745 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3749 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3751 for (value = 0; value < 256; value++)
3752 if (!isALPHA(value))
3753 ANYOF_BITMAP_SET(ret, value);
3755 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3759 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3762 for (value = 0; value < 128; value++)
3763 ANYOF_BITMAP_SET(ret, value);
3765 for (value = 0; value < 256; value++) {
3767 ANYOF_BITMAP_SET(ret, value);
3771 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3775 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3778 for (value = 128; value < 256; value++)
3779 ANYOF_BITMAP_SET(ret, value);
3781 for (value = 0; value < 256; value++) {
3782 if (!isASCII(value))
3783 ANYOF_BITMAP_SET(ret, value);
3787 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3791 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3793 for (value = 0; value < 256; value++)
3795 ANYOF_BITMAP_SET(ret, value);
3797 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3801 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3803 for (value = 0; value < 256; value++)
3804 if (!isBLANK(value))
3805 ANYOF_BITMAP_SET(ret, value);
3807 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3811 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3813 for (value = 0; value < 256; value++)
3815 ANYOF_BITMAP_SET(ret, value);
3817 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3821 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3823 for (value = 0; value < 256; value++)
3824 if (!isCNTRL(value))
3825 ANYOF_BITMAP_SET(ret, value);
3827 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3831 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3833 /* consecutive digits assumed */
3834 for (value = '0'; value <= '9'; value++)
3835 ANYOF_BITMAP_SET(ret, value);
3837 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3841 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3843 /* consecutive digits assumed */
3844 for (value = 0; value < '0'; value++)
3845 ANYOF_BITMAP_SET(ret, value);
3846 for (value = '9' + 1; value < 256; value++)
3847 ANYOF_BITMAP_SET(ret, value);
3849 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3853 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3855 for (value = 0; value < 256; value++)
3857 ANYOF_BITMAP_SET(ret, value);
3859 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3863 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3865 for (value = 0; value < 256; value++)
3866 if (!isGRAPH(value))
3867 ANYOF_BITMAP_SET(ret, value);
3869 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3873 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3875 for (value = 0; value < 256; value++)
3877 ANYOF_BITMAP_SET(ret, value);
3879 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3883 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3885 for (value = 0; value < 256; value++)
3886 if (!isLOWER(value))
3887 ANYOF_BITMAP_SET(ret, value);
3889 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3893 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3895 for (value = 0; value < 256; value++)
3897 ANYOF_BITMAP_SET(ret, value);
3899 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3903 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3905 for (value = 0; value < 256; value++)
3906 if (!isPRINT(value))
3907 ANYOF_BITMAP_SET(ret, value);
3909 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3913 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3915 for (value = 0; value < 256; value++)
3916 if (isPSXSPC(value))
3917 ANYOF_BITMAP_SET(ret, value);
3919 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3923 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3925 for (value = 0; value < 256; value++)
3926 if (!isPSXSPC(value))
3927 ANYOF_BITMAP_SET(ret, value);
3929 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3933 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3935 for (value = 0; value < 256; value++)
3937 ANYOF_BITMAP_SET(ret, value);
3939 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3943 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3945 for (value = 0; value < 256; value++)
3946 if (!isPUNCT(value))
3947 ANYOF_BITMAP_SET(ret, value);
3949 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3953 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3955 for (value = 0; value < 256; value++)
3957 ANYOF_BITMAP_SET(ret, value);
3959 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3963 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3965 for (value = 0; value < 256; value++)
3966 if (!isSPACE(value))
3967 ANYOF_BITMAP_SET(ret, value);
3969 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
3973 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
3975 for (value = 0; value < 256; value++)
3977 ANYOF_BITMAP_SET(ret, value);
3979 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
3983 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
3985 for (value = 0; value < 256; value++)
3986 if (!isUPPER(value))
3987 ANYOF_BITMAP_SET(ret, value);
3989 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
3993 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
3995 for (value = 0; value < 256; value++)
3996 if (isXDIGIT(value))
3997 ANYOF_BITMAP_SET(ret, value);
3999 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4003 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4005 for (value = 0; value < 256; value++)
4006 if (!isXDIGIT(value))
4007 ANYOF_BITMAP_SET(ret, value);
4009 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4012 vFAIL("Invalid [::] class");
4016 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4019 } /* end of namedclass \blah */
4022 if (prevvalue > value) /* b-a */ {
4023 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4024 RExC_parse - rangebegin,
4025 RExC_parse - rangebegin,
4027 range = 0; /* not a valid range */
4031 prevvalue = value; /* save the beginning of the range */
4032 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4033 RExC_parse[1] != ']') {
4036 /* a bad range like \w-, [:word:]- ? */
4037 if (namedclass > OOB_NAMEDCLASS) {
4038 if (ckWARN(WARN_REGEXP))
4040 "False [] range \"%*.*s\"",
4041 RExC_parse - rangebegin,
4042 RExC_parse - rangebegin,
4045 ANYOF_BITMAP_SET(ret, '-');
4047 range = 1; /* yeah, it's a range! */
4048 continue; /* but do it the next time */
4052 /* now is the next time */
4056 if (prevvalue < 256) {
4057 IV ceilvalue = value < 256 ? value : 255;
4060 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4061 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
4063 if (isLOWER(prevvalue)) {
4064 for (i = prevvalue; i <= ceilvalue; i++)
4066 ANYOF_BITMAP_SET(ret, i);
4068 for (i = prevvalue; i <= ceilvalue; i++)
4070 ANYOF_BITMAP_SET(ret, i);
4075 for (i = prevvalue; i <= ceilvalue; i++)
4076 ANYOF_BITMAP_SET(ret, i);
4078 if (value > 255 || UTF) {
4079 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4080 UV natvalue = NATIVE_TO_UNI(value);
4082 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4083 if (prevnatvalue < natvalue) { /* what about > ? */
4084 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4085 prevnatvalue, natvalue);
4087 else if (prevnatvalue == natvalue) {
4088 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4090 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4092 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4094 /* If folding and foldable and a single
4095 * character, insert also the folded version
4096 * to the charclass. */
4098 if (foldlen == UNISKIP(f))
4099 Perl_sv_catpvf(aTHX_ listsv,
4102 /* Any multicharacter foldings
4103 * require the following transform:
4104 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4105 * where E folds into "pq" and F folds
4106 * into "rst", all other characters
4107 * fold to single characters. We save
4108 * away these multicharacter foldings,
4109 * to be later saved as part of the
4110 * additional "s" data. */
4113 if (!unicode_alternate)
4114 unicode_alternate = newAV();
4115 sv = newSVpvn((char*)foldbuf, foldlen);
4117 av_push(unicode_alternate, sv);
4121 /* If folding and the value is one of the Greek
4122 * sigmas insert a few more sigmas to make the
4123 * folding rules of the sigmas to work right.
4124 * Note that not all the possible combinations
4125 * are handled here: some of them are handled
4126 * by the standard folding rules, and some of
4127 * them (literal or EXACTF cases) are handled
4128 * during runtime in regexec.c:S_find_byclass(). */
4129 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4130 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4131 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4132 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4133 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4135 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4136 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4137 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4143 range = 0; /* this range (if it was one) is done now */
4147 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4149 RExC_size += ANYOF_CLASS_ADD_SKIP;
4151 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4154 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4156 /* If the only flag is folding (plus possibly inversion). */
4157 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4159 for (value = 0; value < 256; ++value) {
4160 if (ANYOF_BITMAP_TEST(ret, value)) {
4161 IV fold = PL_fold[value];
4164 ANYOF_BITMAP_SET(ret, fold);
4167 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4170 /* optimize inverted simple patterns (e.g. [^a-z]) */
4171 if (!SIZE_ONLY && optimize_invert &&
4172 /* If the only flag is inversion. */
4173 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4174 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4175 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4176 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4183 /* The 0th element stores the character class description
4184 * in its textual form: used later (regexec.c:Perl_regclass_swatch())
4185 * to initialize the appropriate swash (which gets stored in
4186 * the 1st element), and also useful for dumping the regnode.
4187 * The 2nd element stores the multicharacter foldings,
4188 * used later (regexec.c:s_reginclasslen()). */
4189 av_store(av, 0, listsv);
4190 av_store(av, 1, NULL);
4191 av_store(av, 2, (SV*)unicode_alternate);
4192 rv = newRV_noinc((SV*)av);
4193 n = add_data(pRExC_state, 1, "s");
4194 RExC_rx->data->data[n] = (void*)rv;
4202 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4204 char* retval = RExC_parse++;
4207 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4208 RExC_parse[2] == '#') {
4209 while (*RExC_parse && *RExC_parse != ')')
4214 if (RExC_flags16 & PMf_EXTENDED) {
4215 if (isSPACE(*RExC_parse)) {
4219 else if (*RExC_parse == '#') {
4220 while (*RExC_parse && *RExC_parse != '\n')
4231 - reg_node - emit a node
4233 STATIC regnode * /* Location. */
4234 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4236 register regnode *ret;
4237 register regnode *ptr;
4241 SIZE_ALIGN(RExC_size);
4246 NODE_ALIGN_FILL(ret);
4248 FILL_ADVANCE_NODE(ptr, op);
4249 if (RExC_offsets) { /* MJD */
4250 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4251 "reg_node", __LINE__,
4253 RExC_emit - RExC_emit_start > RExC_offsets[0]
4254 ? "Overwriting end of array!\n" : "OK",
4255 RExC_emit - RExC_emit_start,
4256 RExC_parse - RExC_start,
4258 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4267 - reganode - emit a node with an argument
4269 STATIC regnode * /* Location. */
4270 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4272 register regnode *ret;
4273 register regnode *ptr;
4277 SIZE_ALIGN(RExC_size);
4282 NODE_ALIGN_FILL(ret);
4284 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4285 if (RExC_offsets) { /* MJD */
4286 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4288 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4289 "Overwriting end of array!\n" : "OK",
4290 RExC_emit - RExC_emit_start,
4291 RExC_parse - RExC_start,
4293 Set_Cur_Node_Offset;
4302 - reguni - emit (if appropriate) a Unicode character
4305 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4307 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4311 - reginsert - insert an operator in front of already-emitted operand
4313 * Means relocating the operand.
4316 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4318 register regnode *src;
4319 register regnode *dst;
4320 register regnode *place;
4321 register int offset = regarglen[(U8)op];
4323 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4326 RExC_size += NODE_STEP_REGNODE + offset;
4331 RExC_emit += NODE_STEP_REGNODE + offset;
4333 while (src > opnd) {
4334 StructCopy(--src, --dst, regnode);
4335 if (RExC_offsets) { /* MJD 20010112 */
4336 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4338 dst - RExC_emit_start > RExC_offsets[0]
4339 ? "Overwriting end of array!\n" : "OK",
4340 src - RExC_emit_start,
4341 dst - RExC_emit_start,
4343 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4344 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4349 place = opnd; /* Op node, where operand used to be. */
4350 if (RExC_offsets) { /* MJD */
4351 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4353 place - RExC_emit_start > RExC_offsets[0]
4354 ? "Overwriting end of array!\n" : "OK",
4355 place - RExC_emit_start,
4356 RExC_parse - RExC_start,
4358 Set_Node_Offset(place, RExC_parse);
4360 src = NEXTOPER(place);
4361 FILL_ADVANCE_NODE(place, op);
4362 Zero(src, offset, regnode);
4366 - regtail - set the next-pointer at the end of a node chain of p to val.
4369 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4371 register regnode *scan;
4372 register regnode *temp;
4377 /* Find last node. */
4380 temp = regnext(scan);
4386 if (reg_off_by_arg[OP(scan)]) {
4387 ARG_SET(scan, val - scan);
4390 NEXT_OFF(scan) = val - scan;
4395 - regoptail - regtail on operand of first argument; nop if operandless
4398 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4400 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4401 if (p == NULL || SIZE_ONLY)
4403 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4404 regtail(pRExC_state, NEXTOPER(p), val);
4406 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4407 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4414 - regcurly - a little FSA that accepts {\d+,?\d*}
4417 S_regcurly(pTHX_ register char *s)
4438 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4440 register U8 op = EXACT; /* Arbitrary non-END op. */
4441 register regnode *next;
4443 while (op != END && (!last || node < last)) {
4444 /* While that wasn't END last time... */
4450 next = regnext(node);
4452 if (OP(node) == OPTIMIZED)
4455 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4456 (int)(2*l + 1), "", SvPVX(sv));
4457 if (next == NULL) /* Next ptr. */
4458 PerlIO_printf(Perl_debug_log, "(0)");
4460 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4461 (void)PerlIO_putc(Perl_debug_log, '\n');
4463 if (PL_regkind[(U8)op] == BRANCHJ) {
4464 register regnode *nnode = (OP(next) == LONGJMP
4467 if (last && nnode > last)
4469 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4471 else if (PL_regkind[(U8)op] == BRANCH) {
4472 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4474 else if ( op == CURLY) { /* `next' might be very big: optimizer */
4475 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4476 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4478 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4479 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4482 else if ( op == PLUS || op == STAR) {
4483 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4485 else if (op == ANYOF) {
4486 /* arglen 1 + class block */
4487 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4488 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4489 node = NEXTOPER(node);
4491 else if (PL_regkind[(U8)op] == EXACT) {
4492 /* Literal string, where present. */
4493 node += NODE_SZ_STR(node) - 1;
4494 node = NEXTOPER(node);
4497 node = NEXTOPER(node);
4498 node += regarglen[(U8)op];
4500 if (op == CURLYX || op == OPEN)
4502 else if (op == WHILEM)
4508 #endif /* DEBUGGING */
4511 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4514 Perl_regdump(pTHX_ regexp *r)
4517 SV *sv = sv_newmortal();
4519 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4521 /* Header fields of interest. */
4522 if (r->anchored_substr)
4523 PerlIO_printf(Perl_debug_log,
4524 "anchored `%s%.*s%s'%s at %"IVdf" ",
4526 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4527 SvPVX(r->anchored_substr),
4529 SvTAIL(r->anchored_substr) ? "$" : "",
4530 (IV)r->anchored_offset);
4531 if (r->float_substr)
4532 PerlIO_printf(Perl_debug_log,
4533 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4535 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4536 SvPVX(r->float_substr),
4538 SvTAIL(r->float_substr) ? "$" : "",
4539 (IV)r->float_min_offset, (UV)r->float_max_offset);
4540 if (r->check_substr)
4541 PerlIO_printf(Perl_debug_log,
4542 r->check_substr == r->float_substr
4543 ? "(checking floating" : "(checking anchored");
4544 if (r->reganch & ROPT_NOSCAN)
4545 PerlIO_printf(Perl_debug_log, " noscan");
4546 if (r->reganch & ROPT_CHECK_ALL)
4547 PerlIO_printf(Perl_debug_log, " isall");
4548 if (r->check_substr)
4549 PerlIO_printf(Perl_debug_log, ") ");
4551 if (r->regstclass) {
4552 regprop(sv, r->regstclass);
4553 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4555 if (r->reganch & ROPT_ANCH) {
4556 PerlIO_printf(Perl_debug_log, "anchored");
4557 if (r->reganch & ROPT_ANCH_BOL)
4558 PerlIO_printf(Perl_debug_log, "(BOL)");
4559 if (r->reganch & ROPT_ANCH_MBOL)
4560 PerlIO_printf(Perl_debug_log, "(MBOL)");
4561 if (r->reganch & ROPT_ANCH_SBOL)
4562 PerlIO_printf(Perl_debug_log, "(SBOL)");
4563 if (r->reganch & ROPT_ANCH_GPOS)
4564 PerlIO_printf(Perl_debug_log, "(GPOS)");
4565 PerlIO_putc(Perl_debug_log, ' ');
4567 if (r->reganch & ROPT_GPOS_SEEN)
4568 PerlIO_printf(Perl_debug_log, "GPOS ");
4569 if (r->reganch & ROPT_SKIP)
4570 PerlIO_printf(Perl_debug_log, "plus ");
4571 if (r->reganch & ROPT_IMPLICIT)
4572 PerlIO_printf(Perl_debug_log, "implicit ");
4573 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4574 if (r->reganch & ROPT_EVAL_SEEN)
4575 PerlIO_printf(Perl_debug_log, "with eval ");
4576 PerlIO_printf(Perl_debug_log, "\n");
4579 U32 len = r->offsets[0];
4580 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4581 for (i = 1; i <= len; i++)
4582 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4583 (UV)r->offsets[i*2-1],
4584 (UV)r->offsets[i*2]);
4585 PerlIO_printf(Perl_debug_log, "\n");
4587 #endif /* DEBUGGING */
4593 S_put_byte(pTHX_ SV *sv, int c)
4595 if (isCNTRL(c) || c == 255 || !isPRINT(c))
4596 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4597 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4598 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4600 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4603 #endif /* DEBUGGING */
4606 - regprop - printable representation of opcode
4609 Perl_regprop(pTHX_ SV *sv, regnode *o)
4614 sv_setpvn(sv, "", 0);
4615 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4616 /* It would be nice to FAIL() here, but this may be called from
4617 regexec.c, and it would be hard to supply pRExC_state. */
4618 Perl_croak(aTHX_ "Corrupted regexp opcode");
4619 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4621 k = PL_regkind[(U8)OP(o)];
4624 SV *dsv = sv_2mortal(newSVpvn("", 0));
4625 /* Using is_utf8_string() is a crude hack but it may
4626 * be the best for now since we have no flag "this EXACTish
4627 * node was UTF-8" --jhi */
4628 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4630 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4631 UNI_DISPLAY_REGEX) :
4636 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4641 else if (k == CURLY) {
4642 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4643 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4644 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4646 else if (k == WHILEM && o->flags) /* Ordinal/of */
4647 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4648 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4649 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4650 else if (k == LOGICAL)
4651 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4652 else if (k == ANYOF) {
4653 int i, rangestart = -1;
4654 U8 flags = ANYOF_FLAGS(o);
4655 const char * const anyofs[] = { /* Should be syncronized with
4656 * ANYOF_ #xdefines in regcomp.h */
4689 if (flags & ANYOF_LOCALE)
4690 sv_catpv(sv, "{loc}");
4691 if (flags & ANYOF_FOLD)
4692 sv_catpv(sv, "{i}");
4693 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4694 if (flags & ANYOF_INVERT)
4696 for (i = 0; i <= 256; i++) {
4697 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4698 if (rangestart == -1)
4700 } else if (rangestart != -1) {
4701 if (i <= rangestart + 3)
4702 for (; rangestart < i; rangestart++)
4703 put_byte(sv, rangestart);
4705 put_byte(sv, rangestart);
4707 put_byte(sv, i - 1);
4713 if (o->flags & ANYOF_CLASS)
4714 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4715 if (ANYOF_CLASS_TEST(o,i))
4716 sv_catpv(sv, anyofs[i]);
4718 if (flags & ANYOF_UNICODE)
4719 sv_catpv(sv, "{unicode}");
4720 else if (flags & ANYOF_UNICODE_ALL)
4721 sv_catpv(sv, "{unicode_all}");
4725 SV *sw = regclass_swash(o, FALSE, &lv, 0);
4730 U8 s[UTF8_MAXLEN+1];
4732 for (i = 0; i <= 256; i++) { /* just the first 256 */
4733 U8 *e = uvchr_to_utf8(s, i);
4735 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4736 if (rangestart == -1)
4738 } else if (rangestart != -1) {
4741 if (i <= rangestart + 3)
4742 for (; rangestart < i; rangestart++) {
4743 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4747 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4750 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4757 sv_catpv(sv, "..."); /* et cetera */
4761 char *s = savepv(SvPVX(lv));
4764 while(*s && *s != '\n') s++;
4785 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4787 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4788 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4789 #endif /* DEBUGGING */
4793 Perl_re_intuit_string(pTHX_ regexp *prog)
4794 { /* Assume that RE_INTUIT is set */
4797 char *s = SvPV(prog->check_substr,n_a);
4799 if (!PL_colorset) reginitcolors();
4800 PerlIO_printf(Perl_debug_log,
4801 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4802 PL_colors[4],PL_colors[5],PL_colors[0],
4805 (strlen(s) > 60 ? "..." : ""));
4808 return prog->check_substr;
4812 Perl_pregfree(pTHX_ struct regexp *r)
4815 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4818 if (!r || (--r->refcnt > 0))
4821 char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60,
4823 int len = SvCUR(dsv);
4826 PerlIO_printf(Perl_debug_log,
4827 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4828 PL_colors[4],PL_colors[5],PL_colors[0],
4831 len > 60 ? "..." : "");
4835 Safefree(r->precomp);
4836 if (r->offsets) /* 20010421 MJD */
4837 Safefree(r->offsets);
4838 if (RX_MATCH_COPIED(r))
4839 Safefree(r->subbeg);
4841 if (r->anchored_substr)
4842 SvREFCNT_dec(r->anchored_substr);
4843 if (r->float_substr)
4844 SvREFCNT_dec(r->float_substr);
4845 Safefree(r->substrs);
4848 int n = r->data->count;
4849 AV* new_comppad = NULL;
4854 /* If you add a ->what type here, update the comment in regcomp.h */
4855 switch (r->data->what[n]) {
4857 SvREFCNT_dec((SV*)r->data->data[n]);
4860 Safefree(r->data->data[n]);
4863 new_comppad = (AV*)r->data->data[n];
4866 if (new_comppad == NULL)
4867 Perl_croak(aTHX_ "panic: pregfree comppad");
4868 old_comppad = PL_comppad;
4869 old_curpad = PL_curpad;
4870 /* Watch out for global destruction's random ordering. */
4871 if (SvTYPE(new_comppad) == SVt_PVAV) {
4872 PL_comppad = new_comppad;
4873 PL_curpad = AvARRAY(new_comppad);
4878 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4879 op_free((OP_4tree*)r->data->data[n]);
4882 PL_comppad = old_comppad;
4883 PL_curpad = old_curpad;
4884 SvREFCNT_dec((SV*)new_comppad);
4890 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4893 Safefree(r->data->what);
4896 Safefree(r->startp);
4902 - regnext - dig the "next" pointer out of a node
4904 * [Note, when REGALIGN is defined there are two places in regmatch()
4905 * that bypass this code for speed.]
4908 Perl_regnext(pTHX_ register regnode *p)
4910 register I32 offset;
4912 if (p == &PL_regdummy)
4915 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4923 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
4926 STRLEN l1 = strlen(pat1);
4927 STRLEN l2 = strlen(pat2);
4936 Copy(pat1, buf, l1 , char);
4937 Copy(pat2, buf + l1, l2 , char);
4938 buf[l1 + l2] = '\n';
4939 buf[l1 + l2 + 1] = '\0';
4941 /* ANSI variant takes additional second argument */
4942 va_start(args, pat2);
4946 msv = vmess(buf, &args);
4948 message = SvPV(msv,l1);
4951 Copy(message, buf, l1 , char);
4952 buf[l1] = '\0'; /* Overwrite \n */
4953 Perl_croak(aTHX_ "%s", buf);
4956 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4959 Perl_save_re_context(pTHX)
4962 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4963 SAVEI32(RExC_npar); /* () count. */
4964 SAVEI32(RExC_size); /* Code size. */
4965 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4966 SAVEVPTR(RExC_rx); /* from regcomp.c */
4967 SAVEI32(RExC_seen); /* from regcomp.c */
4968 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4969 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4970 SAVEVPTR(RExC_emit); /* Code-emit pointer; ®dummy = don't */
4971 SAVEPPTR(RExC_end); /* End of input for compile */
4972 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4975 SAVEI32(PL_reg_flags); /* from regexec.c */
4977 SAVEPPTR(PL_reginput); /* String-input pointer. */
4978 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4979 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
4980 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4981 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4982 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
4983 SAVEPPTR(PL_regtill); /* How far we are required to go. */
4984 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
4985 PL_reg_start_tmp = 0;
4986 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4987 PL_reg_start_tmpl = 0;
4988 SAVEVPTR(PL_regdata);
4989 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4990 SAVEI32(PL_regnarrate); /* from regexec.c */
4991 SAVEVPTR(PL_regprogram); /* from regexec.c */
4992 SAVEINT(PL_regindent); /* from regexec.c */
4993 SAVEVPTR(PL_regcc); /* from regexec.c */
4994 SAVEVPTR(PL_curcop);
4995 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4996 SAVEVPTR(PL_reg_re); /* from regexec.c */
4997 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4998 SAVESPTR(PL_reg_sv); /* from regexec.c */
4999 SAVEI8(PL_reg_match_utf8); /* from regexec.c */
5000 SAVEVPTR(PL_reg_magic); /* from regexec.c */
5001 SAVEI32(PL_reg_oldpos); /* from regexec.c */
5002 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5003 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5004 SAVEI32(PL_regnpar); /* () count. */
5005 SAVEI32(PL_regsize); /* from regexec.c */
5007 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
5012 clear_re(pTHX_ void *r)
5014 ReREFCNT_dec((regexp *)r);