5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_pregcomp my_regcomp
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_pregfree my_regfree
39 # define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 # define Perl_regnext my_regnext
42 # define Perl_save_re_context my_save_re_context
43 # define Perl_reginitcolors my_reginitcolors
45 # define PERL_NO_GET_CONTEXT
50 * pregcomp and pregexec -- regsub and regerror are not used in perl
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (c) 1991-2002, Larry Wall
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
83 #define PERL_IN_REGCOMP_C
86 #ifndef PERL_IN_XSUB_RE
98 # if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
107 #define STATIC static
110 typedef struct RExC_state_t {
111 U16 flags16; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
114 char *start; /* Start of input for compile */
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
118 regnode *emit_start; /* Start of emitted-code area */
119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
130 char *starttry; /* -Dr: where regtry was called. */
131 #define RExC_starttry (pRExC_state->starttry)
135 #define RExC_flags16 (pRExC_state->flags16)
136 #define RExC_precomp (pRExC_state->precomp)
137 #define RExC_rx (pRExC_state->rx)
138 #define RExC_start (pRExC_state->start)
139 #define RExC_end (pRExC_state->end)
140 #define RExC_parse (pRExC_state->parse)
141 #define RExC_whilem_seen (pRExC_state->whilem_seen)
142 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty (pRExC_state->naughty)
146 #define RExC_sawback (pRExC_state->sawback)
147 #define RExC_seen (pRExC_state->seen)
148 #define RExC_size (pRExC_state->size)
149 #define RExC_npar (pRExC_state->npar)
150 #define RExC_extralen (pRExC_state->extralen)
151 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8 (pRExC_state->utf8)
155 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
160 #undef SPSTART /* dratted cpp namespace... */
163 * Flags to be passed up and down.
165 #define WORST 0 /* Worst case. */
166 #define HASWIDTH 0x1 /* Known to match non-null strings. */
167 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART 0x4 /* Starts with * or +. */
169 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
171 /* Length of a variant. */
173 typedef struct scan_data_t {
179 I32 last_end; /* min value, <0 unless valid. */
182 SV **longest; /* Either &l_fixed, or &l_float. */
186 I32 offset_float_min;
187 I32 offset_float_max;
191 struct regnode_charclass_class *start_class;
195 * Forward declarations for pregcomp()'s friends.
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL 0x1
203 #define SF_BEFORE_MEOL 0x2
204 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
208 # define SF_FIX_SHIFT_EOL (0+2)
209 # define SF_FL_SHIFT_EOL (0+4)
211 # define SF_FIX_SHIFT_EOL (+2)
212 # define SF_FL_SHIFT_EOL (+4)
215 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
218 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF 0x40
221 #define SF_HAS_PAR 0x80
222 #define SF_IN_PAR 0x100
223 #define SF_HAS_EVAL 0x200
224 #define SCF_DO_SUBSTR 0x400
225 #define SCF_DO_STCLASS_AND 0x0800
226 #define SCF_DO_STCLASS_OR 0x1000
227 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS 0x2000
230 #define UTF RExC_utf8
231 #define LOC (RExC_flags16 & PMf_LOCALE)
232 #define FOLD (RExC_flags16 & PMf_FOLD)
234 #define OOB_UNICODE 12345678
235 #define OOB_NAMEDCLASS -1
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
249 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
261 char *ellipses = ""; \
262 IV len = RExC_end - RExC_precomp; \
265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
273 msg, (int)len, RExC_precomp, ellipses); \
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
281 #define FAIL2(pat,msg) \
283 char *ellipses = ""; \
284 IV len = RExC_end - RExC_precomp; \
287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
295 msg, (int)len, RExC_precomp, ellipses); \
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
302 #define Simple_vFAIL(m) \
304 IV offset = RExC_parse - RExC_precomp; \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
321 * Like Simple_vFAIL(), but accepts two arguments.
323 #define Simple_vFAIL2(m,a1) \
325 IV offset = RExC_parse - RExC_precomp; \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
328 (int)offset, RExC_precomp, RExC_precomp + offset); \
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
334 #define vFAIL2(m,a1) \
337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
338 Simple_vFAIL2(m, a1); \
343 * Like Simple_vFAIL(), but accepts three arguments.
345 #define Simple_vFAIL3(m, a1, a2) \
347 IV offset = RExC_parse - RExC_precomp; \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
350 (int)offset, RExC_precomp, RExC_precomp + offset); \
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
356 #define vFAIL3(m,a1,a2) \
359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
360 Simple_vFAIL3(m, a1, a2); \
364 * Like Simple_vFAIL(), but accepts four arguments.
366 #define Simple_vFAIL4(m, a1, a2, a3) \
368 IV offset = RExC_parse - RExC_precomp; \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
371 (int)offset, RExC_precomp, RExC_precomp + offset); \
375 * Like Simple_vFAIL(), but accepts five arguments.
377 #define Simple_vFAIL5(m, a1, a2, a3, a4) \
379 IV offset = RExC_parse - RExC_precomp; \
380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
381 (int)offset, RExC_precomp, RExC_precomp + offset); \
385 #define vWARN(loc,m) \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\
389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
392 #define vWARNdep(loc,m) \
394 IV offset = loc - RExC_precomp; \
395 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\
396 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
400 #define vWARN2(loc, m, a1) \
402 IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
405 (int)offset, RExC_precomp, RExC_precomp + offset); \
408 #define vWARN3(loc, m, a1, a2) \
410 IV offset = loc - RExC_precomp; \
411 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
413 (int)offset, RExC_precomp, RExC_precomp + offset); \
416 #define vWARN4(loc, m, a1, a2, a3) \
418 IV offset = loc - RExC_precomp; \
419 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
421 (int)offset, RExC_precomp, RExC_precomp + offset); \
424 /* used for the parse_flags section for (?c) -- japhy */
425 #define vWARN5(loc, m, a1, a2, a3, a4) \
427 IV offset = loc - RExC_precomp; \
428 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
430 (int)offset, RExC_precomp, RExC_precomp + offset); \
434 /* Allow for side effects in s */
435 #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
437 /* Macros for recording node offsets. 20001227 mjd@plover.com
438 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
439 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
440 * Element 0 holds the number n.
443 #define MJD_OFFSET_DEBUG(x)
444 /* #define MJD_OFFSET_DEBUG(x) fprintf x */
447 # define Set_Node_Offset_To_R(node,byte) \
451 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
453 RExC_offsets[2*(node)-1] = (byte); \
458 # define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
459 # define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
461 # define Set_Node_Length_To_R(node,len) \
464 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
466 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
468 RExC_offsets[2*(node)] = (len); \
473 # define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
474 # define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
475 # define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
477 /* Get offsets and lengths */
478 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
479 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
481 static void clear_re(pTHX_ void *r);
483 /* Mark that we cannot extend a found fixed substring at this point.
484 Updata the longest found anchored substring and the longest found
485 floating substrings if needed. */
488 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
490 STRLEN l = CHR_SVLEN(data->last_found);
491 STRLEN old_l = CHR_SVLEN(*data->longest);
493 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
494 sv_setsv(*data->longest, data->last_found);
495 if (*data->longest == data->longest_fixed) {
496 data->offset_fixed = l ? data->last_start_min : data->pos_min;
497 if (data->flags & SF_BEFORE_EOL)
499 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
501 data->flags &= ~SF_FIX_BEFORE_EOL;
504 data->offset_float_min = l ? data->last_start_min : data->pos_min;
505 data->offset_float_max = (l
506 ? data->last_start_max
507 : data->pos_min + data->pos_delta);
508 if (data->flags & SF_BEFORE_EOL)
510 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
512 data->flags &= ~SF_FL_BEFORE_EOL;
515 SvCUR_set(data->last_found, 0);
517 data->flags &= ~SF_BEFORE_EOL;
520 /* Can match anything (initialization) */
522 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 ANYOF_CLASS_ZERO(cl);
525 ANYOF_BITMAP_SETALL(cl);
526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
528 cl->flags |= ANYOF_LOCALE;
531 /* Can match anything (initialization) */
533 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
537 for (value = 0; value <= ANYOF_MAX; value += 2)
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
547 /* Can match anything (initialization) */
549 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
551 Zero(cl, 1, struct regnode_charclass_class);
553 cl_anything(pRExC_state, cl);
557 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
559 Zero(cl, 1, struct regnode_charclass_class);
561 cl_anything(pRExC_state, cl);
563 cl->flags |= ANYOF_LOCALE;
566 /* 'And' a given class with another one. Can create false positives */
567 /* We assume that cl is not inverted */
569 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
590 cl->flags &= ~ANYOF_UNICODE_ALL;
591 cl->flags |= ANYOF_UNICODE;
592 ARG_SET(cl, ARG(and_with));
594 if (!(and_with->flags & ANYOF_UNICODE_ALL))
595 cl->flags &= ~ANYOF_UNICODE_ALL;
596 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
597 cl->flags &= ~ANYOF_UNICODE;
600 /* 'OR' a given class with another one. Can create false positives */
601 /* We assume that cl is not inverted */
603 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
605 if (or_with->flags & ANYOF_INVERT) {
607 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
608 * <= (B1 | !B2) | (CL1 | !CL2)
609 * which is wasteful if CL2 is small, but we ignore CL2:
610 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
611 * XXXX Can we handle case-fold? Unclear:
612 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
613 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
615 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
616 && !(or_with->flags & ANYOF_FOLD)
617 && !(cl->flags & ANYOF_FOLD) ) {
620 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
621 cl->bitmap[i] |= ~or_with->bitmap[i];
622 } /* XXXX: logic is complicated otherwise */
624 cl_anything(pRExC_state, cl);
627 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
628 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
629 && (!(or_with->flags & ANYOF_FOLD)
630 || (cl->flags & ANYOF_FOLD)) ) {
633 /* OR char bitmap and class bitmap separately */
634 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
635 cl->bitmap[i] |= or_with->bitmap[i];
636 if (or_with->flags & ANYOF_CLASS) {
637 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
638 cl->classflags[i] |= or_with->classflags[i];
639 cl->flags |= ANYOF_CLASS;
642 else { /* XXXX: logic is complicated, leave it along for a moment. */
643 cl_anything(pRExC_state, cl);
646 if (or_with->flags & ANYOF_EOS)
647 cl->flags |= ANYOF_EOS;
649 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
650 ARG(cl) != ARG(or_with)) {
651 cl->flags |= ANYOF_UNICODE_ALL;
652 cl->flags &= ~ANYOF_UNICODE;
654 if (or_with->flags & ANYOF_UNICODE_ALL) {
655 cl->flags |= ANYOF_UNICODE_ALL;
656 cl->flags &= ~ANYOF_UNICODE;
661 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
662 * These need to be revisited when a newer toolchain becomes available.
664 #if defined(__sparc64__) && defined(__GNUC__)
665 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
666 # undef SPARC64_GCC_WORKAROUND
667 # define SPARC64_GCC_WORKAROUND 1
671 /* REx optimizer. Converts nodes into quickier variants "in place".
672 Finds fixed substrings. */
674 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
675 to the position after last scanned or to NULL. */
678 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
679 /* scanp: Start here (read-write). */
680 /* deltap: Write maxlen-minlen here. */
681 /* last: Stop before this one. */
683 I32 min = 0, pars = 0, code;
684 regnode *scan = *scanp, *next;
686 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
687 int is_inf_internal = 0; /* The studied chunk is infinite */
688 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
689 scan_data_t data_fake;
690 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
692 while (scan && OP(scan) != END && scan < last) {
693 /* Peephole optimizer: */
695 if (PL_regkind[(U8)OP(scan)] == EXACT) {
696 /* Merge several consecutive EXACTish nodes into one. */
697 regnode *n = regnext(scan);
700 regnode *stop = scan;
703 next = scan + NODE_SZ_STR(scan);
704 /* Skip NOTHING, merge EXACT*. */
706 ( PL_regkind[(U8)OP(n)] == NOTHING ||
707 (stringok && (OP(n) == OP(scan))))
709 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
710 if (OP(n) == TAIL || n > next)
712 if (PL_regkind[(U8)OP(n)] == NOTHING) {
713 NEXT_OFF(scan) += NEXT_OFF(n);
714 next = n + NODE_STEP_REGNODE;
722 int oldl = STR_LEN(scan);
723 regnode *nnext = regnext(n);
725 if (oldl + STR_LEN(n) > U8_MAX)
727 NEXT_OFF(scan) += NEXT_OFF(n);
728 STR_LEN(scan) += STR_LEN(n);
729 next = n + NODE_SZ_STR(n);
730 /* Now we can overwrite *n : */
731 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
739 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
741 Two problematic code points in Unicode casefolding of EXACT nodes:
743 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
744 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
750 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
751 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
753 This means that in case-insensitive matching (or "loose matching",
754 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
755 length of the above casefolded versions) can match a target string
756 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
757 This would rather mess up the minimum length computation.
759 What we'll do is to look for the tail four bytes, and then peek
760 at the preceding two bytes to see whether we need to decrease
761 the minimum length by four (six minus two).
763 Thanks to the design of UTF-8, there cannot be false matches:
764 A sequence of valid UTF-8 bytes cannot be a subsequence of
765 another valid sequence of UTF-8 bytes.
768 char *s0 = STRING(scan), *s, *t;
769 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
770 char *t0 = "\xcc\x88\xcc\x81";
774 s < s2 && (t = ninstr(s, s1, t0, t1));
776 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
777 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
784 n = scan + NODE_SZ_STR(scan);
786 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
794 /* Follow the next-chain of the current node and optimize
795 away all the NOTHINGs from it. */
796 if (OP(scan) != CURLYX) {
797 int max = (reg_off_by_arg[OP(scan)]
799 /* I32 may be smaller than U16 on CRAYs! */
800 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
801 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
805 /* Skip NOTHING and LONGJMP. */
806 while ((n = regnext(n))
807 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
808 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
811 if (reg_off_by_arg[OP(scan)])
814 NEXT_OFF(scan) = off;
816 /* The principal pseudo-switch. Cannot be a switch, since we
817 look into several different things. */
818 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
819 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
820 next = regnext(scan);
823 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
824 I32 max1 = 0, min1 = I32_MAX, num = 0;
825 struct regnode_charclass_class accum;
827 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
828 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
829 if (flags & SCF_DO_STCLASS)
830 cl_init_zero(pRExC_state, &accum);
831 while (OP(scan) == code) {
832 I32 deltanext, minnext, f = 0, fake;
833 struct regnode_charclass_class this_class;
838 data_fake.whilem_c = data->whilem_c;
839 data_fake.last_closep = data->last_closep;
842 data_fake.last_closep = &fake;
843 next = regnext(scan);
844 scan = NEXTOPER(scan);
846 scan = NEXTOPER(scan);
847 if (flags & SCF_DO_STCLASS) {
848 cl_init(pRExC_state, &this_class);
849 data_fake.start_class = &this_class;
850 f = SCF_DO_STCLASS_AND;
852 if (flags & SCF_WHILEM_VISITED_POS)
853 f |= SCF_WHILEM_VISITED_POS;
854 /* we suppose the run is continuous, last=next...*/
855 minnext = study_chunk(pRExC_state, &scan, &deltanext,
856 next, &data_fake, f);
859 if (max1 < minnext + deltanext)
860 max1 = minnext + deltanext;
861 if (deltanext == I32_MAX)
862 is_inf = is_inf_internal = 1;
864 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
866 if (data && (data_fake.flags & SF_HAS_EVAL))
867 data->flags |= SF_HAS_EVAL;
869 data->whilem_c = data_fake.whilem_c;
870 if (flags & SCF_DO_STCLASS)
871 cl_or(pRExC_state, &accum, &this_class);
875 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
877 if (flags & SCF_DO_SUBSTR) {
878 data->pos_min += min1;
879 data->pos_delta += max1 - min1;
880 if (max1 != min1 || is_inf)
881 data->longest = &(data->longest_float);
884 delta += max1 - min1;
885 if (flags & SCF_DO_STCLASS_OR) {
886 cl_or(pRExC_state, data->start_class, &accum);
888 cl_and(data->start_class, &and_with);
889 flags &= ~SCF_DO_STCLASS;
892 else if (flags & SCF_DO_STCLASS_AND) {
894 cl_and(data->start_class, &accum);
895 flags &= ~SCF_DO_STCLASS;
898 /* Switch to OR mode: cache the old value of
899 * data->start_class */
900 StructCopy(data->start_class, &and_with,
901 struct regnode_charclass_class);
902 flags &= ~SCF_DO_STCLASS_AND;
903 StructCopy(&accum, data->start_class,
904 struct regnode_charclass_class);
905 flags |= SCF_DO_STCLASS_OR;
906 data->start_class->flags |= ANYOF_EOS;
910 else if (code == BRANCHJ) /* single branch is optimized. */
911 scan = NEXTOPER(NEXTOPER(scan));
912 else /* single branch is optimized. */
913 scan = NEXTOPER(scan);
916 else if (OP(scan) == EXACT) {
917 I32 l = STR_LEN(scan);
918 UV uc = *((U8*)STRING(scan));
920 U8 *s = (U8*)STRING(scan);
921 l = utf8_length(s, s + l);
922 uc = utf8_to_uvchr(s, NULL);
925 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
926 /* The code below prefers earlier match for fixed
927 offset, later match for variable offset. */
928 if (data->last_end == -1) { /* Update the start info. */
929 data->last_start_min = data->pos_min;
930 data->last_start_max = is_inf
931 ? I32_MAX : data->pos_min + data->pos_delta;
933 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
934 data->last_end = data->pos_min + l;
935 data->pos_min += l; /* As in the first entry. */
936 data->flags &= ~SF_BEFORE_EOL;
938 if (flags & SCF_DO_STCLASS_AND) {
939 /* Check whether it is compatible with what we know already! */
943 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
944 && !ANYOF_BITMAP_TEST(data->start_class, uc)
945 && (!(data->start_class->flags & ANYOF_FOLD)
946 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
949 ANYOF_CLASS_ZERO(data->start_class);
950 ANYOF_BITMAP_ZERO(data->start_class);
952 ANYOF_BITMAP_SET(data->start_class, uc);
953 data->start_class->flags &= ~ANYOF_EOS;
955 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
957 else if (flags & SCF_DO_STCLASS_OR) {
958 /* false positive possible if the class is case-folded */
960 ANYOF_BITMAP_SET(data->start_class, uc);
962 data->start_class->flags |= ANYOF_UNICODE_ALL;
963 data->start_class->flags &= ~ANYOF_EOS;
964 cl_and(data->start_class, &and_with);
966 flags &= ~SCF_DO_STCLASS;
968 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
969 I32 l = STR_LEN(scan);
970 UV uc = *((U8*)STRING(scan));
972 /* Search for fixed substrings supports EXACT only. */
973 if (flags & SCF_DO_SUBSTR)
974 scan_commit(pRExC_state, data);
976 U8 *s = (U8 *)STRING(scan);
977 l = utf8_length(s, s + l);
978 uc = utf8_to_uvchr(s, NULL);
981 if (data && (flags & SCF_DO_SUBSTR))
983 if (flags & SCF_DO_STCLASS_AND) {
984 /* Check whether it is compatible with what we know already! */
988 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
989 && !ANYOF_BITMAP_TEST(data->start_class, uc)
990 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
992 ANYOF_CLASS_ZERO(data->start_class);
993 ANYOF_BITMAP_ZERO(data->start_class);
995 ANYOF_BITMAP_SET(data->start_class, uc);
996 data->start_class->flags &= ~ANYOF_EOS;
997 data->start_class->flags |= ANYOF_FOLD;
998 if (OP(scan) == EXACTFL)
999 data->start_class->flags |= ANYOF_LOCALE;
1002 else if (flags & SCF_DO_STCLASS_OR) {
1003 if (data->start_class->flags & ANYOF_FOLD) {
1004 /* false positive possible if the class is case-folded.
1005 Assume that the locale settings are the same... */
1007 ANYOF_BITMAP_SET(data->start_class, uc);
1008 data->start_class->flags &= ~ANYOF_EOS;
1010 cl_and(data->start_class, &and_with);
1012 flags &= ~SCF_DO_STCLASS;
1014 else if (strchr((char*)PL_varies,OP(scan))) {
1015 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1016 I32 f = flags, pos_before = 0;
1017 regnode *oscan = scan;
1018 struct regnode_charclass_class this_class;
1019 struct regnode_charclass_class *oclass = NULL;
1020 I32 next_is_eval = 0;
1022 switch (PL_regkind[(U8)OP(scan)]) {
1023 case WHILEM: /* End of (?:...)* . */
1024 scan = NEXTOPER(scan);
1027 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1028 next = NEXTOPER(scan);
1029 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1031 maxcount = REG_INFTY;
1032 next = regnext(scan);
1033 scan = NEXTOPER(scan);
1037 if (flags & SCF_DO_SUBSTR)
1042 if (flags & SCF_DO_STCLASS) {
1044 maxcount = REG_INFTY;
1045 next = regnext(scan);
1046 scan = NEXTOPER(scan);
1049 is_inf = is_inf_internal = 1;
1050 scan = regnext(scan);
1051 if (flags & SCF_DO_SUBSTR) {
1052 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1053 data->longest = &(data->longest_float);
1055 goto optimize_curly_tail;
1057 mincount = ARG1(scan);
1058 maxcount = ARG2(scan);
1059 next = regnext(scan);
1060 if (OP(scan) == CURLYX) {
1061 I32 lp = (data ? *(data->last_closep) : 0);
1063 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1065 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1066 next_is_eval = (OP(scan) == EVAL);
1068 if (flags & SCF_DO_SUBSTR) {
1069 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1070 pos_before = data->pos_min;
1074 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1076 data->flags |= SF_IS_INF;
1078 if (flags & SCF_DO_STCLASS) {
1079 cl_init(pRExC_state, &this_class);
1080 oclass = data->start_class;
1081 data->start_class = &this_class;
1082 f |= SCF_DO_STCLASS_AND;
1083 f &= ~SCF_DO_STCLASS_OR;
1085 /* These are the cases when once a subexpression
1086 fails at a particular position, it cannot succeed
1087 even after backtracking at the enclosing scope.
1089 XXXX what if minimal match and we are at the
1090 initial run of {n,m}? */
1091 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1092 f &= ~SCF_WHILEM_VISITED_POS;
1094 /* This will finish on WHILEM, setting scan, or on NULL: */
1095 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1097 ? (f & ~SCF_DO_SUBSTR) : f);
1099 if (flags & SCF_DO_STCLASS)
1100 data->start_class = oclass;
1101 if (mincount == 0 || minnext == 0) {
1102 if (flags & SCF_DO_STCLASS_OR) {
1103 cl_or(pRExC_state, data->start_class, &this_class);
1105 else if (flags & SCF_DO_STCLASS_AND) {
1106 /* Switch to OR mode: cache the old value of
1107 * data->start_class */
1108 StructCopy(data->start_class, &and_with,
1109 struct regnode_charclass_class);
1110 flags &= ~SCF_DO_STCLASS_AND;
1111 StructCopy(&this_class, data->start_class,
1112 struct regnode_charclass_class);
1113 flags |= SCF_DO_STCLASS_OR;
1114 data->start_class->flags |= ANYOF_EOS;
1116 } else { /* Non-zero len */
1117 if (flags & SCF_DO_STCLASS_OR) {
1118 cl_or(pRExC_state, data->start_class, &this_class);
1119 cl_and(data->start_class, &and_with);
1121 else if (flags & SCF_DO_STCLASS_AND)
1122 cl_and(data->start_class, &this_class);
1123 flags &= ~SCF_DO_STCLASS;
1125 if (!scan) /* It was not CURLYX, but CURLY. */
1127 if (ckWARN(WARN_REGEXP)
1128 /* ? quantifier ok, except for (?{ ... }) */
1129 && (next_is_eval || !(mincount == 0 && maxcount == 1))
1130 && (minnext == 0) && (deltanext == 0)
1131 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1132 && maxcount <= REG_INFTY/3) /* Complement check for big count */
1135 "Quantifier unexpected on zero-length expression");
1138 min += minnext * mincount;
1139 is_inf_internal |= ((maxcount == REG_INFTY
1140 && (minnext + deltanext) > 0)
1141 || deltanext == I32_MAX);
1142 is_inf |= is_inf_internal;
1143 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1145 /* Try powerful optimization CURLYX => CURLYN. */
1146 if ( OP(oscan) == CURLYX && data
1147 && data->flags & SF_IN_PAR
1148 && !(data->flags & SF_HAS_EVAL)
1149 && !deltanext && minnext == 1 ) {
1150 /* Try to optimize to CURLYN. */
1151 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1152 regnode *nxt1 = nxt;
1159 if (!strchr((char*)PL_simple,OP(nxt))
1160 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1161 && STR_LEN(nxt) == 1))
1167 if (OP(nxt) != CLOSE)
1169 /* Now we know that nxt2 is the only contents: */
1170 oscan->flags = ARG(nxt);
1172 OP(nxt1) = NOTHING; /* was OPEN. */
1174 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1175 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1176 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1177 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1178 OP(nxt + 1) = OPTIMIZED; /* was count. */
1179 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1184 /* Try optimization CURLYX => CURLYM. */
1185 if ( OP(oscan) == CURLYX && data
1186 && !(data->flags & SF_HAS_PAR)
1187 && !(data->flags & SF_HAS_EVAL)
1189 /* XXXX How to optimize if data == 0? */
1190 /* Optimize to a simpler form. */
1191 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1195 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1196 && (OP(nxt2) != WHILEM))
1198 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1199 /* Need to optimize away parenths. */
1200 if (data->flags & SF_IN_PAR) {
1201 /* Set the parenth number. */
1202 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1204 if (OP(nxt) != CLOSE)
1205 FAIL("Panic opt close");
1206 oscan->flags = ARG(nxt);
1207 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1208 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1210 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1211 OP(nxt + 1) = OPTIMIZED; /* was count. */
1212 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1213 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1216 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1217 regnode *nnxt = regnext(nxt1);
1220 if (reg_off_by_arg[OP(nxt1)])
1221 ARG_SET(nxt1, nxt2 - nxt1);
1222 else if (nxt2 - nxt1 < U16_MAX)
1223 NEXT_OFF(nxt1) = nxt2 - nxt1;
1225 OP(nxt) = NOTHING; /* Cannot beautify */
1230 /* Optimize again: */
1231 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1237 else if ((OP(oscan) == CURLYX)
1238 && (flags & SCF_WHILEM_VISITED_POS)
1239 /* See the comment on a similar expression above.
1240 However, this time it not a subexpression
1241 we care about, but the expression itself. */
1242 && (maxcount == REG_INFTY)
1243 && data && ++data->whilem_c < 16) {
1244 /* This stays as CURLYX, we can put the count/of pair. */
1245 /* Find WHILEM (as in regexec.c) */
1246 regnode *nxt = oscan + NEXT_OFF(oscan);
1248 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1250 PREVOPER(nxt)->flags = data->whilem_c
1251 | (RExC_whilem_seen << 4); /* On WHILEM */
1253 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1255 if (flags & SCF_DO_SUBSTR) {
1256 SV *last_str = Nullsv;
1257 int counted = mincount != 0;
1259 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1260 #if defined(SPARC64_GCC_WORKAROUND)
1266 if (pos_before >= data->last_start_min)
1269 b = data->last_start_min;
1272 s = SvPV(data->last_found, l);
1273 old = b - data->last_start_min;
1276 I32 b = pos_before >= data->last_start_min
1277 ? pos_before : data->last_start_min;
1279 char *s = SvPV(data->last_found, l);
1280 I32 old = b - data->last_start_min;
1284 old = utf8_hop((U8*)s, old) - (U8*)s;
1287 /* Get the added string: */
1288 last_str = newSVpvn(s + old, l);
1289 if (deltanext == 0 && pos_before == b) {
1290 /* What was added is a constant string */
1292 SvGROW(last_str, (mincount * l) + 1);
1293 repeatcpy(SvPVX(last_str) + l,
1294 SvPVX(last_str), l, mincount - 1);
1295 SvCUR(last_str) *= mincount;
1296 /* Add additional parts. */
1297 SvCUR_set(data->last_found,
1298 SvCUR(data->last_found) - l);
1299 sv_catsv(data->last_found, last_str);
1300 data->last_end += l * (mincount - 1);
1303 /* start offset must point into the last copy */
1304 data->last_start_min += minnext * (mincount - 1);
1305 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1306 * (minnext + data->pos_delta);
1309 /* It is counted once already... */
1310 data->pos_min += minnext * (mincount - counted);
1311 data->pos_delta += - counted * deltanext +
1312 (minnext + deltanext) * maxcount - minnext * mincount;
1313 if (mincount != maxcount) {
1314 /* Cannot extend fixed substrings found inside
1316 scan_commit(pRExC_state,data);
1317 if (mincount && last_str) {
1318 sv_setsv(data->last_found, last_str);
1319 data->last_end = data->pos_min;
1320 data->last_start_min =
1321 data->pos_min - CHR_SVLEN(last_str);
1322 data->last_start_max = is_inf
1324 : data->pos_min + data->pos_delta
1325 - CHR_SVLEN(last_str);
1327 data->longest = &(data->longest_float);
1329 SvREFCNT_dec(last_str);
1331 if (data && (fl & SF_HAS_EVAL))
1332 data->flags |= SF_HAS_EVAL;
1333 optimize_curly_tail:
1334 if (OP(oscan) != CURLYX) {
1335 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1337 NEXT_OFF(oscan) += NEXT_OFF(next);
1340 default: /* REF and CLUMP only? */
1341 if (flags & SCF_DO_SUBSTR) {
1342 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1343 data->longest = &(data->longest_float);
1345 is_inf = is_inf_internal = 1;
1346 if (flags & SCF_DO_STCLASS_OR)
1347 cl_anything(pRExC_state, data->start_class);
1348 flags &= ~SCF_DO_STCLASS;
1352 else if (strchr((char*)PL_simple,OP(scan))) {
1355 if (flags & SCF_DO_SUBSTR) {
1356 scan_commit(pRExC_state,data);
1360 if (flags & SCF_DO_STCLASS) {
1361 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1363 /* Some of the logic below assumes that switching
1364 locale on will only add false positives. */
1365 switch (PL_regkind[(U8)OP(scan)]) {
1369 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1370 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1371 cl_anything(pRExC_state, data->start_class);
1374 if (OP(scan) == SANY)
1376 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1377 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1378 || (data->start_class->flags & ANYOF_CLASS));
1379 cl_anything(pRExC_state, data->start_class);
1381 if (flags & SCF_DO_STCLASS_AND || !value)
1382 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1385 if (flags & SCF_DO_STCLASS_AND)
1386 cl_and(data->start_class,
1387 (struct regnode_charclass_class*)scan);
1389 cl_or(pRExC_state, data->start_class,
1390 (struct regnode_charclass_class*)scan);
1393 if (flags & SCF_DO_STCLASS_AND) {
1394 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1395 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1396 for (value = 0; value < 256; value++)
1397 if (!isALNUM(value))
1398 ANYOF_BITMAP_CLEAR(data->start_class, value);
1402 if (data->start_class->flags & ANYOF_LOCALE)
1403 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1405 for (value = 0; value < 256; value++)
1407 ANYOF_BITMAP_SET(data->start_class, value);
1412 if (flags & SCF_DO_STCLASS_AND) {
1413 if (data->start_class->flags & ANYOF_LOCALE)
1414 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1417 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1418 data->start_class->flags |= ANYOF_LOCALE;
1422 if (flags & SCF_DO_STCLASS_AND) {
1423 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1424 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1425 for (value = 0; value < 256; value++)
1427 ANYOF_BITMAP_CLEAR(data->start_class, value);
1431 if (data->start_class->flags & ANYOF_LOCALE)
1432 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1434 for (value = 0; value < 256; value++)
1435 if (!isALNUM(value))
1436 ANYOF_BITMAP_SET(data->start_class, value);
1441 if (flags & SCF_DO_STCLASS_AND) {
1442 if (data->start_class->flags & ANYOF_LOCALE)
1443 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1446 data->start_class->flags |= ANYOF_LOCALE;
1447 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1451 if (flags & SCF_DO_STCLASS_AND) {
1452 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1453 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1454 for (value = 0; value < 256; value++)
1455 if (!isSPACE(value))
1456 ANYOF_BITMAP_CLEAR(data->start_class, value);
1460 if (data->start_class->flags & ANYOF_LOCALE)
1461 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1463 for (value = 0; value < 256; value++)
1465 ANYOF_BITMAP_SET(data->start_class, value);
1470 if (flags & SCF_DO_STCLASS_AND) {
1471 if (data->start_class->flags & ANYOF_LOCALE)
1472 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1475 data->start_class->flags |= ANYOF_LOCALE;
1476 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1480 if (flags & SCF_DO_STCLASS_AND) {
1481 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1482 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1483 for (value = 0; value < 256; value++)
1485 ANYOF_BITMAP_CLEAR(data->start_class, value);
1489 if (data->start_class->flags & ANYOF_LOCALE)
1490 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1492 for (value = 0; value < 256; value++)
1493 if (!isSPACE(value))
1494 ANYOF_BITMAP_SET(data->start_class, value);
1499 if (flags & SCF_DO_STCLASS_AND) {
1500 if (data->start_class->flags & ANYOF_LOCALE) {
1501 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1502 for (value = 0; value < 256; value++)
1503 if (!isSPACE(value))
1504 ANYOF_BITMAP_CLEAR(data->start_class, value);
1508 data->start_class->flags |= ANYOF_LOCALE;
1509 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1513 if (flags & SCF_DO_STCLASS_AND) {
1514 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1515 for (value = 0; value < 256; value++)
1516 if (!isDIGIT(value))
1517 ANYOF_BITMAP_CLEAR(data->start_class, value);
1520 if (data->start_class->flags & ANYOF_LOCALE)
1521 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1523 for (value = 0; value < 256; value++)
1525 ANYOF_BITMAP_SET(data->start_class, value);
1530 if (flags & SCF_DO_STCLASS_AND) {
1531 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1532 for (value = 0; value < 256; value++)
1534 ANYOF_BITMAP_CLEAR(data->start_class, value);
1537 if (data->start_class->flags & ANYOF_LOCALE)
1538 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1540 for (value = 0; value < 256; value++)
1541 if (!isDIGIT(value))
1542 ANYOF_BITMAP_SET(data->start_class, value);
1547 if (flags & SCF_DO_STCLASS_OR)
1548 cl_and(data->start_class, &and_with);
1549 flags &= ~SCF_DO_STCLASS;
1552 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1553 data->flags |= (OP(scan) == MEOL
1557 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1558 /* Lookbehind, or need to calculate parens/evals/stclass: */
1559 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1560 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1561 /* Lookahead/lookbehind */
1562 I32 deltanext, minnext, fake = 0;
1564 struct regnode_charclass_class intrnl;
1567 data_fake.flags = 0;
1569 data_fake.whilem_c = data->whilem_c;
1570 data_fake.last_closep = data->last_closep;
1573 data_fake.last_closep = &fake;
1574 if ( flags & SCF_DO_STCLASS && !scan->flags
1575 && OP(scan) == IFMATCH ) { /* Lookahead */
1576 cl_init(pRExC_state, &intrnl);
1577 data_fake.start_class = &intrnl;
1578 f |= SCF_DO_STCLASS_AND;
1580 if (flags & SCF_WHILEM_VISITED_POS)
1581 f |= SCF_WHILEM_VISITED_POS;
1582 next = regnext(scan);
1583 nscan = NEXTOPER(NEXTOPER(scan));
1584 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1587 vFAIL("Variable length lookbehind not implemented");
1589 else if (minnext > U8_MAX) {
1590 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1592 scan->flags = minnext;
1594 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1596 if (data && (data_fake.flags & SF_HAS_EVAL))
1597 data->flags |= SF_HAS_EVAL;
1599 data->whilem_c = data_fake.whilem_c;
1600 if (f & SCF_DO_STCLASS_AND) {
1601 int was = (data->start_class->flags & ANYOF_EOS);
1603 cl_and(data->start_class, &intrnl);
1605 data->start_class->flags |= ANYOF_EOS;
1608 else if (OP(scan) == OPEN) {
1611 else if (OP(scan) == CLOSE) {
1612 if (ARG(scan) == is_par) {
1613 next = regnext(scan);
1615 if ( next && (OP(next) != WHILEM) && next < last)
1616 is_par = 0; /* Disable optimization */
1619 *(data->last_closep) = ARG(scan);
1621 else if (OP(scan) == EVAL) {
1623 data->flags |= SF_HAS_EVAL;
1625 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1626 if (flags & SCF_DO_SUBSTR) {
1627 scan_commit(pRExC_state,data);
1628 data->longest = &(data->longest_float);
1630 is_inf = is_inf_internal = 1;
1631 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1632 cl_anything(pRExC_state, data->start_class);
1633 flags &= ~SCF_DO_STCLASS;
1635 /* Else: zero-length, ignore. */
1636 scan = regnext(scan);
1641 *deltap = is_inf_internal ? I32_MAX : delta;
1642 if (flags & SCF_DO_SUBSTR && is_inf)
1643 data->pos_delta = I32_MAX - data->pos_min;
1644 if (is_par > U8_MAX)
1646 if (is_par && pars==1 && data) {
1647 data->flags |= SF_IN_PAR;
1648 data->flags &= ~SF_HAS_PAR;
1650 else if (pars && data) {
1651 data->flags |= SF_HAS_PAR;
1652 data->flags &= ~SF_IN_PAR;
1654 if (flags & SCF_DO_STCLASS_OR)
1655 cl_and(data->start_class, &and_with);
1660 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1662 if (RExC_rx->data) {
1663 Renewc(RExC_rx->data,
1664 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1665 char, struct reg_data);
1666 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1667 RExC_rx->data->count += n;
1670 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1671 char, struct reg_data);
1672 New(1208, RExC_rx->data->what, n, U8);
1673 RExC_rx->data->count = n;
1675 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1676 return RExC_rx->data->count - n;
1680 Perl_reginitcolors(pTHX)
1683 char *s = PerlEnv_getenv("PERL_RE_COLORS");
1686 PL_colors[0] = s = savepv(s);
1688 s = strchr(s, '\t');
1694 PL_colors[i] = s = "";
1698 PL_colors[i++] = "";
1705 - pregcomp - compile a regular expression into internal code
1707 * We can't allocate space until we know how big the compiled form will be,
1708 * but we can't compile it (and thus know how big it is) until we've got a
1709 * place to put the code. So we cheat: we compile it twice, once with code
1710 * generation turned off and size counting turned on, and once "for real".
1711 * This also means that we don't allocate space until we are sure that the
1712 * thing really will compile successfully, and we never have to move the
1713 * code and thus invalidate pointers into it. (Note that it has to be in
1714 * one piece because free() must be able to free it all.) [NB: not true in perl]
1716 * Beware that the optimization-preparation code in here knows about some
1717 * of the structure of the compiled regexp. [I'll say.]
1720 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1730 RExC_state_t RExC_state;
1731 RExC_state_t *pRExC_state = &RExC_state;
1734 FAIL("NULL regexp argument");
1736 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1740 if (!PL_colorset) reginitcolors();
1741 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1742 PL_colors[4],PL_colors[5],PL_colors[0],
1743 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1745 RExC_flags16 = pm->op_pmflags;
1749 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1750 RExC_seen_evals = 0;
1753 /* First pass: determine size, legality. */
1760 RExC_emit = &PL_regdummy;
1761 RExC_whilem_seen = 0;
1762 #if 0 /* REGC() is (currently) a NOP at the first pass.
1763 * Clever compilers notice this and complain. --jhi */
1764 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1766 if (reg(pRExC_state, 0, &flags) == NULL) {
1767 RExC_precomp = Nullch;
1770 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1772 /* Small enough for pointer-storage convention?
1773 If extralen==0, this means that we will not need long jumps. */
1774 if (RExC_size >= 0x10000L && RExC_extralen)
1775 RExC_size += RExC_extralen;
1778 if (RExC_whilem_seen > 15)
1779 RExC_whilem_seen = 15;
1781 /* Allocate space and initialize. */
1782 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1785 FAIL("Regexp out of space");
1788 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1789 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1792 r->prelen = xend - exp;
1793 r->precomp = savepvn(RExC_precomp, r->prelen);
1795 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1796 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1798 r->substrs = 0; /* Useful during FAIL. */
1799 r->startp = 0; /* Useful during FAIL. */
1800 r->endp = 0; /* Useful during FAIL. */
1802 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1804 r->offsets[0] = RExC_size;
1806 DEBUG_r(PerlIO_printf(Perl_debug_log,
1807 "%s %"UVuf" bytes for offset annotations.\n",
1808 r->offsets ? "Got" : "Couldn't get",
1809 (UV)((2*RExC_size+1) * sizeof(U32))));
1813 /* Second pass: emit code. */
1814 RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
1819 RExC_emit_start = r->program;
1820 RExC_emit = r->program;
1821 /* Store the count of eval-groups for security checks: */
1822 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1823 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1825 if (reg(pRExC_state, 0, &flags) == NULL)
1828 /* Dig out information for optimizations. */
1829 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1830 pm->op_pmflags = RExC_flags16;
1832 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1833 r->regstclass = NULL;
1834 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1835 r->reganch |= ROPT_NAUGHTY;
1836 scan = r->program + 1; /* First BRANCH. */
1838 /* XXXX To minimize changes to RE engine we always allocate
1839 3-units-long substrs field. */
1840 Newz(1004, r->substrs, 1, struct reg_substr_data);
1842 StructCopy(&zero_scan_data, &data, scan_data_t);
1843 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1844 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1846 STRLEN longest_float_length, longest_fixed_length;
1847 struct regnode_charclass_class ch_class;
1852 /* Skip introductions and multiplicators >= 1. */
1853 while ((OP(first) == OPEN && (sawopen = 1)) ||
1854 /* An OR of *one* alternative - should not happen now. */
1855 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1856 (OP(first) == PLUS) ||
1857 (OP(first) == MINMOD) ||
1858 /* An {n,m} with n>0 */
1859 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1860 if (OP(first) == PLUS)
1863 first += regarglen[(U8)OP(first)];
1864 first = NEXTOPER(first);
1867 /* Starting-point info. */
1869 if (PL_regkind[(U8)OP(first)] == EXACT) {
1870 if (OP(first) == EXACT)
1871 ; /* Empty, get anchored substr later. */
1872 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1873 r->regstclass = first;
1875 else if (strchr((char*)PL_simple,OP(first)))
1876 r->regstclass = first;
1877 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1878 PL_regkind[(U8)OP(first)] == NBOUND)
1879 r->regstclass = first;
1880 else if (PL_regkind[(U8)OP(first)] == BOL) {
1881 r->reganch |= (OP(first) == MBOL
1883 : (OP(first) == SBOL
1886 first = NEXTOPER(first);
1889 else if (OP(first) == GPOS) {
1890 r->reganch |= ROPT_ANCH_GPOS;
1891 first = NEXTOPER(first);
1894 else if (!sawopen && (OP(first) == STAR &&
1895 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1896 !(r->reganch & ROPT_ANCH) )
1898 /* turn .* into ^.* with an implied $*=1 */
1899 int type = OP(NEXTOPER(first));
1901 if (type == REG_ANY)
1902 type = ROPT_ANCH_MBOL;
1904 type = ROPT_ANCH_SBOL;
1906 r->reganch |= type | ROPT_IMPLICIT;
1907 first = NEXTOPER(first);
1910 if (sawplus && (!sawopen || !RExC_sawback)
1911 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1912 /* x+ must match at the 1st pos of run of x's */
1913 r->reganch |= ROPT_SKIP;
1915 /* Scan is after the zeroth branch, first is atomic matcher. */
1916 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1917 (IV)(first - scan + 1)));
1919 * If there's something expensive in the r.e., find the
1920 * longest literal string that must appear and make it the
1921 * regmust. Resolve ties in favor of later strings, since
1922 * the regstart check works with the beginning of the r.e.
1923 * and avoiding duplication strengthens checking. Not a
1924 * strong reason, but sufficient in the absence of others.
1925 * [Now we resolve ties in favor of the earlier string if
1926 * it happens that c_offset_min has been invalidated, since the
1927 * earlier string may buy us something the later one won't.]
1931 data.longest_fixed = newSVpvn("",0);
1932 data.longest_float = newSVpvn("",0);
1933 data.last_found = newSVpvn("",0);
1934 data.longest = &(data.longest_fixed);
1936 if (!r->regstclass) {
1937 cl_init(pRExC_state, &ch_class);
1938 data.start_class = &ch_class;
1939 stclass_flag = SCF_DO_STCLASS_AND;
1940 } else /* XXXX Check for BOUND? */
1942 data.last_closep = &last_close;
1944 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1945 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1946 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1947 && data.last_start_min == 0 && data.last_end > 0
1948 && !RExC_seen_zerolen
1949 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1950 r->reganch |= ROPT_CHECK_ALL;
1951 scan_commit(pRExC_state, &data);
1952 SvREFCNT_dec(data.last_found);
1954 longest_float_length = CHR_SVLEN(data.longest_float);
1955 if (longest_float_length
1956 || (data.flags & SF_FL_BEFORE_EOL
1957 && (!(data.flags & SF_FL_BEFORE_MEOL)
1958 || (RExC_flags16 & PMf_MULTILINE)))) {
1961 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1962 && data.offset_fixed == data.offset_float_min
1963 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1964 goto remove_float; /* As in (a)+. */
1966 r->float_substr = data.longest_float;
1967 r->float_min_offset = data.offset_float_min;
1968 r->float_max_offset = data.offset_float_max;
1969 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1970 && (!(data.flags & SF_FL_BEFORE_MEOL)
1971 || (RExC_flags16 & PMf_MULTILINE)));
1972 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
1976 r->float_substr = Nullsv;
1977 SvREFCNT_dec(data.longest_float);
1978 longest_float_length = 0;
1981 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1982 if (longest_fixed_length
1983 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1984 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1985 || (RExC_flags16 & PMf_MULTILINE)))) {
1988 r->anchored_substr = data.longest_fixed;
1989 r->anchored_offset = data.offset_fixed;
1990 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1991 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1992 || (RExC_flags16 & PMf_MULTILINE)));
1993 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
1996 r->anchored_substr = Nullsv;
1997 SvREFCNT_dec(data.longest_fixed);
1998 longest_fixed_length = 0;
2001 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2002 r->regstclass = NULL;
2003 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
2004 && !(data.start_class->flags & ANYOF_EOS)
2005 && !cl_is_anything(data.start_class)) {
2006 I32 n = add_data(pRExC_state, 1, "f");
2008 New(1006, RExC_rx->data->data[n], 1,
2009 struct regnode_charclass_class);
2010 StructCopy(data.start_class,
2011 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2012 struct regnode_charclass_class);
2013 r->regstclass = (regnode*)RExC_rx->data->data[n];
2014 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2015 PL_regdata = r->data; /* for regprop() */
2016 DEBUG_r({ SV *sv = sv_newmortal();
2017 regprop(sv, (regnode*)data.start_class);
2018 PerlIO_printf(Perl_debug_log,
2019 "synthetic stclass `%s'.\n",
2023 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2024 if (longest_fixed_length > longest_float_length) {
2025 r->check_substr = r->anchored_substr;
2026 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2027 if (r->reganch & ROPT_ANCH_SINGLE)
2028 r->reganch |= ROPT_NOSCAN;
2031 r->check_substr = r->float_substr;
2032 r->check_offset_min = data.offset_float_min;
2033 r->check_offset_max = data.offset_float_max;
2035 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2036 This should be changed ASAP! */
2037 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
2038 r->reganch |= RE_USE_INTUIT;
2039 if (SvTAIL(r->check_substr))
2040 r->reganch |= RE_INTUIT_TAIL;
2044 /* Several toplevels. Best we can is to set minlen. */
2046 struct regnode_charclass_class ch_class;
2049 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2050 scan = r->program + 1;
2051 cl_init(pRExC_state, &ch_class);
2052 data.start_class = &ch_class;
2053 data.last_closep = &last_close;
2054 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2055 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
2056 if (!(data.start_class->flags & ANYOF_EOS)
2057 && !cl_is_anything(data.start_class)) {
2058 I32 n = add_data(pRExC_state, 1, "f");
2060 New(1006, RExC_rx->data->data[n], 1,
2061 struct regnode_charclass_class);
2062 StructCopy(data.start_class,
2063 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2064 struct regnode_charclass_class);
2065 r->regstclass = (regnode*)RExC_rx->data->data[n];
2066 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2067 DEBUG_r({ SV* sv = sv_newmortal();
2068 regprop(sv, (regnode*)data.start_class);
2069 PerlIO_printf(Perl_debug_log,
2070 "synthetic stclass `%s'.\n",
2076 if (RExC_seen & REG_SEEN_GPOS)
2077 r->reganch |= ROPT_GPOS_SEEN;
2078 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2079 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2080 if (RExC_seen & REG_SEEN_EVAL)
2081 r->reganch |= ROPT_EVAL_SEEN;
2082 if (RExC_seen & REG_SEEN_CANY)
2083 r->reganch |= ROPT_CANY_SEEN;
2084 Newz(1002, r->startp, RExC_npar, I32);
2085 Newz(1002, r->endp, RExC_npar, I32);
2086 PL_regdata = r->data; /* for regprop() */
2087 DEBUG_r(regdump(r));
2092 - reg - regular expression, i.e. main body or parenthesized thing
2094 * Caller must absorb opening parenthesis.
2096 * Combining parenthesis handling with the base level of regular expression
2097 * is a trifle forced, but the need to tie the tails of the branches to what
2098 * follows makes it hard to avoid.
2101 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2102 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2104 register regnode *ret; /* Will be the head of the group. */
2105 register regnode *br;
2106 register regnode *lastbr;
2107 register regnode *ender = 0;
2108 register I32 parno = 0;
2109 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
2111 /* for (?g), (?gc), and (?o) warnings; warning
2112 about (?c) will warn about (?g) -- japhy */
2114 I32 wastedflags = 0x00,
2117 wasted_gc = 0x02 | 0x04,
2120 char * parse_start = RExC_parse; /* MJD */
2121 char *oregcomp_parse = RExC_parse;
2124 *flagp = 0; /* Tentatively. */
2127 /* Make an OPEN node, if parenthesized. */
2129 if (*RExC_parse == '?') { /* (?...) */
2130 U16 posflags = 0, negflags = 0;
2131 U16 *flagsp = &posflags;
2133 char *seqstart = RExC_parse;
2136 paren = *RExC_parse++;
2137 ret = NULL; /* For look-ahead/behind. */
2139 case '<': /* (?<...) */
2140 RExC_seen |= REG_SEEN_LOOKBEHIND;
2141 if (*RExC_parse == '!')
2143 if (*RExC_parse != '=' && *RExC_parse != '!')
2146 case '=': /* (?=...) */
2147 case '!': /* (?!...) */
2148 RExC_seen_zerolen++;
2149 case ':': /* (?:...) */
2150 case '>': /* (?>...) */
2152 case '$': /* (?$...) */
2153 case '@': /* (?@...) */
2154 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2156 case '#': /* (?#...) */
2157 while (*RExC_parse && *RExC_parse != ')')
2159 if (*RExC_parse != ')')
2160 FAIL("Sequence (?#... not terminated");
2161 nextchar(pRExC_state);
2164 case 'p': /* (?p...) */
2165 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2166 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2168 case '?': /* (??...) */
2170 if (*RExC_parse != '{')
2172 paren = *RExC_parse++;
2174 case '{': /* (?{...}) */
2176 I32 count = 1, n = 0;
2178 char *s = RExC_parse;
2180 OP_4tree *sop, *rop;
2182 RExC_seen_zerolen++;
2183 RExC_seen |= REG_SEEN_EVAL;
2184 while (count && (c = *RExC_parse)) {
2185 if (c == '\\' && RExC_parse[1])
2193 if (*RExC_parse != ')')
2196 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2201 if (RExC_parse - 1 - s)
2202 sv = newSVpvn(s, RExC_parse - 1 - s);
2204 sv = newSVpvn("", 0);
2207 Perl_save_re_context(aTHX);
2208 rop = sv_compile_2op(sv, &sop, "re", &av);
2209 sop->op_private |= OPpREFCOUNTED;
2210 /* re_dup will OpREFCNT_inc */
2211 OpREFCNT_set(sop, 1);
2214 n = add_data(pRExC_state, 3, "nop");
2215 RExC_rx->data->data[n] = (void*)rop;
2216 RExC_rx->data->data[n+1] = (void*)sop;
2217 RExC_rx->data->data[n+2] = (void*)av;
2220 else { /* First pass */
2221 if (PL_reginterp_cnt < ++RExC_seen_evals
2222 && PL_curcop != &PL_compiling)
2223 /* No compiled RE interpolated, has runtime
2224 components ===> unsafe. */
2225 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2226 if (PL_tainting && PL_tainted)
2227 FAIL("Eval-group in insecure regular expression");
2230 nextchar(pRExC_state);
2232 ret = reg_node(pRExC_state, LOGICAL);
2235 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2236 /* deal with the length of this later - MJD */
2239 return reganode(pRExC_state, EVAL, n);
2241 case '(': /* (?(?{...})...) and (?(?=...)...) */
2243 if (RExC_parse[0] == '?') { /* (?(?...)) */
2244 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2245 || RExC_parse[1] == '<'
2246 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2249 ret = reg_node(pRExC_state, LOGICAL);
2252 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2256 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2258 parno = atoi(RExC_parse++);
2260 while (isDIGIT(*RExC_parse))
2262 ret = reganode(pRExC_state, GROUPP, parno);
2264 if ((c = *nextchar(pRExC_state)) != ')')
2265 vFAIL("Switch condition not recognized");
2267 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2268 br = regbranch(pRExC_state, &flags, 1);
2270 br = reganode(pRExC_state, LONGJMP, 0);
2272 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2273 c = *nextchar(pRExC_state);
2277 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2278 regbranch(pRExC_state, &flags, 1);
2279 regtail(pRExC_state, ret, lastbr);
2282 c = *nextchar(pRExC_state);
2287 vFAIL("Switch (?(condition)... contains too many branches");
2288 ender = reg_node(pRExC_state, TAIL);
2289 regtail(pRExC_state, br, ender);
2291 regtail(pRExC_state, lastbr, ender);
2292 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2295 regtail(pRExC_state, ret, ender);
2299 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2303 RExC_parse--; /* for vFAIL to print correctly */
2304 vFAIL("Sequence (? incomplete");
2308 parse_flags: /* (?i) */
2309 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2310 /* (?g), (?gc) and (?o) are useless here
2311 and must be globally applied -- japhy */
2313 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2314 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2315 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2316 if (! (wastedflags & wflagbit) ) {
2317 wastedflags |= wflagbit;
2320 "Useless (%s%c) - %suse /%c modifier",
2321 flagsp == &negflags ? "?-" : "?",
2323 flagsp == &negflags ? "don't " : "",
2329 else if (*RExC_parse == 'c') {
2330 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2331 if (! (wastedflags & wasted_c) ) {
2332 wastedflags |= wasted_gc;
2335 "Useless (%sc) - %suse /gc modifier",
2336 flagsp == &negflags ? "?-" : "?",
2337 flagsp == &negflags ? "don't " : ""
2342 else { pmflag(flagsp, *RExC_parse); }
2346 if (*RExC_parse == '-') {
2348 wastedflags = 0; /* reset so (?g-c) warns twice */
2352 RExC_flags16 |= posflags;
2353 RExC_flags16 &= ~negflags;
2354 if (*RExC_parse == ':') {
2360 if (*RExC_parse != ')') {
2362 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2364 nextchar(pRExC_state);
2372 ret = reganode(pRExC_state, OPEN, parno);
2373 Set_Node_Length(ret, 1); /* MJD */
2374 Set_Node_Offset(ret, RExC_parse); /* MJD */
2381 /* Pick up the branches, linking them together. */
2382 parse_start = RExC_parse; /* MJD */
2383 br = regbranch(pRExC_state, &flags, 1);
2384 /* branch_len = (paren != 0); */
2388 if (*RExC_parse == '|') {
2389 if (!SIZE_ONLY && RExC_extralen) {
2390 reginsert(pRExC_state, BRANCHJ, br);
2393 reginsert(pRExC_state, BRANCH, br);
2394 Set_Node_Length(br, paren != 0);
2395 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2399 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2401 else if (paren == ':') {
2402 *flagp |= flags&SIMPLE;
2404 if (open) { /* Starts with OPEN. */
2405 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2407 else if (paren != '?') /* Not Conditional */
2409 *flagp |= flags & (SPSTART | HASWIDTH);
2411 while (*RExC_parse == '|') {
2412 if (!SIZE_ONLY && RExC_extralen) {
2413 ender = reganode(pRExC_state, LONGJMP,0);
2414 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2417 RExC_extralen += 2; /* Account for LONGJMP. */
2418 nextchar(pRExC_state);
2419 br = regbranch(pRExC_state, &flags, 0);
2423 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2427 *flagp |= flags&SPSTART;
2430 if (have_branch || paren != ':') {
2431 /* Make a closing node, and hook it on the end. */
2434 ender = reg_node(pRExC_state, TAIL);
2437 ender = reganode(pRExC_state, CLOSE, parno);
2438 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2439 Set_Node_Length(ender,1); /* MJD */
2445 *flagp &= ~HASWIDTH;
2448 ender = reg_node(pRExC_state, SUCCEED);
2451 ender = reg_node(pRExC_state, END);
2454 regtail(pRExC_state, lastbr, ender);
2457 /* Hook the tails of the branches to the closing node. */
2458 for (br = ret; br != NULL; br = regnext(br)) {
2459 regoptail(pRExC_state, br, ender);
2466 static char parens[] = "=!<,>";
2468 if (paren && (p = strchr(parens, paren))) {
2469 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2470 int flag = (p - parens) > 1;
2473 node = SUSPEND, flag = 0;
2474 reginsert(pRExC_state, node,ret);
2476 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2480 /* Check for proper termination. */
2482 RExC_flags16 = oregflags;
2483 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2484 RExC_parse = oregcomp_parse;
2485 vFAIL("Unmatched (");
2488 else if (!paren && RExC_parse < RExC_end) {
2489 if (*RExC_parse == ')') {
2491 vFAIL("Unmatched )");
2494 FAIL("Junk on end of regexp"); /* "Can't happen". */
2502 - regbranch - one alternative of an | operator
2504 * Implements the concatenation operator.
2507 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2509 register regnode *ret;
2510 register regnode *chain = NULL;
2511 register regnode *latest;
2512 I32 flags = 0, c = 0;
2517 if (!SIZE_ONLY && RExC_extralen)
2518 ret = reganode(pRExC_state, BRANCHJ,0);
2520 ret = reg_node(pRExC_state, BRANCH);
2521 Set_Node_Length(ret, 1);
2525 if (!first && SIZE_ONLY)
2526 RExC_extralen += 1; /* BRANCHJ */
2528 *flagp = WORST; /* Tentatively. */
2531 nextchar(pRExC_state);
2532 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2534 latest = regpiece(pRExC_state, &flags);
2535 if (latest == NULL) {
2536 if (flags & TRYAGAIN)
2540 else if (ret == NULL)
2542 *flagp |= flags&HASWIDTH;
2543 if (chain == NULL) /* First piece. */
2544 *flagp |= flags&SPSTART;
2547 regtail(pRExC_state, chain, latest);
2552 if (chain == NULL) { /* Loop ran zero times. */
2553 chain = reg_node(pRExC_state, NOTHING);
2558 *flagp |= flags&SIMPLE;
2565 - regpiece - something followed by possible [*+?]
2567 * Note that the branching code sequences used for ? and the general cases
2568 * of * and + are somewhat optimized: they use the same NOTHING node as
2569 * both the endmarker for their branch list and the body of the last branch.
2570 * It might seem that this node could be dispensed with entirely, but the
2571 * endmarker role is not redundant.
2574 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2576 register regnode *ret;
2578 register char *next;
2580 char *origparse = RExC_parse;
2583 I32 max = REG_INFTY;
2586 ret = regatom(pRExC_state, &flags);
2588 if (flags & TRYAGAIN)
2595 if (op == '{' && regcurly(RExC_parse)) {
2596 parse_start = RExC_parse; /* MJD */
2597 next = RExC_parse + 1;
2599 while (isDIGIT(*next) || *next == ',') {
2608 if (*next == '}') { /* got one */
2612 min = atoi(RExC_parse);
2616 maxpos = RExC_parse;
2618 if (!max && *maxpos != '0')
2619 max = REG_INFTY; /* meaning "infinity" */
2620 else if (max >= REG_INFTY)
2621 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2623 nextchar(pRExC_state);
2626 if ((flags&SIMPLE)) {
2627 RExC_naughty += 2 + RExC_naughty / 2;
2628 reginsert(pRExC_state, CURLY, ret);
2629 Set_Node_Offset(ret, parse_start+1); /* MJD */
2630 Set_Node_Cur_Length(ret);
2633 regnode *w = reg_node(pRExC_state, WHILEM);
2636 regtail(pRExC_state, ret, w);
2637 if (!SIZE_ONLY && RExC_extralen) {
2638 reginsert(pRExC_state, LONGJMP,ret);
2639 reginsert(pRExC_state, NOTHING,ret);
2640 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2642 reginsert(pRExC_state, CURLYX,ret);
2644 Set_Node_Offset(ret, parse_start+1);
2645 Set_Node_Length(ret,
2646 op == '{' ? (RExC_parse - parse_start) : 1);
2648 if (!SIZE_ONLY && RExC_extralen)
2649 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2650 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2652 RExC_whilem_seen++, RExC_extralen += 3;
2653 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2661 if (max && max < min)
2662 vFAIL("Can't do {n,m} with n > m");
2677 #if 0 /* Now runtime fix should be reliable. */
2679 /* if this is reinstated, don't forget to put this back into perldiag:
2681 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2683 (F) The part of the regexp subject to either the * or + quantifier
2684 could match an empty string. The {#} shows in the regular
2685 expression about where the problem was discovered.
2689 if (!(flags&HASWIDTH) && op != '?')
2690 vFAIL("Regexp *+ operand could be empty");
2693 parse_start = RExC_parse;
2694 nextchar(pRExC_state);
2696 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2698 if (op == '*' && (flags&SIMPLE)) {
2699 reginsert(pRExC_state, STAR, ret);
2703 else if (op == '*') {
2707 else if (op == '+' && (flags&SIMPLE)) {
2708 reginsert(pRExC_state, PLUS, ret);
2712 else if (op == '+') {
2716 else if (op == '?') {
2721 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2723 "%.*s matches null string many times",
2724 RExC_parse - origparse,
2728 if (*RExC_parse == '?') {
2729 nextchar(pRExC_state);
2730 reginsert(pRExC_state, MINMOD, ret);
2731 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2733 if (ISMULT2(RExC_parse)) {
2735 vFAIL("Nested quantifiers");
2742 - regatom - the lowest level
2744 * Optimization: gobbles an entire sequence of ordinary characters so that
2745 * it can turn them into a single node, which is smaller to store and
2746 * faster to run. Backslashed characters are exceptions, each becoming a
2747 * separate node; the code is simpler that way and it's not worth fixing.
2749 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2751 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2753 register regnode *ret = 0;
2755 char *parse_start = 0;
2757 *flagp = WORST; /* Tentatively. */
2760 switch (*RExC_parse) {
2762 RExC_seen_zerolen++;
2763 nextchar(pRExC_state);
2764 if (RExC_flags16 & PMf_MULTILINE)
2765 ret = reg_node(pRExC_state, MBOL);
2766 else if (RExC_flags16 & PMf_SINGLELINE)
2767 ret = reg_node(pRExC_state, SBOL);
2769 ret = reg_node(pRExC_state, BOL);
2770 Set_Node_Length(ret, 1); /* MJD */
2773 nextchar(pRExC_state);
2775 RExC_seen_zerolen++;
2776 if (RExC_flags16 & PMf_MULTILINE)
2777 ret = reg_node(pRExC_state, MEOL);
2778 else if (RExC_flags16 & PMf_SINGLELINE)
2779 ret = reg_node(pRExC_state, SEOL);
2781 ret = reg_node(pRExC_state, EOL);
2782 Set_Node_Length(ret, 1); /* MJD */
2785 nextchar(pRExC_state);
2786 if (RExC_flags16 & PMf_SINGLELINE)
2787 ret = reg_node(pRExC_state, SANY);
2789 ret = reg_node(pRExC_state, REG_ANY);
2790 *flagp |= HASWIDTH|SIMPLE;
2792 Set_Node_Length(ret, 1); /* MJD */
2796 char *oregcomp_parse = ++RExC_parse;
2797 ret = regclass(pRExC_state);
2798 if (*RExC_parse != ']') {
2799 RExC_parse = oregcomp_parse;
2800 vFAIL("Unmatched [");
2802 nextchar(pRExC_state);
2803 *flagp |= HASWIDTH|SIMPLE;
2804 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2808 nextchar(pRExC_state);
2809 ret = reg(pRExC_state, 1, &flags);
2811 if (flags & TRYAGAIN) {
2812 if (RExC_parse == RExC_end) {
2813 /* Make parent create an empty node if needed. */
2821 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2825 if (flags & TRYAGAIN) {
2829 vFAIL("Internal urp");
2830 /* Supposed to be caught earlier. */
2833 if (!regcurly(RExC_parse)) {
2842 vFAIL("Quantifier follows nothing");
2845 switch (*++RExC_parse) {
2847 RExC_seen_zerolen++;
2848 ret = reg_node(pRExC_state, SBOL);
2850 nextchar(pRExC_state);
2851 Set_Node_Length(ret, 2); /* MJD */
2854 ret = reg_node(pRExC_state, GPOS);
2855 RExC_seen |= REG_SEEN_GPOS;
2857 nextchar(pRExC_state);
2858 Set_Node_Length(ret, 2); /* MJD */
2861 ret = reg_node(pRExC_state, SEOL);
2863 RExC_seen_zerolen++; /* Do not optimize RE away */
2864 nextchar(pRExC_state);
2867 ret = reg_node(pRExC_state, EOS);
2869 RExC_seen_zerolen++; /* Do not optimize RE away */
2870 nextchar(pRExC_state);
2871 Set_Node_Length(ret, 2); /* MJD */
2874 ret = reg_node(pRExC_state, CANY);
2875 RExC_seen |= REG_SEEN_CANY;
2876 *flagp |= HASWIDTH|SIMPLE;
2877 nextchar(pRExC_state);
2878 Set_Node_Length(ret, 2); /* MJD */
2881 ret = reg_node(pRExC_state, CLUMP);
2883 nextchar(pRExC_state);
2884 Set_Node_Length(ret, 2); /* MJD */
2887 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
2888 *flagp |= HASWIDTH|SIMPLE;
2889 nextchar(pRExC_state);
2890 Set_Node_Length(ret, 2); /* MJD */
2893 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
2894 *flagp |= HASWIDTH|SIMPLE;
2895 nextchar(pRExC_state);
2896 Set_Node_Length(ret, 2); /* MJD */
2899 RExC_seen_zerolen++;
2900 RExC_seen |= REG_SEEN_LOOKBEHIND;
2901 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
2903 nextchar(pRExC_state);
2904 Set_Node_Length(ret, 2); /* MJD */
2907 RExC_seen_zerolen++;
2908 RExC_seen |= REG_SEEN_LOOKBEHIND;
2909 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
2911 nextchar(pRExC_state);
2912 Set_Node_Length(ret, 2); /* MJD */
2915 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
2916 *flagp |= HASWIDTH|SIMPLE;
2917 nextchar(pRExC_state);
2918 Set_Node_Length(ret, 2); /* MJD */
2921 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
2922 *flagp |= HASWIDTH|SIMPLE;
2923 nextchar(pRExC_state);
2924 Set_Node_Length(ret, 2); /* MJD */
2927 ret = reg_node(pRExC_state, DIGIT);
2928 *flagp |= HASWIDTH|SIMPLE;
2929 nextchar(pRExC_state);
2930 Set_Node_Length(ret, 2); /* MJD */
2933 ret = reg_node(pRExC_state, NDIGIT);
2934 *flagp |= HASWIDTH|SIMPLE;
2935 nextchar(pRExC_state);
2936 Set_Node_Length(ret, 2); /* MJD */
2941 char* oldregxend = RExC_end;
2942 char* parse_start = RExC_parse;
2944 if (RExC_parse[1] == '{') {
2945 /* a lovely hack--pretend we saw [\pX] instead */
2946 RExC_end = strchr(RExC_parse, '}');
2948 U8 c = (U8)*RExC_parse;
2950 RExC_end = oldregxend;
2951 vFAIL2("Missing right brace on \\%c{}", c);
2956 RExC_end = RExC_parse + 2;
2959 ret = regclass(pRExC_state);
2961 RExC_end = oldregxend;
2963 Set_Node_Cur_Length(ret); /* MJD */
2964 nextchar(pRExC_state);
2965 *flagp |= HASWIDTH|SIMPLE;
2978 case '1': case '2': case '3': case '4':
2979 case '5': case '6': case '7': case '8': case '9':
2981 I32 num = atoi(RExC_parse);
2983 if (num > 9 && num >= RExC_npar)
2986 char * parse_start = RExC_parse - 1; /* MJD */
2987 while (isDIGIT(*RExC_parse))
2990 if (!SIZE_ONLY && num > RExC_rx->nparens)
2991 vFAIL("Reference to nonexistent group");
2993 ret = reganode(pRExC_state, FOLD
2994 ? (LOC ? REFFL : REFF)
2998 /* override incorrect value set in reganode MJD */
2999 Set_Node_Offset(ret, parse_start+1);
3000 Set_Node_Cur_Length(ret); /* MJD */
3002 nextchar(pRExC_state);
3007 if (RExC_parse >= RExC_end)
3008 FAIL("Trailing \\");
3011 /* Do not generate `unrecognized' warnings here, we fall
3012 back into the quick-grab loop below */
3018 if (RExC_flags16 & PMf_EXTENDED) {
3019 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3020 if (RExC_parse < RExC_end)
3026 register STRLEN len;
3032 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3034 parse_start = RExC_parse - 1;
3040 ret = reg_node(pRExC_state, FOLD
3041 ? (LOC ? EXACTFL : EXACTF)
3044 for (len = 0, p = RExC_parse - 1;
3045 len < 127 && p < RExC_end;
3050 if (RExC_flags16 & PMf_EXTENDED)
3051 p = regwhite(p, RExC_end);
3098 ender = ASCII_TO_NATIVE('\033');
3102 ender = ASCII_TO_NATIVE('\007');
3107 char* e = strchr(p, '}');
3111 vFAIL("Missing right brace on \\x{}");
3114 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3115 | PERL_SCAN_DISALLOW_PREFIX;
3117 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3120 /* numlen is generous */
3121 if (numlen + len >= 127) {
3129 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3131 ender = grok_hex(p, &numlen, &flags, NULL);
3137 ender = UCHARAT(p++);
3138 ender = toCTRL(ender);
3140 case '0': case '1': case '2': case '3':case '4':
3141 case '5': case '6': case '7': case '8':case '9':
3143 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3146 ender = grok_oct(p, &numlen, &flags, NULL);
3156 FAIL("Trailing \\");
3159 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3160 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3161 goto normal_default;
3166 if (UTF8_IS_START(*p) && UTF) {
3167 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3175 if (RExC_flags16 & PMf_EXTENDED)
3176 p = regwhite(p, RExC_end);
3178 /* Prime the casefolded buffer. */
3179 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3181 if (ISMULT2(p)) { /* Back off on ?+*. */
3188 /* Emit all the Unicode characters. */
3189 for (foldbuf = tmpbuf;
3191 foldlen -= numlen) {
3192 ender = utf8_to_uvchr(foldbuf, &numlen);
3194 reguni(pRExC_state, ender, s, &unilen);
3197 /* In EBCDIC the numlen
3198 * and unilen can differ. */
3200 if (numlen >= foldlen)
3204 break; /* "Can't happen." */
3208 reguni(pRExC_state, ender, s, &unilen);
3225 /* Emit all the Unicode characters. */
3226 for (foldbuf = tmpbuf;
3228 foldlen -= numlen) {
3229 ender = utf8_to_uvchr(foldbuf, &numlen);
3231 reguni(pRExC_state, ender, s, &unilen);
3234 /* In EBCDIC the numlen
3235 * and unilen can differ. */
3237 if (numlen >= foldlen)
3245 reguni(pRExC_state, ender, s, &unilen);
3258 Set_Node_Cur_Length(ret); /* MJD */
3259 nextchar(pRExC_state);
3261 /* len is STRLEN which is unsigned, need to copy to signed */
3264 vFAIL("Internal disaster");
3273 RExC_size += STR_SZ(len);
3275 RExC_emit += STR_SZ(len);
3280 /* If the encoding pragma is in effect recode the text of
3281 * any EXACT-kind nodes. */
3282 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3283 STRLEN oldlen = STR_LEN(ret);
3284 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3288 if (sv_utf8_downgrade(sv, TRUE)) {
3289 char *s = sv_recode_to_utf8(sv, PL_encoding);
3290 STRLEN newlen = SvCUR(sv);
3293 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3294 (int)oldlen, STRING(ret),
3296 Copy(s, STRING(ret), newlen, char);
3297 STR_LEN(ret) += newlen - oldlen;
3298 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3300 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3308 S_regwhite(pTHX_ char *p, char *e)
3313 else if (*p == '#') {
3316 } while (p < e && *p != '\n');
3324 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3325 Character classes ([:foo:]) can also be negated ([:^foo:]).
3326 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3327 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3328 but trigger failures because they are currently unimplemented. */
3330 #define POSIXCC_DONE(c) ((c) == ':')
3331 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3332 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3335 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3338 I32 namedclass = OOB_NAMEDCLASS;
3340 if (value == '[' && RExC_parse + 1 < RExC_end &&
3341 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3342 POSIXCC(UCHARAT(RExC_parse))) {
3343 char c = UCHARAT(RExC_parse);
3344 char* s = RExC_parse++;
3346 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3348 if (RExC_parse == RExC_end)
3349 /* Grandfather lone [:, [=, [. */
3352 char* t = RExC_parse++; /* skip over the c */
3354 if (UCHARAT(RExC_parse) == ']') {
3355 RExC_parse++; /* skip over the ending ] */
3358 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3359 I32 skip = 5; /* the most common skip */
3363 if (strnEQ(posixcc, "alnum", 5))
3365 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3366 else if (strnEQ(posixcc, "alpha", 5))
3368 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3369 else if (strnEQ(posixcc, "ascii", 5))
3371 complement ? ANYOF_NASCII : ANYOF_ASCII;
3374 if (strnEQ(posixcc, "blank", 5))
3376 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3379 if (strnEQ(posixcc, "cntrl", 5))
3381 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3384 if (strnEQ(posixcc, "digit", 5))
3386 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3389 if (strnEQ(posixcc, "graph", 5))
3391 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3394 if (strnEQ(posixcc, "lower", 5))
3396 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3399 if (strnEQ(posixcc, "print", 5))
3401 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3402 else if (strnEQ(posixcc, "punct", 5))
3404 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3407 if (strnEQ(posixcc, "space", 5))
3409 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3412 if (strnEQ(posixcc, "upper", 5))
3414 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3416 case 'w': /* this is not POSIX, this is the Perl \w */
3417 if (strnEQ(posixcc, "word", 4)) {
3419 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3424 if (strnEQ(posixcc, "xdigit", 6)) {
3426 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3431 if (namedclass == OOB_NAMEDCLASS ||
3432 posixcc[skip] != ':' ||
3433 posixcc[skip+1] != ']')
3435 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3438 } else if (!SIZE_ONLY) {
3439 /* [[=foo=]] and [[.foo.]] are still future. */
3441 /* adjust RExC_parse so the warning shows after
3443 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3445 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3448 /* Maternal grandfather:
3449 * "[:" ending in ":" but not in ":]" */
3459 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3461 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3462 char *s = RExC_parse;
3465 while(*s && isALNUM(*s))
3467 if (*s && c == *s && s[1] == ']') {
3468 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3470 /* [[=foo=]] and [[.foo.]] are still future. */
3471 if (POSIXCC_NOTYET(c)) {
3472 /* adjust RExC_parse so the error shows after
3474 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3476 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3483 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3486 register UV nextvalue;
3487 register IV prevvalue = OOB_UNICODE;
3488 register IV range = 0;
3489 register regnode *ret;
3492 char *rangebegin = 0;
3493 bool need_class = 0;
3494 SV *listsv = Nullsv;
3497 bool optimize_invert = TRUE;
3498 AV* unicode_alternate = 0;
3500 ret = reganode(pRExC_state, ANYOF, 0);
3503 ANYOF_FLAGS(ret) = 0;
3505 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3509 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3513 RExC_size += ANYOF_SKIP;
3515 RExC_emit += ANYOF_SKIP;
3517 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3519 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3520 ANYOF_BITMAP_ZERO(ret);
3521 listsv = newSVpvn("# comment\n", 10);
3524 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3526 if (!SIZE_ONLY && POSIXCC(nextvalue))
3527 checkposixcc(pRExC_state);
3529 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3530 if (UCHARAT(RExC_parse) == ']')
3533 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3537 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3540 rangebegin = RExC_parse;
3542 value = utf8n_to_uvchr((U8*)RExC_parse,
3543 RExC_end - RExC_parse,
3545 RExC_parse += numlen;
3548 value = UCHARAT(RExC_parse++);
3549 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3550 if (value == '[' && POSIXCC(nextvalue))
3551 namedclass = regpposixcc(pRExC_state, value);
3552 else if (value == '\\') {
3554 value = utf8n_to_uvchr((U8*)RExC_parse,
3555 RExC_end - RExC_parse,
3557 RExC_parse += numlen;
3560 value = UCHARAT(RExC_parse++);
3561 /* Some compilers cannot handle switching on 64-bit integer
3562 * values, therefore value cannot be an UV. Yes, this will
3563 * be a problem later if we want switch on Unicode.
3564 * A similar issue a little bit later when switching on
3565 * namedclass. --jhi */
3566 switch ((I32)value) {
3567 case 'w': namedclass = ANYOF_ALNUM; break;
3568 case 'W': namedclass = ANYOF_NALNUM; break;
3569 case 's': namedclass = ANYOF_SPACE; break;
3570 case 'S': namedclass = ANYOF_NSPACE; break;
3571 case 'd': namedclass = ANYOF_DIGIT; break;
3572 case 'D': namedclass = ANYOF_NDIGIT; break;
3575 if (*RExC_parse == '{') {
3577 e = strchr(RExC_parse++, '}');
3579 vFAIL2("Missing right brace on \\%c{}", c);
3580 while (isSPACE(UCHARAT(RExC_parse)))
3582 if (e == RExC_parse)
3583 vFAIL2("Empty \\%c{}", c);
3585 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3593 if (UCHARAT(RExC_parse) == '^') {
3596 value = value == 'p' ? 'P' : 'p'; /* toggle */
3597 while (isSPACE(UCHARAT(RExC_parse))) {
3603 Perl_sv_catpvf(aTHX_ listsv,
3604 "+utf8::%.*s\n", (int)n, RExC_parse);
3606 Perl_sv_catpvf(aTHX_ listsv,
3607 "!utf8::%.*s\n", (int)n, RExC_parse);
3610 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3612 case 'n': value = '\n'; break;
3613 case 'r': value = '\r'; break;
3614 case 't': value = '\t'; break;
3615 case 'f': value = '\f'; break;
3616 case 'b': value = '\b'; break;
3617 case 'e': value = ASCII_TO_NATIVE('\033');break;
3618 case 'a': value = ASCII_TO_NATIVE('\007');break;
3620 if (*RExC_parse == '{') {
3621 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3622 | PERL_SCAN_DISALLOW_PREFIX;
3623 e = strchr(RExC_parse++, '}');
3625 vFAIL("Missing right brace on \\x{}");
3627 numlen = e - RExC_parse;
3628 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3632 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3634 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3635 RExC_parse += numlen;
3639 value = UCHARAT(RExC_parse++);
3640 value = toCTRL(value);
3642 case '0': case '1': case '2': case '3': case '4':
3643 case '5': case '6': case '7': case '8': case '9':
3647 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3648 RExC_parse += numlen;
3652 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3654 "Unrecognized escape \\%c in character class passed through",
3658 } /* end of \blah */
3660 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3662 if (!SIZE_ONLY && !need_class)
3663 ANYOF_CLASS_ZERO(ret);
3667 /* a bad range like a-\d, a-[:digit:] ? */
3670 if (ckWARN(WARN_REGEXP))
3672 "False [] range \"%*.*s\"",
3673 RExC_parse - rangebegin,
3674 RExC_parse - rangebegin,
3676 if (prevvalue < 256) {
3677 ANYOF_BITMAP_SET(ret, prevvalue);
3678 ANYOF_BITMAP_SET(ret, '-');
3681 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3682 Perl_sv_catpvf(aTHX_ listsv,
3683 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3687 range = 0; /* this was not a true range */
3691 if (namedclass > OOB_NAMEDCLASS)
3692 optimize_invert = FALSE;
3693 /* Possible truncation here but in some 64-bit environments
3694 * the compiler gets heartburn about switch on 64-bit values.
3695 * A similar issue a little earlier when switching on value.
3697 switch ((I32)namedclass) {
3700 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3702 for (value = 0; value < 256; value++)
3704 ANYOF_BITMAP_SET(ret, value);
3706 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
3710 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3712 for (value = 0; value < 256; value++)
3713 if (!isALNUM(value))
3714 ANYOF_BITMAP_SET(ret, value);
3716 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3720 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3722 for (value = 0; value < 256; value++)
3723 if (isALNUMC(value))
3724 ANYOF_BITMAP_SET(ret, value);
3726 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3730 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3732 for (value = 0; value < 256; value++)
3733 if (!isALNUMC(value))
3734 ANYOF_BITMAP_SET(ret, value);
3736 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3740 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3742 for (value = 0; value < 256; value++)
3744 ANYOF_BITMAP_SET(ret, value);
3746 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3750 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3752 for (value = 0; value < 256; value++)
3753 if (!isALPHA(value))
3754 ANYOF_BITMAP_SET(ret, value);
3756 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3760 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3763 for (value = 0; value < 128; value++)
3764 ANYOF_BITMAP_SET(ret, value);
3766 for (value = 0; value < 256; value++) {
3768 ANYOF_BITMAP_SET(ret, value);
3772 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3776 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3779 for (value = 128; value < 256; value++)
3780 ANYOF_BITMAP_SET(ret, value);
3782 for (value = 0; value < 256; value++) {
3783 if (!isASCII(value))
3784 ANYOF_BITMAP_SET(ret, value);
3788 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3792 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3794 for (value = 0; value < 256; value++)
3796 ANYOF_BITMAP_SET(ret, value);
3798 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3802 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3804 for (value = 0; value < 256; value++)
3805 if (!isBLANK(value))
3806 ANYOF_BITMAP_SET(ret, value);
3808 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3812 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3814 for (value = 0; value < 256; value++)
3816 ANYOF_BITMAP_SET(ret, value);
3818 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3822 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3824 for (value = 0; value < 256; value++)
3825 if (!isCNTRL(value))
3826 ANYOF_BITMAP_SET(ret, value);
3828 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3832 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3834 /* consecutive digits assumed */
3835 for (value = '0'; value <= '9'; value++)
3836 ANYOF_BITMAP_SET(ret, value);
3838 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3842 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3844 /* consecutive digits assumed */
3845 for (value = 0; value < '0'; value++)
3846 ANYOF_BITMAP_SET(ret, value);
3847 for (value = '9' + 1; value < 256; value++)
3848 ANYOF_BITMAP_SET(ret, value);
3850 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3854 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3856 for (value = 0; value < 256; value++)
3858 ANYOF_BITMAP_SET(ret, value);
3860 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3864 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3866 for (value = 0; value < 256; value++)
3867 if (!isGRAPH(value))
3868 ANYOF_BITMAP_SET(ret, value);
3870 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3874 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3876 for (value = 0; value < 256; value++)
3878 ANYOF_BITMAP_SET(ret, value);
3880 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3884 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3886 for (value = 0; value < 256; value++)
3887 if (!isLOWER(value))
3888 ANYOF_BITMAP_SET(ret, value);
3890 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3894 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3896 for (value = 0; value < 256; value++)
3898 ANYOF_BITMAP_SET(ret, value);
3900 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3904 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3906 for (value = 0; value < 256; value++)
3907 if (!isPRINT(value))
3908 ANYOF_BITMAP_SET(ret, value);
3910 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3914 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3916 for (value = 0; value < 256; value++)
3917 if (isPSXSPC(value))
3918 ANYOF_BITMAP_SET(ret, value);
3920 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3924 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3926 for (value = 0; value < 256; value++)
3927 if (!isPSXSPC(value))
3928 ANYOF_BITMAP_SET(ret, value);
3930 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3934 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3936 for (value = 0; value < 256; value++)
3938 ANYOF_BITMAP_SET(ret, value);
3940 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3944 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3946 for (value = 0; value < 256; value++)
3947 if (!isPUNCT(value))
3948 ANYOF_BITMAP_SET(ret, value);
3950 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3954 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3956 for (value = 0; value < 256; value++)
3958 ANYOF_BITMAP_SET(ret, value);
3960 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3964 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3966 for (value = 0; value < 256; value++)
3967 if (!isSPACE(value))
3968 ANYOF_BITMAP_SET(ret, value);
3970 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
3974 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
3976 for (value = 0; value < 256; value++)
3978 ANYOF_BITMAP_SET(ret, value);
3980 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
3984 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
3986 for (value = 0; value < 256; value++)
3987 if (!isUPPER(value))
3988 ANYOF_BITMAP_SET(ret, value);
3990 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
3994 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
3996 for (value = 0; value < 256; value++)
3997 if (isXDIGIT(value))
3998 ANYOF_BITMAP_SET(ret, value);
4000 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4004 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4006 for (value = 0; value < 256; value++)
4007 if (!isXDIGIT(value))
4008 ANYOF_BITMAP_SET(ret, value);
4010 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4013 vFAIL("Invalid [::] class");
4017 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4020 } /* end of namedclass \blah */
4023 if (prevvalue > value) /* b-a */ {
4024 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4025 RExC_parse - rangebegin,
4026 RExC_parse - rangebegin,
4028 range = 0; /* not a valid range */
4032 prevvalue = value; /* save the beginning of the range */
4033 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4034 RExC_parse[1] != ']') {
4037 /* a bad range like \w-, [:word:]- ? */
4038 if (namedclass > OOB_NAMEDCLASS) {
4039 if (ckWARN(WARN_REGEXP))
4041 "False [] range \"%*.*s\"",
4042 RExC_parse - rangebegin,
4043 RExC_parse - rangebegin,
4046 ANYOF_BITMAP_SET(ret, '-');
4048 range = 1; /* yeah, it's a range! */
4049 continue; /* but do it the next time */
4053 /* now is the next time */
4057 if (prevvalue < 256) {
4058 IV ceilvalue = value < 256 ? value : 255;
4061 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4062 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
4064 if (isLOWER(prevvalue)) {
4065 for (i = prevvalue; i <= ceilvalue; i++)
4067 ANYOF_BITMAP_SET(ret, i);
4069 for (i = prevvalue; i <= ceilvalue; i++)
4071 ANYOF_BITMAP_SET(ret, i);
4076 for (i = prevvalue; i <= ceilvalue; i++)
4077 ANYOF_BITMAP_SET(ret, i);
4079 if (value > 255 || UTF) {
4080 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4081 UV natvalue = NATIVE_TO_UNI(value);
4083 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4084 if (prevnatvalue < natvalue) { /* what about > ? */
4085 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4086 prevnatvalue, natvalue);
4088 else if (prevnatvalue == natvalue) {
4089 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4091 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4093 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4095 /* If folding and foldable and a single
4096 * character, insert also the folded version
4097 * to the charclass. */
4099 if (foldlen == UNISKIP(f))
4100 Perl_sv_catpvf(aTHX_ listsv,
4103 /* Any multicharacter foldings
4104 * require the following transform:
4105 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4106 * where E folds into "pq" and F folds
4107 * into "rst", all other characters
4108 * fold to single characters. We save
4109 * away these multicharacter foldings,
4110 * to be later saved as part of the
4111 * additional "s" data. */
4114 if (!unicode_alternate)
4115 unicode_alternate = newAV();
4116 sv = newSVpvn((char*)foldbuf, foldlen);
4118 av_push(unicode_alternate, sv);
4122 /* If folding and the value is one of the Greek
4123 * sigmas insert a few more sigmas to make the
4124 * folding rules of the sigmas to work right.
4125 * Note that not all the possible combinations
4126 * are handled here: some of them are handled
4127 * by the standard folding rules, and some of
4128 * them (literal or EXACTF cases) are handled
4129 * during runtime in regexec.c:S_find_byclass(). */
4130 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4131 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4132 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4133 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4134 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4136 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4137 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4138 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4144 range = 0; /* this range (if it was one) is done now */
4148 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4150 RExC_size += ANYOF_CLASS_ADD_SKIP;
4152 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4155 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4157 /* If the only flag is folding (plus possibly inversion). */
4158 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4160 for (value = 0; value < 256; ++value) {
4161 if (ANYOF_BITMAP_TEST(ret, value)) {
4162 IV fold = PL_fold[value];
4165 ANYOF_BITMAP_SET(ret, fold);
4168 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4171 /* optimize inverted simple patterns (e.g. [^a-z]) */
4172 if (!SIZE_ONLY && optimize_invert &&
4173 /* If the only flag is inversion. */
4174 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4175 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4176 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4177 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4184 /* The 0th element stores the character class description
4185 * in its textual form: used later (regexec.c:Perl_regclass_swatch())
4186 * to initialize the appropriate swash (which gets stored in
4187 * the 1st element), and also useful for dumping the regnode.
4188 * The 2nd element stores the multicharacter foldings,
4189 * used later (regexec.c:s_reginclasslen()). */
4190 av_store(av, 0, listsv);
4191 av_store(av, 1, NULL);
4192 av_store(av, 2, (SV*)unicode_alternate);
4193 rv = newRV_noinc((SV*)av);
4194 n = add_data(pRExC_state, 1, "s");
4195 RExC_rx->data->data[n] = (void*)rv;
4203 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4205 char* retval = RExC_parse++;
4208 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4209 RExC_parse[2] == '#') {
4210 while (*RExC_parse && *RExC_parse != ')')
4215 if (RExC_flags16 & PMf_EXTENDED) {
4216 if (isSPACE(*RExC_parse)) {
4220 else if (*RExC_parse == '#') {
4221 while (*RExC_parse && *RExC_parse != '\n')
4232 - reg_node - emit a node
4234 STATIC regnode * /* Location. */
4235 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4237 register regnode *ret;
4238 register regnode *ptr;
4242 SIZE_ALIGN(RExC_size);
4247 NODE_ALIGN_FILL(ret);
4249 FILL_ADVANCE_NODE(ptr, op);
4250 if (RExC_offsets) { /* MJD */
4251 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4252 "reg_node", __LINE__,
4254 RExC_emit - RExC_emit_start > RExC_offsets[0]
4255 ? "Overwriting end of array!\n" : "OK",
4256 RExC_emit - RExC_emit_start,
4257 RExC_parse - RExC_start,
4259 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4268 - reganode - emit a node with an argument
4270 STATIC regnode * /* Location. */
4271 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4273 register regnode *ret;
4274 register regnode *ptr;
4278 SIZE_ALIGN(RExC_size);
4283 NODE_ALIGN_FILL(ret);
4285 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4286 if (RExC_offsets) { /* MJD */
4287 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4289 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4290 "Overwriting end of array!\n" : "OK",
4291 RExC_emit - RExC_emit_start,
4292 RExC_parse - RExC_start,
4294 Set_Cur_Node_Offset;
4303 - reguni - emit (if appropriate) a Unicode character
4306 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4308 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4312 - reginsert - insert an operator in front of already-emitted operand
4314 * Means relocating the operand.
4317 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4319 register regnode *src;
4320 register regnode *dst;
4321 register regnode *place;
4322 register int offset = regarglen[(U8)op];
4324 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4327 RExC_size += NODE_STEP_REGNODE + offset;
4332 RExC_emit += NODE_STEP_REGNODE + offset;
4334 while (src > opnd) {
4335 StructCopy(--src, --dst, regnode);
4336 if (RExC_offsets) { /* MJD 20010112 */
4337 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4339 dst - RExC_emit_start > RExC_offsets[0]
4340 ? "Overwriting end of array!\n" : "OK",
4341 src - RExC_emit_start,
4342 dst - RExC_emit_start,
4344 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4345 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4350 place = opnd; /* Op node, where operand used to be. */
4351 if (RExC_offsets) { /* MJD */
4352 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4354 place - RExC_emit_start > RExC_offsets[0]
4355 ? "Overwriting end of array!\n" : "OK",
4356 place - RExC_emit_start,
4357 RExC_parse - RExC_start,
4359 Set_Node_Offset(place, RExC_parse);
4361 src = NEXTOPER(place);
4362 FILL_ADVANCE_NODE(place, op);
4363 Zero(src, offset, regnode);
4367 - regtail - set the next-pointer at the end of a node chain of p to val.
4370 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4372 register regnode *scan;
4373 register regnode *temp;
4378 /* Find last node. */
4381 temp = regnext(scan);
4387 if (reg_off_by_arg[OP(scan)]) {
4388 ARG_SET(scan, val - scan);
4391 NEXT_OFF(scan) = val - scan;
4396 - regoptail - regtail on operand of first argument; nop if operandless
4399 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4401 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4402 if (p == NULL || SIZE_ONLY)
4404 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4405 regtail(pRExC_state, NEXTOPER(p), val);
4407 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4408 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4415 - regcurly - a little FSA that accepts {\d+,?\d*}
4418 S_regcurly(pTHX_ register char *s)
4439 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4441 register U8 op = EXACT; /* Arbitrary non-END op. */
4442 register regnode *next;
4444 while (op != END && (!last || node < last)) {
4445 /* While that wasn't END last time... */
4451 next = regnext(node);
4453 if (OP(node) == OPTIMIZED)
4456 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4457 (int)(2*l + 1), "", SvPVX(sv));
4458 if (next == NULL) /* Next ptr. */
4459 PerlIO_printf(Perl_debug_log, "(0)");
4461 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4462 (void)PerlIO_putc(Perl_debug_log, '\n');
4464 if (PL_regkind[(U8)op] == BRANCHJ) {
4465 register regnode *nnode = (OP(next) == LONGJMP
4468 if (last && nnode > last)
4470 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4472 else if (PL_regkind[(U8)op] == BRANCH) {
4473 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4475 else if ( op == CURLY) { /* `next' might be very big: optimizer */
4476 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4477 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4479 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4480 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4483 else if ( op == PLUS || op == STAR) {
4484 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4486 else if (op == ANYOF) {
4487 /* arglen 1 + class block */
4488 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4489 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4490 node = NEXTOPER(node);
4492 else if (PL_regkind[(U8)op] == EXACT) {
4493 /* Literal string, where present. */
4494 node += NODE_SZ_STR(node) - 1;
4495 node = NEXTOPER(node);
4498 node = NEXTOPER(node);
4499 node += regarglen[(U8)op];
4501 if (op == CURLYX || op == OPEN)
4503 else if (op == WHILEM)
4509 #endif /* DEBUGGING */
4512 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4515 Perl_regdump(pTHX_ regexp *r)
4518 SV *sv = sv_newmortal();
4520 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4522 /* Header fields of interest. */
4523 if (r->anchored_substr)
4524 PerlIO_printf(Perl_debug_log,
4525 "anchored `%s%.*s%s'%s at %"IVdf" ",
4527 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4528 SvPVX(r->anchored_substr),
4530 SvTAIL(r->anchored_substr) ? "$" : "",
4531 (IV)r->anchored_offset);
4532 if (r->float_substr)
4533 PerlIO_printf(Perl_debug_log,
4534 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4536 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4537 SvPVX(r->float_substr),
4539 SvTAIL(r->float_substr) ? "$" : "",
4540 (IV)r->float_min_offset, (UV)r->float_max_offset);
4541 if (r->check_substr)
4542 PerlIO_printf(Perl_debug_log,
4543 r->check_substr == r->float_substr
4544 ? "(checking floating" : "(checking anchored");
4545 if (r->reganch & ROPT_NOSCAN)
4546 PerlIO_printf(Perl_debug_log, " noscan");
4547 if (r->reganch & ROPT_CHECK_ALL)
4548 PerlIO_printf(Perl_debug_log, " isall");
4549 if (r->check_substr)
4550 PerlIO_printf(Perl_debug_log, ") ");
4552 if (r->regstclass) {
4553 regprop(sv, r->regstclass);
4554 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4556 if (r->reganch & ROPT_ANCH) {
4557 PerlIO_printf(Perl_debug_log, "anchored");
4558 if (r->reganch & ROPT_ANCH_BOL)
4559 PerlIO_printf(Perl_debug_log, "(BOL)");
4560 if (r->reganch & ROPT_ANCH_MBOL)
4561 PerlIO_printf(Perl_debug_log, "(MBOL)");
4562 if (r->reganch & ROPT_ANCH_SBOL)
4563 PerlIO_printf(Perl_debug_log, "(SBOL)");
4564 if (r->reganch & ROPT_ANCH_GPOS)
4565 PerlIO_printf(Perl_debug_log, "(GPOS)");
4566 PerlIO_putc(Perl_debug_log, ' ');
4568 if (r->reganch & ROPT_GPOS_SEEN)
4569 PerlIO_printf(Perl_debug_log, "GPOS ");
4570 if (r->reganch & ROPT_SKIP)
4571 PerlIO_printf(Perl_debug_log, "plus ");
4572 if (r->reganch & ROPT_IMPLICIT)
4573 PerlIO_printf(Perl_debug_log, "implicit ");
4574 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4575 if (r->reganch & ROPT_EVAL_SEEN)
4576 PerlIO_printf(Perl_debug_log, "with eval ");
4577 PerlIO_printf(Perl_debug_log, "\n");
4580 U32 len = r->offsets[0];
4581 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4582 for (i = 1; i <= len; i++)
4583 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4584 (UV)r->offsets[i*2-1],
4585 (UV)r->offsets[i*2]);
4586 PerlIO_printf(Perl_debug_log, "\n");
4588 #endif /* DEBUGGING */
4594 S_put_byte(pTHX_ SV *sv, int c)
4596 if (isCNTRL(c) || c == 255 || !isPRINT(c))
4597 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4598 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4599 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4601 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4604 #endif /* DEBUGGING */
4607 - regprop - printable representation of opcode
4610 Perl_regprop(pTHX_ SV *sv, regnode *o)
4615 sv_setpvn(sv, "", 0);
4616 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4617 /* It would be nice to FAIL() here, but this may be called from
4618 regexec.c, and it would be hard to supply pRExC_state. */
4619 Perl_croak(aTHX_ "Corrupted regexp opcode");
4620 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4622 k = PL_regkind[(U8)OP(o)];
4625 SV *dsv = sv_2mortal(newSVpvn("", 0));
4626 /* Using is_utf8_string() is a crude hack but it may
4627 * be the best for now since we have no flag "this EXACTish
4628 * node was UTF-8" --jhi */
4629 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4631 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4632 UNI_DISPLAY_REGEX) :
4637 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4642 else if (k == CURLY) {
4643 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4644 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4645 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4647 else if (k == WHILEM && o->flags) /* Ordinal/of */
4648 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4649 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4650 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4651 else if (k == LOGICAL)
4652 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4653 else if (k == ANYOF) {
4654 int i, rangestart = -1;
4655 U8 flags = ANYOF_FLAGS(o);
4656 const char * const anyofs[] = { /* Should be syncronized with
4657 * ANYOF_ #xdefines in regcomp.h */
4690 if (flags & ANYOF_LOCALE)
4691 sv_catpv(sv, "{loc}");
4692 if (flags & ANYOF_FOLD)
4693 sv_catpv(sv, "{i}");
4694 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4695 if (flags & ANYOF_INVERT)
4697 for (i = 0; i <= 256; i++) {
4698 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4699 if (rangestart == -1)
4701 } else if (rangestart != -1) {
4702 if (i <= rangestart + 3)
4703 for (; rangestart < i; rangestart++)
4704 put_byte(sv, rangestart);
4706 put_byte(sv, rangestart);
4708 put_byte(sv, i - 1);
4714 if (o->flags & ANYOF_CLASS)
4715 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4716 if (ANYOF_CLASS_TEST(o,i))
4717 sv_catpv(sv, anyofs[i]);
4719 if (flags & ANYOF_UNICODE)
4720 sv_catpv(sv, "{unicode}");
4721 else if (flags & ANYOF_UNICODE_ALL)
4722 sv_catpv(sv, "{unicode_all}");
4726 SV *sw = regclass_swash(o, FALSE, &lv, 0);
4731 U8 s[UTF8_MAXLEN+1];
4733 for (i = 0; i <= 256; i++) { /* just the first 256 */
4734 U8 *e = uvchr_to_utf8(s, i);
4736 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4737 if (rangestart == -1)
4739 } else if (rangestart != -1) {
4742 if (i <= rangestart + 3)
4743 for (; rangestart < i; rangestart++) {
4744 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4748 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4751 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4758 sv_catpv(sv, "..."); /* et cetera */
4762 char *s = savepv(SvPVX(lv));
4765 while(*s && *s != '\n') s++;
4786 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4788 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4789 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4790 #endif /* DEBUGGING */
4794 Perl_re_intuit_string(pTHX_ regexp *prog)
4795 { /* Assume that RE_INTUIT is set */
4798 char *s = SvPV(prog->check_substr,n_a);
4800 if (!PL_colorset) reginitcolors();
4801 PerlIO_printf(Perl_debug_log,
4802 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4803 PL_colors[4],PL_colors[5],PL_colors[0],
4806 (strlen(s) > 60 ? "..." : ""));
4809 return prog->check_substr;
4813 Perl_pregfree(pTHX_ struct regexp *r)
4816 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4819 if (!r || (--r->refcnt > 0))
4822 char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60,
4824 int len = SvCUR(dsv);
4827 PerlIO_printf(Perl_debug_log,
4828 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4829 PL_colors[4],PL_colors[5],PL_colors[0],
4832 len > 60 ? "..." : "");
4836 Safefree(r->precomp);
4837 if (r->offsets) /* 20010421 MJD */
4838 Safefree(r->offsets);
4839 if (RX_MATCH_COPIED(r))
4840 Safefree(r->subbeg);
4842 if (r->anchored_substr)
4843 SvREFCNT_dec(r->anchored_substr);
4844 if (r->float_substr)
4845 SvREFCNT_dec(r->float_substr);
4846 Safefree(r->substrs);
4849 int n = r->data->count;
4850 AV* new_comppad = NULL;
4855 /* If you add a ->what type here, update the comment in regcomp.h */
4856 switch (r->data->what[n]) {
4858 SvREFCNT_dec((SV*)r->data->data[n]);
4861 Safefree(r->data->data[n]);
4864 new_comppad = (AV*)r->data->data[n];
4867 if (new_comppad == NULL)
4868 Perl_croak(aTHX_ "panic: pregfree comppad");
4869 old_comppad = PL_comppad;
4870 old_curpad = PL_curpad;
4871 /* Watch out for global destruction's random ordering. */
4872 if (SvTYPE(new_comppad) == SVt_PVAV) {
4873 PL_comppad = new_comppad;
4874 PL_curpad = AvARRAY(new_comppad);
4879 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4880 op_free((OP_4tree*)r->data->data[n]);
4883 PL_comppad = old_comppad;
4884 PL_curpad = old_curpad;
4885 SvREFCNT_dec((SV*)new_comppad);
4891 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4894 Safefree(r->data->what);
4897 Safefree(r->startp);
4903 - regnext - dig the "next" pointer out of a node
4905 * [Note, when REGALIGN is defined there are two places in regmatch()
4906 * that bypass this code for speed.]
4909 Perl_regnext(pTHX_ register regnode *p)
4911 register I32 offset;
4913 if (p == &PL_regdummy)
4916 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4924 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
4927 STRLEN l1 = strlen(pat1);
4928 STRLEN l2 = strlen(pat2);
4937 Copy(pat1, buf, l1 , char);
4938 Copy(pat2, buf + l1, l2 , char);
4939 buf[l1 + l2] = '\n';
4940 buf[l1 + l2 + 1] = '\0';
4942 /* ANSI variant takes additional second argument */
4943 va_start(args, pat2);
4947 msv = vmess(buf, &args);
4949 message = SvPV(msv,l1);
4952 Copy(message, buf, l1 , char);
4953 buf[l1] = '\0'; /* Overwrite \n */
4954 Perl_croak(aTHX_ "%s", buf);
4957 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4960 Perl_save_re_context(pTHX)
4963 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4964 SAVEI32(RExC_npar); /* () count. */
4965 SAVEI32(RExC_size); /* Code size. */
4966 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4967 SAVEVPTR(RExC_rx); /* from regcomp.c */
4968 SAVEI32(RExC_seen); /* from regcomp.c */
4969 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4970 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4971 SAVEVPTR(RExC_emit); /* Code-emit pointer; ®dummy = don't */
4972 SAVEPPTR(RExC_end); /* End of input for compile */
4973 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4976 SAVEI32(PL_reg_flags); /* from regexec.c */
4978 SAVEPPTR(PL_reginput); /* String-input pointer. */
4979 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4980 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
4981 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4982 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4983 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
4984 SAVEPPTR(PL_regtill); /* How far we are required to go. */
4985 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
4986 PL_reg_start_tmp = 0;
4987 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4988 PL_reg_start_tmpl = 0;
4989 SAVEVPTR(PL_regdata);
4990 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4991 SAVEI32(PL_regnarrate); /* from regexec.c */
4992 SAVEVPTR(PL_regprogram); /* from regexec.c */
4993 SAVEINT(PL_regindent); /* from regexec.c */
4994 SAVEVPTR(PL_regcc); /* from regexec.c */
4995 SAVEVPTR(PL_curcop);
4996 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4997 SAVEVPTR(PL_reg_re); /* from regexec.c */
4998 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4999 SAVESPTR(PL_reg_sv); /* from regexec.c */
5000 SAVEI8(PL_reg_match_utf8); /* from regexec.c */
5001 SAVEVPTR(PL_reg_magic); /* from regexec.c */
5002 SAVEI32(PL_reg_oldpos); /* from regexec.c */
5003 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5004 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5005 SAVEI32(PL_regnpar); /* () count. */
5006 SAVEI32(PL_regsize); /* from regexec.c */
5008 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
5013 clear_re(pTHX_ void *r)
5015 ReREFCNT_dec((regexp *)r);