5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_pregcomp my_regcomp
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_pregfree my_regfree
39 # define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 # define Perl_regnext my_regnext
42 # define Perl_save_re_context my_save_re_context
43 # define Perl_reginitcolors my_reginitcolors
45 # define PERL_NO_GET_CONTEXT
50 * pregcomp and pregexec -- regsub and regerror are not used in perl
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (c) 1991-2002, Larry Wall
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
83 #define PERL_IN_REGCOMP_C
86 #ifndef PERL_IN_XSUB_RE
98 # if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
107 #define STATIC static
110 typedef struct RExC_state_t {
111 U16 flags16; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
114 char *start; /* Start of input for compile */
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
118 regnode *emit_start; /* Start of emitted-code area */
119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
130 char *starttry; /* -Dr: where regtry was called. */
131 #define RExC_starttry (pRExC_state->starttry)
135 #define RExC_flags16 (pRExC_state->flags16)
136 #define RExC_precomp (pRExC_state->precomp)
137 #define RExC_rx (pRExC_state->rx)
138 #define RExC_start (pRExC_state->start)
139 #define RExC_end (pRExC_state->end)
140 #define RExC_parse (pRExC_state->parse)
141 #define RExC_whilem_seen (pRExC_state->whilem_seen)
142 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty (pRExC_state->naughty)
146 #define RExC_sawback (pRExC_state->sawback)
147 #define RExC_seen (pRExC_state->seen)
148 #define RExC_size (pRExC_state->size)
149 #define RExC_npar (pRExC_state->npar)
150 #define RExC_extralen (pRExC_state->extralen)
151 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8 (pRExC_state->utf8)
155 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
160 #undef SPSTART /* dratted cpp namespace... */
163 * Flags to be passed up and down.
165 #define WORST 0 /* Worst case. */
166 #define HASWIDTH 0x1 /* Known to match non-null strings. */
167 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART 0x4 /* Starts with * or +. */
169 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
171 /* Length of a variant. */
173 typedef struct scan_data_t {
179 I32 last_end; /* min value, <0 unless valid. */
182 SV **longest; /* Either &l_fixed, or &l_float. */
186 I32 offset_float_min;
187 I32 offset_float_max;
191 struct regnode_charclass_class *start_class;
195 * Forward declarations for pregcomp()'s friends.
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL 0x1
203 #define SF_BEFORE_MEOL 0x2
204 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
208 # define SF_FIX_SHIFT_EOL (0+2)
209 # define SF_FL_SHIFT_EOL (0+4)
211 # define SF_FIX_SHIFT_EOL (+2)
212 # define SF_FL_SHIFT_EOL (+4)
215 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
218 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF 0x40
221 #define SF_HAS_PAR 0x80
222 #define SF_IN_PAR 0x100
223 #define SF_HAS_EVAL 0x200
224 #define SCF_DO_SUBSTR 0x400
225 #define SCF_DO_STCLASS_AND 0x0800
226 #define SCF_DO_STCLASS_OR 0x1000
227 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS 0x2000
230 #define UTF RExC_utf8
231 #define LOC (RExC_flags16 & PMf_LOCALE)
232 #define FOLD (RExC_flags16 & PMf_FOLD)
234 #define OOB_UNICODE 12345678
235 #define OOB_NAMEDCLASS -1
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
249 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
261 char *ellipses = ""; \
262 IV len = RExC_end - RExC_precomp; \
265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
273 msg, (int)len, RExC_precomp, ellipses); \
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
281 #define FAIL2(pat,msg) \
283 char *ellipses = ""; \
284 IV len = RExC_end - RExC_precomp; \
287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
295 msg, (int)len, RExC_precomp, ellipses); \
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
302 #define Simple_vFAIL(m) \
304 IV offset = RExC_parse - RExC_precomp; \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
321 * Like Simple_vFAIL(), but accepts two arguments.
323 #define Simple_vFAIL2(m,a1) \
325 IV offset = RExC_parse - RExC_precomp; \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
328 (int)offset, RExC_precomp, RExC_precomp + offset); \
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
334 #define vFAIL2(m,a1) \
337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
338 Simple_vFAIL2(m, a1); \
343 * Like Simple_vFAIL(), but accepts three arguments.
345 #define Simple_vFAIL3(m, a1, a2) \
347 IV offset = RExC_parse - RExC_precomp; \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
350 (int)offset, RExC_precomp, RExC_precomp + offset); \
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
356 #define vFAIL3(m,a1,a2) \
359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
360 Simple_vFAIL3(m, a1, a2); \
364 * Like Simple_vFAIL(), but accepts four arguments.
366 #define Simple_vFAIL4(m, a1, a2, a3) \
368 IV offset = RExC_parse - RExC_precomp; \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
371 (int)offset, RExC_precomp, RExC_precomp + offset); \
375 * Like Simple_vFAIL(), but accepts five arguments.
377 #define Simple_vFAIL5(m, a1, a2, a3, a4) \
379 IV offset = RExC_parse - RExC_precomp; \
380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
381 (int)offset, RExC_precomp, RExC_precomp + offset); \
385 #define vWARN(loc,m) \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\
389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
392 #define vWARNdep(loc,m) \
394 IV offset = loc - RExC_precomp; \
395 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\
396 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
400 #define vWARN2(loc, m, a1) \
402 IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
405 (int)offset, RExC_precomp, RExC_precomp + offset); \
408 #define vWARN3(loc, m, a1, a2) \
410 IV offset = loc - RExC_precomp; \
411 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
413 (int)offset, RExC_precomp, RExC_precomp + offset); \
416 #define vWARN4(loc, m, a1, a2, a3) \
418 IV offset = loc - RExC_precomp; \
419 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
421 (int)offset, RExC_precomp, RExC_precomp + offset); \
424 /* used for the parse_flags section for (?c) -- japhy */
425 #define vWARN5(loc, m, a1, a2, a3, a4) \
427 IV offset = loc - RExC_precomp; \
428 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
430 (int)offset, RExC_precomp, RExC_precomp + offset); \
434 /* Allow for side effects in s */
435 #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
437 /* Macros for recording node offsets. 20001227 mjd@plover.com
438 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
439 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
440 * Element 0 holds the number n.
443 #define MJD_OFFSET_DEBUG(x)
444 /* #define MJD_OFFSET_DEBUG(x) fprintf x */
447 # define Set_Node_Offset_To_R(node,byte) \
451 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
453 RExC_offsets[2*(node)-1] = (byte); \
458 # define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
459 # define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
461 # define Set_Node_Length_To_R(node,len) \
464 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
466 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
468 RExC_offsets[2*(node)] = (len); \
473 # define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
474 # define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
475 # define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
477 /* Get offsets and lengths */
478 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
479 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
481 static void clear_re(pTHX_ void *r);
483 /* Mark that we cannot extend a found fixed substring at this point.
484 Updata the longest found anchored substring and the longest found
485 floating substrings if needed. */
488 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
490 STRLEN l = CHR_SVLEN(data->last_found);
491 STRLEN old_l = CHR_SVLEN(*data->longest);
493 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
494 sv_setsv(*data->longest, data->last_found);
495 if (*data->longest == data->longest_fixed) {
496 data->offset_fixed = l ? data->last_start_min : data->pos_min;
497 if (data->flags & SF_BEFORE_EOL)
499 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
501 data->flags &= ~SF_FIX_BEFORE_EOL;
504 data->offset_float_min = l ? data->last_start_min : data->pos_min;
505 data->offset_float_max = (l
506 ? data->last_start_max
507 : data->pos_min + data->pos_delta);
508 if (data->flags & SF_BEFORE_EOL)
510 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
512 data->flags &= ~SF_FL_BEFORE_EOL;
515 SvCUR_set(data->last_found, 0);
517 data->flags &= ~SF_BEFORE_EOL;
520 /* Can match anything (initialization) */
522 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 ANYOF_CLASS_ZERO(cl);
525 ANYOF_BITMAP_SETALL(cl);
526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
528 cl->flags |= ANYOF_LOCALE;
531 /* Can match anything (initialization) */
533 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
537 for (value = 0; value <= ANYOF_MAX; value += 2)
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
547 /* Can match anything (initialization) */
549 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
551 Zero(cl, 1, struct regnode_charclass_class);
553 cl_anything(pRExC_state, cl);
557 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
559 Zero(cl, 1, struct regnode_charclass_class);
561 cl_anything(pRExC_state, cl);
563 cl->flags |= ANYOF_LOCALE;
566 /* 'And' a given class with another one. Can create false positives */
567 /* We assume that cl is not inverted */
569 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
590 cl->flags &= ~ANYOF_UNICODE_ALL;
591 cl->flags |= ANYOF_UNICODE;
592 ARG_SET(cl, ARG(and_with));
594 if (!(and_with->flags & ANYOF_UNICODE_ALL))
595 cl->flags &= ~ANYOF_UNICODE_ALL;
596 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
597 cl->flags &= ~ANYOF_UNICODE;
600 /* 'OR' a given class with another one. Can create false positives */
601 /* We assume that cl is not inverted */
603 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
605 if (or_with->flags & ANYOF_INVERT) {
607 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
608 * <= (B1 | !B2) | (CL1 | !CL2)
609 * which is wasteful if CL2 is small, but we ignore CL2:
610 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
611 * XXXX Can we handle case-fold? Unclear:
612 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
613 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
615 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
616 && !(or_with->flags & ANYOF_FOLD)
617 && !(cl->flags & ANYOF_FOLD) ) {
620 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
621 cl->bitmap[i] |= ~or_with->bitmap[i];
622 } /* XXXX: logic is complicated otherwise */
624 cl_anything(pRExC_state, cl);
627 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
628 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
629 && (!(or_with->flags & ANYOF_FOLD)
630 || (cl->flags & ANYOF_FOLD)) ) {
633 /* OR char bitmap and class bitmap separately */
634 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
635 cl->bitmap[i] |= or_with->bitmap[i];
636 if (or_with->flags & ANYOF_CLASS) {
637 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
638 cl->classflags[i] |= or_with->classflags[i];
639 cl->flags |= ANYOF_CLASS;
642 else { /* XXXX: logic is complicated, leave it along for a moment. */
643 cl_anything(pRExC_state, cl);
646 if (or_with->flags & ANYOF_EOS)
647 cl->flags |= ANYOF_EOS;
649 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
650 ARG(cl) != ARG(or_with)) {
651 cl->flags |= ANYOF_UNICODE_ALL;
652 cl->flags &= ~ANYOF_UNICODE;
654 if (or_with->flags & ANYOF_UNICODE_ALL) {
655 cl->flags |= ANYOF_UNICODE_ALL;
656 cl->flags &= ~ANYOF_UNICODE;
661 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
662 * These need to be revisited when a newer toolchain becomes available.
664 #if defined(__sparc64__) && defined(__GNUC__)
665 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
666 # undef SPARC64_GCC_WORKAROUND
667 # define SPARC64_GCC_WORKAROUND 1
671 /* REx optimizer. Converts nodes into quickier variants "in place".
672 Finds fixed substrings. */
674 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
675 to the position after last scanned or to NULL. */
678 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
679 /* scanp: Start here (read-write). */
680 /* deltap: Write maxlen-minlen here. */
681 /* last: Stop before this one. */
683 I32 min = 0, pars = 0, code;
684 regnode *scan = *scanp, *next;
686 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
687 int is_inf_internal = 0; /* The studied chunk is infinite */
688 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
689 scan_data_t data_fake;
690 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
692 while (scan && OP(scan) != END && scan < last) {
693 /* Peephole optimizer: */
695 if (PL_regkind[(U8)OP(scan)] == EXACT) {
696 /* Merge several consecutive EXACTish nodes into one. */
697 regnode *n = regnext(scan);
700 regnode *stop = scan;
703 next = scan + NODE_SZ_STR(scan);
704 /* Skip NOTHING, merge EXACT*. */
706 ( PL_regkind[(U8)OP(n)] == NOTHING ||
707 (stringok && (OP(n) == OP(scan))))
709 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
710 if (OP(n) == TAIL || n > next)
712 if (PL_regkind[(U8)OP(n)] == NOTHING) {
713 NEXT_OFF(scan) += NEXT_OFF(n);
714 next = n + NODE_STEP_REGNODE;
722 int oldl = STR_LEN(scan);
723 regnode *nnext = regnext(n);
725 if (oldl + STR_LEN(n) > U8_MAX)
727 NEXT_OFF(scan) += NEXT_OFF(n);
728 STR_LEN(scan) += STR_LEN(n);
729 next = n + NODE_SZ_STR(n);
730 /* Now we can overwrite *n : */
731 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
739 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
741 Two problematic code points in Unicode casefolding of EXACT nodes:
743 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
744 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
750 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
751 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
753 This means that in case-insensitive matching (or "loose matching",
754 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
755 length of the above casefolded versions) can match a target string
756 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
757 This would rather mess up the minimum length computation.
759 What we'll do is to look for the tail four bytes, and then peek
760 at the preceding two bytes to see whether we need to decrease
761 the minimum length by four (six minus two).
763 Thanks to the design of UTF-8, there cannot be false matches:
764 A sequence of valid UTF-8 bytes cannot be a subsequence of
765 another valid sequence of UTF-8 bytes.
768 char *s0 = STRING(scan), *s, *t;
769 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
770 char *t0 = "\xcc\x88\xcc\x81";
774 s < s2 && (t = ninstr(s, s1, t0, t1));
776 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
777 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
784 n = scan + NODE_SZ_STR(scan);
786 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
794 /* Follow the next-chain of the current node and optimize
795 away all the NOTHINGs from it. */
796 if (OP(scan) != CURLYX) {
797 int max = (reg_off_by_arg[OP(scan)]
799 /* I32 may be smaller than U16 on CRAYs! */
800 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
801 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
805 /* Skip NOTHING and LONGJMP. */
806 while ((n = regnext(n))
807 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
808 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
811 if (reg_off_by_arg[OP(scan)])
814 NEXT_OFF(scan) = off;
816 /* The principal pseudo-switch. Cannot be a switch, since we
817 look into several different things. */
818 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
819 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
820 next = regnext(scan);
823 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
824 I32 max1 = 0, min1 = I32_MAX, num = 0;
825 struct regnode_charclass_class accum;
827 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
828 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
829 if (flags & SCF_DO_STCLASS)
830 cl_init_zero(pRExC_state, &accum);
831 while (OP(scan) == code) {
832 I32 deltanext, minnext, f = 0, fake;
833 struct regnode_charclass_class this_class;
838 data_fake.whilem_c = data->whilem_c;
839 data_fake.last_closep = data->last_closep;
842 data_fake.last_closep = &fake;
843 next = regnext(scan);
844 scan = NEXTOPER(scan);
846 scan = NEXTOPER(scan);
847 if (flags & SCF_DO_STCLASS) {
848 cl_init(pRExC_state, &this_class);
849 data_fake.start_class = &this_class;
850 f = SCF_DO_STCLASS_AND;
852 if (flags & SCF_WHILEM_VISITED_POS)
853 f |= SCF_WHILEM_VISITED_POS;
854 /* we suppose the run is continuous, last=next...*/
855 minnext = study_chunk(pRExC_state, &scan, &deltanext,
856 next, &data_fake, f);
859 if (max1 < minnext + deltanext)
860 max1 = minnext + deltanext;
861 if (deltanext == I32_MAX)
862 is_inf = is_inf_internal = 1;
864 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
866 if (data && (data_fake.flags & SF_HAS_EVAL))
867 data->flags |= SF_HAS_EVAL;
869 data->whilem_c = data_fake.whilem_c;
870 if (flags & SCF_DO_STCLASS)
871 cl_or(pRExC_state, &accum, &this_class);
875 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
877 if (flags & SCF_DO_SUBSTR) {
878 data->pos_min += min1;
879 data->pos_delta += max1 - min1;
880 if (max1 != min1 || is_inf)
881 data->longest = &(data->longest_float);
884 delta += max1 - min1;
885 if (flags & SCF_DO_STCLASS_OR) {
886 cl_or(pRExC_state, data->start_class, &accum);
888 cl_and(data->start_class, &and_with);
889 flags &= ~SCF_DO_STCLASS;
892 else if (flags & SCF_DO_STCLASS_AND) {
894 cl_and(data->start_class, &accum);
895 flags &= ~SCF_DO_STCLASS;
898 /* Switch to OR mode: cache the old value of
899 * data->start_class */
900 StructCopy(data->start_class, &and_with,
901 struct regnode_charclass_class);
902 flags &= ~SCF_DO_STCLASS_AND;
903 StructCopy(&accum, data->start_class,
904 struct regnode_charclass_class);
905 flags |= SCF_DO_STCLASS_OR;
906 data->start_class->flags |= ANYOF_EOS;
910 else if (code == BRANCHJ) /* single branch is optimized. */
911 scan = NEXTOPER(NEXTOPER(scan));
912 else /* single branch is optimized. */
913 scan = NEXTOPER(scan);
916 else if (OP(scan) == EXACT) {
917 I32 l = STR_LEN(scan);
918 UV uc = *((U8*)STRING(scan));
920 U8 *s = (U8*)STRING(scan);
921 l = utf8_length(s, s + l);
922 uc = utf8_to_uvchr(s, NULL);
925 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
926 /* The code below prefers earlier match for fixed
927 offset, later match for variable offset. */
928 if (data->last_end == -1) { /* Update the start info. */
929 data->last_start_min = data->pos_min;
930 data->last_start_max = is_inf
931 ? I32_MAX : data->pos_min + data->pos_delta;
933 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
935 SvUTF8_on(data->last_found);
936 data->last_end = data->pos_min + l;
937 data->pos_min += l; /* As in the first entry. */
938 data->flags &= ~SF_BEFORE_EOL;
940 if (flags & SCF_DO_STCLASS_AND) {
941 /* Check whether it is compatible with what we know already! */
945 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
946 && !ANYOF_BITMAP_TEST(data->start_class, uc)
947 && (!(data->start_class->flags & ANYOF_FOLD)
948 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
951 ANYOF_CLASS_ZERO(data->start_class);
952 ANYOF_BITMAP_ZERO(data->start_class);
954 ANYOF_BITMAP_SET(data->start_class, uc);
955 data->start_class->flags &= ~ANYOF_EOS;
957 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
959 else if (flags & SCF_DO_STCLASS_OR) {
960 /* false positive possible if the class is case-folded */
962 ANYOF_BITMAP_SET(data->start_class, uc);
964 data->start_class->flags |= ANYOF_UNICODE_ALL;
965 data->start_class->flags &= ~ANYOF_EOS;
966 cl_and(data->start_class, &and_with);
968 flags &= ~SCF_DO_STCLASS;
970 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
971 I32 l = STR_LEN(scan);
972 UV uc = *((U8*)STRING(scan));
974 /* Search for fixed substrings supports EXACT only. */
975 if (flags & SCF_DO_SUBSTR)
976 scan_commit(pRExC_state, data);
978 U8 *s = (U8 *)STRING(scan);
979 l = utf8_length(s, s + l);
980 uc = utf8_to_uvchr(s, NULL);
983 if (data && (flags & SCF_DO_SUBSTR))
985 if (flags & SCF_DO_STCLASS_AND) {
986 /* Check whether it is compatible with what we know already! */
990 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
991 && !ANYOF_BITMAP_TEST(data->start_class, uc)
992 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
994 ANYOF_CLASS_ZERO(data->start_class);
995 ANYOF_BITMAP_ZERO(data->start_class);
997 ANYOF_BITMAP_SET(data->start_class, uc);
998 data->start_class->flags &= ~ANYOF_EOS;
999 data->start_class->flags |= ANYOF_FOLD;
1000 if (OP(scan) == EXACTFL)
1001 data->start_class->flags |= ANYOF_LOCALE;
1004 else if (flags & SCF_DO_STCLASS_OR) {
1005 if (data->start_class->flags & ANYOF_FOLD) {
1006 /* false positive possible if the class is case-folded.
1007 Assume that the locale settings are the same... */
1009 ANYOF_BITMAP_SET(data->start_class, uc);
1010 data->start_class->flags &= ~ANYOF_EOS;
1012 cl_and(data->start_class, &and_with);
1014 flags &= ~SCF_DO_STCLASS;
1016 else if (strchr((char*)PL_varies,OP(scan))) {
1017 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1018 I32 f = flags, pos_before = 0;
1019 regnode *oscan = scan;
1020 struct regnode_charclass_class this_class;
1021 struct regnode_charclass_class *oclass = NULL;
1022 I32 next_is_eval = 0;
1024 switch (PL_regkind[(U8)OP(scan)]) {
1025 case WHILEM: /* End of (?:...)* . */
1026 scan = NEXTOPER(scan);
1029 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1030 next = NEXTOPER(scan);
1031 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1033 maxcount = REG_INFTY;
1034 next = regnext(scan);
1035 scan = NEXTOPER(scan);
1039 if (flags & SCF_DO_SUBSTR)
1044 if (flags & SCF_DO_STCLASS) {
1046 maxcount = REG_INFTY;
1047 next = regnext(scan);
1048 scan = NEXTOPER(scan);
1051 is_inf = is_inf_internal = 1;
1052 scan = regnext(scan);
1053 if (flags & SCF_DO_SUBSTR) {
1054 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1055 data->longest = &(data->longest_float);
1057 goto optimize_curly_tail;
1059 mincount = ARG1(scan);
1060 maxcount = ARG2(scan);
1061 next = regnext(scan);
1062 if (OP(scan) == CURLYX) {
1063 I32 lp = (data ? *(data->last_closep) : 0);
1065 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1067 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1068 next_is_eval = (OP(scan) == EVAL);
1070 if (flags & SCF_DO_SUBSTR) {
1071 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1072 pos_before = data->pos_min;
1076 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1078 data->flags |= SF_IS_INF;
1080 if (flags & SCF_DO_STCLASS) {
1081 cl_init(pRExC_state, &this_class);
1082 oclass = data->start_class;
1083 data->start_class = &this_class;
1084 f |= SCF_DO_STCLASS_AND;
1085 f &= ~SCF_DO_STCLASS_OR;
1087 /* These are the cases when once a subexpression
1088 fails at a particular position, it cannot succeed
1089 even after backtracking at the enclosing scope.
1091 XXXX what if minimal match and we are at the
1092 initial run of {n,m}? */
1093 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1094 f &= ~SCF_WHILEM_VISITED_POS;
1096 /* This will finish on WHILEM, setting scan, or on NULL: */
1097 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1099 ? (f & ~SCF_DO_SUBSTR) : f);
1101 if (flags & SCF_DO_STCLASS)
1102 data->start_class = oclass;
1103 if (mincount == 0 || minnext == 0) {
1104 if (flags & SCF_DO_STCLASS_OR) {
1105 cl_or(pRExC_state, data->start_class, &this_class);
1107 else if (flags & SCF_DO_STCLASS_AND) {
1108 /* Switch to OR mode: cache the old value of
1109 * data->start_class */
1110 StructCopy(data->start_class, &and_with,
1111 struct regnode_charclass_class);
1112 flags &= ~SCF_DO_STCLASS_AND;
1113 StructCopy(&this_class, data->start_class,
1114 struct regnode_charclass_class);
1115 flags |= SCF_DO_STCLASS_OR;
1116 data->start_class->flags |= ANYOF_EOS;
1118 } else { /* Non-zero len */
1119 if (flags & SCF_DO_STCLASS_OR) {
1120 cl_or(pRExC_state, data->start_class, &this_class);
1121 cl_and(data->start_class, &and_with);
1123 else if (flags & SCF_DO_STCLASS_AND)
1124 cl_and(data->start_class, &this_class);
1125 flags &= ~SCF_DO_STCLASS;
1127 if (!scan) /* It was not CURLYX, but CURLY. */
1129 if (ckWARN(WARN_REGEXP)
1130 /* ? quantifier ok, except for (?{ ... }) */
1131 && (next_is_eval || !(mincount == 0 && maxcount == 1))
1132 && (minnext == 0) && (deltanext == 0)
1133 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1134 && maxcount <= REG_INFTY/3) /* Complement check for big count */
1137 "Quantifier unexpected on zero-length expression");
1140 min += minnext * mincount;
1141 is_inf_internal |= ((maxcount == REG_INFTY
1142 && (minnext + deltanext) > 0)
1143 || deltanext == I32_MAX);
1144 is_inf |= is_inf_internal;
1145 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1147 /* Try powerful optimization CURLYX => CURLYN. */
1148 if ( OP(oscan) == CURLYX && data
1149 && data->flags & SF_IN_PAR
1150 && !(data->flags & SF_HAS_EVAL)
1151 && !deltanext && minnext == 1 ) {
1152 /* Try to optimize to CURLYN. */
1153 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1154 regnode *nxt1 = nxt;
1161 if (!strchr((char*)PL_simple,OP(nxt))
1162 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1163 && STR_LEN(nxt) == 1))
1169 if (OP(nxt) != CLOSE)
1171 /* Now we know that nxt2 is the only contents: */
1172 oscan->flags = ARG(nxt);
1174 OP(nxt1) = NOTHING; /* was OPEN. */
1176 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1177 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1178 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1179 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1180 OP(nxt + 1) = OPTIMIZED; /* was count. */
1181 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1186 /* Try optimization CURLYX => CURLYM. */
1187 if ( OP(oscan) == CURLYX && data
1188 && !(data->flags & SF_HAS_PAR)
1189 && !(data->flags & SF_HAS_EVAL)
1191 /* XXXX How to optimize if data == 0? */
1192 /* Optimize to a simpler form. */
1193 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1197 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1198 && (OP(nxt2) != WHILEM))
1200 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1201 /* Need to optimize away parenths. */
1202 if (data->flags & SF_IN_PAR) {
1203 /* Set the parenth number. */
1204 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1206 if (OP(nxt) != CLOSE)
1207 FAIL("Panic opt close");
1208 oscan->flags = ARG(nxt);
1209 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1210 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1212 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1213 OP(nxt + 1) = OPTIMIZED; /* was count. */
1214 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1215 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1218 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1219 regnode *nnxt = regnext(nxt1);
1222 if (reg_off_by_arg[OP(nxt1)])
1223 ARG_SET(nxt1, nxt2 - nxt1);
1224 else if (nxt2 - nxt1 < U16_MAX)
1225 NEXT_OFF(nxt1) = nxt2 - nxt1;
1227 OP(nxt) = NOTHING; /* Cannot beautify */
1232 /* Optimize again: */
1233 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1239 else if ((OP(oscan) == CURLYX)
1240 && (flags & SCF_WHILEM_VISITED_POS)
1241 /* See the comment on a similar expression above.
1242 However, this time it not a subexpression
1243 we care about, but the expression itself. */
1244 && (maxcount == REG_INFTY)
1245 && data && ++data->whilem_c < 16) {
1246 /* This stays as CURLYX, we can put the count/of pair. */
1247 /* Find WHILEM (as in regexec.c) */
1248 regnode *nxt = oscan + NEXT_OFF(oscan);
1250 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1252 PREVOPER(nxt)->flags = data->whilem_c
1253 | (RExC_whilem_seen << 4); /* On WHILEM */
1255 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1257 if (flags & SCF_DO_SUBSTR) {
1258 SV *last_str = Nullsv;
1259 int counted = mincount != 0;
1261 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1262 #if defined(SPARC64_GCC_WORKAROUND)
1268 if (pos_before >= data->last_start_min)
1271 b = data->last_start_min;
1274 s = SvPV(data->last_found, l);
1275 old = b - data->last_start_min;
1278 I32 b = pos_before >= data->last_start_min
1279 ? pos_before : data->last_start_min;
1281 char *s = SvPV(data->last_found, l);
1282 I32 old = b - data->last_start_min;
1286 old = utf8_hop((U8*)s, old) - (U8*)s;
1289 /* Get the added string: */
1290 last_str = newSVpvn(s + old, l);
1291 if (deltanext == 0 && pos_before == b) {
1292 /* What was added is a constant string */
1294 SvGROW(last_str, (mincount * l) + 1);
1295 repeatcpy(SvPVX(last_str) + l,
1296 SvPVX(last_str), l, mincount - 1);
1297 SvCUR(last_str) *= mincount;
1298 /* Add additional parts. */
1299 SvCUR_set(data->last_found,
1300 SvCUR(data->last_found) - l);
1301 sv_catsv(data->last_found, last_str);
1302 data->last_end += l * (mincount - 1);
1305 /* start offset must point into the last copy */
1306 data->last_start_min += minnext * (mincount - 1);
1307 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1308 * (minnext + data->pos_delta);
1311 /* It is counted once already... */
1312 data->pos_min += minnext * (mincount - counted);
1313 data->pos_delta += - counted * deltanext +
1314 (minnext + deltanext) * maxcount - minnext * mincount;
1315 if (mincount != maxcount) {
1316 /* Cannot extend fixed substrings found inside
1318 scan_commit(pRExC_state,data);
1319 if (mincount && last_str) {
1320 sv_setsv(data->last_found, last_str);
1321 data->last_end = data->pos_min;
1322 data->last_start_min =
1323 data->pos_min - CHR_SVLEN(last_str);
1324 data->last_start_max = is_inf
1326 : data->pos_min + data->pos_delta
1327 - CHR_SVLEN(last_str);
1329 data->longest = &(data->longest_float);
1331 SvREFCNT_dec(last_str);
1333 if (data && (fl & SF_HAS_EVAL))
1334 data->flags |= SF_HAS_EVAL;
1335 optimize_curly_tail:
1336 if (OP(oscan) != CURLYX) {
1337 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1339 NEXT_OFF(oscan) += NEXT_OFF(next);
1342 default: /* REF and CLUMP only? */
1343 if (flags & SCF_DO_SUBSTR) {
1344 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1345 data->longest = &(data->longest_float);
1347 is_inf = is_inf_internal = 1;
1348 if (flags & SCF_DO_STCLASS_OR)
1349 cl_anything(pRExC_state, data->start_class);
1350 flags &= ~SCF_DO_STCLASS;
1354 else if (strchr((char*)PL_simple,OP(scan))) {
1357 if (flags & SCF_DO_SUBSTR) {
1358 scan_commit(pRExC_state,data);
1362 if (flags & SCF_DO_STCLASS) {
1363 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1365 /* Some of the logic below assumes that switching
1366 locale on will only add false positives. */
1367 switch (PL_regkind[(U8)OP(scan)]) {
1371 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1372 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1373 cl_anything(pRExC_state, data->start_class);
1376 if (OP(scan) == SANY)
1378 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1379 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1380 || (data->start_class->flags & ANYOF_CLASS));
1381 cl_anything(pRExC_state, data->start_class);
1383 if (flags & SCF_DO_STCLASS_AND || !value)
1384 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1387 if (flags & SCF_DO_STCLASS_AND)
1388 cl_and(data->start_class,
1389 (struct regnode_charclass_class*)scan);
1391 cl_or(pRExC_state, data->start_class,
1392 (struct regnode_charclass_class*)scan);
1395 if (flags & SCF_DO_STCLASS_AND) {
1396 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1397 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1398 for (value = 0; value < 256; value++)
1399 if (!isALNUM(value))
1400 ANYOF_BITMAP_CLEAR(data->start_class, value);
1404 if (data->start_class->flags & ANYOF_LOCALE)
1405 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1407 for (value = 0; value < 256; value++)
1409 ANYOF_BITMAP_SET(data->start_class, value);
1414 if (flags & SCF_DO_STCLASS_AND) {
1415 if (data->start_class->flags & ANYOF_LOCALE)
1416 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1419 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1420 data->start_class->flags |= ANYOF_LOCALE;
1424 if (flags & SCF_DO_STCLASS_AND) {
1425 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1426 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1427 for (value = 0; value < 256; value++)
1429 ANYOF_BITMAP_CLEAR(data->start_class, value);
1433 if (data->start_class->flags & ANYOF_LOCALE)
1434 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1436 for (value = 0; value < 256; value++)
1437 if (!isALNUM(value))
1438 ANYOF_BITMAP_SET(data->start_class, value);
1443 if (flags & SCF_DO_STCLASS_AND) {
1444 if (data->start_class->flags & ANYOF_LOCALE)
1445 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1448 data->start_class->flags |= ANYOF_LOCALE;
1449 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1453 if (flags & SCF_DO_STCLASS_AND) {
1454 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1455 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1456 for (value = 0; value < 256; value++)
1457 if (!isSPACE(value))
1458 ANYOF_BITMAP_CLEAR(data->start_class, value);
1462 if (data->start_class->flags & ANYOF_LOCALE)
1463 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1465 for (value = 0; value < 256; value++)
1467 ANYOF_BITMAP_SET(data->start_class, value);
1472 if (flags & SCF_DO_STCLASS_AND) {
1473 if (data->start_class->flags & ANYOF_LOCALE)
1474 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1477 data->start_class->flags |= ANYOF_LOCALE;
1478 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1482 if (flags & SCF_DO_STCLASS_AND) {
1483 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1484 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1485 for (value = 0; value < 256; value++)
1487 ANYOF_BITMAP_CLEAR(data->start_class, value);
1491 if (data->start_class->flags & ANYOF_LOCALE)
1492 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1494 for (value = 0; value < 256; value++)
1495 if (!isSPACE(value))
1496 ANYOF_BITMAP_SET(data->start_class, value);
1501 if (flags & SCF_DO_STCLASS_AND) {
1502 if (data->start_class->flags & ANYOF_LOCALE) {
1503 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1504 for (value = 0; value < 256; value++)
1505 if (!isSPACE(value))
1506 ANYOF_BITMAP_CLEAR(data->start_class, value);
1510 data->start_class->flags |= ANYOF_LOCALE;
1511 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1515 if (flags & SCF_DO_STCLASS_AND) {
1516 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1517 for (value = 0; value < 256; value++)
1518 if (!isDIGIT(value))
1519 ANYOF_BITMAP_CLEAR(data->start_class, value);
1522 if (data->start_class->flags & ANYOF_LOCALE)
1523 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1525 for (value = 0; value < 256; value++)
1527 ANYOF_BITMAP_SET(data->start_class, value);
1532 if (flags & SCF_DO_STCLASS_AND) {
1533 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1534 for (value = 0; value < 256; value++)
1536 ANYOF_BITMAP_CLEAR(data->start_class, value);
1539 if (data->start_class->flags & ANYOF_LOCALE)
1540 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1542 for (value = 0; value < 256; value++)
1543 if (!isDIGIT(value))
1544 ANYOF_BITMAP_SET(data->start_class, value);
1549 if (flags & SCF_DO_STCLASS_OR)
1550 cl_and(data->start_class, &and_with);
1551 flags &= ~SCF_DO_STCLASS;
1554 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1555 data->flags |= (OP(scan) == MEOL
1559 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1560 /* Lookbehind, or need to calculate parens/evals/stclass: */
1561 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1562 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1563 /* Lookahead/lookbehind */
1564 I32 deltanext, minnext, fake = 0;
1566 struct regnode_charclass_class intrnl;
1569 data_fake.flags = 0;
1571 data_fake.whilem_c = data->whilem_c;
1572 data_fake.last_closep = data->last_closep;
1575 data_fake.last_closep = &fake;
1576 if ( flags & SCF_DO_STCLASS && !scan->flags
1577 && OP(scan) == IFMATCH ) { /* Lookahead */
1578 cl_init(pRExC_state, &intrnl);
1579 data_fake.start_class = &intrnl;
1580 f |= SCF_DO_STCLASS_AND;
1582 if (flags & SCF_WHILEM_VISITED_POS)
1583 f |= SCF_WHILEM_VISITED_POS;
1584 next = regnext(scan);
1585 nscan = NEXTOPER(NEXTOPER(scan));
1586 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1589 vFAIL("Variable length lookbehind not implemented");
1591 else if (minnext > U8_MAX) {
1592 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1594 scan->flags = minnext;
1596 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1598 if (data && (data_fake.flags & SF_HAS_EVAL))
1599 data->flags |= SF_HAS_EVAL;
1601 data->whilem_c = data_fake.whilem_c;
1602 if (f & SCF_DO_STCLASS_AND) {
1603 int was = (data->start_class->flags & ANYOF_EOS);
1605 cl_and(data->start_class, &intrnl);
1607 data->start_class->flags |= ANYOF_EOS;
1610 else if (OP(scan) == OPEN) {
1613 else if (OP(scan) == CLOSE) {
1614 if (ARG(scan) == is_par) {
1615 next = regnext(scan);
1617 if ( next && (OP(next) != WHILEM) && next < last)
1618 is_par = 0; /* Disable optimization */
1621 *(data->last_closep) = ARG(scan);
1623 else if (OP(scan) == EVAL) {
1625 data->flags |= SF_HAS_EVAL;
1627 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1628 if (flags & SCF_DO_SUBSTR) {
1629 scan_commit(pRExC_state,data);
1630 data->longest = &(data->longest_float);
1632 is_inf = is_inf_internal = 1;
1633 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1634 cl_anything(pRExC_state, data->start_class);
1635 flags &= ~SCF_DO_STCLASS;
1637 /* Else: zero-length, ignore. */
1638 scan = regnext(scan);
1643 *deltap = is_inf_internal ? I32_MAX : delta;
1644 if (flags & SCF_DO_SUBSTR && is_inf)
1645 data->pos_delta = I32_MAX - data->pos_min;
1646 if (is_par > U8_MAX)
1648 if (is_par && pars==1 && data) {
1649 data->flags |= SF_IN_PAR;
1650 data->flags &= ~SF_HAS_PAR;
1652 else if (pars && data) {
1653 data->flags |= SF_HAS_PAR;
1654 data->flags &= ~SF_IN_PAR;
1656 if (flags & SCF_DO_STCLASS_OR)
1657 cl_and(data->start_class, &and_with);
1662 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1664 if (RExC_rx->data) {
1665 Renewc(RExC_rx->data,
1666 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1667 char, struct reg_data);
1668 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1669 RExC_rx->data->count += n;
1672 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1673 char, struct reg_data);
1674 New(1208, RExC_rx->data->what, n, U8);
1675 RExC_rx->data->count = n;
1677 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1678 return RExC_rx->data->count - n;
1682 Perl_reginitcolors(pTHX)
1685 char *s = PerlEnv_getenv("PERL_RE_COLORS");
1688 PL_colors[0] = s = savepv(s);
1690 s = strchr(s, '\t');
1696 PL_colors[i] = s = "";
1700 PL_colors[i++] = "";
1707 - pregcomp - compile a regular expression into internal code
1709 * We can't allocate space until we know how big the compiled form will be,
1710 * but we can't compile it (and thus know how big it is) until we've got a
1711 * place to put the code. So we cheat: we compile it twice, once with code
1712 * generation turned off and size counting turned on, and once "for real".
1713 * This also means that we don't allocate space until we are sure that the
1714 * thing really will compile successfully, and we never have to move the
1715 * code and thus invalidate pointers into it. (Note that it has to be in
1716 * one piece because free() must be able to free it all.) [NB: not true in perl]
1718 * Beware that the optimization-preparation code in here knows about some
1719 * of the structure of the compiled regexp. [I'll say.]
1722 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1732 RExC_state_t RExC_state;
1733 RExC_state_t *pRExC_state = &RExC_state;
1736 FAIL("NULL regexp argument");
1738 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1742 if (!PL_colorset) reginitcolors();
1743 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1744 PL_colors[4],PL_colors[5],PL_colors[0],
1745 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1747 RExC_flags16 = pm->op_pmflags;
1751 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1752 RExC_seen_evals = 0;
1755 /* First pass: determine size, legality. */
1762 RExC_emit = &PL_regdummy;
1763 RExC_whilem_seen = 0;
1764 #if 0 /* REGC() is (currently) a NOP at the first pass.
1765 * Clever compilers notice this and complain. --jhi */
1766 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1768 if (reg(pRExC_state, 0, &flags) == NULL) {
1769 RExC_precomp = Nullch;
1772 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1774 /* Small enough for pointer-storage convention?
1775 If extralen==0, this means that we will not need long jumps. */
1776 if (RExC_size >= 0x10000L && RExC_extralen)
1777 RExC_size += RExC_extralen;
1780 if (RExC_whilem_seen > 15)
1781 RExC_whilem_seen = 15;
1783 /* Allocate space and initialize. */
1784 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1787 FAIL("Regexp out of space");
1790 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1791 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1794 r->prelen = xend - exp;
1795 r->precomp = savepvn(RExC_precomp, r->prelen);
1797 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1798 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1800 r->substrs = 0; /* Useful during FAIL. */
1801 r->startp = 0; /* Useful during FAIL. */
1802 r->endp = 0; /* Useful during FAIL. */
1804 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1806 r->offsets[0] = RExC_size;
1808 DEBUG_r(PerlIO_printf(Perl_debug_log,
1809 "%s %"UVuf" bytes for offset annotations.\n",
1810 r->offsets ? "Got" : "Couldn't get",
1811 (UV)((2*RExC_size+1) * sizeof(U32))));
1815 /* Second pass: emit code. */
1816 RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
1821 RExC_emit_start = r->program;
1822 RExC_emit = r->program;
1823 /* Store the count of eval-groups for security checks: */
1824 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1825 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1827 if (reg(pRExC_state, 0, &flags) == NULL)
1830 /* Dig out information for optimizations. */
1831 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1832 pm->op_pmflags = RExC_flags16;
1834 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1835 r->regstclass = NULL;
1836 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1837 r->reganch |= ROPT_NAUGHTY;
1838 scan = r->program + 1; /* First BRANCH. */
1840 /* XXXX To minimize changes to RE engine we always allocate
1841 3-units-long substrs field. */
1842 Newz(1004, r->substrs, 1, struct reg_substr_data);
1844 StructCopy(&zero_scan_data, &data, scan_data_t);
1845 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1846 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1848 STRLEN longest_float_length, longest_fixed_length;
1849 struct regnode_charclass_class ch_class;
1854 /* Skip introductions and multiplicators >= 1. */
1855 while ((OP(first) == OPEN && (sawopen = 1)) ||
1856 /* An OR of *one* alternative - should not happen now. */
1857 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1858 (OP(first) == PLUS) ||
1859 (OP(first) == MINMOD) ||
1860 /* An {n,m} with n>0 */
1861 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1862 if (OP(first) == PLUS)
1865 first += regarglen[(U8)OP(first)];
1866 first = NEXTOPER(first);
1869 /* Starting-point info. */
1871 if (PL_regkind[(U8)OP(first)] == EXACT) {
1872 if (OP(first) == EXACT)
1873 ; /* Empty, get anchored substr later. */
1874 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1875 r->regstclass = first;
1877 else if (strchr((char*)PL_simple,OP(first)))
1878 r->regstclass = first;
1879 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1880 PL_regkind[(U8)OP(first)] == NBOUND)
1881 r->regstclass = first;
1882 else if (PL_regkind[(U8)OP(first)] == BOL) {
1883 r->reganch |= (OP(first) == MBOL
1885 : (OP(first) == SBOL
1888 first = NEXTOPER(first);
1891 else if (OP(first) == GPOS) {
1892 r->reganch |= ROPT_ANCH_GPOS;
1893 first = NEXTOPER(first);
1896 else if (!sawopen && (OP(first) == STAR &&
1897 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1898 !(r->reganch & ROPT_ANCH) )
1900 /* turn .* into ^.* with an implied $*=1 */
1901 int type = OP(NEXTOPER(first));
1903 if (type == REG_ANY)
1904 type = ROPT_ANCH_MBOL;
1906 type = ROPT_ANCH_SBOL;
1908 r->reganch |= type | ROPT_IMPLICIT;
1909 first = NEXTOPER(first);
1912 if (sawplus && (!sawopen || !RExC_sawback)
1913 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1914 /* x+ must match at the 1st pos of run of x's */
1915 r->reganch |= ROPT_SKIP;
1917 /* Scan is after the zeroth branch, first is atomic matcher. */
1918 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1919 (IV)(first - scan + 1)));
1921 * If there's something expensive in the r.e., find the
1922 * longest literal string that must appear and make it the
1923 * regmust. Resolve ties in favor of later strings, since
1924 * the regstart check works with the beginning of the r.e.
1925 * and avoiding duplication strengthens checking. Not a
1926 * strong reason, but sufficient in the absence of others.
1927 * [Now we resolve ties in favor of the earlier string if
1928 * it happens that c_offset_min has been invalidated, since the
1929 * earlier string may buy us something the later one won't.]
1933 data.longest_fixed = newSVpvn("",0);
1934 data.longest_float = newSVpvn("",0);
1935 data.last_found = newSVpvn("",0);
1936 data.longest = &(data.longest_fixed);
1938 if (!r->regstclass) {
1939 cl_init(pRExC_state, &ch_class);
1940 data.start_class = &ch_class;
1941 stclass_flag = SCF_DO_STCLASS_AND;
1942 } else /* XXXX Check for BOUND? */
1944 data.last_closep = &last_close;
1946 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1947 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1948 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1949 && data.last_start_min == 0 && data.last_end > 0
1950 && !RExC_seen_zerolen
1951 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1952 r->reganch |= ROPT_CHECK_ALL;
1953 scan_commit(pRExC_state, &data);
1954 SvREFCNT_dec(data.last_found);
1956 longest_float_length = CHR_SVLEN(data.longest_float);
1957 if (longest_float_length
1958 || (data.flags & SF_FL_BEFORE_EOL
1959 && (!(data.flags & SF_FL_BEFORE_MEOL)
1960 || (RExC_flags16 & PMf_MULTILINE)))) {
1963 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1964 && data.offset_fixed == data.offset_float_min
1965 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1966 goto remove_float; /* As in (a)+. */
1968 if (SvUTF8(data.longest_float)) {
1969 r->float_utf8 = data.longest_float;
1970 r->float_substr = Nullsv;
1972 r->float_substr = data.longest_float;
1973 r->float_utf8 = Nullsv;
1975 r->float_min_offset = data.offset_float_min;
1976 r->float_max_offset = data.offset_float_max;
1977 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1978 && (!(data.flags & SF_FL_BEFORE_MEOL)
1979 || (RExC_flags16 & PMf_MULTILINE)));
1980 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1984 r->float_substr = r->float_utf8 = Nullsv;
1985 SvREFCNT_dec(data.longest_float);
1986 longest_float_length = 0;
1989 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1990 if (longest_fixed_length
1991 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1992 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1993 || (RExC_flags16 & PMf_MULTILINE)))) {
1996 if (SvUTF8(data.longest_fixed)) {
1997 r->anchored_utf8 = data.longest_fixed;
1998 r->anchored_substr = Nullsv;
2000 r->anchored_substr = data.longest_fixed;
2001 r->anchored_utf8 = Nullsv;
2003 r->anchored_offset = data.offset_fixed;
2004 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2005 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2006 || (RExC_flags16 & PMf_MULTILINE)));
2007 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2010 r->anchored_substr = r->anchored_utf8 = Nullsv;
2011 SvREFCNT_dec(data.longest_fixed);
2012 longest_fixed_length = 0;
2015 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2016 r->regstclass = NULL;
2017 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2019 && !(data.start_class->flags & ANYOF_EOS)
2020 && !cl_is_anything(data.start_class)) {
2021 I32 n = add_data(pRExC_state, 1, "f");
2023 New(1006, RExC_rx->data->data[n], 1,
2024 struct regnode_charclass_class);
2025 StructCopy(data.start_class,
2026 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2027 struct regnode_charclass_class);
2028 r->regstclass = (regnode*)RExC_rx->data->data[n];
2029 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2030 PL_regdata = r->data; /* for regprop() */
2031 DEBUG_r({ SV *sv = sv_newmortal();
2032 regprop(sv, (regnode*)data.start_class);
2033 PerlIO_printf(Perl_debug_log,
2034 "synthetic stclass `%s'.\n",
2038 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2039 if (longest_fixed_length > longest_float_length) {
2040 r->check_substr = r->anchored_substr;
2041 r->check_utf8 = r->anchored_utf8;
2042 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2043 if (r->reganch & ROPT_ANCH_SINGLE)
2044 r->reganch |= ROPT_NOSCAN;
2047 r->check_substr = r->float_substr;
2048 r->check_utf8 = r->float_utf8;
2049 r->check_offset_min = data.offset_float_min;
2050 r->check_offset_max = data.offset_float_max;
2052 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2053 This should be changed ASAP! */
2054 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2055 r->reganch |= RE_USE_INTUIT;
2056 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2057 r->reganch |= RE_INTUIT_TAIL;
2061 /* Several toplevels. Best we can is to set minlen. */
2063 struct regnode_charclass_class ch_class;
2066 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2067 scan = r->program + 1;
2068 cl_init(pRExC_state, &ch_class);
2069 data.start_class = &ch_class;
2070 data.last_closep = &last_close;
2071 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2072 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2073 = r->float_substr = r->float_utf8 = Nullsv;
2074 if (!(data.start_class->flags & ANYOF_EOS)
2075 && !cl_is_anything(data.start_class)) {
2076 I32 n = add_data(pRExC_state, 1, "f");
2078 New(1006, RExC_rx->data->data[n], 1,
2079 struct regnode_charclass_class);
2080 StructCopy(data.start_class,
2081 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2082 struct regnode_charclass_class);
2083 r->regstclass = (regnode*)RExC_rx->data->data[n];
2084 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2085 DEBUG_r({ SV* sv = sv_newmortal();
2086 regprop(sv, (regnode*)data.start_class);
2087 PerlIO_printf(Perl_debug_log,
2088 "synthetic stclass `%s'.\n",
2094 if (RExC_seen & REG_SEEN_GPOS)
2095 r->reganch |= ROPT_GPOS_SEEN;
2096 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2097 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2098 if (RExC_seen & REG_SEEN_EVAL)
2099 r->reganch |= ROPT_EVAL_SEEN;
2100 if (RExC_seen & REG_SEEN_CANY)
2101 r->reganch |= ROPT_CANY_SEEN;
2102 Newz(1002, r->startp, RExC_npar, I32);
2103 Newz(1002, r->endp, RExC_npar, I32);
2104 PL_regdata = r->data; /* for regprop() */
2105 DEBUG_r(regdump(r));
2110 - reg - regular expression, i.e. main body or parenthesized thing
2112 * Caller must absorb opening parenthesis.
2114 * Combining parenthesis handling with the base level of regular expression
2115 * is a trifle forced, but the need to tie the tails of the branches to what
2116 * follows makes it hard to avoid.
2119 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2120 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2122 register regnode *ret; /* Will be the head of the group. */
2123 register regnode *br;
2124 register regnode *lastbr;
2125 register regnode *ender = 0;
2126 register I32 parno = 0;
2127 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
2129 /* for (?g), (?gc), and (?o) warnings; warning
2130 about (?c) will warn about (?g) -- japhy */
2132 I32 wastedflags = 0x00,
2135 wasted_gc = 0x02 | 0x04,
2138 char * parse_start = RExC_parse; /* MJD */
2139 char *oregcomp_parse = RExC_parse;
2142 *flagp = 0; /* Tentatively. */
2145 /* Make an OPEN node, if parenthesized. */
2147 if (*RExC_parse == '?') { /* (?...) */
2148 U16 posflags = 0, negflags = 0;
2149 U16 *flagsp = &posflags;
2151 char *seqstart = RExC_parse;
2154 paren = *RExC_parse++;
2155 ret = NULL; /* For look-ahead/behind. */
2157 case '<': /* (?<...) */
2158 RExC_seen |= REG_SEEN_LOOKBEHIND;
2159 if (*RExC_parse == '!')
2161 if (*RExC_parse != '=' && *RExC_parse != '!')
2164 case '=': /* (?=...) */
2165 case '!': /* (?!...) */
2166 RExC_seen_zerolen++;
2167 case ':': /* (?:...) */
2168 case '>': /* (?>...) */
2170 case '$': /* (?$...) */
2171 case '@': /* (?@...) */
2172 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2174 case '#': /* (?#...) */
2175 while (*RExC_parse && *RExC_parse != ')')
2177 if (*RExC_parse != ')')
2178 FAIL("Sequence (?#... not terminated");
2179 nextchar(pRExC_state);
2182 case 'p': /* (?p...) */
2183 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2184 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2186 case '?': /* (??...) */
2188 if (*RExC_parse != '{')
2190 paren = *RExC_parse++;
2192 case '{': /* (?{...}) */
2194 I32 count = 1, n = 0;
2196 char *s = RExC_parse;
2198 OP_4tree *sop, *rop;
2200 RExC_seen_zerolen++;
2201 RExC_seen |= REG_SEEN_EVAL;
2202 while (count && (c = *RExC_parse)) {
2203 if (c == '\\' && RExC_parse[1])
2211 if (*RExC_parse != ')')
2214 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2219 if (RExC_parse - 1 - s)
2220 sv = newSVpvn(s, RExC_parse - 1 - s);
2222 sv = newSVpvn("", 0);
2225 Perl_save_re_context(aTHX);
2226 rop = sv_compile_2op(sv, &sop, "re", &av);
2227 sop->op_private |= OPpREFCOUNTED;
2228 /* re_dup will OpREFCNT_inc */
2229 OpREFCNT_set(sop, 1);
2232 n = add_data(pRExC_state, 3, "nop");
2233 RExC_rx->data->data[n] = (void*)rop;
2234 RExC_rx->data->data[n+1] = (void*)sop;
2235 RExC_rx->data->data[n+2] = (void*)av;
2238 else { /* First pass */
2239 if (PL_reginterp_cnt < ++RExC_seen_evals
2240 && PL_curcop != &PL_compiling)
2241 /* No compiled RE interpolated, has runtime
2242 components ===> unsafe. */
2243 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2244 if (PL_tainting && PL_tainted)
2245 FAIL("Eval-group in insecure regular expression");
2248 nextchar(pRExC_state);
2250 ret = reg_node(pRExC_state, LOGICAL);
2253 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2254 /* deal with the length of this later - MJD */
2257 return reganode(pRExC_state, EVAL, n);
2259 case '(': /* (?(?{...})...) and (?(?=...)...) */
2261 if (RExC_parse[0] == '?') { /* (?(?...)) */
2262 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2263 || RExC_parse[1] == '<'
2264 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2267 ret = reg_node(pRExC_state, LOGICAL);
2270 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2274 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2276 parno = atoi(RExC_parse++);
2278 while (isDIGIT(*RExC_parse))
2280 ret = reganode(pRExC_state, GROUPP, parno);
2282 if ((c = *nextchar(pRExC_state)) != ')')
2283 vFAIL("Switch condition not recognized");
2285 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2286 br = regbranch(pRExC_state, &flags, 1);
2288 br = reganode(pRExC_state, LONGJMP, 0);
2290 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2291 c = *nextchar(pRExC_state);
2295 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2296 regbranch(pRExC_state, &flags, 1);
2297 regtail(pRExC_state, ret, lastbr);
2300 c = *nextchar(pRExC_state);
2305 vFAIL("Switch (?(condition)... contains too many branches");
2306 ender = reg_node(pRExC_state, TAIL);
2307 regtail(pRExC_state, br, ender);
2309 regtail(pRExC_state, lastbr, ender);
2310 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2313 regtail(pRExC_state, ret, ender);
2317 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2321 RExC_parse--; /* for vFAIL to print correctly */
2322 vFAIL("Sequence (? incomplete");
2326 parse_flags: /* (?i) */
2327 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2328 /* (?g), (?gc) and (?o) are useless here
2329 and must be globally applied -- japhy */
2331 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2332 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2333 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2334 if (! (wastedflags & wflagbit) ) {
2335 wastedflags |= wflagbit;
2338 "Useless (%s%c) - %suse /%c modifier",
2339 flagsp == &negflags ? "?-" : "?",
2341 flagsp == &negflags ? "don't " : "",
2347 else if (*RExC_parse == 'c') {
2348 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2349 if (! (wastedflags & wasted_c) ) {
2350 wastedflags |= wasted_gc;
2353 "Useless (%sc) - %suse /gc modifier",
2354 flagsp == &negflags ? "?-" : "?",
2355 flagsp == &negflags ? "don't " : ""
2360 else { pmflag(flagsp, *RExC_parse); }
2364 if (*RExC_parse == '-') {
2366 wastedflags = 0; /* reset so (?g-c) warns twice */
2370 RExC_flags16 |= posflags;
2371 RExC_flags16 &= ~negflags;
2372 if (*RExC_parse == ':') {
2378 if (*RExC_parse != ')') {
2380 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2382 nextchar(pRExC_state);
2390 ret = reganode(pRExC_state, OPEN, parno);
2391 Set_Node_Length(ret, 1); /* MJD */
2392 Set_Node_Offset(ret, RExC_parse); /* MJD */
2399 /* Pick up the branches, linking them together. */
2400 parse_start = RExC_parse; /* MJD */
2401 br = regbranch(pRExC_state, &flags, 1);
2402 /* branch_len = (paren != 0); */
2406 if (*RExC_parse == '|') {
2407 if (!SIZE_ONLY && RExC_extralen) {
2408 reginsert(pRExC_state, BRANCHJ, br);
2411 reginsert(pRExC_state, BRANCH, br);
2412 Set_Node_Length(br, paren != 0);
2413 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2417 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2419 else if (paren == ':') {
2420 *flagp |= flags&SIMPLE;
2422 if (open) { /* Starts with OPEN. */
2423 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2425 else if (paren != '?') /* Not Conditional */
2427 *flagp |= flags & (SPSTART | HASWIDTH);
2429 while (*RExC_parse == '|') {
2430 if (!SIZE_ONLY && RExC_extralen) {
2431 ender = reganode(pRExC_state, LONGJMP,0);
2432 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2435 RExC_extralen += 2; /* Account for LONGJMP. */
2436 nextchar(pRExC_state);
2437 br = regbranch(pRExC_state, &flags, 0);
2441 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2445 *flagp |= flags&SPSTART;
2448 if (have_branch || paren != ':') {
2449 /* Make a closing node, and hook it on the end. */
2452 ender = reg_node(pRExC_state, TAIL);
2455 ender = reganode(pRExC_state, CLOSE, parno);
2456 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2457 Set_Node_Length(ender,1); /* MJD */
2463 *flagp &= ~HASWIDTH;
2466 ender = reg_node(pRExC_state, SUCCEED);
2469 ender = reg_node(pRExC_state, END);
2472 regtail(pRExC_state, lastbr, ender);
2475 /* Hook the tails of the branches to the closing node. */
2476 for (br = ret; br != NULL; br = regnext(br)) {
2477 regoptail(pRExC_state, br, ender);
2484 static char parens[] = "=!<,>";
2486 if (paren && (p = strchr(parens, paren))) {
2487 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2488 int flag = (p - parens) > 1;
2491 node = SUSPEND, flag = 0;
2492 reginsert(pRExC_state, node,ret);
2494 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2498 /* Check for proper termination. */
2500 RExC_flags16 = oregflags;
2501 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2502 RExC_parse = oregcomp_parse;
2503 vFAIL("Unmatched (");
2506 else if (!paren && RExC_parse < RExC_end) {
2507 if (*RExC_parse == ')') {
2509 vFAIL("Unmatched )");
2512 FAIL("Junk on end of regexp"); /* "Can't happen". */
2520 - regbranch - one alternative of an | operator
2522 * Implements the concatenation operator.
2525 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2527 register regnode *ret;
2528 register regnode *chain = NULL;
2529 register regnode *latest;
2530 I32 flags = 0, c = 0;
2535 if (!SIZE_ONLY && RExC_extralen)
2536 ret = reganode(pRExC_state, BRANCHJ,0);
2538 ret = reg_node(pRExC_state, BRANCH);
2539 Set_Node_Length(ret, 1);
2543 if (!first && SIZE_ONLY)
2544 RExC_extralen += 1; /* BRANCHJ */
2546 *flagp = WORST; /* Tentatively. */
2549 nextchar(pRExC_state);
2550 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2552 latest = regpiece(pRExC_state, &flags);
2553 if (latest == NULL) {
2554 if (flags & TRYAGAIN)
2558 else if (ret == NULL)
2560 *flagp |= flags&HASWIDTH;
2561 if (chain == NULL) /* First piece. */
2562 *flagp |= flags&SPSTART;
2565 regtail(pRExC_state, chain, latest);
2570 if (chain == NULL) { /* Loop ran zero times. */
2571 chain = reg_node(pRExC_state, NOTHING);
2576 *flagp |= flags&SIMPLE;
2583 - regpiece - something followed by possible [*+?]
2585 * Note that the branching code sequences used for ? and the general cases
2586 * of * and + are somewhat optimized: they use the same NOTHING node as
2587 * both the endmarker for their branch list and the body of the last branch.
2588 * It might seem that this node could be dispensed with entirely, but the
2589 * endmarker role is not redundant.
2592 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2594 register regnode *ret;
2596 register char *next;
2598 char *origparse = RExC_parse;
2601 I32 max = REG_INFTY;
2604 ret = regatom(pRExC_state, &flags);
2606 if (flags & TRYAGAIN)
2613 if (op == '{' && regcurly(RExC_parse)) {
2614 parse_start = RExC_parse; /* MJD */
2615 next = RExC_parse + 1;
2617 while (isDIGIT(*next) || *next == ',') {
2626 if (*next == '}') { /* got one */
2630 min = atoi(RExC_parse);
2634 maxpos = RExC_parse;
2636 if (!max && *maxpos != '0')
2637 max = REG_INFTY; /* meaning "infinity" */
2638 else if (max >= REG_INFTY)
2639 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2641 nextchar(pRExC_state);
2644 if ((flags&SIMPLE)) {
2645 RExC_naughty += 2 + RExC_naughty / 2;
2646 reginsert(pRExC_state, CURLY, ret);
2647 Set_Node_Offset(ret, parse_start+1); /* MJD */
2648 Set_Node_Cur_Length(ret);
2651 regnode *w = reg_node(pRExC_state, WHILEM);
2654 regtail(pRExC_state, ret, w);
2655 if (!SIZE_ONLY && RExC_extralen) {
2656 reginsert(pRExC_state, LONGJMP,ret);
2657 reginsert(pRExC_state, NOTHING,ret);
2658 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2660 reginsert(pRExC_state, CURLYX,ret);
2662 Set_Node_Offset(ret, parse_start+1);
2663 Set_Node_Length(ret,
2664 op == '{' ? (RExC_parse - parse_start) : 1);
2666 if (!SIZE_ONLY && RExC_extralen)
2667 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2668 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2670 RExC_whilem_seen++, RExC_extralen += 3;
2671 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2679 if (max && max < min)
2680 vFAIL("Can't do {n,m} with n > m");
2695 #if 0 /* Now runtime fix should be reliable. */
2697 /* if this is reinstated, don't forget to put this back into perldiag:
2699 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2701 (F) The part of the regexp subject to either the * or + quantifier
2702 could match an empty string. The {#} shows in the regular
2703 expression about where the problem was discovered.
2707 if (!(flags&HASWIDTH) && op != '?')
2708 vFAIL("Regexp *+ operand could be empty");
2711 parse_start = RExC_parse;
2712 nextchar(pRExC_state);
2714 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2716 if (op == '*' && (flags&SIMPLE)) {
2717 reginsert(pRExC_state, STAR, ret);
2721 else if (op == '*') {
2725 else if (op == '+' && (flags&SIMPLE)) {
2726 reginsert(pRExC_state, PLUS, ret);
2730 else if (op == '+') {
2734 else if (op == '?') {
2739 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2741 "%.*s matches null string many times",
2742 RExC_parse - origparse,
2746 if (*RExC_parse == '?') {
2747 nextchar(pRExC_state);
2748 reginsert(pRExC_state, MINMOD, ret);
2749 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2751 if (ISMULT2(RExC_parse)) {
2753 vFAIL("Nested quantifiers");
2760 - regatom - the lowest level
2762 * Optimization: gobbles an entire sequence of ordinary characters so that
2763 * it can turn them into a single node, which is smaller to store and
2764 * faster to run. Backslashed characters are exceptions, each becoming a
2765 * separate node; the code is simpler that way and it's not worth fixing.
2767 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2769 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2771 register regnode *ret = 0;
2773 char *parse_start = 0;
2775 *flagp = WORST; /* Tentatively. */
2778 switch (*RExC_parse) {
2780 RExC_seen_zerolen++;
2781 nextchar(pRExC_state);
2782 if (RExC_flags16 & PMf_MULTILINE)
2783 ret = reg_node(pRExC_state, MBOL);
2784 else if (RExC_flags16 & PMf_SINGLELINE)
2785 ret = reg_node(pRExC_state, SBOL);
2787 ret = reg_node(pRExC_state, BOL);
2788 Set_Node_Length(ret, 1); /* MJD */
2791 nextchar(pRExC_state);
2793 RExC_seen_zerolen++;
2794 if (RExC_flags16 & PMf_MULTILINE)
2795 ret = reg_node(pRExC_state, MEOL);
2796 else if (RExC_flags16 & PMf_SINGLELINE)
2797 ret = reg_node(pRExC_state, SEOL);
2799 ret = reg_node(pRExC_state, EOL);
2800 Set_Node_Length(ret, 1); /* MJD */
2803 nextchar(pRExC_state);
2804 if (RExC_flags16 & PMf_SINGLELINE)
2805 ret = reg_node(pRExC_state, SANY);
2807 ret = reg_node(pRExC_state, REG_ANY);
2808 *flagp |= HASWIDTH|SIMPLE;
2810 Set_Node_Length(ret, 1); /* MJD */
2814 char *oregcomp_parse = ++RExC_parse;
2815 ret = regclass(pRExC_state);
2816 if (*RExC_parse != ']') {
2817 RExC_parse = oregcomp_parse;
2818 vFAIL("Unmatched [");
2820 nextchar(pRExC_state);
2821 *flagp |= HASWIDTH|SIMPLE;
2822 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2826 nextchar(pRExC_state);
2827 ret = reg(pRExC_state, 1, &flags);
2829 if (flags & TRYAGAIN) {
2830 if (RExC_parse == RExC_end) {
2831 /* Make parent create an empty node if needed. */
2839 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2843 if (flags & TRYAGAIN) {
2847 vFAIL("Internal urp");
2848 /* Supposed to be caught earlier. */
2851 if (!regcurly(RExC_parse)) {
2860 vFAIL("Quantifier follows nothing");
2863 switch (*++RExC_parse) {
2865 RExC_seen_zerolen++;
2866 ret = reg_node(pRExC_state, SBOL);
2868 nextchar(pRExC_state);
2869 Set_Node_Length(ret, 2); /* MJD */
2872 ret = reg_node(pRExC_state, GPOS);
2873 RExC_seen |= REG_SEEN_GPOS;
2875 nextchar(pRExC_state);
2876 Set_Node_Length(ret, 2); /* MJD */
2879 ret = reg_node(pRExC_state, SEOL);
2881 RExC_seen_zerolen++; /* Do not optimize RE away */
2882 nextchar(pRExC_state);
2885 ret = reg_node(pRExC_state, EOS);
2887 RExC_seen_zerolen++; /* Do not optimize RE away */
2888 nextchar(pRExC_state);
2889 Set_Node_Length(ret, 2); /* MJD */
2892 ret = reg_node(pRExC_state, CANY);
2893 RExC_seen |= REG_SEEN_CANY;
2894 *flagp |= HASWIDTH|SIMPLE;
2895 nextchar(pRExC_state);
2896 Set_Node_Length(ret, 2); /* MJD */
2899 ret = reg_node(pRExC_state, CLUMP);
2901 nextchar(pRExC_state);
2902 Set_Node_Length(ret, 2); /* MJD */
2905 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
2906 *flagp |= HASWIDTH|SIMPLE;
2907 nextchar(pRExC_state);
2908 Set_Node_Length(ret, 2); /* MJD */
2911 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
2912 *flagp |= HASWIDTH|SIMPLE;
2913 nextchar(pRExC_state);
2914 Set_Node_Length(ret, 2); /* MJD */
2917 RExC_seen_zerolen++;
2918 RExC_seen |= REG_SEEN_LOOKBEHIND;
2919 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
2921 nextchar(pRExC_state);
2922 Set_Node_Length(ret, 2); /* MJD */
2925 RExC_seen_zerolen++;
2926 RExC_seen |= REG_SEEN_LOOKBEHIND;
2927 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
2929 nextchar(pRExC_state);
2930 Set_Node_Length(ret, 2); /* MJD */
2933 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
2934 *flagp |= HASWIDTH|SIMPLE;
2935 nextchar(pRExC_state);
2936 Set_Node_Length(ret, 2); /* MJD */
2939 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
2940 *flagp |= HASWIDTH|SIMPLE;
2941 nextchar(pRExC_state);
2942 Set_Node_Length(ret, 2); /* MJD */
2945 ret = reg_node(pRExC_state, DIGIT);
2946 *flagp |= HASWIDTH|SIMPLE;
2947 nextchar(pRExC_state);
2948 Set_Node_Length(ret, 2); /* MJD */
2951 ret = reg_node(pRExC_state, NDIGIT);
2952 *flagp |= HASWIDTH|SIMPLE;
2953 nextchar(pRExC_state);
2954 Set_Node_Length(ret, 2); /* MJD */
2959 char* oldregxend = RExC_end;
2960 char* parse_start = RExC_parse;
2962 if (RExC_parse[1] == '{') {
2963 /* a lovely hack--pretend we saw [\pX] instead */
2964 RExC_end = strchr(RExC_parse, '}');
2966 U8 c = (U8)*RExC_parse;
2968 RExC_end = oldregxend;
2969 vFAIL2("Missing right brace on \\%c{}", c);
2974 RExC_end = RExC_parse + 2;
2977 ret = regclass(pRExC_state);
2979 RExC_end = oldregxend;
2981 Set_Node_Cur_Length(ret); /* MJD */
2982 nextchar(pRExC_state);
2983 *flagp |= HASWIDTH|SIMPLE;
2996 case '1': case '2': case '3': case '4':
2997 case '5': case '6': case '7': case '8': case '9':
2999 I32 num = atoi(RExC_parse);
3001 if (num > 9 && num >= RExC_npar)
3004 char * parse_start = RExC_parse - 1; /* MJD */
3005 while (isDIGIT(*RExC_parse))
3008 if (!SIZE_ONLY && num > RExC_rx->nparens)
3009 vFAIL("Reference to nonexistent group");
3011 ret = reganode(pRExC_state, FOLD
3012 ? (LOC ? REFFL : REFF)
3016 /* override incorrect value set in reganode MJD */
3017 Set_Node_Offset(ret, parse_start+1);
3018 Set_Node_Cur_Length(ret); /* MJD */
3020 nextchar(pRExC_state);
3025 if (RExC_parse >= RExC_end)
3026 FAIL("Trailing \\");
3029 /* Do not generate `unrecognized' warnings here, we fall
3030 back into the quick-grab loop below */
3036 if (RExC_flags16 & PMf_EXTENDED) {
3037 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3038 if (RExC_parse < RExC_end)
3044 register STRLEN len;
3050 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3052 parse_start = RExC_parse - 1;
3058 ret = reg_node(pRExC_state, FOLD
3059 ? (LOC ? EXACTFL : EXACTF)
3062 for (len = 0, p = RExC_parse - 1;
3063 len < 127 && p < RExC_end;
3068 if (RExC_flags16 & PMf_EXTENDED)
3069 p = regwhite(p, RExC_end);
3116 ender = ASCII_TO_NATIVE('\033');
3120 ender = ASCII_TO_NATIVE('\007');
3125 char* e = strchr(p, '}');
3129 vFAIL("Missing right brace on \\x{}");
3132 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3133 | PERL_SCAN_DISALLOW_PREFIX;
3135 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3138 /* numlen is generous */
3139 if (numlen + len >= 127) {
3147 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3149 ender = grok_hex(p, &numlen, &flags, NULL);
3155 ender = UCHARAT(p++);
3156 ender = toCTRL(ender);
3158 case '0': case '1': case '2': case '3':case '4':
3159 case '5': case '6': case '7': case '8':case '9':
3161 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3164 ender = grok_oct(p, &numlen, &flags, NULL);
3174 FAIL("Trailing \\");
3177 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3178 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3179 goto normal_default;
3184 if (UTF8_IS_START(*p) && UTF) {
3185 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3193 if (RExC_flags16 & PMf_EXTENDED)
3194 p = regwhite(p, RExC_end);
3196 /* Prime the casefolded buffer. */
3197 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3199 if (ISMULT2(p)) { /* Back off on ?+*. */
3206 /* Emit all the Unicode characters. */
3207 for (foldbuf = tmpbuf;
3209 foldlen -= numlen) {
3210 ender = utf8_to_uvchr(foldbuf, &numlen);
3212 reguni(pRExC_state, ender, s, &unilen);
3215 /* In EBCDIC the numlen
3216 * and unilen can differ. */
3218 if (numlen >= foldlen)
3222 break; /* "Can't happen." */
3226 reguni(pRExC_state, ender, s, &unilen);
3243 /* Emit all the Unicode characters. */
3244 for (foldbuf = tmpbuf;
3246 foldlen -= numlen) {
3247 ender = utf8_to_uvchr(foldbuf, &numlen);
3249 reguni(pRExC_state, ender, s, &unilen);
3252 /* In EBCDIC the numlen
3253 * and unilen can differ. */
3255 if (numlen >= foldlen)
3263 reguni(pRExC_state, ender, s, &unilen);
3276 Set_Node_Cur_Length(ret); /* MJD */
3277 nextchar(pRExC_state);
3279 /* len is STRLEN which is unsigned, need to copy to signed */
3282 vFAIL("Internal disaster");
3291 RExC_size += STR_SZ(len);
3293 RExC_emit += STR_SZ(len);
3298 /* If the encoding pragma is in effect recode the text of
3299 * any EXACT-kind nodes. */
3300 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3301 STRLEN oldlen = STR_LEN(ret);
3302 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3306 if (sv_utf8_downgrade(sv, TRUE)) {
3307 char *s = sv_recode_to_utf8(sv, PL_encoding);
3308 STRLEN newlen = SvCUR(sv);
3311 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3312 (int)oldlen, STRING(ret),
3314 Copy(s, STRING(ret), newlen, char);
3315 STR_LEN(ret) += newlen - oldlen;
3316 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3318 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3326 S_regwhite(pTHX_ char *p, char *e)
3331 else if (*p == '#') {
3334 } while (p < e && *p != '\n');
3342 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3343 Character classes ([:foo:]) can also be negated ([:^foo:]).
3344 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3345 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3346 but trigger failures because they are currently unimplemented. */
3348 #define POSIXCC_DONE(c) ((c) == ':')
3349 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3350 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3353 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3356 I32 namedclass = OOB_NAMEDCLASS;
3358 if (value == '[' && RExC_parse + 1 < RExC_end &&
3359 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3360 POSIXCC(UCHARAT(RExC_parse))) {
3361 char c = UCHARAT(RExC_parse);
3362 char* s = RExC_parse++;
3364 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3366 if (RExC_parse == RExC_end)
3367 /* Grandfather lone [:, [=, [. */
3370 char* t = RExC_parse++; /* skip over the c */
3372 if (UCHARAT(RExC_parse) == ']') {
3373 RExC_parse++; /* skip over the ending ] */
3376 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3377 I32 skip = 5; /* the most common skip */
3381 if (strnEQ(posixcc, "alnum", 5))
3383 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3384 else if (strnEQ(posixcc, "alpha", 5))
3386 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3387 else if (strnEQ(posixcc, "ascii", 5))
3389 complement ? ANYOF_NASCII : ANYOF_ASCII;
3392 if (strnEQ(posixcc, "blank", 5))
3394 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3397 if (strnEQ(posixcc, "cntrl", 5))
3399 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3402 if (strnEQ(posixcc, "digit", 5))
3404 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3407 if (strnEQ(posixcc, "graph", 5))
3409 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3412 if (strnEQ(posixcc, "lower", 5))
3414 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3417 if (strnEQ(posixcc, "print", 5))
3419 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3420 else if (strnEQ(posixcc, "punct", 5))
3422 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3425 if (strnEQ(posixcc, "space", 5))
3427 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3430 if (strnEQ(posixcc, "upper", 5))
3432 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3434 case 'w': /* this is not POSIX, this is the Perl \w */
3435 if (strnEQ(posixcc, "word", 4)) {
3437 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3442 if (strnEQ(posixcc, "xdigit", 6)) {
3444 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3449 if (namedclass == OOB_NAMEDCLASS ||
3450 posixcc[skip] != ':' ||
3451 posixcc[skip+1] != ']')
3453 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3456 } else if (!SIZE_ONLY) {
3457 /* [[=foo=]] and [[.foo.]] are still future. */
3459 /* adjust RExC_parse so the warning shows after
3461 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3463 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3466 /* Maternal grandfather:
3467 * "[:" ending in ":" but not in ":]" */
3477 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3479 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3480 char *s = RExC_parse;
3483 while(*s && isALNUM(*s))
3485 if (*s && c == *s && s[1] == ']') {
3486 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3488 /* [[=foo=]] and [[.foo.]] are still future. */
3489 if (POSIXCC_NOTYET(c)) {
3490 /* adjust RExC_parse so the error shows after
3492 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3494 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3501 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3504 register UV nextvalue;
3505 register IV prevvalue = OOB_UNICODE;
3506 register IV range = 0;
3507 register regnode *ret;
3510 char *rangebegin = 0;
3511 bool need_class = 0;
3512 SV *listsv = Nullsv;
3515 bool optimize_invert = TRUE;
3516 AV* unicode_alternate = 0;
3518 ret = reganode(pRExC_state, ANYOF, 0);
3521 ANYOF_FLAGS(ret) = 0;
3523 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3527 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3531 RExC_size += ANYOF_SKIP;
3533 RExC_emit += ANYOF_SKIP;
3535 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3537 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3538 ANYOF_BITMAP_ZERO(ret);
3539 listsv = newSVpvn("# comment\n", 10);
3542 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3544 if (!SIZE_ONLY && POSIXCC(nextvalue))
3545 checkposixcc(pRExC_state);
3547 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3548 if (UCHARAT(RExC_parse) == ']')
3551 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3555 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3558 rangebegin = RExC_parse;
3560 value = utf8n_to_uvchr((U8*)RExC_parse,
3561 RExC_end - RExC_parse,
3563 RExC_parse += numlen;
3566 value = UCHARAT(RExC_parse++);
3567 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3568 if (value == '[' && POSIXCC(nextvalue))
3569 namedclass = regpposixcc(pRExC_state, value);
3570 else if (value == '\\') {
3572 value = utf8n_to_uvchr((U8*)RExC_parse,
3573 RExC_end - RExC_parse,
3575 RExC_parse += numlen;
3578 value = UCHARAT(RExC_parse++);
3579 /* Some compilers cannot handle switching on 64-bit integer
3580 * values, therefore value cannot be an UV. Yes, this will
3581 * be a problem later if we want switch on Unicode.
3582 * A similar issue a little bit later when switching on
3583 * namedclass. --jhi */
3584 switch ((I32)value) {
3585 case 'w': namedclass = ANYOF_ALNUM; break;
3586 case 'W': namedclass = ANYOF_NALNUM; break;
3587 case 's': namedclass = ANYOF_SPACE; break;
3588 case 'S': namedclass = ANYOF_NSPACE; break;
3589 case 'd': namedclass = ANYOF_DIGIT; break;
3590 case 'D': namedclass = ANYOF_NDIGIT; break;
3593 if (*RExC_parse == '{') {
3595 e = strchr(RExC_parse++, '}');
3597 vFAIL2("Missing right brace on \\%c{}", c);
3598 while (isSPACE(UCHARAT(RExC_parse)))
3600 if (e == RExC_parse)
3601 vFAIL2("Empty \\%c{}", c);
3603 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3611 if (UCHARAT(RExC_parse) == '^') {
3614 value = value == 'p' ? 'P' : 'p'; /* toggle */
3615 while (isSPACE(UCHARAT(RExC_parse))) {
3621 Perl_sv_catpvf(aTHX_ listsv,
3622 "+utf8::%.*s\n", (int)n, RExC_parse);
3624 Perl_sv_catpvf(aTHX_ listsv,
3625 "!utf8::%.*s\n", (int)n, RExC_parse);
3628 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3630 case 'n': value = '\n'; break;
3631 case 'r': value = '\r'; break;
3632 case 't': value = '\t'; break;
3633 case 'f': value = '\f'; break;
3634 case 'b': value = '\b'; break;
3635 case 'e': value = ASCII_TO_NATIVE('\033');break;
3636 case 'a': value = ASCII_TO_NATIVE('\007');break;
3638 if (*RExC_parse == '{') {
3639 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3640 | PERL_SCAN_DISALLOW_PREFIX;
3641 e = strchr(RExC_parse++, '}');
3643 vFAIL("Missing right brace on \\x{}");
3645 numlen = e - RExC_parse;
3646 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3650 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3652 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3653 RExC_parse += numlen;
3657 value = UCHARAT(RExC_parse++);
3658 value = toCTRL(value);
3660 case '0': case '1': case '2': case '3': case '4':
3661 case '5': case '6': case '7': case '8': case '9':
3665 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3666 RExC_parse += numlen;
3670 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3672 "Unrecognized escape \\%c in character class passed through",
3676 } /* end of \blah */
3678 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3680 if (!SIZE_ONLY && !need_class)
3681 ANYOF_CLASS_ZERO(ret);
3685 /* a bad range like a-\d, a-[:digit:] ? */
3688 if (ckWARN(WARN_REGEXP))
3690 "False [] range \"%*.*s\"",
3691 RExC_parse - rangebegin,
3692 RExC_parse - rangebegin,
3694 if (prevvalue < 256) {
3695 ANYOF_BITMAP_SET(ret, prevvalue);
3696 ANYOF_BITMAP_SET(ret, '-');
3699 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3700 Perl_sv_catpvf(aTHX_ listsv,
3701 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3705 range = 0; /* this was not a true range */
3709 if (namedclass > OOB_NAMEDCLASS)
3710 optimize_invert = FALSE;
3711 /* Possible truncation here but in some 64-bit environments
3712 * the compiler gets heartburn about switch on 64-bit values.
3713 * A similar issue a little earlier when switching on value.
3715 switch ((I32)namedclass) {
3718 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3720 for (value = 0; value < 256; value++)
3722 ANYOF_BITMAP_SET(ret, value);
3724 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
3728 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3730 for (value = 0; value < 256; value++)
3731 if (!isALNUM(value))
3732 ANYOF_BITMAP_SET(ret, value);
3734 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3738 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3740 for (value = 0; value < 256; value++)
3741 if (isALNUMC(value))
3742 ANYOF_BITMAP_SET(ret, value);
3744 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3748 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3750 for (value = 0; value < 256; value++)
3751 if (!isALNUMC(value))
3752 ANYOF_BITMAP_SET(ret, value);
3754 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3758 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3760 for (value = 0; value < 256; value++)
3762 ANYOF_BITMAP_SET(ret, value);
3764 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3768 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3770 for (value = 0; value < 256; value++)
3771 if (!isALPHA(value))
3772 ANYOF_BITMAP_SET(ret, value);
3774 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3778 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3781 for (value = 0; value < 128; value++)
3782 ANYOF_BITMAP_SET(ret, value);
3784 for (value = 0; value < 256; value++) {
3786 ANYOF_BITMAP_SET(ret, value);
3790 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3794 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3797 for (value = 128; value < 256; value++)
3798 ANYOF_BITMAP_SET(ret, value);
3800 for (value = 0; value < 256; value++) {
3801 if (!isASCII(value))
3802 ANYOF_BITMAP_SET(ret, value);
3806 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3810 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3812 for (value = 0; value < 256; value++)
3814 ANYOF_BITMAP_SET(ret, value);
3816 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3820 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3822 for (value = 0; value < 256; value++)
3823 if (!isBLANK(value))
3824 ANYOF_BITMAP_SET(ret, value);
3826 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3830 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3832 for (value = 0; value < 256; value++)
3834 ANYOF_BITMAP_SET(ret, value);
3836 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3840 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3842 for (value = 0; value < 256; value++)
3843 if (!isCNTRL(value))
3844 ANYOF_BITMAP_SET(ret, value);
3846 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3850 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3852 /* consecutive digits assumed */
3853 for (value = '0'; value <= '9'; value++)
3854 ANYOF_BITMAP_SET(ret, value);
3856 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3860 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3862 /* consecutive digits assumed */
3863 for (value = 0; value < '0'; value++)
3864 ANYOF_BITMAP_SET(ret, value);
3865 for (value = '9' + 1; value < 256; value++)
3866 ANYOF_BITMAP_SET(ret, value);
3868 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3872 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3874 for (value = 0; value < 256; value++)
3876 ANYOF_BITMAP_SET(ret, value);
3878 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3882 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3884 for (value = 0; value < 256; value++)
3885 if (!isGRAPH(value))
3886 ANYOF_BITMAP_SET(ret, value);
3888 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3892 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3894 for (value = 0; value < 256; value++)
3896 ANYOF_BITMAP_SET(ret, value);
3898 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3902 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3904 for (value = 0; value < 256; value++)
3905 if (!isLOWER(value))
3906 ANYOF_BITMAP_SET(ret, value);
3908 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3912 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3914 for (value = 0; value < 256; value++)
3916 ANYOF_BITMAP_SET(ret, value);
3918 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3922 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3924 for (value = 0; value < 256; value++)
3925 if (!isPRINT(value))
3926 ANYOF_BITMAP_SET(ret, value);
3928 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3932 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3934 for (value = 0; value < 256; value++)
3935 if (isPSXSPC(value))
3936 ANYOF_BITMAP_SET(ret, value);
3938 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3942 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3944 for (value = 0; value < 256; value++)
3945 if (!isPSXSPC(value))
3946 ANYOF_BITMAP_SET(ret, value);
3948 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3952 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3954 for (value = 0; value < 256; value++)
3956 ANYOF_BITMAP_SET(ret, value);
3958 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3962 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3964 for (value = 0; value < 256; value++)
3965 if (!isPUNCT(value))
3966 ANYOF_BITMAP_SET(ret, value);
3968 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3972 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3974 for (value = 0; value < 256; value++)
3976 ANYOF_BITMAP_SET(ret, value);
3978 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3982 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3984 for (value = 0; value < 256; value++)
3985 if (!isSPACE(value))
3986 ANYOF_BITMAP_SET(ret, value);
3988 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
3992 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
3994 for (value = 0; value < 256; value++)
3996 ANYOF_BITMAP_SET(ret, value);
3998 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4002 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4004 for (value = 0; value < 256; value++)
4005 if (!isUPPER(value))
4006 ANYOF_BITMAP_SET(ret, value);
4008 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4012 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4014 for (value = 0; value < 256; value++)
4015 if (isXDIGIT(value))
4016 ANYOF_BITMAP_SET(ret, value);
4018 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4022 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4024 for (value = 0; value < 256; value++)
4025 if (!isXDIGIT(value))
4026 ANYOF_BITMAP_SET(ret, value);
4028 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4031 vFAIL("Invalid [::] class");
4035 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4038 } /* end of namedclass \blah */
4041 if (prevvalue > value) /* b-a */ {
4042 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4043 RExC_parse - rangebegin,
4044 RExC_parse - rangebegin,
4046 range = 0; /* not a valid range */
4050 prevvalue = value; /* save the beginning of the range */
4051 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4052 RExC_parse[1] != ']') {
4055 /* a bad range like \w-, [:word:]- ? */
4056 if (namedclass > OOB_NAMEDCLASS) {
4057 if (ckWARN(WARN_REGEXP))
4059 "False [] range \"%*.*s\"",
4060 RExC_parse - rangebegin,
4061 RExC_parse - rangebegin,
4064 ANYOF_BITMAP_SET(ret, '-');
4066 range = 1; /* yeah, it's a range! */
4067 continue; /* but do it the next time */
4071 /* now is the next time */
4075 if (prevvalue < 256) {
4076 IV ceilvalue = value < 256 ? value : 255;
4079 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4080 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
4082 if (isLOWER(prevvalue)) {
4083 for (i = prevvalue; i <= ceilvalue; i++)
4085 ANYOF_BITMAP_SET(ret, i);
4087 for (i = prevvalue; i <= ceilvalue; i++)
4089 ANYOF_BITMAP_SET(ret, i);
4094 for (i = prevvalue; i <= ceilvalue; i++)
4095 ANYOF_BITMAP_SET(ret, i);
4097 if (value > 255 || UTF) {
4098 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4099 UV natvalue = NATIVE_TO_UNI(value);
4101 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4102 if (prevnatvalue < natvalue) { /* what about > ? */
4103 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4104 prevnatvalue, natvalue);
4106 else if (prevnatvalue == natvalue) {
4107 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4109 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4111 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4113 /* If folding and foldable and a single
4114 * character, insert also the folded version
4115 * to the charclass. */
4117 if (foldlen == UNISKIP(f))
4118 Perl_sv_catpvf(aTHX_ listsv,
4121 /* Any multicharacter foldings
4122 * require the following transform:
4123 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4124 * where E folds into "pq" and F folds
4125 * into "rst", all other characters
4126 * fold to single characters. We save
4127 * away these multicharacter foldings,
4128 * to be later saved as part of the
4129 * additional "s" data. */
4132 if (!unicode_alternate)
4133 unicode_alternate = newAV();
4134 sv = newSVpvn((char*)foldbuf, foldlen);
4136 av_push(unicode_alternate, sv);
4140 /* If folding and the value is one of the Greek
4141 * sigmas insert a few more sigmas to make the
4142 * folding rules of the sigmas to work right.
4143 * Note that not all the possible combinations
4144 * are handled here: some of them are handled
4145 * by the standard folding rules, and some of
4146 * them (literal or EXACTF cases) are handled
4147 * during runtime in regexec.c:S_find_byclass(). */
4148 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4149 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4150 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4151 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4152 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4154 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4155 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4156 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4162 range = 0; /* this range (if it was one) is done now */
4166 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4168 RExC_size += ANYOF_CLASS_ADD_SKIP;
4170 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4173 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4175 /* If the only flag is folding (plus possibly inversion). */
4176 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4178 for (value = 0; value < 256; ++value) {
4179 if (ANYOF_BITMAP_TEST(ret, value)) {
4180 IV fold = PL_fold[value];
4183 ANYOF_BITMAP_SET(ret, fold);
4186 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4189 /* optimize inverted simple patterns (e.g. [^a-z]) */
4190 if (!SIZE_ONLY && optimize_invert &&
4191 /* If the only flag is inversion. */
4192 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4193 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4194 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4195 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4202 /* The 0th element stores the character class description
4203 * in its textual form: used later (regexec.c:Perl_regclass_swatch())
4204 * to initialize the appropriate swash (which gets stored in
4205 * the 1st element), and also useful for dumping the regnode.
4206 * The 2nd element stores the multicharacter foldings,
4207 * used later (regexec.c:s_reginclasslen()). */
4208 av_store(av, 0, listsv);
4209 av_store(av, 1, NULL);
4210 av_store(av, 2, (SV*)unicode_alternate);
4211 rv = newRV_noinc((SV*)av);
4212 n = add_data(pRExC_state, 1, "s");
4213 RExC_rx->data->data[n] = (void*)rv;
4221 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4223 char* retval = RExC_parse++;
4226 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4227 RExC_parse[2] == '#') {
4228 while (*RExC_parse && *RExC_parse != ')')
4233 if (RExC_flags16 & PMf_EXTENDED) {
4234 if (isSPACE(*RExC_parse)) {
4238 else if (*RExC_parse == '#') {
4239 while (*RExC_parse && *RExC_parse != '\n')
4250 - reg_node - emit a node
4252 STATIC regnode * /* Location. */
4253 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4255 register regnode *ret;
4256 register regnode *ptr;
4260 SIZE_ALIGN(RExC_size);
4265 NODE_ALIGN_FILL(ret);
4267 FILL_ADVANCE_NODE(ptr, op);
4268 if (RExC_offsets) { /* MJD */
4269 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4270 "reg_node", __LINE__,
4272 RExC_emit - RExC_emit_start > RExC_offsets[0]
4273 ? "Overwriting end of array!\n" : "OK",
4274 RExC_emit - RExC_emit_start,
4275 RExC_parse - RExC_start,
4277 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4286 - reganode - emit a node with an argument
4288 STATIC regnode * /* Location. */
4289 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4291 register regnode *ret;
4292 register regnode *ptr;
4296 SIZE_ALIGN(RExC_size);
4301 NODE_ALIGN_FILL(ret);
4303 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4304 if (RExC_offsets) { /* MJD */
4305 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4307 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4308 "Overwriting end of array!\n" : "OK",
4309 RExC_emit - RExC_emit_start,
4310 RExC_parse - RExC_start,
4312 Set_Cur_Node_Offset;
4321 - reguni - emit (if appropriate) a Unicode character
4324 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4326 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4330 - reginsert - insert an operator in front of already-emitted operand
4332 * Means relocating the operand.
4335 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4337 register regnode *src;
4338 register regnode *dst;
4339 register regnode *place;
4340 register int offset = regarglen[(U8)op];
4342 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4345 RExC_size += NODE_STEP_REGNODE + offset;
4350 RExC_emit += NODE_STEP_REGNODE + offset;
4352 while (src > opnd) {
4353 StructCopy(--src, --dst, regnode);
4354 if (RExC_offsets) { /* MJD 20010112 */
4355 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4357 dst - RExC_emit_start > RExC_offsets[0]
4358 ? "Overwriting end of array!\n" : "OK",
4359 src - RExC_emit_start,
4360 dst - RExC_emit_start,
4362 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4363 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4368 place = opnd; /* Op node, where operand used to be. */
4369 if (RExC_offsets) { /* MJD */
4370 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4372 place - RExC_emit_start > RExC_offsets[0]
4373 ? "Overwriting end of array!\n" : "OK",
4374 place - RExC_emit_start,
4375 RExC_parse - RExC_start,
4377 Set_Node_Offset(place, RExC_parse);
4379 src = NEXTOPER(place);
4380 FILL_ADVANCE_NODE(place, op);
4381 Zero(src, offset, regnode);
4385 - regtail - set the next-pointer at the end of a node chain of p to val.
4388 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4390 register regnode *scan;
4391 register regnode *temp;
4396 /* Find last node. */
4399 temp = regnext(scan);
4405 if (reg_off_by_arg[OP(scan)]) {
4406 ARG_SET(scan, val - scan);
4409 NEXT_OFF(scan) = val - scan;
4414 - regoptail - regtail on operand of first argument; nop if operandless
4417 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4419 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4420 if (p == NULL || SIZE_ONLY)
4422 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4423 regtail(pRExC_state, NEXTOPER(p), val);
4425 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4426 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4433 - regcurly - a little FSA that accepts {\d+,?\d*}
4436 S_regcurly(pTHX_ register char *s)
4457 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4459 register U8 op = EXACT; /* Arbitrary non-END op. */
4460 register regnode *next;
4462 while (op != END && (!last || node < last)) {
4463 /* While that wasn't END last time... */
4469 next = regnext(node);
4471 if (OP(node) == OPTIMIZED)
4474 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4475 (int)(2*l + 1), "", SvPVX(sv));
4476 if (next == NULL) /* Next ptr. */
4477 PerlIO_printf(Perl_debug_log, "(0)");
4479 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4480 (void)PerlIO_putc(Perl_debug_log, '\n');
4482 if (PL_regkind[(U8)op] == BRANCHJ) {
4483 register regnode *nnode = (OP(next) == LONGJMP
4486 if (last && nnode > last)
4488 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4490 else if (PL_regkind[(U8)op] == BRANCH) {
4491 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4493 else if ( op == CURLY) { /* `next' might be very big: optimizer */
4494 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4495 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4497 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4498 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4501 else if ( op == PLUS || op == STAR) {
4502 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4504 else if (op == ANYOF) {
4505 /* arglen 1 + class block */
4506 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4507 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4508 node = NEXTOPER(node);
4510 else if (PL_regkind[(U8)op] == EXACT) {
4511 /* Literal string, where present. */
4512 node += NODE_SZ_STR(node) - 1;
4513 node = NEXTOPER(node);
4516 node = NEXTOPER(node);
4517 node += regarglen[(U8)op];
4519 if (op == CURLYX || op == OPEN)
4521 else if (op == WHILEM)
4527 #endif /* DEBUGGING */
4530 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4533 Perl_regdump(pTHX_ regexp *r)
4536 SV *sv = sv_newmortal();
4538 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4540 /* Header fields of interest. */
4541 if (r->anchored_substr)
4542 PerlIO_printf(Perl_debug_log,
4543 "anchored `%s%.*s%s'%s at %"IVdf" ",
4545 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4546 SvPVX(r->anchored_substr),
4548 SvTAIL(r->anchored_substr) ? "$" : "",
4549 (IV)r->anchored_offset);
4550 else if (r->anchored_utf8)
4551 PerlIO_printf(Perl_debug_log,
4552 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4554 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4555 SvPVX(r->anchored_utf8),
4557 SvTAIL(r->anchored_utf8) ? "$" : "",
4558 (IV)r->anchored_offset);
4559 if (r->float_substr)
4560 PerlIO_printf(Perl_debug_log,
4561 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4563 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4564 SvPVX(r->float_substr),
4566 SvTAIL(r->float_substr) ? "$" : "",
4567 (IV)r->float_min_offset, (UV)r->float_max_offset);
4568 else if (r->float_utf8)
4569 PerlIO_printf(Perl_debug_log,
4570 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4572 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4573 SvPVX(r->float_utf8),
4575 SvTAIL(r->float_utf8) ? "$" : "",
4576 (IV)r->float_min_offset, (UV)r->float_max_offset);
4577 if (r->check_substr || r->check_utf8)
4578 PerlIO_printf(Perl_debug_log,
4579 r->check_substr == r->float_substr
4580 && r->check_utf8 == r->float_utf8
4581 ? "(checking floating" : "(checking anchored");
4582 if (r->reganch & ROPT_NOSCAN)
4583 PerlIO_printf(Perl_debug_log, " noscan");
4584 if (r->reganch & ROPT_CHECK_ALL)
4585 PerlIO_printf(Perl_debug_log, " isall");
4586 if (r->check_substr || r->check_utf8)
4587 PerlIO_printf(Perl_debug_log, ") ");
4589 if (r->regstclass) {
4590 regprop(sv, r->regstclass);
4591 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4593 if (r->reganch & ROPT_ANCH) {
4594 PerlIO_printf(Perl_debug_log, "anchored");
4595 if (r->reganch & ROPT_ANCH_BOL)
4596 PerlIO_printf(Perl_debug_log, "(BOL)");
4597 if (r->reganch & ROPT_ANCH_MBOL)
4598 PerlIO_printf(Perl_debug_log, "(MBOL)");
4599 if (r->reganch & ROPT_ANCH_SBOL)
4600 PerlIO_printf(Perl_debug_log, "(SBOL)");
4601 if (r->reganch & ROPT_ANCH_GPOS)
4602 PerlIO_printf(Perl_debug_log, "(GPOS)");
4603 PerlIO_putc(Perl_debug_log, ' ');
4605 if (r->reganch & ROPT_GPOS_SEEN)
4606 PerlIO_printf(Perl_debug_log, "GPOS ");
4607 if (r->reganch & ROPT_SKIP)
4608 PerlIO_printf(Perl_debug_log, "plus ");
4609 if (r->reganch & ROPT_IMPLICIT)
4610 PerlIO_printf(Perl_debug_log, "implicit ");
4611 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4612 if (r->reganch & ROPT_EVAL_SEEN)
4613 PerlIO_printf(Perl_debug_log, "with eval ");
4614 PerlIO_printf(Perl_debug_log, "\n");
4617 U32 len = r->offsets[0];
4618 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4619 for (i = 1; i <= len; i++)
4620 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4621 (UV)r->offsets[i*2-1],
4622 (UV)r->offsets[i*2]);
4623 PerlIO_printf(Perl_debug_log, "\n");
4625 #endif /* DEBUGGING */
4631 S_put_byte(pTHX_ SV *sv, int c)
4633 if (isCNTRL(c) || c == 255 || !isPRINT(c))
4634 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4635 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4636 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4638 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4641 #endif /* DEBUGGING */
4644 - regprop - printable representation of opcode
4647 Perl_regprop(pTHX_ SV *sv, regnode *o)
4652 sv_setpvn(sv, "", 0);
4653 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4654 /* It would be nice to FAIL() here, but this may be called from
4655 regexec.c, and it would be hard to supply pRExC_state. */
4656 Perl_croak(aTHX_ "Corrupted regexp opcode");
4657 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4659 k = PL_regkind[(U8)OP(o)];
4662 SV *dsv = sv_2mortal(newSVpvn("", 0));
4663 /* Using is_utf8_string() is a crude hack but it may
4664 * be the best for now since we have no flag "this EXACTish
4665 * node was UTF-8" --jhi */
4666 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4668 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4669 UNI_DISPLAY_REGEX) :
4674 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4679 else if (k == CURLY) {
4680 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4681 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4682 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4684 else if (k == WHILEM && o->flags) /* Ordinal/of */
4685 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4686 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4687 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4688 else if (k == LOGICAL)
4689 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4690 else if (k == ANYOF) {
4691 int i, rangestart = -1;
4692 U8 flags = ANYOF_FLAGS(o);
4693 const char * const anyofs[] = { /* Should be syncronized with
4694 * ANYOF_ #xdefines in regcomp.h */
4727 if (flags & ANYOF_LOCALE)
4728 sv_catpv(sv, "{loc}");
4729 if (flags & ANYOF_FOLD)
4730 sv_catpv(sv, "{i}");
4731 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4732 if (flags & ANYOF_INVERT)
4734 for (i = 0; i <= 256; i++) {
4735 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4736 if (rangestart == -1)
4738 } else if (rangestart != -1) {
4739 if (i <= rangestart + 3)
4740 for (; rangestart < i; rangestart++)
4741 put_byte(sv, rangestart);
4743 put_byte(sv, rangestart);
4745 put_byte(sv, i - 1);
4751 if (o->flags & ANYOF_CLASS)
4752 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4753 if (ANYOF_CLASS_TEST(o,i))
4754 sv_catpv(sv, anyofs[i]);
4756 if (flags & ANYOF_UNICODE)
4757 sv_catpv(sv, "{unicode}");
4758 else if (flags & ANYOF_UNICODE_ALL)
4759 sv_catpv(sv, "{unicode_all}");
4763 SV *sw = regclass_swash(o, FALSE, &lv, 0);
4768 U8 s[UTF8_MAXLEN+1];
4770 for (i = 0; i <= 256; i++) { /* just the first 256 */
4771 U8 *e = uvchr_to_utf8(s, i);
4773 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4774 if (rangestart == -1)
4776 } else if (rangestart != -1) {
4779 if (i <= rangestart + 3)
4780 for (; rangestart < i; rangestart++) {
4781 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4785 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4788 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4795 sv_catpv(sv, "..."); /* et cetera */
4799 char *s = savepv(SvPVX(lv));
4802 while(*s && *s != '\n') s++;
4823 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4825 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4826 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4827 #endif /* DEBUGGING */
4831 Perl_re_intuit_string(pTHX_ regexp *prog)
4832 { /* Assume that RE_INTUIT is set */
4835 char *s = SvPV(prog->check_substr
4836 ? prog->check_substr : prog->check_utf8, n_a);
4838 if (!PL_colorset) reginitcolors();
4839 PerlIO_printf(Perl_debug_log,
4840 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4842 prog->check_substr ? "" : "utf8 ",
4843 PL_colors[5],PL_colors[0],
4846 (strlen(s) > 60 ? "..." : ""));
4849 return prog->check_substr ? prog->check_substr : prog->check_utf8;
4853 Perl_pregfree(pTHX_ struct regexp *r)
4856 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4859 if (!r || (--r->refcnt > 0))
4862 char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60,
4864 int len = SvCUR(dsv);
4867 PerlIO_printf(Perl_debug_log,
4868 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4869 PL_colors[4],PL_colors[5],PL_colors[0],
4872 len > 60 ? "..." : "");
4876 Safefree(r->precomp);
4877 if (r->offsets) /* 20010421 MJD */
4878 Safefree(r->offsets);
4879 if (RX_MATCH_COPIED(r))
4880 Safefree(r->subbeg);
4882 if (r->anchored_substr)
4883 SvREFCNT_dec(r->anchored_substr);
4884 if (r->anchored_utf8)
4885 SvREFCNT_dec(r->anchored_utf8);
4886 if (r->float_substr)
4887 SvREFCNT_dec(r->float_substr);
4889 SvREFCNT_dec(r->float_utf8);
4890 Safefree(r->substrs);
4893 int n = r->data->count;
4894 AV* new_comppad = NULL;
4899 /* If you add a ->what type here, update the comment in regcomp.h */
4900 switch (r->data->what[n]) {
4902 SvREFCNT_dec((SV*)r->data->data[n]);
4905 Safefree(r->data->data[n]);
4908 new_comppad = (AV*)r->data->data[n];
4911 if (new_comppad == NULL)
4912 Perl_croak(aTHX_ "panic: pregfree comppad");
4913 old_comppad = PL_comppad;
4914 old_curpad = PL_curpad;
4915 /* Watch out for global destruction's random ordering. */
4916 if (SvTYPE(new_comppad) == SVt_PVAV) {
4917 PL_comppad = new_comppad;
4918 PL_curpad = AvARRAY(new_comppad);
4923 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4924 op_free((OP_4tree*)r->data->data[n]);
4927 PL_comppad = old_comppad;
4928 PL_curpad = old_curpad;
4929 SvREFCNT_dec((SV*)new_comppad);
4935 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4938 Safefree(r->data->what);
4941 Safefree(r->startp);
4947 - regnext - dig the "next" pointer out of a node
4949 * [Note, when REGALIGN is defined there are two places in regmatch()
4950 * that bypass this code for speed.]
4953 Perl_regnext(pTHX_ register regnode *p)
4955 register I32 offset;
4957 if (p == &PL_regdummy)
4960 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4968 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
4971 STRLEN l1 = strlen(pat1);
4972 STRLEN l2 = strlen(pat2);
4981 Copy(pat1, buf, l1 , char);
4982 Copy(pat2, buf + l1, l2 , char);
4983 buf[l1 + l2] = '\n';
4984 buf[l1 + l2 + 1] = '\0';
4986 /* ANSI variant takes additional second argument */
4987 va_start(args, pat2);
4991 msv = vmess(buf, &args);
4993 message = SvPV(msv,l1);
4996 Copy(message, buf, l1 , char);
4997 buf[l1] = '\0'; /* Overwrite \n */
4998 Perl_croak(aTHX_ "%s", buf);
5001 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5004 Perl_save_re_context(pTHX)
5007 SAVEPPTR(RExC_precomp); /* uncompiled string. */
5008 SAVEI32(RExC_npar); /* () count. */
5009 SAVEI32(RExC_size); /* Code size. */
5010 SAVEI16(RExC_flags16); /* are we folding, multilining? */
5011 SAVEVPTR(RExC_rx); /* from regcomp.c */
5012 SAVEI32(RExC_seen); /* from regcomp.c */
5013 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
5014 SAVEI32(RExC_naughty); /* How bad is this pattern? */
5015 SAVEVPTR(RExC_emit); /* Code-emit pointer; ®dummy = don't */
5016 SAVEPPTR(RExC_end); /* End of input for compile */
5017 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
5020 SAVEI32(PL_reg_flags); /* from regexec.c */
5022 SAVEPPTR(PL_reginput); /* String-input pointer. */
5023 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5024 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
5025 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5026 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5027 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
5028 SAVEPPTR(PL_regtill); /* How far we are required to go. */
5029 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
5030 PL_reg_start_tmp = 0;
5031 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5032 PL_reg_start_tmpl = 0;
5033 SAVEVPTR(PL_regdata);
5034 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5035 SAVEI32(PL_regnarrate); /* from regexec.c */
5036 SAVEVPTR(PL_regprogram); /* from regexec.c */
5037 SAVEINT(PL_regindent); /* from regexec.c */
5038 SAVEVPTR(PL_regcc); /* from regexec.c */
5039 SAVEVPTR(PL_curcop);
5040 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5041 SAVEVPTR(PL_reg_re); /* from regexec.c */
5042 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5043 SAVESPTR(PL_reg_sv); /* from regexec.c */
5044 SAVEI8(PL_reg_match_utf8); /* from regexec.c */
5045 SAVEVPTR(PL_reg_magic); /* from regexec.c */
5046 SAVEI32(PL_reg_oldpos); /* from regexec.c */
5047 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5048 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5049 SAVEI32(PL_regnpar); /* () count. */
5050 SAVEI32(PL_regsize); /* from regexec.c */
5052 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
5057 clear_re(pTHX_ void *r)
5059 ReREFCNT_dec((regexp *)r);