5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_pregcomp my_regcomp
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_pregfree my_regfree
39 # define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 # define Perl_regnext my_regnext
42 # define Perl_save_re_context my_save_re_context
43 # define Perl_reginitcolors my_reginitcolors
45 # define PERL_NO_GET_CONTEXT
50 * pregcomp and pregexec -- regsub and regerror are not used in perl
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (c) 1991-2002, Larry Wall
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
83 #define PERL_IN_REGCOMP_C
86 #ifndef PERL_IN_XSUB_RE
98 # if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
107 #define STATIC static
110 typedef struct RExC_state_t {
111 U32 flags; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
114 char *start; /* Start of input for compile */
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
118 regnode *emit_start; /* Start of emitted-code area */
119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
130 char *starttry; /* -Dr: where regtry was called. */
131 #define RExC_starttry (pRExC_state->starttry)
135 #define RExC_flags (pRExC_state->flags)
136 #define RExC_precomp (pRExC_state->precomp)
137 #define RExC_rx (pRExC_state->rx)
138 #define RExC_start (pRExC_state->start)
139 #define RExC_end (pRExC_state->end)
140 #define RExC_parse (pRExC_state->parse)
141 #define RExC_whilem_seen (pRExC_state->whilem_seen)
142 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty (pRExC_state->naughty)
146 #define RExC_sawback (pRExC_state->sawback)
147 #define RExC_seen (pRExC_state->seen)
148 #define RExC_size (pRExC_state->size)
149 #define RExC_npar (pRExC_state->npar)
150 #define RExC_extralen (pRExC_state->extralen)
151 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8 (pRExC_state->utf8)
155 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
160 #undef SPSTART /* dratted cpp namespace... */
163 * Flags to be passed up and down.
165 #define WORST 0 /* Worst case. */
166 #define HASWIDTH 0x1 /* Known to match non-null strings. */
167 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART 0x4 /* Starts with * or +. */
169 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
171 /* Length of a variant. */
173 typedef struct scan_data_t {
179 I32 last_end; /* min value, <0 unless valid. */
182 SV **longest; /* Either &l_fixed, or &l_float. */
186 I32 offset_float_min;
187 I32 offset_float_max;
191 struct regnode_charclass_class *start_class;
195 * Forward declarations for pregcomp()'s friends.
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL 0x1
203 #define SF_BEFORE_MEOL 0x2
204 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
208 # define SF_FIX_SHIFT_EOL (0+2)
209 # define SF_FL_SHIFT_EOL (0+4)
211 # define SF_FIX_SHIFT_EOL (+2)
212 # define SF_FL_SHIFT_EOL (+4)
215 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
218 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF 0x40
221 #define SF_HAS_PAR 0x80
222 #define SF_IN_PAR 0x100
223 #define SF_HAS_EVAL 0x200
224 #define SCF_DO_SUBSTR 0x400
225 #define SCF_DO_STCLASS_AND 0x0800
226 #define SCF_DO_STCLASS_OR 0x1000
227 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS 0x2000
230 #define UTF (RExC_utf8 != 0)
231 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
232 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
234 #define OOB_UNICODE 12345678
235 #define OOB_NAMEDCLASS -1
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
249 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
261 char *ellipses = ""; \
262 IV len = RExC_end - RExC_precomp; \
265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
273 msg, (int)len, RExC_precomp, ellipses); \
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
281 #define FAIL2(pat,msg) \
283 char *ellipses = ""; \
284 IV len = RExC_end - RExC_precomp; \
287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
295 msg, (int)len, RExC_precomp, ellipses); \
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
302 #define Simple_vFAIL(m) \
304 IV offset = RExC_parse - RExC_precomp; \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
321 * Like Simple_vFAIL(), but accepts two arguments.
323 #define Simple_vFAIL2(m,a1) \
325 IV offset = RExC_parse - RExC_precomp; \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
328 (int)offset, RExC_precomp, RExC_precomp + offset); \
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
334 #define vFAIL2(m,a1) \
337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
338 Simple_vFAIL2(m, a1); \
343 * Like Simple_vFAIL(), but accepts three arguments.
345 #define Simple_vFAIL3(m, a1, a2) \
347 IV offset = RExC_parse - RExC_precomp; \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
350 (int)offset, RExC_precomp, RExC_precomp + offset); \
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
356 #define vFAIL3(m,a1,a2) \
359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
360 Simple_vFAIL3(m, a1, a2); \
364 * Like Simple_vFAIL(), but accepts four arguments.
366 #define Simple_vFAIL4(m, a1, a2, a3) \
368 IV offset = RExC_parse - RExC_precomp; \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
371 (int)offset, RExC_precomp, RExC_precomp + offset); \
375 * Like Simple_vFAIL(), but accepts five arguments.
377 #define Simple_vFAIL5(m, a1, a2, a3, a4) \
379 IV offset = RExC_parse - RExC_precomp; \
380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
381 (int)offset, RExC_precomp, RExC_precomp + offset); \
385 #define vWARN(loc,m) \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\
389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
392 #define vWARNdep(loc,m) \
394 IV offset = loc - RExC_precomp; \
395 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\
396 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
400 #define vWARN2(loc, m, a1) \
402 IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
405 (int)offset, RExC_precomp, RExC_precomp + offset); \
408 #define vWARN3(loc, m, a1, a2) \
410 IV offset = loc - RExC_precomp; \
411 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
413 (int)offset, RExC_precomp, RExC_precomp + offset); \
416 #define vWARN4(loc, m, a1, a2, a3) \
418 IV offset = loc - RExC_precomp; \
419 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
421 (int)offset, RExC_precomp, RExC_precomp + offset); \
424 /* used for the parse_flags section for (?c) -- japhy */
425 #define vWARN5(loc, m, a1, a2, a3, a4) \
427 IV offset = loc - RExC_precomp; \
428 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
430 (int)offset, RExC_precomp, RExC_precomp + offset); \
434 /* Allow for side effects in s */
435 #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
437 /* Macros for recording node offsets. 20001227 mjd@plover.com
438 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
439 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
440 * Element 0 holds the number n.
443 #define MJD_OFFSET_DEBUG(x)
444 /* #define MJD_OFFSET_DEBUG(x) fprintf x */
447 # define Set_Node_Offset_To_R(node,byte) \
451 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
453 RExC_offsets[2*(node)-1] = (byte); \
458 # define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
459 # define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
461 # define Set_Node_Length_To_R(node,len) \
464 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
466 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
468 RExC_offsets[2*(node)] = (len); \
473 # define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
474 # define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
475 # define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
477 /* Get offsets and lengths */
478 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
479 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
481 static void clear_re(pTHX_ void *r);
483 /* Mark that we cannot extend a found fixed substring at this point.
484 Updata the longest found anchored substring and the longest found
485 floating substrings if needed. */
488 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
490 STRLEN l = CHR_SVLEN(data->last_found);
491 STRLEN old_l = CHR_SVLEN(*data->longest);
493 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
494 sv_setsv(*data->longest, data->last_found);
495 if (*data->longest == data->longest_fixed) {
496 data->offset_fixed = l ? data->last_start_min : data->pos_min;
497 if (data->flags & SF_BEFORE_EOL)
499 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
501 data->flags &= ~SF_FIX_BEFORE_EOL;
504 data->offset_float_min = l ? data->last_start_min : data->pos_min;
505 data->offset_float_max = (l
506 ? data->last_start_max
507 : data->pos_min + data->pos_delta);
508 if ((U32)data->offset_float_max > (U32)I32_MAX)
509 data->offset_float_max = I32_MAX;
510 if (data->flags & SF_BEFORE_EOL)
512 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
514 data->flags &= ~SF_FL_BEFORE_EOL;
517 SvCUR_set(data->last_found, 0);
519 data->flags &= ~SF_BEFORE_EOL;
522 /* Can match anything (initialization) */
524 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
526 ANYOF_CLASS_ZERO(cl);
527 ANYOF_BITMAP_SETALL(cl);
528 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
530 cl->flags |= ANYOF_LOCALE;
533 /* Can match anything (initialization) */
535 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
539 for (value = 0; value <= ANYOF_MAX; value += 2)
540 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
542 if (!(cl->flags & ANYOF_UNICODE_ALL))
544 if (!ANYOF_BITMAP_TESTALLSET(cl))
549 /* Can match anything (initialization) */
551 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
553 Zero(cl, 1, struct regnode_charclass_class);
555 cl_anything(pRExC_state, cl);
559 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
561 Zero(cl, 1, struct regnode_charclass_class);
563 cl_anything(pRExC_state, cl);
565 cl->flags |= ANYOF_LOCALE;
568 /* 'And' a given class with another one. Can create false positives */
569 /* We assume that cl is not inverted */
571 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
572 struct regnode_charclass_class *and_with)
574 if (!(and_with->flags & ANYOF_CLASS)
575 && !(cl->flags & ANYOF_CLASS)
576 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
577 && !(and_with->flags & ANYOF_FOLD)
578 && !(cl->flags & ANYOF_FOLD)) {
581 if (and_with->flags & ANYOF_INVERT)
582 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
583 cl->bitmap[i] &= ~and_with->bitmap[i];
585 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
586 cl->bitmap[i] &= and_with->bitmap[i];
587 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
588 if (!(and_with->flags & ANYOF_EOS))
589 cl->flags &= ~ANYOF_EOS;
591 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
592 cl->flags &= ~ANYOF_UNICODE_ALL;
593 cl->flags |= ANYOF_UNICODE;
594 ARG_SET(cl, ARG(and_with));
596 if (!(and_with->flags & ANYOF_UNICODE_ALL))
597 cl->flags &= ~ANYOF_UNICODE_ALL;
598 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
599 cl->flags &= ~ANYOF_UNICODE;
602 /* 'OR' a given class with another one. Can create false positives */
603 /* We assume that cl is not inverted */
605 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
607 if (or_with->flags & ANYOF_INVERT) {
609 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
610 * <= (B1 | !B2) | (CL1 | !CL2)
611 * which is wasteful if CL2 is small, but we ignore CL2:
612 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
613 * XXXX Can we handle case-fold? Unclear:
614 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
615 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
617 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
618 && !(or_with->flags & ANYOF_FOLD)
619 && !(cl->flags & ANYOF_FOLD) ) {
622 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
623 cl->bitmap[i] |= ~or_with->bitmap[i];
624 } /* XXXX: logic is complicated otherwise */
626 cl_anything(pRExC_state, cl);
629 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
630 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
631 && (!(or_with->flags & ANYOF_FOLD)
632 || (cl->flags & ANYOF_FOLD)) ) {
635 /* OR char bitmap and class bitmap separately */
636 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
637 cl->bitmap[i] |= or_with->bitmap[i];
638 if (or_with->flags & ANYOF_CLASS) {
639 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
640 cl->classflags[i] |= or_with->classflags[i];
641 cl->flags |= ANYOF_CLASS;
644 else { /* XXXX: logic is complicated, leave it along for a moment. */
645 cl_anything(pRExC_state, cl);
648 if (or_with->flags & ANYOF_EOS)
649 cl->flags |= ANYOF_EOS;
651 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
652 ARG(cl) != ARG(or_with)) {
653 cl->flags |= ANYOF_UNICODE_ALL;
654 cl->flags &= ~ANYOF_UNICODE;
656 if (or_with->flags & ANYOF_UNICODE_ALL) {
657 cl->flags |= ANYOF_UNICODE_ALL;
658 cl->flags &= ~ANYOF_UNICODE;
663 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
664 * These need to be revisited when a newer toolchain becomes available.
666 #if defined(__sparc64__) && defined(__GNUC__)
667 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
668 # undef SPARC64_GCC_WORKAROUND
669 # define SPARC64_GCC_WORKAROUND 1
673 /* REx optimizer. Converts nodes into quickier variants "in place".
674 Finds fixed substrings. */
676 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
677 to the position after last scanned or to NULL. */
680 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
681 /* scanp: Start here (read-write). */
682 /* deltap: Write maxlen-minlen here. */
683 /* last: Stop before this one. */
685 I32 min = 0, pars = 0, code;
686 regnode *scan = *scanp, *next;
688 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
689 int is_inf_internal = 0; /* The studied chunk is infinite */
690 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
691 scan_data_t data_fake;
692 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
694 while (scan && OP(scan) != END && scan < last) {
695 /* Peephole optimizer: */
697 if (PL_regkind[(U8)OP(scan)] == EXACT) {
698 /* Merge several consecutive EXACTish nodes into one. */
699 regnode *n = regnext(scan);
702 regnode *stop = scan;
705 next = scan + NODE_SZ_STR(scan);
706 /* Skip NOTHING, merge EXACT*. */
708 ( PL_regkind[(U8)OP(n)] == NOTHING ||
709 (stringok && (OP(n) == OP(scan))))
711 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
712 if (OP(n) == TAIL || n > next)
714 if (PL_regkind[(U8)OP(n)] == NOTHING) {
715 NEXT_OFF(scan) += NEXT_OFF(n);
716 next = n + NODE_STEP_REGNODE;
724 int oldl = STR_LEN(scan);
725 regnode *nnext = regnext(n);
727 if (oldl + STR_LEN(n) > U8_MAX)
729 NEXT_OFF(scan) += NEXT_OFF(n);
730 STR_LEN(scan) += STR_LEN(n);
731 next = n + NODE_SZ_STR(n);
732 /* Now we can overwrite *n : */
733 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
741 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
743 Two problematic code points in Unicode casefolding of EXACT nodes:
745 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
746 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
752 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
753 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
755 This means that in case-insensitive matching (or "loose matching",
756 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
757 length of the above casefolded versions) can match a target string
758 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
759 This would rather mess up the minimum length computation.
761 What we'll do is to look for the tail four bytes, and then peek
762 at the preceding two bytes to see whether we need to decrease
763 the minimum length by four (six minus two).
765 Thanks to the design of UTF-8, there cannot be false matches:
766 A sequence of valid UTF-8 bytes cannot be a subsequence of
767 another valid sequence of UTF-8 bytes.
770 char *s0 = STRING(scan), *s, *t;
771 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
772 char *t0 = "\xcc\x88\xcc\x81";
776 s < s2 && (t = ninstr(s, s1, t0, t1));
778 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
779 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
786 n = scan + NODE_SZ_STR(scan);
788 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
796 /* Follow the next-chain of the current node and optimize
797 away all the NOTHINGs from it. */
798 if (OP(scan) != CURLYX) {
799 int max = (reg_off_by_arg[OP(scan)]
801 /* I32 may be smaller than U16 on CRAYs! */
802 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
803 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
807 /* Skip NOTHING and LONGJMP. */
808 while ((n = regnext(n))
809 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
810 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
813 if (reg_off_by_arg[OP(scan)])
816 NEXT_OFF(scan) = off;
818 /* The principal pseudo-switch. Cannot be a switch, since we
819 look into several different things. */
820 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
821 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
822 next = regnext(scan);
825 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
826 I32 max1 = 0, min1 = I32_MAX, num = 0;
827 struct regnode_charclass_class accum;
829 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
831 if (flags & SCF_DO_STCLASS)
832 cl_init_zero(pRExC_state, &accum);
833 while (OP(scan) == code) {
834 I32 deltanext, minnext, f = 0, fake;
835 struct regnode_charclass_class this_class;
840 data_fake.whilem_c = data->whilem_c;
841 data_fake.last_closep = data->last_closep;
844 data_fake.last_closep = &fake;
845 next = regnext(scan);
846 scan = NEXTOPER(scan);
848 scan = NEXTOPER(scan);
849 if (flags & SCF_DO_STCLASS) {
850 cl_init(pRExC_state, &this_class);
851 data_fake.start_class = &this_class;
852 f = SCF_DO_STCLASS_AND;
854 if (flags & SCF_WHILEM_VISITED_POS)
855 f |= SCF_WHILEM_VISITED_POS;
856 /* we suppose the run is continuous, last=next...*/
857 minnext = study_chunk(pRExC_state, &scan, &deltanext,
858 next, &data_fake, f);
861 if (max1 < minnext + deltanext)
862 max1 = minnext + deltanext;
863 if (deltanext == I32_MAX)
864 is_inf = is_inf_internal = 1;
866 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
868 if (data && (data_fake.flags & SF_HAS_EVAL))
869 data->flags |= SF_HAS_EVAL;
871 data->whilem_c = data_fake.whilem_c;
872 if (flags & SCF_DO_STCLASS)
873 cl_or(pRExC_state, &accum, &this_class);
877 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
879 if (flags & SCF_DO_SUBSTR) {
880 data->pos_min += min1;
881 data->pos_delta += max1 - min1;
882 if (max1 != min1 || is_inf)
883 data->longest = &(data->longest_float);
886 delta += max1 - min1;
887 if (flags & SCF_DO_STCLASS_OR) {
888 cl_or(pRExC_state, data->start_class, &accum);
890 cl_and(data->start_class, &and_with);
891 flags &= ~SCF_DO_STCLASS;
894 else if (flags & SCF_DO_STCLASS_AND) {
896 cl_and(data->start_class, &accum);
897 flags &= ~SCF_DO_STCLASS;
900 /* Switch to OR mode: cache the old value of
901 * data->start_class */
902 StructCopy(data->start_class, &and_with,
903 struct regnode_charclass_class);
904 flags &= ~SCF_DO_STCLASS_AND;
905 StructCopy(&accum, data->start_class,
906 struct regnode_charclass_class);
907 flags |= SCF_DO_STCLASS_OR;
908 data->start_class->flags |= ANYOF_EOS;
912 else if (code == BRANCHJ) /* single branch is optimized. */
913 scan = NEXTOPER(NEXTOPER(scan));
914 else /* single branch is optimized. */
915 scan = NEXTOPER(scan);
918 else if (OP(scan) == EXACT) {
919 I32 l = STR_LEN(scan);
920 UV uc = *((U8*)STRING(scan));
922 U8 *s = (U8*)STRING(scan);
923 l = utf8_length(s, s + l);
924 uc = utf8_to_uvchr(s, NULL);
927 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
928 /* The code below prefers earlier match for fixed
929 offset, later match for variable offset. */
930 if (data->last_end == -1) { /* Update the start info. */
931 data->last_start_min = data->pos_min;
932 data->last_start_max = is_inf
933 ? I32_MAX : data->pos_min + data->pos_delta;
935 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
937 SvUTF8_on(data->last_found);
938 data->last_end = data->pos_min + l;
939 data->pos_min += l; /* As in the first entry. */
940 data->flags &= ~SF_BEFORE_EOL;
942 if (flags & SCF_DO_STCLASS_AND) {
943 /* Check whether it is compatible with what we know already! */
947 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
948 && !ANYOF_BITMAP_TEST(data->start_class, uc)
949 && (!(data->start_class->flags & ANYOF_FOLD)
950 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
953 ANYOF_CLASS_ZERO(data->start_class);
954 ANYOF_BITMAP_ZERO(data->start_class);
956 ANYOF_BITMAP_SET(data->start_class, uc);
957 data->start_class->flags &= ~ANYOF_EOS;
959 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
961 else if (flags & SCF_DO_STCLASS_OR) {
962 /* false positive possible if the class is case-folded */
964 ANYOF_BITMAP_SET(data->start_class, uc);
966 data->start_class->flags |= ANYOF_UNICODE_ALL;
967 data->start_class->flags &= ~ANYOF_EOS;
968 cl_and(data->start_class, &and_with);
970 flags &= ~SCF_DO_STCLASS;
972 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
973 I32 l = STR_LEN(scan);
974 UV uc = *((U8*)STRING(scan));
976 /* Search for fixed substrings supports EXACT only. */
977 if (flags & SCF_DO_SUBSTR)
978 scan_commit(pRExC_state, data);
980 U8 *s = (U8 *)STRING(scan);
981 l = utf8_length(s, s + l);
982 uc = utf8_to_uvchr(s, NULL);
985 if (data && (flags & SCF_DO_SUBSTR))
987 if (flags & SCF_DO_STCLASS_AND) {
988 /* Check whether it is compatible with what we know already! */
992 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
993 && !ANYOF_BITMAP_TEST(data->start_class, uc)
994 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
996 ANYOF_CLASS_ZERO(data->start_class);
997 ANYOF_BITMAP_ZERO(data->start_class);
999 ANYOF_BITMAP_SET(data->start_class, uc);
1000 data->start_class->flags &= ~ANYOF_EOS;
1001 data->start_class->flags |= ANYOF_FOLD;
1002 if (OP(scan) == EXACTFL)
1003 data->start_class->flags |= ANYOF_LOCALE;
1006 else if (flags & SCF_DO_STCLASS_OR) {
1007 if (data->start_class->flags & ANYOF_FOLD) {
1008 /* false positive possible if the class is case-folded.
1009 Assume that the locale settings are the same... */
1011 ANYOF_BITMAP_SET(data->start_class, uc);
1012 data->start_class->flags &= ~ANYOF_EOS;
1014 cl_and(data->start_class, &and_with);
1016 flags &= ~SCF_DO_STCLASS;
1018 else if (strchr((char*)PL_varies,OP(scan))) {
1019 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1020 I32 f = flags, pos_before = 0;
1021 regnode *oscan = scan;
1022 struct regnode_charclass_class this_class;
1023 struct regnode_charclass_class *oclass = NULL;
1024 I32 next_is_eval = 0;
1026 switch (PL_regkind[(U8)OP(scan)]) {
1027 case WHILEM: /* End of (?:...)* . */
1028 scan = NEXTOPER(scan);
1031 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1032 next = NEXTOPER(scan);
1033 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1035 maxcount = REG_INFTY;
1036 next = regnext(scan);
1037 scan = NEXTOPER(scan);
1041 if (flags & SCF_DO_SUBSTR)
1046 if (flags & SCF_DO_STCLASS) {
1048 maxcount = REG_INFTY;
1049 next = regnext(scan);
1050 scan = NEXTOPER(scan);
1053 is_inf = is_inf_internal = 1;
1054 scan = regnext(scan);
1055 if (flags & SCF_DO_SUBSTR) {
1056 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1057 data->longest = &(data->longest_float);
1059 goto optimize_curly_tail;
1061 mincount = ARG1(scan);
1062 maxcount = ARG2(scan);
1063 next = regnext(scan);
1064 if (OP(scan) == CURLYX) {
1065 I32 lp = (data ? *(data->last_closep) : 0);
1067 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1069 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1070 next_is_eval = (OP(scan) == EVAL);
1072 if (flags & SCF_DO_SUBSTR) {
1073 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1074 pos_before = data->pos_min;
1078 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1080 data->flags |= SF_IS_INF;
1082 if (flags & SCF_DO_STCLASS) {
1083 cl_init(pRExC_state, &this_class);
1084 oclass = data->start_class;
1085 data->start_class = &this_class;
1086 f |= SCF_DO_STCLASS_AND;
1087 f &= ~SCF_DO_STCLASS_OR;
1089 /* These are the cases when once a subexpression
1090 fails at a particular position, it cannot succeed
1091 even after backtracking at the enclosing scope.
1093 XXXX what if minimal match and we are at the
1094 initial run of {n,m}? */
1095 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1096 f &= ~SCF_WHILEM_VISITED_POS;
1098 /* This will finish on WHILEM, setting scan, or on NULL: */
1099 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1101 ? (f & ~SCF_DO_SUBSTR) : f);
1103 if (flags & SCF_DO_STCLASS)
1104 data->start_class = oclass;
1105 if (mincount == 0 || minnext == 0) {
1106 if (flags & SCF_DO_STCLASS_OR) {
1107 cl_or(pRExC_state, data->start_class, &this_class);
1109 else if (flags & SCF_DO_STCLASS_AND) {
1110 /* Switch to OR mode: cache the old value of
1111 * data->start_class */
1112 StructCopy(data->start_class, &and_with,
1113 struct regnode_charclass_class);
1114 flags &= ~SCF_DO_STCLASS_AND;
1115 StructCopy(&this_class, data->start_class,
1116 struct regnode_charclass_class);
1117 flags |= SCF_DO_STCLASS_OR;
1118 data->start_class->flags |= ANYOF_EOS;
1120 } else { /* Non-zero len */
1121 if (flags & SCF_DO_STCLASS_OR) {
1122 cl_or(pRExC_state, data->start_class, &this_class);
1123 cl_and(data->start_class, &and_with);
1125 else if (flags & SCF_DO_STCLASS_AND)
1126 cl_and(data->start_class, &this_class);
1127 flags &= ~SCF_DO_STCLASS;
1129 if (!scan) /* It was not CURLYX, but CURLY. */
1131 if (ckWARN(WARN_REGEXP)
1132 /* ? quantifier ok, except for (?{ ... }) */
1133 && (next_is_eval || !(mincount == 0 && maxcount == 1))
1134 && (minnext == 0) && (deltanext == 0)
1135 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1136 && maxcount <= REG_INFTY/3) /* Complement check for big count */
1139 "Quantifier unexpected on zero-length expression");
1142 min += minnext * mincount;
1143 is_inf_internal |= ((maxcount == REG_INFTY
1144 && (minnext + deltanext) > 0)
1145 || deltanext == I32_MAX);
1146 is_inf |= is_inf_internal;
1147 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1149 /* Try powerful optimization CURLYX => CURLYN. */
1150 if ( OP(oscan) == CURLYX && data
1151 && data->flags & SF_IN_PAR
1152 && !(data->flags & SF_HAS_EVAL)
1153 && !deltanext && minnext == 1 ) {
1154 /* Try to optimize to CURLYN. */
1155 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1156 regnode *nxt1 = nxt;
1163 if (!strchr((char*)PL_simple,OP(nxt))
1164 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1165 && STR_LEN(nxt) == 1))
1171 if (OP(nxt) != CLOSE)
1173 /* Now we know that nxt2 is the only contents: */
1174 oscan->flags = (U8)ARG(nxt);
1176 OP(nxt1) = NOTHING; /* was OPEN. */
1178 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1179 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1180 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1181 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1182 OP(nxt + 1) = OPTIMIZED; /* was count. */
1183 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1188 /* Try optimization CURLYX => CURLYM. */
1189 if ( OP(oscan) == CURLYX && data
1190 && !(data->flags & SF_HAS_PAR)
1191 && !(data->flags & SF_HAS_EVAL)
1193 /* XXXX How to optimize if data == 0? */
1194 /* Optimize to a simpler form. */
1195 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1199 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1200 && (OP(nxt2) != WHILEM))
1202 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1203 /* Need to optimize away parenths. */
1204 if (data->flags & SF_IN_PAR) {
1205 /* Set the parenth number. */
1206 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1208 if (OP(nxt) != CLOSE)
1209 FAIL("Panic opt close");
1210 oscan->flags = (U8)ARG(nxt);
1211 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1212 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1214 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1215 OP(nxt + 1) = OPTIMIZED; /* was count. */
1216 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1217 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1220 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1221 regnode *nnxt = regnext(nxt1);
1224 if (reg_off_by_arg[OP(nxt1)])
1225 ARG_SET(nxt1, nxt2 - nxt1);
1226 else if (nxt2 - nxt1 < U16_MAX)
1227 NEXT_OFF(nxt1) = nxt2 - nxt1;
1229 OP(nxt) = NOTHING; /* Cannot beautify */
1234 /* Optimize again: */
1235 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1241 else if ((OP(oscan) == CURLYX)
1242 && (flags & SCF_WHILEM_VISITED_POS)
1243 /* See the comment on a similar expression above.
1244 However, this time it not a subexpression
1245 we care about, but the expression itself. */
1246 && (maxcount == REG_INFTY)
1247 && data && ++data->whilem_c < 16) {
1248 /* This stays as CURLYX, we can put the count/of pair. */
1249 /* Find WHILEM (as in regexec.c) */
1250 regnode *nxt = oscan + NEXT_OFF(oscan);
1252 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1254 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1255 | (RExC_whilem_seen << 4)); /* On WHILEM */
1257 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1259 if (flags & SCF_DO_SUBSTR) {
1260 SV *last_str = Nullsv;
1261 int counted = mincount != 0;
1263 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1264 #if defined(SPARC64_GCC_WORKAROUND)
1270 if (pos_before >= data->last_start_min)
1273 b = data->last_start_min;
1276 s = SvPV(data->last_found, l);
1277 old = b - data->last_start_min;
1280 I32 b = pos_before >= data->last_start_min
1281 ? pos_before : data->last_start_min;
1283 char *s = SvPV(data->last_found, l);
1284 I32 old = b - data->last_start_min;
1288 old = utf8_hop((U8*)s, old) - (U8*)s;
1291 /* Get the added string: */
1292 last_str = newSVpvn(s + old, l);
1293 if (deltanext == 0 && pos_before == b) {
1294 /* What was added is a constant string */
1296 SvGROW(last_str, (mincount * l) + 1);
1297 repeatcpy(SvPVX(last_str) + l,
1298 SvPVX(last_str), l, mincount - 1);
1299 SvCUR(last_str) *= mincount;
1300 /* Add additional parts. */
1301 SvCUR_set(data->last_found,
1302 SvCUR(data->last_found) - l);
1303 sv_catsv(data->last_found, last_str);
1304 data->last_end += l * (mincount - 1);
1307 /* start offset must point into the last copy */
1308 data->last_start_min += minnext * (mincount - 1);
1309 data->last_start_max += is_inf ? I32_MAX
1310 : (maxcount - 1) * (minnext + data->pos_delta);
1313 /* It is counted once already... */
1314 data->pos_min += minnext * (mincount - counted);
1315 data->pos_delta += - counted * deltanext +
1316 (minnext + deltanext) * maxcount - minnext * mincount;
1317 if (mincount != maxcount) {
1318 /* Cannot extend fixed substrings found inside
1320 scan_commit(pRExC_state,data);
1321 if (mincount && last_str) {
1322 sv_setsv(data->last_found, last_str);
1323 data->last_end = data->pos_min;
1324 data->last_start_min =
1325 data->pos_min - CHR_SVLEN(last_str);
1326 data->last_start_max = is_inf
1328 : data->pos_min + data->pos_delta
1329 - CHR_SVLEN(last_str);
1331 data->longest = &(data->longest_float);
1333 SvREFCNT_dec(last_str);
1335 if (data && (fl & SF_HAS_EVAL))
1336 data->flags |= SF_HAS_EVAL;
1337 optimize_curly_tail:
1338 if (OP(oscan) != CURLYX) {
1339 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1341 NEXT_OFF(oscan) += NEXT_OFF(next);
1344 default: /* REF and CLUMP only? */
1345 if (flags & SCF_DO_SUBSTR) {
1346 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1347 data->longest = &(data->longest_float);
1349 is_inf = is_inf_internal = 1;
1350 if (flags & SCF_DO_STCLASS_OR)
1351 cl_anything(pRExC_state, data->start_class);
1352 flags &= ~SCF_DO_STCLASS;
1356 else if (strchr((char*)PL_simple,OP(scan))) {
1359 if (flags & SCF_DO_SUBSTR) {
1360 scan_commit(pRExC_state,data);
1364 if (flags & SCF_DO_STCLASS) {
1365 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1367 /* Some of the logic below assumes that switching
1368 locale on will only add false positives. */
1369 switch (PL_regkind[(U8)OP(scan)]) {
1373 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1374 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1375 cl_anything(pRExC_state, data->start_class);
1378 if (OP(scan) == SANY)
1380 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1381 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1382 || (data->start_class->flags & ANYOF_CLASS));
1383 cl_anything(pRExC_state, data->start_class);
1385 if (flags & SCF_DO_STCLASS_AND || !value)
1386 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1389 if (flags & SCF_DO_STCLASS_AND)
1390 cl_and(data->start_class,
1391 (struct regnode_charclass_class*)scan);
1393 cl_or(pRExC_state, data->start_class,
1394 (struct regnode_charclass_class*)scan);
1397 if (flags & SCF_DO_STCLASS_AND) {
1398 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1399 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1400 for (value = 0; value < 256; value++)
1401 if (!isALNUM(value))
1402 ANYOF_BITMAP_CLEAR(data->start_class, value);
1406 if (data->start_class->flags & ANYOF_LOCALE)
1407 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1409 for (value = 0; value < 256; value++)
1411 ANYOF_BITMAP_SET(data->start_class, value);
1416 if (flags & SCF_DO_STCLASS_AND) {
1417 if (data->start_class->flags & ANYOF_LOCALE)
1418 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1421 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1422 data->start_class->flags |= ANYOF_LOCALE;
1426 if (flags & SCF_DO_STCLASS_AND) {
1427 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1428 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1429 for (value = 0; value < 256; value++)
1431 ANYOF_BITMAP_CLEAR(data->start_class, value);
1435 if (data->start_class->flags & ANYOF_LOCALE)
1436 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1438 for (value = 0; value < 256; value++)
1439 if (!isALNUM(value))
1440 ANYOF_BITMAP_SET(data->start_class, value);
1445 if (flags & SCF_DO_STCLASS_AND) {
1446 if (data->start_class->flags & ANYOF_LOCALE)
1447 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1450 data->start_class->flags |= ANYOF_LOCALE;
1451 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1455 if (flags & SCF_DO_STCLASS_AND) {
1456 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1457 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1458 for (value = 0; value < 256; value++)
1459 if (!isSPACE(value))
1460 ANYOF_BITMAP_CLEAR(data->start_class, value);
1464 if (data->start_class->flags & ANYOF_LOCALE)
1465 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1467 for (value = 0; value < 256; value++)
1469 ANYOF_BITMAP_SET(data->start_class, value);
1474 if (flags & SCF_DO_STCLASS_AND) {
1475 if (data->start_class->flags & ANYOF_LOCALE)
1476 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1479 data->start_class->flags |= ANYOF_LOCALE;
1480 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1484 if (flags & SCF_DO_STCLASS_AND) {
1485 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1486 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1487 for (value = 0; value < 256; value++)
1489 ANYOF_BITMAP_CLEAR(data->start_class, value);
1493 if (data->start_class->flags & ANYOF_LOCALE)
1494 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1496 for (value = 0; value < 256; value++)
1497 if (!isSPACE(value))
1498 ANYOF_BITMAP_SET(data->start_class, value);
1503 if (flags & SCF_DO_STCLASS_AND) {
1504 if (data->start_class->flags & ANYOF_LOCALE) {
1505 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1506 for (value = 0; value < 256; value++)
1507 if (!isSPACE(value))
1508 ANYOF_BITMAP_CLEAR(data->start_class, value);
1512 data->start_class->flags |= ANYOF_LOCALE;
1513 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1517 if (flags & SCF_DO_STCLASS_AND) {
1518 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1519 for (value = 0; value < 256; value++)
1520 if (!isDIGIT(value))
1521 ANYOF_BITMAP_CLEAR(data->start_class, value);
1524 if (data->start_class->flags & ANYOF_LOCALE)
1525 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1527 for (value = 0; value < 256; value++)
1529 ANYOF_BITMAP_SET(data->start_class, value);
1534 if (flags & SCF_DO_STCLASS_AND) {
1535 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1536 for (value = 0; value < 256; value++)
1538 ANYOF_BITMAP_CLEAR(data->start_class, value);
1541 if (data->start_class->flags & ANYOF_LOCALE)
1542 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1544 for (value = 0; value < 256; value++)
1545 if (!isDIGIT(value))
1546 ANYOF_BITMAP_SET(data->start_class, value);
1551 if (flags & SCF_DO_STCLASS_OR)
1552 cl_and(data->start_class, &and_with);
1553 flags &= ~SCF_DO_STCLASS;
1556 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1557 data->flags |= (OP(scan) == MEOL
1561 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1562 /* Lookbehind, or need to calculate parens/evals/stclass: */
1563 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1564 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1565 /* Lookahead/lookbehind */
1566 I32 deltanext, minnext, fake = 0;
1568 struct regnode_charclass_class intrnl;
1571 data_fake.flags = 0;
1573 data_fake.whilem_c = data->whilem_c;
1574 data_fake.last_closep = data->last_closep;
1577 data_fake.last_closep = &fake;
1578 if ( flags & SCF_DO_STCLASS && !scan->flags
1579 && OP(scan) == IFMATCH ) { /* Lookahead */
1580 cl_init(pRExC_state, &intrnl);
1581 data_fake.start_class = &intrnl;
1582 f |= SCF_DO_STCLASS_AND;
1584 if (flags & SCF_WHILEM_VISITED_POS)
1585 f |= SCF_WHILEM_VISITED_POS;
1586 next = regnext(scan);
1587 nscan = NEXTOPER(NEXTOPER(scan));
1588 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1591 vFAIL("Variable length lookbehind not implemented");
1593 else if (minnext > U8_MAX) {
1594 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1596 scan->flags = (U8)minnext;
1598 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1600 if (data && (data_fake.flags & SF_HAS_EVAL))
1601 data->flags |= SF_HAS_EVAL;
1603 data->whilem_c = data_fake.whilem_c;
1604 if (f & SCF_DO_STCLASS_AND) {
1605 int was = (data->start_class->flags & ANYOF_EOS);
1607 cl_and(data->start_class, &intrnl);
1609 data->start_class->flags |= ANYOF_EOS;
1612 else if (OP(scan) == OPEN) {
1615 else if (OP(scan) == CLOSE) {
1616 if ((I32)ARG(scan) == is_par) {
1617 next = regnext(scan);
1619 if ( next && (OP(next) != WHILEM) && next < last)
1620 is_par = 0; /* Disable optimization */
1623 *(data->last_closep) = ARG(scan);
1625 else if (OP(scan) == EVAL) {
1627 data->flags |= SF_HAS_EVAL;
1629 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1630 if (flags & SCF_DO_SUBSTR) {
1631 scan_commit(pRExC_state,data);
1632 data->longest = &(data->longest_float);
1634 is_inf = is_inf_internal = 1;
1635 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1636 cl_anything(pRExC_state, data->start_class);
1637 flags &= ~SCF_DO_STCLASS;
1639 /* Else: zero-length, ignore. */
1640 scan = regnext(scan);
1645 *deltap = is_inf_internal ? I32_MAX : delta;
1646 if (flags & SCF_DO_SUBSTR && is_inf)
1647 data->pos_delta = I32_MAX - data->pos_min;
1648 if (is_par > U8_MAX)
1650 if (is_par && pars==1 && data) {
1651 data->flags |= SF_IN_PAR;
1652 data->flags &= ~SF_HAS_PAR;
1654 else if (pars && data) {
1655 data->flags |= SF_HAS_PAR;
1656 data->flags &= ~SF_IN_PAR;
1658 if (flags & SCF_DO_STCLASS_OR)
1659 cl_and(data->start_class, &and_with);
1664 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1666 if (RExC_rx->data) {
1667 Renewc(RExC_rx->data,
1668 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1669 char, struct reg_data);
1670 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1671 RExC_rx->data->count += n;
1674 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1675 char, struct reg_data);
1676 New(1208, RExC_rx->data->what, n, U8);
1677 RExC_rx->data->count = n;
1679 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1680 return RExC_rx->data->count - n;
1684 Perl_reginitcolors(pTHX)
1687 char *s = PerlEnv_getenv("PERL_RE_COLORS");
1690 PL_colors[0] = s = savepv(s);
1692 s = strchr(s, '\t');
1698 PL_colors[i] = s = "";
1702 PL_colors[i++] = "";
1709 - pregcomp - compile a regular expression into internal code
1711 * We can't allocate space until we know how big the compiled form will be,
1712 * but we can't compile it (and thus know how big it is) until we've got a
1713 * place to put the code. So we cheat: we compile it twice, once with code
1714 * generation turned off and size counting turned on, and once "for real".
1715 * This also means that we don't allocate space until we are sure that the
1716 * thing really will compile successfully, and we never have to move the
1717 * code and thus invalidate pointers into it. (Note that it has to be in
1718 * one piece because free() must be able to free it all.) [NB: not true in perl]
1720 * Beware that the optimization-preparation code in here knows about some
1721 * of the structure of the compiled regexp. [I'll say.]
1724 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1734 RExC_state_t RExC_state;
1735 RExC_state_t *pRExC_state = &RExC_state;
1738 FAIL("NULL regexp argument");
1740 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1744 if (!PL_colorset) reginitcolors();
1745 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1746 PL_colors[4],PL_colors[5],PL_colors[0],
1747 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1749 RExC_flags = pm->op_pmflags;
1753 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1754 RExC_seen_evals = 0;
1757 /* First pass: determine size, legality. */
1764 RExC_emit = &PL_regdummy;
1765 RExC_whilem_seen = 0;
1766 #if 0 /* REGC() is (currently) a NOP at the first pass.
1767 * Clever compilers notice this and complain. --jhi */
1768 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1770 if (reg(pRExC_state, 0, &flags) == NULL) {
1771 RExC_precomp = Nullch;
1774 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1776 /* Small enough for pointer-storage convention?
1777 If extralen==0, this means that we will not need long jumps. */
1778 if (RExC_size >= 0x10000L && RExC_extralen)
1779 RExC_size += RExC_extralen;
1782 if (RExC_whilem_seen > 15)
1783 RExC_whilem_seen = 15;
1785 /* Allocate space and initialize. */
1786 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1789 FAIL("Regexp out of space");
1792 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1793 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1796 r->prelen = xend - exp;
1797 r->precomp = savepvn(RExC_precomp, r->prelen);
1799 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1800 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1802 r->substrs = 0; /* Useful during FAIL. */
1803 r->startp = 0; /* Useful during FAIL. */
1804 r->endp = 0; /* Useful during FAIL. */
1806 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1808 r->offsets[0] = RExC_size;
1810 DEBUG_r(PerlIO_printf(Perl_debug_log,
1811 "%s %"UVuf" bytes for offset annotations.\n",
1812 r->offsets ? "Got" : "Couldn't get",
1813 (UV)((2*RExC_size+1) * sizeof(U32))));
1817 /* Second pass: emit code. */
1818 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
1823 RExC_emit_start = r->program;
1824 RExC_emit = r->program;
1825 /* Store the count of eval-groups for security checks: */
1826 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1827 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1829 if (reg(pRExC_state, 0, &flags) == NULL)
1832 /* Dig out information for optimizations. */
1833 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1834 pm->op_pmflags = RExC_flags;
1836 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1837 r->regstclass = NULL;
1838 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1839 r->reganch |= ROPT_NAUGHTY;
1840 scan = r->program + 1; /* First BRANCH. */
1842 /* XXXX To minimize changes to RE engine we always allocate
1843 3-units-long substrs field. */
1844 Newz(1004, r->substrs, 1, struct reg_substr_data);
1846 StructCopy(&zero_scan_data, &data, scan_data_t);
1847 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1848 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1850 STRLEN longest_float_length, longest_fixed_length;
1851 struct regnode_charclass_class ch_class;
1856 /* Skip introductions and multiplicators >= 1. */
1857 while ((OP(first) == OPEN && (sawopen = 1)) ||
1858 /* An OR of *one* alternative - should not happen now. */
1859 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1860 (OP(first) == PLUS) ||
1861 (OP(first) == MINMOD) ||
1862 /* An {n,m} with n>0 */
1863 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1864 if (OP(first) == PLUS)
1867 first += regarglen[(U8)OP(first)];
1868 first = NEXTOPER(first);
1871 /* Starting-point info. */
1873 if (PL_regkind[(U8)OP(first)] == EXACT) {
1874 if (OP(first) == EXACT)
1875 ; /* Empty, get anchored substr later. */
1876 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1877 r->regstclass = first;
1879 else if (strchr((char*)PL_simple,OP(first)))
1880 r->regstclass = first;
1881 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1882 PL_regkind[(U8)OP(first)] == NBOUND)
1883 r->regstclass = first;
1884 else if (PL_regkind[(U8)OP(first)] == BOL) {
1885 r->reganch |= (OP(first) == MBOL
1887 : (OP(first) == SBOL
1890 first = NEXTOPER(first);
1893 else if (OP(first) == GPOS) {
1894 r->reganch |= ROPT_ANCH_GPOS;
1895 first = NEXTOPER(first);
1898 else if (!sawopen && (OP(first) == STAR &&
1899 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1900 !(r->reganch & ROPT_ANCH) )
1902 /* turn .* into ^.* with an implied $*=1 */
1903 int type = OP(NEXTOPER(first));
1905 if (type == REG_ANY)
1906 type = ROPT_ANCH_MBOL;
1908 type = ROPT_ANCH_SBOL;
1910 r->reganch |= type | ROPT_IMPLICIT;
1911 first = NEXTOPER(first);
1914 if (sawplus && (!sawopen || !RExC_sawback)
1915 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1916 /* x+ must match at the 1st pos of run of x's */
1917 r->reganch |= ROPT_SKIP;
1919 /* Scan is after the zeroth branch, first is atomic matcher. */
1920 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1921 (IV)(first - scan + 1)));
1923 * If there's something expensive in the r.e., find the
1924 * longest literal string that must appear and make it the
1925 * regmust. Resolve ties in favor of later strings, since
1926 * the regstart check works with the beginning of the r.e.
1927 * and avoiding duplication strengthens checking. Not a
1928 * strong reason, but sufficient in the absence of others.
1929 * [Now we resolve ties in favor of the earlier string if
1930 * it happens that c_offset_min has been invalidated, since the
1931 * earlier string may buy us something the later one won't.]
1935 data.longest_fixed = newSVpvn("",0);
1936 data.longest_float = newSVpvn("",0);
1937 data.last_found = newSVpvn("",0);
1938 data.longest = &(data.longest_fixed);
1940 if (!r->regstclass) {
1941 cl_init(pRExC_state, &ch_class);
1942 data.start_class = &ch_class;
1943 stclass_flag = SCF_DO_STCLASS_AND;
1944 } else /* XXXX Check for BOUND? */
1946 data.last_closep = &last_close;
1948 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1949 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1950 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1951 && data.last_start_min == 0 && data.last_end > 0
1952 && !RExC_seen_zerolen
1953 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1954 r->reganch |= ROPT_CHECK_ALL;
1955 scan_commit(pRExC_state, &data);
1956 SvREFCNT_dec(data.last_found);
1958 longest_float_length = CHR_SVLEN(data.longest_float);
1959 if (longest_float_length
1960 || (data.flags & SF_FL_BEFORE_EOL
1961 && (!(data.flags & SF_FL_BEFORE_MEOL)
1962 || (RExC_flags & PMf_MULTILINE)))) {
1965 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1966 && data.offset_fixed == data.offset_float_min
1967 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1968 goto remove_float; /* As in (a)+. */
1970 if (SvUTF8(data.longest_float)) {
1971 r->float_utf8 = data.longest_float;
1972 r->float_substr = Nullsv;
1974 r->float_substr = data.longest_float;
1975 r->float_utf8 = Nullsv;
1977 r->float_min_offset = data.offset_float_min;
1978 r->float_max_offset = data.offset_float_max;
1979 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1980 && (!(data.flags & SF_FL_BEFORE_MEOL)
1981 || (RExC_flags & PMf_MULTILINE)));
1982 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1986 r->float_substr = r->float_utf8 = Nullsv;
1987 SvREFCNT_dec(data.longest_float);
1988 longest_float_length = 0;
1991 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1992 if (longest_fixed_length
1993 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1994 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1995 || (RExC_flags & PMf_MULTILINE)))) {
1998 if (SvUTF8(data.longest_fixed)) {
1999 r->anchored_utf8 = data.longest_fixed;
2000 r->anchored_substr = Nullsv;
2002 r->anchored_substr = data.longest_fixed;
2003 r->anchored_utf8 = Nullsv;
2005 r->anchored_offset = data.offset_fixed;
2006 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2007 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2008 || (RExC_flags & PMf_MULTILINE)));
2009 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2012 r->anchored_substr = r->anchored_utf8 = Nullsv;
2013 SvREFCNT_dec(data.longest_fixed);
2014 longest_fixed_length = 0;
2017 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2018 r->regstclass = NULL;
2019 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2021 && !(data.start_class->flags & ANYOF_EOS)
2022 && !cl_is_anything(data.start_class))
2024 I32 n = add_data(pRExC_state, 1, "f");
2026 New(1006, RExC_rx->data->data[n], 1,
2027 struct regnode_charclass_class);
2028 StructCopy(data.start_class,
2029 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2030 struct regnode_charclass_class);
2031 r->regstclass = (regnode*)RExC_rx->data->data[n];
2032 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2033 PL_regdata = r->data; /* for regprop() */
2034 DEBUG_r({ SV *sv = sv_newmortal();
2035 regprop(sv, (regnode*)data.start_class);
2036 PerlIO_printf(Perl_debug_log,
2037 "synthetic stclass `%s'.\n",
2041 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2042 if (longest_fixed_length > longest_float_length) {
2043 r->check_substr = r->anchored_substr;
2044 r->check_utf8 = r->anchored_utf8;
2045 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2046 if (r->reganch & ROPT_ANCH_SINGLE)
2047 r->reganch |= ROPT_NOSCAN;
2050 r->check_substr = r->float_substr;
2051 r->check_utf8 = r->float_utf8;
2052 r->check_offset_min = data.offset_float_min;
2053 r->check_offset_max = data.offset_float_max;
2055 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2056 This should be changed ASAP! */
2057 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2058 r->reganch |= RE_USE_INTUIT;
2059 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2060 r->reganch |= RE_INTUIT_TAIL;
2064 /* Several toplevels. Best we can is to set minlen. */
2066 struct regnode_charclass_class ch_class;
2069 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2070 scan = r->program + 1;
2071 cl_init(pRExC_state, &ch_class);
2072 data.start_class = &ch_class;
2073 data.last_closep = &last_close;
2074 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2075 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2076 = r->float_substr = r->float_utf8 = Nullsv;
2077 if (!(data.start_class->flags & ANYOF_EOS)
2078 && !cl_is_anything(data.start_class))
2080 I32 n = add_data(pRExC_state, 1, "f");
2082 New(1006, RExC_rx->data->data[n], 1,
2083 struct regnode_charclass_class);
2084 StructCopy(data.start_class,
2085 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2086 struct regnode_charclass_class);
2087 r->regstclass = (regnode*)RExC_rx->data->data[n];
2088 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2089 DEBUG_r({ SV* sv = sv_newmortal();
2090 regprop(sv, (regnode*)data.start_class);
2091 PerlIO_printf(Perl_debug_log,
2092 "synthetic stclass `%s'.\n",
2098 if (RExC_seen & REG_SEEN_GPOS)
2099 r->reganch |= ROPT_GPOS_SEEN;
2100 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2101 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2102 if (RExC_seen & REG_SEEN_EVAL)
2103 r->reganch |= ROPT_EVAL_SEEN;
2104 if (RExC_seen & REG_SEEN_CANY)
2105 r->reganch |= ROPT_CANY_SEEN;
2106 Newz(1002, r->startp, RExC_npar, I32);
2107 Newz(1002, r->endp, RExC_npar, I32);
2108 PL_regdata = r->data; /* for regprop() */
2109 DEBUG_r(regdump(r));
2114 - reg - regular expression, i.e. main body or parenthesized thing
2116 * Caller must absorb opening parenthesis.
2118 * Combining parenthesis handling with the base level of regular expression
2119 * is a trifle forced, but the need to tie the tails of the branches to what
2120 * follows makes it hard to avoid.
2123 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2124 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2126 register regnode *ret; /* Will be the head of the group. */
2127 register regnode *br;
2128 register regnode *lastbr;
2129 register regnode *ender = 0;
2130 register I32 parno = 0;
2131 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2133 /* for (?g), (?gc), and (?o) warnings; warning
2134 about (?c) will warn about (?g) -- japhy */
2136 I32 wastedflags = 0x00,
2139 wasted_gc = 0x02 | 0x04,
2142 char * parse_start = RExC_parse; /* MJD */
2143 char *oregcomp_parse = RExC_parse;
2146 *flagp = 0; /* Tentatively. */
2149 /* Make an OPEN node, if parenthesized. */
2151 if (*RExC_parse == '?') { /* (?...) */
2152 U32 posflags = 0, negflags = 0;
2153 U32 *flagsp = &posflags;
2155 char *seqstart = RExC_parse;
2158 paren = *RExC_parse++;
2159 ret = NULL; /* For look-ahead/behind. */
2161 case '<': /* (?<...) */
2162 RExC_seen |= REG_SEEN_LOOKBEHIND;
2163 if (*RExC_parse == '!')
2165 if (*RExC_parse != '=' && *RExC_parse != '!')
2168 case '=': /* (?=...) */
2169 case '!': /* (?!...) */
2170 RExC_seen_zerolen++;
2171 case ':': /* (?:...) */
2172 case '>': /* (?>...) */
2174 case '$': /* (?$...) */
2175 case '@': /* (?@...) */
2176 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2178 case '#': /* (?#...) */
2179 while (*RExC_parse && *RExC_parse != ')')
2181 if (*RExC_parse != ')')
2182 FAIL("Sequence (?#... not terminated");
2183 nextchar(pRExC_state);
2186 case 'p': /* (?p...) */
2187 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2188 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2190 case '?': /* (??...) */
2192 if (*RExC_parse != '{')
2194 paren = *RExC_parse++;
2196 case '{': /* (?{...}) */
2198 I32 count = 1, n = 0;
2200 char *s = RExC_parse;
2202 OP_4tree *sop, *rop;
2204 RExC_seen_zerolen++;
2205 RExC_seen |= REG_SEEN_EVAL;
2206 while (count && (c = *RExC_parse)) {
2207 if (c == '\\' && RExC_parse[1])
2215 if (*RExC_parse != ')')
2218 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2223 if (RExC_parse - 1 - s)
2224 sv = newSVpvn(s, RExC_parse - 1 - s);
2226 sv = newSVpvn("", 0);
2229 Perl_save_re_context(aTHX);
2230 rop = sv_compile_2op(sv, &sop, "re", &pad);
2231 sop->op_private |= OPpREFCOUNTED;
2232 /* re_dup will OpREFCNT_inc */
2233 OpREFCNT_set(sop, 1);
2236 n = add_data(pRExC_state, 3, "nop");
2237 RExC_rx->data->data[n] = (void*)rop;
2238 RExC_rx->data->data[n+1] = (void*)sop;
2239 RExC_rx->data->data[n+2] = (void*)pad;
2242 else { /* First pass */
2243 if (PL_reginterp_cnt < ++RExC_seen_evals
2244 && PL_curcop != &PL_compiling)
2245 /* No compiled RE interpolated, has runtime
2246 components ===> unsafe. */
2247 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2248 if (PL_tainting && PL_tainted)
2249 FAIL("Eval-group in insecure regular expression");
2252 nextchar(pRExC_state);
2254 ret = reg_node(pRExC_state, LOGICAL);
2257 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2258 /* deal with the length of this later - MJD */
2261 return reganode(pRExC_state, EVAL, n);
2263 case '(': /* (?(?{...})...) and (?(?=...)...) */
2265 if (RExC_parse[0] == '?') { /* (?(?...)) */
2266 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2267 || RExC_parse[1] == '<'
2268 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2271 ret = reg_node(pRExC_state, LOGICAL);
2274 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2278 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2280 parno = atoi(RExC_parse++);
2282 while (isDIGIT(*RExC_parse))
2284 ret = reganode(pRExC_state, GROUPP, parno);
2286 if ((c = *nextchar(pRExC_state)) != ')')
2287 vFAIL("Switch condition not recognized");
2289 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2290 br = regbranch(pRExC_state, &flags, 1);
2292 br = reganode(pRExC_state, LONGJMP, 0);
2294 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2295 c = *nextchar(pRExC_state);
2299 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2300 regbranch(pRExC_state, &flags, 1);
2301 regtail(pRExC_state, ret, lastbr);
2304 c = *nextchar(pRExC_state);
2309 vFAIL("Switch (?(condition)... contains too many branches");
2310 ender = reg_node(pRExC_state, TAIL);
2311 regtail(pRExC_state, br, ender);
2313 regtail(pRExC_state, lastbr, ender);
2314 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2317 regtail(pRExC_state, ret, ender);
2321 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2325 RExC_parse--; /* for vFAIL to print correctly */
2326 vFAIL("Sequence (? incomplete");
2330 parse_flags: /* (?i) */
2331 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2332 /* (?g), (?gc) and (?o) are useless here
2333 and must be globally applied -- japhy */
2335 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2336 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2337 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2338 if (! (wastedflags & wflagbit) ) {
2339 wastedflags |= wflagbit;
2342 "Useless (%s%c) - %suse /%c modifier",
2343 flagsp == &negflags ? "?-" : "?",
2345 flagsp == &negflags ? "don't " : "",
2351 else if (*RExC_parse == 'c') {
2352 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2353 if (! (wastedflags & wasted_c) ) {
2354 wastedflags |= wasted_gc;
2357 "Useless (%sc) - %suse /gc modifier",
2358 flagsp == &negflags ? "?-" : "?",
2359 flagsp == &negflags ? "don't " : ""
2364 else { pmflag(flagsp, *RExC_parse); }
2368 if (*RExC_parse == '-') {
2370 wastedflags = 0; /* reset so (?g-c) warns twice */
2374 RExC_flags |= posflags;
2375 RExC_flags &= ~negflags;
2376 if (*RExC_parse == ':') {
2382 if (*RExC_parse != ')') {
2384 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2386 nextchar(pRExC_state);
2394 ret = reganode(pRExC_state, OPEN, parno);
2395 Set_Node_Length(ret, 1); /* MJD */
2396 Set_Node_Offset(ret, RExC_parse); /* MJD */
2403 /* Pick up the branches, linking them together. */
2404 parse_start = RExC_parse; /* MJD */
2405 br = regbranch(pRExC_state, &flags, 1);
2406 /* branch_len = (paren != 0); */
2410 if (*RExC_parse == '|') {
2411 if (!SIZE_ONLY && RExC_extralen) {
2412 reginsert(pRExC_state, BRANCHJ, br);
2415 reginsert(pRExC_state, BRANCH, br);
2416 Set_Node_Length(br, paren != 0);
2417 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2421 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2423 else if (paren == ':') {
2424 *flagp |= flags&SIMPLE;
2426 if (open) { /* Starts with OPEN. */
2427 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2429 else if (paren != '?') /* Not Conditional */
2431 *flagp |= flags & (SPSTART | HASWIDTH);
2433 while (*RExC_parse == '|') {
2434 if (!SIZE_ONLY && RExC_extralen) {
2435 ender = reganode(pRExC_state, LONGJMP,0);
2436 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2439 RExC_extralen += 2; /* Account for LONGJMP. */
2440 nextchar(pRExC_state);
2441 br = regbranch(pRExC_state, &flags, 0);
2445 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2449 *flagp |= flags&SPSTART;
2452 if (have_branch || paren != ':') {
2453 /* Make a closing node, and hook it on the end. */
2456 ender = reg_node(pRExC_state, TAIL);
2459 ender = reganode(pRExC_state, CLOSE, parno);
2460 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2461 Set_Node_Length(ender,1); /* MJD */
2467 *flagp &= ~HASWIDTH;
2470 ender = reg_node(pRExC_state, SUCCEED);
2473 ender = reg_node(pRExC_state, END);
2476 regtail(pRExC_state, lastbr, ender);
2479 /* Hook the tails of the branches to the closing node. */
2480 for (br = ret; br != NULL; br = regnext(br)) {
2481 regoptail(pRExC_state, br, ender);
2488 static char parens[] = "=!<,>";
2490 if (paren && (p = strchr(parens, paren))) {
2491 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2492 int flag = (p - parens) > 1;
2495 node = SUSPEND, flag = 0;
2496 reginsert(pRExC_state, node,ret);
2498 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2502 /* Check for proper termination. */
2504 RExC_flags = oregflags;
2505 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2506 RExC_parse = oregcomp_parse;
2507 vFAIL("Unmatched (");
2510 else if (!paren && RExC_parse < RExC_end) {
2511 if (*RExC_parse == ')') {
2513 vFAIL("Unmatched )");
2516 FAIL("Junk on end of regexp"); /* "Can't happen". */
2524 - regbranch - one alternative of an | operator
2526 * Implements the concatenation operator.
2529 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2531 register regnode *ret;
2532 register regnode *chain = NULL;
2533 register regnode *latest;
2534 I32 flags = 0, c = 0;
2539 if (!SIZE_ONLY && RExC_extralen)
2540 ret = reganode(pRExC_state, BRANCHJ,0);
2542 ret = reg_node(pRExC_state, BRANCH);
2543 Set_Node_Length(ret, 1);
2547 if (!first && SIZE_ONLY)
2548 RExC_extralen += 1; /* BRANCHJ */
2550 *flagp = WORST; /* Tentatively. */
2553 nextchar(pRExC_state);
2554 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2556 latest = regpiece(pRExC_state, &flags);
2557 if (latest == NULL) {
2558 if (flags & TRYAGAIN)
2562 else if (ret == NULL)
2564 *flagp |= flags&HASWIDTH;
2565 if (chain == NULL) /* First piece. */
2566 *flagp |= flags&SPSTART;
2569 regtail(pRExC_state, chain, latest);
2574 if (chain == NULL) { /* Loop ran zero times. */
2575 chain = reg_node(pRExC_state, NOTHING);
2580 *flagp |= flags&SIMPLE;
2587 - regpiece - something followed by possible [*+?]
2589 * Note that the branching code sequences used for ? and the general cases
2590 * of * and + are somewhat optimized: they use the same NOTHING node as
2591 * both the endmarker for their branch list and the body of the last branch.
2592 * It might seem that this node could be dispensed with entirely, but the
2593 * endmarker role is not redundant.
2596 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2598 register regnode *ret;
2600 register char *next;
2602 char *origparse = RExC_parse;
2605 I32 max = REG_INFTY;
2608 ret = regatom(pRExC_state, &flags);
2610 if (flags & TRYAGAIN)
2617 if (op == '{' && regcurly(RExC_parse)) {
2618 parse_start = RExC_parse; /* MJD */
2619 next = RExC_parse + 1;
2621 while (isDIGIT(*next) || *next == ',') {
2630 if (*next == '}') { /* got one */
2634 min = atoi(RExC_parse);
2638 maxpos = RExC_parse;
2640 if (!max && *maxpos != '0')
2641 max = REG_INFTY; /* meaning "infinity" */
2642 else if (max >= REG_INFTY)
2643 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2645 nextchar(pRExC_state);
2648 if ((flags&SIMPLE)) {
2649 RExC_naughty += 2 + RExC_naughty / 2;
2650 reginsert(pRExC_state, CURLY, ret);
2651 Set_Node_Offset(ret, parse_start+1); /* MJD */
2652 Set_Node_Cur_Length(ret);
2655 regnode *w = reg_node(pRExC_state, WHILEM);
2658 regtail(pRExC_state, ret, w);
2659 if (!SIZE_ONLY && RExC_extralen) {
2660 reginsert(pRExC_state, LONGJMP,ret);
2661 reginsert(pRExC_state, NOTHING,ret);
2662 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2664 reginsert(pRExC_state, CURLYX,ret);
2666 Set_Node_Offset(ret, parse_start+1);
2667 Set_Node_Length(ret,
2668 op == '{' ? (RExC_parse - parse_start) : 1);
2670 if (!SIZE_ONLY && RExC_extralen)
2671 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2672 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2674 RExC_whilem_seen++, RExC_extralen += 3;
2675 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2683 if (max && max < min)
2684 vFAIL("Can't do {n,m} with n > m");
2686 ARG1_SET(ret, (U16)min);
2687 ARG2_SET(ret, (U16)max);
2699 #if 0 /* Now runtime fix should be reliable. */
2701 /* if this is reinstated, don't forget to put this back into perldiag:
2703 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2705 (F) The part of the regexp subject to either the * or + quantifier
2706 could match an empty string. The {#} shows in the regular
2707 expression about where the problem was discovered.
2711 if (!(flags&HASWIDTH) && op != '?')
2712 vFAIL("Regexp *+ operand could be empty");
2715 parse_start = RExC_parse;
2716 nextchar(pRExC_state);
2718 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2720 if (op == '*' && (flags&SIMPLE)) {
2721 reginsert(pRExC_state, STAR, ret);
2725 else if (op == '*') {
2729 else if (op == '+' && (flags&SIMPLE)) {
2730 reginsert(pRExC_state, PLUS, ret);
2734 else if (op == '+') {
2738 else if (op == '?') {
2743 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2745 "%.*s matches null string many times",
2746 RExC_parse - origparse,
2750 if (*RExC_parse == '?') {
2751 nextchar(pRExC_state);
2752 reginsert(pRExC_state, MINMOD, ret);
2753 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2755 if (ISMULT2(RExC_parse)) {
2757 vFAIL("Nested quantifiers");
2764 - regatom - the lowest level
2766 * Optimization: gobbles an entire sequence of ordinary characters so that
2767 * it can turn them into a single node, which is smaller to store and
2768 * faster to run. Backslashed characters are exceptions, each becoming a
2769 * separate node; the code is simpler that way and it's not worth fixing.
2771 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2773 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2775 register regnode *ret = 0;
2777 char *parse_start = 0;
2779 *flagp = WORST; /* Tentatively. */
2782 switch (*RExC_parse) {
2784 RExC_seen_zerolen++;
2785 nextchar(pRExC_state);
2786 if (RExC_flags & PMf_MULTILINE)
2787 ret = reg_node(pRExC_state, MBOL);
2788 else if (RExC_flags & PMf_SINGLELINE)
2789 ret = reg_node(pRExC_state, SBOL);
2791 ret = reg_node(pRExC_state, BOL);
2792 Set_Node_Length(ret, 1); /* MJD */
2795 nextchar(pRExC_state);
2797 RExC_seen_zerolen++;
2798 if (RExC_flags & PMf_MULTILINE)
2799 ret = reg_node(pRExC_state, MEOL);
2800 else if (RExC_flags & PMf_SINGLELINE)
2801 ret = reg_node(pRExC_state, SEOL);
2803 ret = reg_node(pRExC_state, EOL);
2804 Set_Node_Length(ret, 1); /* MJD */
2807 nextchar(pRExC_state);
2808 if (RExC_flags & PMf_SINGLELINE)
2809 ret = reg_node(pRExC_state, SANY);
2811 ret = reg_node(pRExC_state, REG_ANY);
2812 *flagp |= HASWIDTH|SIMPLE;
2814 Set_Node_Length(ret, 1); /* MJD */
2818 char *oregcomp_parse = ++RExC_parse;
2819 ret = regclass(pRExC_state);
2820 if (*RExC_parse != ']') {
2821 RExC_parse = oregcomp_parse;
2822 vFAIL("Unmatched [");
2824 nextchar(pRExC_state);
2825 *flagp |= HASWIDTH|SIMPLE;
2826 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2830 nextchar(pRExC_state);
2831 ret = reg(pRExC_state, 1, &flags);
2833 if (flags & TRYAGAIN) {
2834 if (RExC_parse == RExC_end) {
2835 /* Make parent create an empty node if needed. */
2843 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2847 if (flags & TRYAGAIN) {
2851 vFAIL("Internal urp");
2852 /* Supposed to be caught earlier. */
2855 if (!regcurly(RExC_parse)) {
2864 vFAIL("Quantifier follows nothing");
2867 switch (*++RExC_parse) {
2869 RExC_seen_zerolen++;
2870 ret = reg_node(pRExC_state, SBOL);
2872 nextchar(pRExC_state);
2873 Set_Node_Length(ret, 2); /* MJD */
2876 ret = reg_node(pRExC_state, GPOS);
2877 RExC_seen |= REG_SEEN_GPOS;
2879 nextchar(pRExC_state);
2880 Set_Node_Length(ret, 2); /* MJD */
2883 ret = reg_node(pRExC_state, SEOL);
2885 RExC_seen_zerolen++; /* Do not optimize RE away */
2886 nextchar(pRExC_state);
2889 ret = reg_node(pRExC_state, EOS);
2891 RExC_seen_zerolen++; /* Do not optimize RE away */
2892 nextchar(pRExC_state);
2893 Set_Node_Length(ret, 2); /* MJD */
2896 ret = reg_node(pRExC_state, CANY);
2897 RExC_seen |= REG_SEEN_CANY;
2898 *flagp |= HASWIDTH|SIMPLE;
2899 nextchar(pRExC_state);
2900 Set_Node_Length(ret, 2); /* MJD */
2903 ret = reg_node(pRExC_state, CLUMP);
2905 nextchar(pRExC_state);
2906 Set_Node_Length(ret, 2); /* MJD */
2909 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
2910 *flagp |= HASWIDTH|SIMPLE;
2911 nextchar(pRExC_state);
2912 Set_Node_Length(ret, 2); /* MJD */
2915 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
2916 *flagp |= HASWIDTH|SIMPLE;
2917 nextchar(pRExC_state);
2918 Set_Node_Length(ret, 2); /* MJD */
2921 RExC_seen_zerolen++;
2922 RExC_seen |= REG_SEEN_LOOKBEHIND;
2923 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
2925 nextchar(pRExC_state);
2926 Set_Node_Length(ret, 2); /* MJD */
2929 RExC_seen_zerolen++;
2930 RExC_seen |= REG_SEEN_LOOKBEHIND;
2931 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
2933 nextchar(pRExC_state);
2934 Set_Node_Length(ret, 2); /* MJD */
2937 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
2938 *flagp |= HASWIDTH|SIMPLE;
2939 nextchar(pRExC_state);
2940 Set_Node_Length(ret, 2); /* MJD */
2943 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
2944 *flagp |= HASWIDTH|SIMPLE;
2945 nextchar(pRExC_state);
2946 Set_Node_Length(ret, 2); /* MJD */
2949 ret = reg_node(pRExC_state, DIGIT);
2950 *flagp |= HASWIDTH|SIMPLE;
2951 nextchar(pRExC_state);
2952 Set_Node_Length(ret, 2); /* MJD */
2955 ret = reg_node(pRExC_state, NDIGIT);
2956 *flagp |= HASWIDTH|SIMPLE;
2957 nextchar(pRExC_state);
2958 Set_Node_Length(ret, 2); /* MJD */
2963 char* oldregxend = RExC_end;
2964 char* parse_start = RExC_parse;
2966 if (RExC_parse[1] == '{') {
2967 /* a lovely hack--pretend we saw [\pX] instead */
2968 RExC_end = strchr(RExC_parse, '}');
2970 U8 c = (U8)*RExC_parse;
2972 RExC_end = oldregxend;
2973 vFAIL2("Missing right brace on \\%c{}", c);
2978 RExC_end = RExC_parse + 2;
2979 if (RExC_end > oldregxend)
2980 RExC_end = oldregxend;
2984 ret = regclass(pRExC_state);
2986 RExC_end = oldregxend;
2988 Set_Node_Cur_Length(ret); /* MJD */
2989 nextchar(pRExC_state);
2990 *flagp |= HASWIDTH|SIMPLE;
3003 case '1': case '2': case '3': case '4':
3004 case '5': case '6': case '7': case '8': case '9':
3006 I32 num = atoi(RExC_parse);
3008 if (num > 9 && num >= RExC_npar)
3011 char * parse_start = RExC_parse - 1; /* MJD */
3012 while (isDIGIT(*RExC_parse))
3015 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3016 vFAIL("Reference to nonexistent group");
3018 ret = reganode(pRExC_state,
3019 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3023 /* override incorrect value set in reganode MJD */
3024 Set_Node_Offset(ret, parse_start+1);
3025 Set_Node_Cur_Length(ret); /* MJD */
3027 nextchar(pRExC_state);
3032 if (RExC_parse >= RExC_end)
3033 FAIL("Trailing \\");
3036 /* Do not generate `unrecognized' warnings here, we fall
3037 back into the quick-grab loop below */
3043 if (RExC_flags & PMf_EXTENDED) {
3044 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3045 if (RExC_parse < RExC_end)
3051 register STRLEN len;
3057 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3059 parse_start = RExC_parse - 1;
3065 ret = reg_node(pRExC_state,
3066 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3068 for (len = 0, p = RExC_parse - 1;
3069 len < 127 && p < RExC_end;
3074 if (RExC_flags & PMf_EXTENDED)
3075 p = regwhite(p, RExC_end);
3122 ender = ASCII_TO_NATIVE('\033');
3126 ender = ASCII_TO_NATIVE('\007');
3131 char* e = strchr(p, '}');
3135 vFAIL("Missing right brace on \\x{}");
3138 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3139 | PERL_SCAN_DISALLOW_PREFIX;
3141 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3144 /* numlen is generous */
3145 if (numlen + len >= 127) {
3153 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3155 ender = grok_hex(p, &numlen, &flags, NULL);
3161 ender = UCHARAT(p++);
3162 ender = toCTRL(ender);
3164 case '0': case '1': case '2': case '3':case '4':
3165 case '5': case '6': case '7': case '8':case '9':
3167 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3170 ender = grok_oct(p, &numlen, &flags, NULL);
3180 FAIL("Trailing \\");
3183 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3184 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3185 goto normal_default;
3190 if (UTF8_IS_START(*p) && UTF) {
3191 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3199 if (RExC_flags & PMf_EXTENDED)
3200 p = regwhite(p, RExC_end);
3202 /* Prime the casefolded buffer. */
3203 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3205 if (ISMULT2(p)) { /* Back off on ?+*. */
3212 /* Emit all the Unicode characters. */
3213 for (foldbuf = tmpbuf;
3215 foldlen -= numlen) {
3216 ender = utf8_to_uvchr(foldbuf, &numlen);
3218 reguni(pRExC_state, ender, s, &unilen);
3221 /* In EBCDIC the numlen
3222 * and unilen can differ. */
3224 if (numlen >= foldlen)
3228 break; /* "Can't happen." */
3232 reguni(pRExC_state, ender, s, &unilen);
3241 REGC((char)ender, s++);
3249 /* Emit all the Unicode characters. */
3250 for (foldbuf = tmpbuf;
3252 foldlen -= numlen) {
3253 ender = utf8_to_uvchr(foldbuf, &numlen);
3255 reguni(pRExC_state, ender, s, &unilen);
3258 /* In EBCDIC the numlen
3259 * and unilen can differ. */
3261 if (numlen >= foldlen)
3269 reguni(pRExC_state, ender, s, &unilen);
3278 REGC((char)ender, s++);
3282 Set_Node_Cur_Length(ret); /* MJD */
3283 nextchar(pRExC_state);
3285 /* len is STRLEN which is unsigned, need to copy to signed */
3288 vFAIL("Internal disaster");
3297 RExC_size += STR_SZ(len);
3299 RExC_emit += STR_SZ(len);
3304 /* If the encoding pragma is in effect recode the text of
3305 * any EXACT-kind nodes. */
3306 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3307 STRLEN oldlen = STR_LEN(ret);
3308 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3312 if (sv_utf8_downgrade(sv, TRUE)) {
3313 char *s = sv_recode_to_utf8(sv, PL_encoding);
3314 STRLEN newlen = SvCUR(sv);
3319 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3320 (int)oldlen, STRING(ret),
3322 Copy(s, STRING(ret), newlen, char);
3323 STR_LEN(ret) += newlen - oldlen;
3324 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3326 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3334 S_regwhite(pTHX_ char *p, char *e)
3339 else if (*p == '#') {
3342 } while (p < e && *p != '\n');
3350 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3351 Character classes ([:foo:]) can also be negated ([:^foo:]).
3352 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3353 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3354 but trigger failures because they are currently unimplemented. */
3356 #define POSIXCC_DONE(c) ((c) == ':')
3357 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3358 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3361 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3364 I32 namedclass = OOB_NAMEDCLASS;
3366 if (value == '[' && RExC_parse + 1 < RExC_end &&
3367 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3368 POSIXCC(UCHARAT(RExC_parse))) {
3369 char c = UCHARAT(RExC_parse);
3370 char* s = RExC_parse++;
3372 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3374 if (RExC_parse == RExC_end)
3375 /* Grandfather lone [:, [=, [. */
3378 char* t = RExC_parse++; /* skip over the c */
3380 if (UCHARAT(RExC_parse) == ']') {
3381 RExC_parse++; /* skip over the ending ] */
3384 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3385 I32 skip = 5; /* the most common skip */
3389 if (strnEQ(posixcc, "alnum", 5))
3391 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3392 else if (strnEQ(posixcc, "alpha", 5))
3394 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3395 else if (strnEQ(posixcc, "ascii", 5))
3397 complement ? ANYOF_NASCII : ANYOF_ASCII;
3400 if (strnEQ(posixcc, "blank", 5))
3402 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3405 if (strnEQ(posixcc, "cntrl", 5))
3407 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3410 if (strnEQ(posixcc, "digit", 5))
3412 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3415 if (strnEQ(posixcc, "graph", 5))
3417 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3420 if (strnEQ(posixcc, "lower", 5))
3422 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3425 if (strnEQ(posixcc, "print", 5))
3427 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3428 else if (strnEQ(posixcc, "punct", 5))
3430 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3433 if (strnEQ(posixcc, "space", 5))
3435 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3438 if (strnEQ(posixcc, "upper", 5))
3440 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3442 case 'w': /* this is not POSIX, this is the Perl \w */
3443 if (strnEQ(posixcc, "word", 4)) {
3445 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3450 if (strnEQ(posixcc, "xdigit", 6)) {
3452 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3457 if (namedclass == OOB_NAMEDCLASS ||
3458 posixcc[skip] != ':' ||
3459 posixcc[skip+1] != ']')
3461 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3464 } else if (!SIZE_ONLY) {
3465 /* [[=foo=]] and [[.foo.]] are still future. */
3467 /* adjust RExC_parse so the warning shows after
3469 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3471 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3474 /* Maternal grandfather:
3475 * "[:" ending in ":" but not in ":]" */
3485 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3487 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3488 char *s = RExC_parse;
3491 while(*s && isALNUM(*s))
3493 if (*s && c == *s && s[1] == ']') {
3494 if (ckWARN(WARN_REGEXP))
3496 "POSIX syntax [%c %c] belongs inside character classes",
3499 /* [[=foo=]] and [[.foo.]] are still future. */
3500 if (POSIXCC_NOTYET(c)) {
3501 /* adjust RExC_parse so the error shows after
3503 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3505 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3512 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3515 register UV nextvalue;
3516 register IV prevvalue = OOB_UNICODE;
3517 register IV range = 0;
3518 register regnode *ret;
3521 char *rangebegin = 0;
3522 bool need_class = 0;
3523 SV *listsv = Nullsv;
3526 bool optimize_invert = TRUE;
3527 AV* unicode_alternate = 0;
3529 UV literal_endpoint = 0;
3532 ret = reganode(pRExC_state, ANYOF, 0);
3535 ANYOF_FLAGS(ret) = 0;
3537 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3541 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3545 RExC_size += ANYOF_SKIP;
3547 RExC_emit += ANYOF_SKIP;
3549 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3551 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3552 ANYOF_BITMAP_ZERO(ret);
3553 listsv = newSVpvn("# comment\n", 10);
3556 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3558 if (!SIZE_ONLY && POSIXCC(nextvalue))
3559 checkposixcc(pRExC_state);
3561 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3562 if (UCHARAT(RExC_parse) == ']')
3565 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3569 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3572 rangebegin = RExC_parse;
3574 value = utf8n_to_uvchr((U8*)RExC_parse,
3575 RExC_end - RExC_parse,
3577 RExC_parse += numlen;
3580 value = UCHARAT(RExC_parse++);
3581 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3582 if (value == '[' && POSIXCC(nextvalue))
3583 namedclass = regpposixcc(pRExC_state, value);
3584 else if (value == '\\') {
3586 value = utf8n_to_uvchr((U8*)RExC_parse,
3587 RExC_end - RExC_parse,
3589 RExC_parse += numlen;
3592 value = UCHARAT(RExC_parse++);
3593 /* Some compilers cannot handle switching on 64-bit integer
3594 * values, therefore value cannot be an UV. Yes, this will
3595 * be a problem later if we want switch on Unicode.
3596 * A similar issue a little bit later when switching on
3597 * namedclass. --jhi */
3598 switch ((I32)value) {
3599 case 'w': namedclass = ANYOF_ALNUM; break;
3600 case 'W': namedclass = ANYOF_NALNUM; break;
3601 case 's': namedclass = ANYOF_SPACE; break;
3602 case 'S': namedclass = ANYOF_NSPACE; break;
3603 case 'd': namedclass = ANYOF_DIGIT; break;
3604 case 'D': namedclass = ANYOF_NDIGIT; break;
3607 if (RExC_parse >= RExC_end)
3608 vFAIL2("Empty \\%c{}", (U8)value);
3609 if (*RExC_parse == '{') {
3611 e = strchr(RExC_parse++, '}');
3613 vFAIL2("Missing right brace on \\%c{}", c);
3614 while (isSPACE(UCHARAT(RExC_parse)))
3616 if (e == RExC_parse)
3617 vFAIL2("Empty \\%c{}", c);
3619 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3627 if (UCHARAT(RExC_parse) == '^') {
3630 value = value == 'p' ? 'P' : 'p'; /* toggle */
3631 while (isSPACE(UCHARAT(RExC_parse))) {
3637 Perl_sv_catpvf(aTHX_ listsv,
3638 "+utf8::%.*s\n", (int)n, RExC_parse);
3640 Perl_sv_catpvf(aTHX_ listsv,
3641 "!utf8::%.*s\n", (int)n, RExC_parse);
3644 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3646 case 'n': value = '\n'; break;
3647 case 'r': value = '\r'; break;
3648 case 't': value = '\t'; break;
3649 case 'f': value = '\f'; break;
3650 case 'b': value = '\b'; break;
3651 case 'e': value = ASCII_TO_NATIVE('\033');break;
3652 case 'a': value = ASCII_TO_NATIVE('\007');break;
3654 if (*RExC_parse == '{') {
3655 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3656 | PERL_SCAN_DISALLOW_PREFIX;
3657 e = strchr(RExC_parse++, '}');
3659 vFAIL("Missing right brace on \\x{}");
3661 numlen = e - RExC_parse;
3662 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3666 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3668 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3669 RExC_parse += numlen;
3673 value = UCHARAT(RExC_parse++);
3674 value = toCTRL(value);
3676 case '0': case '1': case '2': case '3': case '4':
3677 case '5': case '6': case '7': case '8': case '9':
3681 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3682 RExC_parse += numlen;
3686 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3688 "Unrecognized escape \\%c in character class passed through",
3692 } /* end of \blah */
3698 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3700 if (!SIZE_ONLY && !need_class)
3701 ANYOF_CLASS_ZERO(ret);
3705 /* a bad range like a-\d, a-[:digit:] ? */
3708 if (ckWARN(WARN_REGEXP))
3710 "False [] range \"%*.*s\"",
3711 RExC_parse - rangebegin,
3712 RExC_parse - rangebegin,
3714 if (prevvalue < 256) {
3715 ANYOF_BITMAP_SET(ret, prevvalue);
3716 ANYOF_BITMAP_SET(ret, '-');
3719 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3720 Perl_sv_catpvf(aTHX_ listsv,
3721 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3725 range = 0; /* this was not a true range */
3729 if (namedclass > OOB_NAMEDCLASS)
3730 optimize_invert = FALSE;
3731 /* Possible truncation here but in some 64-bit environments
3732 * the compiler gets heartburn about switch on 64-bit values.
3733 * A similar issue a little earlier when switching on value.
3735 switch ((I32)namedclass) {
3738 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3740 for (value = 0; value < 256; value++)
3742 ANYOF_BITMAP_SET(ret, value);
3744 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
3748 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3750 for (value = 0; value < 256; value++)
3751 if (!isALNUM(value))
3752 ANYOF_BITMAP_SET(ret, value);
3754 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3758 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3760 for (value = 0; value < 256; value++)
3761 if (isALNUMC(value))
3762 ANYOF_BITMAP_SET(ret, value);
3764 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3768 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3770 for (value = 0; value < 256; value++)
3771 if (!isALNUMC(value))
3772 ANYOF_BITMAP_SET(ret, value);
3774 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3778 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3780 for (value = 0; value < 256; value++)
3782 ANYOF_BITMAP_SET(ret, value);
3784 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3788 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3790 for (value = 0; value < 256; value++)
3791 if (!isALPHA(value))
3792 ANYOF_BITMAP_SET(ret, value);
3794 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3798 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3801 for (value = 0; value < 128; value++)
3802 ANYOF_BITMAP_SET(ret, value);
3804 for (value = 0; value < 256; value++) {
3806 ANYOF_BITMAP_SET(ret, value);
3810 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3814 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3817 for (value = 128; value < 256; value++)
3818 ANYOF_BITMAP_SET(ret, value);
3820 for (value = 0; value < 256; value++) {
3821 if (!isASCII(value))
3822 ANYOF_BITMAP_SET(ret, value);
3826 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3830 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3832 for (value = 0; value < 256; value++)
3834 ANYOF_BITMAP_SET(ret, value);
3836 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3840 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3842 for (value = 0; value < 256; value++)
3843 if (!isBLANK(value))
3844 ANYOF_BITMAP_SET(ret, value);
3846 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3850 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3852 for (value = 0; value < 256; value++)
3854 ANYOF_BITMAP_SET(ret, value);
3856 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3860 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3862 for (value = 0; value < 256; value++)
3863 if (!isCNTRL(value))
3864 ANYOF_BITMAP_SET(ret, value);
3866 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3870 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3872 /* consecutive digits assumed */
3873 for (value = '0'; value <= '9'; value++)
3874 ANYOF_BITMAP_SET(ret, value);
3876 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3880 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3882 /* consecutive digits assumed */
3883 for (value = 0; value < '0'; value++)
3884 ANYOF_BITMAP_SET(ret, value);
3885 for (value = '9' + 1; value < 256; value++)
3886 ANYOF_BITMAP_SET(ret, value);
3888 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3892 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3894 for (value = 0; value < 256; value++)
3896 ANYOF_BITMAP_SET(ret, value);
3898 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3902 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3904 for (value = 0; value < 256; value++)
3905 if (!isGRAPH(value))
3906 ANYOF_BITMAP_SET(ret, value);
3908 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3912 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3914 for (value = 0; value < 256; value++)
3916 ANYOF_BITMAP_SET(ret, value);
3918 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3922 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3924 for (value = 0; value < 256; value++)
3925 if (!isLOWER(value))
3926 ANYOF_BITMAP_SET(ret, value);
3928 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3932 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3934 for (value = 0; value < 256; value++)
3936 ANYOF_BITMAP_SET(ret, value);
3938 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3942 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3944 for (value = 0; value < 256; value++)
3945 if (!isPRINT(value))
3946 ANYOF_BITMAP_SET(ret, value);
3948 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3952 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3954 for (value = 0; value < 256; value++)
3955 if (isPSXSPC(value))
3956 ANYOF_BITMAP_SET(ret, value);
3958 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3962 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3964 for (value = 0; value < 256; value++)
3965 if (!isPSXSPC(value))
3966 ANYOF_BITMAP_SET(ret, value);
3968 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3972 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3974 for (value = 0; value < 256; value++)
3976 ANYOF_BITMAP_SET(ret, value);
3978 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3982 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3984 for (value = 0; value < 256; value++)
3985 if (!isPUNCT(value))
3986 ANYOF_BITMAP_SET(ret, value);
3988 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3992 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3994 for (value = 0; value < 256; value++)
3996 ANYOF_BITMAP_SET(ret, value);
3998 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
4002 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4004 for (value = 0; value < 256; value++)
4005 if (!isSPACE(value))
4006 ANYOF_BITMAP_SET(ret, value);
4008 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4012 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4014 for (value = 0; value < 256; value++)
4016 ANYOF_BITMAP_SET(ret, value);
4018 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4022 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4024 for (value = 0; value < 256; value++)
4025 if (!isUPPER(value))
4026 ANYOF_BITMAP_SET(ret, value);
4028 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4032 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4034 for (value = 0; value < 256; value++)
4035 if (isXDIGIT(value))
4036 ANYOF_BITMAP_SET(ret, value);
4038 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4042 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4044 for (value = 0; value < 256; value++)
4045 if (!isXDIGIT(value))
4046 ANYOF_BITMAP_SET(ret, value);
4048 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4051 vFAIL("Invalid [::] class");
4055 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4058 } /* end of namedclass \blah */
4061 if (prevvalue > (IV)value) /* b-a */ {
4062 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4063 RExC_parse - rangebegin,
4064 RExC_parse - rangebegin,
4066 range = 0; /* not a valid range */
4070 prevvalue = value; /* save the beginning of the range */
4071 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4072 RExC_parse[1] != ']') {
4075 /* a bad range like \w-, [:word:]- ? */
4076 if (namedclass > OOB_NAMEDCLASS) {
4077 if (ckWARN(WARN_REGEXP))
4079 "False [] range \"%*.*s\"",
4080 RExC_parse - rangebegin,
4081 RExC_parse - rangebegin,
4084 ANYOF_BITMAP_SET(ret, '-');
4086 range = 1; /* yeah, it's a range! */
4087 continue; /* but do it the next time */
4091 /* now is the next time */
4095 if (prevvalue < 256) {
4096 IV ceilvalue = value < 256 ? value : 255;
4099 /* In EBCDIC [\x89-\x91] should include
4100 * the \x8e but [i-j] should not. */
4101 if (literal_endpoint == 2 &&
4102 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4103 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4105 if (isLOWER(prevvalue)) {
4106 for (i = prevvalue; i <= ceilvalue; i++)
4108 ANYOF_BITMAP_SET(ret, i);
4110 for (i = prevvalue; i <= ceilvalue; i++)
4112 ANYOF_BITMAP_SET(ret, i);
4117 for (i = prevvalue; i <= ceilvalue; i++)
4118 ANYOF_BITMAP_SET(ret, i);
4120 if (value > 255 || UTF) {
4121 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4122 UV natvalue = NATIVE_TO_UNI(value);
4124 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4125 if (prevnatvalue < natvalue) { /* what about > ? */
4126 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4127 prevnatvalue, natvalue);
4129 else if (prevnatvalue == natvalue) {
4130 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4132 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4134 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4136 /* If folding and foldable and a single
4137 * character, insert also the folded version
4138 * to the charclass. */
4140 if (foldlen == (STRLEN)UNISKIP(f))
4141 Perl_sv_catpvf(aTHX_ listsv,
4144 /* Any multicharacter foldings
4145 * require the following transform:
4146 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4147 * where E folds into "pq" and F folds
4148 * into "rst", all other characters
4149 * fold to single characters. We save
4150 * away these multicharacter foldings,
4151 * to be later saved as part of the
4152 * additional "s" data. */
4155 if (!unicode_alternate)
4156 unicode_alternate = newAV();
4157 sv = newSVpvn((char*)foldbuf, foldlen);
4159 av_push(unicode_alternate, sv);
4163 /* If folding and the value is one of the Greek
4164 * sigmas insert a few more sigmas to make the
4165 * folding rules of the sigmas to work right.
4166 * Note that not all the possible combinations
4167 * are handled here: some of them are handled
4168 * by the standard folding rules, and some of
4169 * them (literal or EXACTF cases) are handled
4170 * during runtime in regexec.c:S_find_byclass(). */
4171 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4172 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4173 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4174 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4175 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4177 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4178 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4179 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4184 literal_endpoint = 0;
4188 range = 0; /* this range (if it was one) is done now */
4192 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4194 RExC_size += ANYOF_CLASS_ADD_SKIP;
4196 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4199 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4201 /* If the only flag is folding (plus possibly inversion). */
4202 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4204 for (value = 0; value < 256; ++value) {
4205 if (ANYOF_BITMAP_TEST(ret, value)) {
4206 UV fold = PL_fold[value];
4209 ANYOF_BITMAP_SET(ret, fold);
4212 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4215 /* optimize inverted simple patterns (e.g. [^a-z]) */
4216 if (!SIZE_ONLY && optimize_invert &&
4217 /* If the only flag is inversion. */
4218 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4219 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4220 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4221 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4228 /* The 0th element stores the character class description
4229 * in its textual form: used later (regexec.c:Perl_regclass_swash())
4230 * to initialize the appropriate swash (which gets stored in
4231 * the 1st element), and also useful for dumping the regnode.
4232 * The 2nd element stores the multicharacter foldings,
4233 * used later (regexec.c:S_reginclass()). */
4234 av_store(av, 0, listsv);
4235 av_store(av, 1, NULL);
4236 av_store(av, 2, (SV*)unicode_alternate);
4237 rv = newRV_noinc((SV*)av);
4238 n = add_data(pRExC_state, 1, "s");
4239 RExC_rx->data->data[n] = (void*)rv;
4247 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4249 char* retval = RExC_parse++;
4252 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4253 RExC_parse[2] == '#') {
4254 while (*RExC_parse && *RExC_parse != ')')
4259 if (RExC_flags & PMf_EXTENDED) {
4260 if (isSPACE(*RExC_parse)) {
4264 else if (*RExC_parse == '#') {
4265 while (*RExC_parse && *RExC_parse != '\n')
4276 - reg_node - emit a node
4278 STATIC regnode * /* Location. */
4279 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4281 register regnode *ret;
4282 register regnode *ptr;
4286 SIZE_ALIGN(RExC_size);
4291 NODE_ALIGN_FILL(ret);
4293 FILL_ADVANCE_NODE(ptr, op);
4294 if (RExC_offsets) { /* MJD */
4295 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4296 "reg_node", __LINE__,
4298 RExC_emit - RExC_emit_start > RExC_offsets[0]
4299 ? "Overwriting end of array!\n" : "OK",
4300 RExC_emit - RExC_emit_start,
4301 RExC_parse - RExC_start,
4303 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4312 - reganode - emit a node with an argument
4314 STATIC regnode * /* Location. */
4315 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4317 register regnode *ret;
4318 register regnode *ptr;
4322 SIZE_ALIGN(RExC_size);
4327 NODE_ALIGN_FILL(ret);
4329 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4330 if (RExC_offsets) { /* MJD */
4331 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4333 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4334 "Overwriting end of array!\n" : "OK",
4335 RExC_emit - RExC_emit_start,
4336 RExC_parse - RExC_start,
4338 Set_Cur_Node_Offset;
4347 - reguni - emit (if appropriate) a Unicode character
4350 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4352 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4356 - reginsert - insert an operator in front of already-emitted operand
4358 * Means relocating the operand.
4361 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4363 register regnode *src;
4364 register regnode *dst;
4365 register regnode *place;
4366 register int offset = regarglen[(U8)op];
4368 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4371 RExC_size += NODE_STEP_REGNODE + offset;
4376 RExC_emit += NODE_STEP_REGNODE + offset;
4378 while (src > opnd) {
4379 StructCopy(--src, --dst, regnode);
4380 if (RExC_offsets) { /* MJD 20010112 */
4381 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4383 dst - RExC_emit_start > RExC_offsets[0]
4384 ? "Overwriting end of array!\n" : "OK",
4385 src - RExC_emit_start,
4386 dst - RExC_emit_start,
4388 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4389 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4394 place = opnd; /* Op node, where operand used to be. */
4395 if (RExC_offsets) { /* MJD */
4396 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4398 place - RExC_emit_start > RExC_offsets[0]
4399 ? "Overwriting end of array!\n" : "OK",
4400 place - RExC_emit_start,
4401 RExC_parse - RExC_start,
4403 Set_Node_Offset(place, RExC_parse);
4405 src = NEXTOPER(place);
4406 FILL_ADVANCE_NODE(place, op);
4407 Zero(src, offset, regnode);
4411 - regtail - set the next-pointer at the end of a node chain of p to val.
4414 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4416 register regnode *scan;
4417 register regnode *temp;
4422 /* Find last node. */
4425 temp = regnext(scan);
4431 if (reg_off_by_arg[OP(scan)]) {
4432 ARG_SET(scan, val - scan);
4435 NEXT_OFF(scan) = val - scan;
4440 - regoptail - regtail on operand of first argument; nop if operandless
4443 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4445 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4446 if (p == NULL || SIZE_ONLY)
4448 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4449 regtail(pRExC_state, NEXTOPER(p), val);
4451 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4452 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4459 - regcurly - a little FSA that accepts {\d+,?\d*}
4462 S_regcurly(pTHX_ register char *s)
4483 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4485 register U8 op = EXACT; /* Arbitrary non-END op. */
4486 register regnode *next;
4488 while (op != END && (!last || node < last)) {
4489 /* While that wasn't END last time... */
4495 next = regnext(node);
4497 if (OP(node) == OPTIMIZED)
4500 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4501 (int)(2*l + 1), "", SvPVX(sv));
4502 if (next == NULL) /* Next ptr. */
4503 PerlIO_printf(Perl_debug_log, "(0)");
4505 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4506 (void)PerlIO_putc(Perl_debug_log, '\n');
4508 if (PL_regkind[(U8)op] == BRANCHJ) {
4509 register regnode *nnode = (OP(next) == LONGJMP
4512 if (last && nnode > last)
4514 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4516 else if (PL_regkind[(U8)op] == BRANCH) {
4517 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4519 else if ( op == CURLY) { /* `next' might be very big: optimizer */
4520 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4521 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4523 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4524 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4527 else if ( op == PLUS || op == STAR) {
4528 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4530 else if (op == ANYOF) {
4531 /* arglen 1 + class block */
4532 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4533 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4534 node = NEXTOPER(node);
4536 else if (PL_regkind[(U8)op] == EXACT) {
4537 /* Literal string, where present. */
4538 node += NODE_SZ_STR(node) - 1;
4539 node = NEXTOPER(node);
4542 node = NEXTOPER(node);
4543 node += regarglen[(U8)op];
4545 if (op == CURLYX || op == OPEN)
4547 else if (op == WHILEM)
4553 #endif /* DEBUGGING */
4556 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4559 Perl_regdump(pTHX_ regexp *r)
4562 SV *sv = sv_newmortal();
4564 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4566 /* Header fields of interest. */
4567 if (r->anchored_substr)
4568 PerlIO_printf(Perl_debug_log,
4569 "anchored `%s%.*s%s'%s at %"IVdf" ",
4571 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4572 SvPVX(r->anchored_substr),
4574 SvTAIL(r->anchored_substr) ? "$" : "",
4575 (IV)r->anchored_offset);
4576 else if (r->anchored_utf8)
4577 PerlIO_printf(Perl_debug_log,
4578 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4580 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4581 SvPVX(r->anchored_utf8),
4583 SvTAIL(r->anchored_utf8) ? "$" : "",
4584 (IV)r->anchored_offset);
4585 if (r->float_substr)
4586 PerlIO_printf(Perl_debug_log,
4587 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4589 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4590 SvPVX(r->float_substr),
4592 SvTAIL(r->float_substr) ? "$" : "",
4593 (IV)r->float_min_offset, (UV)r->float_max_offset);
4594 else if (r->float_utf8)
4595 PerlIO_printf(Perl_debug_log,
4596 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4598 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4599 SvPVX(r->float_utf8),
4601 SvTAIL(r->float_utf8) ? "$" : "",
4602 (IV)r->float_min_offset, (UV)r->float_max_offset);
4603 if (r->check_substr || r->check_utf8)
4604 PerlIO_printf(Perl_debug_log,
4605 r->check_substr == r->float_substr
4606 && r->check_utf8 == r->float_utf8
4607 ? "(checking floating" : "(checking anchored");
4608 if (r->reganch & ROPT_NOSCAN)
4609 PerlIO_printf(Perl_debug_log, " noscan");
4610 if (r->reganch & ROPT_CHECK_ALL)
4611 PerlIO_printf(Perl_debug_log, " isall");
4612 if (r->check_substr || r->check_utf8)
4613 PerlIO_printf(Perl_debug_log, ") ");
4615 if (r->regstclass) {
4616 regprop(sv, r->regstclass);
4617 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4619 if (r->reganch & ROPT_ANCH) {
4620 PerlIO_printf(Perl_debug_log, "anchored");
4621 if (r->reganch & ROPT_ANCH_BOL)
4622 PerlIO_printf(Perl_debug_log, "(BOL)");
4623 if (r->reganch & ROPT_ANCH_MBOL)
4624 PerlIO_printf(Perl_debug_log, "(MBOL)");
4625 if (r->reganch & ROPT_ANCH_SBOL)
4626 PerlIO_printf(Perl_debug_log, "(SBOL)");
4627 if (r->reganch & ROPT_ANCH_GPOS)
4628 PerlIO_printf(Perl_debug_log, "(GPOS)");
4629 PerlIO_putc(Perl_debug_log, ' ');
4631 if (r->reganch & ROPT_GPOS_SEEN)
4632 PerlIO_printf(Perl_debug_log, "GPOS ");
4633 if (r->reganch & ROPT_SKIP)
4634 PerlIO_printf(Perl_debug_log, "plus ");
4635 if (r->reganch & ROPT_IMPLICIT)
4636 PerlIO_printf(Perl_debug_log, "implicit ");
4637 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4638 if (r->reganch & ROPT_EVAL_SEEN)
4639 PerlIO_printf(Perl_debug_log, "with eval ");
4640 PerlIO_printf(Perl_debug_log, "\n");
4643 U32 len = r->offsets[0];
4644 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4645 for (i = 1; i <= len; i++)
4646 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4647 (UV)r->offsets[i*2-1],
4648 (UV)r->offsets[i*2]);
4649 PerlIO_printf(Perl_debug_log, "\n");
4651 #endif /* DEBUGGING */
4657 S_put_byte(pTHX_ SV *sv, int c)
4659 if (isCNTRL(c) || c == 255 || !isPRINT(c))
4660 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4661 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4662 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4664 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4667 #endif /* DEBUGGING */
4670 - regprop - printable representation of opcode
4673 Perl_regprop(pTHX_ SV *sv, regnode *o)
4678 sv_setpvn(sv, "", 0);
4679 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4680 /* It would be nice to FAIL() here, but this may be called from
4681 regexec.c, and it would be hard to supply pRExC_state. */
4682 Perl_croak(aTHX_ "Corrupted regexp opcode");
4683 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4685 k = PL_regkind[(U8)OP(o)];
4688 SV *dsv = sv_2mortal(newSVpvn("", 0));
4689 /* Using is_utf8_string() is a crude hack but it may
4690 * be the best for now since we have no flag "this EXACTish
4691 * node was UTF-8" --jhi */
4692 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4694 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4695 UNI_DISPLAY_REGEX) :
4700 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4705 else if (k == CURLY) {
4706 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4707 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4708 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4710 else if (k == WHILEM && o->flags) /* Ordinal/of */
4711 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4712 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4713 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4714 else if (k == LOGICAL)
4715 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4716 else if (k == ANYOF) {
4717 int i, rangestart = -1;
4718 U8 flags = ANYOF_FLAGS(o);
4719 const char * const anyofs[] = { /* Should be syncronized with
4720 * ANYOF_ #xdefines in regcomp.h */
4753 if (flags & ANYOF_LOCALE)
4754 sv_catpv(sv, "{loc}");
4755 if (flags & ANYOF_FOLD)
4756 sv_catpv(sv, "{i}");
4757 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4758 if (flags & ANYOF_INVERT)
4760 for (i = 0; i <= 256; i++) {
4761 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4762 if (rangestart == -1)
4764 } else if (rangestart != -1) {
4765 if (i <= rangestart + 3)
4766 for (; rangestart < i; rangestart++)
4767 put_byte(sv, rangestart);
4769 put_byte(sv, rangestart);
4771 put_byte(sv, i - 1);
4777 if (o->flags & ANYOF_CLASS)
4778 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4779 if (ANYOF_CLASS_TEST(o,i))
4780 sv_catpv(sv, anyofs[i]);
4782 if (flags & ANYOF_UNICODE)
4783 sv_catpv(sv, "{unicode}");
4784 else if (flags & ANYOF_UNICODE_ALL)
4785 sv_catpv(sv, "{unicode_all}");
4789 SV *sw = regclass_swash(o, FALSE, &lv, 0);
4793 U8 s[UTF8_MAXLEN+1];
4795 for (i = 0; i <= 256; i++) { /* just the first 256 */
4796 U8 *e = uvchr_to_utf8(s, i);
4798 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4799 if (rangestart == -1)
4801 } else if (rangestart != -1) {
4804 if (i <= rangestart + 3)
4805 for (; rangestart < i; rangestart++) {
4806 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4810 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4813 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4820 sv_catpv(sv, "..."); /* et cetera */
4824 char *s = savepv(SvPVX(lv));
4827 while(*s && *s != '\n') s++;
4848 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4850 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4851 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4852 #endif /* DEBUGGING */
4856 Perl_re_intuit_string(pTHX_ regexp *prog)
4857 { /* Assume that RE_INTUIT is set */
4860 char *s = SvPV(prog->check_substr
4861 ? prog->check_substr : prog->check_utf8, n_a);
4863 if (!PL_colorset) reginitcolors();
4864 PerlIO_printf(Perl_debug_log,
4865 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4867 prog->check_substr ? "" : "utf8 ",
4868 PL_colors[5],PL_colors[0],
4871 (strlen(s) > 60 ? "..." : ""));
4874 return prog->check_substr ? prog->check_substr : prog->check_utf8;
4878 Perl_pregfree(pTHX_ struct regexp *r)
4881 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4884 if (!r || (--r->refcnt > 0))
4890 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4891 r->prelen, 60, UNI_DISPLAY_REGEX)
4892 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4896 PerlIO_printf(Perl_debug_log,
4897 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4898 PL_colors[4],PL_colors[5],PL_colors[0],
4901 len > 60 ? "..." : "");
4905 Safefree(r->precomp);
4906 if (r->offsets) /* 20010421 MJD */
4907 Safefree(r->offsets);
4908 if (RX_MATCH_COPIED(r))
4909 Safefree(r->subbeg);
4911 if (r->anchored_substr)
4912 SvREFCNT_dec(r->anchored_substr);
4913 if (r->anchored_utf8)
4914 SvREFCNT_dec(r->anchored_utf8);
4915 if (r->float_substr)
4916 SvREFCNT_dec(r->float_substr);
4918 SvREFCNT_dec(r->float_utf8);
4919 Safefree(r->substrs);
4922 int n = r->data->count;
4923 PAD* new_comppad = NULL;
4927 /* If you add a ->what type here, update the comment in regcomp.h */
4928 switch (r->data->what[n]) {
4930 SvREFCNT_dec((SV*)r->data->data[n]);
4933 Safefree(r->data->data[n]);
4936 new_comppad = (AV*)r->data->data[n];
4939 if (new_comppad == NULL)
4940 Perl_croak(aTHX_ "panic: pregfree comppad");
4941 PAD_SAVE_LOCAL(old_comppad,
4942 /* Watch out for global destruction's random ordering. */
4943 (SvTYPE(new_comppad) == SVt_PVAV) ?
4944 new_comppad : Null(PAD *)
4946 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4947 op_free((OP_4tree*)r->data->data[n]);
4950 PAD_RESTORE_LOCAL(old_comppad);
4951 SvREFCNT_dec((SV*)new_comppad);
4957 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4960 Safefree(r->data->what);
4963 Safefree(r->startp);
4969 - regnext - dig the "next" pointer out of a node
4971 * [Note, when REGALIGN is defined there are two places in regmatch()
4972 * that bypass this code for speed.]
4975 Perl_regnext(pTHX_ register regnode *p)
4977 register I32 offset;
4979 if (p == &PL_regdummy)
4982 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4990 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
4993 STRLEN l1 = strlen(pat1);
4994 STRLEN l2 = strlen(pat2);
5003 Copy(pat1, buf, l1 , char);
5004 Copy(pat2, buf + l1, l2 , char);
5005 buf[l1 + l2] = '\n';
5006 buf[l1 + l2 + 1] = '\0';
5008 /* ANSI variant takes additional second argument */
5009 va_start(args, pat2);
5013 msv = vmess(buf, &args);
5015 message = SvPV(msv,l1);
5018 Copy(message, buf, l1 , char);
5019 buf[l1] = '\0'; /* Overwrite \n */
5020 Perl_croak(aTHX_ "%s", buf);
5023 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5026 Perl_save_re_context(pTHX)
5029 SAVEPPTR(RExC_precomp); /* uncompiled string. */
5030 SAVEI32(RExC_npar); /* () count. */
5031 SAVEI32(RExC_size); /* Code size. */
5032 SAVEI32(RExC_flags); /* are we folding, multilining? */
5033 SAVEVPTR(RExC_rx); /* from regcomp.c */
5034 SAVEI32(RExC_seen); /* from regcomp.c */
5035 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
5036 SAVEI32(RExC_naughty); /* How bad is this pattern? */
5037 SAVEVPTR(RExC_emit); /* Code-emit pointer; ®dummy = don't */
5038 SAVEPPTR(RExC_end); /* End of input for compile */
5039 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
5042 SAVEI32(PL_reg_flags); /* from regexec.c */
5044 SAVEPPTR(PL_reginput); /* String-input pointer. */
5045 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5046 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
5047 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5048 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5049 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
5050 SAVEPPTR(PL_regtill); /* How far we are required to go. */
5051 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
5052 PL_reg_start_tmp = 0;
5053 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5054 PL_reg_start_tmpl = 0;
5055 SAVEVPTR(PL_regdata);
5056 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5057 SAVEI32(PL_regnarrate); /* from regexec.c */
5058 SAVEVPTR(PL_regprogram); /* from regexec.c */
5059 SAVEINT(PL_regindent); /* from regexec.c */
5060 SAVEVPTR(PL_regcc); /* from regexec.c */
5061 SAVEVPTR(PL_curcop);
5062 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5063 SAVEVPTR(PL_reg_re); /* from regexec.c */
5064 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5065 SAVESPTR(PL_reg_sv); /* from regexec.c */
5066 SAVEI8(PL_reg_match_utf8); /* from regexec.c */
5067 SAVEVPTR(PL_reg_magic); /* from regexec.c */
5068 SAVEI32(PL_reg_oldpos); /* from regexec.c */
5069 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5070 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5071 SAVEI32(PL_regnpar); /* () count. */
5072 SAVEI32(PL_regsize); /* from regexec.c */
5074 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
5079 clear_re(pTHX_ void *r)
5081 ReREFCNT_dec((regexp *)r);