5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_pregcomp my_regcomp
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_pregfree my_regfree
39 # define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 # define Perl_regnext my_regnext
42 # define Perl_save_re_context my_save_re_context
43 # define Perl_reginitcolors my_reginitcolors
45 # define PERL_NO_GET_CONTEXT
50 * pregcomp and pregexec -- regsub and regerror are not used in perl
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (c) 1991-2002, Larry Wall
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
83 #define PERL_IN_REGCOMP_C
86 #ifndef PERL_IN_XSUB_RE
98 # if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
107 #define STATIC static
110 typedef struct RExC_state_t {
111 U16 flags16; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
114 char *start; /* Start of input for compile */
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
118 regnode *emit_start; /* Start of emitted-code area */
119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
130 char *starttry; /* -Dr: where regtry was called. */
131 #define RExC_starttry (pRExC_state->starttry)
135 #define RExC_flags16 (pRExC_state->flags16)
136 #define RExC_precomp (pRExC_state->precomp)
137 #define RExC_rx (pRExC_state->rx)
138 #define RExC_start (pRExC_state->start)
139 #define RExC_end (pRExC_state->end)
140 #define RExC_parse (pRExC_state->parse)
141 #define RExC_whilem_seen (pRExC_state->whilem_seen)
142 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty (pRExC_state->naughty)
146 #define RExC_sawback (pRExC_state->sawback)
147 #define RExC_seen (pRExC_state->seen)
148 #define RExC_size (pRExC_state->size)
149 #define RExC_npar (pRExC_state->npar)
150 #define RExC_extralen (pRExC_state->extralen)
151 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8 (pRExC_state->utf8)
155 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
160 #undef SPSTART /* dratted cpp namespace... */
163 * Flags to be passed up and down.
165 #define WORST 0 /* Worst case. */
166 #define HASWIDTH 0x1 /* Known to match non-null strings. */
167 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART 0x4 /* Starts with * or +. */
169 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
171 /* Length of a variant. */
173 typedef struct scan_data_t {
179 I32 last_end; /* min value, <0 unless valid. */
182 SV **longest; /* Either &l_fixed, or &l_float. */
186 I32 offset_float_min;
187 I32 offset_float_max;
191 struct regnode_charclass_class *start_class;
195 * Forward declarations for pregcomp()'s friends.
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL 0x1
203 #define SF_BEFORE_MEOL 0x2
204 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
208 # define SF_FIX_SHIFT_EOL (0+2)
209 # define SF_FL_SHIFT_EOL (0+4)
211 # define SF_FIX_SHIFT_EOL (+2)
212 # define SF_FL_SHIFT_EOL (+4)
215 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
218 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF 0x40
221 #define SF_HAS_PAR 0x80
222 #define SF_IN_PAR 0x100
223 #define SF_HAS_EVAL 0x200
224 #define SCF_DO_SUBSTR 0x400
225 #define SCF_DO_STCLASS_AND 0x0800
226 #define SCF_DO_STCLASS_OR 0x1000
227 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS 0x2000
230 #define UTF RExC_utf8
231 #define LOC (RExC_flags16 & PMf_LOCALE)
232 #define FOLD (RExC_flags16 & PMf_FOLD)
234 #define OOB_UNICODE 12345678
235 #define OOB_NAMEDCLASS -1
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
249 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
261 char *ellipses = ""; \
262 IV len = RExC_end - RExC_precomp; \
265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
273 msg, (int)len, RExC_precomp, ellipses); \
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
281 #define FAIL2(pat,msg) \
283 char *ellipses = ""; \
284 IV len = RExC_end - RExC_precomp; \
287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
295 msg, (int)len, RExC_precomp, ellipses); \
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
302 #define Simple_vFAIL(m) \
304 IV offset = RExC_parse - RExC_precomp; \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
321 * Like Simple_vFAIL(), but accepts two arguments.
323 #define Simple_vFAIL2(m,a1) \
325 IV offset = RExC_parse - RExC_precomp; \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
328 (int)offset, RExC_precomp, RExC_precomp + offset); \
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
334 #define vFAIL2(m,a1) \
337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
338 Simple_vFAIL2(m, a1); \
343 * Like Simple_vFAIL(), but accepts three arguments.
345 #define Simple_vFAIL3(m, a1, a2) \
347 IV offset = RExC_parse - RExC_precomp; \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
350 (int)offset, RExC_precomp, RExC_precomp + offset); \
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
356 #define vFAIL3(m,a1,a2) \
359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
360 Simple_vFAIL3(m, a1, a2); \
364 * Like Simple_vFAIL(), but accepts four arguments.
366 #define Simple_vFAIL4(m, a1, a2, a3) \
368 IV offset = RExC_parse - RExC_precomp; \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
371 (int)offset, RExC_precomp, RExC_precomp + offset); \
375 * Like Simple_vFAIL(), but accepts five arguments.
377 #define Simple_vFAIL5(m, a1, a2, a3, a4) \
379 IV offset = RExC_parse - RExC_precomp; \
380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
381 (int)offset, RExC_precomp, RExC_precomp + offset); \
385 #define vWARN(loc,m) \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
392 #define vWARNdep(loc,m) \
394 IV offset = loc - RExC_precomp; \
395 int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \
396 Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
397 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
401 #define vWARN2(loc, m, a1) \
403 IV offset = loc - RExC_precomp; \
404 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
406 (int)offset, RExC_precomp, RExC_precomp + offset); \
409 #define vWARN3(loc, m, a1, a2) \
411 IV offset = loc - RExC_precomp; \
412 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
414 (int)offset, RExC_precomp, RExC_precomp + offset); \
417 #define vWARN4(loc, m, a1, a2, a3) \
419 IV offset = loc - RExC_precomp; \
420 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
422 (int)offset, RExC_precomp, RExC_precomp + offset); \
425 /* used for the parse_flags section for (?c) -- japhy */
426 #define vWARN5(loc, m, a1, a2, a3, a4) \
428 IV offset = loc - RExC_precomp; \
429 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
435 /* Allow for side effects in s */
436 #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
438 /* Macros for recording node offsets. 20001227 mjd@plover.com
439 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
440 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
441 * Element 0 holds the number n.
444 #define MJD_OFFSET_DEBUG(x)
445 /* #define MJD_OFFSET_DEBUG(x) fprintf x */
448 # define Set_Node_Offset_To_R(node,byte) \
452 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
454 RExC_offsets[2*(node)-1] = (byte); \
459 # define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
460 # define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
462 # define Set_Node_Length_To_R(node,len) \
465 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
467 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
469 RExC_offsets[2*(node)] = (len); \
474 # define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
475 # define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
476 # define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
478 /* Get offsets and lengths */
479 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
480 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
482 static void clear_re(pTHX_ void *r);
484 /* Mark that we cannot extend a found fixed substring at this point.
485 Updata the longest found anchored substring and the longest found
486 floating substrings if needed. */
489 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
491 STRLEN l = CHR_SVLEN(data->last_found);
492 STRLEN old_l = CHR_SVLEN(*data->longest);
494 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
495 sv_setsv(*data->longest, data->last_found);
496 if (*data->longest == data->longest_fixed) {
497 data->offset_fixed = l ? data->last_start_min : data->pos_min;
498 if (data->flags & SF_BEFORE_EOL)
500 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
502 data->flags &= ~SF_FIX_BEFORE_EOL;
505 data->offset_float_min = l ? data->last_start_min : data->pos_min;
506 data->offset_float_max = (l
507 ? data->last_start_max
508 : data->pos_min + data->pos_delta);
509 if (data->flags & SF_BEFORE_EOL)
511 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
513 data->flags &= ~SF_FL_BEFORE_EOL;
516 SvCUR_set(data->last_found, 0);
518 data->flags &= ~SF_BEFORE_EOL;
521 /* Can match anything (initialization) */
523 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
525 ANYOF_CLASS_ZERO(cl);
526 ANYOF_BITMAP_SETALL(cl);
527 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
529 cl->flags |= ANYOF_LOCALE;
532 /* Can match anything (initialization) */
534 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
538 for (value = 0; value <= ANYOF_MAX; value += 2)
539 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
541 if (!(cl->flags & ANYOF_UNICODE_ALL))
543 if (!ANYOF_BITMAP_TESTALLSET(cl))
548 /* Can match anything (initialization) */
550 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
552 Zero(cl, 1, struct regnode_charclass_class);
554 cl_anything(pRExC_state, cl);
558 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
560 Zero(cl, 1, struct regnode_charclass_class);
562 cl_anything(pRExC_state, cl);
564 cl->flags |= ANYOF_LOCALE;
567 /* 'And' a given class with another one. Can create false positives */
568 /* We assume that cl is not inverted */
570 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
571 struct regnode_charclass_class *and_with)
573 if (!(and_with->flags & ANYOF_CLASS)
574 && !(cl->flags & ANYOF_CLASS)
575 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
576 && !(and_with->flags & ANYOF_FOLD)
577 && !(cl->flags & ANYOF_FOLD)) {
580 if (and_with->flags & ANYOF_INVERT)
581 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
582 cl->bitmap[i] &= ~and_with->bitmap[i];
584 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
585 cl->bitmap[i] &= and_with->bitmap[i];
586 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
587 if (!(and_with->flags & ANYOF_EOS))
588 cl->flags &= ~ANYOF_EOS;
590 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
595 if (!(and_with->flags & ANYOF_UNICODE_ALL))
596 cl->flags &= ~ANYOF_UNICODE_ALL;
597 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
598 cl->flags &= ~ANYOF_UNICODE;
601 /* 'OR' a given class with another one. Can create false positives */
602 /* We assume that cl is not inverted */
604 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
606 if (or_with->flags & ANYOF_INVERT) {
608 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
609 * <= (B1 | !B2) | (CL1 | !CL2)
610 * which is wasteful if CL2 is small, but we ignore CL2:
611 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
612 * XXXX Can we handle case-fold? Unclear:
613 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
614 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
616 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
617 && !(or_with->flags & ANYOF_FOLD)
618 && !(cl->flags & ANYOF_FOLD) ) {
621 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
622 cl->bitmap[i] |= ~or_with->bitmap[i];
623 } /* XXXX: logic is complicated otherwise */
625 cl_anything(pRExC_state, cl);
628 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
629 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
630 && (!(or_with->flags & ANYOF_FOLD)
631 || (cl->flags & ANYOF_FOLD)) ) {
634 /* OR char bitmap and class bitmap separately */
635 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
636 cl->bitmap[i] |= or_with->bitmap[i];
637 if (or_with->flags & ANYOF_CLASS) {
638 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
639 cl->classflags[i] |= or_with->classflags[i];
640 cl->flags |= ANYOF_CLASS;
643 else { /* XXXX: logic is complicated, leave it along for a moment. */
644 cl_anything(pRExC_state, cl);
647 if (or_with->flags & ANYOF_EOS)
648 cl->flags |= ANYOF_EOS;
650 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
651 ARG(cl) != ARG(or_with)) {
652 cl->flags |= ANYOF_UNICODE_ALL;
653 cl->flags &= ~ANYOF_UNICODE;
655 if (or_with->flags & ANYOF_UNICODE_ALL) {
656 cl->flags |= ANYOF_UNICODE_ALL;
657 cl->flags &= ~ANYOF_UNICODE;
662 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
663 * These need to be revisited when a newer toolchain becomes available.
665 #if defined(__sparc64__) && defined(__GNUC__)
666 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
667 # undef SPARC64_GCC_WORKAROUND
668 # define SPARC64_GCC_WORKAROUND 1
672 /* REx optimizer. Converts nodes into quickier variants "in place".
673 Finds fixed substrings. */
675 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
676 to the position after last scanned or to NULL. */
679 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
680 /* scanp: Start here (read-write). */
681 /* deltap: Write maxlen-minlen here. */
682 /* last: Stop before this one. */
684 I32 min = 0, pars = 0, code;
685 regnode *scan = *scanp, *next;
687 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
688 int is_inf_internal = 0; /* The studied chunk is infinite */
689 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
690 scan_data_t data_fake;
691 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
693 while (scan && OP(scan) != END && scan < last) {
694 /* Peephole optimizer: */
696 if (PL_regkind[(U8)OP(scan)] == EXACT) {
697 /* Merge several consecutive EXACTish nodes into one. */
698 regnode *n = regnext(scan);
701 regnode *stop = scan;
704 next = scan + NODE_SZ_STR(scan);
705 /* Skip NOTHING, merge EXACT*. */
707 ( PL_regkind[(U8)OP(n)] == NOTHING ||
708 (stringok && (OP(n) == OP(scan))))
710 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
711 if (OP(n) == TAIL || n > next)
713 if (PL_regkind[(U8)OP(n)] == NOTHING) {
714 NEXT_OFF(scan) += NEXT_OFF(n);
715 next = n + NODE_STEP_REGNODE;
723 int oldl = STR_LEN(scan);
724 regnode *nnext = regnext(n);
726 if (oldl + STR_LEN(n) > U8_MAX)
728 NEXT_OFF(scan) += NEXT_OFF(n);
729 STR_LEN(scan) += STR_LEN(n);
730 next = n + NODE_SZ_STR(n);
731 /* Now we can overwrite *n : */
732 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
740 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
742 Two problematic code points in Unicode casefolding of EXACT nodes:
744 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
745 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
751 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
752 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
754 This means that in case-insensitive matching (or "loose matching",
755 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
756 length of the above casefolded versions) can match a target string
757 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
758 This would rather mess up the minimum length computation.
760 What we'll do is to look for the tail four bytes, and then peek
761 at the preceding two bytes to see whether we need to decrease
762 the minimum length by four (six minus two).
764 Thanks to the design of UTF-8, there cannot be false matches:
765 A sequence of valid UTF-8 bytes cannot be a subsequence of
766 another valid sequence of UTF-8 bytes.
769 char *s0 = STRING(scan), *s, *t;
770 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
771 char *t0 = "\xcc\x88\xcc\x81";
775 s < s2 && (t = ninstr(s, s1, t0, t1));
777 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
778 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
785 n = scan + NODE_SZ_STR(scan);
787 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
795 /* Follow the next-chain of the current node and optimize
796 away all the NOTHINGs from it. */
797 if (OP(scan) != CURLYX) {
798 int max = (reg_off_by_arg[OP(scan)]
800 /* I32 may be smaller than U16 on CRAYs! */
801 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
802 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
806 /* Skip NOTHING and LONGJMP. */
807 while ((n = regnext(n))
808 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
809 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
812 if (reg_off_by_arg[OP(scan)])
815 NEXT_OFF(scan) = off;
817 /* The principal pseudo-switch. Cannot be a switch, since we
818 look into several different things. */
819 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
820 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
821 next = regnext(scan);
824 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
825 I32 max1 = 0, min1 = I32_MAX, num = 0;
826 struct regnode_charclass_class accum;
828 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
829 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
830 if (flags & SCF_DO_STCLASS)
831 cl_init_zero(pRExC_state, &accum);
832 while (OP(scan) == code) {
833 I32 deltanext, minnext, f = 0, fake;
834 struct regnode_charclass_class this_class;
839 data_fake.whilem_c = data->whilem_c;
840 data_fake.last_closep = data->last_closep;
843 data_fake.last_closep = &fake;
844 next = regnext(scan);
845 scan = NEXTOPER(scan);
847 scan = NEXTOPER(scan);
848 if (flags & SCF_DO_STCLASS) {
849 cl_init(pRExC_state, &this_class);
850 data_fake.start_class = &this_class;
851 f = SCF_DO_STCLASS_AND;
853 if (flags & SCF_WHILEM_VISITED_POS)
854 f |= SCF_WHILEM_VISITED_POS;
855 /* we suppose the run is continuous, last=next...*/
856 minnext = study_chunk(pRExC_state, &scan, &deltanext,
857 next, &data_fake, f);
860 if (max1 < minnext + deltanext)
861 max1 = minnext + deltanext;
862 if (deltanext == I32_MAX)
863 is_inf = is_inf_internal = 1;
865 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
867 if (data && (data_fake.flags & SF_HAS_EVAL))
868 data->flags |= SF_HAS_EVAL;
870 data->whilem_c = data_fake.whilem_c;
871 if (flags & SCF_DO_STCLASS)
872 cl_or(pRExC_state, &accum, &this_class);
876 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
878 if (flags & SCF_DO_SUBSTR) {
879 data->pos_min += min1;
880 data->pos_delta += max1 - min1;
881 if (max1 != min1 || is_inf)
882 data->longest = &(data->longest_float);
885 delta += max1 - min1;
886 if (flags & SCF_DO_STCLASS_OR) {
887 cl_or(pRExC_state, data->start_class, &accum);
889 cl_and(data->start_class, &and_with);
890 flags &= ~SCF_DO_STCLASS;
893 else if (flags & SCF_DO_STCLASS_AND) {
895 cl_and(data->start_class, &accum);
896 flags &= ~SCF_DO_STCLASS;
899 /* Switch to OR mode: cache the old value of
900 * data->start_class */
901 StructCopy(data->start_class, &and_with,
902 struct regnode_charclass_class);
903 flags &= ~SCF_DO_STCLASS_AND;
904 StructCopy(&accum, data->start_class,
905 struct regnode_charclass_class);
906 flags |= SCF_DO_STCLASS_OR;
907 data->start_class->flags |= ANYOF_EOS;
911 else if (code == BRANCHJ) /* single branch is optimized. */
912 scan = NEXTOPER(NEXTOPER(scan));
913 else /* single branch is optimized. */
914 scan = NEXTOPER(scan);
917 else if (OP(scan) == EXACT) {
918 I32 l = STR_LEN(scan);
919 UV uc = *((U8*)STRING(scan));
921 U8 *s = (U8*)STRING(scan);
922 l = utf8_length(s, s + l);
923 uc = utf8_to_uvchr(s, NULL);
926 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
927 /* The code below prefers earlier match for fixed
928 offset, later match for variable offset. */
929 if (data->last_end == -1) { /* Update the start info. */
930 data->last_start_min = data->pos_min;
931 data->last_start_max = is_inf
932 ? I32_MAX : data->pos_min + data->pos_delta;
934 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
935 data->last_end = data->pos_min + l;
936 data->pos_min += l; /* As in the first entry. */
937 data->flags &= ~SF_BEFORE_EOL;
939 if (flags & SCF_DO_STCLASS_AND) {
940 /* Check whether it is compatible with what we know already! */
944 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
945 && !ANYOF_BITMAP_TEST(data->start_class, uc)
946 && (!(data->start_class->flags & ANYOF_FOLD)
947 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
950 ANYOF_CLASS_ZERO(data->start_class);
951 ANYOF_BITMAP_ZERO(data->start_class);
953 ANYOF_BITMAP_SET(data->start_class, uc);
954 data->start_class->flags &= ~ANYOF_EOS;
956 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
958 else if (flags & SCF_DO_STCLASS_OR) {
959 /* false positive possible if the class is case-folded */
961 ANYOF_BITMAP_SET(data->start_class, uc);
963 data->start_class->flags |= ANYOF_UNICODE_ALL;
964 data->start_class->flags &= ~ANYOF_EOS;
965 cl_and(data->start_class, &and_with);
967 flags &= ~SCF_DO_STCLASS;
969 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
970 I32 l = STR_LEN(scan);
971 UV uc = *((U8*)STRING(scan));
973 /* Search for fixed substrings supports EXACT only. */
974 if (flags & SCF_DO_SUBSTR)
975 scan_commit(pRExC_state, data);
977 U8 *s = (U8 *)STRING(scan);
978 l = utf8_length(s, s + l);
979 uc = utf8_to_uvchr(s, NULL);
982 if (data && (flags & SCF_DO_SUBSTR))
984 if (flags & SCF_DO_STCLASS_AND) {
985 /* Check whether it is compatible with what we know already! */
989 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
990 && !ANYOF_BITMAP_TEST(data->start_class, uc)
991 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
993 ANYOF_CLASS_ZERO(data->start_class);
994 ANYOF_BITMAP_ZERO(data->start_class);
996 ANYOF_BITMAP_SET(data->start_class, uc);
997 data->start_class->flags &= ~ANYOF_EOS;
998 data->start_class->flags |= ANYOF_FOLD;
999 if (OP(scan) == EXACTFL)
1000 data->start_class->flags |= ANYOF_LOCALE;
1003 else if (flags & SCF_DO_STCLASS_OR) {
1004 if (data->start_class->flags & ANYOF_FOLD) {
1005 /* false positive possible if the class is case-folded.
1006 Assume that the locale settings are the same... */
1008 ANYOF_BITMAP_SET(data->start_class, uc);
1009 data->start_class->flags &= ~ANYOF_EOS;
1011 cl_and(data->start_class, &and_with);
1013 flags &= ~SCF_DO_STCLASS;
1015 else if (strchr((char*)PL_varies,OP(scan))) {
1016 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1017 I32 f = flags, pos_before = 0;
1018 regnode *oscan = scan;
1019 struct regnode_charclass_class this_class;
1020 struct regnode_charclass_class *oclass = NULL;
1021 I32 next_is_eval = 0;
1023 switch (PL_regkind[(U8)OP(scan)]) {
1024 case WHILEM: /* End of (?:...)* . */
1025 scan = NEXTOPER(scan);
1028 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1029 next = NEXTOPER(scan);
1030 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1032 maxcount = REG_INFTY;
1033 next = regnext(scan);
1034 scan = NEXTOPER(scan);
1038 if (flags & SCF_DO_SUBSTR)
1043 if (flags & SCF_DO_STCLASS) {
1045 maxcount = REG_INFTY;
1046 next = regnext(scan);
1047 scan = NEXTOPER(scan);
1050 is_inf = is_inf_internal = 1;
1051 scan = regnext(scan);
1052 if (flags & SCF_DO_SUBSTR) {
1053 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1054 data->longest = &(data->longest_float);
1056 goto optimize_curly_tail;
1058 mincount = ARG1(scan);
1059 maxcount = ARG2(scan);
1060 next = regnext(scan);
1061 if (OP(scan) == CURLYX) {
1062 I32 lp = (data ? *(data->last_closep) : 0);
1064 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1066 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1067 next_is_eval = (OP(scan) == EVAL);
1069 if (flags & SCF_DO_SUBSTR) {
1070 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1071 pos_before = data->pos_min;
1075 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1077 data->flags |= SF_IS_INF;
1079 if (flags & SCF_DO_STCLASS) {
1080 cl_init(pRExC_state, &this_class);
1081 oclass = data->start_class;
1082 data->start_class = &this_class;
1083 f |= SCF_DO_STCLASS_AND;
1084 f &= ~SCF_DO_STCLASS_OR;
1086 /* These are the cases when once a subexpression
1087 fails at a particular position, it cannot succeed
1088 even after backtracking at the enclosing scope.
1090 XXXX what if minimal match and we are at the
1091 initial run of {n,m}? */
1092 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1093 f &= ~SCF_WHILEM_VISITED_POS;
1095 /* This will finish on WHILEM, setting scan, or on NULL: */
1096 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1098 ? (f & ~SCF_DO_SUBSTR) : f);
1100 if (flags & SCF_DO_STCLASS)
1101 data->start_class = oclass;
1102 if (mincount == 0 || minnext == 0) {
1103 if (flags & SCF_DO_STCLASS_OR) {
1104 cl_or(pRExC_state, data->start_class, &this_class);
1106 else if (flags & SCF_DO_STCLASS_AND) {
1107 /* Switch to OR mode: cache the old value of
1108 * data->start_class */
1109 StructCopy(data->start_class, &and_with,
1110 struct regnode_charclass_class);
1111 flags &= ~SCF_DO_STCLASS_AND;
1112 StructCopy(&this_class, data->start_class,
1113 struct regnode_charclass_class);
1114 flags |= SCF_DO_STCLASS_OR;
1115 data->start_class->flags |= ANYOF_EOS;
1117 } else { /* Non-zero len */
1118 if (flags & SCF_DO_STCLASS_OR) {
1119 cl_or(pRExC_state, data->start_class, &this_class);
1120 cl_and(data->start_class, &and_with);
1122 else if (flags & SCF_DO_STCLASS_AND)
1123 cl_and(data->start_class, &this_class);
1124 flags &= ~SCF_DO_STCLASS;
1126 if (!scan) /* It was not CURLYX, but CURLY. */
1128 if (ckWARN(WARN_REGEXP)
1129 /* ? quantifier ok, except for (?{ ... }) */
1130 && (next_is_eval || !(mincount == 0 && maxcount == 1))
1131 && (minnext == 0) && (deltanext == 0)
1132 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1133 && maxcount <= REG_INFTY/3) /* Complement check for big count */
1136 "Quantifier unexpected on zero-length expression");
1139 min += minnext * mincount;
1140 is_inf_internal |= ((maxcount == REG_INFTY
1141 && (minnext + deltanext) > 0)
1142 || deltanext == I32_MAX);
1143 is_inf |= is_inf_internal;
1144 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1146 /* Try powerful optimization CURLYX => CURLYN. */
1147 if ( OP(oscan) == CURLYX && data
1148 && data->flags & SF_IN_PAR
1149 && !(data->flags & SF_HAS_EVAL)
1150 && !deltanext && minnext == 1 ) {
1151 /* Try to optimize to CURLYN. */
1152 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1153 regnode *nxt1 = nxt;
1160 if (!strchr((char*)PL_simple,OP(nxt))
1161 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1162 && STR_LEN(nxt) == 1))
1168 if (OP(nxt) != CLOSE)
1170 /* Now we know that nxt2 is the only contents: */
1171 oscan->flags = ARG(nxt);
1173 OP(nxt1) = NOTHING; /* was OPEN. */
1175 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1176 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1177 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1178 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1179 OP(nxt + 1) = OPTIMIZED; /* was count. */
1180 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1185 /* Try optimization CURLYX => CURLYM. */
1186 if ( OP(oscan) == CURLYX && data
1187 && !(data->flags & SF_HAS_PAR)
1188 && !(data->flags & SF_HAS_EVAL)
1190 /* XXXX How to optimize if data == 0? */
1191 /* Optimize to a simpler form. */
1192 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1196 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1197 && (OP(nxt2) != WHILEM))
1199 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1200 /* Need to optimize away parenths. */
1201 if (data->flags & SF_IN_PAR) {
1202 /* Set the parenth number. */
1203 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1205 if (OP(nxt) != CLOSE)
1206 FAIL("Panic opt close");
1207 oscan->flags = ARG(nxt);
1208 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1209 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1211 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1212 OP(nxt + 1) = OPTIMIZED; /* was count. */
1213 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1214 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1217 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1218 regnode *nnxt = regnext(nxt1);
1221 if (reg_off_by_arg[OP(nxt1)])
1222 ARG_SET(nxt1, nxt2 - nxt1);
1223 else if (nxt2 - nxt1 < U16_MAX)
1224 NEXT_OFF(nxt1) = nxt2 - nxt1;
1226 OP(nxt) = NOTHING; /* Cannot beautify */
1231 /* Optimize again: */
1232 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1238 else if ((OP(oscan) == CURLYX)
1239 && (flags & SCF_WHILEM_VISITED_POS)
1240 /* See the comment on a similar expression above.
1241 However, this time it not a subexpression
1242 we care about, but the expression itself. */
1243 && (maxcount == REG_INFTY)
1244 && data && ++data->whilem_c < 16) {
1245 /* This stays as CURLYX, we can put the count/of pair. */
1246 /* Find WHILEM (as in regexec.c) */
1247 regnode *nxt = oscan + NEXT_OFF(oscan);
1249 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1251 PREVOPER(nxt)->flags = data->whilem_c
1252 | (RExC_whilem_seen << 4); /* On WHILEM */
1254 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1256 if (flags & SCF_DO_SUBSTR) {
1257 SV *last_str = Nullsv;
1258 int counted = mincount != 0;
1260 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1261 #if defined(SPARC64_GCC_WORKAROUND)
1267 if (pos_before >= data->last_start_min)
1270 b = data->last_start_min;
1273 s = SvPV(data->last_found, l);
1274 old = b - data->last_start_min;
1277 I32 b = pos_before >= data->last_start_min
1278 ? pos_before : data->last_start_min;
1280 char *s = SvPV(data->last_found, l);
1281 I32 old = b - data->last_start_min;
1285 old = utf8_hop((U8*)s, old) - (U8*)s;
1288 /* Get the added string: */
1289 last_str = newSVpvn(s + old, l);
1290 if (deltanext == 0 && pos_before == b) {
1291 /* What was added is a constant string */
1293 SvGROW(last_str, (mincount * l) + 1);
1294 repeatcpy(SvPVX(last_str) + l,
1295 SvPVX(last_str), l, mincount - 1);
1296 SvCUR(last_str) *= mincount;
1297 /* Add additional parts. */
1298 SvCUR_set(data->last_found,
1299 SvCUR(data->last_found) - l);
1300 sv_catsv(data->last_found, last_str);
1301 data->last_end += l * (mincount - 1);
1304 /* start offset must point into the last copy */
1305 data->last_start_min += minnext * (mincount - 1);
1306 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1307 * (minnext + data->pos_delta);
1310 /* It is counted once already... */
1311 data->pos_min += minnext * (mincount - counted);
1312 data->pos_delta += - counted * deltanext +
1313 (minnext + deltanext) * maxcount - minnext * mincount;
1314 if (mincount != maxcount) {
1315 /* Cannot extend fixed substrings found inside
1317 scan_commit(pRExC_state,data);
1318 if (mincount && last_str) {
1319 sv_setsv(data->last_found, last_str);
1320 data->last_end = data->pos_min;
1321 data->last_start_min =
1322 data->pos_min - CHR_SVLEN(last_str);
1323 data->last_start_max = is_inf
1325 : data->pos_min + data->pos_delta
1326 - CHR_SVLEN(last_str);
1328 data->longest = &(data->longest_float);
1330 SvREFCNT_dec(last_str);
1332 if (data && (fl & SF_HAS_EVAL))
1333 data->flags |= SF_HAS_EVAL;
1334 optimize_curly_tail:
1335 if (OP(oscan) != CURLYX) {
1336 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1338 NEXT_OFF(oscan) += NEXT_OFF(next);
1341 default: /* REF and CLUMP only? */
1342 if (flags & SCF_DO_SUBSTR) {
1343 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1344 data->longest = &(data->longest_float);
1346 is_inf = is_inf_internal = 1;
1347 if (flags & SCF_DO_STCLASS_OR)
1348 cl_anything(pRExC_state, data->start_class);
1349 flags &= ~SCF_DO_STCLASS;
1353 else if (strchr((char*)PL_simple,OP(scan))) {
1356 if (flags & SCF_DO_SUBSTR) {
1357 scan_commit(pRExC_state,data);
1361 if (flags & SCF_DO_STCLASS) {
1362 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1364 /* Some of the logic below assumes that switching
1365 locale on will only add false positives. */
1366 switch (PL_regkind[(U8)OP(scan)]) {
1370 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1371 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1372 cl_anything(pRExC_state, data->start_class);
1375 if (OP(scan) == SANY)
1377 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1378 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1379 || (data->start_class->flags & ANYOF_CLASS));
1380 cl_anything(pRExC_state, data->start_class);
1382 if (flags & SCF_DO_STCLASS_AND || !value)
1383 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1386 if (flags & SCF_DO_STCLASS_AND)
1387 cl_and(data->start_class,
1388 (struct regnode_charclass_class*)scan);
1390 cl_or(pRExC_state, data->start_class,
1391 (struct regnode_charclass_class*)scan);
1394 if (flags & SCF_DO_STCLASS_AND) {
1395 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1396 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1397 for (value = 0; value < 256; value++)
1398 if (!isALNUM(value))
1399 ANYOF_BITMAP_CLEAR(data->start_class, value);
1403 if (data->start_class->flags & ANYOF_LOCALE)
1404 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1406 for (value = 0; value < 256; value++)
1408 ANYOF_BITMAP_SET(data->start_class, value);
1413 if (flags & SCF_DO_STCLASS_AND) {
1414 if (data->start_class->flags & ANYOF_LOCALE)
1415 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1418 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1419 data->start_class->flags |= ANYOF_LOCALE;
1423 if (flags & SCF_DO_STCLASS_AND) {
1424 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1425 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1426 for (value = 0; value < 256; value++)
1428 ANYOF_BITMAP_CLEAR(data->start_class, value);
1432 if (data->start_class->flags & ANYOF_LOCALE)
1433 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1435 for (value = 0; value < 256; value++)
1436 if (!isALNUM(value))
1437 ANYOF_BITMAP_SET(data->start_class, value);
1442 if (flags & SCF_DO_STCLASS_AND) {
1443 if (data->start_class->flags & ANYOF_LOCALE)
1444 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1447 data->start_class->flags |= ANYOF_LOCALE;
1448 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1452 if (flags & SCF_DO_STCLASS_AND) {
1453 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1454 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1455 for (value = 0; value < 256; value++)
1456 if (!isSPACE(value))
1457 ANYOF_BITMAP_CLEAR(data->start_class, value);
1461 if (data->start_class->flags & ANYOF_LOCALE)
1462 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1464 for (value = 0; value < 256; value++)
1466 ANYOF_BITMAP_SET(data->start_class, value);
1471 if (flags & SCF_DO_STCLASS_AND) {
1472 if (data->start_class->flags & ANYOF_LOCALE)
1473 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1476 data->start_class->flags |= ANYOF_LOCALE;
1477 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1481 if (flags & SCF_DO_STCLASS_AND) {
1482 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1483 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1484 for (value = 0; value < 256; value++)
1486 ANYOF_BITMAP_CLEAR(data->start_class, value);
1490 if (data->start_class->flags & ANYOF_LOCALE)
1491 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1493 for (value = 0; value < 256; value++)
1494 if (!isSPACE(value))
1495 ANYOF_BITMAP_SET(data->start_class, value);
1500 if (flags & SCF_DO_STCLASS_AND) {
1501 if (data->start_class->flags & ANYOF_LOCALE) {
1502 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1503 for (value = 0; value < 256; value++)
1504 if (!isSPACE(value))
1505 ANYOF_BITMAP_CLEAR(data->start_class, value);
1509 data->start_class->flags |= ANYOF_LOCALE;
1510 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1514 if (flags & SCF_DO_STCLASS_AND) {
1515 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1516 for (value = 0; value < 256; value++)
1517 if (!isDIGIT(value))
1518 ANYOF_BITMAP_CLEAR(data->start_class, value);
1521 if (data->start_class->flags & ANYOF_LOCALE)
1522 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1524 for (value = 0; value < 256; value++)
1526 ANYOF_BITMAP_SET(data->start_class, value);
1531 if (flags & SCF_DO_STCLASS_AND) {
1532 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1533 for (value = 0; value < 256; value++)
1535 ANYOF_BITMAP_CLEAR(data->start_class, value);
1538 if (data->start_class->flags & ANYOF_LOCALE)
1539 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1541 for (value = 0; value < 256; value++)
1542 if (!isDIGIT(value))
1543 ANYOF_BITMAP_SET(data->start_class, value);
1548 if (flags & SCF_DO_STCLASS_OR)
1549 cl_and(data->start_class, &and_with);
1550 flags &= ~SCF_DO_STCLASS;
1553 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1554 data->flags |= (OP(scan) == MEOL
1558 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1559 /* Lookbehind, or need to calculate parens/evals/stclass: */
1560 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1561 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1562 /* Lookahead/lookbehind */
1563 I32 deltanext, minnext, fake = 0;
1565 struct regnode_charclass_class intrnl;
1568 data_fake.flags = 0;
1570 data_fake.whilem_c = data->whilem_c;
1571 data_fake.last_closep = data->last_closep;
1574 data_fake.last_closep = &fake;
1575 if ( flags & SCF_DO_STCLASS && !scan->flags
1576 && OP(scan) == IFMATCH ) { /* Lookahead */
1577 cl_init(pRExC_state, &intrnl);
1578 data_fake.start_class = &intrnl;
1579 f |= SCF_DO_STCLASS_AND;
1581 if (flags & SCF_WHILEM_VISITED_POS)
1582 f |= SCF_WHILEM_VISITED_POS;
1583 next = regnext(scan);
1584 nscan = NEXTOPER(NEXTOPER(scan));
1585 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1588 vFAIL("Variable length lookbehind not implemented");
1590 else if (minnext > U8_MAX) {
1591 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1593 scan->flags = minnext;
1595 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1597 if (data && (data_fake.flags & SF_HAS_EVAL))
1598 data->flags |= SF_HAS_EVAL;
1600 data->whilem_c = data_fake.whilem_c;
1601 if (f & SCF_DO_STCLASS_AND) {
1602 int was = (data->start_class->flags & ANYOF_EOS);
1604 cl_and(data->start_class, &intrnl);
1606 data->start_class->flags |= ANYOF_EOS;
1609 else if (OP(scan) == OPEN) {
1612 else if (OP(scan) == CLOSE) {
1613 if (ARG(scan) == is_par) {
1614 next = regnext(scan);
1616 if ( next && (OP(next) != WHILEM) && next < last)
1617 is_par = 0; /* Disable optimization */
1620 *(data->last_closep) = ARG(scan);
1622 else if (OP(scan) == EVAL) {
1624 data->flags |= SF_HAS_EVAL;
1626 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1627 if (flags & SCF_DO_SUBSTR) {
1628 scan_commit(pRExC_state,data);
1629 data->longest = &(data->longest_float);
1631 is_inf = is_inf_internal = 1;
1632 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1633 cl_anything(pRExC_state, data->start_class);
1634 flags &= ~SCF_DO_STCLASS;
1636 /* Else: zero-length, ignore. */
1637 scan = regnext(scan);
1642 *deltap = is_inf_internal ? I32_MAX : delta;
1643 if (flags & SCF_DO_SUBSTR && is_inf)
1644 data->pos_delta = I32_MAX - data->pos_min;
1645 if (is_par > U8_MAX)
1647 if (is_par && pars==1 && data) {
1648 data->flags |= SF_IN_PAR;
1649 data->flags &= ~SF_HAS_PAR;
1651 else if (pars && data) {
1652 data->flags |= SF_HAS_PAR;
1653 data->flags &= ~SF_IN_PAR;
1655 if (flags & SCF_DO_STCLASS_OR)
1656 cl_and(data->start_class, &and_with);
1661 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1663 if (RExC_rx->data) {
1664 Renewc(RExC_rx->data,
1665 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1666 char, struct reg_data);
1667 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1668 RExC_rx->data->count += n;
1671 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1672 char, struct reg_data);
1673 New(1208, RExC_rx->data->what, n, U8);
1674 RExC_rx->data->count = n;
1676 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1677 return RExC_rx->data->count - n;
1681 Perl_reginitcolors(pTHX)
1684 char *s = PerlEnv_getenv("PERL_RE_COLORS");
1687 PL_colors[0] = s = savepv(s);
1689 s = strchr(s, '\t');
1695 PL_colors[i] = s = "";
1699 PL_colors[i++] = "";
1706 - pregcomp - compile a regular expression into internal code
1708 * We can't allocate space until we know how big the compiled form will be,
1709 * but we can't compile it (and thus know how big it is) until we've got a
1710 * place to put the code. So we cheat: we compile it twice, once with code
1711 * generation turned off and size counting turned on, and once "for real".
1712 * This also means that we don't allocate space until we are sure that the
1713 * thing really will compile successfully, and we never have to move the
1714 * code and thus invalidate pointers into it. (Note that it has to be in
1715 * one piece because free() must be able to free it all.) [NB: not true in perl]
1717 * Beware that the optimization-preparation code in here knows about some
1718 * of the structure of the compiled regexp. [I'll say.]
1721 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1731 RExC_state_t RExC_state;
1732 RExC_state_t *pRExC_state = &RExC_state;
1735 FAIL("NULL regexp argument");
1737 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1741 if (!PL_colorset) reginitcolors();
1742 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1743 PL_colors[4],PL_colors[5],PL_colors[0],
1744 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1746 RExC_flags16 = pm->op_pmflags;
1750 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1751 RExC_seen_evals = 0;
1754 /* First pass: determine size, legality. */
1761 RExC_emit = &PL_regdummy;
1762 RExC_whilem_seen = 0;
1763 #if 0 /* REGC() is (currently) a NOP at the first pass.
1764 * Clever compilers notice this and complain. --jhi */
1765 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1767 if (reg(pRExC_state, 0, &flags) == NULL) {
1768 RExC_precomp = Nullch;
1771 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1773 /* Small enough for pointer-storage convention?
1774 If extralen==0, this means that we will not need long jumps. */
1775 if (RExC_size >= 0x10000L && RExC_extralen)
1776 RExC_size += RExC_extralen;
1779 if (RExC_whilem_seen > 15)
1780 RExC_whilem_seen = 15;
1782 /* Allocate space and initialize. */
1783 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1786 FAIL("Regexp out of space");
1789 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1790 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1793 r->prelen = xend - exp;
1794 r->precomp = savepvn(RExC_precomp, r->prelen);
1796 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1797 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1799 r->substrs = 0; /* Useful during FAIL. */
1800 r->startp = 0; /* Useful during FAIL. */
1801 r->endp = 0; /* Useful during FAIL. */
1803 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1805 r->offsets[0] = RExC_size;
1807 DEBUG_r(PerlIO_printf(Perl_debug_log,
1808 "%s %"UVuf" bytes for offset annotations.\n",
1809 r->offsets ? "Got" : "Couldn't get",
1810 (UV)((2*RExC_size+1) * sizeof(U32))));
1814 /* Second pass: emit code. */
1815 RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
1820 RExC_emit_start = r->program;
1821 RExC_emit = r->program;
1822 /* Store the count of eval-groups for security checks: */
1823 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1824 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1826 if (reg(pRExC_state, 0, &flags) == NULL)
1829 /* Dig out information for optimizations. */
1830 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1831 pm->op_pmflags = RExC_flags16;
1833 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1834 r->regstclass = NULL;
1835 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1836 r->reganch |= ROPT_NAUGHTY;
1837 scan = r->program + 1; /* First BRANCH. */
1839 /* XXXX To minimize changes to RE engine we always allocate
1840 3-units-long substrs field. */
1841 Newz(1004, r->substrs, 1, struct reg_substr_data);
1843 StructCopy(&zero_scan_data, &data, scan_data_t);
1844 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1845 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1847 STRLEN longest_float_length, longest_fixed_length;
1848 struct regnode_charclass_class ch_class;
1853 /* Skip introductions and multiplicators >= 1. */
1854 while ((OP(first) == OPEN && (sawopen = 1)) ||
1855 /* An OR of *one* alternative - should not happen now. */
1856 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1857 (OP(first) == PLUS) ||
1858 (OP(first) == MINMOD) ||
1859 /* An {n,m} with n>0 */
1860 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1861 if (OP(first) == PLUS)
1864 first += regarglen[(U8)OP(first)];
1865 first = NEXTOPER(first);
1868 /* Starting-point info. */
1870 if (PL_regkind[(U8)OP(first)] == EXACT) {
1871 if (OP(first) == EXACT)
1872 ; /* Empty, get anchored substr later. */
1873 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1874 r->regstclass = first;
1876 else if (strchr((char*)PL_simple,OP(first)))
1877 r->regstclass = first;
1878 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1879 PL_regkind[(U8)OP(first)] == NBOUND)
1880 r->regstclass = first;
1881 else if (PL_regkind[(U8)OP(first)] == BOL) {
1882 r->reganch |= (OP(first) == MBOL
1884 : (OP(first) == SBOL
1887 first = NEXTOPER(first);
1890 else if (OP(first) == GPOS) {
1891 r->reganch |= ROPT_ANCH_GPOS;
1892 first = NEXTOPER(first);
1895 else if (!sawopen && (OP(first) == STAR &&
1896 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1897 !(r->reganch & ROPT_ANCH) )
1899 /* turn .* into ^.* with an implied $*=1 */
1900 int type = OP(NEXTOPER(first));
1902 if (type == REG_ANY)
1903 type = ROPT_ANCH_MBOL;
1905 type = ROPT_ANCH_SBOL;
1907 r->reganch |= type | ROPT_IMPLICIT;
1908 first = NEXTOPER(first);
1911 if (sawplus && (!sawopen || !RExC_sawback)
1912 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1913 /* x+ must match at the 1st pos of run of x's */
1914 r->reganch |= ROPT_SKIP;
1916 /* Scan is after the zeroth branch, first is atomic matcher. */
1917 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1918 (IV)(first - scan + 1)));
1920 * If there's something expensive in the r.e., find the
1921 * longest literal string that must appear and make it the
1922 * regmust. Resolve ties in favor of later strings, since
1923 * the regstart check works with the beginning of the r.e.
1924 * and avoiding duplication strengthens checking. Not a
1925 * strong reason, but sufficient in the absence of others.
1926 * [Now we resolve ties in favor of the earlier string if
1927 * it happens that c_offset_min has been invalidated, since the
1928 * earlier string may buy us something the later one won't.]
1932 data.longest_fixed = newSVpvn("",0);
1933 data.longest_float = newSVpvn("",0);
1934 data.last_found = newSVpvn("",0);
1935 data.longest = &(data.longest_fixed);
1937 if (!r->regstclass) {
1938 cl_init(pRExC_state, &ch_class);
1939 data.start_class = &ch_class;
1940 stclass_flag = SCF_DO_STCLASS_AND;
1941 } else /* XXXX Check for BOUND? */
1943 data.last_closep = &last_close;
1945 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1946 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1947 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1948 && data.last_start_min == 0 && data.last_end > 0
1949 && !RExC_seen_zerolen
1950 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1951 r->reganch |= ROPT_CHECK_ALL;
1952 scan_commit(pRExC_state, &data);
1953 SvREFCNT_dec(data.last_found);
1955 longest_float_length = CHR_SVLEN(data.longest_float);
1956 if (longest_float_length
1957 || (data.flags & SF_FL_BEFORE_EOL
1958 && (!(data.flags & SF_FL_BEFORE_MEOL)
1959 || (RExC_flags16 & PMf_MULTILINE)))) {
1962 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1963 && data.offset_fixed == data.offset_float_min
1964 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1965 goto remove_float; /* As in (a)+. */
1967 r->float_substr = data.longest_float;
1968 r->float_min_offset = data.offset_float_min;
1969 r->float_max_offset = data.offset_float_max;
1970 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1971 && (!(data.flags & SF_FL_BEFORE_MEOL)
1972 || (RExC_flags16 & PMf_MULTILINE)));
1973 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
1977 r->float_substr = Nullsv;
1978 SvREFCNT_dec(data.longest_float);
1979 longest_float_length = 0;
1982 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1983 if (longest_fixed_length
1984 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1985 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1986 || (RExC_flags16 & PMf_MULTILINE)))) {
1989 r->anchored_substr = data.longest_fixed;
1990 r->anchored_offset = data.offset_fixed;
1991 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1992 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1993 || (RExC_flags16 & PMf_MULTILINE)));
1994 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
1997 r->anchored_substr = Nullsv;
1998 SvREFCNT_dec(data.longest_fixed);
1999 longest_fixed_length = 0;
2002 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2003 r->regstclass = NULL;
2004 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
2005 && !(data.start_class->flags & ANYOF_EOS)
2006 && !cl_is_anything(data.start_class)) {
2007 I32 n = add_data(pRExC_state, 1, "f");
2009 New(1006, RExC_rx->data->data[n], 1,
2010 struct regnode_charclass_class);
2011 StructCopy(data.start_class,
2012 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2013 struct regnode_charclass_class);
2014 r->regstclass = (regnode*)RExC_rx->data->data[n];
2015 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2016 PL_regdata = r->data; /* for regprop() */
2017 DEBUG_r({ SV *sv = sv_newmortal();
2018 regprop(sv, (regnode*)data.start_class);
2019 PerlIO_printf(Perl_debug_log,
2020 "synthetic stclass `%s'.\n",
2024 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2025 if (longest_fixed_length > longest_float_length) {
2026 r->check_substr = r->anchored_substr;
2027 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2028 if (r->reganch & ROPT_ANCH_SINGLE)
2029 r->reganch |= ROPT_NOSCAN;
2032 r->check_substr = r->float_substr;
2033 r->check_offset_min = data.offset_float_min;
2034 r->check_offset_max = data.offset_float_max;
2036 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2037 This should be changed ASAP! */
2038 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
2039 r->reganch |= RE_USE_INTUIT;
2040 if (SvTAIL(r->check_substr))
2041 r->reganch |= RE_INTUIT_TAIL;
2045 /* Several toplevels. Best we can is to set minlen. */
2047 struct regnode_charclass_class ch_class;
2050 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2051 scan = r->program + 1;
2052 cl_init(pRExC_state, &ch_class);
2053 data.start_class = &ch_class;
2054 data.last_closep = &last_close;
2055 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2056 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
2057 if (!(data.start_class->flags & ANYOF_EOS)
2058 && !cl_is_anything(data.start_class)) {
2059 I32 n = add_data(pRExC_state, 1, "f");
2061 New(1006, RExC_rx->data->data[n], 1,
2062 struct regnode_charclass_class);
2063 StructCopy(data.start_class,
2064 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2065 struct regnode_charclass_class);
2066 r->regstclass = (regnode*)RExC_rx->data->data[n];
2067 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2068 DEBUG_r({ SV* sv = sv_newmortal();
2069 regprop(sv, (regnode*)data.start_class);
2070 PerlIO_printf(Perl_debug_log,
2071 "synthetic stclass `%s'.\n",
2077 if (RExC_seen & REG_SEEN_GPOS)
2078 r->reganch |= ROPT_GPOS_SEEN;
2079 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2080 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2081 if (RExC_seen & REG_SEEN_EVAL)
2082 r->reganch |= ROPT_EVAL_SEEN;
2083 if (RExC_seen & REG_SEEN_CANY)
2084 r->reganch |= ROPT_CANY_SEEN;
2085 Newz(1002, r->startp, RExC_npar, I32);
2086 Newz(1002, r->endp, RExC_npar, I32);
2087 PL_regdata = r->data; /* for regprop() */
2088 DEBUG_r(regdump(r));
2093 - reg - regular expression, i.e. main body or parenthesized thing
2095 * Caller must absorb opening parenthesis.
2097 * Combining parenthesis handling with the base level of regular expression
2098 * is a trifle forced, but the need to tie the tails of the branches to what
2099 * follows makes it hard to avoid.
2102 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2103 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2105 register regnode *ret; /* Will be the head of the group. */
2106 register regnode *br;
2107 register regnode *lastbr;
2108 register regnode *ender = 0;
2109 register I32 parno = 0;
2110 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
2112 /* for (?g), (?gc), and (?o) warnings; warning
2113 about (?c) will warn about (?g) -- japhy */
2115 I32 wastedflags = 0x00,
2118 wasted_gc = 0x02 | 0x04,
2121 char * parse_start = RExC_parse; /* MJD */
2122 char *oregcomp_parse = RExC_parse;
2125 *flagp = 0; /* Tentatively. */
2128 /* Make an OPEN node, if parenthesized. */
2130 if (*RExC_parse == '?') { /* (?...) */
2131 U16 posflags = 0, negflags = 0;
2132 U16 *flagsp = &posflags;
2134 char *seqstart = RExC_parse;
2137 paren = *RExC_parse++;
2138 ret = NULL; /* For look-ahead/behind. */
2140 case '<': /* (?<...) */
2141 RExC_seen |= REG_SEEN_LOOKBEHIND;
2142 if (*RExC_parse == '!')
2144 if (*RExC_parse != '=' && *RExC_parse != '!')
2147 case '=': /* (?=...) */
2148 case '!': /* (?!...) */
2149 RExC_seen_zerolen++;
2150 case ':': /* (?:...) */
2151 case '>': /* (?>...) */
2153 case '$': /* (?$...) */
2154 case '@': /* (?@...) */
2155 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2157 case '#': /* (?#...) */
2158 while (*RExC_parse && *RExC_parse != ')')
2160 if (*RExC_parse != ')')
2161 FAIL("Sequence (?#... not terminated");
2162 nextchar(pRExC_state);
2165 case 'p': /* (?p...) */
2166 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2167 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2169 case '?': /* (??...) */
2171 if (*RExC_parse != '{')
2173 paren = *RExC_parse++;
2175 case '{': /* (?{...}) */
2177 I32 count = 1, n = 0;
2179 char *s = RExC_parse;
2181 OP_4tree *sop, *rop;
2183 RExC_seen_zerolen++;
2184 RExC_seen |= REG_SEEN_EVAL;
2185 while (count && (c = *RExC_parse)) {
2186 if (c == '\\' && RExC_parse[1])
2194 if (*RExC_parse != ')')
2197 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2202 if (RExC_parse - 1 - s)
2203 sv = newSVpvn(s, RExC_parse - 1 - s);
2205 sv = newSVpvn("", 0);
2208 Perl_save_re_context(aTHX);
2209 rop = sv_compile_2op(sv, &sop, "re", &av);
2210 sop->op_private |= OPpREFCOUNTED;
2211 /* re_dup will OpREFCNT_inc */
2212 OpREFCNT_set(sop, 1);
2215 n = add_data(pRExC_state, 3, "nop");
2216 RExC_rx->data->data[n] = (void*)rop;
2217 RExC_rx->data->data[n+1] = (void*)sop;
2218 RExC_rx->data->data[n+2] = (void*)av;
2221 else { /* First pass */
2222 if (PL_reginterp_cnt < ++RExC_seen_evals
2223 && PL_curcop != &PL_compiling)
2224 /* No compiled RE interpolated, has runtime
2225 components ===> unsafe. */
2226 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2227 if (PL_tainting && PL_tainted)
2228 FAIL("Eval-group in insecure regular expression");
2231 nextchar(pRExC_state);
2233 ret = reg_node(pRExC_state, LOGICAL);
2236 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2237 /* deal with the length of this later - MJD */
2240 return reganode(pRExC_state, EVAL, n);
2242 case '(': /* (?(?{...})...) and (?(?=...)...) */
2244 if (RExC_parse[0] == '?') { /* (?(?...)) */
2245 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2246 || RExC_parse[1] == '<'
2247 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2250 ret = reg_node(pRExC_state, LOGICAL);
2253 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2257 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2259 parno = atoi(RExC_parse++);
2261 while (isDIGIT(*RExC_parse))
2263 ret = reganode(pRExC_state, GROUPP, parno);
2265 if ((c = *nextchar(pRExC_state)) != ')')
2266 vFAIL("Switch condition not recognized");
2268 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2269 br = regbranch(pRExC_state, &flags, 1);
2271 br = reganode(pRExC_state, LONGJMP, 0);
2273 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2274 c = *nextchar(pRExC_state);
2278 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2279 regbranch(pRExC_state, &flags, 1);
2280 regtail(pRExC_state, ret, lastbr);
2283 c = *nextchar(pRExC_state);
2288 vFAIL("Switch (?(condition)... contains too many branches");
2289 ender = reg_node(pRExC_state, TAIL);
2290 regtail(pRExC_state, br, ender);
2292 regtail(pRExC_state, lastbr, ender);
2293 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2296 regtail(pRExC_state, ret, ender);
2300 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2304 RExC_parse--; /* for vFAIL to print correctly */
2305 vFAIL("Sequence (? incomplete");
2309 parse_flags: /* (?i) */
2310 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2311 /* (?g), (?gc) and (?o) are useless here
2312 and must be globally applied -- japhy */
2314 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2315 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2316 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2317 if (! (wastedflags & wflagbit) ) {
2318 wastedflags |= wflagbit;
2321 "Useless (%s%c) - %suse /%c modifier",
2322 flagsp == &negflags ? "?-" : "?",
2324 flagsp == &negflags ? "don't " : "",
2330 else if (*RExC_parse == 'c') {
2331 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2332 if (! (wastedflags & wasted_c) ) {
2333 wastedflags |= wasted_gc;
2336 "Useless (%sc) - %suse /gc modifier",
2337 flagsp == &negflags ? "?-" : "?",
2338 flagsp == &negflags ? "don't " : ""
2343 else { pmflag(flagsp, *RExC_parse); }
2347 if (*RExC_parse == '-') {
2349 wastedflags = 0; /* reset so (?g-c) warns twice */
2353 RExC_flags16 |= posflags;
2354 RExC_flags16 &= ~negflags;
2355 if (*RExC_parse == ':') {
2361 if (*RExC_parse != ')') {
2363 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2365 nextchar(pRExC_state);
2373 ret = reganode(pRExC_state, OPEN, parno);
2374 Set_Node_Length(ret, 1); /* MJD */
2375 Set_Node_Offset(ret, RExC_parse); /* MJD */
2382 /* Pick up the branches, linking them together. */
2383 parse_start = RExC_parse; /* MJD */
2384 br = regbranch(pRExC_state, &flags, 1);
2385 /* branch_len = (paren != 0); */
2389 if (*RExC_parse == '|') {
2390 if (!SIZE_ONLY && RExC_extralen) {
2391 reginsert(pRExC_state, BRANCHJ, br);
2394 reginsert(pRExC_state, BRANCH, br);
2395 Set_Node_Length(br, paren != 0);
2396 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2400 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2402 else if (paren == ':') {
2403 *flagp |= flags&SIMPLE;
2405 if (open) { /* Starts with OPEN. */
2406 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2408 else if (paren != '?') /* Not Conditional */
2410 *flagp |= flags & (SPSTART | HASWIDTH);
2412 while (*RExC_parse == '|') {
2413 if (!SIZE_ONLY && RExC_extralen) {
2414 ender = reganode(pRExC_state, LONGJMP,0);
2415 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2418 RExC_extralen += 2; /* Account for LONGJMP. */
2419 nextchar(pRExC_state);
2420 br = regbranch(pRExC_state, &flags, 0);
2424 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2428 *flagp |= flags&SPSTART;
2431 if (have_branch || paren != ':') {
2432 /* Make a closing node, and hook it on the end. */
2435 ender = reg_node(pRExC_state, TAIL);
2438 ender = reganode(pRExC_state, CLOSE, parno);
2439 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2440 Set_Node_Length(ender,1); /* MJD */
2446 *flagp &= ~HASWIDTH;
2449 ender = reg_node(pRExC_state, SUCCEED);
2452 ender = reg_node(pRExC_state, END);
2455 regtail(pRExC_state, lastbr, ender);
2458 /* Hook the tails of the branches to the closing node. */
2459 for (br = ret; br != NULL; br = regnext(br)) {
2460 regoptail(pRExC_state, br, ender);
2467 static char parens[] = "=!<,>";
2469 if (paren && (p = strchr(parens, paren))) {
2470 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2471 int flag = (p - parens) > 1;
2474 node = SUSPEND, flag = 0;
2475 reginsert(pRExC_state, node,ret);
2477 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2481 /* Check for proper termination. */
2483 RExC_flags16 = oregflags;
2484 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2485 RExC_parse = oregcomp_parse;
2486 vFAIL("Unmatched (");
2489 else if (!paren && RExC_parse < RExC_end) {
2490 if (*RExC_parse == ')') {
2492 vFAIL("Unmatched )");
2495 FAIL("Junk on end of regexp"); /* "Can't happen". */
2503 - regbranch - one alternative of an | operator
2505 * Implements the concatenation operator.
2508 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2510 register regnode *ret;
2511 register regnode *chain = NULL;
2512 register regnode *latest;
2513 I32 flags = 0, c = 0;
2518 if (!SIZE_ONLY && RExC_extralen)
2519 ret = reganode(pRExC_state, BRANCHJ,0);
2521 ret = reg_node(pRExC_state, BRANCH);
2522 Set_Node_Length(ret, 1);
2526 if (!first && SIZE_ONLY)
2527 RExC_extralen += 1; /* BRANCHJ */
2529 *flagp = WORST; /* Tentatively. */
2532 nextchar(pRExC_state);
2533 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2535 latest = regpiece(pRExC_state, &flags);
2536 if (latest == NULL) {
2537 if (flags & TRYAGAIN)
2541 else if (ret == NULL)
2543 *flagp |= flags&HASWIDTH;
2544 if (chain == NULL) /* First piece. */
2545 *flagp |= flags&SPSTART;
2548 regtail(pRExC_state, chain, latest);
2553 if (chain == NULL) { /* Loop ran zero times. */
2554 chain = reg_node(pRExC_state, NOTHING);
2559 *flagp |= flags&SIMPLE;
2566 - regpiece - something followed by possible [*+?]
2568 * Note that the branching code sequences used for ? and the general cases
2569 * of * and + are somewhat optimized: they use the same NOTHING node as
2570 * both the endmarker for their branch list and the body of the last branch.
2571 * It might seem that this node could be dispensed with entirely, but the
2572 * endmarker role is not redundant.
2575 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2577 register regnode *ret;
2579 register char *next;
2581 char *origparse = RExC_parse;
2584 I32 max = REG_INFTY;
2587 ret = regatom(pRExC_state, &flags);
2589 if (flags & TRYAGAIN)
2596 if (op == '{' && regcurly(RExC_parse)) {
2597 parse_start = RExC_parse; /* MJD */
2598 next = RExC_parse + 1;
2600 while (isDIGIT(*next) || *next == ',') {
2609 if (*next == '}') { /* got one */
2613 min = atoi(RExC_parse);
2617 maxpos = RExC_parse;
2619 if (!max && *maxpos != '0')
2620 max = REG_INFTY; /* meaning "infinity" */
2621 else if (max >= REG_INFTY)
2622 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2624 nextchar(pRExC_state);
2627 if ((flags&SIMPLE)) {
2628 RExC_naughty += 2 + RExC_naughty / 2;
2629 reginsert(pRExC_state, CURLY, ret);
2630 Set_Node_Offset(ret, parse_start+1); /* MJD */
2631 Set_Node_Cur_Length(ret);
2634 regnode *w = reg_node(pRExC_state, WHILEM);
2637 regtail(pRExC_state, ret, w);
2638 if (!SIZE_ONLY && RExC_extralen) {
2639 reginsert(pRExC_state, LONGJMP,ret);
2640 reginsert(pRExC_state, NOTHING,ret);
2641 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2643 reginsert(pRExC_state, CURLYX,ret);
2645 Set_Node_Offset(ret, parse_start+1);
2646 Set_Node_Length(ret,
2647 op == '{' ? (RExC_parse - parse_start) : 1);
2649 if (!SIZE_ONLY && RExC_extralen)
2650 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2651 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2653 RExC_whilem_seen++, RExC_extralen += 3;
2654 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2662 if (max && max < min)
2663 vFAIL("Can't do {n,m} with n > m");
2678 #if 0 /* Now runtime fix should be reliable. */
2680 /* if this is reinstated, don't forget to put this back into perldiag:
2682 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2684 (F) The part of the regexp subject to either the * or + quantifier
2685 could match an empty string. The {#} shows in the regular
2686 expression about where the problem was discovered.
2690 if (!(flags&HASWIDTH) && op != '?')
2691 vFAIL("Regexp *+ operand could be empty");
2694 parse_start = RExC_parse;
2695 nextchar(pRExC_state);
2697 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2699 if (op == '*' && (flags&SIMPLE)) {
2700 reginsert(pRExC_state, STAR, ret);
2704 else if (op == '*') {
2708 else if (op == '+' && (flags&SIMPLE)) {
2709 reginsert(pRExC_state, PLUS, ret);
2713 else if (op == '+') {
2717 else if (op == '?') {
2722 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2724 "%.*s matches null string many times",
2725 RExC_parse - origparse,
2729 if (*RExC_parse == '?') {
2730 nextchar(pRExC_state);
2731 reginsert(pRExC_state, MINMOD, ret);
2732 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2734 if (ISMULT2(RExC_parse)) {
2736 vFAIL("Nested quantifiers");
2743 - regatom - the lowest level
2745 * Optimization: gobbles an entire sequence of ordinary characters so that
2746 * it can turn them into a single node, which is smaller to store and
2747 * faster to run. Backslashed characters are exceptions, each becoming a
2748 * separate node; the code is simpler that way and it's not worth fixing.
2750 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2752 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2754 register regnode *ret = 0;
2756 char *parse_start = 0;
2758 *flagp = WORST; /* Tentatively. */
2761 switch (*RExC_parse) {
2763 RExC_seen_zerolen++;
2764 nextchar(pRExC_state);
2765 if (RExC_flags16 & PMf_MULTILINE)
2766 ret = reg_node(pRExC_state, MBOL);
2767 else if (RExC_flags16 & PMf_SINGLELINE)
2768 ret = reg_node(pRExC_state, SBOL);
2770 ret = reg_node(pRExC_state, BOL);
2771 Set_Node_Length(ret, 1); /* MJD */
2774 nextchar(pRExC_state);
2776 RExC_seen_zerolen++;
2777 if (RExC_flags16 & PMf_MULTILINE)
2778 ret = reg_node(pRExC_state, MEOL);
2779 else if (RExC_flags16 & PMf_SINGLELINE)
2780 ret = reg_node(pRExC_state, SEOL);
2782 ret = reg_node(pRExC_state, EOL);
2783 Set_Node_Length(ret, 1); /* MJD */
2786 nextchar(pRExC_state);
2787 if (RExC_flags16 & PMf_SINGLELINE)
2788 ret = reg_node(pRExC_state, SANY);
2790 ret = reg_node(pRExC_state, REG_ANY);
2791 *flagp |= HASWIDTH|SIMPLE;
2793 Set_Node_Length(ret, 1); /* MJD */
2797 char *oregcomp_parse = ++RExC_parse;
2798 ret = regclass(pRExC_state);
2799 if (*RExC_parse != ']') {
2800 RExC_parse = oregcomp_parse;
2801 vFAIL("Unmatched [");
2803 nextchar(pRExC_state);
2804 *flagp |= HASWIDTH|SIMPLE;
2805 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2809 nextchar(pRExC_state);
2810 ret = reg(pRExC_state, 1, &flags);
2812 if (flags & TRYAGAIN) {
2813 if (RExC_parse == RExC_end) {
2814 /* Make parent create an empty node if needed. */
2822 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2826 if (flags & TRYAGAIN) {
2830 vFAIL("Internal urp");
2831 /* Supposed to be caught earlier. */
2834 if (!regcurly(RExC_parse)) {
2843 vFAIL("Quantifier follows nothing");
2846 switch (*++RExC_parse) {
2848 RExC_seen_zerolen++;
2849 ret = reg_node(pRExC_state, SBOL);
2851 nextchar(pRExC_state);
2852 Set_Node_Length(ret, 2); /* MJD */
2855 ret = reg_node(pRExC_state, GPOS);
2856 RExC_seen |= REG_SEEN_GPOS;
2858 nextchar(pRExC_state);
2859 Set_Node_Length(ret, 2); /* MJD */
2862 ret = reg_node(pRExC_state, SEOL);
2864 RExC_seen_zerolen++; /* Do not optimize RE away */
2865 nextchar(pRExC_state);
2868 ret = reg_node(pRExC_state, EOS);
2870 RExC_seen_zerolen++; /* Do not optimize RE away */
2871 nextchar(pRExC_state);
2872 Set_Node_Length(ret, 2); /* MJD */
2875 ret = reg_node(pRExC_state, CANY);
2876 RExC_seen |= REG_SEEN_CANY;
2877 *flagp |= HASWIDTH|SIMPLE;
2878 nextchar(pRExC_state);
2879 Set_Node_Length(ret, 2); /* MJD */
2882 ret = reg_node(pRExC_state, CLUMP);
2884 nextchar(pRExC_state);
2885 Set_Node_Length(ret, 2); /* MJD */
2888 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
2889 *flagp |= HASWIDTH|SIMPLE;
2890 nextchar(pRExC_state);
2891 Set_Node_Length(ret, 2); /* MJD */
2894 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
2895 *flagp |= HASWIDTH|SIMPLE;
2896 nextchar(pRExC_state);
2897 Set_Node_Length(ret, 2); /* MJD */
2900 RExC_seen_zerolen++;
2901 RExC_seen |= REG_SEEN_LOOKBEHIND;
2902 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
2904 nextchar(pRExC_state);
2905 Set_Node_Length(ret, 2); /* MJD */
2908 RExC_seen_zerolen++;
2909 RExC_seen |= REG_SEEN_LOOKBEHIND;
2910 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
2912 nextchar(pRExC_state);
2913 Set_Node_Length(ret, 2); /* MJD */
2916 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
2917 *flagp |= HASWIDTH|SIMPLE;
2918 nextchar(pRExC_state);
2919 Set_Node_Length(ret, 2); /* MJD */
2922 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
2923 *flagp |= HASWIDTH|SIMPLE;
2924 nextchar(pRExC_state);
2925 Set_Node_Length(ret, 2); /* MJD */
2928 ret = reg_node(pRExC_state, DIGIT);
2929 *flagp |= HASWIDTH|SIMPLE;
2930 nextchar(pRExC_state);
2931 Set_Node_Length(ret, 2); /* MJD */
2934 ret = reg_node(pRExC_state, NDIGIT);
2935 *flagp |= HASWIDTH|SIMPLE;
2936 nextchar(pRExC_state);
2937 Set_Node_Length(ret, 2); /* MJD */
2942 char* oldregxend = RExC_end;
2943 char* parse_start = RExC_parse;
2945 if (RExC_parse[1] == '{') {
2946 /* a lovely hack--pretend we saw [\pX] instead */
2947 RExC_end = strchr(RExC_parse, '}');
2949 U8 c = (U8)*RExC_parse;
2951 RExC_end = oldregxend;
2952 vFAIL2("Missing right brace on \\%c{}", c);
2957 RExC_end = RExC_parse + 2;
2960 ret = regclass(pRExC_state);
2962 RExC_end = oldregxend;
2964 Set_Node_Cur_Length(ret); /* MJD */
2965 nextchar(pRExC_state);
2966 *flagp |= HASWIDTH|SIMPLE;
2979 case '1': case '2': case '3': case '4':
2980 case '5': case '6': case '7': case '8': case '9':
2982 I32 num = atoi(RExC_parse);
2984 if (num > 9 && num >= RExC_npar)
2987 char * parse_start = RExC_parse - 1; /* MJD */
2988 while (isDIGIT(*RExC_parse))
2991 if (!SIZE_ONLY && num > RExC_rx->nparens)
2992 vFAIL("Reference to nonexistent group");
2994 ret = reganode(pRExC_state, FOLD
2995 ? (LOC ? REFFL : REFF)
2999 /* override incorrect value set in reganode MJD */
3000 Set_Node_Offset(ret, parse_start+1);
3001 Set_Node_Cur_Length(ret); /* MJD */
3003 nextchar(pRExC_state);
3008 if (RExC_parse >= RExC_end)
3009 FAIL("Trailing \\");
3012 /* Do not generate `unrecognized' warnings here, we fall
3013 back into the quick-grab loop below */
3019 if (RExC_flags16 & PMf_EXTENDED) {
3020 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3021 if (RExC_parse < RExC_end)
3027 register STRLEN len;
3033 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3035 parse_start = RExC_parse - 1;
3040 ret = reg_node(pRExC_state, FOLD
3041 ? (LOC ? EXACTFL : EXACTF)
3044 for (len = 0, p = RExC_parse - 1;
3045 len < 127 && p < RExC_end;
3050 if (RExC_flags16 & PMf_EXTENDED)
3051 p = regwhite(p, RExC_end);
3098 ender = ASCII_TO_NATIVE('\033');
3102 ender = ASCII_TO_NATIVE('\007');
3107 char* e = strchr(p, '}');
3111 vFAIL("Missing right brace on \\x{}");
3114 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3115 | PERL_SCAN_DISALLOW_PREFIX;
3117 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3120 /* numlen is generous */
3121 if (numlen + len >= 127) {
3129 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3131 ender = grok_hex(p, &numlen, &flags, NULL);
3137 ender = UCHARAT(p++);
3138 ender = toCTRL(ender);
3140 case '0': case '1': case '2': case '3':case '4':
3141 case '5': case '6': case '7': case '8':case '9':
3143 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3146 ender = grok_oct(p, &numlen, &flags, NULL);
3156 FAIL("Trailing \\");
3159 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3160 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3161 goto normal_default;
3166 if (UTF8_IS_START(*p) && UTF) {
3167 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3175 if (RExC_flags16 & PMf_EXTENDED)
3176 p = regwhite(p, RExC_end);
3178 /* Prime the casefolded buffer. */
3179 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3181 if (ISMULT2(p)) { /* Back off on ?+*. */
3186 /* Emit all the Unicode characters. */
3187 for (foldbuf = tmpbuf;
3189 foldlen -= numlen) {
3190 ender = utf8_to_uvchr(foldbuf, &numlen);
3192 reguni(pRExC_state, ender, s, &numlen);
3198 break; /* "Can't happen." */
3202 reguni(pRExC_state, ender, s, &numlen);
3217 /* Emit all the Unicode characters. */
3218 for (foldbuf = tmpbuf;
3220 foldlen -= numlen) {
3221 ender = utf8_to_uvchr(foldbuf, &numlen);
3223 reguni(pRExC_state, ender, s, &numlen);
3233 reguni(pRExC_state, ender, s, &numlen);
3246 Set_Node_Cur_Length(ret); /* MJD */
3247 nextchar(pRExC_state);
3249 /* len is STRLEN which is unsigned, need to copy to signed */
3252 vFAIL("Internal disaster");
3261 RExC_size += STR_SZ(len);
3263 RExC_emit += STR_SZ(len);
3268 /* If the encoding pragma is in effect recode the text of
3269 * any EXACT-kind nodes. */
3270 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3271 STRLEN oldlen = STR_LEN(ret);
3272 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3276 if (sv_utf8_downgrade(sv, TRUE)) {
3277 char *s = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
3278 STRLEN newlen = SvCUR(sv);
3281 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3282 (int)oldlen, STRING(ret),
3284 Copy(s, STRING(ret), newlen, char);
3285 STR_LEN(ret) += newlen - oldlen;
3286 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3288 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3296 S_regwhite(pTHX_ char *p, char *e)
3301 else if (*p == '#') {
3304 } while (p < e && *p != '\n');
3312 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3313 Character classes ([:foo:]) can also be negated ([:^foo:]).
3314 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3315 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3316 but trigger failures because they are currently unimplemented. */
3318 #define POSIXCC_DONE(c) ((c) == ':')
3319 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3320 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3323 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3326 I32 namedclass = OOB_NAMEDCLASS;
3328 if (value == '[' && RExC_parse + 1 < RExC_end &&
3329 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3330 POSIXCC(UCHARAT(RExC_parse))) {
3331 char c = UCHARAT(RExC_parse);
3332 char* s = RExC_parse++;
3334 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3336 if (RExC_parse == RExC_end)
3337 /* Grandfather lone [:, [=, [. */
3340 char* t = RExC_parse++; /* skip over the c */
3342 if (UCHARAT(RExC_parse) == ']') {
3343 RExC_parse++; /* skip over the ending ] */
3346 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3347 I32 skip = 5; /* the most common skip */
3351 if (strnEQ(posixcc, "alnum", 5))
3353 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3354 else if (strnEQ(posixcc, "alpha", 5))
3356 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3357 else if (strnEQ(posixcc, "ascii", 5))
3359 complement ? ANYOF_NASCII : ANYOF_ASCII;
3362 if (strnEQ(posixcc, "blank", 5))
3364 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3367 if (strnEQ(posixcc, "cntrl", 5))
3369 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3372 if (strnEQ(posixcc, "digit", 5))
3374 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3377 if (strnEQ(posixcc, "graph", 5))
3379 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3382 if (strnEQ(posixcc, "lower", 5))
3384 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3387 if (strnEQ(posixcc, "print", 5))
3389 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3390 else if (strnEQ(posixcc, "punct", 5))
3392 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3395 if (strnEQ(posixcc, "space", 5))
3397 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3400 if (strnEQ(posixcc, "upper", 5))
3402 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3404 case 'w': /* this is not POSIX, this is the Perl \w */
3405 if (strnEQ(posixcc, "word", 4)) {
3407 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3412 if (strnEQ(posixcc, "xdigit", 6)) {
3414 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3419 if (namedclass == OOB_NAMEDCLASS ||
3420 posixcc[skip] != ':' ||
3421 posixcc[skip+1] != ']')
3423 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3426 } else if (!SIZE_ONLY) {
3427 /* [[=foo=]] and [[.foo.]] are still future. */
3429 /* adjust RExC_parse so the warning shows after
3431 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3433 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3436 /* Maternal grandfather:
3437 * "[:" ending in ":" but not in ":]" */
3447 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3449 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
3450 POSIXCC(UCHARAT(RExC_parse))) {
3451 char *s = RExC_parse;
3454 while(*s && isALNUM(*s))
3456 if (*s && c == *s && s[1] == ']') {
3457 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3459 /* [[=foo=]] and [[.foo.]] are still future. */
3460 if (POSIXCC_NOTYET(c)) {
3461 /* adjust RExC_parse so the error shows after
3463 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3465 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3472 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3475 register UV nextvalue;
3476 register IV prevvalue = OOB_UNICODE;
3477 register IV range = 0;
3478 register regnode *ret;
3481 char *rangebegin = 0;
3482 bool need_class = 0;
3483 SV *listsv = Nullsv;
3486 bool optimize_invert = TRUE;
3487 AV* unicode_alternate = 0;
3489 ret = reganode(pRExC_state, ANYOF, 0);
3492 ANYOF_FLAGS(ret) = 0;
3494 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3498 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3502 RExC_size += ANYOF_SKIP;
3504 RExC_emit += ANYOF_SKIP;
3506 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3508 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3509 ANYOF_BITMAP_ZERO(ret);
3510 listsv = newSVpvn("# comment\n", 10);
3513 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3515 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && POSIXCC(nextvalue))
3516 checkposixcc(pRExC_state);
3518 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3519 if (UCHARAT(RExC_parse) == ']')
3522 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3526 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3529 rangebegin = RExC_parse;
3531 value = utf8n_to_uvchr((U8*)RExC_parse,
3532 RExC_end - RExC_parse,
3534 RExC_parse += numlen;
3537 value = UCHARAT(RExC_parse++);
3538 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3539 if (value == '[' && POSIXCC(nextvalue))
3540 namedclass = regpposixcc(pRExC_state, value);
3541 else if (value == '\\') {
3543 value = utf8n_to_uvchr((U8*)RExC_parse,
3544 RExC_end - RExC_parse,
3546 RExC_parse += numlen;
3549 value = UCHARAT(RExC_parse++);
3550 /* Some compilers cannot handle switching on 64-bit integer
3551 * values, therefore value cannot be an UV. Yes, this will
3552 * be a problem later if we want switch on Unicode.
3553 * A similar issue a little bit later when switching on
3554 * namedclass. --jhi */
3555 switch ((I32)value) {
3556 case 'w': namedclass = ANYOF_ALNUM; break;
3557 case 'W': namedclass = ANYOF_NALNUM; break;
3558 case 's': namedclass = ANYOF_SPACE; break;
3559 case 'S': namedclass = ANYOF_NSPACE; break;
3560 case 'd': namedclass = ANYOF_DIGIT; break;
3561 case 'D': namedclass = ANYOF_NDIGIT; break;
3564 if (*RExC_parse == '{') {
3566 e = strchr(RExC_parse++, '}');
3568 vFAIL2("Missing right brace on \\%c{}", c);
3569 while (isSPACE(UCHARAT(RExC_parse)))
3571 if (e == RExC_parse)
3572 vFAIL2("Empty \\%c{}", c);
3574 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3582 if (UCHARAT(RExC_parse) == '^') {
3585 value = value == 'p' ? 'P' : 'p'; /* toggle */
3586 while (isSPACE(UCHARAT(RExC_parse))) {
3592 Perl_sv_catpvf(aTHX_ listsv,
3593 "+utf8::%.*s\n", (int)n, RExC_parse);
3595 Perl_sv_catpvf(aTHX_ listsv,
3596 "!utf8::%.*s\n", (int)n, RExC_parse);
3599 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3601 case 'n': value = '\n'; break;
3602 case 'r': value = '\r'; break;
3603 case 't': value = '\t'; break;
3604 case 'f': value = '\f'; break;
3605 case 'b': value = '\b'; break;
3606 case 'e': value = ASCII_TO_NATIVE('\033');break;
3607 case 'a': value = ASCII_TO_NATIVE('\007');break;
3609 if (*RExC_parse == '{') {
3610 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3611 | PERL_SCAN_DISALLOW_PREFIX;
3612 e = strchr(RExC_parse++, '}');
3614 vFAIL("Missing right brace on \\x{}");
3616 numlen = e - RExC_parse;
3617 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3621 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3623 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3624 RExC_parse += numlen;
3628 value = UCHARAT(RExC_parse++);
3629 value = toCTRL(value);
3631 case '0': case '1': case '2': case '3': case '4':
3632 case '5': case '6': case '7': case '8': case '9':
3636 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3637 RExC_parse += numlen;
3641 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3643 "Unrecognized escape \\%c in character class passed through",
3647 } /* end of \blah */
3649 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3651 if (!SIZE_ONLY && !need_class)
3652 ANYOF_CLASS_ZERO(ret);
3656 /* a bad range like a-\d, a-[:digit:] ? */
3659 if (ckWARN(WARN_REGEXP))
3661 "False [] range \"%*.*s\"",
3662 RExC_parse - rangebegin,
3663 RExC_parse - rangebegin,
3665 if (prevvalue < 256) {
3666 ANYOF_BITMAP_SET(ret, prevvalue);
3667 ANYOF_BITMAP_SET(ret, '-');
3670 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3671 Perl_sv_catpvf(aTHX_ listsv,
3672 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3676 range = 0; /* this was not a true range */
3680 if (namedclass > OOB_NAMEDCLASS)
3681 optimize_invert = FALSE;
3682 /* Possible truncation here but in some 64-bit environments
3683 * the compiler gets heartburn about switch on 64-bit values.
3684 * A similar issue a little earlier when switching on value.
3686 switch ((I32)namedclass) {
3689 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3691 for (value = 0; value < 256; value++)
3693 ANYOF_BITMAP_SET(ret, value);
3695 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
3699 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3701 for (value = 0; value < 256; value++)
3702 if (!isALNUM(value))
3703 ANYOF_BITMAP_SET(ret, value);
3705 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3709 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3711 for (value = 0; value < 256; value++)
3712 if (isALNUMC(value))
3713 ANYOF_BITMAP_SET(ret, value);
3715 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3719 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3721 for (value = 0; value < 256; value++)
3722 if (!isALNUMC(value))
3723 ANYOF_BITMAP_SET(ret, value);
3725 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3729 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3731 for (value = 0; value < 256; value++)
3733 ANYOF_BITMAP_SET(ret, value);
3735 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3739 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3741 for (value = 0; value < 256; value++)
3742 if (!isALPHA(value))
3743 ANYOF_BITMAP_SET(ret, value);
3745 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3749 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3752 for (value = 0; value < 128; value++)
3753 ANYOF_BITMAP_SET(ret, value);
3755 for (value = 0; value < 256; value++) {
3757 ANYOF_BITMAP_SET(ret, value);
3761 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3765 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3768 for (value = 128; value < 256; value++)
3769 ANYOF_BITMAP_SET(ret, value);
3771 for (value = 0; value < 256; value++) {
3772 if (!isASCII(value))
3773 ANYOF_BITMAP_SET(ret, value);
3777 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3781 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3783 for (value = 0; value < 256; value++)
3785 ANYOF_BITMAP_SET(ret, value);
3787 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3791 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3793 for (value = 0; value < 256; value++)
3794 if (!isBLANK(value))
3795 ANYOF_BITMAP_SET(ret, value);
3797 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3801 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3803 for (value = 0; value < 256; value++)
3805 ANYOF_BITMAP_SET(ret, value);
3807 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3811 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3813 for (value = 0; value < 256; value++)
3814 if (!isCNTRL(value))
3815 ANYOF_BITMAP_SET(ret, value);
3817 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3821 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3823 /* consecutive digits assumed */
3824 for (value = '0'; value <= '9'; value++)
3825 ANYOF_BITMAP_SET(ret, value);
3827 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3831 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3833 /* consecutive digits assumed */
3834 for (value = 0; value < '0'; value++)
3835 ANYOF_BITMAP_SET(ret, value);
3836 for (value = '9' + 1; value < 256; value++)
3837 ANYOF_BITMAP_SET(ret, value);
3839 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3843 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3845 for (value = 0; value < 256; value++)
3847 ANYOF_BITMAP_SET(ret, value);
3849 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3853 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3855 for (value = 0; value < 256; value++)
3856 if (!isGRAPH(value))
3857 ANYOF_BITMAP_SET(ret, value);
3859 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3863 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3865 for (value = 0; value < 256; value++)
3867 ANYOF_BITMAP_SET(ret, value);
3869 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3873 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3875 for (value = 0; value < 256; value++)
3876 if (!isLOWER(value))
3877 ANYOF_BITMAP_SET(ret, value);
3879 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3883 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3885 for (value = 0; value < 256; value++)
3887 ANYOF_BITMAP_SET(ret, value);
3889 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3893 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3895 for (value = 0; value < 256; value++)
3896 if (!isPRINT(value))
3897 ANYOF_BITMAP_SET(ret, value);
3899 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3903 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3905 for (value = 0; value < 256; value++)
3906 if (isPSXSPC(value))
3907 ANYOF_BITMAP_SET(ret, value);
3909 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3913 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3915 for (value = 0; value < 256; value++)
3916 if (!isPSXSPC(value))
3917 ANYOF_BITMAP_SET(ret, value);
3919 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3923 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3925 for (value = 0; value < 256; value++)
3927 ANYOF_BITMAP_SET(ret, value);
3929 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3933 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3935 for (value = 0; value < 256; value++)
3936 if (!isPUNCT(value))
3937 ANYOF_BITMAP_SET(ret, value);
3939 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3943 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3945 for (value = 0; value < 256; value++)
3947 ANYOF_BITMAP_SET(ret, value);
3949 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3953 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3955 for (value = 0; value < 256; value++)
3956 if (!isSPACE(value))
3957 ANYOF_BITMAP_SET(ret, value);
3959 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
3963 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
3965 for (value = 0; value < 256; value++)
3967 ANYOF_BITMAP_SET(ret, value);
3969 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
3973 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
3975 for (value = 0; value < 256; value++)
3976 if (!isUPPER(value))
3977 ANYOF_BITMAP_SET(ret, value);
3979 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
3983 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
3985 for (value = 0; value < 256; value++)
3986 if (isXDIGIT(value))
3987 ANYOF_BITMAP_SET(ret, value);
3989 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
3993 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
3995 for (value = 0; value < 256; value++)
3996 if (!isXDIGIT(value))
3997 ANYOF_BITMAP_SET(ret, value);
3999 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4002 vFAIL("Invalid [::] class");
4006 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4009 } /* end of namedclass \blah */
4012 if (prevvalue > value) /* b-a */ {
4013 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4014 RExC_parse - rangebegin,
4015 RExC_parse - rangebegin,
4017 range = 0; /* not a valid range */
4021 prevvalue = value; /* save the beginning of the range */
4022 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4023 RExC_parse[1] != ']') {
4026 /* a bad range like \w-, [:word:]- ? */
4027 if (namedclass > OOB_NAMEDCLASS) {
4028 if (ckWARN(WARN_REGEXP))
4030 "False [] range \"%*.*s\"",
4031 RExC_parse - rangebegin,
4032 RExC_parse - rangebegin,
4035 ANYOF_BITMAP_SET(ret, '-');
4037 range = 1; /* yeah, it's a range! */
4038 continue; /* but do it the next time */
4042 /* now is the next time */
4046 if (prevvalue < 256) {
4047 IV ceilvalue = value < 256 ? value : 255;
4050 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4051 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
4053 if (isLOWER(prevvalue)) {
4054 for (i = prevvalue; i <= ceilvalue; i++)
4056 ANYOF_BITMAP_SET(ret, i);
4058 for (i = prevvalue; i <= ceilvalue; i++)
4060 ANYOF_BITMAP_SET(ret, i);
4065 for (i = prevvalue; i <= ceilvalue; i++)
4066 ANYOF_BITMAP_SET(ret, i);
4068 if (value > 255 || UTF) {
4069 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4070 UV natvalue = NATIVE_TO_UNI(value);
4072 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4073 if (prevnatvalue < natvalue) { /* what about > ? */
4074 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4075 prevnatvalue, natvalue);
4077 else if (prevnatvalue == natvalue) {
4078 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4080 U8 tmpbuf [UTF8_MAXLEN+1];
4081 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4085 uvchr_to_utf8(tmpbuf, natvalue);
4086 to_utf8_fold(tmpbuf, foldbuf, &foldlen);
4087 f = UNI_TO_NATIVE(utf8_to_uvchr(foldbuf, 0));
4089 /* If folding and foldable and a single
4090 * character, insert also the folded version
4091 * to the charclass. */
4093 if (foldlen == UNISKIP(f))
4094 Perl_sv_catpvf(aTHX_ listsv,
4097 /* Any multicharacter foldings
4098 * require the following transform:
4099 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4100 * where E folds into "pq" and F folds
4101 * into "rst", all other characters
4102 * fold to single characters. We save
4103 * away these multicharacter foldings,
4104 * to be later saved as part of the
4105 * additional "s" data. */
4108 if (!unicode_alternate)
4109 unicode_alternate = newAV();
4110 sv = newSVpvn((char*)foldbuf, foldlen);
4112 av_push(unicode_alternate, sv);
4116 /* If folding and the value is one of the Greek
4117 * sigmas insert a few more sigmas to make the
4118 * folding rules of the sigmas to work right.
4119 * Note that not all the possible combinations
4120 * are handled here: some of them are handled
4121 * by the standard folding rules, and some of
4122 * them (literal or EXACTF cases) are handled
4123 * during runtime in regexec.c:S_find_byclass(). */
4124 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4125 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4126 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4127 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4128 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4130 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4131 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4132 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4138 range = 0; /* this range (if it was one) is done now */
4142 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4144 RExC_size += ANYOF_CLASS_ADD_SKIP;
4146 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4149 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4151 /* If the only flag is folding (plus possibly inversion). */
4152 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4154 for (value = 0; value < 256; ++value) {
4155 if (ANYOF_BITMAP_TEST(ret, value)) {
4156 IV fold = PL_fold[value];
4159 ANYOF_BITMAP_SET(ret, fold);
4162 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4165 /* optimize inverted simple patterns (e.g. [^a-z]) */
4166 if (!SIZE_ONLY && optimize_invert &&
4167 /* If the only flag is inversion. */
4168 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4169 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4170 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4171 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4178 /* The 0th element stores the character class description
4179 * in its textual form: used later (regexec.c:Perl_regclass_swatch())
4180 * to initialize the appropriate swash (which gets stored in
4181 * the 1st element), and also useful for dumping the regnode.
4182 * The 2nd element stores the multicharacter foldings,
4183 * used later (regexec.c:s_reginclasslen()). */
4184 av_store(av, 0, listsv);
4185 av_store(av, 1, NULL);
4186 av_store(av, 2, (SV*)unicode_alternate);
4187 rv = newRV_noinc((SV*)av);
4188 n = add_data(pRExC_state, 1, "s");
4189 RExC_rx->data->data[n] = (void*)rv;
4197 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4199 char* retval = RExC_parse++;
4202 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4203 RExC_parse[2] == '#') {
4204 while (*RExC_parse && *RExC_parse != ')')
4209 if (RExC_flags16 & PMf_EXTENDED) {
4210 if (isSPACE(*RExC_parse)) {
4214 else if (*RExC_parse == '#') {
4215 while (*RExC_parse && *RExC_parse != '\n')
4226 - reg_node - emit a node
4228 STATIC regnode * /* Location. */
4229 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4231 register regnode *ret;
4232 register regnode *ptr;
4236 SIZE_ALIGN(RExC_size);
4241 NODE_ALIGN_FILL(ret);
4243 FILL_ADVANCE_NODE(ptr, op);
4244 if (RExC_offsets) { /* MJD */
4245 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4246 "reg_node", __LINE__,
4248 RExC_emit - RExC_emit_start > RExC_offsets[0]
4249 ? "Overwriting end of array!\n" : "OK",
4250 RExC_emit - RExC_emit_start,
4251 RExC_parse - RExC_start,
4253 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4262 - reganode - emit a node with an argument
4264 STATIC regnode * /* Location. */
4265 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4267 register regnode *ret;
4268 register regnode *ptr;
4272 SIZE_ALIGN(RExC_size);
4277 NODE_ALIGN_FILL(ret);
4279 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4280 if (RExC_offsets) { /* MJD */
4281 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4283 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4284 "Overwriting end of array!\n" : "OK",
4285 RExC_emit - RExC_emit_start,
4286 RExC_parse - RExC_start,
4288 Set_Cur_Node_Offset;
4297 - reguni - emit (if appropriate) a Unicode character
4300 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4302 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4306 - reginsert - insert an operator in front of already-emitted operand
4308 * Means relocating the operand.
4311 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4313 register regnode *src;
4314 register regnode *dst;
4315 register regnode *place;
4316 register int offset = regarglen[(U8)op];
4318 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4321 RExC_size += NODE_STEP_REGNODE + offset;
4326 RExC_emit += NODE_STEP_REGNODE + offset;
4328 while (src > opnd) {
4329 StructCopy(--src, --dst, regnode);
4330 if (RExC_offsets) { /* MJD 20010112 */
4331 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4333 dst - RExC_emit_start > RExC_offsets[0]
4334 ? "Overwriting end of array!\n" : "OK",
4335 src - RExC_emit_start,
4336 dst - RExC_emit_start,
4338 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4339 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4344 place = opnd; /* Op node, where operand used to be. */
4345 if (RExC_offsets) { /* MJD */
4346 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4348 place - RExC_emit_start > RExC_offsets[0]
4349 ? "Overwriting end of array!\n" : "OK",
4350 place - RExC_emit_start,
4351 RExC_parse - RExC_start,
4353 Set_Node_Offset(place, RExC_parse);
4355 src = NEXTOPER(place);
4356 FILL_ADVANCE_NODE(place, op);
4357 Zero(src, offset, regnode);
4361 - regtail - set the next-pointer at the end of a node chain of p to val.
4364 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4366 register regnode *scan;
4367 register regnode *temp;
4372 /* Find last node. */
4375 temp = regnext(scan);
4381 if (reg_off_by_arg[OP(scan)]) {
4382 ARG_SET(scan, val - scan);
4385 NEXT_OFF(scan) = val - scan;
4390 - regoptail - regtail on operand of first argument; nop if operandless
4393 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4395 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4396 if (p == NULL || SIZE_ONLY)
4398 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4399 regtail(pRExC_state, NEXTOPER(p), val);
4401 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4402 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4409 - regcurly - a little FSA that accepts {\d+,?\d*}
4412 S_regcurly(pTHX_ register char *s)
4433 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4435 register U8 op = EXACT; /* Arbitrary non-END op. */
4436 register regnode *next;
4438 while (op != END && (!last || node < last)) {
4439 /* While that wasn't END last time... */
4445 next = regnext(node);
4447 if (OP(node) == OPTIMIZED)
4450 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4451 (int)(2*l + 1), "", SvPVX(sv));
4452 if (next == NULL) /* Next ptr. */
4453 PerlIO_printf(Perl_debug_log, "(0)");
4455 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4456 (void)PerlIO_putc(Perl_debug_log, '\n');
4458 if (PL_regkind[(U8)op] == BRANCHJ) {
4459 register regnode *nnode = (OP(next) == LONGJMP
4462 if (last && nnode > last)
4464 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4466 else if (PL_regkind[(U8)op] == BRANCH) {
4467 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4469 else if ( op == CURLY) { /* `next' might be very big: optimizer */
4470 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4471 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4473 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4474 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4477 else if ( op == PLUS || op == STAR) {
4478 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4480 else if (op == ANYOF) {
4481 /* arglen 1 + class block */
4482 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4483 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4484 node = NEXTOPER(node);
4486 else if (PL_regkind[(U8)op] == EXACT) {
4487 /* Literal string, where present. */
4488 node += NODE_SZ_STR(node) - 1;
4489 node = NEXTOPER(node);
4492 node = NEXTOPER(node);
4493 node += regarglen[(U8)op];
4495 if (op == CURLYX || op == OPEN)
4497 else if (op == WHILEM)
4503 #endif /* DEBUGGING */
4506 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4509 Perl_regdump(pTHX_ regexp *r)
4512 SV *sv = sv_newmortal();
4514 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4516 /* Header fields of interest. */
4517 if (r->anchored_substr)
4518 PerlIO_printf(Perl_debug_log,
4519 "anchored `%s%.*s%s'%s at %"IVdf" ",
4521 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4522 SvPVX(r->anchored_substr),
4524 SvTAIL(r->anchored_substr) ? "$" : "",
4525 (IV)r->anchored_offset);
4526 if (r->float_substr)
4527 PerlIO_printf(Perl_debug_log,
4528 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4530 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4531 SvPVX(r->float_substr),
4533 SvTAIL(r->float_substr) ? "$" : "",
4534 (IV)r->float_min_offset, (UV)r->float_max_offset);
4535 if (r->check_substr)
4536 PerlIO_printf(Perl_debug_log,
4537 r->check_substr == r->float_substr
4538 ? "(checking floating" : "(checking anchored");
4539 if (r->reganch & ROPT_NOSCAN)
4540 PerlIO_printf(Perl_debug_log, " noscan");
4541 if (r->reganch & ROPT_CHECK_ALL)
4542 PerlIO_printf(Perl_debug_log, " isall");
4543 if (r->check_substr)
4544 PerlIO_printf(Perl_debug_log, ") ");
4546 if (r->regstclass) {
4547 regprop(sv, r->regstclass);
4548 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4550 if (r->reganch & ROPT_ANCH) {
4551 PerlIO_printf(Perl_debug_log, "anchored");
4552 if (r->reganch & ROPT_ANCH_BOL)
4553 PerlIO_printf(Perl_debug_log, "(BOL)");
4554 if (r->reganch & ROPT_ANCH_MBOL)
4555 PerlIO_printf(Perl_debug_log, "(MBOL)");
4556 if (r->reganch & ROPT_ANCH_SBOL)
4557 PerlIO_printf(Perl_debug_log, "(SBOL)");
4558 if (r->reganch & ROPT_ANCH_GPOS)
4559 PerlIO_printf(Perl_debug_log, "(GPOS)");
4560 PerlIO_putc(Perl_debug_log, ' ');
4562 if (r->reganch & ROPT_GPOS_SEEN)
4563 PerlIO_printf(Perl_debug_log, "GPOS ");
4564 if (r->reganch & ROPT_SKIP)
4565 PerlIO_printf(Perl_debug_log, "plus ");
4566 if (r->reganch & ROPT_IMPLICIT)
4567 PerlIO_printf(Perl_debug_log, "implicit ");
4568 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4569 if (r->reganch & ROPT_EVAL_SEEN)
4570 PerlIO_printf(Perl_debug_log, "with eval ");
4571 PerlIO_printf(Perl_debug_log, "\n");
4574 U32 len = r->offsets[0];
4575 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4576 for (i = 1; i <= len; i++)
4577 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4578 (UV)r->offsets[i*2-1],
4579 (UV)r->offsets[i*2]);
4580 PerlIO_printf(Perl_debug_log, "\n");
4582 #endif /* DEBUGGING */
4588 S_put_byte(pTHX_ SV *sv, int c)
4590 if (isCNTRL(c) || c == 255 || !isPRINT(c))
4591 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4592 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4593 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4595 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4598 #endif /* DEBUGGING */
4601 - regprop - printable representation of opcode
4604 Perl_regprop(pTHX_ SV *sv, regnode *o)
4609 sv_setpvn(sv, "", 0);
4610 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4611 /* It would be nice to FAIL() here, but this may be called from
4612 regexec.c, and it would be hard to supply pRExC_state. */
4613 Perl_croak(aTHX_ "Corrupted regexp opcode");
4614 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4616 k = PL_regkind[(U8)OP(o)];
4619 SV *dsv = sv_2mortal(newSVpvn("", 0));
4620 /* Using is_utf8_string() is a crude hack but it may
4621 * be the best for now since we have no flag "this EXACTish
4622 * node was UTF-8" --jhi */
4623 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4625 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4626 UNI_DISPLAY_REGEX) :
4631 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4636 else if (k == CURLY) {
4637 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4638 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4639 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4641 else if (k == WHILEM && o->flags) /* Ordinal/of */
4642 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4643 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4644 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4645 else if (k == LOGICAL)
4646 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4647 else if (k == ANYOF) {
4648 int i, rangestart = -1;
4649 U8 flags = ANYOF_FLAGS(o);
4650 const char * const anyofs[] = { /* Should be syncronized with
4651 * ANYOF_ #xdefines in regcomp.h */
4684 if (flags & ANYOF_LOCALE)
4685 sv_catpv(sv, "{loc}");
4686 if (flags & ANYOF_FOLD)
4687 sv_catpv(sv, "{i}");
4688 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4689 if (flags & ANYOF_INVERT)
4691 for (i = 0; i <= 256; i++) {
4692 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4693 if (rangestart == -1)
4695 } else if (rangestart != -1) {
4696 if (i <= rangestart + 3)
4697 for (; rangestart < i; rangestart++)
4698 put_byte(sv, rangestart);
4700 put_byte(sv, rangestart);
4702 put_byte(sv, i - 1);
4708 if (o->flags & ANYOF_CLASS)
4709 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4710 if (ANYOF_CLASS_TEST(o,i))
4711 sv_catpv(sv, anyofs[i]);
4713 if (flags & ANYOF_UNICODE)
4714 sv_catpv(sv, "{unicode}");
4715 else if (flags & ANYOF_UNICODE_ALL)
4716 sv_catpv(sv, "{unicode_all}");
4720 SV *sw = regclass_swash(o, FALSE, &lv, 0);
4725 U8 s[UTF8_MAXLEN+1];
4727 for (i = 0; i <= 256; i++) { /* just the first 256 */
4728 U8 *e = uvchr_to_utf8(s, i);
4730 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4731 if (rangestart == -1)
4733 } else if (rangestart != -1) {
4736 if (i <= rangestart + 3)
4737 for (; rangestart < i; rangestart++) {
4738 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4742 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4745 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4752 sv_catpv(sv, "..."); /* et cetera */
4756 char *s = savepv(SvPVX(lv));
4759 while(*s && *s != '\n') s++;
4780 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4782 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4783 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4784 #endif /* DEBUGGING */
4788 Perl_re_intuit_string(pTHX_ regexp *prog)
4789 { /* Assume that RE_INTUIT is set */
4792 char *s = SvPV(prog->check_substr,n_a);
4794 if (!PL_colorset) reginitcolors();
4795 PerlIO_printf(Perl_debug_log,
4796 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4797 PL_colors[4],PL_colors[5],PL_colors[0],
4800 (strlen(s) > 60 ? "..." : ""));
4803 return prog->check_substr;
4807 Perl_pregfree(pTHX_ struct regexp *r)
4810 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4813 if (!r || (--r->refcnt > 0))
4816 char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60,
4818 int len = SvCUR(dsv);
4821 PerlIO_printf(Perl_debug_log,
4822 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4823 PL_colors[4],PL_colors[5],PL_colors[0],
4826 len > 60 ? "..." : "");
4830 Safefree(r->precomp);
4831 if (r->offsets) /* 20010421 MJD */
4832 Safefree(r->offsets);
4833 if (RX_MATCH_COPIED(r))
4834 Safefree(r->subbeg);
4836 if (r->anchored_substr)
4837 SvREFCNT_dec(r->anchored_substr);
4838 if (r->float_substr)
4839 SvREFCNT_dec(r->float_substr);
4840 Safefree(r->substrs);
4843 int n = r->data->count;
4844 AV* new_comppad = NULL;
4849 /* If you add a ->what type here, update the comment in regcomp.h */
4850 switch (r->data->what[n]) {
4852 SvREFCNT_dec((SV*)r->data->data[n]);
4855 Safefree(r->data->data[n]);
4858 new_comppad = (AV*)r->data->data[n];
4861 if (new_comppad == NULL)
4862 Perl_croak(aTHX_ "panic: pregfree comppad");
4863 old_comppad = PL_comppad;
4864 old_curpad = PL_curpad;
4865 /* Watch out for global destruction's random ordering. */
4866 if (SvTYPE(new_comppad) == SVt_PVAV) {
4867 PL_comppad = new_comppad;
4868 PL_curpad = AvARRAY(new_comppad);
4873 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4874 op_free((OP_4tree*)r->data->data[n]);
4877 PL_comppad = old_comppad;
4878 PL_curpad = old_curpad;
4879 SvREFCNT_dec((SV*)new_comppad);
4885 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4888 Safefree(r->data->what);
4891 Safefree(r->startp);
4897 - regnext - dig the "next" pointer out of a node
4899 * [Note, when REGALIGN is defined there are two places in regmatch()
4900 * that bypass this code for speed.]
4903 Perl_regnext(pTHX_ register regnode *p)
4905 register I32 offset;
4907 if (p == &PL_regdummy)
4910 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4918 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
4921 STRLEN l1 = strlen(pat1);
4922 STRLEN l2 = strlen(pat2);
4931 Copy(pat1, buf, l1 , char);
4932 Copy(pat2, buf + l1, l2 , char);
4933 buf[l1 + l2] = '\n';
4934 buf[l1 + l2 + 1] = '\0';
4936 /* ANSI variant takes additional second argument */
4937 va_start(args, pat2);
4941 msv = vmess(buf, &args);
4943 message = SvPV(msv,l1);
4946 Copy(message, buf, l1 , char);
4947 buf[l1] = '\0'; /* Overwrite \n */
4948 Perl_croak(aTHX_ "%s", buf);
4951 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4954 Perl_save_re_context(pTHX)
4957 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4958 SAVEI32(RExC_npar); /* () count. */
4959 SAVEI32(RExC_size); /* Code size. */
4960 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4961 SAVEVPTR(RExC_rx); /* from regcomp.c */
4962 SAVEI32(RExC_seen); /* from regcomp.c */
4963 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4964 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4965 SAVEVPTR(RExC_emit); /* Code-emit pointer; ®dummy = don't */
4966 SAVEPPTR(RExC_end); /* End of input for compile */
4967 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4970 SAVEI32(PL_reg_flags); /* from regexec.c */
4972 SAVEPPTR(PL_reginput); /* String-input pointer. */
4973 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4974 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
4975 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4976 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4977 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
4978 SAVEPPTR(PL_regtill); /* How far we are required to go. */
4979 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
4980 PL_reg_start_tmp = 0;
4981 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4982 PL_reg_start_tmpl = 0;
4983 SAVEVPTR(PL_regdata);
4984 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4985 SAVEI32(PL_regnarrate); /* from regexec.c */
4986 SAVEVPTR(PL_regprogram); /* from regexec.c */
4987 SAVEINT(PL_regindent); /* from regexec.c */
4988 SAVEVPTR(PL_regcc); /* from regexec.c */
4989 SAVEVPTR(PL_curcop);
4990 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4991 SAVEVPTR(PL_reg_re); /* from regexec.c */
4992 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4993 SAVESPTR(PL_reg_sv); /* from regexec.c */
4994 SAVEI8(PL_reg_match_utf8); /* from regexec.c */
4995 SAVEVPTR(PL_reg_magic); /* from regexec.c */
4996 SAVEI32(PL_reg_oldpos); /* from regexec.c */
4997 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
4998 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
4999 SAVEI32(PL_regnpar); /* () count. */
5000 SAVEI32(PL_regsize); /* from regexec.c */
5002 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
5007 clear_re(pTHX_ void *r)
5009 ReREFCNT_dec((regexp *)r);