5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_pregcomp my_regcomp
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_pregfree my_regfree
39 # define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 # define Perl_regnext my_regnext
42 # define Perl_save_re_context my_save_re_context
43 # define Perl_reginitcolors my_reginitcolors
45 # define PERL_NO_GET_CONTEXT
50 * pregcomp and pregexec -- regsub and regerror are not used in perl
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (c) 1991-2002, Larry Wall
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
83 #define PERL_IN_REGCOMP_C
86 #ifndef PERL_IN_XSUB_RE
98 # if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
107 #define STATIC static
110 typedef struct RExC_state_t {
111 U16 flags16; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
114 char *start; /* Start of input for compile */
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
118 regnode *emit_start; /* Start of emitted-code area */
119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
130 char *starttry; /* -Dr: where regtry was called. */
131 #define RExC_starttry (pRExC_state->starttry)
135 #define RExC_flags16 (pRExC_state->flags16)
136 #define RExC_precomp (pRExC_state->precomp)
137 #define RExC_rx (pRExC_state->rx)
138 #define RExC_start (pRExC_state->start)
139 #define RExC_end (pRExC_state->end)
140 #define RExC_parse (pRExC_state->parse)
141 #define RExC_whilem_seen (pRExC_state->whilem_seen)
142 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty (pRExC_state->naughty)
146 #define RExC_sawback (pRExC_state->sawback)
147 #define RExC_seen (pRExC_state->seen)
148 #define RExC_size (pRExC_state->size)
149 #define RExC_npar (pRExC_state->npar)
150 #define RExC_extralen (pRExC_state->extralen)
151 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8 (pRExC_state->utf8)
155 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
160 #undef SPSTART /* dratted cpp namespace... */
163 * Flags to be passed up and down.
165 #define WORST 0 /* Worst case. */
166 #define HASWIDTH 0x1 /* Known to match non-null strings. */
167 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART 0x4 /* Starts with * or +. */
169 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
171 /* Length of a variant. */
173 typedef struct scan_data_t {
179 I32 last_end; /* min value, <0 unless valid. */
182 SV **longest; /* Either &l_fixed, or &l_float. */
186 I32 offset_float_min;
187 I32 offset_float_max;
191 struct regnode_charclass_class *start_class;
195 * Forward declarations for pregcomp()'s friends.
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL 0x1
203 #define SF_BEFORE_MEOL 0x2
204 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
208 # define SF_FIX_SHIFT_EOL (0+2)
209 # define SF_FL_SHIFT_EOL (0+4)
211 # define SF_FIX_SHIFT_EOL (+2)
212 # define SF_FL_SHIFT_EOL (+4)
215 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
218 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF 0x40
221 #define SF_HAS_PAR 0x80
222 #define SF_IN_PAR 0x100
223 #define SF_HAS_EVAL 0x200
224 #define SCF_DO_SUBSTR 0x400
225 #define SCF_DO_STCLASS_AND 0x0800
226 #define SCF_DO_STCLASS_OR 0x1000
227 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS 0x2000
230 #define UTF RExC_utf8
231 #define LOC (RExC_flags16 & PMf_LOCALE)
232 #define FOLD (RExC_flags16 & PMf_FOLD)
234 #define OOB_UNICODE 12345678
235 #define OOB_NAMEDCLASS -1
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
249 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
261 char *ellipses = ""; \
262 IV len = RExC_end - RExC_precomp; \
265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
273 msg, (int)len, RExC_precomp, ellipses); \
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
281 #define FAIL2(pat,msg) \
283 char *ellipses = ""; \
284 IV len = RExC_end - RExC_precomp; \
287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
295 msg, (int)len, RExC_precomp, ellipses); \
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
302 #define Simple_vFAIL(m) \
304 IV offset = RExC_parse - RExC_precomp; \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
321 * Like Simple_vFAIL(), but accepts two arguments.
323 #define Simple_vFAIL2(m,a1) \
325 IV offset = RExC_parse - RExC_precomp; \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
328 (int)offset, RExC_precomp, RExC_precomp + offset); \
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
334 #define vFAIL2(m,a1) \
337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
338 Simple_vFAIL2(m, a1); \
343 * Like Simple_vFAIL(), but accepts three arguments.
345 #define Simple_vFAIL3(m, a1, a2) \
347 IV offset = RExC_parse - RExC_precomp; \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
350 (int)offset, RExC_precomp, RExC_precomp + offset); \
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
356 #define vFAIL3(m,a1,a2) \
359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
360 Simple_vFAIL3(m, a1, a2); \
364 * Like Simple_vFAIL(), but accepts four arguments.
366 #define Simple_vFAIL4(m, a1, a2, a3) \
368 IV offset = RExC_parse - RExC_precomp; \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
371 (int)offset, RExC_precomp, RExC_precomp + offset); \
375 * Like Simple_vFAIL(), but accepts five arguments.
377 #define Simple_vFAIL5(m, a1, a2, a3, a4) \
379 IV offset = RExC_parse - RExC_precomp; \
380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
381 (int)offset, RExC_precomp, RExC_precomp + offset); \
385 #define vWARN(loc,m) \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\
389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
392 #define vWARNdep(loc,m) \
394 IV offset = loc - RExC_precomp; \
395 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\
396 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
400 #define vWARN2(loc, m, a1) \
402 IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
405 (int)offset, RExC_precomp, RExC_precomp + offset); \
408 #define vWARN3(loc, m, a1, a2) \
410 IV offset = loc - RExC_precomp; \
411 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
413 (int)offset, RExC_precomp, RExC_precomp + offset); \
416 #define vWARN4(loc, m, a1, a2, a3) \
418 IV offset = loc - RExC_precomp; \
419 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
421 (int)offset, RExC_precomp, RExC_precomp + offset); \
424 /* used for the parse_flags section for (?c) -- japhy */
425 #define vWARN5(loc, m, a1, a2, a3, a4) \
427 IV offset = loc - RExC_precomp; \
428 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
430 (int)offset, RExC_precomp, RExC_precomp + offset); \
434 /* Allow for side effects in s */
435 #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
437 /* Macros for recording node offsets. 20001227 mjd@plover.com
438 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
439 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
440 * Element 0 holds the number n.
443 #define MJD_OFFSET_DEBUG(x)
444 /* #define MJD_OFFSET_DEBUG(x) fprintf x */
447 # define Set_Node_Offset_To_R(node,byte) \
451 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
453 RExC_offsets[2*(node)-1] = (byte); \
458 # define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
459 # define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
461 # define Set_Node_Length_To_R(node,len) \
464 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
466 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
468 RExC_offsets[2*(node)] = (len); \
473 # define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
474 # define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
475 # define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
477 /* Get offsets and lengths */
478 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
479 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
481 static void clear_re(pTHX_ void *r);
483 /* Mark that we cannot extend a found fixed substring at this point.
484 Updata the longest found anchored substring and the longest found
485 floating substrings if needed. */
488 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
490 STRLEN l = CHR_SVLEN(data->last_found);
491 STRLEN old_l = CHR_SVLEN(*data->longest);
493 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
494 sv_setsv(*data->longest, data->last_found);
495 if (*data->longest == data->longest_fixed) {
496 data->offset_fixed = l ? data->last_start_min : data->pos_min;
497 if (data->flags & SF_BEFORE_EOL)
499 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
501 data->flags &= ~SF_FIX_BEFORE_EOL;
504 data->offset_float_min = l ? data->last_start_min : data->pos_min;
505 data->offset_float_max = (l
506 ? data->last_start_max
507 : data->pos_min + data->pos_delta);
508 if (data->flags & SF_BEFORE_EOL)
510 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
512 data->flags &= ~SF_FL_BEFORE_EOL;
515 SvCUR_set(data->last_found, 0);
517 data->flags &= ~SF_BEFORE_EOL;
520 /* Can match anything (initialization) */
522 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 ANYOF_CLASS_ZERO(cl);
525 ANYOF_BITMAP_SETALL(cl);
526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
528 cl->flags |= ANYOF_LOCALE;
531 /* Can match anything (initialization) */
533 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
537 for (value = 0; value <= ANYOF_MAX; value += 2)
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
547 /* Can match anything (initialization) */
549 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
551 Zero(cl, 1, struct regnode_charclass_class);
553 cl_anything(pRExC_state, cl);
557 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
559 Zero(cl, 1, struct regnode_charclass_class);
561 cl_anything(pRExC_state, cl);
563 cl->flags |= ANYOF_LOCALE;
566 /* 'And' a given class with another one. Can create false positives */
567 /* We assume that cl is not inverted */
569 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
590 cl->flags &= ~ANYOF_UNICODE_ALL;
591 cl->flags |= ANYOF_UNICODE;
592 ARG_SET(cl, ARG(and_with));
594 if (!(and_with->flags & ANYOF_UNICODE_ALL))
595 cl->flags &= ~ANYOF_UNICODE_ALL;
596 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
597 cl->flags &= ~ANYOF_UNICODE;
600 /* 'OR' a given class with another one. Can create false positives */
601 /* We assume that cl is not inverted */
603 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
605 if (or_with->flags & ANYOF_INVERT) {
607 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
608 * <= (B1 | !B2) | (CL1 | !CL2)
609 * which is wasteful if CL2 is small, but we ignore CL2:
610 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
611 * XXXX Can we handle case-fold? Unclear:
612 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
613 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
615 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
616 && !(or_with->flags & ANYOF_FOLD)
617 && !(cl->flags & ANYOF_FOLD) ) {
620 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
621 cl->bitmap[i] |= ~or_with->bitmap[i];
622 } /* XXXX: logic is complicated otherwise */
624 cl_anything(pRExC_state, cl);
627 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
628 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
629 && (!(or_with->flags & ANYOF_FOLD)
630 || (cl->flags & ANYOF_FOLD)) ) {
633 /* OR char bitmap and class bitmap separately */
634 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
635 cl->bitmap[i] |= or_with->bitmap[i];
636 if (or_with->flags & ANYOF_CLASS) {
637 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
638 cl->classflags[i] |= or_with->classflags[i];
639 cl->flags |= ANYOF_CLASS;
642 else { /* XXXX: logic is complicated, leave it along for a moment. */
643 cl_anything(pRExC_state, cl);
646 if (or_with->flags & ANYOF_EOS)
647 cl->flags |= ANYOF_EOS;
649 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
650 ARG(cl) != ARG(or_with)) {
651 cl->flags |= ANYOF_UNICODE_ALL;
652 cl->flags &= ~ANYOF_UNICODE;
654 if (or_with->flags & ANYOF_UNICODE_ALL) {
655 cl->flags |= ANYOF_UNICODE_ALL;
656 cl->flags &= ~ANYOF_UNICODE;
661 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
662 * These need to be revisited when a newer toolchain becomes available.
664 #if defined(__sparc64__) && defined(__GNUC__)
665 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
666 # undef SPARC64_GCC_WORKAROUND
667 # define SPARC64_GCC_WORKAROUND 1
671 /* REx optimizer. Converts nodes into quickier variants "in place".
672 Finds fixed substrings. */
674 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
675 to the position after last scanned or to NULL. */
678 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
679 /* scanp: Start here (read-write). */
680 /* deltap: Write maxlen-minlen here. */
681 /* last: Stop before this one. */
683 I32 min = 0, pars = 0, code;
684 regnode *scan = *scanp, *next;
686 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
687 int is_inf_internal = 0; /* The studied chunk is infinite */
688 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
689 scan_data_t data_fake;
690 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
692 while (scan && OP(scan) != END && scan < last) {
693 /* Peephole optimizer: */
695 if (PL_regkind[(U8)OP(scan)] == EXACT) {
696 /* Merge several consecutive EXACTish nodes into one. */
697 regnode *n = regnext(scan);
700 regnode *stop = scan;
703 next = scan + NODE_SZ_STR(scan);
704 /* Skip NOTHING, merge EXACT*. */
706 ( PL_regkind[(U8)OP(n)] == NOTHING ||
707 (stringok && (OP(n) == OP(scan))))
709 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
710 if (OP(n) == TAIL || n > next)
712 if (PL_regkind[(U8)OP(n)] == NOTHING) {
713 NEXT_OFF(scan) += NEXT_OFF(n);
714 next = n + NODE_STEP_REGNODE;
722 int oldl = STR_LEN(scan);
723 regnode *nnext = regnext(n);
725 if (oldl + STR_LEN(n) > U8_MAX)
727 NEXT_OFF(scan) += NEXT_OFF(n);
728 STR_LEN(scan) += STR_LEN(n);
729 next = n + NODE_SZ_STR(n);
730 /* Now we can overwrite *n : */
731 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
739 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
741 Two problematic code points in Unicode casefolding of EXACT nodes:
743 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
744 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
750 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
751 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
753 This means that in case-insensitive matching (or "loose matching",
754 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
755 length of the above casefolded versions) can match a target string
756 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
757 This would rather mess up the minimum length computation.
759 What we'll do is to look for the tail four bytes, and then peek
760 at the preceding two bytes to see whether we need to decrease
761 the minimum length by four (six minus two).
763 Thanks to the design of UTF-8, there cannot be false matches:
764 A sequence of valid UTF-8 bytes cannot be a subsequence of
765 another valid sequence of UTF-8 bytes.
768 char *s0 = STRING(scan), *s, *t;
769 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
770 char *t0 = "\xcc\x88\xcc\x81";
774 s < s2 && (t = ninstr(s, s1, t0, t1));
776 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
777 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
784 n = scan + NODE_SZ_STR(scan);
786 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
794 /* Follow the next-chain of the current node and optimize
795 away all the NOTHINGs from it. */
796 if (OP(scan) != CURLYX) {
797 int max = (reg_off_by_arg[OP(scan)]
799 /* I32 may be smaller than U16 on CRAYs! */
800 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
801 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
805 /* Skip NOTHING and LONGJMP. */
806 while ((n = regnext(n))
807 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
808 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
811 if (reg_off_by_arg[OP(scan)])
814 NEXT_OFF(scan) = off;
816 /* The principal pseudo-switch. Cannot be a switch, since we
817 look into several different things. */
818 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
819 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
820 next = regnext(scan);
823 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
824 I32 max1 = 0, min1 = I32_MAX, num = 0;
825 struct regnode_charclass_class accum;
827 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
828 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
829 if (flags & SCF_DO_STCLASS)
830 cl_init_zero(pRExC_state, &accum);
831 while (OP(scan) == code) {
832 I32 deltanext, minnext, f = 0, fake;
833 struct regnode_charclass_class this_class;
838 data_fake.whilem_c = data->whilem_c;
839 data_fake.last_closep = data->last_closep;
842 data_fake.last_closep = &fake;
843 next = regnext(scan);
844 scan = NEXTOPER(scan);
846 scan = NEXTOPER(scan);
847 if (flags & SCF_DO_STCLASS) {
848 cl_init(pRExC_state, &this_class);
849 data_fake.start_class = &this_class;
850 f = SCF_DO_STCLASS_AND;
852 if (flags & SCF_WHILEM_VISITED_POS)
853 f |= SCF_WHILEM_VISITED_POS;
854 /* we suppose the run is continuous, last=next...*/
855 minnext = study_chunk(pRExC_state, &scan, &deltanext,
856 next, &data_fake, f);
859 if (max1 < minnext + deltanext)
860 max1 = minnext + deltanext;
861 if (deltanext == I32_MAX)
862 is_inf = is_inf_internal = 1;
864 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
866 if (data && (data_fake.flags & SF_HAS_EVAL))
867 data->flags |= SF_HAS_EVAL;
869 data->whilem_c = data_fake.whilem_c;
870 if (flags & SCF_DO_STCLASS)
871 cl_or(pRExC_state, &accum, &this_class);
875 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
877 if (flags & SCF_DO_SUBSTR) {
878 data->pos_min += min1;
879 data->pos_delta += max1 - min1;
880 if (max1 != min1 || is_inf)
881 data->longest = &(data->longest_float);
884 delta += max1 - min1;
885 if (flags & SCF_DO_STCLASS_OR) {
886 cl_or(pRExC_state, data->start_class, &accum);
888 cl_and(data->start_class, &and_with);
889 flags &= ~SCF_DO_STCLASS;
892 else if (flags & SCF_DO_STCLASS_AND) {
894 cl_and(data->start_class, &accum);
895 flags &= ~SCF_DO_STCLASS;
898 /* Switch to OR mode: cache the old value of
899 * data->start_class */
900 StructCopy(data->start_class, &and_with,
901 struct regnode_charclass_class);
902 flags &= ~SCF_DO_STCLASS_AND;
903 StructCopy(&accum, data->start_class,
904 struct regnode_charclass_class);
905 flags |= SCF_DO_STCLASS_OR;
906 data->start_class->flags |= ANYOF_EOS;
910 else if (code == BRANCHJ) /* single branch is optimized. */
911 scan = NEXTOPER(NEXTOPER(scan));
912 else /* single branch is optimized. */
913 scan = NEXTOPER(scan);
916 else if (OP(scan) == EXACT) {
917 I32 l = STR_LEN(scan);
918 UV uc = *((U8*)STRING(scan));
920 U8 *s = (U8*)STRING(scan);
921 l = utf8_length(s, s + l);
922 uc = utf8_to_uvchr(s, NULL);
925 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
926 /* The code below prefers earlier match for fixed
927 offset, later match for variable offset. */
928 if (data->last_end == -1) { /* Update the start info. */
929 data->last_start_min = data->pos_min;
930 data->last_start_max = is_inf
931 ? I32_MAX : data->pos_min + data->pos_delta;
933 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
935 SvUTF8_on(data->last_found);
936 data->last_end = data->pos_min + l;
937 data->pos_min += l; /* As in the first entry. */
938 data->flags &= ~SF_BEFORE_EOL;
940 if (flags & SCF_DO_STCLASS_AND) {
941 /* Check whether it is compatible with what we know already! */
945 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
946 && !ANYOF_BITMAP_TEST(data->start_class, uc)
947 && (!(data->start_class->flags & ANYOF_FOLD)
948 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
951 ANYOF_CLASS_ZERO(data->start_class);
952 ANYOF_BITMAP_ZERO(data->start_class);
954 ANYOF_BITMAP_SET(data->start_class, uc);
955 data->start_class->flags &= ~ANYOF_EOS;
957 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
959 else if (flags & SCF_DO_STCLASS_OR) {
960 /* false positive possible if the class is case-folded */
962 ANYOF_BITMAP_SET(data->start_class, uc);
964 data->start_class->flags |= ANYOF_UNICODE_ALL;
965 data->start_class->flags &= ~ANYOF_EOS;
966 cl_and(data->start_class, &and_with);
968 flags &= ~SCF_DO_STCLASS;
970 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
971 I32 l = STR_LEN(scan);
972 UV uc = *((U8*)STRING(scan));
974 /* Search for fixed substrings supports EXACT only. */
975 if (flags & SCF_DO_SUBSTR)
976 scan_commit(pRExC_state, data);
978 U8 *s = (U8 *)STRING(scan);
979 l = utf8_length(s, s + l);
980 uc = utf8_to_uvchr(s, NULL);
983 if (data && (flags & SCF_DO_SUBSTR))
985 if (flags & SCF_DO_STCLASS_AND) {
986 /* Check whether it is compatible with what we know already! */
990 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
991 && !ANYOF_BITMAP_TEST(data->start_class, uc)
992 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
994 ANYOF_CLASS_ZERO(data->start_class);
995 ANYOF_BITMAP_ZERO(data->start_class);
997 ANYOF_BITMAP_SET(data->start_class, uc);
998 data->start_class->flags &= ~ANYOF_EOS;
999 data->start_class->flags |= ANYOF_FOLD;
1000 if (OP(scan) == EXACTFL)
1001 data->start_class->flags |= ANYOF_LOCALE;
1004 else if (flags & SCF_DO_STCLASS_OR) {
1005 if (data->start_class->flags & ANYOF_FOLD) {
1006 /* false positive possible if the class is case-folded.
1007 Assume that the locale settings are the same... */
1009 ANYOF_BITMAP_SET(data->start_class, uc);
1010 data->start_class->flags &= ~ANYOF_EOS;
1012 cl_and(data->start_class, &and_with);
1014 flags &= ~SCF_DO_STCLASS;
1016 else if (strchr((char*)PL_varies,OP(scan))) {
1017 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1018 I32 f = flags, pos_before = 0;
1019 regnode *oscan = scan;
1020 struct regnode_charclass_class this_class;
1021 struct regnode_charclass_class *oclass = NULL;
1022 I32 next_is_eval = 0;
1024 switch (PL_regkind[(U8)OP(scan)]) {
1025 case WHILEM: /* End of (?:...)* . */
1026 scan = NEXTOPER(scan);
1029 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1030 next = NEXTOPER(scan);
1031 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1033 maxcount = REG_INFTY;
1034 next = regnext(scan);
1035 scan = NEXTOPER(scan);
1039 if (flags & SCF_DO_SUBSTR)
1044 if (flags & SCF_DO_STCLASS) {
1046 maxcount = REG_INFTY;
1047 next = regnext(scan);
1048 scan = NEXTOPER(scan);
1051 is_inf = is_inf_internal = 1;
1052 scan = regnext(scan);
1053 if (flags & SCF_DO_SUBSTR) {
1054 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1055 data->longest = &(data->longest_float);
1057 goto optimize_curly_tail;
1059 mincount = ARG1(scan);
1060 maxcount = ARG2(scan);
1061 next = regnext(scan);
1062 if (OP(scan) == CURLYX) {
1063 I32 lp = (data ? *(data->last_closep) : 0);
1065 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1067 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1068 next_is_eval = (OP(scan) == EVAL);
1070 if (flags & SCF_DO_SUBSTR) {
1071 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1072 pos_before = data->pos_min;
1076 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1078 data->flags |= SF_IS_INF;
1080 if (flags & SCF_DO_STCLASS) {
1081 cl_init(pRExC_state, &this_class);
1082 oclass = data->start_class;
1083 data->start_class = &this_class;
1084 f |= SCF_DO_STCLASS_AND;
1085 f &= ~SCF_DO_STCLASS_OR;
1087 /* These are the cases when once a subexpression
1088 fails at a particular position, it cannot succeed
1089 even after backtracking at the enclosing scope.
1091 XXXX what if minimal match and we are at the
1092 initial run of {n,m}? */
1093 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1094 f &= ~SCF_WHILEM_VISITED_POS;
1096 /* This will finish on WHILEM, setting scan, or on NULL: */
1097 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1099 ? (f & ~SCF_DO_SUBSTR) : f);
1101 if (flags & SCF_DO_STCLASS)
1102 data->start_class = oclass;
1103 if (mincount == 0 || minnext == 0) {
1104 if (flags & SCF_DO_STCLASS_OR) {
1105 cl_or(pRExC_state, data->start_class, &this_class);
1107 else if (flags & SCF_DO_STCLASS_AND) {
1108 /* Switch to OR mode: cache the old value of
1109 * data->start_class */
1110 StructCopy(data->start_class, &and_with,
1111 struct regnode_charclass_class);
1112 flags &= ~SCF_DO_STCLASS_AND;
1113 StructCopy(&this_class, data->start_class,
1114 struct regnode_charclass_class);
1115 flags |= SCF_DO_STCLASS_OR;
1116 data->start_class->flags |= ANYOF_EOS;
1118 } else { /* Non-zero len */
1119 if (flags & SCF_DO_STCLASS_OR) {
1120 cl_or(pRExC_state, data->start_class, &this_class);
1121 cl_and(data->start_class, &and_with);
1123 else if (flags & SCF_DO_STCLASS_AND)
1124 cl_and(data->start_class, &this_class);
1125 flags &= ~SCF_DO_STCLASS;
1127 if (!scan) /* It was not CURLYX, but CURLY. */
1129 if (ckWARN(WARN_REGEXP)
1130 /* ? quantifier ok, except for (?{ ... }) */
1131 && (next_is_eval || !(mincount == 0 && maxcount == 1))
1132 && (minnext == 0) && (deltanext == 0)
1133 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1134 && maxcount <= REG_INFTY/3) /* Complement check for big count */
1137 "Quantifier unexpected on zero-length expression");
1140 min += minnext * mincount;
1141 is_inf_internal |= ((maxcount == REG_INFTY
1142 && (minnext + deltanext) > 0)
1143 || deltanext == I32_MAX);
1144 is_inf |= is_inf_internal;
1145 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1147 /* Try powerful optimization CURLYX => CURLYN. */
1148 if ( OP(oscan) == CURLYX && data
1149 && data->flags & SF_IN_PAR
1150 && !(data->flags & SF_HAS_EVAL)
1151 && !deltanext && minnext == 1 ) {
1152 /* Try to optimize to CURLYN. */
1153 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1154 regnode *nxt1 = nxt;
1161 if (!strchr((char*)PL_simple,OP(nxt))
1162 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1163 && STR_LEN(nxt) == 1))
1169 if (OP(nxt) != CLOSE)
1171 /* Now we know that nxt2 is the only contents: */
1172 oscan->flags = ARG(nxt);
1174 OP(nxt1) = NOTHING; /* was OPEN. */
1176 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1177 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1178 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1179 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1180 OP(nxt + 1) = OPTIMIZED; /* was count. */
1181 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1186 /* Try optimization CURLYX => CURLYM. */
1187 if ( OP(oscan) == CURLYX && data
1188 && !(data->flags & SF_HAS_PAR)
1189 && !(data->flags & SF_HAS_EVAL)
1191 /* XXXX How to optimize if data == 0? */
1192 /* Optimize to a simpler form. */
1193 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1197 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1198 && (OP(nxt2) != WHILEM))
1200 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1201 /* Need to optimize away parenths. */
1202 if (data->flags & SF_IN_PAR) {
1203 /* Set the parenth number. */
1204 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1206 if (OP(nxt) != CLOSE)
1207 FAIL("Panic opt close");
1208 oscan->flags = ARG(nxt);
1209 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1210 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1212 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1213 OP(nxt + 1) = OPTIMIZED; /* was count. */
1214 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1215 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1218 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1219 regnode *nnxt = regnext(nxt1);
1222 if (reg_off_by_arg[OP(nxt1)])
1223 ARG_SET(nxt1, nxt2 - nxt1);
1224 else if (nxt2 - nxt1 < U16_MAX)
1225 NEXT_OFF(nxt1) = nxt2 - nxt1;
1227 OP(nxt) = NOTHING; /* Cannot beautify */
1232 /* Optimize again: */
1233 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1239 else if ((OP(oscan) == CURLYX)
1240 && (flags & SCF_WHILEM_VISITED_POS)
1241 /* See the comment on a similar expression above.
1242 However, this time it not a subexpression
1243 we care about, but the expression itself. */
1244 && (maxcount == REG_INFTY)
1245 && data && ++data->whilem_c < 16) {
1246 /* This stays as CURLYX, we can put the count/of pair. */
1247 /* Find WHILEM (as in regexec.c) */
1248 regnode *nxt = oscan + NEXT_OFF(oscan);
1250 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1252 PREVOPER(nxt)->flags = data->whilem_c
1253 | (RExC_whilem_seen << 4); /* On WHILEM */
1255 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1257 if (flags & SCF_DO_SUBSTR) {
1258 SV *last_str = Nullsv;
1259 int counted = mincount != 0;
1261 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1262 #if defined(SPARC64_GCC_WORKAROUND)
1268 if (pos_before >= data->last_start_min)
1271 b = data->last_start_min;
1274 s = SvPV(data->last_found, l);
1275 old = b - data->last_start_min;
1278 I32 b = pos_before >= data->last_start_min
1279 ? pos_before : data->last_start_min;
1281 char *s = SvPV(data->last_found, l);
1282 I32 old = b - data->last_start_min;
1286 old = utf8_hop((U8*)s, old) - (U8*)s;
1289 /* Get the added string: */
1290 last_str = newSVpvn(s + old, l);
1291 if (deltanext == 0 && pos_before == b) {
1292 /* What was added is a constant string */
1294 SvGROW(last_str, (mincount * l) + 1);
1295 repeatcpy(SvPVX(last_str) + l,
1296 SvPVX(last_str), l, mincount - 1);
1297 SvCUR(last_str) *= mincount;
1298 /* Add additional parts. */
1299 SvCUR_set(data->last_found,
1300 SvCUR(data->last_found) - l);
1301 sv_catsv(data->last_found, last_str);
1302 data->last_end += l * (mincount - 1);
1305 /* start offset must point into the last copy */
1306 data->last_start_min += minnext * (mincount - 1);
1307 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1308 * (minnext + data->pos_delta);
1311 /* It is counted once already... */
1312 data->pos_min += minnext * (mincount - counted);
1313 data->pos_delta += - counted * deltanext +
1314 (minnext + deltanext) * maxcount - minnext * mincount;
1315 if (mincount != maxcount) {
1316 /* Cannot extend fixed substrings found inside
1318 scan_commit(pRExC_state,data);
1319 if (mincount && last_str) {
1320 sv_setsv(data->last_found, last_str);
1321 data->last_end = data->pos_min;
1322 data->last_start_min =
1323 data->pos_min - CHR_SVLEN(last_str);
1324 data->last_start_max = is_inf
1326 : data->pos_min + data->pos_delta
1327 - CHR_SVLEN(last_str);
1329 data->longest = &(data->longest_float);
1331 SvREFCNT_dec(last_str);
1333 if (data && (fl & SF_HAS_EVAL))
1334 data->flags |= SF_HAS_EVAL;
1335 optimize_curly_tail:
1336 if (OP(oscan) != CURLYX) {
1337 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1339 NEXT_OFF(oscan) += NEXT_OFF(next);
1342 default: /* REF and CLUMP only? */
1343 if (flags & SCF_DO_SUBSTR) {
1344 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1345 data->longest = &(data->longest_float);
1347 is_inf = is_inf_internal = 1;
1348 if (flags & SCF_DO_STCLASS_OR)
1349 cl_anything(pRExC_state, data->start_class);
1350 flags &= ~SCF_DO_STCLASS;
1354 else if (strchr((char*)PL_simple,OP(scan))) {
1357 if (flags & SCF_DO_SUBSTR) {
1358 scan_commit(pRExC_state,data);
1362 if (flags & SCF_DO_STCLASS) {
1363 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1365 /* Some of the logic below assumes that switching
1366 locale on will only add false positives. */
1367 switch (PL_regkind[(U8)OP(scan)]) {
1371 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1372 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1373 cl_anything(pRExC_state, data->start_class);
1376 if (OP(scan) == SANY)
1378 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1379 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1380 || (data->start_class->flags & ANYOF_CLASS));
1381 cl_anything(pRExC_state, data->start_class);
1383 if (flags & SCF_DO_STCLASS_AND || !value)
1384 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1387 if (flags & SCF_DO_STCLASS_AND)
1388 cl_and(data->start_class,
1389 (struct regnode_charclass_class*)scan);
1391 cl_or(pRExC_state, data->start_class,
1392 (struct regnode_charclass_class*)scan);
1395 if (flags & SCF_DO_STCLASS_AND) {
1396 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1397 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1398 for (value = 0; value < 256; value++)
1399 if (!isALNUM(value))
1400 ANYOF_BITMAP_CLEAR(data->start_class, value);
1404 if (data->start_class->flags & ANYOF_LOCALE)
1405 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1407 for (value = 0; value < 256; value++)
1409 ANYOF_BITMAP_SET(data->start_class, value);
1414 if (flags & SCF_DO_STCLASS_AND) {
1415 if (data->start_class->flags & ANYOF_LOCALE)
1416 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1419 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1420 data->start_class->flags |= ANYOF_LOCALE;
1424 if (flags & SCF_DO_STCLASS_AND) {
1425 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1426 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1427 for (value = 0; value < 256; value++)
1429 ANYOF_BITMAP_CLEAR(data->start_class, value);
1433 if (data->start_class->flags & ANYOF_LOCALE)
1434 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1436 for (value = 0; value < 256; value++)
1437 if (!isALNUM(value))
1438 ANYOF_BITMAP_SET(data->start_class, value);
1443 if (flags & SCF_DO_STCLASS_AND) {
1444 if (data->start_class->flags & ANYOF_LOCALE)
1445 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1448 data->start_class->flags |= ANYOF_LOCALE;
1449 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1453 if (flags & SCF_DO_STCLASS_AND) {
1454 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1455 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1456 for (value = 0; value < 256; value++)
1457 if (!isSPACE(value))
1458 ANYOF_BITMAP_CLEAR(data->start_class, value);
1462 if (data->start_class->flags & ANYOF_LOCALE)
1463 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1465 for (value = 0; value < 256; value++)
1467 ANYOF_BITMAP_SET(data->start_class, value);
1472 if (flags & SCF_DO_STCLASS_AND) {
1473 if (data->start_class->flags & ANYOF_LOCALE)
1474 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1477 data->start_class->flags |= ANYOF_LOCALE;
1478 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1482 if (flags & SCF_DO_STCLASS_AND) {
1483 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1484 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1485 for (value = 0; value < 256; value++)
1487 ANYOF_BITMAP_CLEAR(data->start_class, value);
1491 if (data->start_class->flags & ANYOF_LOCALE)
1492 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1494 for (value = 0; value < 256; value++)
1495 if (!isSPACE(value))
1496 ANYOF_BITMAP_SET(data->start_class, value);
1501 if (flags & SCF_DO_STCLASS_AND) {
1502 if (data->start_class->flags & ANYOF_LOCALE) {
1503 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1504 for (value = 0; value < 256; value++)
1505 if (!isSPACE(value))
1506 ANYOF_BITMAP_CLEAR(data->start_class, value);
1510 data->start_class->flags |= ANYOF_LOCALE;
1511 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1515 if (flags & SCF_DO_STCLASS_AND) {
1516 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1517 for (value = 0; value < 256; value++)
1518 if (!isDIGIT(value))
1519 ANYOF_BITMAP_CLEAR(data->start_class, value);
1522 if (data->start_class->flags & ANYOF_LOCALE)
1523 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1525 for (value = 0; value < 256; value++)
1527 ANYOF_BITMAP_SET(data->start_class, value);
1532 if (flags & SCF_DO_STCLASS_AND) {
1533 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1534 for (value = 0; value < 256; value++)
1536 ANYOF_BITMAP_CLEAR(data->start_class, value);
1539 if (data->start_class->flags & ANYOF_LOCALE)
1540 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1542 for (value = 0; value < 256; value++)
1543 if (!isDIGIT(value))
1544 ANYOF_BITMAP_SET(data->start_class, value);
1549 if (flags & SCF_DO_STCLASS_OR)
1550 cl_and(data->start_class, &and_with);
1551 flags &= ~SCF_DO_STCLASS;
1554 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1555 data->flags |= (OP(scan) == MEOL
1559 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1560 /* Lookbehind, or need to calculate parens/evals/stclass: */
1561 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1562 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1563 /* Lookahead/lookbehind */
1564 I32 deltanext, minnext, fake = 0;
1566 struct regnode_charclass_class intrnl;
1569 data_fake.flags = 0;
1571 data_fake.whilem_c = data->whilem_c;
1572 data_fake.last_closep = data->last_closep;
1575 data_fake.last_closep = &fake;
1576 if ( flags & SCF_DO_STCLASS && !scan->flags
1577 && OP(scan) == IFMATCH ) { /* Lookahead */
1578 cl_init(pRExC_state, &intrnl);
1579 data_fake.start_class = &intrnl;
1580 f |= SCF_DO_STCLASS_AND;
1582 if (flags & SCF_WHILEM_VISITED_POS)
1583 f |= SCF_WHILEM_VISITED_POS;
1584 next = regnext(scan);
1585 nscan = NEXTOPER(NEXTOPER(scan));
1586 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1589 vFAIL("Variable length lookbehind not implemented");
1591 else if (minnext > U8_MAX) {
1592 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1594 scan->flags = minnext;
1596 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1598 if (data && (data_fake.flags & SF_HAS_EVAL))
1599 data->flags |= SF_HAS_EVAL;
1601 data->whilem_c = data_fake.whilem_c;
1602 if (f & SCF_DO_STCLASS_AND) {
1603 int was = (data->start_class->flags & ANYOF_EOS);
1605 cl_and(data->start_class, &intrnl);
1607 data->start_class->flags |= ANYOF_EOS;
1610 else if (OP(scan) == OPEN) {
1613 else if (OP(scan) == CLOSE) {
1614 if (ARG(scan) == is_par) {
1615 next = regnext(scan);
1617 if ( next && (OP(next) != WHILEM) && next < last)
1618 is_par = 0; /* Disable optimization */
1621 *(data->last_closep) = ARG(scan);
1623 else if (OP(scan) == EVAL) {
1625 data->flags |= SF_HAS_EVAL;
1627 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1628 if (flags & SCF_DO_SUBSTR) {
1629 scan_commit(pRExC_state,data);
1630 data->longest = &(data->longest_float);
1632 is_inf = is_inf_internal = 1;
1633 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1634 cl_anything(pRExC_state, data->start_class);
1635 flags &= ~SCF_DO_STCLASS;
1637 /* Else: zero-length, ignore. */
1638 scan = regnext(scan);
1643 *deltap = is_inf_internal ? I32_MAX : delta;
1644 if (flags & SCF_DO_SUBSTR && is_inf)
1645 data->pos_delta = I32_MAX - data->pos_min;
1646 if (is_par > U8_MAX)
1648 if (is_par && pars==1 && data) {
1649 data->flags |= SF_IN_PAR;
1650 data->flags &= ~SF_HAS_PAR;
1652 else if (pars && data) {
1653 data->flags |= SF_HAS_PAR;
1654 data->flags &= ~SF_IN_PAR;
1656 if (flags & SCF_DO_STCLASS_OR)
1657 cl_and(data->start_class, &and_with);
1662 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1664 if (RExC_rx->data) {
1665 Renewc(RExC_rx->data,
1666 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1667 char, struct reg_data);
1668 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1669 RExC_rx->data->count += n;
1672 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1673 char, struct reg_data);
1674 New(1208, RExC_rx->data->what, n, U8);
1675 RExC_rx->data->count = n;
1677 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1678 return RExC_rx->data->count - n;
1682 Perl_reginitcolors(pTHX)
1685 char *s = PerlEnv_getenv("PERL_RE_COLORS");
1688 PL_colors[0] = s = savepv(s);
1690 s = strchr(s, '\t');
1696 PL_colors[i] = s = "";
1700 PL_colors[i++] = "";
1707 - pregcomp - compile a regular expression into internal code
1709 * We can't allocate space until we know how big the compiled form will be,
1710 * but we can't compile it (and thus know how big it is) until we've got a
1711 * place to put the code. So we cheat: we compile it twice, once with code
1712 * generation turned off and size counting turned on, and once "for real".
1713 * This also means that we don't allocate space until we are sure that the
1714 * thing really will compile successfully, and we never have to move the
1715 * code and thus invalidate pointers into it. (Note that it has to be in
1716 * one piece because free() must be able to free it all.) [NB: not true in perl]
1718 * Beware that the optimization-preparation code in here knows about some
1719 * of the structure of the compiled regexp. [I'll say.]
1722 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1732 RExC_state_t RExC_state;
1733 RExC_state_t *pRExC_state = &RExC_state;
1736 FAIL("NULL regexp argument");
1738 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1742 if (!PL_colorset) reginitcolors();
1743 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1744 PL_colors[4],PL_colors[5],PL_colors[0],
1745 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1747 RExC_flags16 = pm->op_pmflags;
1751 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1752 RExC_seen_evals = 0;
1755 /* First pass: determine size, legality. */
1762 RExC_emit = &PL_regdummy;
1763 RExC_whilem_seen = 0;
1764 #if 0 /* REGC() is (currently) a NOP at the first pass.
1765 * Clever compilers notice this and complain. --jhi */
1766 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1768 if (reg(pRExC_state, 0, &flags) == NULL) {
1769 RExC_precomp = Nullch;
1772 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1774 /* Small enough for pointer-storage convention?
1775 If extralen==0, this means that we will not need long jumps. */
1776 if (RExC_size >= 0x10000L && RExC_extralen)
1777 RExC_size += RExC_extralen;
1780 if (RExC_whilem_seen > 15)
1781 RExC_whilem_seen = 15;
1783 /* Allocate space and initialize. */
1784 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1787 FAIL("Regexp out of space");
1790 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1791 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1794 r->prelen = xend - exp;
1795 r->precomp = savepvn(RExC_precomp, r->prelen);
1797 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1798 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1800 r->substrs = 0; /* Useful during FAIL. */
1801 r->startp = 0; /* Useful during FAIL. */
1802 r->endp = 0; /* Useful during FAIL. */
1804 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1806 r->offsets[0] = RExC_size;
1808 DEBUG_r(PerlIO_printf(Perl_debug_log,
1809 "%s %"UVuf" bytes for offset annotations.\n",
1810 r->offsets ? "Got" : "Couldn't get",
1811 (UV)((2*RExC_size+1) * sizeof(U32))));
1815 /* Second pass: emit code. */
1816 RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
1821 RExC_emit_start = r->program;
1822 RExC_emit = r->program;
1823 /* Store the count of eval-groups for security checks: */
1824 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1825 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1827 if (reg(pRExC_state, 0, &flags) == NULL)
1830 /* Dig out information for optimizations. */
1831 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1832 pm->op_pmflags = RExC_flags16;
1834 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1835 r->regstclass = NULL;
1836 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1837 r->reganch |= ROPT_NAUGHTY;
1838 scan = r->program + 1; /* First BRANCH. */
1840 /* XXXX To minimize changes to RE engine we always allocate
1841 3-units-long substrs field. */
1842 Newz(1004, r->substrs, 1, struct reg_substr_data);
1844 StructCopy(&zero_scan_data, &data, scan_data_t);
1845 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1846 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1848 STRLEN longest_float_length, longest_fixed_length;
1849 struct regnode_charclass_class ch_class;
1854 /* Skip introductions and multiplicators >= 1. */
1855 while ((OP(first) == OPEN && (sawopen = 1)) ||
1856 /* An OR of *one* alternative - should not happen now. */
1857 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1858 (OP(first) == PLUS) ||
1859 (OP(first) == MINMOD) ||
1860 /* An {n,m} with n>0 */
1861 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1862 if (OP(first) == PLUS)
1865 first += regarglen[(U8)OP(first)];
1866 first = NEXTOPER(first);
1869 /* Starting-point info. */
1871 if (PL_regkind[(U8)OP(first)] == EXACT) {
1872 if (OP(first) == EXACT)
1873 ; /* Empty, get anchored substr later. */
1874 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1875 r->regstclass = first;
1877 else if (strchr((char*)PL_simple,OP(first)))
1878 r->regstclass = first;
1879 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1880 PL_regkind[(U8)OP(first)] == NBOUND)
1881 r->regstclass = first;
1882 else if (PL_regkind[(U8)OP(first)] == BOL) {
1883 r->reganch |= (OP(first) == MBOL
1885 : (OP(first) == SBOL
1888 first = NEXTOPER(first);
1891 else if (OP(first) == GPOS) {
1892 r->reganch |= ROPT_ANCH_GPOS;
1893 first = NEXTOPER(first);
1896 else if (!sawopen && (OP(first) == STAR &&
1897 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1898 !(r->reganch & ROPT_ANCH) )
1900 /* turn .* into ^.* with an implied $*=1 */
1901 int type = OP(NEXTOPER(first));
1903 if (type == REG_ANY)
1904 type = ROPT_ANCH_MBOL;
1906 type = ROPT_ANCH_SBOL;
1908 r->reganch |= type | ROPT_IMPLICIT;
1909 first = NEXTOPER(first);
1912 if (sawplus && (!sawopen || !RExC_sawback)
1913 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1914 /* x+ must match at the 1st pos of run of x's */
1915 r->reganch |= ROPT_SKIP;
1917 /* Scan is after the zeroth branch, first is atomic matcher. */
1918 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1919 (IV)(first - scan + 1)));
1921 * If there's something expensive in the r.e., find the
1922 * longest literal string that must appear and make it the
1923 * regmust. Resolve ties in favor of later strings, since
1924 * the regstart check works with the beginning of the r.e.
1925 * and avoiding duplication strengthens checking. Not a
1926 * strong reason, but sufficient in the absence of others.
1927 * [Now we resolve ties in favor of the earlier string if
1928 * it happens that c_offset_min has been invalidated, since the
1929 * earlier string may buy us something the later one won't.]
1933 data.longest_fixed = newSVpvn("",0);
1934 data.longest_float = newSVpvn("",0);
1935 data.last_found = newSVpvn("",0);
1936 data.longest = &(data.longest_fixed);
1938 if (!r->regstclass) {
1939 cl_init(pRExC_state, &ch_class);
1940 data.start_class = &ch_class;
1941 stclass_flag = SCF_DO_STCLASS_AND;
1942 } else /* XXXX Check for BOUND? */
1944 data.last_closep = &last_close;
1946 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1947 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1948 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1949 && data.last_start_min == 0 && data.last_end > 0
1950 && !RExC_seen_zerolen
1951 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1952 r->reganch |= ROPT_CHECK_ALL;
1953 scan_commit(pRExC_state, &data);
1954 SvREFCNT_dec(data.last_found);
1956 longest_float_length = CHR_SVLEN(data.longest_float);
1957 if (longest_float_length
1958 || (data.flags & SF_FL_BEFORE_EOL
1959 && (!(data.flags & SF_FL_BEFORE_MEOL)
1960 || (RExC_flags16 & PMf_MULTILINE)))) {
1963 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1964 && data.offset_fixed == data.offset_float_min
1965 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1966 goto remove_float; /* As in (a)+. */
1968 if (SvUTF8(data.longest_float)) {
1969 r->float_utf8 = data.longest_float;
1970 r->float_substr = Nullsv;
1972 r->float_substr = data.longest_float;
1973 r->float_utf8 = Nullsv;
1975 r->float_min_offset = data.offset_float_min;
1976 r->float_max_offset = data.offset_float_max;
1977 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1978 && (!(data.flags & SF_FL_BEFORE_MEOL)
1979 || (RExC_flags16 & PMf_MULTILINE)));
1980 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1984 r->float_substr = r->float_utf8 = Nullsv;
1985 SvREFCNT_dec(data.longest_float);
1986 longest_float_length = 0;
1989 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1990 if (longest_fixed_length
1991 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1992 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1993 || (RExC_flags16 & PMf_MULTILINE)))) {
1996 if (SvUTF8(data.longest_fixed)) {
1997 r->anchored_utf8 = data.longest_fixed;
1998 r->anchored_substr = Nullsv;
2000 r->anchored_substr = data.longest_fixed;
2001 r->anchored_utf8 = Nullsv;
2003 r->anchored_offset = data.offset_fixed;
2004 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2005 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2006 || (RExC_flags16 & PMf_MULTILINE)));
2007 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2010 r->anchored_substr = r->anchored_utf8 = Nullsv;
2011 SvREFCNT_dec(data.longest_fixed);
2012 longest_fixed_length = 0;
2015 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2016 r->regstclass = NULL;
2017 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2019 && !(data.start_class->flags & ANYOF_EOS)
2020 && !cl_is_anything(data.start_class)) {
2021 I32 n = add_data(pRExC_state, 1, "f");
2023 New(1006, RExC_rx->data->data[n], 1,
2024 struct regnode_charclass_class);
2025 StructCopy(data.start_class,
2026 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2027 struct regnode_charclass_class);
2028 r->regstclass = (regnode*)RExC_rx->data->data[n];
2029 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2030 PL_regdata = r->data; /* for regprop() */
2031 DEBUG_r({ SV *sv = sv_newmortal();
2032 regprop(sv, (regnode*)data.start_class);
2033 PerlIO_printf(Perl_debug_log,
2034 "synthetic stclass `%s'.\n",
2038 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2039 if (longest_fixed_length > longest_float_length) {
2040 r->check_substr = r->anchored_substr;
2041 r->check_utf8 = r->anchored_utf8;
2042 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2043 if (r->reganch & ROPT_ANCH_SINGLE)
2044 r->reganch |= ROPT_NOSCAN;
2047 r->check_substr = r->float_substr;
2048 r->check_utf8 = r->float_utf8;
2049 r->check_offset_min = data.offset_float_min;
2050 r->check_offset_max = data.offset_float_max;
2052 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2053 This should be changed ASAP! */
2054 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2055 r->reganch |= RE_USE_INTUIT;
2056 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2057 r->reganch |= RE_INTUIT_TAIL;
2061 /* Several toplevels. Best we can is to set minlen. */
2063 struct regnode_charclass_class ch_class;
2066 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2067 scan = r->program + 1;
2068 cl_init(pRExC_state, &ch_class);
2069 data.start_class = &ch_class;
2070 data.last_closep = &last_close;
2071 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2072 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2073 = r->float_substr = r->float_utf8 = Nullsv;
2074 if (!(data.start_class->flags & ANYOF_EOS)
2075 && !cl_is_anything(data.start_class)) {
2076 I32 n = add_data(pRExC_state, 1, "f");
2078 New(1006, RExC_rx->data->data[n], 1,
2079 struct regnode_charclass_class);
2080 StructCopy(data.start_class,
2081 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2082 struct regnode_charclass_class);
2083 r->regstclass = (regnode*)RExC_rx->data->data[n];
2084 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2085 DEBUG_r({ SV* sv = sv_newmortal();
2086 regprop(sv, (regnode*)data.start_class);
2087 PerlIO_printf(Perl_debug_log,
2088 "synthetic stclass `%s'.\n",
2094 if (RExC_seen & REG_SEEN_GPOS)
2095 r->reganch |= ROPT_GPOS_SEEN;
2096 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2097 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2098 if (RExC_seen & REG_SEEN_EVAL)
2099 r->reganch |= ROPT_EVAL_SEEN;
2100 if (RExC_seen & REG_SEEN_CANY)
2101 r->reganch |= ROPT_CANY_SEEN;
2102 Newz(1002, r->startp, RExC_npar, I32);
2103 Newz(1002, r->endp, RExC_npar, I32);
2104 PL_regdata = r->data; /* for regprop() */
2105 DEBUG_r(regdump(r));
2110 - reg - regular expression, i.e. main body or parenthesized thing
2112 * Caller must absorb opening parenthesis.
2114 * Combining parenthesis handling with the base level of regular expression
2115 * is a trifle forced, but the need to tie the tails of the branches to what
2116 * follows makes it hard to avoid.
2119 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2120 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2122 register regnode *ret; /* Will be the head of the group. */
2123 register regnode *br;
2124 register regnode *lastbr;
2125 register regnode *ender = 0;
2126 register I32 parno = 0;
2127 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
2129 /* for (?g), (?gc), and (?o) warnings; warning
2130 about (?c) will warn about (?g) -- japhy */
2132 I32 wastedflags = 0x00,
2135 wasted_gc = 0x02 | 0x04,
2138 char * parse_start = RExC_parse; /* MJD */
2139 char *oregcomp_parse = RExC_parse;
2142 *flagp = 0; /* Tentatively. */
2145 /* Make an OPEN node, if parenthesized. */
2147 if (*RExC_parse == '?') { /* (?...) */
2148 U16 posflags = 0, negflags = 0;
2149 U16 *flagsp = &posflags;
2151 char *seqstart = RExC_parse;
2154 paren = *RExC_parse++;
2155 ret = NULL; /* For look-ahead/behind. */
2157 case '<': /* (?<...) */
2158 RExC_seen |= REG_SEEN_LOOKBEHIND;
2159 if (*RExC_parse == '!')
2161 if (*RExC_parse != '=' && *RExC_parse != '!')
2164 case '=': /* (?=...) */
2165 case '!': /* (?!...) */
2166 RExC_seen_zerolen++;
2167 case ':': /* (?:...) */
2168 case '>': /* (?>...) */
2170 case '$': /* (?$...) */
2171 case '@': /* (?@...) */
2172 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2174 case '#': /* (?#...) */
2175 while (*RExC_parse && *RExC_parse != ')')
2177 if (*RExC_parse != ')')
2178 FAIL("Sequence (?#... not terminated");
2179 nextchar(pRExC_state);
2182 case 'p': /* (?p...) */
2183 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2184 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2186 case '?': /* (??...) */
2188 if (*RExC_parse != '{')
2190 paren = *RExC_parse++;
2192 case '{': /* (?{...}) */
2194 I32 count = 1, n = 0;
2196 char *s = RExC_parse;
2198 OP_4tree *sop, *rop;
2200 RExC_seen_zerolen++;
2201 RExC_seen |= REG_SEEN_EVAL;
2202 while (count && (c = *RExC_parse)) {
2203 if (c == '\\' && RExC_parse[1])
2211 if (*RExC_parse != ')')
2214 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2219 if (RExC_parse - 1 - s)
2220 sv = newSVpvn(s, RExC_parse - 1 - s);
2222 sv = newSVpvn("", 0);
2225 Perl_save_re_context(aTHX);
2226 rop = sv_compile_2op(sv, &sop, "re", &av);
2227 sop->op_private |= OPpREFCOUNTED;
2228 /* re_dup will OpREFCNT_inc */
2229 OpREFCNT_set(sop, 1);
2232 n = add_data(pRExC_state, 3, "nop");
2233 RExC_rx->data->data[n] = (void*)rop;
2234 RExC_rx->data->data[n+1] = (void*)sop;
2235 RExC_rx->data->data[n+2] = (void*)av;
2238 else { /* First pass */
2239 if (PL_reginterp_cnt < ++RExC_seen_evals
2240 && PL_curcop != &PL_compiling)
2241 /* No compiled RE interpolated, has runtime
2242 components ===> unsafe. */
2243 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2244 if (PL_tainting && PL_tainted)
2245 FAIL("Eval-group in insecure regular expression");
2248 nextchar(pRExC_state);
2250 ret = reg_node(pRExC_state, LOGICAL);
2253 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2254 /* deal with the length of this later - MJD */
2257 return reganode(pRExC_state, EVAL, n);
2259 case '(': /* (?(?{...})...) and (?(?=...)...) */
2261 if (RExC_parse[0] == '?') { /* (?(?...)) */
2262 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2263 || RExC_parse[1] == '<'
2264 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2267 ret = reg_node(pRExC_state, LOGICAL);
2270 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2274 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2276 parno = atoi(RExC_parse++);
2278 while (isDIGIT(*RExC_parse))
2280 ret = reganode(pRExC_state, GROUPP, parno);
2282 if ((c = *nextchar(pRExC_state)) != ')')
2283 vFAIL("Switch condition not recognized");
2285 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2286 br = regbranch(pRExC_state, &flags, 1);
2288 br = reganode(pRExC_state, LONGJMP, 0);
2290 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2291 c = *nextchar(pRExC_state);
2295 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2296 regbranch(pRExC_state, &flags, 1);
2297 regtail(pRExC_state, ret, lastbr);
2300 c = *nextchar(pRExC_state);
2305 vFAIL("Switch (?(condition)... contains too many branches");
2306 ender = reg_node(pRExC_state, TAIL);
2307 regtail(pRExC_state, br, ender);
2309 regtail(pRExC_state, lastbr, ender);
2310 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2313 regtail(pRExC_state, ret, ender);
2317 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2321 RExC_parse--; /* for vFAIL to print correctly */
2322 vFAIL("Sequence (? incomplete");
2326 parse_flags: /* (?i) */
2327 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2328 /* (?g), (?gc) and (?o) are useless here
2329 and must be globally applied -- japhy */
2331 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2332 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2333 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2334 if (! (wastedflags & wflagbit) ) {
2335 wastedflags |= wflagbit;
2338 "Useless (%s%c) - %suse /%c modifier",
2339 flagsp == &negflags ? "?-" : "?",
2341 flagsp == &negflags ? "don't " : "",
2347 else if (*RExC_parse == 'c') {
2348 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2349 if (! (wastedflags & wasted_c) ) {
2350 wastedflags |= wasted_gc;
2353 "Useless (%sc) - %suse /gc modifier",
2354 flagsp == &negflags ? "?-" : "?",
2355 flagsp == &negflags ? "don't " : ""
2360 else { pmflag(flagsp, *RExC_parse); }
2364 if (*RExC_parse == '-') {
2366 wastedflags = 0; /* reset so (?g-c) warns twice */
2370 RExC_flags16 |= posflags;
2371 RExC_flags16 &= ~negflags;
2372 if (*RExC_parse == ':') {
2378 if (*RExC_parse != ')') {
2380 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2382 nextchar(pRExC_state);
2390 ret = reganode(pRExC_state, OPEN, parno);
2391 Set_Node_Length(ret, 1); /* MJD */
2392 Set_Node_Offset(ret, RExC_parse); /* MJD */
2399 /* Pick up the branches, linking them together. */
2400 parse_start = RExC_parse; /* MJD */
2401 br = regbranch(pRExC_state, &flags, 1);
2402 /* branch_len = (paren != 0); */
2406 if (*RExC_parse == '|') {
2407 if (!SIZE_ONLY && RExC_extralen) {
2408 reginsert(pRExC_state, BRANCHJ, br);
2411 reginsert(pRExC_state, BRANCH, br);
2412 Set_Node_Length(br, paren != 0);
2413 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2417 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2419 else if (paren == ':') {
2420 *flagp |= flags&SIMPLE;
2422 if (open) { /* Starts with OPEN. */
2423 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2425 else if (paren != '?') /* Not Conditional */
2427 *flagp |= flags & (SPSTART | HASWIDTH);
2429 while (*RExC_parse == '|') {
2430 if (!SIZE_ONLY && RExC_extralen) {
2431 ender = reganode(pRExC_state, LONGJMP,0);
2432 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2435 RExC_extralen += 2; /* Account for LONGJMP. */
2436 nextchar(pRExC_state);
2437 br = regbranch(pRExC_state, &flags, 0);
2441 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2445 *flagp |= flags&SPSTART;
2448 if (have_branch || paren != ':') {
2449 /* Make a closing node, and hook it on the end. */
2452 ender = reg_node(pRExC_state, TAIL);
2455 ender = reganode(pRExC_state, CLOSE, parno);
2456 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2457 Set_Node_Length(ender,1); /* MJD */
2463 *flagp &= ~HASWIDTH;
2466 ender = reg_node(pRExC_state, SUCCEED);
2469 ender = reg_node(pRExC_state, END);
2472 regtail(pRExC_state, lastbr, ender);
2475 /* Hook the tails of the branches to the closing node. */
2476 for (br = ret; br != NULL; br = regnext(br)) {
2477 regoptail(pRExC_state, br, ender);
2484 static char parens[] = "=!<,>";
2486 if (paren && (p = strchr(parens, paren))) {
2487 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2488 int flag = (p - parens) > 1;
2491 node = SUSPEND, flag = 0;
2492 reginsert(pRExC_state, node,ret);
2494 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2498 /* Check for proper termination. */
2500 RExC_flags16 = oregflags;
2501 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2502 RExC_parse = oregcomp_parse;
2503 vFAIL("Unmatched (");
2506 else if (!paren && RExC_parse < RExC_end) {
2507 if (*RExC_parse == ')') {
2509 vFAIL("Unmatched )");
2512 FAIL("Junk on end of regexp"); /* "Can't happen". */
2520 - regbranch - one alternative of an | operator
2522 * Implements the concatenation operator.
2525 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2527 register regnode *ret;
2528 register regnode *chain = NULL;
2529 register regnode *latest;
2530 I32 flags = 0, c = 0;
2535 if (!SIZE_ONLY && RExC_extralen)
2536 ret = reganode(pRExC_state, BRANCHJ,0);
2538 ret = reg_node(pRExC_state, BRANCH);
2539 Set_Node_Length(ret, 1);
2543 if (!first && SIZE_ONLY)
2544 RExC_extralen += 1; /* BRANCHJ */
2546 *flagp = WORST; /* Tentatively. */
2549 nextchar(pRExC_state);
2550 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2552 latest = regpiece(pRExC_state, &flags);
2553 if (latest == NULL) {
2554 if (flags & TRYAGAIN)
2558 else if (ret == NULL)
2560 *flagp |= flags&HASWIDTH;
2561 if (chain == NULL) /* First piece. */
2562 *flagp |= flags&SPSTART;
2565 regtail(pRExC_state, chain, latest);
2570 if (chain == NULL) { /* Loop ran zero times. */
2571 chain = reg_node(pRExC_state, NOTHING);
2576 *flagp |= flags&SIMPLE;
2583 - regpiece - something followed by possible [*+?]
2585 * Note that the branching code sequences used for ? and the general cases
2586 * of * and + are somewhat optimized: they use the same NOTHING node as
2587 * both the endmarker for their branch list and the body of the last branch.
2588 * It might seem that this node could be dispensed with entirely, but the
2589 * endmarker role is not redundant.
2592 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2594 register regnode *ret;
2596 register char *next;
2598 char *origparse = RExC_parse;
2601 I32 max = REG_INFTY;
2604 ret = regatom(pRExC_state, &flags);
2606 if (flags & TRYAGAIN)
2613 if (op == '{' && regcurly(RExC_parse)) {
2614 parse_start = RExC_parse; /* MJD */
2615 next = RExC_parse + 1;
2617 while (isDIGIT(*next) || *next == ',') {
2626 if (*next == '}') { /* got one */
2630 min = atoi(RExC_parse);
2634 maxpos = RExC_parse;
2636 if (!max && *maxpos != '0')
2637 max = REG_INFTY; /* meaning "infinity" */
2638 else if (max >= REG_INFTY)
2639 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2641 nextchar(pRExC_state);
2644 if ((flags&SIMPLE)) {
2645 RExC_naughty += 2 + RExC_naughty / 2;
2646 reginsert(pRExC_state, CURLY, ret);
2647 Set_Node_Offset(ret, parse_start+1); /* MJD */
2648 Set_Node_Cur_Length(ret);
2651 regnode *w = reg_node(pRExC_state, WHILEM);
2654 regtail(pRExC_state, ret, w);
2655 if (!SIZE_ONLY && RExC_extralen) {
2656 reginsert(pRExC_state, LONGJMP,ret);
2657 reginsert(pRExC_state, NOTHING,ret);
2658 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2660 reginsert(pRExC_state, CURLYX,ret);
2662 Set_Node_Offset(ret, parse_start+1);
2663 Set_Node_Length(ret,
2664 op == '{' ? (RExC_parse - parse_start) : 1);
2666 if (!SIZE_ONLY && RExC_extralen)
2667 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2668 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2670 RExC_whilem_seen++, RExC_extralen += 3;
2671 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2679 if (max && max < min)
2680 vFAIL("Can't do {n,m} with n > m");
2695 #if 0 /* Now runtime fix should be reliable. */
2697 /* if this is reinstated, don't forget to put this back into perldiag:
2699 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2701 (F) The part of the regexp subject to either the * or + quantifier
2702 could match an empty string. The {#} shows in the regular
2703 expression about where the problem was discovered.
2707 if (!(flags&HASWIDTH) && op != '?')
2708 vFAIL("Regexp *+ operand could be empty");
2711 parse_start = RExC_parse;
2712 nextchar(pRExC_state);
2714 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2716 if (op == '*' && (flags&SIMPLE)) {
2717 reginsert(pRExC_state, STAR, ret);
2721 else if (op == '*') {
2725 else if (op == '+' && (flags&SIMPLE)) {
2726 reginsert(pRExC_state, PLUS, ret);
2730 else if (op == '+') {
2734 else if (op == '?') {
2739 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2741 "%.*s matches null string many times",
2742 RExC_parse - origparse,
2746 if (*RExC_parse == '?') {
2747 nextchar(pRExC_state);
2748 reginsert(pRExC_state, MINMOD, ret);
2749 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2751 if (ISMULT2(RExC_parse)) {
2753 vFAIL("Nested quantifiers");
2760 - regatom - the lowest level
2762 * Optimization: gobbles an entire sequence of ordinary characters so that
2763 * it can turn them into a single node, which is smaller to store and
2764 * faster to run. Backslashed characters are exceptions, each becoming a
2765 * separate node; the code is simpler that way and it's not worth fixing.
2767 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2769 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2771 register regnode *ret = 0;
2773 char *parse_start = 0;
2775 *flagp = WORST; /* Tentatively. */
2778 switch (*RExC_parse) {
2780 RExC_seen_zerolen++;
2781 nextchar(pRExC_state);
2782 if (RExC_flags16 & PMf_MULTILINE)
2783 ret = reg_node(pRExC_state, MBOL);
2784 else if (RExC_flags16 & PMf_SINGLELINE)
2785 ret = reg_node(pRExC_state, SBOL);
2787 ret = reg_node(pRExC_state, BOL);
2788 Set_Node_Length(ret, 1); /* MJD */
2791 nextchar(pRExC_state);
2793 RExC_seen_zerolen++;
2794 if (RExC_flags16 & PMf_MULTILINE)
2795 ret = reg_node(pRExC_state, MEOL);
2796 else if (RExC_flags16 & PMf_SINGLELINE)
2797 ret = reg_node(pRExC_state, SEOL);
2799 ret = reg_node(pRExC_state, EOL);
2800 Set_Node_Length(ret, 1); /* MJD */
2803 nextchar(pRExC_state);
2804 if (RExC_flags16 & PMf_SINGLELINE)
2805 ret = reg_node(pRExC_state, SANY);
2807 ret = reg_node(pRExC_state, REG_ANY);
2808 *flagp |= HASWIDTH|SIMPLE;
2810 Set_Node_Length(ret, 1); /* MJD */
2814 char *oregcomp_parse = ++RExC_parse;
2815 ret = regclass(pRExC_state);
2816 if (*RExC_parse != ']') {
2817 RExC_parse = oregcomp_parse;
2818 vFAIL("Unmatched [");
2820 nextchar(pRExC_state);
2821 *flagp |= HASWIDTH|SIMPLE;
2822 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2826 nextchar(pRExC_state);
2827 ret = reg(pRExC_state, 1, &flags);
2829 if (flags & TRYAGAIN) {
2830 if (RExC_parse == RExC_end) {
2831 /* Make parent create an empty node if needed. */
2839 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2843 if (flags & TRYAGAIN) {
2847 vFAIL("Internal urp");
2848 /* Supposed to be caught earlier. */
2851 if (!regcurly(RExC_parse)) {
2860 vFAIL("Quantifier follows nothing");
2863 switch (*++RExC_parse) {
2865 RExC_seen_zerolen++;
2866 ret = reg_node(pRExC_state, SBOL);
2868 nextchar(pRExC_state);
2869 Set_Node_Length(ret, 2); /* MJD */
2872 ret = reg_node(pRExC_state, GPOS);
2873 RExC_seen |= REG_SEEN_GPOS;
2875 nextchar(pRExC_state);
2876 Set_Node_Length(ret, 2); /* MJD */
2879 ret = reg_node(pRExC_state, SEOL);
2881 RExC_seen_zerolen++; /* Do not optimize RE away */
2882 nextchar(pRExC_state);
2885 ret = reg_node(pRExC_state, EOS);
2887 RExC_seen_zerolen++; /* Do not optimize RE away */
2888 nextchar(pRExC_state);
2889 Set_Node_Length(ret, 2); /* MJD */
2892 ret = reg_node(pRExC_state, CANY);
2893 RExC_seen |= REG_SEEN_CANY;
2894 *flagp |= HASWIDTH|SIMPLE;
2895 nextchar(pRExC_state);
2896 Set_Node_Length(ret, 2); /* MJD */
2899 ret = reg_node(pRExC_state, CLUMP);
2901 nextchar(pRExC_state);
2902 Set_Node_Length(ret, 2); /* MJD */
2905 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
2906 *flagp |= HASWIDTH|SIMPLE;
2907 nextchar(pRExC_state);
2908 Set_Node_Length(ret, 2); /* MJD */
2911 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
2912 *flagp |= HASWIDTH|SIMPLE;
2913 nextchar(pRExC_state);
2914 Set_Node_Length(ret, 2); /* MJD */
2917 RExC_seen_zerolen++;
2918 RExC_seen |= REG_SEEN_LOOKBEHIND;
2919 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
2921 nextchar(pRExC_state);
2922 Set_Node_Length(ret, 2); /* MJD */
2925 RExC_seen_zerolen++;
2926 RExC_seen |= REG_SEEN_LOOKBEHIND;
2927 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
2929 nextchar(pRExC_state);
2930 Set_Node_Length(ret, 2); /* MJD */
2933 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
2934 *flagp |= HASWIDTH|SIMPLE;
2935 nextchar(pRExC_state);
2936 Set_Node_Length(ret, 2); /* MJD */
2939 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
2940 *flagp |= HASWIDTH|SIMPLE;
2941 nextchar(pRExC_state);
2942 Set_Node_Length(ret, 2); /* MJD */
2945 ret = reg_node(pRExC_state, DIGIT);
2946 *flagp |= HASWIDTH|SIMPLE;
2947 nextchar(pRExC_state);
2948 Set_Node_Length(ret, 2); /* MJD */
2951 ret = reg_node(pRExC_state, NDIGIT);
2952 *flagp |= HASWIDTH|SIMPLE;
2953 nextchar(pRExC_state);
2954 Set_Node_Length(ret, 2); /* MJD */
2959 char* oldregxend = RExC_end;
2960 char* parse_start = RExC_parse;
2962 if (RExC_parse[1] == '{') {
2963 /* a lovely hack--pretend we saw [\pX] instead */
2964 RExC_end = strchr(RExC_parse, '}');
2966 U8 c = (U8)*RExC_parse;
2968 RExC_end = oldregxend;
2969 vFAIL2("Missing right brace on \\%c{}", c);
2974 RExC_end = RExC_parse + 2;
2975 if (RExC_end > oldregxend)
2976 RExC_end = oldregxend;
2980 ret = regclass(pRExC_state);
2982 RExC_end = oldregxend;
2984 Set_Node_Cur_Length(ret); /* MJD */
2985 nextchar(pRExC_state);
2986 *flagp |= HASWIDTH|SIMPLE;
2999 case '1': case '2': case '3': case '4':
3000 case '5': case '6': case '7': case '8': case '9':
3002 I32 num = atoi(RExC_parse);
3004 if (num > 9 && num >= RExC_npar)
3007 char * parse_start = RExC_parse - 1; /* MJD */
3008 while (isDIGIT(*RExC_parse))
3011 if (!SIZE_ONLY && num > RExC_rx->nparens)
3012 vFAIL("Reference to nonexistent group");
3014 ret = reganode(pRExC_state, FOLD
3015 ? (LOC ? REFFL : REFF)
3019 /* override incorrect value set in reganode MJD */
3020 Set_Node_Offset(ret, parse_start+1);
3021 Set_Node_Cur_Length(ret); /* MJD */
3023 nextchar(pRExC_state);
3028 if (RExC_parse >= RExC_end)
3029 FAIL("Trailing \\");
3032 /* Do not generate `unrecognized' warnings here, we fall
3033 back into the quick-grab loop below */
3039 if (RExC_flags16 & PMf_EXTENDED) {
3040 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3041 if (RExC_parse < RExC_end)
3047 register STRLEN len;
3053 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3055 parse_start = RExC_parse - 1;
3061 ret = reg_node(pRExC_state, FOLD
3062 ? (LOC ? EXACTFL : EXACTF)
3065 for (len = 0, p = RExC_parse - 1;
3066 len < 127 && p < RExC_end;
3071 if (RExC_flags16 & PMf_EXTENDED)
3072 p = regwhite(p, RExC_end);
3119 ender = ASCII_TO_NATIVE('\033');
3123 ender = ASCII_TO_NATIVE('\007');
3128 char* e = strchr(p, '}');
3132 vFAIL("Missing right brace on \\x{}");
3135 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3136 | PERL_SCAN_DISALLOW_PREFIX;
3138 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3141 /* numlen is generous */
3142 if (numlen + len >= 127) {
3150 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3152 ender = grok_hex(p, &numlen, &flags, NULL);
3158 ender = UCHARAT(p++);
3159 ender = toCTRL(ender);
3161 case '0': case '1': case '2': case '3':case '4':
3162 case '5': case '6': case '7': case '8':case '9':
3164 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3167 ender = grok_oct(p, &numlen, &flags, NULL);
3177 FAIL("Trailing \\");
3180 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3181 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3182 goto normal_default;
3187 if (UTF8_IS_START(*p) && UTF) {
3188 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3196 if (RExC_flags16 & PMf_EXTENDED)
3197 p = regwhite(p, RExC_end);
3199 /* Prime the casefolded buffer. */
3200 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3202 if (ISMULT2(p)) { /* Back off on ?+*. */
3209 /* Emit all the Unicode characters. */
3210 for (foldbuf = tmpbuf;
3212 foldlen -= numlen) {
3213 ender = utf8_to_uvchr(foldbuf, &numlen);
3215 reguni(pRExC_state, ender, s, &unilen);
3218 /* In EBCDIC the numlen
3219 * and unilen can differ. */
3221 if (numlen >= foldlen)
3225 break; /* "Can't happen." */
3229 reguni(pRExC_state, ender, s, &unilen);
3246 /* Emit all the Unicode characters. */
3247 for (foldbuf = tmpbuf;
3249 foldlen -= numlen) {
3250 ender = utf8_to_uvchr(foldbuf, &numlen);
3252 reguni(pRExC_state, ender, s, &unilen);
3255 /* In EBCDIC the numlen
3256 * and unilen can differ. */
3258 if (numlen >= foldlen)
3266 reguni(pRExC_state, ender, s, &unilen);
3279 Set_Node_Cur_Length(ret); /* MJD */
3280 nextchar(pRExC_state);
3282 /* len is STRLEN which is unsigned, need to copy to signed */
3285 vFAIL("Internal disaster");
3294 RExC_size += STR_SZ(len);
3296 RExC_emit += STR_SZ(len);
3301 /* If the encoding pragma is in effect recode the text of
3302 * any EXACT-kind nodes. */
3303 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3304 STRLEN oldlen = STR_LEN(ret);
3305 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3309 if (sv_utf8_downgrade(sv, TRUE)) {
3310 char *s = sv_recode_to_utf8(sv, PL_encoding);
3311 STRLEN newlen = SvCUR(sv);
3314 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3315 (int)oldlen, STRING(ret),
3317 Copy(s, STRING(ret), newlen, char);
3318 STR_LEN(ret) += newlen - oldlen;
3319 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3321 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3329 S_regwhite(pTHX_ char *p, char *e)
3334 else if (*p == '#') {
3337 } while (p < e && *p != '\n');
3345 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3346 Character classes ([:foo:]) can also be negated ([:^foo:]).
3347 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3348 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3349 but trigger failures because they are currently unimplemented. */
3351 #define POSIXCC_DONE(c) ((c) == ':')
3352 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3353 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3356 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3359 I32 namedclass = OOB_NAMEDCLASS;
3361 if (value == '[' && RExC_parse + 1 < RExC_end &&
3362 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3363 POSIXCC(UCHARAT(RExC_parse))) {
3364 char c = UCHARAT(RExC_parse);
3365 char* s = RExC_parse++;
3367 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3369 if (RExC_parse == RExC_end)
3370 /* Grandfather lone [:, [=, [. */
3373 char* t = RExC_parse++; /* skip over the c */
3375 if (UCHARAT(RExC_parse) == ']') {
3376 RExC_parse++; /* skip over the ending ] */
3379 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3380 I32 skip = 5; /* the most common skip */
3384 if (strnEQ(posixcc, "alnum", 5))
3386 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3387 else if (strnEQ(posixcc, "alpha", 5))
3389 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3390 else if (strnEQ(posixcc, "ascii", 5))
3392 complement ? ANYOF_NASCII : ANYOF_ASCII;
3395 if (strnEQ(posixcc, "blank", 5))
3397 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3400 if (strnEQ(posixcc, "cntrl", 5))
3402 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3405 if (strnEQ(posixcc, "digit", 5))
3407 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3410 if (strnEQ(posixcc, "graph", 5))
3412 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3415 if (strnEQ(posixcc, "lower", 5))
3417 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3420 if (strnEQ(posixcc, "print", 5))
3422 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3423 else if (strnEQ(posixcc, "punct", 5))
3425 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3428 if (strnEQ(posixcc, "space", 5))
3430 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3433 if (strnEQ(posixcc, "upper", 5))
3435 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3437 case 'w': /* this is not POSIX, this is the Perl \w */
3438 if (strnEQ(posixcc, "word", 4)) {
3440 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3445 if (strnEQ(posixcc, "xdigit", 6)) {
3447 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3452 if (namedclass == OOB_NAMEDCLASS ||
3453 posixcc[skip] != ':' ||
3454 posixcc[skip+1] != ']')
3456 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3459 } else if (!SIZE_ONLY) {
3460 /* [[=foo=]] and [[.foo.]] are still future. */
3462 /* adjust RExC_parse so the warning shows after
3464 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3466 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3469 /* Maternal grandfather:
3470 * "[:" ending in ":" but not in ":]" */
3480 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3482 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3483 char *s = RExC_parse;
3486 while(*s && isALNUM(*s))
3488 if (*s && c == *s && s[1] == ']') {
3489 if (ckWARN(WARN_REGEXP))
3491 "POSIX syntax [%c %c] belongs inside character classes",
3494 /* [[=foo=]] and [[.foo.]] are still future. */
3495 if (POSIXCC_NOTYET(c)) {
3496 /* adjust RExC_parse so the error shows after
3498 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3500 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3507 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3510 register UV nextvalue;
3511 register IV prevvalue = OOB_UNICODE;
3512 register IV range = 0;
3513 register regnode *ret;
3516 char *rangebegin = 0;
3517 bool need_class = 0;
3518 SV *listsv = Nullsv;
3521 bool optimize_invert = TRUE;
3522 AV* unicode_alternate = 0;
3524 ret = reganode(pRExC_state, ANYOF, 0);
3527 ANYOF_FLAGS(ret) = 0;
3529 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3533 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3537 RExC_size += ANYOF_SKIP;
3539 RExC_emit += ANYOF_SKIP;
3541 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3543 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3544 ANYOF_BITMAP_ZERO(ret);
3545 listsv = newSVpvn("# comment\n", 10);
3548 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3550 if (!SIZE_ONLY && POSIXCC(nextvalue))
3551 checkposixcc(pRExC_state);
3553 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3554 if (UCHARAT(RExC_parse) == ']')
3557 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3561 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3564 rangebegin = RExC_parse;
3566 value = utf8n_to_uvchr((U8*)RExC_parse,
3567 RExC_end - RExC_parse,
3569 RExC_parse += numlen;
3572 value = UCHARAT(RExC_parse++);
3573 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3574 if (value == '[' && POSIXCC(nextvalue))
3575 namedclass = regpposixcc(pRExC_state, value);
3576 else if (value == '\\') {
3578 value = utf8n_to_uvchr((U8*)RExC_parse,
3579 RExC_end - RExC_parse,
3581 RExC_parse += numlen;
3584 value = UCHARAT(RExC_parse++);
3585 /* Some compilers cannot handle switching on 64-bit integer
3586 * values, therefore value cannot be an UV. Yes, this will
3587 * be a problem later if we want switch on Unicode.
3588 * A similar issue a little bit later when switching on
3589 * namedclass. --jhi */
3590 switch ((I32)value) {
3591 case 'w': namedclass = ANYOF_ALNUM; break;
3592 case 'W': namedclass = ANYOF_NALNUM; break;
3593 case 's': namedclass = ANYOF_SPACE; break;
3594 case 'S': namedclass = ANYOF_NSPACE; break;
3595 case 'd': namedclass = ANYOF_DIGIT; break;
3596 case 'D': namedclass = ANYOF_NDIGIT; break;
3599 if (RExC_parse >= RExC_end)
3600 vFAIL2("Empty \\%c{}", value);
3601 if (*RExC_parse == '{') {
3603 e = strchr(RExC_parse++, '}');
3605 vFAIL2("Missing right brace on \\%c{}", c);
3606 while (isSPACE(UCHARAT(RExC_parse)))
3608 if (e == RExC_parse)
3609 vFAIL2("Empty \\%c{}", c);
3611 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3619 if (UCHARAT(RExC_parse) == '^') {
3622 value = value == 'p' ? 'P' : 'p'; /* toggle */
3623 while (isSPACE(UCHARAT(RExC_parse))) {
3629 Perl_sv_catpvf(aTHX_ listsv,
3630 "+utf8::%.*s\n", (int)n, RExC_parse);
3632 Perl_sv_catpvf(aTHX_ listsv,
3633 "!utf8::%.*s\n", (int)n, RExC_parse);
3636 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3638 case 'n': value = '\n'; break;
3639 case 'r': value = '\r'; break;
3640 case 't': value = '\t'; break;
3641 case 'f': value = '\f'; break;
3642 case 'b': value = '\b'; break;
3643 case 'e': value = ASCII_TO_NATIVE('\033');break;
3644 case 'a': value = ASCII_TO_NATIVE('\007');break;
3646 if (*RExC_parse == '{') {
3647 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3648 | PERL_SCAN_DISALLOW_PREFIX;
3649 e = strchr(RExC_parse++, '}');
3651 vFAIL("Missing right brace on \\x{}");
3653 numlen = e - RExC_parse;
3654 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3658 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3660 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3661 RExC_parse += numlen;
3665 value = UCHARAT(RExC_parse++);
3666 value = toCTRL(value);
3668 case '0': case '1': case '2': case '3': case '4':
3669 case '5': case '6': case '7': case '8': case '9':
3673 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3674 RExC_parse += numlen;
3678 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3680 "Unrecognized escape \\%c in character class passed through",
3684 } /* end of \blah */
3686 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3688 if (!SIZE_ONLY && !need_class)
3689 ANYOF_CLASS_ZERO(ret);
3693 /* a bad range like a-\d, a-[:digit:] ? */
3696 if (ckWARN(WARN_REGEXP))
3698 "False [] range \"%*.*s\"",
3699 RExC_parse - rangebegin,
3700 RExC_parse - rangebegin,
3702 if (prevvalue < 256) {
3703 ANYOF_BITMAP_SET(ret, prevvalue);
3704 ANYOF_BITMAP_SET(ret, '-');
3707 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3708 Perl_sv_catpvf(aTHX_ listsv,
3709 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3713 range = 0; /* this was not a true range */
3717 if (namedclass > OOB_NAMEDCLASS)
3718 optimize_invert = FALSE;
3719 /* Possible truncation here but in some 64-bit environments
3720 * the compiler gets heartburn about switch on 64-bit values.
3721 * A similar issue a little earlier when switching on value.
3723 switch ((I32)namedclass) {
3726 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3728 for (value = 0; value < 256; value++)
3730 ANYOF_BITMAP_SET(ret, value);
3732 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
3736 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3738 for (value = 0; value < 256; value++)
3739 if (!isALNUM(value))
3740 ANYOF_BITMAP_SET(ret, value);
3742 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3746 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3748 for (value = 0; value < 256; value++)
3749 if (isALNUMC(value))
3750 ANYOF_BITMAP_SET(ret, value);
3752 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3756 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3758 for (value = 0; value < 256; value++)
3759 if (!isALNUMC(value))
3760 ANYOF_BITMAP_SET(ret, value);
3762 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3766 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3768 for (value = 0; value < 256; value++)
3770 ANYOF_BITMAP_SET(ret, value);
3772 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3776 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3778 for (value = 0; value < 256; value++)
3779 if (!isALPHA(value))
3780 ANYOF_BITMAP_SET(ret, value);
3782 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3786 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3789 for (value = 0; value < 128; value++)
3790 ANYOF_BITMAP_SET(ret, value);
3792 for (value = 0; value < 256; value++) {
3794 ANYOF_BITMAP_SET(ret, value);
3798 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3802 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3805 for (value = 128; value < 256; value++)
3806 ANYOF_BITMAP_SET(ret, value);
3808 for (value = 0; value < 256; value++) {
3809 if (!isASCII(value))
3810 ANYOF_BITMAP_SET(ret, value);
3814 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3818 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3820 for (value = 0; value < 256; value++)
3822 ANYOF_BITMAP_SET(ret, value);
3824 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3828 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3830 for (value = 0; value < 256; value++)
3831 if (!isBLANK(value))
3832 ANYOF_BITMAP_SET(ret, value);
3834 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3838 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3840 for (value = 0; value < 256; value++)
3842 ANYOF_BITMAP_SET(ret, value);
3844 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3848 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3850 for (value = 0; value < 256; value++)
3851 if (!isCNTRL(value))
3852 ANYOF_BITMAP_SET(ret, value);
3854 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3858 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3860 /* consecutive digits assumed */
3861 for (value = '0'; value <= '9'; value++)
3862 ANYOF_BITMAP_SET(ret, value);
3864 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3868 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3870 /* consecutive digits assumed */
3871 for (value = 0; value < '0'; value++)
3872 ANYOF_BITMAP_SET(ret, value);
3873 for (value = '9' + 1; value < 256; value++)
3874 ANYOF_BITMAP_SET(ret, value);
3876 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3880 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3882 for (value = 0; value < 256; value++)
3884 ANYOF_BITMAP_SET(ret, value);
3886 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3890 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3892 for (value = 0; value < 256; value++)
3893 if (!isGRAPH(value))
3894 ANYOF_BITMAP_SET(ret, value);
3896 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3900 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3902 for (value = 0; value < 256; value++)
3904 ANYOF_BITMAP_SET(ret, value);
3906 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3910 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3912 for (value = 0; value < 256; value++)
3913 if (!isLOWER(value))
3914 ANYOF_BITMAP_SET(ret, value);
3916 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3920 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3922 for (value = 0; value < 256; value++)
3924 ANYOF_BITMAP_SET(ret, value);
3926 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3930 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3932 for (value = 0; value < 256; value++)
3933 if (!isPRINT(value))
3934 ANYOF_BITMAP_SET(ret, value);
3936 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3940 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3942 for (value = 0; value < 256; value++)
3943 if (isPSXSPC(value))
3944 ANYOF_BITMAP_SET(ret, value);
3946 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3950 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3952 for (value = 0; value < 256; value++)
3953 if (!isPSXSPC(value))
3954 ANYOF_BITMAP_SET(ret, value);
3956 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3960 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3962 for (value = 0; value < 256; value++)
3964 ANYOF_BITMAP_SET(ret, value);
3966 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3970 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3972 for (value = 0; value < 256; value++)
3973 if (!isPUNCT(value))
3974 ANYOF_BITMAP_SET(ret, value);
3976 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3980 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3982 for (value = 0; value < 256; value++)
3984 ANYOF_BITMAP_SET(ret, value);
3986 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3990 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3992 for (value = 0; value < 256; value++)
3993 if (!isSPACE(value))
3994 ANYOF_BITMAP_SET(ret, value);
3996 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4000 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4002 for (value = 0; value < 256; value++)
4004 ANYOF_BITMAP_SET(ret, value);
4006 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4010 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4012 for (value = 0; value < 256; value++)
4013 if (!isUPPER(value))
4014 ANYOF_BITMAP_SET(ret, value);
4016 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4020 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4022 for (value = 0; value < 256; value++)
4023 if (isXDIGIT(value))
4024 ANYOF_BITMAP_SET(ret, value);
4026 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4030 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4032 for (value = 0; value < 256; value++)
4033 if (!isXDIGIT(value))
4034 ANYOF_BITMAP_SET(ret, value);
4036 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4039 vFAIL("Invalid [::] class");
4043 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4046 } /* end of namedclass \blah */
4049 if (prevvalue > value) /* b-a */ {
4050 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4051 RExC_parse - rangebegin,
4052 RExC_parse - rangebegin,
4054 range = 0; /* not a valid range */
4058 prevvalue = value; /* save the beginning of the range */
4059 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4060 RExC_parse[1] != ']') {
4063 /* a bad range like \w-, [:word:]- ? */
4064 if (namedclass > OOB_NAMEDCLASS) {
4065 if (ckWARN(WARN_REGEXP))
4067 "False [] range \"%*.*s\"",
4068 RExC_parse - rangebegin,
4069 RExC_parse - rangebegin,
4072 ANYOF_BITMAP_SET(ret, '-');
4074 range = 1; /* yeah, it's a range! */
4075 continue; /* but do it the next time */
4079 /* now is the next time */
4083 if (prevvalue < 256) {
4084 IV ceilvalue = value < 256 ? value : 255;
4087 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4088 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
4090 if (isLOWER(prevvalue)) {
4091 for (i = prevvalue; i <= ceilvalue; i++)
4093 ANYOF_BITMAP_SET(ret, i);
4095 for (i = prevvalue; i <= ceilvalue; i++)
4097 ANYOF_BITMAP_SET(ret, i);
4102 for (i = prevvalue; i <= ceilvalue; i++)
4103 ANYOF_BITMAP_SET(ret, i);
4105 if (value > 255 || UTF) {
4106 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4107 UV natvalue = NATIVE_TO_UNI(value);
4109 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4110 if (prevnatvalue < natvalue) { /* what about > ? */
4111 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4112 prevnatvalue, natvalue);
4114 else if (prevnatvalue == natvalue) {
4115 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4117 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4119 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4121 /* If folding and foldable and a single
4122 * character, insert also the folded version
4123 * to the charclass. */
4125 if (foldlen == UNISKIP(f))
4126 Perl_sv_catpvf(aTHX_ listsv,
4129 /* Any multicharacter foldings
4130 * require the following transform:
4131 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4132 * where E folds into "pq" and F folds
4133 * into "rst", all other characters
4134 * fold to single characters. We save
4135 * away these multicharacter foldings,
4136 * to be later saved as part of the
4137 * additional "s" data. */
4140 if (!unicode_alternate)
4141 unicode_alternate = newAV();
4142 sv = newSVpvn((char*)foldbuf, foldlen);
4144 av_push(unicode_alternate, sv);
4148 /* If folding and the value is one of the Greek
4149 * sigmas insert a few more sigmas to make the
4150 * folding rules of the sigmas to work right.
4151 * Note that not all the possible combinations
4152 * are handled here: some of them are handled
4153 * by the standard folding rules, and some of
4154 * them (literal or EXACTF cases) are handled
4155 * during runtime in regexec.c:S_find_byclass(). */
4156 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4157 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4158 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4159 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4160 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4162 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4163 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4164 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4170 range = 0; /* this range (if it was one) is done now */
4174 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4176 RExC_size += ANYOF_CLASS_ADD_SKIP;
4178 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4181 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4183 /* If the only flag is folding (plus possibly inversion). */
4184 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4186 for (value = 0; value < 256; ++value) {
4187 if (ANYOF_BITMAP_TEST(ret, value)) {
4188 IV fold = PL_fold[value];
4191 ANYOF_BITMAP_SET(ret, fold);
4194 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4197 /* optimize inverted simple patterns (e.g. [^a-z]) */
4198 if (!SIZE_ONLY && optimize_invert &&
4199 /* If the only flag is inversion. */
4200 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4201 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4202 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4203 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4210 /* The 0th element stores the character class description
4211 * in its textual form: used later (regexec.c:Perl_regclass_swatch())
4212 * to initialize the appropriate swash (which gets stored in
4213 * the 1st element), and also useful for dumping the regnode.
4214 * The 2nd element stores the multicharacter foldings,
4215 * used later (regexec.c:s_reginclasslen()). */
4216 av_store(av, 0, listsv);
4217 av_store(av, 1, NULL);
4218 av_store(av, 2, (SV*)unicode_alternate);
4219 rv = newRV_noinc((SV*)av);
4220 n = add_data(pRExC_state, 1, "s");
4221 RExC_rx->data->data[n] = (void*)rv;
4229 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4231 char* retval = RExC_parse++;
4234 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4235 RExC_parse[2] == '#') {
4236 while (*RExC_parse && *RExC_parse != ')')
4241 if (RExC_flags16 & PMf_EXTENDED) {
4242 if (isSPACE(*RExC_parse)) {
4246 else if (*RExC_parse == '#') {
4247 while (*RExC_parse && *RExC_parse != '\n')
4258 - reg_node - emit a node
4260 STATIC regnode * /* Location. */
4261 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4263 register regnode *ret;
4264 register regnode *ptr;
4268 SIZE_ALIGN(RExC_size);
4273 NODE_ALIGN_FILL(ret);
4275 FILL_ADVANCE_NODE(ptr, op);
4276 if (RExC_offsets) { /* MJD */
4277 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4278 "reg_node", __LINE__,
4280 RExC_emit - RExC_emit_start > RExC_offsets[0]
4281 ? "Overwriting end of array!\n" : "OK",
4282 RExC_emit - RExC_emit_start,
4283 RExC_parse - RExC_start,
4285 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4294 - reganode - emit a node with an argument
4296 STATIC regnode * /* Location. */
4297 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4299 register regnode *ret;
4300 register regnode *ptr;
4304 SIZE_ALIGN(RExC_size);
4309 NODE_ALIGN_FILL(ret);
4311 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4312 if (RExC_offsets) { /* MJD */
4313 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4315 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4316 "Overwriting end of array!\n" : "OK",
4317 RExC_emit - RExC_emit_start,
4318 RExC_parse - RExC_start,
4320 Set_Cur_Node_Offset;
4329 - reguni - emit (if appropriate) a Unicode character
4332 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4334 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4338 - reginsert - insert an operator in front of already-emitted operand
4340 * Means relocating the operand.
4343 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4345 register regnode *src;
4346 register regnode *dst;
4347 register regnode *place;
4348 register int offset = regarglen[(U8)op];
4350 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4353 RExC_size += NODE_STEP_REGNODE + offset;
4358 RExC_emit += NODE_STEP_REGNODE + offset;
4360 while (src > opnd) {
4361 StructCopy(--src, --dst, regnode);
4362 if (RExC_offsets) { /* MJD 20010112 */
4363 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4365 dst - RExC_emit_start > RExC_offsets[0]
4366 ? "Overwriting end of array!\n" : "OK",
4367 src - RExC_emit_start,
4368 dst - RExC_emit_start,
4370 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4371 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4376 place = opnd; /* Op node, where operand used to be. */
4377 if (RExC_offsets) { /* MJD */
4378 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4380 place - RExC_emit_start > RExC_offsets[0]
4381 ? "Overwriting end of array!\n" : "OK",
4382 place - RExC_emit_start,
4383 RExC_parse - RExC_start,
4385 Set_Node_Offset(place, RExC_parse);
4387 src = NEXTOPER(place);
4388 FILL_ADVANCE_NODE(place, op);
4389 Zero(src, offset, regnode);
4393 - regtail - set the next-pointer at the end of a node chain of p to val.
4396 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4398 register regnode *scan;
4399 register regnode *temp;
4404 /* Find last node. */
4407 temp = regnext(scan);
4413 if (reg_off_by_arg[OP(scan)]) {
4414 ARG_SET(scan, val - scan);
4417 NEXT_OFF(scan) = val - scan;
4422 - regoptail - regtail on operand of first argument; nop if operandless
4425 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4427 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4428 if (p == NULL || SIZE_ONLY)
4430 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4431 regtail(pRExC_state, NEXTOPER(p), val);
4433 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4434 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4441 - regcurly - a little FSA that accepts {\d+,?\d*}
4444 S_regcurly(pTHX_ register char *s)
4465 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4467 register U8 op = EXACT; /* Arbitrary non-END op. */
4468 register regnode *next;
4470 while (op != END && (!last || node < last)) {
4471 /* While that wasn't END last time... */
4477 next = regnext(node);
4479 if (OP(node) == OPTIMIZED)
4482 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4483 (int)(2*l + 1), "", SvPVX(sv));
4484 if (next == NULL) /* Next ptr. */
4485 PerlIO_printf(Perl_debug_log, "(0)");
4487 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4488 (void)PerlIO_putc(Perl_debug_log, '\n');
4490 if (PL_regkind[(U8)op] == BRANCHJ) {
4491 register regnode *nnode = (OP(next) == LONGJMP
4494 if (last && nnode > last)
4496 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4498 else if (PL_regkind[(U8)op] == BRANCH) {
4499 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4501 else if ( op == CURLY) { /* `next' might be very big: optimizer */
4502 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4503 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4505 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4506 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4509 else if ( op == PLUS || op == STAR) {
4510 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4512 else if (op == ANYOF) {
4513 /* arglen 1 + class block */
4514 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4515 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4516 node = NEXTOPER(node);
4518 else if (PL_regkind[(U8)op] == EXACT) {
4519 /* Literal string, where present. */
4520 node += NODE_SZ_STR(node) - 1;
4521 node = NEXTOPER(node);
4524 node = NEXTOPER(node);
4525 node += regarglen[(U8)op];
4527 if (op == CURLYX || op == OPEN)
4529 else if (op == WHILEM)
4535 #endif /* DEBUGGING */
4538 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4541 Perl_regdump(pTHX_ regexp *r)
4544 SV *sv = sv_newmortal();
4546 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4548 /* Header fields of interest. */
4549 if (r->anchored_substr)
4550 PerlIO_printf(Perl_debug_log,
4551 "anchored `%s%.*s%s'%s at %"IVdf" ",
4553 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4554 SvPVX(r->anchored_substr),
4556 SvTAIL(r->anchored_substr) ? "$" : "",
4557 (IV)r->anchored_offset);
4558 else if (r->anchored_utf8)
4559 PerlIO_printf(Perl_debug_log,
4560 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4562 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4563 SvPVX(r->anchored_utf8),
4565 SvTAIL(r->anchored_utf8) ? "$" : "",
4566 (IV)r->anchored_offset);
4567 if (r->float_substr)
4568 PerlIO_printf(Perl_debug_log,
4569 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4571 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4572 SvPVX(r->float_substr),
4574 SvTAIL(r->float_substr) ? "$" : "",
4575 (IV)r->float_min_offset, (UV)r->float_max_offset);
4576 else if (r->float_utf8)
4577 PerlIO_printf(Perl_debug_log,
4578 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4580 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4581 SvPVX(r->float_utf8),
4583 SvTAIL(r->float_utf8) ? "$" : "",
4584 (IV)r->float_min_offset, (UV)r->float_max_offset);
4585 if (r->check_substr || r->check_utf8)
4586 PerlIO_printf(Perl_debug_log,
4587 r->check_substr == r->float_substr
4588 && r->check_utf8 == r->float_utf8
4589 ? "(checking floating" : "(checking anchored");
4590 if (r->reganch & ROPT_NOSCAN)
4591 PerlIO_printf(Perl_debug_log, " noscan");
4592 if (r->reganch & ROPT_CHECK_ALL)
4593 PerlIO_printf(Perl_debug_log, " isall");
4594 if (r->check_substr || r->check_utf8)
4595 PerlIO_printf(Perl_debug_log, ") ");
4597 if (r->regstclass) {
4598 regprop(sv, r->regstclass);
4599 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4601 if (r->reganch & ROPT_ANCH) {
4602 PerlIO_printf(Perl_debug_log, "anchored");
4603 if (r->reganch & ROPT_ANCH_BOL)
4604 PerlIO_printf(Perl_debug_log, "(BOL)");
4605 if (r->reganch & ROPT_ANCH_MBOL)
4606 PerlIO_printf(Perl_debug_log, "(MBOL)");
4607 if (r->reganch & ROPT_ANCH_SBOL)
4608 PerlIO_printf(Perl_debug_log, "(SBOL)");
4609 if (r->reganch & ROPT_ANCH_GPOS)
4610 PerlIO_printf(Perl_debug_log, "(GPOS)");
4611 PerlIO_putc(Perl_debug_log, ' ');
4613 if (r->reganch & ROPT_GPOS_SEEN)
4614 PerlIO_printf(Perl_debug_log, "GPOS ");
4615 if (r->reganch & ROPT_SKIP)
4616 PerlIO_printf(Perl_debug_log, "plus ");
4617 if (r->reganch & ROPT_IMPLICIT)
4618 PerlIO_printf(Perl_debug_log, "implicit ");
4619 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4620 if (r->reganch & ROPT_EVAL_SEEN)
4621 PerlIO_printf(Perl_debug_log, "with eval ");
4622 PerlIO_printf(Perl_debug_log, "\n");
4625 U32 len = r->offsets[0];
4626 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4627 for (i = 1; i <= len; i++)
4628 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4629 (UV)r->offsets[i*2-1],
4630 (UV)r->offsets[i*2]);
4631 PerlIO_printf(Perl_debug_log, "\n");
4633 #endif /* DEBUGGING */
4639 S_put_byte(pTHX_ SV *sv, int c)
4641 if (isCNTRL(c) || c == 255 || !isPRINT(c))
4642 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4643 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4644 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4646 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4649 #endif /* DEBUGGING */
4652 - regprop - printable representation of opcode
4655 Perl_regprop(pTHX_ SV *sv, regnode *o)
4660 sv_setpvn(sv, "", 0);
4661 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4662 /* It would be nice to FAIL() here, but this may be called from
4663 regexec.c, and it would be hard to supply pRExC_state. */
4664 Perl_croak(aTHX_ "Corrupted regexp opcode");
4665 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4667 k = PL_regkind[(U8)OP(o)];
4670 SV *dsv = sv_2mortal(newSVpvn("", 0));
4671 /* Using is_utf8_string() is a crude hack but it may
4672 * be the best for now since we have no flag "this EXACTish
4673 * node was UTF-8" --jhi */
4674 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4676 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4677 UNI_DISPLAY_REGEX) :
4682 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4687 else if (k == CURLY) {
4688 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4689 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4690 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4692 else if (k == WHILEM && o->flags) /* Ordinal/of */
4693 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4694 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4695 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4696 else if (k == LOGICAL)
4697 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4698 else if (k == ANYOF) {
4699 int i, rangestart = -1;
4700 U8 flags = ANYOF_FLAGS(o);
4701 const char * const anyofs[] = { /* Should be syncronized with
4702 * ANYOF_ #xdefines in regcomp.h */
4735 if (flags & ANYOF_LOCALE)
4736 sv_catpv(sv, "{loc}");
4737 if (flags & ANYOF_FOLD)
4738 sv_catpv(sv, "{i}");
4739 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4740 if (flags & ANYOF_INVERT)
4742 for (i = 0; i <= 256; i++) {
4743 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4744 if (rangestart == -1)
4746 } else if (rangestart != -1) {
4747 if (i <= rangestart + 3)
4748 for (; rangestart < i; rangestart++)
4749 put_byte(sv, rangestart);
4751 put_byte(sv, rangestart);
4753 put_byte(sv, i - 1);
4759 if (o->flags & ANYOF_CLASS)
4760 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4761 if (ANYOF_CLASS_TEST(o,i))
4762 sv_catpv(sv, anyofs[i]);
4764 if (flags & ANYOF_UNICODE)
4765 sv_catpv(sv, "{unicode}");
4766 else if (flags & ANYOF_UNICODE_ALL)
4767 sv_catpv(sv, "{unicode_all}");
4771 SV *sw = regclass_swash(o, FALSE, &lv, 0);
4776 U8 s[UTF8_MAXLEN+1];
4778 for (i = 0; i <= 256; i++) { /* just the first 256 */
4779 U8 *e = uvchr_to_utf8(s, i);
4781 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4782 if (rangestart == -1)
4784 } else if (rangestart != -1) {
4787 if (i <= rangestart + 3)
4788 for (; rangestart < i; rangestart++) {
4789 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4793 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4796 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4803 sv_catpv(sv, "..."); /* et cetera */
4807 char *s = savepv(SvPVX(lv));
4810 while(*s && *s != '\n') s++;
4831 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4833 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4834 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4835 #endif /* DEBUGGING */
4839 Perl_re_intuit_string(pTHX_ regexp *prog)
4840 { /* Assume that RE_INTUIT is set */
4843 char *s = SvPV(prog->check_substr
4844 ? prog->check_substr : prog->check_utf8, n_a);
4846 if (!PL_colorset) reginitcolors();
4847 PerlIO_printf(Perl_debug_log,
4848 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4850 prog->check_substr ? "" : "utf8 ",
4851 PL_colors[5],PL_colors[0],
4854 (strlen(s) > 60 ? "..." : ""));
4857 return prog->check_substr ? prog->check_substr : prog->check_utf8;
4861 Perl_pregfree(pTHX_ struct regexp *r)
4864 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4867 if (!r || (--r->refcnt > 0))
4870 char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60,
4872 int len = SvCUR(dsv);
4875 PerlIO_printf(Perl_debug_log,
4876 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4877 PL_colors[4],PL_colors[5],PL_colors[0],
4880 len > 60 ? "..." : "");
4884 Safefree(r->precomp);
4885 if (r->offsets) /* 20010421 MJD */
4886 Safefree(r->offsets);
4887 if (RX_MATCH_COPIED(r))
4888 Safefree(r->subbeg);
4890 if (r->anchored_substr)
4891 SvREFCNT_dec(r->anchored_substr);
4892 if (r->anchored_utf8)
4893 SvREFCNT_dec(r->anchored_utf8);
4894 if (r->float_substr)
4895 SvREFCNT_dec(r->float_substr);
4897 SvREFCNT_dec(r->float_utf8);
4898 Safefree(r->substrs);
4901 int n = r->data->count;
4902 AV* new_comppad = NULL;
4907 /* If you add a ->what type here, update the comment in regcomp.h */
4908 switch (r->data->what[n]) {
4910 SvREFCNT_dec((SV*)r->data->data[n]);
4913 Safefree(r->data->data[n]);
4916 new_comppad = (AV*)r->data->data[n];
4919 if (new_comppad == NULL)
4920 Perl_croak(aTHX_ "panic: pregfree comppad");
4921 old_comppad = PL_comppad;
4922 old_curpad = PL_curpad;
4923 /* Watch out for global destruction's random ordering. */
4924 if (SvTYPE(new_comppad) == SVt_PVAV) {
4925 PL_comppad = new_comppad;
4926 PL_curpad = AvARRAY(new_comppad);
4931 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4932 op_free((OP_4tree*)r->data->data[n]);
4935 PL_comppad = old_comppad;
4936 PL_curpad = old_curpad;
4937 SvREFCNT_dec((SV*)new_comppad);
4943 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4946 Safefree(r->data->what);
4949 Safefree(r->startp);
4955 - regnext - dig the "next" pointer out of a node
4957 * [Note, when REGALIGN is defined there are two places in regmatch()
4958 * that bypass this code for speed.]
4961 Perl_regnext(pTHX_ register regnode *p)
4963 register I32 offset;
4965 if (p == &PL_regdummy)
4968 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4976 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
4979 STRLEN l1 = strlen(pat1);
4980 STRLEN l2 = strlen(pat2);
4989 Copy(pat1, buf, l1 , char);
4990 Copy(pat2, buf + l1, l2 , char);
4991 buf[l1 + l2] = '\n';
4992 buf[l1 + l2 + 1] = '\0';
4994 /* ANSI variant takes additional second argument */
4995 va_start(args, pat2);
4999 msv = vmess(buf, &args);
5001 message = SvPV(msv,l1);
5004 Copy(message, buf, l1 , char);
5005 buf[l1] = '\0'; /* Overwrite \n */
5006 Perl_croak(aTHX_ "%s", buf);
5009 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5012 Perl_save_re_context(pTHX)
5015 SAVEPPTR(RExC_precomp); /* uncompiled string. */
5016 SAVEI32(RExC_npar); /* () count. */
5017 SAVEI32(RExC_size); /* Code size. */
5018 SAVEI16(RExC_flags16); /* are we folding, multilining? */
5019 SAVEVPTR(RExC_rx); /* from regcomp.c */
5020 SAVEI32(RExC_seen); /* from regcomp.c */
5021 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
5022 SAVEI32(RExC_naughty); /* How bad is this pattern? */
5023 SAVEVPTR(RExC_emit); /* Code-emit pointer; ®dummy = don't */
5024 SAVEPPTR(RExC_end); /* End of input for compile */
5025 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
5028 SAVEI32(PL_reg_flags); /* from regexec.c */
5030 SAVEPPTR(PL_reginput); /* String-input pointer. */
5031 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5032 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
5033 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5034 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5035 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
5036 SAVEPPTR(PL_regtill); /* How far we are required to go. */
5037 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
5038 PL_reg_start_tmp = 0;
5039 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5040 PL_reg_start_tmpl = 0;
5041 SAVEVPTR(PL_regdata);
5042 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5043 SAVEI32(PL_regnarrate); /* from regexec.c */
5044 SAVEVPTR(PL_regprogram); /* from regexec.c */
5045 SAVEINT(PL_regindent); /* from regexec.c */
5046 SAVEVPTR(PL_regcc); /* from regexec.c */
5047 SAVEVPTR(PL_curcop);
5048 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5049 SAVEVPTR(PL_reg_re); /* from regexec.c */
5050 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5051 SAVESPTR(PL_reg_sv); /* from regexec.c */
5052 SAVEI8(PL_reg_match_utf8); /* from regexec.c */
5053 SAVEVPTR(PL_reg_magic); /* from regexec.c */
5054 SAVEI32(PL_reg_oldpos); /* from regexec.c */
5055 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5056 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5057 SAVEI32(PL_regnpar); /* () count. */
5058 SAVEI32(PL_regsize); /* from regexec.c */
5060 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
5065 clear_re(pTHX_ void *r)
5067 ReREFCNT_dec((regexp *)r);