5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_pregcomp my_regcomp
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_pregfree my_regfree
39 # define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 # define Perl_regnext my_regnext
42 # define Perl_save_re_context my_save_re_context
43 # define Perl_reginitcolors my_reginitcolors
45 # define PERL_NO_GET_CONTEXT
50 * pregcomp and pregexec -- regsub and regerror are not used in perl
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (c) 1991-2002, Larry Wall
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
83 #define PERL_IN_REGCOMP_C
86 #ifndef PERL_IN_XSUB_RE
98 # if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
107 #define STATIC static
110 typedef struct RExC_state_t {
111 U16 flags16; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
114 char *start; /* Start of input for compile */
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
118 regnode *emit_start; /* Start of emitted-code area */
119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
130 char *starttry; /* -Dr: where regtry was called. */
131 #define RExC_starttry (pRExC_state->starttry)
135 #define RExC_flags16 (pRExC_state->flags16)
136 #define RExC_precomp (pRExC_state->precomp)
137 #define RExC_rx (pRExC_state->rx)
138 #define RExC_start (pRExC_state->start)
139 #define RExC_end (pRExC_state->end)
140 #define RExC_parse (pRExC_state->parse)
141 #define RExC_whilem_seen (pRExC_state->whilem_seen)
142 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty (pRExC_state->naughty)
146 #define RExC_sawback (pRExC_state->sawback)
147 #define RExC_seen (pRExC_state->seen)
148 #define RExC_size (pRExC_state->size)
149 #define RExC_npar (pRExC_state->npar)
150 #define RExC_extralen (pRExC_state->extralen)
151 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8 (pRExC_state->utf8)
155 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
160 #undef SPSTART /* dratted cpp namespace... */
163 * Flags to be passed up and down.
165 #define WORST 0 /* Worst case. */
166 #define HASWIDTH 0x1 /* Known to match non-null strings. */
167 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART 0x4 /* Starts with * or +. */
169 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
171 /* Length of a variant. */
173 typedef struct scan_data_t {
179 I32 last_end; /* min value, <0 unless valid. */
182 SV **longest; /* Either &l_fixed, or &l_float. */
186 I32 offset_float_min;
187 I32 offset_float_max;
191 struct regnode_charclass_class *start_class;
195 * Forward declarations for pregcomp()'s friends.
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL 0x1
203 #define SF_BEFORE_MEOL 0x2
204 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
208 # define SF_FIX_SHIFT_EOL (0+2)
209 # define SF_FL_SHIFT_EOL (0+4)
211 # define SF_FIX_SHIFT_EOL (+2)
212 # define SF_FL_SHIFT_EOL (+4)
215 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
218 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF 0x40
221 #define SF_HAS_PAR 0x80
222 #define SF_IN_PAR 0x100
223 #define SF_HAS_EVAL 0x200
224 #define SCF_DO_SUBSTR 0x400
225 #define SCF_DO_STCLASS_AND 0x0800
226 #define SCF_DO_STCLASS_OR 0x1000
227 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS 0x2000
230 #define UTF RExC_utf8
231 #define LOC (RExC_flags16 & PMf_LOCALE)
232 #define FOLD (RExC_flags16 & PMf_FOLD)
234 #define OOB_UNICODE 12345678
235 #define OOB_NAMEDCLASS -1
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
249 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
261 char *ellipses = ""; \
262 IV len = RExC_end - RExC_precomp; \
265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
273 msg, (int)len, RExC_precomp, ellipses); \
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
281 #define FAIL2(pat,msg) \
283 char *ellipses = ""; \
284 IV len = RExC_end - RExC_precomp; \
287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
295 msg, (int)len, RExC_precomp, ellipses); \
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
302 #define Simple_vFAIL(m) \
304 IV offset = RExC_parse - RExC_precomp; \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
321 * Like Simple_vFAIL(), but accepts two arguments.
323 #define Simple_vFAIL2(m,a1) \
325 IV offset = RExC_parse - RExC_precomp; \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
328 (int)offset, RExC_precomp, RExC_precomp + offset); \
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
334 #define vFAIL2(m,a1) \
337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
338 Simple_vFAIL2(m, a1); \
343 * Like Simple_vFAIL(), but accepts three arguments.
345 #define Simple_vFAIL3(m, a1, a2) \
347 IV offset = RExC_parse - RExC_precomp; \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
350 (int)offset, RExC_precomp, RExC_precomp + offset); \
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
356 #define vFAIL3(m,a1,a2) \
359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
360 Simple_vFAIL3(m, a1, a2); \
364 * Like Simple_vFAIL(), but accepts four arguments.
366 #define Simple_vFAIL4(m, a1, a2, a3) \
368 IV offset = RExC_parse - RExC_precomp; \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
371 (int)offset, RExC_precomp, RExC_precomp + offset); \
375 * Like Simple_vFAIL(), but accepts five arguments.
377 #define Simple_vFAIL5(m, a1, a2, a3, a4) \
379 IV offset = RExC_parse - RExC_precomp; \
380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
381 (int)offset, RExC_precomp, RExC_precomp + offset); \
385 #define vWARN(loc,m) \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\
389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
392 #define vWARNdep(loc,m) \
394 IV offset = loc - RExC_precomp; \
395 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\
396 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
400 #define vWARN2(loc, m, a1) \
402 IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
405 (int)offset, RExC_precomp, RExC_precomp + offset); \
408 #define vWARN3(loc, m, a1, a2) \
410 IV offset = loc - RExC_precomp; \
411 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
413 (int)offset, RExC_precomp, RExC_precomp + offset); \
416 #define vWARN4(loc, m, a1, a2, a3) \
418 IV offset = loc - RExC_precomp; \
419 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
421 (int)offset, RExC_precomp, RExC_precomp + offset); \
424 /* used for the parse_flags section for (?c) -- japhy */
425 #define vWARN5(loc, m, a1, a2, a3, a4) \
427 IV offset = loc - RExC_precomp; \
428 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
430 (int)offset, RExC_precomp, RExC_precomp + offset); \
434 /* Allow for side effects in s */
435 #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
437 /* Macros for recording node offsets. 20001227 mjd@plover.com
438 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
439 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
440 * Element 0 holds the number n.
443 #define MJD_OFFSET_DEBUG(x)
444 /* #define MJD_OFFSET_DEBUG(x) fprintf x */
447 # define Set_Node_Offset_To_R(node,byte) \
451 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
453 RExC_offsets[2*(node)-1] = (byte); \
458 # define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
459 # define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
461 # define Set_Node_Length_To_R(node,len) \
464 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
466 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
468 RExC_offsets[2*(node)] = (len); \
473 # define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
474 # define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
475 # define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
477 /* Get offsets and lengths */
478 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
479 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
481 static void clear_re(pTHX_ void *r);
483 /* Mark that we cannot extend a found fixed substring at this point.
484 Updata the longest found anchored substring and the longest found
485 floating substrings if needed. */
488 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
490 STRLEN l = CHR_SVLEN(data->last_found);
491 STRLEN old_l = CHR_SVLEN(*data->longest);
493 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
494 sv_setsv(*data->longest, data->last_found);
495 if (*data->longest == data->longest_fixed) {
496 data->offset_fixed = l ? data->last_start_min : data->pos_min;
497 if (data->flags & SF_BEFORE_EOL)
499 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
501 data->flags &= ~SF_FIX_BEFORE_EOL;
504 data->offset_float_min = l ? data->last_start_min : data->pos_min;
505 data->offset_float_max = (l
506 ? data->last_start_max
507 : data->pos_min + data->pos_delta);
508 if (data->flags & SF_BEFORE_EOL)
510 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
512 data->flags &= ~SF_FL_BEFORE_EOL;
515 SvCUR_set(data->last_found, 0);
517 data->flags &= ~SF_BEFORE_EOL;
520 /* Can match anything (initialization) */
522 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 ANYOF_CLASS_ZERO(cl);
525 ANYOF_BITMAP_SETALL(cl);
526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
528 cl->flags |= ANYOF_LOCALE;
531 /* Can match anything (initialization) */
533 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
537 for (value = 0; value <= ANYOF_MAX; value += 2)
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
547 /* Can match anything (initialization) */
549 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
551 Zero(cl, 1, struct regnode_charclass_class);
553 cl_anything(pRExC_state, cl);
557 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
559 Zero(cl, 1, struct regnode_charclass_class);
561 cl_anything(pRExC_state, cl);
563 cl->flags |= ANYOF_LOCALE;
566 /* 'And' a given class with another one. Can create false positives */
567 /* We assume that cl is not inverted */
569 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
590 cl->flags &= ~ANYOF_UNICODE_ALL;
591 cl->flags |= ANYOF_UNICODE;
592 ARG_SET(cl, ARG(and_with));
594 if (!(and_with->flags & ANYOF_UNICODE_ALL))
595 cl->flags &= ~ANYOF_UNICODE_ALL;
596 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
597 cl->flags &= ~ANYOF_UNICODE;
600 /* 'OR' a given class with another one. Can create false positives */
601 /* We assume that cl is not inverted */
603 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
605 if (or_with->flags & ANYOF_INVERT) {
607 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
608 * <= (B1 | !B2) | (CL1 | !CL2)
609 * which is wasteful if CL2 is small, but we ignore CL2:
610 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
611 * XXXX Can we handle case-fold? Unclear:
612 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
613 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
615 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
616 && !(or_with->flags & ANYOF_FOLD)
617 && !(cl->flags & ANYOF_FOLD) ) {
620 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
621 cl->bitmap[i] |= ~or_with->bitmap[i];
622 } /* XXXX: logic is complicated otherwise */
624 cl_anything(pRExC_state, cl);
627 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
628 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
629 && (!(or_with->flags & ANYOF_FOLD)
630 || (cl->flags & ANYOF_FOLD)) ) {
633 /* OR char bitmap and class bitmap separately */
634 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
635 cl->bitmap[i] |= or_with->bitmap[i];
636 if (or_with->flags & ANYOF_CLASS) {
637 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
638 cl->classflags[i] |= or_with->classflags[i];
639 cl->flags |= ANYOF_CLASS;
642 else { /* XXXX: logic is complicated, leave it along for a moment. */
643 cl_anything(pRExC_state, cl);
646 if (or_with->flags & ANYOF_EOS)
647 cl->flags |= ANYOF_EOS;
649 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
650 ARG(cl) != ARG(or_with)) {
651 cl->flags |= ANYOF_UNICODE_ALL;
652 cl->flags &= ~ANYOF_UNICODE;
654 if (or_with->flags & ANYOF_UNICODE_ALL) {
655 cl->flags |= ANYOF_UNICODE_ALL;
656 cl->flags &= ~ANYOF_UNICODE;
661 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
662 * These need to be revisited when a newer toolchain becomes available.
664 #if defined(__sparc64__) && defined(__GNUC__)
665 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
666 # undef SPARC64_GCC_WORKAROUND
667 # define SPARC64_GCC_WORKAROUND 1
671 /* REx optimizer. Converts nodes into quickier variants "in place".
672 Finds fixed substrings. */
674 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
675 to the position after last scanned or to NULL. */
678 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
679 /* scanp: Start here (read-write). */
680 /* deltap: Write maxlen-minlen here. */
681 /* last: Stop before this one. */
683 I32 min = 0, pars = 0, code;
684 regnode *scan = *scanp, *next;
686 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
687 int is_inf_internal = 0; /* The studied chunk is infinite */
688 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
689 scan_data_t data_fake;
690 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
692 while (scan && OP(scan) != END && scan < last) {
693 /* Peephole optimizer: */
695 if (PL_regkind[(U8)OP(scan)] == EXACT) {
696 /* Merge several consecutive EXACTish nodes into one. */
697 regnode *n = regnext(scan);
700 regnode *stop = scan;
703 next = scan + NODE_SZ_STR(scan);
704 /* Skip NOTHING, merge EXACT*. */
706 ( PL_regkind[(U8)OP(n)] == NOTHING ||
707 (stringok && (OP(n) == OP(scan))))
709 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
710 if (OP(n) == TAIL || n > next)
712 if (PL_regkind[(U8)OP(n)] == NOTHING) {
713 NEXT_OFF(scan) += NEXT_OFF(n);
714 next = n + NODE_STEP_REGNODE;
722 int oldl = STR_LEN(scan);
723 regnode *nnext = regnext(n);
725 if (oldl + STR_LEN(n) > U8_MAX)
727 NEXT_OFF(scan) += NEXT_OFF(n);
728 STR_LEN(scan) += STR_LEN(n);
729 next = n + NODE_SZ_STR(n);
730 /* Now we can overwrite *n : */
731 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
739 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
741 Two problematic code points in Unicode casefolding of EXACT nodes:
743 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
744 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
750 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
751 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
753 This means that in case-insensitive matching (or "loose matching",
754 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
755 length of the above casefolded versions) can match a target string
756 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
757 This would rather mess up the minimum length computation.
759 What we'll do is to look for the tail four bytes, and then peek
760 at the preceding two bytes to see whether we need to decrease
761 the minimum length by four (six minus two).
763 Thanks to the design of UTF-8, there cannot be false matches:
764 A sequence of valid UTF-8 bytes cannot be a subsequence of
765 another valid sequence of UTF-8 bytes.
768 char *s0 = STRING(scan), *s, *t;
769 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
770 char *t0 = "\xcc\x88\xcc\x81";
774 s < s2 && (t = ninstr(s, s1, t0, t1));
776 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
777 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
784 n = scan + NODE_SZ_STR(scan);
786 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
794 /* Follow the next-chain of the current node and optimize
795 away all the NOTHINGs from it. */
796 if (OP(scan) != CURLYX) {
797 int max = (reg_off_by_arg[OP(scan)]
799 /* I32 may be smaller than U16 on CRAYs! */
800 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
801 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
805 /* Skip NOTHING and LONGJMP. */
806 while ((n = regnext(n))
807 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
808 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
811 if (reg_off_by_arg[OP(scan)])
814 NEXT_OFF(scan) = off;
816 /* The principal pseudo-switch. Cannot be a switch, since we
817 look into several different things. */
818 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
819 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
820 next = regnext(scan);
823 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
824 I32 max1 = 0, min1 = I32_MAX, num = 0;
825 struct regnode_charclass_class accum;
827 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
828 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
829 if (flags & SCF_DO_STCLASS)
830 cl_init_zero(pRExC_state, &accum);
831 while (OP(scan) == code) {
832 I32 deltanext, minnext, f = 0, fake;
833 struct regnode_charclass_class this_class;
838 data_fake.whilem_c = data->whilem_c;
839 data_fake.last_closep = data->last_closep;
842 data_fake.last_closep = &fake;
843 next = regnext(scan);
844 scan = NEXTOPER(scan);
846 scan = NEXTOPER(scan);
847 if (flags & SCF_DO_STCLASS) {
848 cl_init(pRExC_state, &this_class);
849 data_fake.start_class = &this_class;
850 f = SCF_DO_STCLASS_AND;
852 if (flags & SCF_WHILEM_VISITED_POS)
853 f |= SCF_WHILEM_VISITED_POS;
854 /* we suppose the run is continuous, last=next...*/
855 minnext = study_chunk(pRExC_state, &scan, &deltanext,
856 next, &data_fake, f);
859 if (max1 < minnext + deltanext)
860 max1 = minnext + deltanext;
861 if (deltanext == I32_MAX)
862 is_inf = is_inf_internal = 1;
864 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
866 if (data && (data_fake.flags & SF_HAS_EVAL))
867 data->flags |= SF_HAS_EVAL;
869 data->whilem_c = data_fake.whilem_c;
870 if (flags & SCF_DO_STCLASS)
871 cl_or(pRExC_state, &accum, &this_class);
875 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
877 if (flags & SCF_DO_SUBSTR) {
878 data->pos_min += min1;
879 data->pos_delta += max1 - min1;
880 if (max1 != min1 || is_inf)
881 data->longest = &(data->longest_float);
884 delta += max1 - min1;
885 if (flags & SCF_DO_STCLASS_OR) {
886 cl_or(pRExC_state, data->start_class, &accum);
888 cl_and(data->start_class, &and_with);
889 flags &= ~SCF_DO_STCLASS;
892 else if (flags & SCF_DO_STCLASS_AND) {
894 cl_and(data->start_class, &accum);
895 flags &= ~SCF_DO_STCLASS;
898 /* Switch to OR mode: cache the old value of
899 * data->start_class */
900 StructCopy(data->start_class, &and_with,
901 struct regnode_charclass_class);
902 flags &= ~SCF_DO_STCLASS_AND;
903 StructCopy(&accum, data->start_class,
904 struct regnode_charclass_class);
905 flags |= SCF_DO_STCLASS_OR;
906 data->start_class->flags |= ANYOF_EOS;
910 else if (code == BRANCHJ) /* single branch is optimized. */
911 scan = NEXTOPER(NEXTOPER(scan));
912 else /* single branch is optimized. */
913 scan = NEXTOPER(scan);
916 else if (OP(scan) == EXACT) {
917 I32 l = STR_LEN(scan);
918 UV uc = *((U8*)STRING(scan));
920 U8 *s = (U8*)STRING(scan);
921 l = utf8_length(s, s + l);
922 uc = utf8_to_uvchr(s, NULL);
925 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
926 /* The code below prefers earlier match for fixed
927 offset, later match for variable offset. */
928 if (data->last_end == -1) { /* Update the start info. */
929 data->last_start_min = data->pos_min;
930 data->last_start_max = is_inf
931 ? I32_MAX : data->pos_min + data->pos_delta;
933 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
935 SvUTF8_on(data->last_found);
936 data->last_end = data->pos_min + l;
937 data->pos_min += l; /* As in the first entry. */
938 data->flags &= ~SF_BEFORE_EOL;
940 if (flags & SCF_DO_STCLASS_AND) {
941 /* Check whether it is compatible with what we know already! */
945 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
946 && !ANYOF_BITMAP_TEST(data->start_class, uc)
947 && (!(data->start_class->flags & ANYOF_FOLD)
948 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
951 ANYOF_CLASS_ZERO(data->start_class);
952 ANYOF_BITMAP_ZERO(data->start_class);
954 ANYOF_BITMAP_SET(data->start_class, uc);
955 data->start_class->flags &= ~ANYOF_EOS;
957 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
959 else if (flags & SCF_DO_STCLASS_OR) {
960 /* false positive possible if the class is case-folded */
962 ANYOF_BITMAP_SET(data->start_class, uc);
964 data->start_class->flags |= ANYOF_UNICODE_ALL;
965 data->start_class->flags &= ~ANYOF_EOS;
966 cl_and(data->start_class, &and_with);
968 flags &= ~SCF_DO_STCLASS;
970 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
971 I32 l = STR_LEN(scan);
972 UV uc = *((U8*)STRING(scan));
974 /* Search for fixed substrings supports EXACT only. */
975 if (flags & SCF_DO_SUBSTR)
976 scan_commit(pRExC_state, data);
978 U8 *s = (U8 *)STRING(scan);
979 l = utf8_length(s, s + l);
980 uc = utf8_to_uvchr(s, NULL);
983 if (data && (flags & SCF_DO_SUBSTR))
985 if (flags & SCF_DO_STCLASS_AND) {
986 /* Check whether it is compatible with what we know already! */
990 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
991 && !ANYOF_BITMAP_TEST(data->start_class, uc)
992 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
994 ANYOF_CLASS_ZERO(data->start_class);
995 ANYOF_BITMAP_ZERO(data->start_class);
997 ANYOF_BITMAP_SET(data->start_class, uc);
998 data->start_class->flags &= ~ANYOF_EOS;
999 data->start_class->flags |= ANYOF_FOLD;
1000 if (OP(scan) == EXACTFL)
1001 data->start_class->flags |= ANYOF_LOCALE;
1004 else if (flags & SCF_DO_STCLASS_OR) {
1005 if (data->start_class->flags & ANYOF_FOLD) {
1006 /* false positive possible if the class is case-folded.
1007 Assume that the locale settings are the same... */
1009 ANYOF_BITMAP_SET(data->start_class, uc);
1010 data->start_class->flags &= ~ANYOF_EOS;
1012 cl_and(data->start_class, &and_with);
1014 flags &= ~SCF_DO_STCLASS;
1016 else if (strchr((char*)PL_varies,OP(scan))) {
1017 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1018 I32 f = flags, pos_before = 0;
1019 regnode *oscan = scan;
1020 struct regnode_charclass_class this_class;
1021 struct regnode_charclass_class *oclass = NULL;
1022 I32 next_is_eval = 0;
1024 switch (PL_regkind[(U8)OP(scan)]) {
1025 case WHILEM: /* End of (?:...)* . */
1026 scan = NEXTOPER(scan);
1029 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1030 next = NEXTOPER(scan);
1031 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1033 maxcount = REG_INFTY;
1034 next = regnext(scan);
1035 scan = NEXTOPER(scan);
1039 if (flags & SCF_DO_SUBSTR)
1044 if (flags & SCF_DO_STCLASS) {
1046 maxcount = REG_INFTY;
1047 next = regnext(scan);
1048 scan = NEXTOPER(scan);
1051 is_inf = is_inf_internal = 1;
1052 scan = regnext(scan);
1053 if (flags & SCF_DO_SUBSTR) {
1054 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1055 data->longest = &(data->longest_float);
1057 goto optimize_curly_tail;
1059 mincount = ARG1(scan);
1060 maxcount = ARG2(scan);
1061 next = regnext(scan);
1062 if (OP(scan) == CURLYX) {
1063 I32 lp = (data ? *(data->last_closep) : 0);
1065 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1067 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1068 next_is_eval = (OP(scan) == EVAL);
1070 if (flags & SCF_DO_SUBSTR) {
1071 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1072 pos_before = data->pos_min;
1076 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1078 data->flags |= SF_IS_INF;
1080 if (flags & SCF_DO_STCLASS) {
1081 cl_init(pRExC_state, &this_class);
1082 oclass = data->start_class;
1083 data->start_class = &this_class;
1084 f |= SCF_DO_STCLASS_AND;
1085 f &= ~SCF_DO_STCLASS_OR;
1087 /* These are the cases when once a subexpression
1088 fails at a particular position, it cannot succeed
1089 even after backtracking at the enclosing scope.
1091 XXXX what if minimal match and we are at the
1092 initial run of {n,m}? */
1093 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1094 f &= ~SCF_WHILEM_VISITED_POS;
1096 /* This will finish on WHILEM, setting scan, or on NULL: */
1097 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1099 ? (f & ~SCF_DO_SUBSTR) : f);
1101 if (flags & SCF_DO_STCLASS)
1102 data->start_class = oclass;
1103 if (mincount == 0 || minnext == 0) {
1104 if (flags & SCF_DO_STCLASS_OR) {
1105 cl_or(pRExC_state, data->start_class, &this_class);
1107 else if (flags & SCF_DO_STCLASS_AND) {
1108 /* Switch to OR mode: cache the old value of
1109 * data->start_class */
1110 StructCopy(data->start_class, &and_with,
1111 struct regnode_charclass_class);
1112 flags &= ~SCF_DO_STCLASS_AND;
1113 StructCopy(&this_class, data->start_class,
1114 struct regnode_charclass_class);
1115 flags |= SCF_DO_STCLASS_OR;
1116 data->start_class->flags |= ANYOF_EOS;
1118 } else { /* Non-zero len */
1119 if (flags & SCF_DO_STCLASS_OR) {
1120 cl_or(pRExC_state, data->start_class, &this_class);
1121 cl_and(data->start_class, &and_with);
1123 else if (flags & SCF_DO_STCLASS_AND)
1124 cl_and(data->start_class, &this_class);
1125 flags &= ~SCF_DO_STCLASS;
1127 if (!scan) /* It was not CURLYX, but CURLY. */
1129 if (ckWARN(WARN_REGEXP)
1130 /* ? quantifier ok, except for (?{ ... }) */
1131 && (next_is_eval || !(mincount == 0 && maxcount == 1))
1132 && (minnext == 0) && (deltanext == 0)
1133 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1134 && maxcount <= REG_INFTY/3) /* Complement check for big count */
1137 "Quantifier unexpected on zero-length expression");
1140 min += minnext * mincount;
1141 is_inf_internal |= ((maxcount == REG_INFTY
1142 && (minnext + deltanext) > 0)
1143 || deltanext == I32_MAX);
1144 is_inf |= is_inf_internal;
1145 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1147 /* Try powerful optimization CURLYX => CURLYN. */
1148 if ( OP(oscan) == CURLYX && data
1149 && data->flags & SF_IN_PAR
1150 && !(data->flags & SF_HAS_EVAL)
1151 && !deltanext && minnext == 1 ) {
1152 /* Try to optimize to CURLYN. */
1153 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1154 regnode *nxt1 = nxt;
1161 if (!strchr((char*)PL_simple,OP(nxt))
1162 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1163 && STR_LEN(nxt) == 1))
1169 if (OP(nxt) != CLOSE)
1171 /* Now we know that nxt2 is the only contents: */
1172 oscan->flags = ARG(nxt);
1174 OP(nxt1) = NOTHING; /* was OPEN. */
1176 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1177 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1178 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1179 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1180 OP(nxt + 1) = OPTIMIZED; /* was count. */
1181 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1186 /* Try optimization CURLYX => CURLYM. */
1187 if ( OP(oscan) == CURLYX && data
1188 && !(data->flags & SF_HAS_PAR)
1189 && !(data->flags & SF_HAS_EVAL)
1191 /* XXXX How to optimize if data == 0? */
1192 /* Optimize to a simpler form. */
1193 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1197 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1198 && (OP(nxt2) != WHILEM))
1200 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1201 /* Need to optimize away parenths. */
1202 if (data->flags & SF_IN_PAR) {
1203 /* Set the parenth number. */
1204 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1206 if (OP(nxt) != CLOSE)
1207 FAIL("Panic opt close");
1208 oscan->flags = ARG(nxt);
1209 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1210 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1212 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1213 OP(nxt + 1) = OPTIMIZED; /* was count. */
1214 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1215 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1218 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1219 regnode *nnxt = regnext(nxt1);
1222 if (reg_off_by_arg[OP(nxt1)])
1223 ARG_SET(nxt1, nxt2 - nxt1);
1224 else if (nxt2 - nxt1 < U16_MAX)
1225 NEXT_OFF(nxt1) = nxt2 - nxt1;
1227 OP(nxt) = NOTHING; /* Cannot beautify */
1232 /* Optimize again: */
1233 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1239 else if ((OP(oscan) == CURLYX)
1240 && (flags & SCF_WHILEM_VISITED_POS)
1241 /* See the comment on a similar expression above.
1242 However, this time it not a subexpression
1243 we care about, but the expression itself. */
1244 && (maxcount == REG_INFTY)
1245 && data && ++data->whilem_c < 16) {
1246 /* This stays as CURLYX, we can put the count/of pair. */
1247 /* Find WHILEM (as in regexec.c) */
1248 regnode *nxt = oscan + NEXT_OFF(oscan);
1250 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1252 PREVOPER(nxt)->flags = data->whilem_c
1253 | (RExC_whilem_seen << 4); /* On WHILEM */
1255 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1257 if (flags & SCF_DO_SUBSTR) {
1258 SV *last_str = Nullsv;
1259 int counted = mincount != 0;
1261 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1262 #if defined(SPARC64_GCC_WORKAROUND)
1268 if (pos_before >= data->last_start_min)
1271 b = data->last_start_min;
1274 s = SvPV(data->last_found, l);
1275 old = b - data->last_start_min;
1278 I32 b = pos_before >= data->last_start_min
1279 ? pos_before : data->last_start_min;
1281 char *s = SvPV(data->last_found, l);
1282 I32 old = b - data->last_start_min;
1286 old = utf8_hop((U8*)s, old) - (U8*)s;
1289 /* Get the added string: */
1290 last_str = newSVpvn(s + old, l);
1291 if (deltanext == 0 && pos_before == b) {
1292 /* What was added is a constant string */
1294 SvGROW(last_str, (mincount * l) + 1);
1295 repeatcpy(SvPVX(last_str) + l,
1296 SvPVX(last_str), l, mincount - 1);
1297 SvCUR(last_str) *= mincount;
1298 /* Add additional parts. */
1299 SvCUR_set(data->last_found,
1300 SvCUR(data->last_found) - l);
1301 sv_catsv(data->last_found, last_str);
1302 data->last_end += l * (mincount - 1);
1305 /* start offset must point into the last copy */
1306 data->last_start_min += minnext * (mincount - 1);
1307 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1308 * (minnext + data->pos_delta);
1311 /* It is counted once already... */
1312 data->pos_min += minnext * (mincount - counted);
1313 data->pos_delta += - counted * deltanext +
1314 (minnext + deltanext) * maxcount - minnext * mincount;
1315 if (mincount != maxcount) {
1316 /* Cannot extend fixed substrings found inside
1318 scan_commit(pRExC_state,data);
1319 if (mincount && last_str) {
1320 sv_setsv(data->last_found, last_str);
1321 data->last_end = data->pos_min;
1322 data->last_start_min =
1323 data->pos_min - CHR_SVLEN(last_str);
1324 data->last_start_max = is_inf
1326 : data->pos_min + data->pos_delta
1327 - CHR_SVLEN(last_str);
1329 data->longest = &(data->longest_float);
1331 SvREFCNT_dec(last_str);
1333 if (data && (fl & SF_HAS_EVAL))
1334 data->flags |= SF_HAS_EVAL;
1335 optimize_curly_tail:
1336 if (OP(oscan) != CURLYX) {
1337 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1339 NEXT_OFF(oscan) += NEXT_OFF(next);
1342 default: /* REF and CLUMP only? */
1343 if (flags & SCF_DO_SUBSTR) {
1344 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1345 data->longest = &(data->longest_float);
1347 is_inf = is_inf_internal = 1;
1348 if (flags & SCF_DO_STCLASS_OR)
1349 cl_anything(pRExC_state, data->start_class);
1350 flags &= ~SCF_DO_STCLASS;
1354 else if (strchr((char*)PL_simple,OP(scan))) {
1357 if (flags & SCF_DO_SUBSTR) {
1358 scan_commit(pRExC_state,data);
1362 if (flags & SCF_DO_STCLASS) {
1363 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1365 /* Some of the logic below assumes that switching
1366 locale on will only add false positives. */
1367 switch (PL_regkind[(U8)OP(scan)]) {
1371 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1372 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1373 cl_anything(pRExC_state, data->start_class);
1376 if (OP(scan) == SANY)
1378 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1379 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1380 || (data->start_class->flags & ANYOF_CLASS));
1381 cl_anything(pRExC_state, data->start_class);
1383 if (flags & SCF_DO_STCLASS_AND || !value)
1384 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1387 if (flags & SCF_DO_STCLASS_AND)
1388 cl_and(data->start_class,
1389 (struct regnode_charclass_class*)scan);
1391 cl_or(pRExC_state, data->start_class,
1392 (struct regnode_charclass_class*)scan);
1395 if (flags & SCF_DO_STCLASS_AND) {
1396 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1397 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1398 for (value = 0; value < 256; value++)
1399 if (!isALNUM(value))
1400 ANYOF_BITMAP_CLEAR(data->start_class, value);
1404 if (data->start_class->flags & ANYOF_LOCALE)
1405 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1407 for (value = 0; value < 256; value++)
1409 ANYOF_BITMAP_SET(data->start_class, value);
1414 if (flags & SCF_DO_STCLASS_AND) {
1415 if (data->start_class->flags & ANYOF_LOCALE)
1416 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1419 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1420 data->start_class->flags |= ANYOF_LOCALE;
1424 if (flags & SCF_DO_STCLASS_AND) {
1425 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1426 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1427 for (value = 0; value < 256; value++)
1429 ANYOF_BITMAP_CLEAR(data->start_class, value);
1433 if (data->start_class->flags & ANYOF_LOCALE)
1434 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1436 for (value = 0; value < 256; value++)
1437 if (!isALNUM(value))
1438 ANYOF_BITMAP_SET(data->start_class, value);
1443 if (flags & SCF_DO_STCLASS_AND) {
1444 if (data->start_class->flags & ANYOF_LOCALE)
1445 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1448 data->start_class->flags |= ANYOF_LOCALE;
1449 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1453 if (flags & SCF_DO_STCLASS_AND) {
1454 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1455 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1456 for (value = 0; value < 256; value++)
1457 if (!isSPACE(value))
1458 ANYOF_BITMAP_CLEAR(data->start_class, value);
1462 if (data->start_class->flags & ANYOF_LOCALE)
1463 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1465 for (value = 0; value < 256; value++)
1467 ANYOF_BITMAP_SET(data->start_class, value);
1472 if (flags & SCF_DO_STCLASS_AND) {
1473 if (data->start_class->flags & ANYOF_LOCALE)
1474 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1477 data->start_class->flags |= ANYOF_LOCALE;
1478 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1482 if (flags & SCF_DO_STCLASS_AND) {
1483 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1484 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1485 for (value = 0; value < 256; value++)
1487 ANYOF_BITMAP_CLEAR(data->start_class, value);
1491 if (data->start_class->flags & ANYOF_LOCALE)
1492 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1494 for (value = 0; value < 256; value++)
1495 if (!isSPACE(value))
1496 ANYOF_BITMAP_SET(data->start_class, value);
1501 if (flags & SCF_DO_STCLASS_AND) {
1502 if (data->start_class->flags & ANYOF_LOCALE) {
1503 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1504 for (value = 0; value < 256; value++)
1505 if (!isSPACE(value))
1506 ANYOF_BITMAP_CLEAR(data->start_class, value);
1510 data->start_class->flags |= ANYOF_LOCALE;
1511 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1515 if (flags & SCF_DO_STCLASS_AND) {
1516 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1517 for (value = 0; value < 256; value++)
1518 if (!isDIGIT(value))
1519 ANYOF_BITMAP_CLEAR(data->start_class, value);
1522 if (data->start_class->flags & ANYOF_LOCALE)
1523 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1525 for (value = 0; value < 256; value++)
1527 ANYOF_BITMAP_SET(data->start_class, value);
1532 if (flags & SCF_DO_STCLASS_AND) {
1533 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1534 for (value = 0; value < 256; value++)
1536 ANYOF_BITMAP_CLEAR(data->start_class, value);
1539 if (data->start_class->flags & ANYOF_LOCALE)
1540 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1542 for (value = 0; value < 256; value++)
1543 if (!isDIGIT(value))
1544 ANYOF_BITMAP_SET(data->start_class, value);
1549 if (flags & SCF_DO_STCLASS_OR)
1550 cl_and(data->start_class, &and_with);
1551 flags &= ~SCF_DO_STCLASS;
1554 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1555 data->flags |= (OP(scan) == MEOL
1559 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1560 /* Lookbehind, or need to calculate parens/evals/stclass: */
1561 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1562 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1563 /* Lookahead/lookbehind */
1564 I32 deltanext, minnext, fake = 0;
1566 struct regnode_charclass_class intrnl;
1569 data_fake.flags = 0;
1571 data_fake.whilem_c = data->whilem_c;
1572 data_fake.last_closep = data->last_closep;
1575 data_fake.last_closep = &fake;
1576 if ( flags & SCF_DO_STCLASS && !scan->flags
1577 && OP(scan) == IFMATCH ) { /* Lookahead */
1578 cl_init(pRExC_state, &intrnl);
1579 data_fake.start_class = &intrnl;
1580 f |= SCF_DO_STCLASS_AND;
1582 if (flags & SCF_WHILEM_VISITED_POS)
1583 f |= SCF_WHILEM_VISITED_POS;
1584 next = regnext(scan);
1585 nscan = NEXTOPER(NEXTOPER(scan));
1586 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1589 vFAIL("Variable length lookbehind not implemented");
1591 else if (minnext > U8_MAX) {
1592 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1594 scan->flags = minnext;
1596 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1598 if (data && (data_fake.flags & SF_HAS_EVAL))
1599 data->flags |= SF_HAS_EVAL;
1601 data->whilem_c = data_fake.whilem_c;
1602 if (f & SCF_DO_STCLASS_AND) {
1603 int was = (data->start_class->flags & ANYOF_EOS);
1605 cl_and(data->start_class, &intrnl);
1607 data->start_class->flags |= ANYOF_EOS;
1610 else if (OP(scan) == OPEN) {
1613 else if (OP(scan) == CLOSE) {
1614 if (ARG(scan) == is_par) {
1615 next = regnext(scan);
1617 if ( next && (OP(next) != WHILEM) && next < last)
1618 is_par = 0; /* Disable optimization */
1621 *(data->last_closep) = ARG(scan);
1623 else if (OP(scan) == EVAL) {
1625 data->flags |= SF_HAS_EVAL;
1627 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1628 if (flags & SCF_DO_SUBSTR) {
1629 scan_commit(pRExC_state,data);
1630 data->longest = &(data->longest_float);
1632 is_inf = is_inf_internal = 1;
1633 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1634 cl_anything(pRExC_state, data->start_class);
1635 flags &= ~SCF_DO_STCLASS;
1637 /* Else: zero-length, ignore. */
1638 scan = regnext(scan);
1643 *deltap = is_inf_internal ? I32_MAX : delta;
1644 if (flags & SCF_DO_SUBSTR && is_inf)
1645 data->pos_delta = I32_MAX - data->pos_min;
1646 if (is_par > U8_MAX)
1648 if (is_par && pars==1 && data) {
1649 data->flags |= SF_IN_PAR;
1650 data->flags &= ~SF_HAS_PAR;
1652 else if (pars && data) {
1653 data->flags |= SF_HAS_PAR;
1654 data->flags &= ~SF_IN_PAR;
1656 if (flags & SCF_DO_STCLASS_OR)
1657 cl_and(data->start_class, &and_with);
1662 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1664 if (RExC_rx->data) {
1665 Renewc(RExC_rx->data,
1666 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1667 char, struct reg_data);
1668 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1669 RExC_rx->data->count += n;
1672 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1673 char, struct reg_data);
1674 New(1208, RExC_rx->data->what, n, U8);
1675 RExC_rx->data->count = n;
1677 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1678 return RExC_rx->data->count - n;
1682 Perl_reginitcolors(pTHX)
1685 char *s = PerlEnv_getenv("PERL_RE_COLORS");
1688 PL_colors[0] = s = savepv(s);
1690 s = strchr(s, '\t');
1696 PL_colors[i] = s = "";
1700 PL_colors[i++] = "";
1707 - pregcomp - compile a regular expression into internal code
1709 * We can't allocate space until we know how big the compiled form will be,
1710 * but we can't compile it (and thus know how big it is) until we've got a
1711 * place to put the code. So we cheat: we compile it twice, once with code
1712 * generation turned off and size counting turned on, and once "for real".
1713 * This also means that we don't allocate space until we are sure that the
1714 * thing really will compile successfully, and we never have to move the
1715 * code and thus invalidate pointers into it. (Note that it has to be in
1716 * one piece because free() must be able to free it all.) [NB: not true in perl]
1718 * Beware that the optimization-preparation code in here knows about some
1719 * of the structure of the compiled regexp. [I'll say.]
1722 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1732 RExC_state_t RExC_state;
1733 RExC_state_t *pRExC_state = &RExC_state;
1736 FAIL("NULL regexp argument");
1738 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1742 if (!PL_colorset) reginitcolors();
1743 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1744 PL_colors[4],PL_colors[5],PL_colors[0],
1745 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1747 RExC_flags16 = pm->op_pmflags;
1751 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1752 RExC_seen_evals = 0;
1755 /* First pass: determine size, legality. */
1762 RExC_emit = &PL_regdummy;
1763 RExC_whilem_seen = 0;
1764 #if 0 /* REGC() is (currently) a NOP at the first pass.
1765 * Clever compilers notice this and complain. --jhi */
1766 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1768 if (reg(pRExC_state, 0, &flags) == NULL) {
1769 RExC_precomp = Nullch;
1772 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1774 /* Small enough for pointer-storage convention?
1775 If extralen==0, this means that we will not need long jumps. */
1776 if (RExC_size >= 0x10000L && RExC_extralen)
1777 RExC_size += RExC_extralen;
1780 if (RExC_whilem_seen > 15)
1781 RExC_whilem_seen = 15;
1783 /* Allocate space and initialize. */
1784 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1787 FAIL("Regexp out of space");
1790 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1791 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1794 r->prelen = xend - exp;
1795 r->precomp = savepvn(RExC_precomp, r->prelen);
1797 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1798 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1800 r->substrs = 0; /* Useful during FAIL. */
1801 r->startp = 0; /* Useful during FAIL. */
1802 r->endp = 0; /* Useful during FAIL. */
1804 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1806 r->offsets[0] = RExC_size;
1808 DEBUG_r(PerlIO_printf(Perl_debug_log,
1809 "%s %"UVuf" bytes for offset annotations.\n",
1810 r->offsets ? "Got" : "Couldn't get",
1811 (UV)((2*RExC_size+1) * sizeof(U32))));
1815 /* Second pass: emit code. */
1816 RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
1821 RExC_emit_start = r->program;
1822 RExC_emit = r->program;
1823 /* Store the count of eval-groups for security checks: */
1824 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1825 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1827 if (reg(pRExC_state, 0, &flags) == NULL)
1830 /* Dig out information for optimizations. */
1831 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1832 pm->op_pmflags = RExC_flags16;
1834 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1835 r->regstclass = NULL;
1836 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1837 r->reganch |= ROPT_NAUGHTY;
1838 scan = r->program + 1; /* First BRANCH. */
1840 /* XXXX To minimize changes to RE engine we always allocate
1841 3-units-long substrs field. */
1842 Newz(1004, r->substrs, 1, struct reg_substr_data);
1844 StructCopy(&zero_scan_data, &data, scan_data_t);
1845 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1846 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1848 STRLEN longest_float_length, longest_fixed_length;
1849 struct regnode_charclass_class ch_class;
1854 /* Skip introductions and multiplicators >= 1. */
1855 while ((OP(first) == OPEN && (sawopen = 1)) ||
1856 /* An OR of *one* alternative - should not happen now. */
1857 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1858 (OP(first) == PLUS) ||
1859 (OP(first) == MINMOD) ||
1860 /* An {n,m} with n>0 */
1861 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1862 if (OP(first) == PLUS)
1865 first += regarglen[(U8)OP(first)];
1866 first = NEXTOPER(first);
1869 /* Starting-point info. */
1871 if (PL_regkind[(U8)OP(first)] == EXACT) {
1872 if (OP(first) == EXACT)
1873 ; /* Empty, get anchored substr later. */
1874 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1875 r->regstclass = first;
1877 else if (strchr((char*)PL_simple,OP(first)))
1878 r->regstclass = first;
1879 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1880 PL_regkind[(U8)OP(first)] == NBOUND)
1881 r->regstclass = first;
1882 else if (PL_regkind[(U8)OP(first)] == BOL) {
1883 r->reganch |= (OP(first) == MBOL
1885 : (OP(first) == SBOL
1888 first = NEXTOPER(first);
1891 else if (OP(first) == GPOS) {
1892 r->reganch |= ROPT_ANCH_GPOS;
1893 first = NEXTOPER(first);
1896 else if (!sawopen && (OP(first) == STAR &&
1897 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1898 !(r->reganch & ROPT_ANCH) )
1900 /* turn .* into ^.* with an implied $*=1 */
1901 int type = OP(NEXTOPER(first));
1903 if (type == REG_ANY)
1904 type = ROPT_ANCH_MBOL;
1906 type = ROPT_ANCH_SBOL;
1908 r->reganch |= type | ROPT_IMPLICIT;
1909 first = NEXTOPER(first);
1912 if (sawplus && (!sawopen || !RExC_sawback)
1913 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1914 /* x+ must match at the 1st pos of run of x's */
1915 r->reganch |= ROPT_SKIP;
1917 /* Scan is after the zeroth branch, first is atomic matcher. */
1918 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1919 (IV)(first - scan + 1)));
1921 * If there's something expensive in the r.e., find the
1922 * longest literal string that must appear and make it the
1923 * regmust. Resolve ties in favor of later strings, since
1924 * the regstart check works with the beginning of the r.e.
1925 * and avoiding duplication strengthens checking. Not a
1926 * strong reason, but sufficient in the absence of others.
1927 * [Now we resolve ties in favor of the earlier string if
1928 * it happens that c_offset_min has been invalidated, since the
1929 * earlier string may buy us something the later one won't.]
1933 data.longest_fixed = newSVpvn("",0);
1934 data.longest_float = newSVpvn("",0);
1935 data.last_found = newSVpvn("",0);
1936 data.longest = &(data.longest_fixed);
1938 if (!r->regstclass) {
1939 cl_init(pRExC_state, &ch_class);
1940 data.start_class = &ch_class;
1941 stclass_flag = SCF_DO_STCLASS_AND;
1942 } else /* XXXX Check for BOUND? */
1944 data.last_closep = &last_close;
1946 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1947 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1948 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1949 && data.last_start_min == 0 && data.last_end > 0
1950 && !RExC_seen_zerolen
1951 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1952 r->reganch |= ROPT_CHECK_ALL;
1953 scan_commit(pRExC_state, &data);
1954 SvREFCNT_dec(data.last_found);
1956 longest_float_length = CHR_SVLEN(data.longest_float);
1957 if (longest_float_length
1958 || (data.flags & SF_FL_BEFORE_EOL
1959 && (!(data.flags & SF_FL_BEFORE_MEOL)
1960 || (RExC_flags16 & PMf_MULTILINE)))) {
1963 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1964 && data.offset_fixed == data.offset_float_min
1965 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1966 goto remove_float; /* As in (a)+. */
1968 if (SvUTF8(data.longest_float)) {
1969 r->float_utf8 = data.longest_float;
1970 r->float_substr = Nullsv;
1972 r->float_substr = data.longest_float;
1973 r->float_utf8 = Nullsv;
1975 r->float_min_offset = data.offset_float_min;
1976 r->float_max_offset = data.offset_float_max;
1977 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1978 && (!(data.flags & SF_FL_BEFORE_MEOL)
1979 || (RExC_flags16 & PMf_MULTILINE)));
1980 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1984 r->float_substr = r->float_utf8 = Nullsv;
1985 SvREFCNT_dec(data.longest_float);
1986 longest_float_length = 0;
1989 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1990 if (longest_fixed_length
1991 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1992 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1993 || (RExC_flags16 & PMf_MULTILINE)))) {
1996 if (SvUTF8(data.longest_fixed)) {
1997 r->anchored_utf8 = data.longest_fixed;
1998 r->anchored_substr = Nullsv;
2000 r->anchored_substr = data.longest_fixed;
2001 r->anchored_utf8 = Nullsv;
2003 r->anchored_offset = data.offset_fixed;
2004 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2005 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2006 || (RExC_flags16 & PMf_MULTILINE)));
2007 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2010 r->anchored_substr = r->anchored_utf8 = Nullsv;
2011 SvREFCNT_dec(data.longest_fixed);
2012 longest_fixed_length = 0;
2015 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2016 r->regstclass = NULL;
2017 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2019 && !(data.start_class->flags & ANYOF_EOS)
2020 && !cl_is_anything(data.start_class)) {
2021 I32 n = add_data(pRExC_state, 1, "f");
2023 New(1006, RExC_rx->data->data[n], 1,
2024 struct regnode_charclass_class);
2025 StructCopy(data.start_class,
2026 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2027 struct regnode_charclass_class);
2028 r->regstclass = (regnode*)RExC_rx->data->data[n];
2029 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2030 PL_regdata = r->data; /* for regprop() */
2031 DEBUG_r({ SV *sv = sv_newmortal();
2032 regprop(sv, (regnode*)data.start_class);
2033 PerlIO_printf(Perl_debug_log,
2034 "synthetic stclass `%s'.\n",
2038 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2039 if (longest_fixed_length > longest_float_length) {
2040 r->check_substr = r->anchored_substr;
2041 r->check_utf8 = r->anchored_utf8;
2042 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2043 if (r->reganch & ROPT_ANCH_SINGLE)
2044 r->reganch |= ROPT_NOSCAN;
2047 r->check_substr = r->float_substr;
2048 r->check_utf8 = r->float_utf8;
2049 r->check_offset_min = data.offset_float_min;
2050 r->check_offset_max = data.offset_float_max;
2052 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2053 This should be changed ASAP! */
2054 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2055 r->reganch |= RE_USE_INTUIT;
2056 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2057 r->reganch |= RE_INTUIT_TAIL;
2061 /* Several toplevels. Best we can is to set minlen. */
2063 struct regnode_charclass_class ch_class;
2066 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2067 scan = r->program + 1;
2068 cl_init(pRExC_state, &ch_class);
2069 data.start_class = &ch_class;
2070 data.last_closep = &last_close;
2071 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2072 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2073 = r->float_substr = r->float_utf8 = Nullsv;
2074 if (!(data.start_class->flags & ANYOF_EOS)
2075 && !cl_is_anything(data.start_class)) {
2076 I32 n = add_data(pRExC_state, 1, "f");
2078 New(1006, RExC_rx->data->data[n], 1,
2079 struct regnode_charclass_class);
2080 StructCopy(data.start_class,
2081 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2082 struct regnode_charclass_class);
2083 r->regstclass = (regnode*)RExC_rx->data->data[n];
2084 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2085 DEBUG_r({ SV* sv = sv_newmortal();
2086 regprop(sv, (regnode*)data.start_class);
2087 PerlIO_printf(Perl_debug_log,
2088 "synthetic stclass `%s'.\n",
2094 if (RExC_seen & REG_SEEN_GPOS)
2095 r->reganch |= ROPT_GPOS_SEEN;
2096 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2097 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2098 if (RExC_seen & REG_SEEN_EVAL)
2099 r->reganch |= ROPT_EVAL_SEEN;
2100 if (RExC_seen & REG_SEEN_CANY)
2101 r->reganch |= ROPT_CANY_SEEN;
2102 Newz(1002, r->startp, RExC_npar, I32);
2103 Newz(1002, r->endp, RExC_npar, I32);
2104 PL_regdata = r->data; /* for regprop() */
2105 DEBUG_r(regdump(r));
2110 - reg - regular expression, i.e. main body or parenthesized thing
2112 * Caller must absorb opening parenthesis.
2114 * Combining parenthesis handling with the base level of regular expression
2115 * is a trifle forced, but the need to tie the tails of the branches to what
2116 * follows makes it hard to avoid.
2119 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2120 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2122 register regnode *ret; /* Will be the head of the group. */
2123 register regnode *br;
2124 register regnode *lastbr;
2125 register regnode *ender = 0;
2126 register I32 parno = 0;
2127 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
2129 /* for (?g), (?gc), and (?o) warnings; warning
2130 about (?c) will warn about (?g) -- japhy */
2132 I32 wastedflags = 0x00,
2135 wasted_gc = 0x02 | 0x04,
2138 char * parse_start = RExC_parse; /* MJD */
2139 char *oregcomp_parse = RExC_parse;
2142 *flagp = 0; /* Tentatively. */
2145 /* Make an OPEN node, if parenthesized. */
2147 if (*RExC_parse == '?') { /* (?...) */
2148 U16 posflags = 0, negflags = 0;
2149 U16 *flagsp = &posflags;
2151 char *seqstart = RExC_parse;
2154 paren = *RExC_parse++;
2155 ret = NULL; /* For look-ahead/behind. */
2157 case '<': /* (?<...) */
2158 RExC_seen |= REG_SEEN_LOOKBEHIND;
2159 if (*RExC_parse == '!')
2161 if (*RExC_parse != '=' && *RExC_parse != '!')
2164 case '=': /* (?=...) */
2165 case '!': /* (?!...) */
2166 RExC_seen_zerolen++;
2167 case ':': /* (?:...) */
2168 case '>': /* (?>...) */
2170 case '$': /* (?$...) */
2171 case '@': /* (?@...) */
2172 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2174 case '#': /* (?#...) */
2175 while (*RExC_parse && *RExC_parse != ')')
2177 if (*RExC_parse != ')')
2178 FAIL("Sequence (?#... not terminated");
2179 nextchar(pRExC_state);
2182 case 'p': /* (?p...) */
2183 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2184 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2186 case '?': /* (??...) */
2188 if (*RExC_parse != '{')
2190 paren = *RExC_parse++;
2192 case '{': /* (?{...}) */
2194 I32 count = 1, n = 0;
2196 char *s = RExC_parse;
2198 OP_4tree *sop, *rop;
2200 RExC_seen_zerolen++;
2201 RExC_seen |= REG_SEEN_EVAL;
2202 while (count && (c = *RExC_parse)) {
2203 if (c == '\\' && RExC_parse[1])
2211 if (*RExC_parse != ')')
2214 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2219 if (RExC_parse - 1 - s)
2220 sv = newSVpvn(s, RExC_parse - 1 - s);
2222 sv = newSVpvn("", 0);
2225 Perl_save_re_context(aTHX);
2226 rop = sv_compile_2op(sv, &sop, "re", &av);
2227 sop->op_private |= OPpREFCOUNTED;
2228 /* re_dup will OpREFCNT_inc */
2229 OpREFCNT_set(sop, 1);
2232 n = add_data(pRExC_state, 3, "nop");
2233 RExC_rx->data->data[n] = (void*)rop;
2234 RExC_rx->data->data[n+1] = (void*)sop;
2235 RExC_rx->data->data[n+2] = (void*)av;
2238 else { /* First pass */
2239 if (PL_reginterp_cnt < ++RExC_seen_evals
2240 && PL_curcop != &PL_compiling)
2241 /* No compiled RE interpolated, has runtime
2242 components ===> unsafe. */
2243 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2244 if (PL_tainting && PL_tainted)
2245 FAIL("Eval-group in insecure regular expression");
2248 nextchar(pRExC_state);
2250 ret = reg_node(pRExC_state, LOGICAL);
2253 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2254 /* deal with the length of this later - MJD */
2257 return reganode(pRExC_state, EVAL, n);
2259 case '(': /* (?(?{...})...) and (?(?=...)...) */
2261 if (RExC_parse[0] == '?') { /* (?(?...)) */
2262 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2263 || RExC_parse[1] == '<'
2264 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2267 ret = reg_node(pRExC_state, LOGICAL);
2270 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2274 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2276 parno = atoi(RExC_parse++);
2278 while (isDIGIT(*RExC_parse))
2280 ret = reganode(pRExC_state, GROUPP, parno);
2282 if ((c = *nextchar(pRExC_state)) != ')')
2283 vFAIL("Switch condition not recognized");
2285 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2286 br = regbranch(pRExC_state, &flags, 1);
2288 br = reganode(pRExC_state, LONGJMP, 0);
2290 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2291 c = *nextchar(pRExC_state);
2295 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2296 regbranch(pRExC_state, &flags, 1);
2297 regtail(pRExC_state, ret, lastbr);
2300 c = *nextchar(pRExC_state);
2305 vFAIL("Switch (?(condition)... contains too many branches");
2306 ender = reg_node(pRExC_state, TAIL);
2307 regtail(pRExC_state, br, ender);
2309 regtail(pRExC_state, lastbr, ender);
2310 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2313 regtail(pRExC_state, ret, ender);
2317 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2321 RExC_parse--; /* for vFAIL to print correctly */
2322 vFAIL("Sequence (? incomplete");
2326 parse_flags: /* (?i) */
2327 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2328 /* (?g), (?gc) and (?o) are useless here
2329 and must be globally applied -- japhy */
2331 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2332 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2333 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2334 if (! (wastedflags & wflagbit) ) {
2335 wastedflags |= wflagbit;
2338 "Useless (%s%c) - %suse /%c modifier",
2339 flagsp == &negflags ? "?-" : "?",
2341 flagsp == &negflags ? "don't " : "",
2347 else if (*RExC_parse == 'c') {
2348 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2349 if (! (wastedflags & wasted_c) ) {
2350 wastedflags |= wasted_gc;
2353 "Useless (%sc) - %suse /gc modifier",
2354 flagsp == &negflags ? "?-" : "?",
2355 flagsp == &negflags ? "don't " : ""
2360 else { pmflag(flagsp, *RExC_parse); }
2364 if (*RExC_parse == '-') {
2366 wastedflags = 0; /* reset so (?g-c) warns twice */
2370 RExC_flags16 |= posflags;
2371 RExC_flags16 &= ~negflags;
2372 if (*RExC_parse == ':') {
2378 if (*RExC_parse != ')') {
2380 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2382 nextchar(pRExC_state);
2390 ret = reganode(pRExC_state, OPEN, parno);
2391 Set_Node_Length(ret, 1); /* MJD */
2392 Set_Node_Offset(ret, RExC_parse); /* MJD */
2399 /* Pick up the branches, linking them together. */
2400 parse_start = RExC_parse; /* MJD */
2401 br = regbranch(pRExC_state, &flags, 1);
2402 /* branch_len = (paren != 0); */
2406 if (*RExC_parse == '|') {
2407 if (!SIZE_ONLY && RExC_extralen) {
2408 reginsert(pRExC_state, BRANCHJ, br);
2411 reginsert(pRExC_state, BRANCH, br);
2412 Set_Node_Length(br, paren != 0);
2413 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2417 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2419 else if (paren == ':') {
2420 *flagp |= flags&SIMPLE;
2422 if (open) { /* Starts with OPEN. */
2423 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2425 else if (paren != '?') /* Not Conditional */
2427 *flagp |= flags & (SPSTART | HASWIDTH);
2429 while (*RExC_parse == '|') {
2430 if (!SIZE_ONLY && RExC_extralen) {
2431 ender = reganode(pRExC_state, LONGJMP,0);
2432 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2435 RExC_extralen += 2; /* Account for LONGJMP. */
2436 nextchar(pRExC_state);
2437 br = regbranch(pRExC_state, &flags, 0);
2441 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2445 *flagp |= flags&SPSTART;
2448 if (have_branch || paren != ':') {
2449 /* Make a closing node, and hook it on the end. */
2452 ender = reg_node(pRExC_state, TAIL);
2455 ender = reganode(pRExC_state, CLOSE, parno);
2456 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2457 Set_Node_Length(ender,1); /* MJD */
2463 *flagp &= ~HASWIDTH;
2466 ender = reg_node(pRExC_state, SUCCEED);
2469 ender = reg_node(pRExC_state, END);
2472 regtail(pRExC_state, lastbr, ender);
2475 /* Hook the tails of the branches to the closing node. */
2476 for (br = ret; br != NULL; br = regnext(br)) {
2477 regoptail(pRExC_state, br, ender);
2484 static char parens[] = "=!<,>";
2486 if (paren && (p = strchr(parens, paren))) {
2487 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2488 int flag = (p - parens) > 1;
2491 node = SUSPEND, flag = 0;
2492 reginsert(pRExC_state, node,ret);
2494 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2498 /* Check for proper termination. */
2500 RExC_flags16 = oregflags;
2501 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2502 RExC_parse = oregcomp_parse;
2503 vFAIL("Unmatched (");
2506 else if (!paren && RExC_parse < RExC_end) {
2507 if (*RExC_parse == ')') {
2509 vFAIL("Unmatched )");
2512 FAIL("Junk on end of regexp"); /* "Can't happen". */
2520 - regbranch - one alternative of an | operator
2522 * Implements the concatenation operator.
2525 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2527 register regnode *ret;
2528 register regnode *chain = NULL;
2529 register regnode *latest;
2530 I32 flags = 0, c = 0;
2535 if (!SIZE_ONLY && RExC_extralen)
2536 ret = reganode(pRExC_state, BRANCHJ,0);
2538 ret = reg_node(pRExC_state, BRANCH);
2539 Set_Node_Length(ret, 1);
2543 if (!first && SIZE_ONLY)
2544 RExC_extralen += 1; /* BRANCHJ */
2546 *flagp = WORST; /* Tentatively. */
2549 nextchar(pRExC_state);
2550 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2552 latest = regpiece(pRExC_state, &flags);
2553 if (latest == NULL) {
2554 if (flags & TRYAGAIN)
2558 else if (ret == NULL)
2560 *flagp |= flags&HASWIDTH;
2561 if (chain == NULL) /* First piece. */
2562 *flagp |= flags&SPSTART;
2565 regtail(pRExC_state, chain, latest);
2570 if (chain == NULL) { /* Loop ran zero times. */
2571 chain = reg_node(pRExC_state, NOTHING);
2576 *flagp |= flags&SIMPLE;
2583 - regpiece - something followed by possible [*+?]
2585 * Note that the branching code sequences used for ? and the general cases
2586 * of * and + are somewhat optimized: they use the same NOTHING node as
2587 * both the endmarker for their branch list and the body of the last branch.
2588 * It might seem that this node could be dispensed with entirely, but the
2589 * endmarker role is not redundant.
2592 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2594 register regnode *ret;
2596 register char *next;
2598 char *origparse = RExC_parse;
2601 I32 max = REG_INFTY;
2604 ret = regatom(pRExC_state, &flags);
2606 if (flags & TRYAGAIN)
2613 if (op == '{' && regcurly(RExC_parse)) {
2614 parse_start = RExC_parse; /* MJD */
2615 next = RExC_parse + 1;
2617 while (isDIGIT(*next) || *next == ',') {
2626 if (*next == '}') { /* got one */
2630 min = atoi(RExC_parse);
2634 maxpos = RExC_parse;
2636 if (!max && *maxpos != '0')
2637 max = REG_INFTY; /* meaning "infinity" */
2638 else if (max >= REG_INFTY)
2639 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2641 nextchar(pRExC_state);
2644 if ((flags&SIMPLE)) {
2645 RExC_naughty += 2 + RExC_naughty / 2;
2646 reginsert(pRExC_state, CURLY, ret);
2647 Set_Node_Offset(ret, parse_start+1); /* MJD */
2648 Set_Node_Cur_Length(ret);
2651 regnode *w = reg_node(pRExC_state, WHILEM);
2654 regtail(pRExC_state, ret, w);
2655 if (!SIZE_ONLY && RExC_extralen) {
2656 reginsert(pRExC_state, LONGJMP,ret);
2657 reginsert(pRExC_state, NOTHING,ret);
2658 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2660 reginsert(pRExC_state, CURLYX,ret);
2662 Set_Node_Offset(ret, parse_start+1);
2663 Set_Node_Length(ret,
2664 op == '{' ? (RExC_parse - parse_start) : 1);
2666 if (!SIZE_ONLY && RExC_extralen)
2667 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2668 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2670 RExC_whilem_seen++, RExC_extralen += 3;
2671 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2679 if (max && max < min)
2680 vFAIL("Can't do {n,m} with n > m");
2695 #if 0 /* Now runtime fix should be reliable. */
2697 /* if this is reinstated, don't forget to put this back into perldiag:
2699 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2701 (F) The part of the regexp subject to either the * or + quantifier
2702 could match an empty string. The {#} shows in the regular
2703 expression about where the problem was discovered.
2707 if (!(flags&HASWIDTH) && op != '?')
2708 vFAIL("Regexp *+ operand could be empty");
2711 parse_start = RExC_parse;
2712 nextchar(pRExC_state);
2714 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2716 if (op == '*' && (flags&SIMPLE)) {
2717 reginsert(pRExC_state, STAR, ret);
2721 else if (op == '*') {
2725 else if (op == '+' && (flags&SIMPLE)) {
2726 reginsert(pRExC_state, PLUS, ret);
2730 else if (op == '+') {
2734 else if (op == '?') {
2739 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2741 "%.*s matches null string many times",
2742 RExC_parse - origparse,
2746 if (*RExC_parse == '?') {
2747 nextchar(pRExC_state);
2748 reginsert(pRExC_state, MINMOD, ret);
2749 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2751 if (ISMULT2(RExC_parse)) {
2753 vFAIL("Nested quantifiers");
2760 - regatom - the lowest level
2762 * Optimization: gobbles an entire sequence of ordinary characters so that
2763 * it can turn them into a single node, which is smaller to store and
2764 * faster to run. Backslashed characters are exceptions, each becoming a
2765 * separate node; the code is simpler that way and it's not worth fixing.
2767 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2769 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2771 register regnode *ret = 0;
2773 char *parse_start = 0;
2775 *flagp = WORST; /* Tentatively. */
2778 switch (*RExC_parse) {
2780 RExC_seen_zerolen++;
2781 nextchar(pRExC_state);
2782 if (RExC_flags16 & PMf_MULTILINE)
2783 ret = reg_node(pRExC_state, MBOL);
2784 else if (RExC_flags16 & PMf_SINGLELINE)
2785 ret = reg_node(pRExC_state, SBOL);
2787 ret = reg_node(pRExC_state, BOL);
2788 Set_Node_Length(ret, 1); /* MJD */
2791 nextchar(pRExC_state);
2793 RExC_seen_zerolen++;
2794 if (RExC_flags16 & PMf_MULTILINE)
2795 ret = reg_node(pRExC_state, MEOL);
2796 else if (RExC_flags16 & PMf_SINGLELINE)
2797 ret = reg_node(pRExC_state, SEOL);
2799 ret = reg_node(pRExC_state, EOL);
2800 Set_Node_Length(ret, 1); /* MJD */
2803 nextchar(pRExC_state);
2804 if (RExC_flags16 & PMf_SINGLELINE)
2805 ret = reg_node(pRExC_state, SANY);
2807 ret = reg_node(pRExC_state, REG_ANY);
2808 *flagp |= HASWIDTH|SIMPLE;
2810 Set_Node_Length(ret, 1); /* MJD */
2814 char *oregcomp_parse = ++RExC_parse;
2815 ret = regclass(pRExC_state);
2816 if (*RExC_parse != ']') {
2817 RExC_parse = oregcomp_parse;
2818 vFAIL("Unmatched [");
2820 nextchar(pRExC_state);
2821 *flagp |= HASWIDTH|SIMPLE;
2822 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2826 nextchar(pRExC_state);
2827 ret = reg(pRExC_state, 1, &flags);
2829 if (flags & TRYAGAIN) {
2830 if (RExC_parse == RExC_end) {
2831 /* Make parent create an empty node if needed. */
2839 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2843 if (flags & TRYAGAIN) {
2847 vFAIL("Internal urp");
2848 /* Supposed to be caught earlier. */
2851 if (!regcurly(RExC_parse)) {
2860 vFAIL("Quantifier follows nothing");
2863 switch (*++RExC_parse) {
2865 RExC_seen_zerolen++;
2866 ret = reg_node(pRExC_state, SBOL);
2868 nextchar(pRExC_state);
2869 Set_Node_Length(ret, 2); /* MJD */
2872 ret = reg_node(pRExC_state, GPOS);
2873 RExC_seen |= REG_SEEN_GPOS;
2875 nextchar(pRExC_state);
2876 Set_Node_Length(ret, 2); /* MJD */
2879 ret = reg_node(pRExC_state, SEOL);
2881 RExC_seen_zerolen++; /* Do not optimize RE away */
2882 nextchar(pRExC_state);
2885 ret = reg_node(pRExC_state, EOS);
2887 RExC_seen_zerolen++; /* Do not optimize RE away */
2888 nextchar(pRExC_state);
2889 Set_Node_Length(ret, 2); /* MJD */
2892 ret = reg_node(pRExC_state, CANY);
2893 RExC_seen |= REG_SEEN_CANY;
2894 *flagp |= HASWIDTH|SIMPLE;
2895 nextchar(pRExC_state);
2896 Set_Node_Length(ret, 2); /* MJD */
2899 ret = reg_node(pRExC_state, CLUMP);
2901 nextchar(pRExC_state);
2902 Set_Node_Length(ret, 2); /* MJD */
2905 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
2906 *flagp |= HASWIDTH|SIMPLE;
2907 nextchar(pRExC_state);
2908 Set_Node_Length(ret, 2); /* MJD */
2911 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
2912 *flagp |= HASWIDTH|SIMPLE;
2913 nextchar(pRExC_state);
2914 Set_Node_Length(ret, 2); /* MJD */
2917 RExC_seen_zerolen++;
2918 RExC_seen |= REG_SEEN_LOOKBEHIND;
2919 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
2921 nextchar(pRExC_state);
2922 Set_Node_Length(ret, 2); /* MJD */
2925 RExC_seen_zerolen++;
2926 RExC_seen |= REG_SEEN_LOOKBEHIND;
2927 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
2929 nextchar(pRExC_state);
2930 Set_Node_Length(ret, 2); /* MJD */
2933 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
2934 *flagp |= HASWIDTH|SIMPLE;
2935 nextchar(pRExC_state);
2936 Set_Node_Length(ret, 2); /* MJD */
2939 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
2940 *flagp |= HASWIDTH|SIMPLE;
2941 nextchar(pRExC_state);
2942 Set_Node_Length(ret, 2); /* MJD */
2945 ret = reg_node(pRExC_state, DIGIT);
2946 *flagp |= HASWIDTH|SIMPLE;
2947 nextchar(pRExC_state);
2948 Set_Node_Length(ret, 2); /* MJD */
2951 ret = reg_node(pRExC_state, NDIGIT);
2952 *flagp |= HASWIDTH|SIMPLE;
2953 nextchar(pRExC_state);
2954 Set_Node_Length(ret, 2); /* MJD */
2959 char* oldregxend = RExC_end;
2960 char* parse_start = RExC_parse;
2962 if (RExC_parse[1] == '{') {
2963 /* a lovely hack--pretend we saw [\pX] instead */
2964 RExC_end = strchr(RExC_parse, '}');
2966 U8 c = (U8)*RExC_parse;
2968 RExC_end = oldregxend;
2969 vFAIL2("Missing right brace on \\%c{}", c);
2974 RExC_end = RExC_parse + 2;
2977 ret = regclass(pRExC_state);
2979 RExC_end = oldregxend;
2981 Set_Node_Cur_Length(ret); /* MJD */
2982 nextchar(pRExC_state);
2983 *flagp |= HASWIDTH|SIMPLE;
2996 case '1': case '2': case '3': case '4':
2997 case '5': case '6': case '7': case '8': case '9':
2999 I32 num = atoi(RExC_parse);
3001 if (num > 9 && num >= RExC_npar)
3004 char * parse_start = RExC_parse - 1; /* MJD */
3005 while (isDIGIT(*RExC_parse))
3008 if (!SIZE_ONLY && num > RExC_rx->nparens)
3009 vFAIL("Reference to nonexistent group");
3011 ret = reganode(pRExC_state, FOLD
3012 ? (LOC ? REFFL : REFF)
3016 /* override incorrect value set in reganode MJD */
3017 Set_Node_Offset(ret, parse_start+1);
3018 Set_Node_Cur_Length(ret); /* MJD */
3020 nextchar(pRExC_state);
3025 if (RExC_parse >= RExC_end)
3026 FAIL("Trailing \\");
3029 /* Do not generate `unrecognized' warnings here, we fall
3030 back into the quick-grab loop below */
3036 if (RExC_flags16 & PMf_EXTENDED) {
3037 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3038 if (RExC_parse < RExC_end)
3044 register STRLEN len;
3050 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3052 parse_start = RExC_parse - 1;
3058 ret = reg_node(pRExC_state, FOLD
3059 ? (LOC ? EXACTFL : EXACTF)
3062 for (len = 0, p = RExC_parse - 1;
3063 len < 127 && p < RExC_end;
3068 if (RExC_flags16 & PMf_EXTENDED)
3069 p = regwhite(p, RExC_end);
3116 ender = ASCII_TO_NATIVE('\033');
3120 ender = ASCII_TO_NATIVE('\007');
3125 char* e = strchr(p, '}');
3129 vFAIL("Missing right brace on \\x{}");
3132 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3133 | PERL_SCAN_DISALLOW_PREFIX;
3135 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3138 /* numlen is generous */
3139 if (numlen + len >= 127) {
3147 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3149 ender = grok_hex(p, &numlen, &flags, NULL);
3155 ender = UCHARAT(p++);
3156 ender = toCTRL(ender);
3158 case '0': case '1': case '2': case '3':case '4':
3159 case '5': case '6': case '7': case '8':case '9':
3161 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3164 ender = grok_oct(p, &numlen, &flags, NULL);
3174 FAIL("Trailing \\");
3177 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3178 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3179 goto normal_default;
3184 if (UTF8_IS_START(*p) && UTF) {
3185 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3193 if (RExC_flags16 & PMf_EXTENDED)
3194 p = regwhite(p, RExC_end);
3196 /* Prime the casefolded buffer. */
3197 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3199 if (ISMULT2(p)) { /* Back off on ?+*. */
3206 /* Emit all the Unicode characters. */
3207 for (foldbuf = tmpbuf;
3209 foldlen -= numlen) {
3210 ender = utf8_to_uvchr(foldbuf, &numlen);
3212 reguni(pRExC_state, ender, s, &unilen);
3215 /* In EBCDIC the numlen
3216 * and unilen can differ. */
3218 if (numlen >= foldlen)
3222 break; /* "Can't happen." */
3226 reguni(pRExC_state, ender, s, &unilen);
3243 /* Emit all the Unicode characters. */
3244 for (foldbuf = tmpbuf;
3246 foldlen -= numlen) {
3247 ender = utf8_to_uvchr(foldbuf, &numlen);
3249 reguni(pRExC_state, ender, s, &unilen);
3252 /* In EBCDIC the numlen
3253 * and unilen can differ. */
3255 if (numlen >= foldlen)
3263 reguni(pRExC_state, ender, s, &unilen);
3276 Set_Node_Cur_Length(ret); /* MJD */
3277 nextchar(pRExC_state);
3279 /* len is STRLEN which is unsigned, need to copy to signed */
3282 vFAIL("Internal disaster");
3291 RExC_size += STR_SZ(len);
3293 RExC_emit += STR_SZ(len);
3298 /* If the encoding pragma is in effect recode the text of
3299 * any EXACT-kind nodes. */
3300 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3301 STRLEN oldlen = STR_LEN(ret);
3302 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3306 if (sv_utf8_downgrade(sv, TRUE)) {
3307 char *s = sv_recode_to_utf8(sv, PL_encoding);
3308 STRLEN newlen = SvCUR(sv);
3311 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3312 (int)oldlen, STRING(ret),
3314 Copy(s, STRING(ret), newlen, char);
3315 STR_LEN(ret) += newlen - oldlen;
3316 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3318 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3326 S_regwhite(pTHX_ char *p, char *e)
3331 else if (*p == '#') {
3334 } while (p < e && *p != '\n');
3342 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3343 Character classes ([:foo:]) can also be negated ([:^foo:]).
3344 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3345 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3346 but trigger failures because they are currently unimplemented. */
3348 #define POSIXCC_DONE(c) ((c) == ':')
3349 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3350 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3353 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3356 I32 namedclass = OOB_NAMEDCLASS;
3358 if (value == '[' && RExC_parse + 1 < RExC_end &&
3359 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3360 POSIXCC(UCHARAT(RExC_parse))) {
3361 char c = UCHARAT(RExC_parse);
3362 char* s = RExC_parse++;
3364 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3366 if (RExC_parse == RExC_end)
3367 /* Grandfather lone [:, [=, [. */
3370 char* t = RExC_parse++; /* skip over the c */
3372 if (UCHARAT(RExC_parse) == ']') {
3373 RExC_parse++; /* skip over the ending ] */
3376 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3377 I32 skip = 5; /* the most common skip */
3381 if (strnEQ(posixcc, "alnum", 5))
3383 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3384 else if (strnEQ(posixcc, "alpha", 5))
3386 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3387 else if (strnEQ(posixcc, "ascii", 5))
3389 complement ? ANYOF_NASCII : ANYOF_ASCII;
3392 if (strnEQ(posixcc, "blank", 5))
3394 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3397 if (strnEQ(posixcc, "cntrl", 5))
3399 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3402 if (strnEQ(posixcc, "digit", 5))
3404 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3407 if (strnEQ(posixcc, "graph", 5))
3409 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3412 if (strnEQ(posixcc, "lower", 5))
3414 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3417 if (strnEQ(posixcc, "print", 5))
3419 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3420 else if (strnEQ(posixcc, "punct", 5))
3422 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3425 if (strnEQ(posixcc, "space", 5))
3427 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3430 if (strnEQ(posixcc, "upper", 5))
3432 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3434 case 'w': /* this is not POSIX, this is the Perl \w */
3435 if (strnEQ(posixcc, "word", 4)) {
3437 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3442 if (strnEQ(posixcc, "xdigit", 6)) {
3444 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3449 if (namedclass == OOB_NAMEDCLASS ||
3450 posixcc[skip] != ':' ||
3451 posixcc[skip+1] != ']')
3453 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3456 } else if (!SIZE_ONLY) {
3457 /* [[=foo=]] and [[.foo.]] are still future. */
3459 /* adjust RExC_parse so the warning shows after
3461 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3463 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3466 /* Maternal grandfather:
3467 * "[:" ending in ":" but not in ":]" */
3477 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3479 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3480 char *s = RExC_parse;
3483 while(*s && isALNUM(*s))
3485 if (*s && c == *s && s[1] == ']') {
3486 if (ckWARN(WARN_REGEXP))
3488 "POSIX syntax [%c %c] belongs inside character classes",
3491 /* [[=foo=]] and [[.foo.]] are still future. */
3492 if (POSIXCC_NOTYET(c)) {
3493 /* adjust RExC_parse so the error shows after
3495 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3497 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3504 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3507 register UV nextvalue;
3508 register IV prevvalue = OOB_UNICODE;
3509 register IV range = 0;
3510 register regnode *ret;
3513 char *rangebegin = 0;
3514 bool need_class = 0;
3515 SV *listsv = Nullsv;
3518 bool optimize_invert = TRUE;
3519 AV* unicode_alternate = 0;
3521 ret = reganode(pRExC_state, ANYOF, 0);
3524 ANYOF_FLAGS(ret) = 0;
3526 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3530 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3534 RExC_size += ANYOF_SKIP;
3536 RExC_emit += ANYOF_SKIP;
3538 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3540 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3541 ANYOF_BITMAP_ZERO(ret);
3542 listsv = newSVpvn("# comment\n", 10);
3545 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3547 if (!SIZE_ONLY && POSIXCC(nextvalue))
3548 checkposixcc(pRExC_state);
3550 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3551 if (UCHARAT(RExC_parse) == ']')
3554 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3558 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3561 rangebegin = RExC_parse;
3563 value = utf8n_to_uvchr((U8*)RExC_parse,
3564 RExC_end - RExC_parse,
3566 RExC_parse += numlen;
3569 value = UCHARAT(RExC_parse++);
3570 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3571 if (value == '[' && POSIXCC(nextvalue))
3572 namedclass = regpposixcc(pRExC_state, value);
3573 else if (value == '\\') {
3575 value = utf8n_to_uvchr((U8*)RExC_parse,
3576 RExC_end - RExC_parse,
3578 RExC_parse += numlen;
3581 value = UCHARAT(RExC_parse++);
3582 /* Some compilers cannot handle switching on 64-bit integer
3583 * values, therefore value cannot be an UV. Yes, this will
3584 * be a problem later if we want switch on Unicode.
3585 * A similar issue a little bit later when switching on
3586 * namedclass. --jhi */
3587 switch ((I32)value) {
3588 case 'w': namedclass = ANYOF_ALNUM; break;
3589 case 'W': namedclass = ANYOF_NALNUM; break;
3590 case 's': namedclass = ANYOF_SPACE; break;
3591 case 'S': namedclass = ANYOF_NSPACE; break;
3592 case 'd': namedclass = ANYOF_DIGIT; break;
3593 case 'D': namedclass = ANYOF_NDIGIT; break;
3596 if (*RExC_parse == '{') {
3598 e = strchr(RExC_parse++, '}');
3600 vFAIL2("Missing right brace on \\%c{}", c);
3601 while (isSPACE(UCHARAT(RExC_parse)))
3603 if (e == RExC_parse)
3604 vFAIL2("Empty \\%c{}", c);
3606 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3614 if (UCHARAT(RExC_parse) == '^') {
3617 value = value == 'p' ? 'P' : 'p'; /* toggle */
3618 while (isSPACE(UCHARAT(RExC_parse))) {
3624 Perl_sv_catpvf(aTHX_ listsv,
3625 "+utf8::%.*s\n", (int)n, RExC_parse);
3627 Perl_sv_catpvf(aTHX_ listsv,
3628 "!utf8::%.*s\n", (int)n, RExC_parse);
3631 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3633 case 'n': value = '\n'; break;
3634 case 'r': value = '\r'; break;
3635 case 't': value = '\t'; break;
3636 case 'f': value = '\f'; break;
3637 case 'b': value = '\b'; break;
3638 case 'e': value = ASCII_TO_NATIVE('\033');break;
3639 case 'a': value = ASCII_TO_NATIVE('\007');break;
3641 if (*RExC_parse == '{') {
3642 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3643 | PERL_SCAN_DISALLOW_PREFIX;
3644 e = strchr(RExC_parse++, '}');
3646 vFAIL("Missing right brace on \\x{}");
3648 numlen = e - RExC_parse;
3649 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3653 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3655 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3656 RExC_parse += numlen;
3660 value = UCHARAT(RExC_parse++);
3661 value = toCTRL(value);
3663 case '0': case '1': case '2': case '3': case '4':
3664 case '5': case '6': case '7': case '8': case '9':
3668 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3669 RExC_parse += numlen;
3673 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3675 "Unrecognized escape \\%c in character class passed through",
3679 } /* end of \blah */
3681 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3683 if (!SIZE_ONLY && !need_class)
3684 ANYOF_CLASS_ZERO(ret);
3688 /* a bad range like a-\d, a-[:digit:] ? */
3691 if (ckWARN(WARN_REGEXP))
3693 "False [] range \"%*.*s\"",
3694 RExC_parse - rangebegin,
3695 RExC_parse - rangebegin,
3697 if (prevvalue < 256) {
3698 ANYOF_BITMAP_SET(ret, prevvalue);
3699 ANYOF_BITMAP_SET(ret, '-');
3702 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3703 Perl_sv_catpvf(aTHX_ listsv,
3704 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3708 range = 0; /* this was not a true range */
3712 if (namedclass > OOB_NAMEDCLASS)
3713 optimize_invert = FALSE;
3714 /* Possible truncation here but in some 64-bit environments
3715 * the compiler gets heartburn about switch on 64-bit values.
3716 * A similar issue a little earlier when switching on value.
3718 switch ((I32)namedclass) {
3721 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3723 for (value = 0; value < 256; value++)
3725 ANYOF_BITMAP_SET(ret, value);
3727 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
3731 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3733 for (value = 0; value < 256; value++)
3734 if (!isALNUM(value))
3735 ANYOF_BITMAP_SET(ret, value);
3737 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3741 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3743 for (value = 0; value < 256; value++)
3744 if (isALNUMC(value))
3745 ANYOF_BITMAP_SET(ret, value);
3747 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3751 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3753 for (value = 0; value < 256; value++)
3754 if (!isALNUMC(value))
3755 ANYOF_BITMAP_SET(ret, value);
3757 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3761 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3763 for (value = 0; value < 256; value++)
3765 ANYOF_BITMAP_SET(ret, value);
3767 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3771 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3773 for (value = 0; value < 256; value++)
3774 if (!isALPHA(value))
3775 ANYOF_BITMAP_SET(ret, value);
3777 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3781 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3784 for (value = 0; value < 128; value++)
3785 ANYOF_BITMAP_SET(ret, value);
3787 for (value = 0; value < 256; value++) {
3789 ANYOF_BITMAP_SET(ret, value);
3793 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3797 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3800 for (value = 128; value < 256; value++)
3801 ANYOF_BITMAP_SET(ret, value);
3803 for (value = 0; value < 256; value++) {
3804 if (!isASCII(value))
3805 ANYOF_BITMAP_SET(ret, value);
3809 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3813 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3815 for (value = 0; value < 256; value++)
3817 ANYOF_BITMAP_SET(ret, value);
3819 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3823 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3825 for (value = 0; value < 256; value++)
3826 if (!isBLANK(value))
3827 ANYOF_BITMAP_SET(ret, value);
3829 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3833 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3835 for (value = 0; value < 256; value++)
3837 ANYOF_BITMAP_SET(ret, value);
3839 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3843 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3845 for (value = 0; value < 256; value++)
3846 if (!isCNTRL(value))
3847 ANYOF_BITMAP_SET(ret, value);
3849 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3853 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3855 /* consecutive digits assumed */
3856 for (value = '0'; value <= '9'; value++)
3857 ANYOF_BITMAP_SET(ret, value);
3859 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3863 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3865 /* consecutive digits assumed */
3866 for (value = 0; value < '0'; value++)
3867 ANYOF_BITMAP_SET(ret, value);
3868 for (value = '9' + 1; value < 256; value++)
3869 ANYOF_BITMAP_SET(ret, value);
3871 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3875 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3877 for (value = 0; value < 256; value++)
3879 ANYOF_BITMAP_SET(ret, value);
3881 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3885 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3887 for (value = 0; value < 256; value++)
3888 if (!isGRAPH(value))
3889 ANYOF_BITMAP_SET(ret, value);
3891 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3895 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3897 for (value = 0; value < 256; value++)
3899 ANYOF_BITMAP_SET(ret, value);
3901 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3905 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3907 for (value = 0; value < 256; value++)
3908 if (!isLOWER(value))
3909 ANYOF_BITMAP_SET(ret, value);
3911 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3915 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3917 for (value = 0; value < 256; value++)
3919 ANYOF_BITMAP_SET(ret, value);
3921 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3925 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3927 for (value = 0; value < 256; value++)
3928 if (!isPRINT(value))
3929 ANYOF_BITMAP_SET(ret, value);
3931 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3935 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3937 for (value = 0; value < 256; value++)
3938 if (isPSXSPC(value))
3939 ANYOF_BITMAP_SET(ret, value);
3941 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3945 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3947 for (value = 0; value < 256; value++)
3948 if (!isPSXSPC(value))
3949 ANYOF_BITMAP_SET(ret, value);
3951 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3955 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3957 for (value = 0; value < 256; value++)
3959 ANYOF_BITMAP_SET(ret, value);
3961 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3965 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3967 for (value = 0; value < 256; value++)
3968 if (!isPUNCT(value))
3969 ANYOF_BITMAP_SET(ret, value);
3971 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3975 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3977 for (value = 0; value < 256; value++)
3979 ANYOF_BITMAP_SET(ret, value);
3981 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3985 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3987 for (value = 0; value < 256; value++)
3988 if (!isSPACE(value))
3989 ANYOF_BITMAP_SET(ret, value);
3991 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
3995 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
3997 for (value = 0; value < 256; value++)
3999 ANYOF_BITMAP_SET(ret, value);
4001 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4005 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4007 for (value = 0; value < 256; value++)
4008 if (!isUPPER(value))
4009 ANYOF_BITMAP_SET(ret, value);
4011 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4015 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4017 for (value = 0; value < 256; value++)
4018 if (isXDIGIT(value))
4019 ANYOF_BITMAP_SET(ret, value);
4021 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4025 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4027 for (value = 0; value < 256; value++)
4028 if (!isXDIGIT(value))
4029 ANYOF_BITMAP_SET(ret, value);
4031 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4034 vFAIL("Invalid [::] class");
4038 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4041 } /* end of namedclass \blah */
4044 if (prevvalue > value) /* b-a */ {
4045 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4046 RExC_parse - rangebegin,
4047 RExC_parse - rangebegin,
4049 range = 0; /* not a valid range */
4053 prevvalue = value; /* save the beginning of the range */
4054 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4055 RExC_parse[1] != ']') {
4058 /* a bad range like \w-, [:word:]- ? */
4059 if (namedclass > OOB_NAMEDCLASS) {
4060 if (ckWARN(WARN_REGEXP))
4062 "False [] range \"%*.*s\"",
4063 RExC_parse - rangebegin,
4064 RExC_parse - rangebegin,
4067 ANYOF_BITMAP_SET(ret, '-');
4069 range = 1; /* yeah, it's a range! */
4070 continue; /* but do it the next time */
4074 /* now is the next time */
4078 if (prevvalue < 256) {
4079 IV ceilvalue = value < 256 ? value : 255;
4082 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4083 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
4085 if (isLOWER(prevvalue)) {
4086 for (i = prevvalue; i <= ceilvalue; i++)
4088 ANYOF_BITMAP_SET(ret, i);
4090 for (i = prevvalue; i <= ceilvalue; i++)
4092 ANYOF_BITMAP_SET(ret, i);
4097 for (i = prevvalue; i <= ceilvalue; i++)
4098 ANYOF_BITMAP_SET(ret, i);
4100 if (value > 255 || UTF) {
4101 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4102 UV natvalue = NATIVE_TO_UNI(value);
4104 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4105 if (prevnatvalue < natvalue) { /* what about > ? */
4106 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4107 prevnatvalue, natvalue);
4109 else if (prevnatvalue == natvalue) {
4110 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4112 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4114 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4116 /* If folding and foldable and a single
4117 * character, insert also the folded version
4118 * to the charclass. */
4120 if (foldlen == UNISKIP(f))
4121 Perl_sv_catpvf(aTHX_ listsv,
4124 /* Any multicharacter foldings
4125 * require the following transform:
4126 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4127 * where E folds into "pq" and F folds
4128 * into "rst", all other characters
4129 * fold to single characters. We save
4130 * away these multicharacter foldings,
4131 * to be later saved as part of the
4132 * additional "s" data. */
4135 if (!unicode_alternate)
4136 unicode_alternate = newAV();
4137 sv = newSVpvn((char*)foldbuf, foldlen);
4139 av_push(unicode_alternate, sv);
4143 /* If folding and the value is one of the Greek
4144 * sigmas insert a few more sigmas to make the
4145 * folding rules of the sigmas to work right.
4146 * Note that not all the possible combinations
4147 * are handled here: some of them are handled
4148 * by the standard folding rules, and some of
4149 * them (literal or EXACTF cases) are handled
4150 * during runtime in regexec.c:S_find_byclass(). */
4151 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4152 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4153 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4154 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4155 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4157 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4158 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4159 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4165 range = 0; /* this range (if it was one) is done now */
4169 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4171 RExC_size += ANYOF_CLASS_ADD_SKIP;
4173 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4176 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4178 /* If the only flag is folding (plus possibly inversion). */
4179 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4181 for (value = 0; value < 256; ++value) {
4182 if (ANYOF_BITMAP_TEST(ret, value)) {
4183 IV fold = PL_fold[value];
4186 ANYOF_BITMAP_SET(ret, fold);
4189 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4192 /* optimize inverted simple patterns (e.g. [^a-z]) */
4193 if (!SIZE_ONLY && optimize_invert &&
4194 /* If the only flag is inversion. */
4195 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4196 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4197 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4198 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4205 /* The 0th element stores the character class description
4206 * in its textual form: used later (regexec.c:Perl_regclass_swatch())
4207 * to initialize the appropriate swash (which gets stored in
4208 * the 1st element), and also useful for dumping the regnode.
4209 * The 2nd element stores the multicharacter foldings,
4210 * used later (regexec.c:s_reginclasslen()). */
4211 av_store(av, 0, listsv);
4212 av_store(av, 1, NULL);
4213 av_store(av, 2, (SV*)unicode_alternate);
4214 rv = newRV_noinc((SV*)av);
4215 n = add_data(pRExC_state, 1, "s");
4216 RExC_rx->data->data[n] = (void*)rv;
4224 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4226 char* retval = RExC_parse++;
4229 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4230 RExC_parse[2] == '#') {
4231 while (*RExC_parse && *RExC_parse != ')')
4236 if (RExC_flags16 & PMf_EXTENDED) {
4237 if (isSPACE(*RExC_parse)) {
4241 else if (*RExC_parse == '#') {
4242 while (*RExC_parse && *RExC_parse != '\n')
4253 - reg_node - emit a node
4255 STATIC regnode * /* Location. */
4256 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4258 register regnode *ret;
4259 register regnode *ptr;
4263 SIZE_ALIGN(RExC_size);
4268 NODE_ALIGN_FILL(ret);
4270 FILL_ADVANCE_NODE(ptr, op);
4271 if (RExC_offsets) { /* MJD */
4272 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4273 "reg_node", __LINE__,
4275 RExC_emit - RExC_emit_start > RExC_offsets[0]
4276 ? "Overwriting end of array!\n" : "OK",
4277 RExC_emit - RExC_emit_start,
4278 RExC_parse - RExC_start,
4280 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4289 - reganode - emit a node with an argument
4291 STATIC regnode * /* Location. */
4292 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4294 register regnode *ret;
4295 register regnode *ptr;
4299 SIZE_ALIGN(RExC_size);
4304 NODE_ALIGN_FILL(ret);
4306 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4307 if (RExC_offsets) { /* MJD */
4308 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4310 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4311 "Overwriting end of array!\n" : "OK",
4312 RExC_emit - RExC_emit_start,
4313 RExC_parse - RExC_start,
4315 Set_Cur_Node_Offset;
4324 - reguni - emit (if appropriate) a Unicode character
4327 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4329 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4333 - reginsert - insert an operator in front of already-emitted operand
4335 * Means relocating the operand.
4338 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4340 register regnode *src;
4341 register regnode *dst;
4342 register regnode *place;
4343 register int offset = regarglen[(U8)op];
4345 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4348 RExC_size += NODE_STEP_REGNODE + offset;
4353 RExC_emit += NODE_STEP_REGNODE + offset;
4355 while (src > opnd) {
4356 StructCopy(--src, --dst, regnode);
4357 if (RExC_offsets) { /* MJD 20010112 */
4358 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4360 dst - RExC_emit_start > RExC_offsets[0]
4361 ? "Overwriting end of array!\n" : "OK",
4362 src - RExC_emit_start,
4363 dst - RExC_emit_start,
4365 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4366 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4371 place = opnd; /* Op node, where operand used to be. */
4372 if (RExC_offsets) { /* MJD */
4373 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4375 place - RExC_emit_start > RExC_offsets[0]
4376 ? "Overwriting end of array!\n" : "OK",
4377 place - RExC_emit_start,
4378 RExC_parse - RExC_start,
4380 Set_Node_Offset(place, RExC_parse);
4382 src = NEXTOPER(place);
4383 FILL_ADVANCE_NODE(place, op);
4384 Zero(src, offset, regnode);
4388 - regtail - set the next-pointer at the end of a node chain of p to val.
4391 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4393 register regnode *scan;
4394 register regnode *temp;
4399 /* Find last node. */
4402 temp = regnext(scan);
4408 if (reg_off_by_arg[OP(scan)]) {
4409 ARG_SET(scan, val - scan);
4412 NEXT_OFF(scan) = val - scan;
4417 - regoptail - regtail on operand of first argument; nop if operandless
4420 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4422 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4423 if (p == NULL || SIZE_ONLY)
4425 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4426 regtail(pRExC_state, NEXTOPER(p), val);
4428 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4429 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4436 - regcurly - a little FSA that accepts {\d+,?\d*}
4439 S_regcurly(pTHX_ register char *s)
4460 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4462 register U8 op = EXACT; /* Arbitrary non-END op. */
4463 register regnode *next;
4465 while (op != END && (!last || node < last)) {
4466 /* While that wasn't END last time... */
4472 next = regnext(node);
4474 if (OP(node) == OPTIMIZED)
4477 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4478 (int)(2*l + 1), "", SvPVX(sv));
4479 if (next == NULL) /* Next ptr. */
4480 PerlIO_printf(Perl_debug_log, "(0)");
4482 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4483 (void)PerlIO_putc(Perl_debug_log, '\n');
4485 if (PL_regkind[(U8)op] == BRANCHJ) {
4486 register regnode *nnode = (OP(next) == LONGJMP
4489 if (last && nnode > last)
4491 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4493 else if (PL_regkind[(U8)op] == BRANCH) {
4494 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4496 else if ( op == CURLY) { /* `next' might be very big: optimizer */
4497 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4498 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4500 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4501 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4504 else if ( op == PLUS || op == STAR) {
4505 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4507 else if (op == ANYOF) {
4508 /* arglen 1 + class block */
4509 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4510 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4511 node = NEXTOPER(node);
4513 else if (PL_regkind[(U8)op] == EXACT) {
4514 /* Literal string, where present. */
4515 node += NODE_SZ_STR(node) - 1;
4516 node = NEXTOPER(node);
4519 node = NEXTOPER(node);
4520 node += regarglen[(U8)op];
4522 if (op == CURLYX || op == OPEN)
4524 else if (op == WHILEM)
4530 #endif /* DEBUGGING */
4533 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4536 Perl_regdump(pTHX_ regexp *r)
4539 SV *sv = sv_newmortal();
4541 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4543 /* Header fields of interest. */
4544 if (r->anchored_substr)
4545 PerlIO_printf(Perl_debug_log,
4546 "anchored `%s%.*s%s'%s at %"IVdf" ",
4548 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4549 SvPVX(r->anchored_substr),
4551 SvTAIL(r->anchored_substr) ? "$" : "",
4552 (IV)r->anchored_offset);
4553 else if (r->anchored_utf8)
4554 PerlIO_printf(Perl_debug_log,
4555 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4557 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4558 SvPVX(r->anchored_utf8),
4560 SvTAIL(r->anchored_utf8) ? "$" : "",
4561 (IV)r->anchored_offset);
4562 if (r->float_substr)
4563 PerlIO_printf(Perl_debug_log,
4564 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4566 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4567 SvPVX(r->float_substr),
4569 SvTAIL(r->float_substr) ? "$" : "",
4570 (IV)r->float_min_offset, (UV)r->float_max_offset);
4571 else if (r->float_utf8)
4572 PerlIO_printf(Perl_debug_log,
4573 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4575 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4576 SvPVX(r->float_utf8),
4578 SvTAIL(r->float_utf8) ? "$" : "",
4579 (IV)r->float_min_offset, (UV)r->float_max_offset);
4580 if (r->check_substr || r->check_utf8)
4581 PerlIO_printf(Perl_debug_log,
4582 r->check_substr == r->float_substr
4583 && r->check_utf8 == r->float_utf8
4584 ? "(checking floating" : "(checking anchored");
4585 if (r->reganch & ROPT_NOSCAN)
4586 PerlIO_printf(Perl_debug_log, " noscan");
4587 if (r->reganch & ROPT_CHECK_ALL)
4588 PerlIO_printf(Perl_debug_log, " isall");
4589 if (r->check_substr || r->check_utf8)
4590 PerlIO_printf(Perl_debug_log, ") ");
4592 if (r->regstclass) {
4593 regprop(sv, r->regstclass);
4594 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4596 if (r->reganch & ROPT_ANCH) {
4597 PerlIO_printf(Perl_debug_log, "anchored");
4598 if (r->reganch & ROPT_ANCH_BOL)
4599 PerlIO_printf(Perl_debug_log, "(BOL)");
4600 if (r->reganch & ROPT_ANCH_MBOL)
4601 PerlIO_printf(Perl_debug_log, "(MBOL)");
4602 if (r->reganch & ROPT_ANCH_SBOL)
4603 PerlIO_printf(Perl_debug_log, "(SBOL)");
4604 if (r->reganch & ROPT_ANCH_GPOS)
4605 PerlIO_printf(Perl_debug_log, "(GPOS)");
4606 PerlIO_putc(Perl_debug_log, ' ');
4608 if (r->reganch & ROPT_GPOS_SEEN)
4609 PerlIO_printf(Perl_debug_log, "GPOS ");
4610 if (r->reganch & ROPT_SKIP)
4611 PerlIO_printf(Perl_debug_log, "plus ");
4612 if (r->reganch & ROPT_IMPLICIT)
4613 PerlIO_printf(Perl_debug_log, "implicit ");
4614 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4615 if (r->reganch & ROPT_EVAL_SEEN)
4616 PerlIO_printf(Perl_debug_log, "with eval ");
4617 PerlIO_printf(Perl_debug_log, "\n");
4620 U32 len = r->offsets[0];
4621 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4622 for (i = 1; i <= len; i++)
4623 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4624 (UV)r->offsets[i*2-1],
4625 (UV)r->offsets[i*2]);
4626 PerlIO_printf(Perl_debug_log, "\n");
4628 #endif /* DEBUGGING */
4634 S_put_byte(pTHX_ SV *sv, int c)
4636 if (isCNTRL(c) || c == 255 || !isPRINT(c))
4637 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4638 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4639 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4641 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4644 #endif /* DEBUGGING */
4647 - regprop - printable representation of opcode
4650 Perl_regprop(pTHX_ SV *sv, regnode *o)
4655 sv_setpvn(sv, "", 0);
4656 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4657 /* It would be nice to FAIL() here, but this may be called from
4658 regexec.c, and it would be hard to supply pRExC_state. */
4659 Perl_croak(aTHX_ "Corrupted regexp opcode");
4660 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4662 k = PL_regkind[(U8)OP(o)];
4665 SV *dsv = sv_2mortal(newSVpvn("", 0));
4666 /* Using is_utf8_string() is a crude hack but it may
4667 * be the best for now since we have no flag "this EXACTish
4668 * node was UTF-8" --jhi */
4669 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4671 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4672 UNI_DISPLAY_REGEX) :
4677 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4682 else if (k == CURLY) {
4683 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4684 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4685 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4687 else if (k == WHILEM && o->flags) /* Ordinal/of */
4688 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4689 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4690 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4691 else if (k == LOGICAL)
4692 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4693 else if (k == ANYOF) {
4694 int i, rangestart = -1;
4695 U8 flags = ANYOF_FLAGS(o);
4696 const char * const anyofs[] = { /* Should be syncronized with
4697 * ANYOF_ #xdefines in regcomp.h */
4730 if (flags & ANYOF_LOCALE)
4731 sv_catpv(sv, "{loc}");
4732 if (flags & ANYOF_FOLD)
4733 sv_catpv(sv, "{i}");
4734 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4735 if (flags & ANYOF_INVERT)
4737 for (i = 0; i <= 256; i++) {
4738 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4739 if (rangestart == -1)
4741 } else if (rangestart != -1) {
4742 if (i <= rangestart + 3)
4743 for (; rangestart < i; rangestart++)
4744 put_byte(sv, rangestart);
4746 put_byte(sv, rangestart);
4748 put_byte(sv, i - 1);
4754 if (o->flags & ANYOF_CLASS)
4755 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4756 if (ANYOF_CLASS_TEST(o,i))
4757 sv_catpv(sv, anyofs[i]);
4759 if (flags & ANYOF_UNICODE)
4760 sv_catpv(sv, "{unicode}");
4761 else if (flags & ANYOF_UNICODE_ALL)
4762 sv_catpv(sv, "{unicode_all}");
4766 SV *sw = regclass_swash(o, FALSE, &lv, 0);
4771 U8 s[UTF8_MAXLEN+1];
4773 for (i = 0; i <= 256; i++) { /* just the first 256 */
4774 U8 *e = uvchr_to_utf8(s, i);
4776 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4777 if (rangestart == -1)
4779 } else if (rangestart != -1) {
4782 if (i <= rangestart + 3)
4783 for (; rangestart < i; rangestart++) {
4784 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4788 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4791 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4798 sv_catpv(sv, "..."); /* et cetera */
4802 char *s = savepv(SvPVX(lv));
4805 while(*s && *s != '\n') s++;
4826 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4828 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4829 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4830 #endif /* DEBUGGING */
4834 Perl_re_intuit_string(pTHX_ regexp *prog)
4835 { /* Assume that RE_INTUIT is set */
4838 char *s = SvPV(prog->check_substr
4839 ? prog->check_substr : prog->check_utf8, n_a);
4841 if (!PL_colorset) reginitcolors();
4842 PerlIO_printf(Perl_debug_log,
4843 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4845 prog->check_substr ? "" : "utf8 ",
4846 PL_colors[5],PL_colors[0],
4849 (strlen(s) > 60 ? "..." : ""));
4852 return prog->check_substr ? prog->check_substr : prog->check_utf8;
4856 Perl_pregfree(pTHX_ struct regexp *r)
4859 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4862 if (!r || (--r->refcnt > 0))
4865 char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60,
4867 int len = SvCUR(dsv);
4870 PerlIO_printf(Perl_debug_log,
4871 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4872 PL_colors[4],PL_colors[5],PL_colors[0],
4875 len > 60 ? "..." : "");
4879 Safefree(r->precomp);
4880 if (r->offsets) /* 20010421 MJD */
4881 Safefree(r->offsets);
4882 if (RX_MATCH_COPIED(r))
4883 Safefree(r->subbeg);
4885 if (r->anchored_substr)
4886 SvREFCNT_dec(r->anchored_substr);
4887 if (r->anchored_utf8)
4888 SvREFCNT_dec(r->anchored_utf8);
4889 if (r->float_substr)
4890 SvREFCNT_dec(r->float_substr);
4892 SvREFCNT_dec(r->float_utf8);
4893 Safefree(r->substrs);
4896 int n = r->data->count;
4897 AV* new_comppad = NULL;
4902 /* If you add a ->what type here, update the comment in regcomp.h */
4903 switch (r->data->what[n]) {
4905 SvREFCNT_dec((SV*)r->data->data[n]);
4908 Safefree(r->data->data[n]);
4911 new_comppad = (AV*)r->data->data[n];
4914 if (new_comppad == NULL)
4915 Perl_croak(aTHX_ "panic: pregfree comppad");
4916 old_comppad = PL_comppad;
4917 old_curpad = PL_curpad;
4918 /* Watch out for global destruction's random ordering. */
4919 if (SvTYPE(new_comppad) == SVt_PVAV) {
4920 PL_comppad = new_comppad;
4921 PL_curpad = AvARRAY(new_comppad);
4926 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4927 op_free((OP_4tree*)r->data->data[n]);
4930 PL_comppad = old_comppad;
4931 PL_curpad = old_curpad;
4932 SvREFCNT_dec((SV*)new_comppad);
4938 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4941 Safefree(r->data->what);
4944 Safefree(r->startp);
4950 - regnext - dig the "next" pointer out of a node
4952 * [Note, when REGALIGN is defined there are two places in regmatch()
4953 * that bypass this code for speed.]
4956 Perl_regnext(pTHX_ register regnode *p)
4958 register I32 offset;
4960 if (p == &PL_regdummy)
4963 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4971 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
4974 STRLEN l1 = strlen(pat1);
4975 STRLEN l2 = strlen(pat2);
4984 Copy(pat1, buf, l1 , char);
4985 Copy(pat2, buf + l1, l2 , char);
4986 buf[l1 + l2] = '\n';
4987 buf[l1 + l2 + 1] = '\0';
4989 /* ANSI variant takes additional second argument */
4990 va_start(args, pat2);
4994 msv = vmess(buf, &args);
4996 message = SvPV(msv,l1);
4999 Copy(message, buf, l1 , char);
5000 buf[l1] = '\0'; /* Overwrite \n */
5001 Perl_croak(aTHX_ "%s", buf);
5004 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5007 Perl_save_re_context(pTHX)
5010 SAVEPPTR(RExC_precomp); /* uncompiled string. */
5011 SAVEI32(RExC_npar); /* () count. */
5012 SAVEI32(RExC_size); /* Code size. */
5013 SAVEI16(RExC_flags16); /* are we folding, multilining? */
5014 SAVEVPTR(RExC_rx); /* from regcomp.c */
5015 SAVEI32(RExC_seen); /* from regcomp.c */
5016 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
5017 SAVEI32(RExC_naughty); /* How bad is this pattern? */
5018 SAVEVPTR(RExC_emit); /* Code-emit pointer; ®dummy = don't */
5019 SAVEPPTR(RExC_end); /* End of input for compile */
5020 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
5023 SAVEI32(PL_reg_flags); /* from regexec.c */
5025 SAVEPPTR(PL_reginput); /* String-input pointer. */
5026 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5027 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
5028 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5029 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5030 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
5031 SAVEPPTR(PL_regtill); /* How far we are required to go. */
5032 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
5033 PL_reg_start_tmp = 0;
5034 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5035 PL_reg_start_tmpl = 0;
5036 SAVEVPTR(PL_regdata);
5037 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5038 SAVEI32(PL_regnarrate); /* from regexec.c */
5039 SAVEVPTR(PL_regprogram); /* from regexec.c */
5040 SAVEINT(PL_regindent); /* from regexec.c */
5041 SAVEVPTR(PL_regcc); /* from regexec.c */
5042 SAVEVPTR(PL_curcop);
5043 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5044 SAVEVPTR(PL_reg_re); /* from regexec.c */
5045 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5046 SAVESPTR(PL_reg_sv); /* from regexec.c */
5047 SAVEI8(PL_reg_match_utf8); /* from regexec.c */
5048 SAVEVPTR(PL_reg_magic); /* from regexec.c */
5049 SAVEI32(PL_reg_oldpos); /* from regexec.c */
5050 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5051 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5052 SAVEI32(PL_regnpar); /* () count. */
5053 SAVEI32(PL_regsize); /* from regexec.c */
5055 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
5060 clear_re(pTHX_ void *r)
5062 ReREFCNT_dec((regexp *)r);