5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_pregcomp my_regcomp
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_pregfree my_regfree
39 # define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 # define Perl_regnext my_regnext
42 # define Perl_save_re_context my_save_re_context
43 # define Perl_reginitcolors my_reginitcolors
45 # define PERL_NO_GET_CONTEXT
50 * pregcomp and pregexec -- regsub and regerror are not used in perl
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (c) 1991-2002, Larry Wall
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
83 #define PERL_IN_REGCOMP_C
86 #ifndef PERL_IN_XSUB_RE
98 # if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
107 #define STATIC static
110 typedef struct RExC_state_t {
111 U32 flags; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
114 char *start; /* Start of input for compile */
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
118 regnode *emit_start; /* Start of emitted-code area */
119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
130 char *starttry; /* -Dr: where regtry was called. */
131 #define RExC_starttry (pRExC_state->starttry)
135 #define RExC_flags (pRExC_state->flags)
136 #define RExC_precomp (pRExC_state->precomp)
137 #define RExC_rx (pRExC_state->rx)
138 #define RExC_start (pRExC_state->start)
139 #define RExC_end (pRExC_state->end)
140 #define RExC_parse (pRExC_state->parse)
141 #define RExC_whilem_seen (pRExC_state->whilem_seen)
142 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty (pRExC_state->naughty)
146 #define RExC_sawback (pRExC_state->sawback)
147 #define RExC_seen (pRExC_state->seen)
148 #define RExC_size (pRExC_state->size)
149 #define RExC_npar (pRExC_state->npar)
150 #define RExC_extralen (pRExC_state->extralen)
151 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8 (pRExC_state->utf8)
155 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
160 #undef SPSTART /* dratted cpp namespace... */
163 * Flags to be passed up and down.
165 #define WORST 0 /* Worst case. */
166 #define HASWIDTH 0x1 /* Known to match non-null strings. */
167 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART 0x4 /* Starts with * or +. */
169 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
171 /* Length of a variant. */
173 typedef struct scan_data_t {
179 I32 last_end; /* min value, <0 unless valid. */
182 SV **longest; /* Either &l_fixed, or &l_float. */
186 I32 offset_float_min;
187 I32 offset_float_max;
191 struct regnode_charclass_class *start_class;
195 * Forward declarations for pregcomp()'s friends.
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL 0x1
203 #define SF_BEFORE_MEOL 0x2
204 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
208 # define SF_FIX_SHIFT_EOL (0+2)
209 # define SF_FL_SHIFT_EOL (0+4)
211 # define SF_FIX_SHIFT_EOL (+2)
212 # define SF_FL_SHIFT_EOL (+4)
215 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
218 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF 0x40
221 #define SF_HAS_PAR 0x80
222 #define SF_IN_PAR 0x100
223 #define SF_HAS_EVAL 0x200
224 #define SCF_DO_SUBSTR 0x400
225 #define SCF_DO_STCLASS_AND 0x0800
226 #define SCF_DO_STCLASS_OR 0x1000
227 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS 0x2000
230 #define UTF (RExC_utf8 != 0)
231 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
232 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
234 #define OOB_UNICODE 12345678
235 #define OOB_NAMEDCLASS -1
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
249 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
259 #define FAIL(msg) STMT_START { \
260 char *ellipses = ""; \
261 IV len = RExC_end - RExC_precomp; \
264 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
265 if (len > RegexLengthToShowInErrorMessages) { \
266 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
267 len = RegexLengthToShowInErrorMessages - 10; \
270 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
271 msg, (int)len, RExC_precomp, ellipses); \
275 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
276 * args. Show regex, up to a maximum length. If it's too long, chop and add
279 #define FAIL2(pat,msg) STMT_START { \
280 char *ellipses = ""; \
281 IV len = RExC_end - RExC_precomp; \
284 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
285 if (len > RegexLengthToShowInErrorMessages) { \
286 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
287 len = RegexLengthToShowInErrorMessages - 10; \
290 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
291 msg, (int)len, RExC_precomp, ellipses); \
296 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
298 #define Simple_vFAIL(m) STMT_START { \
299 IV offset = RExC_parse - RExC_precomp; \
300 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
301 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
305 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
307 #define vFAIL(m) STMT_START { \
309 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
314 * Like Simple_vFAIL(), but accepts two arguments.
316 #define Simple_vFAIL2(m,a1) STMT_START { \
317 IV offset = RExC_parse - RExC_precomp; \
318 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
319 (int)offset, RExC_precomp, RExC_precomp + offset); \
323 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
325 #define vFAIL2(m,a1) STMT_START { \
327 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
328 Simple_vFAIL2(m, a1); \
333 * Like Simple_vFAIL(), but accepts three arguments.
335 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
336 IV offset = RExC_parse - RExC_precomp; \
337 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
338 (int)offset, RExC_precomp, RExC_precomp + offset); \
342 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
344 #define vFAIL3(m,a1,a2) STMT_START { \
346 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
347 Simple_vFAIL3(m, a1, a2); \
351 * Like Simple_vFAIL(), but accepts four arguments.
353 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
354 IV offset = RExC_parse - RExC_precomp; \
355 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
356 (int)offset, RExC_precomp, RExC_precomp + offset); \
360 * Like Simple_vFAIL(), but accepts five arguments.
362 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
363 IV offset = RExC_parse - RExC_precomp; \
364 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
365 (int)offset, RExC_precomp, RExC_precomp + offset); \
369 #define vWARN(loc,m) STMT_START { \
370 IV offset = loc - RExC_precomp; \
371 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
372 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
375 #define vWARNdep(loc,m) STMT_START { \
376 IV offset = loc - RExC_precomp; \
377 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
378 "%s" REPORT_LOCATION, \
379 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
383 #define vWARN2(loc, m, a1) STMT_START { \
384 IV offset = loc - RExC_precomp; \
385 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
386 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
389 #define vWARN3(loc, m, a1, a2) STMT_START { \
390 IV offset = loc - RExC_precomp; \
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
395 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
396 IV offset = loc - RExC_precomp; \
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
401 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
402 IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
408 /* Allow for side effects in s */
409 #define REGC(c,s) STMT_START { \
410 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
413 /* Macros for recording node offsets. 20001227 mjd@plover.com
414 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
415 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
416 * Element 0 holds the number n.
419 #define MJD_OFFSET_DEBUG(x)
420 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
423 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
425 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
426 __LINE__, (node), (byte))); \
428 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
430 RExC_offsets[2*(node)-1] = (byte); \
435 #define Set_Node_Offset(node,byte) \
436 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
439 #define Set_Node_Length_To_R(node,len) STMT_START { \
441 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
442 __LINE__, (node), (len))); \
444 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
446 RExC_offsets[2*(node)] = (len); \
451 #define Set_Node_Length(node,len) \
452 Set_Node_Length_To_R((node)-RExC_emit_start, len)
453 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454 #define Set_Node_Cur_Length(node) \
455 Set_Node_Length(node, RExC_parse - parse_start)
457 /* Get offsets and lengths */
458 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
461 static void clear_re(pTHX_ void *r);
463 /* Mark that we cannot extend a found fixed substring at this point.
464 Updata the longest found anchored substring and the longest found
465 floating substrings if needed. */
468 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
470 STRLEN l = CHR_SVLEN(data->last_found);
471 STRLEN old_l = CHR_SVLEN(*data->longest);
473 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
474 sv_setsv(*data->longest, data->last_found);
475 if (*data->longest == data->longest_fixed) {
476 data->offset_fixed = l ? data->last_start_min : data->pos_min;
477 if (data->flags & SF_BEFORE_EOL)
479 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
481 data->flags &= ~SF_FIX_BEFORE_EOL;
484 data->offset_float_min = l ? data->last_start_min : data->pos_min;
485 data->offset_float_max = (l
486 ? data->last_start_max
487 : data->pos_min + data->pos_delta);
488 if ((U32)data->offset_float_max > (U32)I32_MAX)
489 data->offset_float_max = I32_MAX;
490 if (data->flags & SF_BEFORE_EOL)
492 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
494 data->flags &= ~SF_FL_BEFORE_EOL;
497 SvCUR_set(data->last_found, 0);
499 data->flags &= ~SF_BEFORE_EOL;
502 /* Can match anything (initialization) */
504 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
506 ANYOF_CLASS_ZERO(cl);
507 ANYOF_BITMAP_SETALL(cl);
508 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
510 cl->flags |= ANYOF_LOCALE;
513 /* Can match anything (initialization) */
515 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
519 for (value = 0; value <= ANYOF_MAX; value += 2)
520 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
522 if (!(cl->flags & ANYOF_UNICODE_ALL))
524 if (!ANYOF_BITMAP_TESTALLSET(cl))
529 /* Can match anything (initialization) */
531 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
533 Zero(cl, 1, struct regnode_charclass_class);
535 cl_anything(pRExC_state, cl);
539 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
541 Zero(cl, 1, struct regnode_charclass_class);
543 cl_anything(pRExC_state, cl);
545 cl->flags |= ANYOF_LOCALE;
548 /* 'And' a given class with another one. Can create false positives */
549 /* We assume that cl is not inverted */
551 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
552 struct regnode_charclass_class *and_with)
554 if (!(and_with->flags & ANYOF_CLASS)
555 && !(cl->flags & ANYOF_CLASS)
556 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
557 && !(and_with->flags & ANYOF_FOLD)
558 && !(cl->flags & ANYOF_FOLD)) {
561 if (and_with->flags & ANYOF_INVERT)
562 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
563 cl->bitmap[i] &= ~and_with->bitmap[i];
565 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
566 cl->bitmap[i] &= and_with->bitmap[i];
567 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
568 if (!(and_with->flags & ANYOF_EOS))
569 cl->flags &= ~ANYOF_EOS;
571 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
572 cl->flags &= ~ANYOF_UNICODE_ALL;
573 cl->flags |= ANYOF_UNICODE;
574 ARG_SET(cl, ARG(and_with));
576 if (!(and_with->flags & ANYOF_UNICODE_ALL))
577 cl->flags &= ~ANYOF_UNICODE_ALL;
578 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
579 cl->flags &= ~ANYOF_UNICODE;
582 /* 'OR' a given class with another one. Can create false positives */
583 /* We assume that cl is not inverted */
585 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
587 if (or_with->flags & ANYOF_INVERT) {
589 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
590 * <= (B1 | !B2) | (CL1 | !CL2)
591 * which is wasteful if CL2 is small, but we ignore CL2:
592 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
593 * XXXX Can we handle case-fold? Unclear:
594 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
595 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
597 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
598 && !(or_with->flags & ANYOF_FOLD)
599 && !(cl->flags & ANYOF_FOLD) ) {
602 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
603 cl->bitmap[i] |= ~or_with->bitmap[i];
604 } /* XXXX: logic is complicated otherwise */
606 cl_anything(pRExC_state, cl);
609 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
610 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
611 && (!(or_with->flags & ANYOF_FOLD)
612 || (cl->flags & ANYOF_FOLD)) ) {
615 /* OR char bitmap and class bitmap separately */
616 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
617 cl->bitmap[i] |= or_with->bitmap[i];
618 if (or_with->flags & ANYOF_CLASS) {
619 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
620 cl->classflags[i] |= or_with->classflags[i];
621 cl->flags |= ANYOF_CLASS;
624 else { /* XXXX: logic is complicated, leave it along for a moment. */
625 cl_anything(pRExC_state, cl);
628 if (or_with->flags & ANYOF_EOS)
629 cl->flags |= ANYOF_EOS;
631 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
632 ARG(cl) != ARG(or_with)) {
633 cl->flags |= ANYOF_UNICODE_ALL;
634 cl->flags &= ~ANYOF_UNICODE;
636 if (or_with->flags & ANYOF_UNICODE_ALL) {
637 cl->flags |= ANYOF_UNICODE_ALL;
638 cl->flags &= ~ANYOF_UNICODE;
643 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
644 * These need to be revisited when a newer toolchain becomes available.
646 #if defined(__sparc64__) && defined(__GNUC__)
647 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
648 # undef SPARC64_GCC_WORKAROUND
649 # define SPARC64_GCC_WORKAROUND 1
653 /* REx optimizer. Converts nodes into quickier variants "in place".
654 Finds fixed substrings. */
656 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
657 to the position after last scanned or to NULL. */
660 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
661 /* scanp: Start here (read-write). */
662 /* deltap: Write maxlen-minlen here. */
663 /* last: Stop before this one. */
665 I32 min = 0, pars = 0, code;
666 regnode *scan = *scanp, *next;
668 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
669 int is_inf_internal = 0; /* The studied chunk is infinite */
670 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
671 scan_data_t data_fake;
672 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
674 while (scan && OP(scan) != END && scan < last) {
675 /* Peephole optimizer: */
677 if (PL_regkind[(U8)OP(scan)] == EXACT) {
678 /* Merge several consecutive EXACTish nodes into one. */
679 regnode *n = regnext(scan);
682 regnode *stop = scan;
685 next = scan + NODE_SZ_STR(scan);
686 /* Skip NOTHING, merge EXACT*. */
688 ( PL_regkind[(U8)OP(n)] == NOTHING ||
689 (stringok && (OP(n) == OP(scan))))
691 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
692 if (OP(n) == TAIL || n > next)
694 if (PL_regkind[(U8)OP(n)] == NOTHING) {
695 NEXT_OFF(scan) += NEXT_OFF(n);
696 next = n + NODE_STEP_REGNODE;
704 int oldl = STR_LEN(scan);
705 regnode *nnext = regnext(n);
707 if (oldl + STR_LEN(n) > U8_MAX)
709 NEXT_OFF(scan) += NEXT_OFF(n);
710 STR_LEN(scan) += STR_LEN(n);
711 next = n + NODE_SZ_STR(n);
712 /* Now we can overwrite *n : */
713 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
721 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
723 Two problematic code points in Unicode casefolding of EXACT nodes:
725 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
726 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
732 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
733 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
735 This means that in case-insensitive matching (or "loose matching",
736 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
737 length of the above casefolded versions) can match a target string
738 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
739 This would rather mess up the minimum length computation.
741 What we'll do is to look for the tail four bytes, and then peek
742 at the preceding two bytes to see whether we need to decrease
743 the minimum length by four (six minus two).
745 Thanks to the design of UTF-8, there cannot be false matches:
746 A sequence of valid UTF-8 bytes cannot be a subsequence of
747 another valid sequence of UTF-8 bytes.
750 char *s0 = STRING(scan), *s, *t;
751 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
752 char *t0 = "\xcc\x88\xcc\x81";
756 s < s2 && (t = ninstr(s, s1, t0, t1));
758 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
759 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
766 n = scan + NODE_SZ_STR(scan);
768 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
776 /* Follow the next-chain of the current node and optimize
777 away all the NOTHINGs from it. */
778 if (OP(scan) != CURLYX) {
779 int max = (reg_off_by_arg[OP(scan)]
781 /* I32 may be smaller than U16 on CRAYs! */
782 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
783 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
787 /* Skip NOTHING and LONGJMP. */
788 while ((n = regnext(n))
789 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
790 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
793 if (reg_off_by_arg[OP(scan)])
796 NEXT_OFF(scan) = off;
798 /* The principal pseudo-switch. Cannot be a switch, since we
799 look into several different things. */
800 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
801 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
802 next = regnext(scan);
805 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
806 I32 max1 = 0, min1 = I32_MAX, num = 0;
807 struct regnode_charclass_class accum;
809 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
810 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
811 if (flags & SCF_DO_STCLASS)
812 cl_init_zero(pRExC_state, &accum);
813 while (OP(scan) == code) {
814 I32 deltanext, minnext, f = 0, fake;
815 struct regnode_charclass_class this_class;
820 data_fake.whilem_c = data->whilem_c;
821 data_fake.last_closep = data->last_closep;
824 data_fake.last_closep = &fake;
825 next = regnext(scan);
826 scan = NEXTOPER(scan);
828 scan = NEXTOPER(scan);
829 if (flags & SCF_DO_STCLASS) {
830 cl_init(pRExC_state, &this_class);
831 data_fake.start_class = &this_class;
832 f = SCF_DO_STCLASS_AND;
834 if (flags & SCF_WHILEM_VISITED_POS)
835 f |= SCF_WHILEM_VISITED_POS;
836 /* we suppose the run is continuous, last=next...*/
837 minnext = study_chunk(pRExC_state, &scan, &deltanext,
838 next, &data_fake, f);
841 if (max1 < minnext + deltanext)
842 max1 = minnext + deltanext;
843 if (deltanext == I32_MAX)
844 is_inf = is_inf_internal = 1;
846 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
848 if (data && (data_fake.flags & SF_HAS_EVAL))
849 data->flags |= SF_HAS_EVAL;
851 data->whilem_c = data_fake.whilem_c;
852 if (flags & SCF_DO_STCLASS)
853 cl_or(pRExC_state, &accum, &this_class);
857 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
859 if (flags & SCF_DO_SUBSTR) {
860 data->pos_min += min1;
861 data->pos_delta += max1 - min1;
862 if (max1 != min1 || is_inf)
863 data->longest = &(data->longest_float);
866 delta += max1 - min1;
867 if (flags & SCF_DO_STCLASS_OR) {
868 cl_or(pRExC_state, data->start_class, &accum);
870 cl_and(data->start_class, &and_with);
871 flags &= ~SCF_DO_STCLASS;
874 else if (flags & SCF_DO_STCLASS_AND) {
876 cl_and(data->start_class, &accum);
877 flags &= ~SCF_DO_STCLASS;
880 /* Switch to OR mode: cache the old value of
881 * data->start_class */
882 StructCopy(data->start_class, &and_with,
883 struct regnode_charclass_class);
884 flags &= ~SCF_DO_STCLASS_AND;
885 StructCopy(&accum, data->start_class,
886 struct regnode_charclass_class);
887 flags |= SCF_DO_STCLASS_OR;
888 data->start_class->flags |= ANYOF_EOS;
892 else if (code == BRANCHJ) /* single branch is optimized. */
893 scan = NEXTOPER(NEXTOPER(scan));
894 else /* single branch is optimized. */
895 scan = NEXTOPER(scan);
898 else if (OP(scan) == EXACT) {
899 I32 l = STR_LEN(scan);
900 UV uc = *((U8*)STRING(scan));
902 U8 *s = (U8*)STRING(scan);
903 l = utf8_length(s, s + l);
904 uc = utf8_to_uvchr(s, NULL);
907 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
908 /* The code below prefers earlier match for fixed
909 offset, later match for variable offset. */
910 if (data->last_end == -1) { /* Update the start info. */
911 data->last_start_min = data->pos_min;
912 data->last_start_max = is_inf
913 ? I32_MAX : data->pos_min + data->pos_delta;
915 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
917 SvUTF8_on(data->last_found);
918 data->last_end = data->pos_min + l;
919 data->pos_min += l; /* As in the first entry. */
920 data->flags &= ~SF_BEFORE_EOL;
922 if (flags & SCF_DO_STCLASS_AND) {
923 /* Check whether it is compatible with what we know already! */
927 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
928 && !ANYOF_BITMAP_TEST(data->start_class, uc)
929 && (!(data->start_class->flags & ANYOF_FOLD)
930 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
933 ANYOF_CLASS_ZERO(data->start_class);
934 ANYOF_BITMAP_ZERO(data->start_class);
936 ANYOF_BITMAP_SET(data->start_class, uc);
937 data->start_class->flags &= ~ANYOF_EOS;
939 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
941 else if (flags & SCF_DO_STCLASS_OR) {
942 /* false positive possible if the class is case-folded */
944 ANYOF_BITMAP_SET(data->start_class, uc);
946 data->start_class->flags |= ANYOF_UNICODE_ALL;
947 data->start_class->flags &= ~ANYOF_EOS;
948 cl_and(data->start_class, &and_with);
950 flags &= ~SCF_DO_STCLASS;
952 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
953 I32 l = STR_LEN(scan);
954 UV uc = *((U8*)STRING(scan));
956 /* Search for fixed substrings supports EXACT only. */
957 if (flags & SCF_DO_SUBSTR)
958 scan_commit(pRExC_state, data);
960 U8 *s = (U8 *)STRING(scan);
961 l = utf8_length(s, s + l);
962 uc = utf8_to_uvchr(s, NULL);
965 if (data && (flags & SCF_DO_SUBSTR))
967 if (flags & SCF_DO_STCLASS_AND) {
968 /* Check whether it is compatible with what we know already! */
972 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
973 && !ANYOF_BITMAP_TEST(data->start_class, uc)
974 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
976 ANYOF_CLASS_ZERO(data->start_class);
977 ANYOF_BITMAP_ZERO(data->start_class);
979 ANYOF_BITMAP_SET(data->start_class, uc);
980 data->start_class->flags &= ~ANYOF_EOS;
981 data->start_class->flags |= ANYOF_FOLD;
982 if (OP(scan) == EXACTFL)
983 data->start_class->flags |= ANYOF_LOCALE;
986 else if (flags & SCF_DO_STCLASS_OR) {
987 if (data->start_class->flags & ANYOF_FOLD) {
988 /* false positive possible if the class is case-folded.
989 Assume that the locale settings are the same... */
991 ANYOF_BITMAP_SET(data->start_class, uc);
992 data->start_class->flags &= ~ANYOF_EOS;
994 cl_and(data->start_class, &and_with);
996 flags &= ~SCF_DO_STCLASS;
998 else if (strchr((char*)PL_varies,OP(scan))) {
999 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1000 I32 f = flags, pos_before = 0;
1001 regnode *oscan = scan;
1002 struct regnode_charclass_class this_class;
1003 struct regnode_charclass_class *oclass = NULL;
1004 I32 next_is_eval = 0;
1006 switch (PL_regkind[(U8)OP(scan)]) {
1007 case WHILEM: /* End of (?:...)* . */
1008 scan = NEXTOPER(scan);
1011 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1012 next = NEXTOPER(scan);
1013 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1015 maxcount = REG_INFTY;
1016 next = regnext(scan);
1017 scan = NEXTOPER(scan);
1021 if (flags & SCF_DO_SUBSTR)
1026 if (flags & SCF_DO_STCLASS) {
1028 maxcount = REG_INFTY;
1029 next = regnext(scan);
1030 scan = NEXTOPER(scan);
1033 is_inf = is_inf_internal = 1;
1034 scan = regnext(scan);
1035 if (flags & SCF_DO_SUBSTR) {
1036 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1037 data->longest = &(data->longest_float);
1039 goto optimize_curly_tail;
1041 mincount = ARG1(scan);
1042 maxcount = ARG2(scan);
1043 next = regnext(scan);
1044 if (OP(scan) == CURLYX) {
1045 I32 lp = (data ? *(data->last_closep) : 0);
1047 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1049 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1050 next_is_eval = (OP(scan) == EVAL);
1052 if (flags & SCF_DO_SUBSTR) {
1053 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1054 pos_before = data->pos_min;
1058 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1060 data->flags |= SF_IS_INF;
1062 if (flags & SCF_DO_STCLASS) {
1063 cl_init(pRExC_state, &this_class);
1064 oclass = data->start_class;
1065 data->start_class = &this_class;
1066 f |= SCF_DO_STCLASS_AND;
1067 f &= ~SCF_DO_STCLASS_OR;
1069 /* These are the cases when once a subexpression
1070 fails at a particular position, it cannot succeed
1071 even after backtracking at the enclosing scope.
1073 XXXX what if minimal match and we are at the
1074 initial run of {n,m}? */
1075 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1076 f &= ~SCF_WHILEM_VISITED_POS;
1078 /* This will finish on WHILEM, setting scan, or on NULL: */
1079 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1081 ? (f & ~SCF_DO_SUBSTR) : f);
1083 if (flags & SCF_DO_STCLASS)
1084 data->start_class = oclass;
1085 if (mincount == 0 || minnext == 0) {
1086 if (flags & SCF_DO_STCLASS_OR) {
1087 cl_or(pRExC_state, data->start_class, &this_class);
1089 else if (flags & SCF_DO_STCLASS_AND) {
1090 /* Switch to OR mode: cache the old value of
1091 * data->start_class */
1092 StructCopy(data->start_class, &and_with,
1093 struct regnode_charclass_class);
1094 flags &= ~SCF_DO_STCLASS_AND;
1095 StructCopy(&this_class, data->start_class,
1096 struct regnode_charclass_class);
1097 flags |= SCF_DO_STCLASS_OR;
1098 data->start_class->flags |= ANYOF_EOS;
1100 } else { /* Non-zero len */
1101 if (flags & SCF_DO_STCLASS_OR) {
1102 cl_or(pRExC_state, data->start_class, &this_class);
1103 cl_and(data->start_class, &and_with);
1105 else if (flags & SCF_DO_STCLASS_AND)
1106 cl_and(data->start_class, &this_class);
1107 flags &= ~SCF_DO_STCLASS;
1109 if (!scan) /* It was not CURLYX, but CURLY. */
1111 if (ckWARN(WARN_REGEXP)
1112 /* ? quantifier ok, except for (?{ ... }) */
1113 && (next_is_eval || !(mincount == 0 && maxcount == 1))
1114 && (minnext == 0) && (deltanext == 0)
1115 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1116 && maxcount <= REG_INFTY/3) /* Complement check for big count */
1119 "Quantifier unexpected on zero-length expression");
1122 min += minnext * mincount;
1123 is_inf_internal |= ((maxcount == REG_INFTY
1124 && (minnext + deltanext) > 0)
1125 || deltanext == I32_MAX);
1126 is_inf |= is_inf_internal;
1127 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1129 /* Try powerful optimization CURLYX => CURLYN. */
1130 if ( OP(oscan) == CURLYX && data
1131 && data->flags & SF_IN_PAR
1132 && !(data->flags & SF_HAS_EVAL)
1133 && !deltanext && minnext == 1 ) {
1134 /* Try to optimize to CURLYN. */
1135 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1136 regnode *nxt1 = nxt;
1143 if (!strchr((char*)PL_simple,OP(nxt))
1144 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1145 && STR_LEN(nxt) == 1))
1151 if (OP(nxt) != CLOSE)
1153 /* Now we know that nxt2 is the only contents: */
1154 oscan->flags = (U8)ARG(nxt);
1156 OP(nxt1) = NOTHING; /* was OPEN. */
1158 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1159 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1160 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1161 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1162 OP(nxt + 1) = OPTIMIZED; /* was count. */
1163 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1168 /* Try optimization CURLYX => CURLYM. */
1169 if ( OP(oscan) == CURLYX && data
1170 && !(data->flags & SF_HAS_PAR)
1171 && !(data->flags & SF_HAS_EVAL)
1173 /* XXXX How to optimize if data == 0? */
1174 /* Optimize to a simpler form. */
1175 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1179 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1180 && (OP(nxt2) != WHILEM))
1182 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1183 /* Need to optimize away parenths. */
1184 if (data->flags & SF_IN_PAR) {
1185 /* Set the parenth number. */
1186 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1188 if (OP(nxt) != CLOSE)
1189 FAIL("Panic opt close");
1190 oscan->flags = (U8)ARG(nxt);
1191 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1192 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1194 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1195 OP(nxt + 1) = OPTIMIZED; /* was count. */
1196 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1197 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1200 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1201 regnode *nnxt = regnext(nxt1);
1204 if (reg_off_by_arg[OP(nxt1)])
1205 ARG_SET(nxt1, nxt2 - nxt1);
1206 else if (nxt2 - nxt1 < U16_MAX)
1207 NEXT_OFF(nxt1) = nxt2 - nxt1;
1209 OP(nxt) = NOTHING; /* Cannot beautify */
1214 /* Optimize again: */
1215 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1221 else if ((OP(oscan) == CURLYX)
1222 && (flags & SCF_WHILEM_VISITED_POS)
1223 /* See the comment on a similar expression above.
1224 However, this time it not a subexpression
1225 we care about, but the expression itself. */
1226 && (maxcount == REG_INFTY)
1227 && data && ++data->whilem_c < 16) {
1228 /* This stays as CURLYX, we can put the count/of pair. */
1229 /* Find WHILEM (as in regexec.c) */
1230 regnode *nxt = oscan + NEXT_OFF(oscan);
1232 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1234 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1235 | (RExC_whilem_seen << 4)); /* On WHILEM */
1237 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1239 if (flags & SCF_DO_SUBSTR) {
1240 SV *last_str = Nullsv;
1241 int counted = mincount != 0;
1243 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1244 #if defined(SPARC64_GCC_WORKAROUND)
1250 if (pos_before >= data->last_start_min)
1253 b = data->last_start_min;
1256 s = SvPV(data->last_found, l);
1257 old = b - data->last_start_min;
1260 I32 b = pos_before >= data->last_start_min
1261 ? pos_before : data->last_start_min;
1263 char *s = SvPV(data->last_found, l);
1264 I32 old = b - data->last_start_min;
1268 old = utf8_hop((U8*)s, old) - (U8*)s;
1271 /* Get the added string: */
1272 last_str = newSVpvn(s + old, l);
1273 if (deltanext == 0 && pos_before == b) {
1274 /* What was added is a constant string */
1276 SvGROW(last_str, (mincount * l) + 1);
1277 repeatcpy(SvPVX(last_str) + l,
1278 SvPVX(last_str), l, mincount - 1);
1279 SvCUR(last_str) *= mincount;
1280 /* Add additional parts. */
1281 SvCUR_set(data->last_found,
1282 SvCUR(data->last_found) - l);
1283 sv_catsv(data->last_found, last_str);
1284 data->last_end += l * (mincount - 1);
1287 /* start offset must point into the last copy */
1288 data->last_start_min += minnext * (mincount - 1);
1289 data->last_start_max += is_inf ? I32_MAX
1290 : (maxcount - 1) * (minnext + data->pos_delta);
1293 /* It is counted once already... */
1294 data->pos_min += minnext * (mincount - counted);
1295 data->pos_delta += - counted * deltanext +
1296 (minnext + deltanext) * maxcount - minnext * mincount;
1297 if (mincount != maxcount) {
1298 /* Cannot extend fixed substrings found inside
1300 scan_commit(pRExC_state,data);
1301 if (mincount && last_str) {
1302 sv_setsv(data->last_found, last_str);
1303 data->last_end = data->pos_min;
1304 data->last_start_min =
1305 data->pos_min - CHR_SVLEN(last_str);
1306 data->last_start_max = is_inf
1308 : data->pos_min + data->pos_delta
1309 - CHR_SVLEN(last_str);
1311 data->longest = &(data->longest_float);
1313 SvREFCNT_dec(last_str);
1315 if (data && (fl & SF_HAS_EVAL))
1316 data->flags |= SF_HAS_EVAL;
1317 optimize_curly_tail:
1318 if (OP(oscan) != CURLYX) {
1319 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1321 NEXT_OFF(oscan) += NEXT_OFF(next);
1324 default: /* REF and CLUMP only? */
1325 if (flags & SCF_DO_SUBSTR) {
1326 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1327 data->longest = &(data->longest_float);
1329 is_inf = is_inf_internal = 1;
1330 if (flags & SCF_DO_STCLASS_OR)
1331 cl_anything(pRExC_state, data->start_class);
1332 flags &= ~SCF_DO_STCLASS;
1336 else if (strchr((char*)PL_simple,OP(scan))) {
1339 if (flags & SCF_DO_SUBSTR) {
1340 scan_commit(pRExC_state,data);
1344 if (flags & SCF_DO_STCLASS) {
1345 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1347 /* Some of the logic below assumes that switching
1348 locale on will only add false positives. */
1349 switch (PL_regkind[(U8)OP(scan)]) {
1353 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1354 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1355 cl_anything(pRExC_state, data->start_class);
1358 if (OP(scan) == SANY)
1360 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1361 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1362 || (data->start_class->flags & ANYOF_CLASS));
1363 cl_anything(pRExC_state, data->start_class);
1365 if (flags & SCF_DO_STCLASS_AND || !value)
1366 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1369 if (flags & SCF_DO_STCLASS_AND)
1370 cl_and(data->start_class,
1371 (struct regnode_charclass_class*)scan);
1373 cl_or(pRExC_state, data->start_class,
1374 (struct regnode_charclass_class*)scan);
1377 if (flags & SCF_DO_STCLASS_AND) {
1378 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1379 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1380 for (value = 0; value < 256; value++)
1381 if (!isALNUM(value))
1382 ANYOF_BITMAP_CLEAR(data->start_class, value);
1386 if (data->start_class->flags & ANYOF_LOCALE)
1387 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1389 for (value = 0; value < 256; value++)
1391 ANYOF_BITMAP_SET(data->start_class, value);
1396 if (flags & SCF_DO_STCLASS_AND) {
1397 if (data->start_class->flags & ANYOF_LOCALE)
1398 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1401 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1402 data->start_class->flags |= ANYOF_LOCALE;
1406 if (flags & SCF_DO_STCLASS_AND) {
1407 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1408 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1409 for (value = 0; value < 256; value++)
1411 ANYOF_BITMAP_CLEAR(data->start_class, value);
1415 if (data->start_class->flags & ANYOF_LOCALE)
1416 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1418 for (value = 0; value < 256; value++)
1419 if (!isALNUM(value))
1420 ANYOF_BITMAP_SET(data->start_class, value);
1425 if (flags & SCF_DO_STCLASS_AND) {
1426 if (data->start_class->flags & ANYOF_LOCALE)
1427 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1430 data->start_class->flags |= ANYOF_LOCALE;
1431 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1435 if (flags & SCF_DO_STCLASS_AND) {
1436 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1437 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1438 for (value = 0; value < 256; value++)
1439 if (!isSPACE(value))
1440 ANYOF_BITMAP_CLEAR(data->start_class, value);
1444 if (data->start_class->flags & ANYOF_LOCALE)
1445 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1447 for (value = 0; value < 256; value++)
1449 ANYOF_BITMAP_SET(data->start_class, value);
1454 if (flags & SCF_DO_STCLASS_AND) {
1455 if (data->start_class->flags & ANYOF_LOCALE)
1456 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1459 data->start_class->flags |= ANYOF_LOCALE;
1460 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1464 if (flags & SCF_DO_STCLASS_AND) {
1465 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1466 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1467 for (value = 0; value < 256; value++)
1469 ANYOF_BITMAP_CLEAR(data->start_class, value);
1473 if (data->start_class->flags & ANYOF_LOCALE)
1474 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1476 for (value = 0; value < 256; value++)
1477 if (!isSPACE(value))
1478 ANYOF_BITMAP_SET(data->start_class, value);
1483 if (flags & SCF_DO_STCLASS_AND) {
1484 if (data->start_class->flags & ANYOF_LOCALE) {
1485 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1486 for (value = 0; value < 256; value++)
1487 if (!isSPACE(value))
1488 ANYOF_BITMAP_CLEAR(data->start_class, value);
1492 data->start_class->flags |= ANYOF_LOCALE;
1493 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1497 if (flags & SCF_DO_STCLASS_AND) {
1498 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1499 for (value = 0; value < 256; value++)
1500 if (!isDIGIT(value))
1501 ANYOF_BITMAP_CLEAR(data->start_class, value);
1504 if (data->start_class->flags & ANYOF_LOCALE)
1505 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1507 for (value = 0; value < 256; value++)
1509 ANYOF_BITMAP_SET(data->start_class, value);
1514 if (flags & SCF_DO_STCLASS_AND) {
1515 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1516 for (value = 0; value < 256; value++)
1518 ANYOF_BITMAP_CLEAR(data->start_class, value);
1521 if (data->start_class->flags & ANYOF_LOCALE)
1522 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1524 for (value = 0; value < 256; value++)
1525 if (!isDIGIT(value))
1526 ANYOF_BITMAP_SET(data->start_class, value);
1531 if (flags & SCF_DO_STCLASS_OR)
1532 cl_and(data->start_class, &and_with);
1533 flags &= ~SCF_DO_STCLASS;
1536 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1537 data->flags |= (OP(scan) == MEOL
1541 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1542 /* Lookbehind, or need to calculate parens/evals/stclass: */
1543 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1544 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1545 /* Lookahead/lookbehind */
1546 I32 deltanext, minnext, fake = 0;
1548 struct regnode_charclass_class intrnl;
1551 data_fake.flags = 0;
1553 data_fake.whilem_c = data->whilem_c;
1554 data_fake.last_closep = data->last_closep;
1557 data_fake.last_closep = &fake;
1558 if ( flags & SCF_DO_STCLASS && !scan->flags
1559 && OP(scan) == IFMATCH ) { /* Lookahead */
1560 cl_init(pRExC_state, &intrnl);
1561 data_fake.start_class = &intrnl;
1562 f |= SCF_DO_STCLASS_AND;
1564 if (flags & SCF_WHILEM_VISITED_POS)
1565 f |= SCF_WHILEM_VISITED_POS;
1566 next = regnext(scan);
1567 nscan = NEXTOPER(NEXTOPER(scan));
1568 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1571 vFAIL("Variable length lookbehind not implemented");
1573 else if (minnext > U8_MAX) {
1574 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1576 scan->flags = (U8)minnext;
1578 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1580 if (data && (data_fake.flags & SF_HAS_EVAL))
1581 data->flags |= SF_HAS_EVAL;
1583 data->whilem_c = data_fake.whilem_c;
1584 if (f & SCF_DO_STCLASS_AND) {
1585 int was = (data->start_class->flags & ANYOF_EOS);
1587 cl_and(data->start_class, &intrnl);
1589 data->start_class->flags |= ANYOF_EOS;
1592 else if (OP(scan) == OPEN) {
1595 else if (OP(scan) == CLOSE) {
1596 if ((I32)ARG(scan) == is_par) {
1597 next = regnext(scan);
1599 if ( next && (OP(next) != WHILEM) && next < last)
1600 is_par = 0; /* Disable optimization */
1603 *(data->last_closep) = ARG(scan);
1605 else if (OP(scan) == EVAL) {
1607 data->flags |= SF_HAS_EVAL;
1609 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1610 if (flags & SCF_DO_SUBSTR) {
1611 scan_commit(pRExC_state,data);
1612 data->longest = &(data->longest_float);
1614 is_inf = is_inf_internal = 1;
1615 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1616 cl_anything(pRExC_state, data->start_class);
1617 flags &= ~SCF_DO_STCLASS;
1619 /* Else: zero-length, ignore. */
1620 scan = regnext(scan);
1625 *deltap = is_inf_internal ? I32_MAX : delta;
1626 if (flags & SCF_DO_SUBSTR && is_inf)
1627 data->pos_delta = I32_MAX - data->pos_min;
1628 if (is_par > U8_MAX)
1630 if (is_par && pars==1 && data) {
1631 data->flags |= SF_IN_PAR;
1632 data->flags &= ~SF_HAS_PAR;
1634 else if (pars && data) {
1635 data->flags |= SF_HAS_PAR;
1636 data->flags &= ~SF_IN_PAR;
1638 if (flags & SCF_DO_STCLASS_OR)
1639 cl_and(data->start_class, &and_with);
1644 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1646 if (RExC_rx->data) {
1647 Renewc(RExC_rx->data,
1648 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1649 char, struct reg_data);
1650 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1651 RExC_rx->data->count += n;
1654 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1655 char, struct reg_data);
1656 New(1208, RExC_rx->data->what, n, U8);
1657 RExC_rx->data->count = n;
1659 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1660 return RExC_rx->data->count - n;
1664 Perl_reginitcolors(pTHX)
1667 char *s = PerlEnv_getenv("PERL_RE_COLORS");
1670 PL_colors[0] = s = savepv(s);
1672 s = strchr(s, '\t');
1678 PL_colors[i] = s = "";
1682 PL_colors[i++] = "";
1689 - pregcomp - compile a regular expression into internal code
1691 * We can't allocate space until we know how big the compiled form will be,
1692 * but we can't compile it (and thus know how big it is) until we've got a
1693 * place to put the code. So we cheat: we compile it twice, once with code
1694 * generation turned off and size counting turned on, and once "for real".
1695 * This also means that we don't allocate space until we are sure that the
1696 * thing really will compile successfully, and we never have to move the
1697 * code and thus invalidate pointers into it. (Note that it has to be in
1698 * one piece because free() must be able to free it all.) [NB: not true in perl]
1700 * Beware that the optimization-preparation code in here knows about some
1701 * of the structure of the compiled regexp. [I'll say.]
1704 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1714 RExC_state_t RExC_state;
1715 RExC_state_t *pRExC_state = &RExC_state;
1718 FAIL("NULL regexp argument");
1720 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1724 if (!PL_colorset) reginitcolors();
1725 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1726 PL_colors[4],PL_colors[5],PL_colors[0],
1727 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1729 RExC_flags = pm->op_pmflags;
1733 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1734 RExC_seen_evals = 0;
1737 /* First pass: determine size, legality. */
1744 RExC_emit = &PL_regdummy;
1745 RExC_whilem_seen = 0;
1746 #if 0 /* REGC() is (currently) a NOP at the first pass.
1747 * Clever compilers notice this and complain. --jhi */
1748 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1750 if (reg(pRExC_state, 0, &flags) == NULL) {
1751 RExC_precomp = Nullch;
1754 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1756 /* Small enough for pointer-storage convention?
1757 If extralen==0, this means that we will not need long jumps. */
1758 if (RExC_size >= 0x10000L && RExC_extralen)
1759 RExC_size += RExC_extralen;
1762 if (RExC_whilem_seen > 15)
1763 RExC_whilem_seen = 15;
1765 /* Allocate space and initialize. */
1766 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1769 FAIL("Regexp out of space");
1772 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1773 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1776 r->prelen = xend - exp;
1777 r->precomp = savepvn(RExC_precomp, r->prelen);
1779 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1780 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1782 r->substrs = 0; /* Useful during FAIL. */
1783 r->startp = 0; /* Useful during FAIL. */
1784 r->endp = 0; /* Useful during FAIL. */
1786 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1788 r->offsets[0] = RExC_size;
1790 DEBUG_r(PerlIO_printf(Perl_debug_log,
1791 "%s %"UVuf" bytes for offset annotations.\n",
1792 r->offsets ? "Got" : "Couldn't get",
1793 (UV)((2*RExC_size+1) * sizeof(U32))));
1797 /* Second pass: emit code. */
1798 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
1803 RExC_emit_start = r->program;
1804 RExC_emit = r->program;
1805 /* Store the count of eval-groups for security checks: */
1806 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1807 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1809 if (reg(pRExC_state, 0, &flags) == NULL)
1812 /* Dig out information for optimizations. */
1813 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1814 pm->op_pmflags = RExC_flags;
1816 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1817 r->regstclass = NULL;
1818 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1819 r->reganch |= ROPT_NAUGHTY;
1820 scan = r->program + 1; /* First BRANCH. */
1822 /* XXXX To minimize changes to RE engine we always allocate
1823 3-units-long substrs field. */
1824 Newz(1004, r->substrs, 1, struct reg_substr_data);
1826 StructCopy(&zero_scan_data, &data, scan_data_t);
1827 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1828 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1830 STRLEN longest_float_length, longest_fixed_length;
1831 struct regnode_charclass_class ch_class;
1836 /* Skip introductions and multiplicators >= 1. */
1837 while ((OP(first) == OPEN && (sawopen = 1)) ||
1838 /* An OR of *one* alternative - should not happen now. */
1839 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1840 (OP(first) == PLUS) ||
1841 (OP(first) == MINMOD) ||
1842 /* An {n,m} with n>0 */
1843 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1844 if (OP(first) == PLUS)
1847 first += regarglen[(U8)OP(first)];
1848 first = NEXTOPER(first);
1851 /* Starting-point info. */
1853 if (PL_regkind[(U8)OP(first)] == EXACT) {
1854 if (OP(first) == EXACT)
1855 ; /* Empty, get anchored substr later. */
1856 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1857 r->regstclass = first;
1859 else if (strchr((char*)PL_simple,OP(first)))
1860 r->regstclass = first;
1861 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1862 PL_regkind[(U8)OP(first)] == NBOUND)
1863 r->regstclass = first;
1864 else if (PL_regkind[(U8)OP(first)] == BOL) {
1865 r->reganch |= (OP(first) == MBOL
1867 : (OP(first) == SBOL
1870 first = NEXTOPER(first);
1873 else if (OP(first) == GPOS) {
1874 r->reganch |= ROPT_ANCH_GPOS;
1875 first = NEXTOPER(first);
1878 else if (!sawopen && (OP(first) == STAR &&
1879 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1880 !(r->reganch & ROPT_ANCH) )
1882 /* turn .* into ^.* with an implied $*=1 */
1883 int type = OP(NEXTOPER(first));
1885 if (type == REG_ANY)
1886 type = ROPT_ANCH_MBOL;
1888 type = ROPT_ANCH_SBOL;
1890 r->reganch |= type | ROPT_IMPLICIT;
1891 first = NEXTOPER(first);
1894 if (sawplus && (!sawopen || !RExC_sawback)
1895 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1896 /* x+ must match at the 1st pos of run of x's */
1897 r->reganch |= ROPT_SKIP;
1899 /* Scan is after the zeroth branch, first is atomic matcher. */
1900 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1901 (IV)(first - scan + 1)));
1903 * If there's something expensive in the r.e., find the
1904 * longest literal string that must appear and make it the
1905 * regmust. Resolve ties in favor of later strings, since
1906 * the regstart check works with the beginning of the r.e.
1907 * and avoiding duplication strengthens checking. Not a
1908 * strong reason, but sufficient in the absence of others.
1909 * [Now we resolve ties in favor of the earlier string if
1910 * it happens that c_offset_min has been invalidated, since the
1911 * earlier string may buy us something the later one won't.]
1915 data.longest_fixed = newSVpvn("",0);
1916 data.longest_float = newSVpvn("",0);
1917 data.last_found = newSVpvn("",0);
1918 data.longest = &(data.longest_fixed);
1920 if (!r->regstclass) {
1921 cl_init(pRExC_state, &ch_class);
1922 data.start_class = &ch_class;
1923 stclass_flag = SCF_DO_STCLASS_AND;
1924 } else /* XXXX Check for BOUND? */
1926 data.last_closep = &last_close;
1928 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1929 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1930 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1931 && data.last_start_min == 0 && data.last_end > 0
1932 && !RExC_seen_zerolen
1933 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1934 r->reganch |= ROPT_CHECK_ALL;
1935 scan_commit(pRExC_state, &data);
1936 SvREFCNT_dec(data.last_found);
1938 longest_float_length = CHR_SVLEN(data.longest_float);
1939 if (longest_float_length
1940 || (data.flags & SF_FL_BEFORE_EOL
1941 && (!(data.flags & SF_FL_BEFORE_MEOL)
1942 || (RExC_flags & PMf_MULTILINE)))) {
1945 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1946 && data.offset_fixed == data.offset_float_min
1947 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1948 goto remove_float; /* As in (a)+. */
1950 if (SvUTF8(data.longest_float)) {
1951 r->float_utf8 = data.longest_float;
1952 r->float_substr = Nullsv;
1954 r->float_substr = data.longest_float;
1955 r->float_utf8 = Nullsv;
1957 r->float_min_offset = data.offset_float_min;
1958 r->float_max_offset = data.offset_float_max;
1959 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1960 && (!(data.flags & SF_FL_BEFORE_MEOL)
1961 || (RExC_flags & PMf_MULTILINE)));
1962 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1966 r->float_substr = r->float_utf8 = Nullsv;
1967 SvREFCNT_dec(data.longest_float);
1968 longest_float_length = 0;
1971 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1972 if (longest_fixed_length
1973 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1974 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1975 || (RExC_flags & PMf_MULTILINE)))) {
1978 if (SvUTF8(data.longest_fixed)) {
1979 r->anchored_utf8 = data.longest_fixed;
1980 r->anchored_substr = Nullsv;
1982 r->anchored_substr = data.longest_fixed;
1983 r->anchored_utf8 = Nullsv;
1985 r->anchored_offset = data.offset_fixed;
1986 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1987 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1988 || (RExC_flags & PMf_MULTILINE)));
1989 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
1992 r->anchored_substr = r->anchored_utf8 = Nullsv;
1993 SvREFCNT_dec(data.longest_fixed);
1994 longest_fixed_length = 0;
1997 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
1998 r->regstclass = NULL;
1999 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2001 && !(data.start_class->flags & ANYOF_EOS)
2002 && !cl_is_anything(data.start_class))
2004 I32 n = add_data(pRExC_state, 1, "f");
2006 New(1006, RExC_rx->data->data[n], 1,
2007 struct regnode_charclass_class);
2008 StructCopy(data.start_class,
2009 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2010 struct regnode_charclass_class);
2011 r->regstclass = (regnode*)RExC_rx->data->data[n];
2012 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2013 PL_regdata = r->data; /* for regprop() */
2014 DEBUG_r({ SV *sv = sv_newmortal();
2015 regprop(sv, (regnode*)data.start_class);
2016 PerlIO_printf(Perl_debug_log,
2017 "synthetic stclass `%s'.\n",
2021 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2022 if (longest_fixed_length > longest_float_length) {
2023 r->check_substr = r->anchored_substr;
2024 r->check_utf8 = r->anchored_utf8;
2025 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2026 if (r->reganch & ROPT_ANCH_SINGLE)
2027 r->reganch |= ROPT_NOSCAN;
2030 r->check_substr = r->float_substr;
2031 r->check_utf8 = r->float_utf8;
2032 r->check_offset_min = data.offset_float_min;
2033 r->check_offset_max = data.offset_float_max;
2035 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2036 This should be changed ASAP! */
2037 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2038 r->reganch |= RE_USE_INTUIT;
2039 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2040 r->reganch |= RE_INTUIT_TAIL;
2044 /* Several toplevels. Best we can is to set minlen. */
2046 struct regnode_charclass_class ch_class;
2049 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2050 scan = r->program + 1;
2051 cl_init(pRExC_state, &ch_class);
2052 data.start_class = &ch_class;
2053 data.last_closep = &last_close;
2054 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2055 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2056 = r->float_substr = r->float_utf8 = Nullsv;
2057 if (!(data.start_class->flags & ANYOF_EOS)
2058 && !cl_is_anything(data.start_class))
2060 I32 n = add_data(pRExC_state, 1, "f");
2062 New(1006, RExC_rx->data->data[n], 1,
2063 struct regnode_charclass_class);
2064 StructCopy(data.start_class,
2065 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2066 struct regnode_charclass_class);
2067 r->regstclass = (regnode*)RExC_rx->data->data[n];
2068 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2069 DEBUG_r({ SV* sv = sv_newmortal();
2070 regprop(sv, (regnode*)data.start_class);
2071 PerlIO_printf(Perl_debug_log,
2072 "synthetic stclass `%s'.\n",
2078 if (RExC_seen & REG_SEEN_GPOS)
2079 r->reganch |= ROPT_GPOS_SEEN;
2080 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2081 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2082 if (RExC_seen & REG_SEEN_EVAL)
2083 r->reganch |= ROPT_EVAL_SEEN;
2084 if (RExC_seen & REG_SEEN_CANY)
2085 r->reganch |= ROPT_CANY_SEEN;
2086 Newz(1002, r->startp, RExC_npar, I32);
2087 Newz(1002, r->endp, RExC_npar, I32);
2088 PL_regdata = r->data; /* for regprop() */
2089 DEBUG_r(regdump(r));
2094 - reg - regular expression, i.e. main body or parenthesized thing
2096 * Caller must absorb opening parenthesis.
2098 * Combining parenthesis handling with the base level of regular expression
2099 * is a trifle forced, but the need to tie the tails of the branches to what
2100 * follows makes it hard to avoid.
2103 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2104 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2106 register regnode *ret; /* Will be the head of the group. */
2107 register regnode *br;
2108 register regnode *lastbr;
2109 register regnode *ender = 0;
2110 register I32 parno = 0;
2111 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2113 /* for (?g), (?gc), and (?o) warnings; warning
2114 about (?c) will warn about (?g) -- japhy */
2116 I32 wastedflags = 0x00,
2119 wasted_gc = 0x02 | 0x04,
2122 char * parse_start = RExC_parse; /* MJD */
2123 char *oregcomp_parse = RExC_parse;
2126 *flagp = 0; /* Tentatively. */
2129 /* Make an OPEN node, if parenthesized. */
2131 if (*RExC_parse == '?') { /* (?...) */
2132 U32 posflags = 0, negflags = 0;
2133 U32 *flagsp = &posflags;
2135 char *seqstart = RExC_parse;
2138 paren = *RExC_parse++;
2139 ret = NULL; /* For look-ahead/behind. */
2141 case '<': /* (?<...) */
2142 RExC_seen |= REG_SEEN_LOOKBEHIND;
2143 if (*RExC_parse == '!')
2145 if (*RExC_parse != '=' && *RExC_parse != '!')
2148 case '=': /* (?=...) */
2149 case '!': /* (?!...) */
2150 RExC_seen_zerolen++;
2151 case ':': /* (?:...) */
2152 case '>': /* (?>...) */
2154 case '$': /* (?$...) */
2155 case '@': /* (?@...) */
2156 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2158 case '#': /* (?#...) */
2159 while (*RExC_parse && *RExC_parse != ')')
2161 if (*RExC_parse != ')')
2162 FAIL("Sequence (?#... not terminated");
2163 nextchar(pRExC_state);
2166 case 'p': /* (?p...) */
2167 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2168 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2170 case '?': /* (??...) */
2172 if (*RExC_parse != '{')
2174 paren = *RExC_parse++;
2176 case '{': /* (?{...}) */
2178 I32 count = 1, n = 0;
2180 char *s = RExC_parse;
2182 OP_4tree *sop, *rop;
2184 RExC_seen_zerolen++;
2185 RExC_seen |= REG_SEEN_EVAL;
2186 while (count && (c = *RExC_parse)) {
2187 if (c == '\\' && RExC_parse[1])
2195 if (*RExC_parse != ')')
2198 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2203 if (RExC_parse - 1 - s)
2204 sv = newSVpvn(s, RExC_parse - 1 - s);
2206 sv = newSVpvn("", 0);
2209 Perl_save_re_context(aTHX);
2210 rop = sv_compile_2op(sv, &sop, "re", &pad);
2211 sop->op_private |= OPpREFCOUNTED;
2212 /* re_dup will OpREFCNT_inc */
2213 OpREFCNT_set(sop, 1);
2216 n = add_data(pRExC_state, 3, "nop");
2217 RExC_rx->data->data[n] = (void*)rop;
2218 RExC_rx->data->data[n+1] = (void*)sop;
2219 RExC_rx->data->data[n+2] = (void*)pad;
2222 else { /* First pass */
2223 if (PL_reginterp_cnt < ++RExC_seen_evals
2224 && PL_curcop != &PL_compiling)
2225 /* No compiled RE interpolated, has runtime
2226 components ===> unsafe. */
2227 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2228 if (PL_tainting && PL_tainted)
2229 FAIL("Eval-group in insecure regular expression");
2232 nextchar(pRExC_state);
2234 ret = reg_node(pRExC_state, LOGICAL);
2237 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2238 /* deal with the length of this later - MJD */
2241 ret = reganode(pRExC_state, EVAL, n);
2242 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2243 Set_Node_Offset(ret, parse_start);
2246 case '(': /* (?(?{...})...) and (?(?=...)...) */
2248 if (RExC_parse[0] == '?') { /* (?(?...)) */
2249 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2250 || RExC_parse[1] == '<'
2251 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2254 ret = reg_node(pRExC_state, LOGICAL);
2257 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2261 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2263 parno = atoi(RExC_parse++);
2265 while (isDIGIT(*RExC_parse))
2267 ret = reganode(pRExC_state, GROUPP, parno);
2269 if ((c = *nextchar(pRExC_state)) != ')')
2270 vFAIL("Switch condition not recognized");
2272 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2273 br = regbranch(pRExC_state, &flags, 1);
2275 br = reganode(pRExC_state, LONGJMP, 0);
2277 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2278 c = *nextchar(pRExC_state);
2282 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2283 regbranch(pRExC_state, &flags, 1);
2284 regtail(pRExC_state, ret, lastbr);
2287 c = *nextchar(pRExC_state);
2292 vFAIL("Switch (?(condition)... contains too many branches");
2293 ender = reg_node(pRExC_state, TAIL);
2294 regtail(pRExC_state, br, ender);
2296 regtail(pRExC_state, lastbr, ender);
2297 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2300 regtail(pRExC_state, ret, ender);
2304 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2308 RExC_parse--; /* for vFAIL to print correctly */
2309 vFAIL("Sequence (? incomplete");
2313 parse_flags: /* (?i) */
2314 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2315 /* (?g), (?gc) and (?o) are useless here
2316 and must be globally applied -- japhy */
2318 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2319 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2320 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2321 if (! (wastedflags & wflagbit) ) {
2322 wastedflags |= wflagbit;
2325 "Useless (%s%c) - %suse /%c modifier",
2326 flagsp == &negflags ? "?-" : "?",
2328 flagsp == &negflags ? "don't " : "",
2334 else if (*RExC_parse == 'c') {
2335 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2336 if (! (wastedflags & wasted_c) ) {
2337 wastedflags |= wasted_gc;
2340 "Useless (%sc) - %suse /gc modifier",
2341 flagsp == &negflags ? "?-" : "?",
2342 flagsp == &negflags ? "don't " : ""
2347 else { pmflag(flagsp, *RExC_parse); }
2351 if (*RExC_parse == '-') {
2353 wastedflags = 0; /* reset so (?g-c) warns twice */
2357 RExC_flags |= posflags;
2358 RExC_flags &= ~negflags;
2359 if (*RExC_parse == ':') {
2365 if (*RExC_parse != ')') {
2367 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2369 nextchar(pRExC_state);
2377 ret = reganode(pRExC_state, OPEN, parno);
2378 Set_Node_Length(ret, 1); /* MJD */
2379 Set_Node_Offset(ret, RExC_parse); /* MJD */
2386 /* Pick up the branches, linking them together. */
2387 parse_start = RExC_parse; /* MJD */
2388 br = regbranch(pRExC_state, &flags, 1);
2389 /* branch_len = (paren != 0); */
2393 if (*RExC_parse == '|') {
2394 if (!SIZE_ONLY && RExC_extralen) {
2395 reginsert(pRExC_state, BRANCHJ, br);
2398 reginsert(pRExC_state, BRANCH, br);
2399 Set_Node_Length(br, paren != 0);
2400 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2404 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2406 else if (paren == ':') {
2407 *flagp |= flags&SIMPLE;
2409 if (open) { /* Starts with OPEN. */
2410 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2412 else if (paren != '?') /* Not Conditional */
2414 *flagp |= flags & (SPSTART | HASWIDTH);
2416 while (*RExC_parse == '|') {
2417 if (!SIZE_ONLY && RExC_extralen) {
2418 ender = reganode(pRExC_state, LONGJMP,0);
2419 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2422 RExC_extralen += 2; /* Account for LONGJMP. */
2423 nextchar(pRExC_state);
2424 br = regbranch(pRExC_state, &flags, 0);
2428 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2432 *flagp |= flags&SPSTART;
2435 if (have_branch || paren != ':') {
2436 /* Make a closing node, and hook it on the end. */
2439 ender = reg_node(pRExC_state, TAIL);
2442 ender = reganode(pRExC_state, CLOSE, parno);
2443 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2444 Set_Node_Length(ender,1); /* MJD */
2450 *flagp &= ~HASWIDTH;
2453 ender = reg_node(pRExC_state, SUCCEED);
2456 ender = reg_node(pRExC_state, END);
2459 regtail(pRExC_state, lastbr, ender);
2462 /* Hook the tails of the branches to the closing node. */
2463 for (br = ret; br != NULL; br = regnext(br)) {
2464 regoptail(pRExC_state, br, ender);
2471 static char parens[] = "=!<,>";
2473 if (paren && (p = strchr(parens, paren))) {
2474 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2475 int flag = (p - parens) > 1;
2478 node = SUSPEND, flag = 0;
2479 reginsert(pRExC_state, node,ret);
2480 Set_Node_Offset(ret, oregcomp_parse);
2481 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 2);
2483 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2487 /* Check for proper termination. */
2489 RExC_flags = oregflags;
2490 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2491 RExC_parse = oregcomp_parse;
2492 vFAIL("Unmatched (");
2495 else if (!paren && RExC_parse < RExC_end) {
2496 if (*RExC_parse == ')') {
2498 vFAIL("Unmatched )");
2501 FAIL("Junk on end of regexp"); /* "Can't happen". */
2509 - regbranch - one alternative of an | operator
2511 * Implements the concatenation operator.
2514 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2516 register regnode *ret;
2517 register regnode *chain = NULL;
2518 register regnode *latest;
2519 I32 flags = 0, c = 0;
2524 if (!SIZE_ONLY && RExC_extralen)
2525 ret = reganode(pRExC_state, BRANCHJ,0);
2527 ret = reg_node(pRExC_state, BRANCH);
2528 Set_Node_Length(ret, 1);
2532 if (!first && SIZE_ONLY)
2533 RExC_extralen += 1; /* BRANCHJ */
2535 *flagp = WORST; /* Tentatively. */
2538 nextchar(pRExC_state);
2539 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2541 latest = regpiece(pRExC_state, &flags);
2542 if (latest == NULL) {
2543 if (flags & TRYAGAIN)
2547 else if (ret == NULL)
2549 *flagp |= flags&HASWIDTH;
2550 if (chain == NULL) /* First piece. */
2551 *flagp |= flags&SPSTART;
2554 regtail(pRExC_state, chain, latest);
2559 if (chain == NULL) { /* Loop ran zero times. */
2560 chain = reg_node(pRExC_state, NOTHING);
2565 *flagp |= flags&SIMPLE;
2572 - regpiece - something followed by possible [*+?]
2574 * Note that the branching code sequences used for ? and the general cases
2575 * of * and + are somewhat optimized: they use the same NOTHING node as
2576 * both the endmarker for their branch list and the body of the last branch.
2577 * It might seem that this node could be dispensed with entirely, but the
2578 * endmarker role is not redundant.
2581 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2583 register regnode *ret;
2585 register char *next;
2587 char *origparse = RExC_parse;
2590 I32 max = REG_INFTY;
2593 ret = regatom(pRExC_state, &flags);
2595 if (flags & TRYAGAIN)
2602 if (op == '{' && regcurly(RExC_parse)) {
2603 parse_start = RExC_parse; /* MJD */
2604 next = RExC_parse + 1;
2606 while (isDIGIT(*next) || *next == ',') {
2615 if (*next == '}') { /* got one */
2619 min = atoi(RExC_parse);
2623 maxpos = RExC_parse;
2625 if (!max && *maxpos != '0')
2626 max = REG_INFTY; /* meaning "infinity" */
2627 else if (max >= REG_INFTY)
2628 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2630 nextchar(pRExC_state);
2633 if ((flags&SIMPLE)) {
2634 RExC_naughty += 2 + RExC_naughty / 2;
2635 reginsert(pRExC_state, CURLY, ret);
2636 Set_Node_Offset(ret, parse_start+1); /* MJD */
2637 Set_Node_Cur_Length(ret);
2640 regnode *w = reg_node(pRExC_state, WHILEM);
2643 regtail(pRExC_state, ret, w);
2644 if (!SIZE_ONLY && RExC_extralen) {
2645 reginsert(pRExC_state, LONGJMP,ret);
2646 reginsert(pRExC_state, NOTHING,ret);
2647 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2649 reginsert(pRExC_state, CURLYX,ret);
2651 Set_Node_Offset(ret, parse_start+1);
2652 Set_Node_Length(ret,
2653 op == '{' ? (RExC_parse - parse_start) : 1);
2655 if (!SIZE_ONLY && RExC_extralen)
2656 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2657 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2659 RExC_whilem_seen++, RExC_extralen += 3;
2660 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2668 if (max && max < min)
2669 vFAIL("Can't do {n,m} with n > m");
2671 ARG1_SET(ret, (U16)min);
2672 ARG2_SET(ret, (U16)max);
2684 #if 0 /* Now runtime fix should be reliable. */
2686 /* if this is reinstated, don't forget to put this back into perldiag:
2688 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2690 (F) The part of the regexp subject to either the * or + quantifier
2691 could match an empty string. The {#} shows in the regular
2692 expression about where the problem was discovered.
2696 if (!(flags&HASWIDTH) && op != '?')
2697 vFAIL("Regexp *+ operand could be empty");
2700 parse_start = RExC_parse;
2701 nextchar(pRExC_state);
2703 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2705 if (op == '*' && (flags&SIMPLE)) {
2706 reginsert(pRExC_state, STAR, ret);
2710 else if (op == '*') {
2714 else if (op == '+' && (flags&SIMPLE)) {
2715 reginsert(pRExC_state, PLUS, ret);
2719 else if (op == '+') {
2723 else if (op == '?') {
2728 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2730 "%.*s matches null string many times",
2731 RExC_parse - origparse,
2735 if (*RExC_parse == '?') {
2736 nextchar(pRExC_state);
2737 reginsert(pRExC_state, MINMOD, ret);
2738 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2740 if (ISMULT2(RExC_parse)) {
2742 vFAIL("Nested quantifiers");
2749 - regatom - the lowest level
2751 * Optimization: gobbles an entire sequence of ordinary characters so that
2752 * it can turn them into a single node, which is smaller to store and
2753 * faster to run. Backslashed characters are exceptions, each becoming a
2754 * separate node; the code is simpler that way and it's not worth fixing.
2756 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2758 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2760 register regnode *ret = 0;
2762 char *parse_start = 0;
2764 *flagp = WORST; /* Tentatively. */
2767 switch (*RExC_parse) {
2769 RExC_seen_zerolen++;
2770 nextchar(pRExC_state);
2771 if (RExC_flags & PMf_MULTILINE)
2772 ret = reg_node(pRExC_state, MBOL);
2773 else if (RExC_flags & PMf_SINGLELINE)
2774 ret = reg_node(pRExC_state, SBOL);
2776 ret = reg_node(pRExC_state, BOL);
2777 Set_Node_Length(ret, 1); /* MJD */
2780 nextchar(pRExC_state);
2782 RExC_seen_zerolen++;
2783 if (RExC_flags & PMf_MULTILINE)
2784 ret = reg_node(pRExC_state, MEOL);
2785 else if (RExC_flags & PMf_SINGLELINE)
2786 ret = reg_node(pRExC_state, SEOL);
2788 ret = reg_node(pRExC_state, EOL);
2789 Set_Node_Length(ret, 1); /* MJD */
2792 nextchar(pRExC_state);
2793 if (RExC_flags & PMf_SINGLELINE)
2794 ret = reg_node(pRExC_state, SANY);
2796 ret = reg_node(pRExC_state, REG_ANY);
2797 *flagp |= HASWIDTH|SIMPLE;
2799 Set_Node_Length(ret, 1); /* MJD */
2803 char *oregcomp_parse = ++RExC_parse;
2804 ret = regclass(pRExC_state);
2805 if (*RExC_parse != ']') {
2806 RExC_parse = oregcomp_parse;
2807 vFAIL("Unmatched [");
2809 nextchar(pRExC_state);
2810 *flagp |= HASWIDTH|SIMPLE;
2811 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2815 nextchar(pRExC_state);
2816 ret = reg(pRExC_state, 1, &flags);
2818 if (flags & TRYAGAIN) {
2819 if (RExC_parse == RExC_end) {
2820 /* Make parent create an empty node if needed. */
2828 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2832 if (flags & TRYAGAIN) {
2836 vFAIL("Internal urp");
2837 /* Supposed to be caught earlier. */
2840 if (!regcurly(RExC_parse)) {
2849 vFAIL("Quantifier follows nothing");
2852 switch (*++RExC_parse) {
2854 RExC_seen_zerolen++;
2855 ret = reg_node(pRExC_state, SBOL);
2857 nextchar(pRExC_state);
2858 Set_Node_Length(ret, 2); /* MJD */
2861 ret = reg_node(pRExC_state, GPOS);
2862 RExC_seen |= REG_SEEN_GPOS;
2864 nextchar(pRExC_state);
2865 Set_Node_Length(ret, 2); /* MJD */
2868 ret = reg_node(pRExC_state, SEOL);
2870 RExC_seen_zerolen++; /* Do not optimize RE away */
2871 nextchar(pRExC_state);
2874 ret = reg_node(pRExC_state, EOS);
2876 RExC_seen_zerolen++; /* Do not optimize RE away */
2877 nextchar(pRExC_state);
2878 Set_Node_Length(ret, 2); /* MJD */
2881 ret = reg_node(pRExC_state, CANY);
2882 RExC_seen |= REG_SEEN_CANY;
2883 *flagp |= HASWIDTH|SIMPLE;
2884 nextchar(pRExC_state);
2885 Set_Node_Length(ret, 2); /* MJD */
2888 ret = reg_node(pRExC_state, CLUMP);
2890 nextchar(pRExC_state);
2891 Set_Node_Length(ret, 2); /* MJD */
2894 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
2895 *flagp |= HASWIDTH|SIMPLE;
2896 nextchar(pRExC_state);
2897 Set_Node_Length(ret, 2); /* MJD */
2900 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
2901 *flagp |= HASWIDTH|SIMPLE;
2902 nextchar(pRExC_state);
2903 Set_Node_Length(ret, 2); /* MJD */
2906 RExC_seen_zerolen++;
2907 RExC_seen |= REG_SEEN_LOOKBEHIND;
2908 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
2910 nextchar(pRExC_state);
2911 Set_Node_Length(ret, 2); /* MJD */
2914 RExC_seen_zerolen++;
2915 RExC_seen |= REG_SEEN_LOOKBEHIND;
2916 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
2918 nextchar(pRExC_state);
2919 Set_Node_Length(ret, 2); /* MJD */
2922 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
2923 *flagp |= HASWIDTH|SIMPLE;
2924 nextchar(pRExC_state);
2925 Set_Node_Length(ret, 2); /* MJD */
2928 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
2929 *flagp |= HASWIDTH|SIMPLE;
2930 nextchar(pRExC_state);
2931 Set_Node_Length(ret, 2); /* MJD */
2934 ret = reg_node(pRExC_state, DIGIT);
2935 *flagp |= HASWIDTH|SIMPLE;
2936 nextchar(pRExC_state);
2937 Set_Node_Length(ret, 2); /* MJD */
2940 ret = reg_node(pRExC_state, NDIGIT);
2941 *flagp |= HASWIDTH|SIMPLE;
2942 nextchar(pRExC_state);
2943 Set_Node_Length(ret, 2); /* MJD */
2948 char* oldregxend = RExC_end;
2949 char* parse_start = RExC_parse - 2;
2951 if (RExC_parse[1] == '{') {
2952 /* a lovely hack--pretend we saw [\pX] instead */
2953 RExC_end = strchr(RExC_parse, '}');
2955 U8 c = (U8)*RExC_parse;
2957 RExC_end = oldregxend;
2958 vFAIL2("Missing right brace on \\%c{}", c);
2963 RExC_end = RExC_parse + 2;
2964 if (RExC_end > oldregxend)
2965 RExC_end = oldregxend;
2969 ret = regclass(pRExC_state);
2971 RExC_end = oldregxend;
2974 Set_Node_Offset(ret, parse_start + 2);
2975 Set_Node_Cur_Length(ret);
2976 nextchar(pRExC_state);
2977 *flagp |= HASWIDTH|SIMPLE;
2990 case '1': case '2': case '3': case '4':
2991 case '5': case '6': case '7': case '8': case '9':
2993 I32 num = atoi(RExC_parse);
2995 if (num > 9 && num >= RExC_npar)
2998 char * parse_start = RExC_parse - 1; /* MJD */
2999 while (isDIGIT(*RExC_parse))
3002 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3003 vFAIL("Reference to nonexistent group");
3005 ret = reganode(pRExC_state,
3006 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3010 /* override incorrect value set in reganode MJD */
3011 Set_Node_Offset(ret, parse_start+1);
3012 Set_Node_Cur_Length(ret); /* MJD */
3014 nextchar(pRExC_state);
3019 if (RExC_parse >= RExC_end)
3020 FAIL("Trailing \\");
3023 /* Do not generate `unrecognized' warnings here, we fall
3024 back into the quick-grab loop below */
3030 if (RExC_flags & PMf_EXTENDED) {
3031 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3032 if (RExC_parse < RExC_end)
3038 register STRLEN len;
3044 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3046 parse_start = RExC_parse - 1;
3052 ret = reg_node(pRExC_state,
3053 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3055 for (len = 0, p = RExC_parse - 1;
3056 len < 127 && p < RExC_end;
3061 if (RExC_flags & PMf_EXTENDED)
3062 p = regwhite(p, RExC_end);
3109 ender = ASCII_TO_NATIVE('\033');
3113 ender = ASCII_TO_NATIVE('\007');
3118 char* e = strchr(p, '}');
3122 vFAIL("Missing right brace on \\x{}");
3125 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3126 | PERL_SCAN_DISALLOW_PREFIX;
3128 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3131 /* numlen is generous */
3132 if (numlen + len >= 127) {
3140 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3142 ender = grok_hex(p, &numlen, &flags, NULL);
3148 ender = UCHARAT(p++);
3149 ender = toCTRL(ender);
3151 case '0': case '1': case '2': case '3':case '4':
3152 case '5': case '6': case '7': case '8':case '9':
3154 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3157 ender = grok_oct(p, &numlen, &flags, NULL);
3167 FAIL("Trailing \\");
3170 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3171 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3172 goto normal_default;
3177 if (UTF8_IS_START(*p) && UTF) {
3178 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3186 if (RExC_flags & PMf_EXTENDED)
3187 p = regwhite(p, RExC_end);
3189 /* Prime the casefolded buffer. */
3190 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3192 if (ISMULT2(p)) { /* Back off on ?+*. */
3199 /* Emit all the Unicode characters. */
3200 for (foldbuf = tmpbuf;
3202 foldlen -= numlen) {
3203 ender = utf8_to_uvchr(foldbuf, &numlen);
3205 reguni(pRExC_state, ender, s, &unilen);
3208 /* In EBCDIC the numlen
3209 * and unilen can differ. */
3211 if (numlen >= foldlen)
3215 break; /* "Can't happen." */
3219 reguni(pRExC_state, ender, s, &unilen);
3228 REGC((char)ender, s++);
3236 /* Emit all the Unicode characters. */
3237 for (foldbuf = tmpbuf;
3239 foldlen -= numlen) {
3240 ender = utf8_to_uvchr(foldbuf, &numlen);
3242 reguni(pRExC_state, ender, s, &unilen);
3245 /* In EBCDIC the numlen
3246 * and unilen can differ. */
3248 if (numlen >= foldlen)
3256 reguni(pRExC_state, ender, s, &unilen);
3265 REGC((char)ender, s++);
3269 Set_Node_Cur_Length(ret); /* MJD */
3270 nextchar(pRExC_state);
3272 /* len is STRLEN which is unsigned, need to copy to signed */
3275 vFAIL("Internal disaster");
3284 RExC_size += STR_SZ(len);
3286 RExC_emit += STR_SZ(len);
3291 /* If the encoding pragma is in effect recode the text of
3292 * any EXACT-kind nodes. */
3293 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3294 STRLEN oldlen = STR_LEN(ret);
3295 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3299 if (sv_utf8_downgrade(sv, TRUE)) {
3300 char *s = sv_recode_to_utf8(sv, PL_encoding);
3301 STRLEN newlen = SvCUR(sv);
3306 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3307 (int)oldlen, STRING(ret),
3309 Copy(s, STRING(ret), newlen, char);
3310 STR_LEN(ret) += newlen - oldlen;
3311 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3313 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3321 S_regwhite(pTHX_ char *p, char *e)
3326 else if (*p == '#') {
3329 } while (p < e && *p != '\n');
3337 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3338 Character classes ([:foo:]) can also be negated ([:^foo:]).
3339 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3340 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3341 but trigger failures because they are currently unimplemented. */
3343 #define POSIXCC_DONE(c) ((c) == ':')
3344 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3345 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3348 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3351 I32 namedclass = OOB_NAMEDCLASS;
3353 if (value == '[' && RExC_parse + 1 < RExC_end &&
3354 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3355 POSIXCC(UCHARAT(RExC_parse))) {
3356 char c = UCHARAT(RExC_parse);
3357 char* s = RExC_parse++;
3359 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3361 if (RExC_parse == RExC_end)
3362 /* Grandfather lone [:, [=, [. */
3365 char* t = RExC_parse++; /* skip over the c */
3367 if (UCHARAT(RExC_parse) == ']') {
3368 RExC_parse++; /* skip over the ending ] */
3371 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3372 I32 skip = 5; /* the most common skip */
3376 if (strnEQ(posixcc, "alnum", 5))
3378 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3379 else if (strnEQ(posixcc, "alpha", 5))
3381 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3382 else if (strnEQ(posixcc, "ascii", 5))
3384 complement ? ANYOF_NASCII : ANYOF_ASCII;
3387 if (strnEQ(posixcc, "blank", 5))
3389 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3392 if (strnEQ(posixcc, "cntrl", 5))
3394 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3397 if (strnEQ(posixcc, "digit", 5))
3399 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3402 if (strnEQ(posixcc, "graph", 5))
3404 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3407 if (strnEQ(posixcc, "lower", 5))
3409 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3412 if (strnEQ(posixcc, "print", 5))
3414 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3415 else if (strnEQ(posixcc, "punct", 5))
3417 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3420 if (strnEQ(posixcc, "space", 5))
3422 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3425 if (strnEQ(posixcc, "upper", 5))
3427 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3429 case 'w': /* this is not POSIX, this is the Perl \w */
3430 if (strnEQ(posixcc, "word", 4)) {
3432 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3437 if (strnEQ(posixcc, "xdigit", 6)) {
3439 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3444 if (namedclass == OOB_NAMEDCLASS ||
3445 posixcc[skip] != ':' ||
3446 posixcc[skip+1] != ']')
3448 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3451 } else if (!SIZE_ONLY) {
3452 /* [[=foo=]] and [[.foo.]] are still future. */
3454 /* adjust RExC_parse so the warning shows after
3456 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3458 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3461 /* Maternal grandfather:
3462 * "[:" ending in ":" but not in ":]" */
3472 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3474 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3475 char *s = RExC_parse;
3478 while(*s && isALNUM(*s))
3480 if (*s && c == *s && s[1] == ']') {
3481 if (ckWARN(WARN_REGEXP))
3483 "POSIX syntax [%c %c] belongs inside character classes",
3486 /* [[=foo=]] and [[.foo.]] are still future. */
3487 if (POSIXCC_NOTYET(c)) {
3488 /* adjust RExC_parse so the error shows after
3490 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3492 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3499 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3502 register UV nextvalue;
3503 register IV prevvalue = OOB_UNICODE;
3504 register IV range = 0;
3505 register regnode *ret;
3508 char *rangebegin = 0;
3509 bool need_class = 0;
3510 SV *listsv = Nullsv;
3513 bool optimize_invert = TRUE;
3514 AV* unicode_alternate = 0;
3516 UV literal_endpoint = 0;
3519 ret = reganode(pRExC_state, ANYOF, 0);
3522 ANYOF_FLAGS(ret) = 0;
3524 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3528 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3532 RExC_size += ANYOF_SKIP;
3534 RExC_emit += ANYOF_SKIP;
3536 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3538 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3539 ANYOF_BITMAP_ZERO(ret);
3540 listsv = newSVpvn("# comment\n", 10);
3543 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3545 if (!SIZE_ONLY && POSIXCC(nextvalue))
3546 checkposixcc(pRExC_state);
3548 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3549 if (UCHARAT(RExC_parse) == ']')
3552 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3556 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3559 rangebegin = RExC_parse;
3561 value = utf8n_to_uvchr((U8*)RExC_parse,
3562 RExC_end - RExC_parse,
3564 RExC_parse += numlen;
3567 value = UCHARAT(RExC_parse++);
3568 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3569 if (value == '[' && POSIXCC(nextvalue))
3570 namedclass = regpposixcc(pRExC_state, value);
3571 else if (value == '\\') {
3573 value = utf8n_to_uvchr((U8*)RExC_parse,
3574 RExC_end - RExC_parse,
3576 RExC_parse += numlen;
3579 value = UCHARAT(RExC_parse++);
3580 /* Some compilers cannot handle switching on 64-bit integer
3581 * values, therefore value cannot be an UV. Yes, this will
3582 * be a problem later if we want switch on Unicode.
3583 * A similar issue a little bit later when switching on
3584 * namedclass. --jhi */
3585 switch ((I32)value) {
3586 case 'w': namedclass = ANYOF_ALNUM; break;
3587 case 'W': namedclass = ANYOF_NALNUM; break;
3588 case 's': namedclass = ANYOF_SPACE; break;
3589 case 'S': namedclass = ANYOF_NSPACE; break;
3590 case 'd': namedclass = ANYOF_DIGIT; break;
3591 case 'D': namedclass = ANYOF_NDIGIT; break;
3594 if (RExC_parse >= RExC_end)
3595 vFAIL2("Empty \\%c{}", (U8)value);
3596 if (*RExC_parse == '{') {
3598 e = strchr(RExC_parse++, '}');
3600 vFAIL2("Missing right brace on \\%c{}", c);
3601 while (isSPACE(UCHARAT(RExC_parse)))
3603 if (e == RExC_parse)
3604 vFAIL2("Empty \\%c{}", c);
3606 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3614 if (UCHARAT(RExC_parse) == '^') {
3617 value = value == 'p' ? 'P' : 'p'; /* toggle */
3618 while (isSPACE(UCHARAT(RExC_parse))) {
3624 Perl_sv_catpvf(aTHX_ listsv,
3625 "+utf8::%.*s\n", (int)n, RExC_parse);
3627 Perl_sv_catpvf(aTHX_ listsv,
3628 "!utf8::%.*s\n", (int)n, RExC_parse);
3631 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3633 case 'n': value = '\n'; break;
3634 case 'r': value = '\r'; break;
3635 case 't': value = '\t'; break;
3636 case 'f': value = '\f'; break;
3637 case 'b': value = '\b'; break;
3638 case 'e': value = ASCII_TO_NATIVE('\033');break;
3639 case 'a': value = ASCII_TO_NATIVE('\007');break;
3641 if (*RExC_parse == '{') {
3642 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3643 | PERL_SCAN_DISALLOW_PREFIX;
3644 e = strchr(RExC_parse++, '}');
3646 vFAIL("Missing right brace on \\x{}");
3648 numlen = e - RExC_parse;
3649 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3653 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3655 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3656 RExC_parse += numlen;
3660 value = UCHARAT(RExC_parse++);
3661 value = toCTRL(value);
3663 case '0': case '1': case '2': case '3': case '4':
3664 case '5': case '6': case '7': case '8': case '9':
3668 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3669 RExC_parse += numlen;
3673 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3675 "Unrecognized escape \\%c in character class passed through",
3679 } /* end of \blah */
3685 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3687 if (!SIZE_ONLY && !need_class)
3688 ANYOF_CLASS_ZERO(ret);
3692 /* a bad range like a-\d, a-[:digit:] ? */
3695 if (ckWARN(WARN_REGEXP))
3697 "False [] range \"%*.*s\"",
3698 RExC_parse - rangebegin,
3699 RExC_parse - rangebegin,
3701 if (prevvalue < 256) {
3702 ANYOF_BITMAP_SET(ret, prevvalue);
3703 ANYOF_BITMAP_SET(ret, '-');
3706 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3707 Perl_sv_catpvf(aTHX_ listsv,
3708 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3712 range = 0; /* this was not a true range */
3716 if (namedclass > OOB_NAMEDCLASS)
3717 optimize_invert = FALSE;
3718 /* Possible truncation here but in some 64-bit environments
3719 * the compiler gets heartburn about switch on 64-bit values.
3720 * A similar issue a little earlier when switching on value.
3722 switch ((I32)namedclass) {
3725 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3727 for (value = 0; value < 256; value++)
3729 ANYOF_BITMAP_SET(ret, value);
3731 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
3735 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3737 for (value = 0; value < 256; value++)
3738 if (!isALNUM(value))
3739 ANYOF_BITMAP_SET(ret, value);
3741 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3745 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3747 for (value = 0; value < 256; value++)
3748 if (isALNUMC(value))
3749 ANYOF_BITMAP_SET(ret, value);
3751 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3755 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3757 for (value = 0; value < 256; value++)
3758 if (!isALNUMC(value))
3759 ANYOF_BITMAP_SET(ret, value);
3761 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3765 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3767 for (value = 0; value < 256; value++)
3769 ANYOF_BITMAP_SET(ret, value);
3771 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3775 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3777 for (value = 0; value < 256; value++)
3778 if (!isALPHA(value))
3779 ANYOF_BITMAP_SET(ret, value);
3781 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3785 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3788 for (value = 0; value < 128; value++)
3789 ANYOF_BITMAP_SET(ret, value);
3791 for (value = 0; value < 256; value++) {
3793 ANYOF_BITMAP_SET(ret, value);
3797 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3801 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3804 for (value = 128; value < 256; value++)
3805 ANYOF_BITMAP_SET(ret, value);
3807 for (value = 0; value < 256; value++) {
3808 if (!isASCII(value))
3809 ANYOF_BITMAP_SET(ret, value);
3813 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3817 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3819 for (value = 0; value < 256; value++)
3821 ANYOF_BITMAP_SET(ret, value);
3823 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3827 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3829 for (value = 0; value < 256; value++)
3830 if (!isBLANK(value))
3831 ANYOF_BITMAP_SET(ret, value);
3833 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3837 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3839 for (value = 0; value < 256; value++)
3841 ANYOF_BITMAP_SET(ret, value);
3843 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3847 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3849 for (value = 0; value < 256; value++)
3850 if (!isCNTRL(value))
3851 ANYOF_BITMAP_SET(ret, value);
3853 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3857 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3859 /* consecutive digits assumed */
3860 for (value = '0'; value <= '9'; value++)
3861 ANYOF_BITMAP_SET(ret, value);
3863 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3867 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3869 /* consecutive digits assumed */
3870 for (value = 0; value < '0'; value++)
3871 ANYOF_BITMAP_SET(ret, value);
3872 for (value = '9' + 1; value < 256; value++)
3873 ANYOF_BITMAP_SET(ret, value);
3875 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3879 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3881 for (value = 0; value < 256; value++)
3883 ANYOF_BITMAP_SET(ret, value);
3885 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3889 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3891 for (value = 0; value < 256; value++)
3892 if (!isGRAPH(value))
3893 ANYOF_BITMAP_SET(ret, value);
3895 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3899 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3901 for (value = 0; value < 256; value++)
3903 ANYOF_BITMAP_SET(ret, value);
3905 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3909 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3911 for (value = 0; value < 256; value++)
3912 if (!isLOWER(value))
3913 ANYOF_BITMAP_SET(ret, value);
3915 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3919 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3921 for (value = 0; value < 256; value++)
3923 ANYOF_BITMAP_SET(ret, value);
3925 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3929 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3931 for (value = 0; value < 256; value++)
3932 if (!isPRINT(value))
3933 ANYOF_BITMAP_SET(ret, value);
3935 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3939 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3941 for (value = 0; value < 256; value++)
3942 if (isPSXSPC(value))
3943 ANYOF_BITMAP_SET(ret, value);
3945 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3949 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3951 for (value = 0; value < 256; value++)
3952 if (!isPSXSPC(value))
3953 ANYOF_BITMAP_SET(ret, value);
3955 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3959 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3961 for (value = 0; value < 256; value++)
3963 ANYOF_BITMAP_SET(ret, value);
3965 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3969 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3971 for (value = 0; value < 256; value++)
3972 if (!isPUNCT(value))
3973 ANYOF_BITMAP_SET(ret, value);
3975 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3979 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3981 for (value = 0; value < 256; value++)
3983 ANYOF_BITMAP_SET(ret, value);
3985 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3989 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3991 for (value = 0; value < 256; value++)
3992 if (!isSPACE(value))
3993 ANYOF_BITMAP_SET(ret, value);
3995 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
3999 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4001 for (value = 0; value < 256; value++)
4003 ANYOF_BITMAP_SET(ret, value);
4005 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4009 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4011 for (value = 0; value < 256; value++)
4012 if (!isUPPER(value))
4013 ANYOF_BITMAP_SET(ret, value);
4015 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4019 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4021 for (value = 0; value < 256; value++)
4022 if (isXDIGIT(value))
4023 ANYOF_BITMAP_SET(ret, value);
4025 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4029 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4031 for (value = 0; value < 256; value++)
4032 if (!isXDIGIT(value))
4033 ANYOF_BITMAP_SET(ret, value);
4035 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4038 vFAIL("Invalid [::] class");
4042 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4045 } /* end of namedclass \blah */
4048 if (prevvalue > (IV)value) /* b-a */ {
4049 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4050 RExC_parse - rangebegin,
4051 RExC_parse - rangebegin,
4053 range = 0; /* not a valid range */
4057 prevvalue = value; /* save the beginning of the range */
4058 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4059 RExC_parse[1] != ']') {
4062 /* a bad range like \w-, [:word:]- ? */
4063 if (namedclass > OOB_NAMEDCLASS) {
4064 if (ckWARN(WARN_REGEXP))
4066 "False [] range \"%*.*s\"",
4067 RExC_parse - rangebegin,
4068 RExC_parse - rangebegin,
4071 ANYOF_BITMAP_SET(ret, '-');
4073 range = 1; /* yeah, it's a range! */
4074 continue; /* but do it the next time */
4078 /* now is the next time */
4082 if (prevvalue < 256) {
4083 IV ceilvalue = value < 256 ? value : 255;
4086 /* In EBCDIC [\x89-\x91] should include
4087 * the \x8e but [i-j] should not. */
4088 if (literal_endpoint == 2 &&
4089 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4090 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4092 if (isLOWER(prevvalue)) {
4093 for (i = prevvalue; i <= ceilvalue; i++)
4095 ANYOF_BITMAP_SET(ret, i);
4097 for (i = prevvalue; i <= ceilvalue; i++)
4099 ANYOF_BITMAP_SET(ret, i);
4104 for (i = prevvalue; i <= ceilvalue; i++)
4105 ANYOF_BITMAP_SET(ret, i);
4107 if (value > 255 || UTF) {
4108 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4109 UV natvalue = NATIVE_TO_UNI(value);
4111 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4112 if (prevnatvalue < natvalue) { /* what about > ? */
4113 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4114 prevnatvalue, natvalue);
4116 else if (prevnatvalue == natvalue) {
4117 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4119 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4121 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4123 /* If folding and foldable and a single
4124 * character, insert also the folded version
4125 * to the charclass. */
4127 if (foldlen == (STRLEN)UNISKIP(f))
4128 Perl_sv_catpvf(aTHX_ listsv,
4131 /* Any multicharacter foldings
4132 * require the following transform:
4133 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4134 * where E folds into "pq" and F folds
4135 * into "rst", all other characters
4136 * fold to single characters. We save
4137 * away these multicharacter foldings,
4138 * to be later saved as part of the
4139 * additional "s" data. */
4142 if (!unicode_alternate)
4143 unicode_alternate = newAV();
4144 sv = newSVpvn((char*)foldbuf, foldlen);
4146 av_push(unicode_alternate, sv);
4150 /* If folding and the value is one of the Greek
4151 * sigmas insert a few more sigmas to make the
4152 * folding rules of the sigmas to work right.
4153 * Note that not all the possible combinations
4154 * are handled here: some of them are handled
4155 * by the standard folding rules, and some of
4156 * them (literal or EXACTF cases) are handled
4157 * during runtime in regexec.c:S_find_byclass(). */
4158 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4159 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4160 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4161 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4162 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4164 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4165 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4166 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4171 literal_endpoint = 0;
4175 range = 0; /* this range (if it was one) is done now */
4179 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4181 RExC_size += ANYOF_CLASS_ADD_SKIP;
4183 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4186 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4188 /* If the only flag is folding (plus possibly inversion). */
4189 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4191 for (value = 0; value < 256; ++value) {
4192 if (ANYOF_BITMAP_TEST(ret, value)) {
4193 UV fold = PL_fold[value];
4196 ANYOF_BITMAP_SET(ret, fold);
4199 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4202 /* optimize inverted simple patterns (e.g. [^a-z]) */
4203 if (!SIZE_ONLY && optimize_invert &&
4204 /* If the only flag is inversion. */
4205 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4206 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4207 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4208 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4215 /* The 0th element stores the character class description
4216 * in its textual form: used later (regexec.c:Perl_regclass_swash())
4217 * to initialize the appropriate swash (which gets stored in
4218 * the 1st element), and also useful for dumping the regnode.
4219 * The 2nd element stores the multicharacter foldings,
4220 * used later (regexec.c:S_reginclass()). */
4221 av_store(av, 0, listsv);
4222 av_store(av, 1, NULL);
4223 av_store(av, 2, (SV*)unicode_alternate);
4224 rv = newRV_noinc((SV*)av);
4225 n = add_data(pRExC_state, 1, "s");
4226 RExC_rx->data->data[n] = (void*)rv;
4234 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4236 char* retval = RExC_parse++;
4239 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4240 RExC_parse[2] == '#') {
4241 while (*RExC_parse && *RExC_parse != ')')
4246 if (RExC_flags & PMf_EXTENDED) {
4247 if (isSPACE(*RExC_parse)) {
4251 else if (*RExC_parse == '#') {
4252 while (*RExC_parse && *RExC_parse != '\n')
4263 - reg_node - emit a node
4265 STATIC regnode * /* Location. */
4266 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4268 register regnode *ret;
4269 register regnode *ptr;
4273 SIZE_ALIGN(RExC_size);
4278 NODE_ALIGN_FILL(ret);
4280 FILL_ADVANCE_NODE(ptr, op);
4281 if (RExC_offsets) { /* MJD */
4282 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4283 "reg_node", __LINE__,
4285 RExC_emit - RExC_emit_start > RExC_offsets[0]
4286 ? "Overwriting end of array!\n" : "OK",
4287 RExC_emit - RExC_emit_start,
4288 RExC_parse - RExC_start,
4290 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4299 - reganode - emit a node with an argument
4301 STATIC regnode * /* Location. */
4302 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4304 register regnode *ret;
4305 register regnode *ptr;
4309 SIZE_ALIGN(RExC_size);
4314 NODE_ALIGN_FILL(ret);
4316 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4317 if (RExC_offsets) { /* MJD */
4318 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4322 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4323 "Overwriting end of array!\n" : "OK",
4324 RExC_emit - RExC_emit_start,
4325 RExC_parse - RExC_start,
4327 Set_Cur_Node_Offset;
4336 - reguni - emit (if appropriate) a Unicode character
4339 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4341 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4345 - reginsert - insert an operator in front of already-emitted operand
4347 * Means relocating the operand.
4350 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4352 register regnode *src;
4353 register regnode *dst;
4354 register regnode *place;
4355 register int offset = regarglen[(U8)op];
4357 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4360 RExC_size += NODE_STEP_REGNODE + offset;
4365 RExC_emit += NODE_STEP_REGNODE + offset;
4367 while (src > opnd) {
4368 StructCopy(--src, --dst, regnode);
4369 if (RExC_offsets) { /* MJD 20010112 */
4370 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4374 dst - RExC_emit_start > RExC_offsets[0]
4375 ? "Overwriting end of array!\n" : "OK",
4376 src - RExC_emit_start,
4377 dst - RExC_emit_start,
4379 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4380 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4385 place = opnd; /* Op node, where operand used to be. */
4386 if (RExC_offsets) { /* MJD */
4387 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4391 place - RExC_emit_start > RExC_offsets[0]
4392 ? "Overwriting end of array!\n" : "OK",
4393 place - RExC_emit_start,
4394 RExC_parse - RExC_start,
4396 Set_Node_Offset(place, RExC_parse);
4398 src = NEXTOPER(place);
4399 FILL_ADVANCE_NODE(place, op);
4400 Zero(src, offset, regnode);
4404 - regtail - set the next-pointer at the end of a node chain of p to val.
4407 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4409 register regnode *scan;
4410 register regnode *temp;
4415 /* Find last node. */
4418 temp = regnext(scan);
4424 if (reg_off_by_arg[OP(scan)]) {
4425 ARG_SET(scan, val - scan);
4428 NEXT_OFF(scan) = val - scan;
4433 - regoptail - regtail on operand of first argument; nop if operandless
4436 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4438 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4439 if (p == NULL || SIZE_ONLY)
4441 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4442 regtail(pRExC_state, NEXTOPER(p), val);
4444 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4445 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4452 - regcurly - a little FSA that accepts {\d+,?\d*}
4455 S_regcurly(pTHX_ register char *s)
4476 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4478 register U8 op = EXACT; /* Arbitrary non-END op. */
4479 register regnode *next;
4481 while (op != END && (!last || node < last)) {
4482 /* While that wasn't END last time... */
4488 next = regnext(node);
4490 if (OP(node) == OPTIMIZED)
4493 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4494 (int)(2*l + 1), "", SvPVX(sv));
4495 if (next == NULL) /* Next ptr. */
4496 PerlIO_printf(Perl_debug_log, "(0)");
4498 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4499 (void)PerlIO_putc(Perl_debug_log, '\n');
4501 if (PL_regkind[(U8)op] == BRANCHJ) {
4502 register regnode *nnode = (OP(next) == LONGJMP
4505 if (last && nnode > last)
4507 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4509 else if (PL_regkind[(U8)op] == BRANCH) {
4510 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4512 else if ( op == CURLY) { /* `next' might be very big: optimizer */
4513 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4514 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4516 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4517 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4520 else if ( op == PLUS || op == STAR) {
4521 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4523 else if (op == ANYOF) {
4524 /* arglen 1 + class block */
4525 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4526 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4527 node = NEXTOPER(node);
4529 else if (PL_regkind[(U8)op] == EXACT) {
4530 /* Literal string, where present. */
4531 node += NODE_SZ_STR(node) - 1;
4532 node = NEXTOPER(node);
4535 node = NEXTOPER(node);
4536 node += regarglen[(U8)op];
4538 if (op == CURLYX || op == OPEN)
4540 else if (op == WHILEM)
4546 #endif /* DEBUGGING */
4549 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4552 Perl_regdump(pTHX_ regexp *r)
4555 SV *sv = sv_newmortal();
4557 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4559 /* Header fields of interest. */
4560 if (r->anchored_substr)
4561 PerlIO_printf(Perl_debug_log,
4562 "anchored `%s%.*s%s'%s at %"IVdf" ",
4564 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4565 SvPVX(r->anchored_substr),
4567 SvTAIL(r->anchored_substr) ? "$" : "",
4568 (IV)r->anchored_offset);
4569 else if (r->anchored_utf8)
4570 PerlIO_printf(Perl_debug_log,
4571 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4573 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4574 SvPVX(r->anchored_utf8),
4576 SvTAIL(r->anchored_utf8) ? "$" : "",
4577 (IV)r->anchored_offset);
4578 if (r->float_substr)
4579 PerlIO_printf(Perl_debug_log,
4580 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4582 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4583 SvPVX(r->float_substr),
4585 SvTAIL(r->float_substr) ? "$" : "",
4586 (IV)r->float_min_offset, (UV)r->float_max_offset);
4587 else if (r->float_utf8)
4588 PerlIO_printf(Perl_debug_log,
4589 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4591 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4592 SvPVX(r->float_utf8),
4594 SvTAIL(r->float_utf8) ? "$" : "",
4595 (IV)r->float_min_offset, (UV)r->float_max_offset);
4596 if (r->check_substr || r->check_utf8)
4597 PerlIO_printf(Perl_debug_log,
4598 r->check_substr == r->float_substr
4599 && r->check_utf8 == r->float_utf8
4600 ? "(checking floating" : "(checking anchored");
4601 if (r->reganch & ROPT_NOSCAN)
4602 PerlIO_printf(Perl_debug_log, " noscan");
4603 if (r->reganch & ROPT_CHECK_ALL)
4604 PerlIO_printf(Perl_debug_log, " isall");
4605 if (r->check_substr || r->check_utf8)
4606 PerlIO_printf(Perl_debug_log, ") ");
4608 if (r->regstclass) {
4609 regprop(sv, r->regstclass);
4610 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4612 if (r->reganch & ROPT_ANCH) {
4613 PerlIO_printf(Perl_debug_log, "anchored");
4614 if (r->reganch & ROPT_ANCH_BOL)
4615 PerlIO_printf(Perl_debug_log, "(BOL)");
4616 if (r->reganch & ROPT_ANCH_MBOL)
4617 PerlIO_printf(Perl_debug_log, "(MBOL)");
4618 if (r->reganch & ROPT_ANCH_SBOL)
4619 PerlIO_printf(Perl_debug_log, "(SBOL)");
4620 if (r->reganch & ROPT_ANCH_GPOS)
4621 PerlIO_printf(Perl_debug_log, "(GPOS)");
4622 PerlIO_putc(Perl_debug_log, ' ');
4624 if (r->reganch & ROPT_GPOS_SEEN)
4625 PerlIO_printf(Perl_debug_log, "GPOS ");
4626 if (r->reganch & ROPT_SKIP)
4627 PerlIO_printf(Perl_debug_log, "plus ");
4628 if (r->reganch & ROPT_IMPLICIT)
4629 PerlIO_printf(Perl_debug_log, "implicit ");
4630 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4631 if (r->reganch & ROPT_EVAL_SEEN)
4632 PerlIO_printf(Perl_debug_log, "with eval ");
4633 PerlIO_printf(Perl_debug_log, "\n");
4636 U32 len = r->offsets[0];
4637 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4638 for (i = 1; i <= len; i++)
4639 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4640 (UV)r->offsets[i*2-1],
4641 (UV)r->offsets[i*2]);
4642 PerlIO_printf(Perl_debug_log, "\n");
4644 #endif /* DEBUGGING */
4650 S_put_byte(pTHX_ SV *sv, int c)
4652 if (isCNTRL(c) || c == 255 || !isPRINT(c))
4653 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4654 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4655 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4657 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4660 #endif /* DEBUGGING */
4663 - regprop - printable representation of opcode
4666 Perl_regprop(pTHX_ SV *sv, regnode *o)
4671 sv_setpvn(sv, "", 0);
4672 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4673 /* It would be nice to FAIL() here, but this may be called from
4674 regexec.c, and it would be hard to supply pRExC_state. */
4675 Perl_croak(aTHX_ "Corrupted regexp opcode");
4676 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4678 k = PL_regkind[(U8)OP(o)];
4681 SV *dsv = sv_2mortal(newSVpvn("", 0));
4682 /* Using is_utf8_string() is a crude hack but it may
4683 * be the best for now since we have no flag "this EXACTish
4684 * node was UTF-8" --jhi */
4685 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4687 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4688 UNI_DISPLAY_REGEX) :
4693 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4698 else if (k == CURLY) {
4699 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4700 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4701 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4703 else if (k == WHILEM && o->flags) /* Ordinal/of */
4704 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4705 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4706 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4707 else if (k == LOGICAL)
4708 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4709 else if (k == ANYOF) {
4710 int i, rangestart = -1;
4711 U8 flags = ANYOF_FLAGS(o);
4712 const char * const anyofs[] = { /* Should be syncronized with
4713 * ANYOF_ #xdefines in regcomp.h */
4746 if (flags & ANYOF_LOCALE)
4747 sv_catpv(sv, "{loc}");
4748 if (flags & ANYOF_FOLD)
4749 sv_catpv(sv, "{i}");
4750 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4751 if (flags & ANYOF_INVERT)
4753 for (i = 0; i <= 256; i++) {
4754 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4755 if (rangestart == -1)
4757 } else if (rangestart != -1) {
4758 if (i <= rangestart + 3)
4759 for (; rangestart < i; rangestart++)
4760 put_byte(sv, rangestart);
4762 put_byte(sv, rangestart);
4764 put_byte(sv, i - 1);
4770 if (o->flags & ANYOF_CLASS)
4771 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4772 if (ANYOF_CLASS_TEST(o,i))
4773 sv_catpv(sv, anyofs[i]);
4775 if (flags & ANYOF_UNICODE)
4776 sv_catpv(sv, "{unicode}");
4777 else if (flags & ANYOF_UNICODE_ALL)
4778 sv_catpv(sv, "{unicode_all}");
4782 SV *sw = regclass_swash(o, FALSE, &lv, 0);
4786 U8 s[UTF8_MAXLEN+1];
4788 for (i = 0; i <= 256; i++) { /* just the first 256 */
4789 U8 *e = uvchr_to_utf8(s, i);
4791 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4792 if (rangestart == -1)
4794 } else if (rangestart != -1) {
4797 if (i <= rangestart + 3)
4798 for (; rangestart < i; rangestart++) {
4799 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4803 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4806 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4813 sv_catpv(sv, "..."); /* et cetera */
4817 char *s = savepv(SvPVX(lv));
4820 while(*s && *s != '\n') s++;
4841 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4843 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4844 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4845 #endif /* DEBUGGING */
4849 Perl_re_intuit_string(pTHX_ regexp *prog)
4850 { /* Assume that RE_INTUIT is set */
4853 char *s = SvPV(prog->check_substr
4854 ? prog->check_substr : prog->check_utf8, n_a);
4856 if (!PL_colorset) reginitcolors();
4857 PerlIO_printf(Perl_debug_log,
4858 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4860 prog->check_substr ? "" : "utf8 ",
4861 PL_colors[5],PL_colors[0],
4864 (strlen(s) > 60 ? "..." : ""));
4867 return prog->check_substr ? prog->check_substr : prog->check_utf8;
4871 Perl_pregfree(pTHX_ struct regexp *r)
4874 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4877 if (!r || (--r->refcnt > 0))
4883 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4884 r->prelen, 60, UNI_DISPLAY_REGEX)
4885 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4889 PerlIO_printf(Perl_debug_log,
4890 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4891 PL_colors[4],PL_colors[5],PL_colors[0],
4894 len > 60 ? "..." : "");
4898 Safefree(r->precomp);
4899 if (r->offsets) /* 20010421 MJD */
4900 Safefree(r->offsets);
4901 if (RX_MATCH_COPIED(r))
4902 Safefree(r->subbeg);
4904 if (r->anchored_substr)
4905 SvREFCNT_dec(r->anchored_substr);
4906 if (r->anchored_utf8)
4907 SvREFCNT_dec(r->anchored_utf8);
4908 if (r->float_substr)
4909 SvREFCNT_dec(r->float_substr);
4911 SvREFCNT_dec(r->float_utf8);
4912 Safefree(r->substrs);
4915 int n = r->data->count;
4916 PAD* new_comppad = NULL;
4920 /* If you add a ->what type here, update the comment in regcomp.h */
4921 switch (r->data->what[n]) {
4923 SvREFCNT_dec((SV*)r->data->data[n]);
4926 Safefree(r->data->data[n]);
4929 new_comppad = (AV*)r->data->data[n];
4932 if (new_comppad == NULL)
4933 Perl_croak(aTHX_ "panic: pregfree comppad");
4934 PAD_SAVE_LOCAL(old_comppad,
4935 /* Watch out for global destruction's random ordering. */
4936 (SvTYPE(new_comppad) == SVt_PVAV) ?
4937 new_comppad : Null(PAD *)
4939 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4940 op_free((OP_4tree*)r->data->data[n]);
4943 PAD_RESTORE_LOCAL(old_comppad);
4944 SvREFCNT_dec((SV*)new_comppad);
4950 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4953 Safefree(r->data->what);
4956 Safefree(r->startp);
4962 - regnext - dig the "next" pointer out of a node
4964 * [Note, when REGALIGN is defined there are two places in regmatch()
4965 * that bypass this code for speed.]
4968 Perl_regnext(pTHX_ register regnode *p)
4970 register I32 offset;
4972 if (p == &PL_regdummy)
4975 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4983 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
4986 STRLEN l1 = strlen(pat1);
4987 STRLEN l2 = strlen(pat2);
4996 Copy(pat1, buf, l1 , char);
4997 Copy(pat2, buf + l1, l2 , char);
4998 buf[l1 + l2] = '\n';
4999 buf[l1 + l2 + 1] = '\0';
5001 /* ANSI variant takes additional second argument */
5002 va_start(args, pat2);
5006 msv = vmess(buf, &args);
5008 message = SvPV(msv,l1);
5011 Copy(message, buf, l1 , char);
5012 buf[l1] = '\0'; /* Overwrite \n */
5013 Perl_croak(aTHX_ "%s", buf);
5016 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5019 Perl_save_re_context(pTHX)
5022 SAVEPPTR(RExC_precomp); /* uncompiled string. */
5023 SAVEI32(RExC_npar); /* () count. */
5024 SAVEI32(RExC_size); /* Code size. */
5025 SAVEI32(RExC_flags); /* are we folding, multilining? */
5026 SAVEVPTR(RExC_rx); /* from regcomp.c */
5027 SAVEI32(RExC_seen); /* from regcomp.c */
5028 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
5029 SAVEI32(RExC_naughty); /* How bad is this pattern? */
5030 SAVEVPTR(RExC_emit); /* Code-emit pointer; ®dummy = don't */
5031 SAVEPPTR(RExC_end); /* End of input for compile */
5032 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
5035 SAVEI32(PL_reg_flags); /* from regexec.c */
5037 SAVEPPTR(PL_reginput); /* String-input pointer. */
5038 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5039 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
5040 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5041 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5042 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
5043 SAVEPPTR(PL_regtill); /* How far we are required to go. */
5044 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
5045 PL_reg_start_tmp = 0;
5046 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5047 PL_reg_start_tmpl = 0;
5048 SAVEVPTR(PL_regdata);
5049 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5050 SAVEI32(PL_regnarrate); /* from regexec.c */
5051 SAVEVPTR(PL_regprogram); /* from regexec.c */
5052 SAVEINT(PL_regindent); /* from regexec.c */
5053 SAVEVPTR(PL_regcc); /* from regexec.c */
5054 SAVEVPTR(PL_curcop);
5055 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5056 SAVEVPTR(PL_reg_re); /* from regexec.c */
5057 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5058 SAVESPTR(PL_reg_sv); /* from regexec.c */
5059 SAVEI8(PL_reg_match_utf8); /* from regexec.c */
5060 SAVEVPTR(PL_reg_magic); /* from regexec.c */
5061 SAVEI32(PL_reg_oldpos); /* from regexec.c */
5062 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5063 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5064 SAVEI32(PL_regnpar); /* () count. */
5065 SAVEI32(PL_regsize); /* from regexec.c */
5067 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
5072 clear_re(pTHX_ void *r)
5074 ReREFCNT_dec((regexp *)r);