5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_pregcomp my_regcomp
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_pregfree my_regfree
39 # define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 # define Perl_regnext my_regnext
42 # define Perl_save_re_context my_save_re_context
43 # define Perl_reginitcolors my_reginitcolors
45 # define PERL_NO_GET_CONTEXT
50 * pregcomp and pregexec -- regsub and regerror are not used in perl
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (c) 1991-2002, Larry Wall
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
83 #define PERL_IN_REGCOMP_C
86 #ifndef PERL_IN_XSUB_RE
98 # if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
107 #define STATIC static
110 typedef struct RExC_state_t {
111 U32 flags; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
114 char *start; /* Start of input for compile */
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
118 regnode *emit_start; /* Start of emitted-code area */
119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
130 char *starttry; /* -Dr: where regtry was called. */
131 #define RExC_starttry (pRExC_state->starttry)
135 #define RExC_flags (pRExC_state->flags)
136 #define RExC_precomp (pRExC_state->precomp)
137 #define RExC_rx (pRExC_state->rx)
138 #define RExC_start (pRExC_state->start)
139 #define RExC_end (pRExC_state->end)
140 #define RExC_parse (pRExC_state->parse)
141 #define RExC_whilem_seen (pRExC_state->whilem_seen)
142 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty (pRExC_state->naughty)
146 #define RExC_sawback (pRExC_state->sawback)
147 #define RExC_seen (pRExC_state->seen)
148 #define RExC_size (pRExC_state->size)
149 #define RExC_npar (pRExC_state->npar)
150 #define RExC_extralen (pRExC_state->extralen)
151 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8 (pRExC_state->utf8)
155 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
160 #undef SPSTART /* dratted cpp namespace... */
163 * Flags to be passed up and down.
165 #define WORST 0 /* Worst case. */
166 #define HASWIDTH 0x1 /* Known to match non-null strings. */
167 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART 0x4 /* Starts with * or +. */
169 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
171 /* Length of a variant. */
173 typedef struct scan_data_t {
179 I32 last_end; /* min value, <0 unless valid. */
182 SV **longest; /* Either &l_fixed, or &l_float. */
186 I32 offset_float_min;
187 I32 offset_float_max;
191 struct regnode_charclass_class *start_class;
195 * Forward declarations for pregcomp()'s friends.
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL 0x1
203 #define SF_BEFORE_MEOL 0x2
204 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
208 # define SF_FIX_SHIFT_EOL (0+2)
209 # define SF_FL_SHIFT_EOL (0+4)
211 # define SF_FIX_SHIFT_EOL (+2)
212 # define SF_FL_SHIFT_EOL (+4)
215 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
218 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF 0x40
221 #define SF_HAS_PAR 0x80
222 #define SF_IN_PAR 0x100
223 #define SF_HAS_EVAL 0x200
224 #define SCF_DO_SUBSTR 0x400
225 #define SCF_DO_STCLASS_AND 0x0800
226 #define SCF_DO_STCLASS_OR 0x1000
227 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS 0x2000
230 #define UTF (RExC_utf8 != 0)
231 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
232 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
234 #define OOB_UNICODE 12345678
235 #define OOB_NAMEDCLASS -1
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
249 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
259 #define FAIL(msg) STMT_START { \
260 char *ellipses = ""; \
261 IV len = RExC_end - RExC_precomp; \
264 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
265 if (len > RegexLengthToShowInErrorMessages) { \
266 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
267 len = RegexLengthToShowInErrorMessages - 10; \
270 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
271 msg, (int)len, RExC_precomp, ellipses); \
275 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
276 * args. Show regex, up to a maximum length. If it's too long, chop and add
279 #define FAIL2(pat,msg) STMT_START { \
280 char *ellipses = ""; \
281 IV len = RExC_end - RExC_precomp; \
284 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
285 if (len > RegexLengthToShowInErrorMessages) { \
286 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
287 len = RegexLengthToShowInErrorMessages - 10; \
290 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
291 msg, (int)len, RExC_precomp, ellipses); \
296 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
298 #define Simple_vFAIL(m) STMT_START { \
299 IV offset = RExC_parse - RExC_precomp; \
300 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
301 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
305 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
307 #define vFAIL(m) STMT_START { \
309 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
314 * Like Simple_vFAIL(), but accepts two arguments.
316 #define Simple_vFAIL2(m,a1) STMT_START { \
317 IV offset = RExC_parse - RExC_precomp; \
318 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
319 (int)offset, RExC_precomp, RExC_precomp + offset); \
323 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
325 #define vFAIL2(m,a1) STMT_START { \
327 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
328 Simple_vFAIL2(m, a1); \
333 * Like Simple_vFAIL(), but accepts three arguments.
335 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
336 IV offset = RExC_parse - RExC_precomp; \
337 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
338 (int)offset, RExC_precomp, RExC_precomp + offset); \
342 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
344 #define vFAIL3(m,a1,a2) STMT_START { \
346 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
347 Simple_vFAIL3(m, a1, a2); \
351 * Like Simple_vFAIL(), but accepts four arguments.
353 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
354 IV offset = RExC_parse - RExC_precomp; \
355 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
356 (int)offset, RExC_precomp, RExC_precomp + offset); \
360 * Like Simple_vFAIL(), but accepts five arguments.
362 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
363 IV offset = RExC_parse - RExC_precomp; \
364 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
365 (int)offset, RExC_precomp, RExC_precomp + offset); \
369 #define vWARN(loc,m) STMT_START { \
370 IV offset = loc - RExC_precomp; \
371 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
372 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
375 #define vWARNdep(loc,m) STMT_START { \
376 IV offset = loc - RExC_precomp; \
377 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
378 "%s" REPORT_LOCATION, \
379 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
383 #define vWARN2(loc, m, a1) STMT_START { \
384 IV offset = loc - RExC_precomp; \
385 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
386 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
389 #define vWARN3(loc, m, a1, a2) STMT_START { \
390 IV offset = loc - RExC_precomp; \
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
395 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
396 IV offset = loc - RExC_precomp; \
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
401 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
402 IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
408 /* Allow for side effects in s */
409 #define REGC(c,s) STMT_START { \
410 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
413 /* Macros for recording node offsets. 20001227 mjd@plover.com
414 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
415 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
416 * Element 0 holds the number n.
419 #define MJD_OFFSET_DEBUG(x)
420 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
423 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
425 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
426 __LINE__, (node), (byte))); \
428 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
430 RExC_offsets[2*(node)-1] = (byte); \
435 #define Set_Node_Offset(node,byte) \
436 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
439 #define Set_Node_Length_To_R(node,len) STMT_START { \
441 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
442 __LINE__, (node), (len))); \
444 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
446 RExC_offsets[2*(node)] = (len); \
451 #define Set_Node_Length(node,len) \
452 Set_Node_Length_To_R((node)-RExC_emit_start, len)
453 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454 #define Set_Node_Cur_Length(node) \
455 Set_Node_Length(node, RExC_parse - parse_start)
457 /* Get offsets and lengths */
458 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
461 static void clear_re(pTHX_ void *r);
463 /* Mark that we cannot extend a found fixed substring at this point.
464 Updata the longest found anchored substring and the longest found
465 floating substrings if needed. */
468 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
470 STRLEN l = CHR_SVLEN(data->last_found);
471 STRLEN old_l = CHR_SVLEN(*data->longest);
473 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
474 sv_setsv(*data->longest, data->last_found);
475 if (*data->longest == data->longest_fixed) {
476 data->offset_fixed = l ? data->last_start_min : data->pos_min;
477 if (data->flags & SF_BEFORE_EOL)
479 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
481 data->flags &= ~SF_FIX_BEFORE_EOL;
484 data->offset_float_min = l ? data->last_start_min : data->pos_min;
485 data->offset_float_max = (l
486 ? data->last_start_max
487 : data->pos_min + data->pos_delta);
488 if ((U32)data->offset_float_max > (U32)I32_MAX)
489 data->offset_float_max = I32_MAX;
490 if (data->flags & SF_BEFORE_EOL)
492 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
494 data->flags &= ~SF_FL_BEFORE_EOL;
497 SvCUR_set(data->last_found, 0);
499 data->flags &= ~SF_BEFORE_EOL;
502 /* Can match anything (initialization) */
504 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
506 ANYOF_CLASS_ZERO(cl);
507 ANYOF_BITMAP_SETALL(cl);
508 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
510 cl->flags |= ANYOF_LOCALE;
513 /* Can match anything (initialization) */
515 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
519 for (value = 0; value <= ANYOF_MAX; value += 2)
520 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
522 if (!(cl->flags & ANYOF_UNICODE_ALL))
524 if (!ANYOF_BITMAP_TESTALLSET(cl))
529 /* Can match anything (initialization) */
531 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
533 Zero(cl, 1, struct regnode_charclass_class);
535 cl_anything(pRExC_state, cl);
539 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
541 Zero(cl, 1, struct regnode_charclass_class);
543 cl_anything(pRExC_state, cl);
545 cl->flags |= ANYOF_LOCALE;
548 /* 'And' a given class with another one. Can create false positives */
549 /* We assume that cl is not inverted */
551 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
552 struct regnode_charclass_class *and_with)
554 if (!(and_with->flags & ANYOF_CLASS)
555 && !(cl->flags & ANYOF_CLASS)
556 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
557 && !(and_with->flags & ANYOF_FOLD)
558 && !(cl->flags & ANYOF_FOLD)) {
561 if (and_with->flags & ANYOF_INVERT)
562 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
563 cl->bitmap[i] &= ~and_with->bitmap[i];
565 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
566 cl->bitmap[i] &= and_with->bitmap[i];
567 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
568 if (!(and_with->flags & ANYOF_EOS))
569 cl->flags &= ~ANYOF_EOS;
571 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
572 cl->flags &= ~ANYOF_UNICODE_ALL;
573 cl->flags |= ANYOF_UNICODE;
574 ARG_SET(cl, ARG(and_with));
576 if (!(and_with->flags & ANYOF_UNICODE_ALL))
577 cl->flags &= ~ANYOF_UNICODE_ALL;
578 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
579 cl->flags &= ~ANYOF_UNICODE;
582 /* 'OR' a given class with another one. Can create false positives */
583 /* We assume that cl is not inverted */
585 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
587 if (or_with->flags & ANYOF_INVERT) {
589 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
590 * <= (B1 | !B2) | (CL1 | !CL2)
591 * which is wasteful if CL2 is small, but we ignore CL2:
592 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
593 * XXXX Can we handle case-fold? Unclear:
594 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
595 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
597 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
598 && !(or_with->flags & ANYOF_FOLD)
599 && !(cl->flags & ANYOF_FOLD) ) {
602 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
603 cl->bitmap[i] |= ~or_with->bitmap[i];
604 } /* XXXX: logic is complicated otherwise */
606 cl_anything(pRExC_state, cl);
609 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
610 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
611 && (!(or_with->flags & ANYOF_FOLD)
612 || (cl->flags & ANYOF_FOLD)) ) {
615 /* OR char bitmap and class bitmap separately */
616 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
617 cl->bitmap[i] |= or_with->bitmap[i];
618 if (or_with->flags & ANYOF_CLASS) {
619 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
620 cl->classflags[i] |= or_with->classflags[i];
621 cl->flags |= ANYOF_CLASS;
624 else { /* XXXX: logic is complicated, leave it along for a moment. */
625 cl_anything(pRExC_state, cl);
628 if (or_with->flags & ANYOF_EOS)
629 cl->flags |= ANYOF_EOS;
631 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
632 ARG(cl) != ARG(or_with)) {
633 cl->flags |= ANYOF_UNICODE_ALL;
634 cl->flags &= ~ANYOF_UNICODE;
636 if (or_with->flags & ANYOF_UNICODE_ALL) {
637 cl->flags |= ANYOF_UNICODE_ALL;
638 cl->flags &= ~ANYOF_UNICODE;
643 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
644 * These need to be revisited when a newer toolchain becomes available.
646 #if defined(__sparc64__) && defined(__GNUC__)
647 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
648 # undef SPARC64_GCC_WORKAROUND
649 # define SPARC64_GCC_WORKAROUND 1
653 /* REx optimizer. Converts nodes into quickier variants "in place".
654 Finds fixed substrings. */
656 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
657 to the position after last scanned or to NULL. */
660 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
661 /* scanp: Start here (read-write). */
662 /* deltap: Write maxlen-minlen here. */
663 /* last: Stop before this one. */
665 I32 min = 0, pars = 0, code;
666 regnode *scan = *scanp, *next;
668 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
669 int is_inf_internal = 0; /* The studied chunk is infinite */
670 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
671 scan_data_t data_fake;
672 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
674 while (scan && OP(scan) != END && scan < last) {
675 /* Peephole optimizer: */
677 if (PL_regkind[(U8)OP(scan)] == EXACT) {
678 /* Merge several consecutive EXACTish nodes into one. */
679 regnode *n = regnext(scan);
682 regnode *stop = scan;
685 next = scan + NODE_SZ_STR(scan);
686 /* Skip NOTHING, merge EXACT*. */
688 ( PL_regkind[(U8)OP(n)] == NOTHING ||
689 (stringok && (OP(n) == OP(scan))))
691 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
692 if (OP(n) == TAIL || n > next)
694 if (PL_regkind[(U8)OP(n)] == NOTHING) {
695 NEXT_OFF(scan) += NEXT_OFF(n);
696 next = n + NODE_STEP_REGNODE;
704 int oldl = STR_LEN(scan);
705 regnode *nnext = regnext(n);
707 if (oldl + STR_LEN(n) > U8_MAX)
709 NEXT_OFF(scan) += NEXT_OFF(n);
710 STR_LEN(scan) += STR_LEN(n);
711 next = n + NODE_SZ_STR(n);
712 /* Now we can overwrite *n : */
713 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
721 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
723 Two problematic code points in Unicode casefolding of EXACT nodes:
725 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
726 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
732 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
733 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
735 This means that in case-insensitive matching (or "loose matching",
736 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
737 length of the above casefolded versions) can match a target string
738 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
739 This would rather mess up the minimum length computation.
741 What we'll do is to look for the tail four bytes, and then peek
742 at the preceding two bytes to see whether we need to decrease
743 the minimum length by four (six minus two).
745 Thanks to the design of UTF-8, there cannot be false matches:
746 A sequence of valid UTF-8 bytes cannot be a subsequence of
747 another valid sequence of UTF-8 bytes.
750 char *s0 = STRING(scan), *s, *t;
751 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
752 char *t0 = "\xcc\x88\xcc\x81";
756 s < s2 && (t = ninstr(s, s1, t0, t1));
758 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
759 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
766 n = scan + NODE_SZ_STR(scan);
768 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
776 /* Follow the next-chain of the current node and optimize
777 away all the NOTHINGs from it. */
778 if (OP(scan) != CURLYX) {
779 int max = (reg_off_by_arg[OP(scan)]
781 /* I32 may be smaller than U16 on CRAYs! */
782 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
783 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
787 /* Skip NOTHING and LONGJMP. */
788 while ((n = regnext(n))
789 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
790 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
793 if (reg_off_by_arg[OP(scan)])
796 NEXT_OFF(scan) = off;
798 /* The principal pseudo-switch. Cannot be a switch, since we
799 look into several different things. */
800 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
801 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
802 next = regnext(scan);
805 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
806 I32 max1 = 0, min1 = I32_MAX, num = 0;
807 struct regnode_charclass_class accum;
809 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
810 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
811 if (flags & SCF_DO_STCLASS)
812 cl_init_zero(pRExC_state, &accum);
813 while (OP(scan) == code) {
814 I32 deltanext, minnext, f = 0, fake;
815 struct regnode_charclass_class this_class;
820 data_fake.whilem_c = data->whilem_c;
821 data_fake.last_closep = data->last_closep;
824 data_fake.last_closep = &fake;
825 next = regnext(scan);
826 scan = NEXTOPER(scan);
828 scan = NEXTOPER(scan);
829 if (flags & SCF_DO_STCLASS) {
830 cl_init(pRExC_state, &this_class);
831 data_fake.start_class = &this_class;
832 f = SCF_DO_STCLASS_AND;
834 if (flags & SCF_WHILEM_VISITED_POS)
835 f |= SCF_WHILEM_VISITED_POS;
836 /* we suppose the run is continuous, last=next...*/
837 minnext = study_chunk(pRExC_state, &scan, &deltanext,
838 next, &data_fake, f);
841 if (max1 < minnext + deltanext)
842 max1 = minnext + deltanext;
843 if (deltanext == I32_MAX)
844 is_inf = is_inf_internal = 1;
846 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
848 if (data && (data_fake.flags & SF_HAS_EVAL))
849 data->flags |= SF_HAS_EVAL;
851 data->whilem_c = data_fake.whilem_c;
852 if (flags & SCF_DO_STCLASS)
853 cl_or(pRExC_state, &accum, &this_class);
857 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
859 if (flags & SCF_DO_SUBSTR) {
860 data->pos_min += min1;
861 data->pos_delta += max1 - min1;
862 if (max1 != min1 || is_inf)
863 data->longest = &(data->longest_float);
866 delta += max1 - min1;
867 if (flags & SCF_DO_STCLASS_OR) {
868 cl_or(pRExC_state, data->start_class, &accum);
870 cl_and(data->start_class, &and_with);
871 flags &= ~SCF_DO_STCLASS;
874 else if (flags & SCF_DO_STCLASS_AND) {
876 cl_and(data->start_class, &accum);
877 flags &= ~SCF_DO_STCLASS;
880 /* Switch to OR mode: cache the old value of
881 * data->start_class */
882 StructCopy(data->start_class, &and_with,
883 struct regnode_charclass_class);
884 flags &= ~SCF_DO_STCLASS_AND;
885 StructCopy(&accum, data->start_class,
886 struct regnode_charclass_class);
887 flags |= SCF_DO_STCLASS_OR;
888 data->start_class->flags |= ANYOF_EOS;
892 else if (code == BRANCHJ) /* single branch is optimized. */
893 scan = NEXTOPER(NEXTOPER(scan));
894 else /* single branch is optimized. */
895 scan = NEXTOPER(scan);
898 else if (OP(scan) == EXACT) {
899 I32 l = STR_LEN(scan);
900 UV uc = *((U8*)STRING(scan));
902 U8 *s = (U8*)STRING(scan);
903 l = utf8_length(s, s + l);
904 uc = utf8_to_uvchr(s, NULL);
907 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
908 /* The code below prefers earlier match for fixed
909 offset, later match for variable offset. */
910 if (data->last_end == -1) { /* Update the start info. */
911 data->last_start_min = data->pos_min;
912 data->last_start_max = is_inf
913 ? I32_MAX : data->pos_min + data->pos_delta;
915 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
917 SvUTF8_on(data->last_found);
918 data->last_end = data->pos_min + l;
919 data->pos_min += l; /* As in the first entry. */
920 data->flags &= ~SF_BEFORE_EOL;
922 if (flags & SCF_DO_STCLASS_AND) {
923 /* Check whether it is compatible with what we know already! */
927 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
928 && !ANYOF_BITMAP_TEST(data->start_class, uc)
929 && (!(data->start_class->flags & ANYOF_FOLD)
930 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
933 ANYOF_CLASS_ZERO(data->start_class);
934 ANYOF_BITMAP_ZERO(data->start_class);
936 ANYOF_BITMAP_SET(data->start_class, uc);
937 data->start_class->flags &= ~ANYOF_EOS;
939 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
941 else if (flags & SCF_DO_STCLASS_OR) {
942 /* false positive possible if the class is case-folded */
944 ANYOF_BITMAP_SET(data->start_class, uc);
946 data->start_class->flags |= ANYOF_UNICODE_ALL;
947 data->start_class->flags &= ~ANYOF_EOS;
948 cl_and(data->start_class, &and_with);
950 flags &= ~SCF_DO_STCLASS;
952 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
953 I32 l = STR_LEN(scan);
954 UV uc = *((U8*)STRING(scan));
956 /* Search for fixed substrings supports EXACT only. */
957 if (flags & SCF_DO_SUBSTR)
958 scan_commit(pRExC_state, data);
960 U8 *s = (U8 *)STRING(scan);
961 l = utf8_length(s, s + l);
962 uc = utf8_to_uvchr(s, NULL);
965 if (data && (flags & SCF_DO_SUBSTR))
967 if (flags & SCF_DO_STCLASS_AND) {
968 /* Check whether it is compatible with what we know already! */
972 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
973 && !ANYOF_BITMAP_TEST(data->start_class, uc)
974 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
976 ANYOF_CLASS_ZERO(data->start_class);
977 ANYOF_BITMAP_ZERO(data->start_class);
979 ANYOF_BITMAP_SET(data->start_class, uc);
980 data->start_class->flags &= ~ANYOF_EOS;
981 data->start_class->flags |= ANYOF_FOLD;
982 if (OP(scan) == EXACTFL)
983 data->start_class->flags |= ANYOF_LOCALE;
986 else if (flags & SCF_DO_STCLASS_OR) {
987 if (data->start_class->flags & ANYOF_FOLD) {
988 /* false positive possible if the class is case-folded.
989 Assume that the locale settings are the same... */
991 ANYOF_BITMAP_SET(data->start_class, uc);
992 data->start_class->flags &= ~ANYOF_EOS;
994 cl_and(data->start_class, &and_with);
996 flags &= ~SCF_DO_STCLASS;
998 else if (strchr((char*)PL_varies,OP(scan))) {
999 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1000 I32 f = flags, pos_before = 0;
1001 regnode *oscan = scan;
1002 struct regnode_charclass_class this_class;
1003 struct regnode_charclass_class *oclass = NULL;
1004 I32 next_is_eval = 0;
1006 switch (PL_regkind[(U8)OP(scan)]) {
1007 case WHILEM: /* End of (?:...)* . */
1008 scan = NEXTOPER(scan);
1011 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1012 next = NEXTOPER(scan);
1013 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1015 maxcount = REG_INFTY;
1016 next = regnext(scan);
1017 scan = NEXTOPER(scan);
1021 if (flags & SCF_DO_SUBSTR)
1026 if (flags & SCF_DO_STCLASS) {
1028 maxcount = REG_INFTY;
1029 next = regnext(scan);
1030 scan = NEXTOPER(scan);
1033 is_inf = is_inf_internal = 1;
1034 scan = regnext(scan);
1035 if (flags & SCF_DO_SUBSTR) {
1036 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1037 data->longest = &(data->longest_float);
1039 goto optimize_curly_tail;
1041 mincount = ARG1(scan);
1042 maxcount = ARG2(scan);
1043 next = regnext(scan);
1044 if (OP(scan) == CURLYX) {
1045 I32 lp = (data ? *(data->last_closep) : 0);
1047 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1049 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1050 next_is_eval = (OP(scan) == EVAL);
1052 if (flags & SCF_DO_SUBSTR) {
1053 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1054 pos_before = data->pos_min;
1058 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1060 data->flags |= SF_IS_INF;
1062 if (flags & SCF_DO_STCLASS) {
1063 cl_init(pRExC_state, &this_class);
1064 oclass = data->start_class;
1065 data->start_class = &this_class;
1066 f |= SCF_DO_STCLASS_AND;
1067 f &= ~SCF_DO_STCLASS_OR;
1069 /* These are the cases when once a subexpression
1070 fails at a particular position, it cannot succeed
1071 even after backtracking at the enclosing scope.
1073 XXXX what if minimal match and we are at the
1074 initial run of {n,m}? */
1075 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1076 f &= ~SCF_WHILEM_VISITED_POS;
1078 /* This will finish on WHILEM, setting scan, or on NULL: */
1079 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1081 ? (f & ~SCF_DO_SUBSTR) : f);
1083 if (flags & SCF_DO_STCLASS)
1084 data->start_class = oclass;
1085 if (mincount == 0 || minnext == 0) {
1086 if (flags & SCF_DO_STCLASS_OR) {
1087 cl_or(pRExC_state, data->start_class, &this_class);
1089 else if (flags & SCF_DO_STCLASS_AND) {
1090 /* Switch to OR mode: cache the old value of
1091 * data->start_class */
1092 StructCopy(data->start_class, &and_with,
1093 struct regnode_charclass_class);
1094 flags &= ~SCF_DO_STCLASS_AND;
1095 StructCopy(&this_class, data->start_class,
1096 struct regnode_charclass_class);
1097 flags |= SCF_DO_STCLASS_OR;
1098 data->start_class->flags |= ANYOF_EOS;
1100 } else { /* Non-zero len */
1101 if (flags & SCF_DO_STCLASS_OR) {
1102 cl_or(pRExC_state, data->start_class, &this_class);
1103 cl_and(data->start_class, &and_with);
1105 else if (flags & SCF_DO_STCLASS_AND)
1106 cl_and(data->start_class, &this_class);
1107 flags &= ~SCF_DO_STCLASS;
1109 if (!scan) /* It was not CURLYX, but CURLY. */
1111 if (ckWARN(WARN_REGEXP)
1112 /* ? quantifier ok, except for (?{ ... }) */
1113 && (next_is_eval || !(mincount == 0 && maxcount == 1))
1114 && (minnext == 0) && (deltanext == 0)
1115 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1116 && maxcount <= REG_INFTY/3) /* Complement check for big count */
1119 "Quantifier unexpected on zero-length expression");
1122 min += minnext * mincount;
1123 is_inf_internal |= ((maxcount == REG_INFTY
1124 && (minnext + deltanext) > 0)
1125 || deltanext == I32_MAX);
1126 is_inf |= is_inf_internal;
1127 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1129 /* Try powerful optimization CURLYX => CURLYN. */
1130 if ( OP(oscan) == CURLYX && data
1131 && data->flags & SF_IN_PAR
1132 && !(data->flags & SF_HAS_EVAL)
1133 && !deltanext && minnext == 1 ) {
1134 /* Try to optimize to CURLYN. */
1135 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1136 regnode *nxt1 = nxt;
1143 if (!strchr((char*)PL_simple,OP(nxt))
1144 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1145 && STR_LEN(nxt) == 1))
1151 if (OP(nxt) != CLOSE)
1153 /* Now we know that nxt2 is the only contents: */
1154 oscan->flags = (U8)ARG(nxt);
1156 OP(nxt1) = NOTHING; /* was OPEN. */
1158 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1159 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1160 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1161 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1162 OP(nxt + 1) = OPTIMIZED; /* was count. */
1163 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1168 /* Try optimization CURLYX => CURLYM. */
1169 if ( OP(oscan) == CURLYX && data
1170 && !(data->flags & SF_HAS_PAR)
1171 && !(data->flags & SF_HAS_EVAL)
1173 /* XXXX How to optimize if data == 0? */
1174 /* Optimize to a simpler form. */
1175 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1179 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1180 && (OP(nxt2) != WHILEM))
1182 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1183 /* Need to optimize away parenths. */
1184 if (data->flags & SF_IN_PAR) {
1185 /* Set the parenth number. */
1186 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1188 if (OP(nxt) != CLOSE)
1189 FAIL("Panic opt close");
1190 oscan->flags = (U8)ARG(nxt);
1191 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1192 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1194 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1195 OP(nxt + 1) = OPTIMIZED; /* was count. */
1196 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1197 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1200 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1201 regnode *nnxt = regnext(nxt1);
1204 if (reg_off_by_arg[OP(nxt1)])
1205 ARG_SET(nxt1, nxt2 - nxt1);
1206 else if (nxt2 - nxt1 < U16_MAX)
1207 NEXT_OFF(nxt1) = nxt2 - nxt1;
1209 OP(nxt) = NOTHING; /* Cannot beautify */
1214 /* Optimize again: */
1215 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1221 else if ((OP(oscan) == CURLYX)
1222 && (flags & SCF_WHILEM_VISITED_POS)
1223 /* See the comment on a similar expression above.
1224 However, this time it not a subexpression
1225 we care about, but the expression itself. */
1226 && (maxcount == REG_INFTY)
1227 && data && ++data->whilem_c < 16) {
1228 /* This stays as CURLYX, we can put the count/of pair. */
1229 /* Find WHILEM (as in regexec.c) */
1230 regnode *nxt = oscan + NEXT_OFF(oscan);
1232 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1234 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1235 | (RExC_whilem_seen << 4)); /* On WHILEM */
1237 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1239 if (flags & SCF_DO_SUBSTR) {
1240 SV *last_str = Nullsv;
1241 int counted = mincount != 0;
1243 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1244 #if defined(SPARC64_GCC_WORKAROUND)
1250 if (pos_before >= data->last_start_min)
1253 b = data->last_start_min;
1256 s = SvPV(data->last_found, l);
1257 old = b - data->last_start_min;
1260 I32 b = pos_before >= data->last_start_min
1261 ? pos_before : data->last_start_min;
1263 char *s = SvPV(data->last_found, l);
1264 I32 old = b - data->last_start_min;
1268 old = utf8_hop((U8*)s, old) - (U8*)s;
1271 /* Get the added string: */
1272 last_str = newSVpvn(s + old, l);
1274 SvUTF8_on(last_str);
1275 if (deltanext == 0 && pos_before == b) {
1276 /* What was added is a constant string */
1278 SvGROW(last_str, (mincount * l) + 1);
1279 repeatcpy(SvPVX(last_str) + l,
1280 SvPVX(last_str), l, mincount - 1);
1281 SvCUR(last_str) *= mincount;
1282 /* Add additional parts. */
1283 SvCUR_set(data->last_found,
1284 SvCUR(data->last_found) - l);
1285 sv_catsv(data->last_found, last_str);
1286 data->last_end += l * (mincount - 1);
1289 /* start offset must point into the last copy */
1290 data->last_start_min += minnext * (mincount - 1);
1291 data->last_start_max += is_inf ? I32_MAX
1292 : (maxcount - 1) * (minnext + data->pos_delta);
1295 /* It is counted once already... */
1296 data->pos_min += minnext * (mincount - counted);
1297 data->pos_delta += - counted * deltanext +
1298 (minnext + deltanext) * maxcount - minnext * mincount;
1299 if (mincount != maxcount) {
1300 /* Cannot extend fixed substrings found inside
1302 scan_commit(pRExC_state,data);
1303 if (mincount && last_str) {
1304 sv_setsv(data->last_found, last_str);
1305 data->last_end = data->pos_min;
1306 data->last_start_min =
1307 data->pos_min - CHR_SVLEN(last_str);
1308 data->last_start_max = is_inf
1310 : data->pos_min + data->pos_delta
1311 - CHR_SVLEN(last_str);
1313 data->longest = &(data->longest_float);
1315 SvREFCNT_dec(last_str);
1317 if (data && (fl & SF_HAS_EVAL))
1318 data->flags |= SF_HAS_EVAL;
1319 optimize_curly_tail:
1320 if (OP(oscan) != CURLYX) {
1321 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1323 NEXT_OFF(oscan) += NEXT_OFF(next);
1326 default: /* REF and CLUMP only? */
1327 if (flags & SCF_DO_SUBSTR) {
1328 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1329 data->longest = &(data->longest_float);
1331 is_inf = is_inf_internal = 1;
1332 if (flags & SCF_DO_STCLASS_OR)
1333 cl_anything(pRExC_state, data->start_class);
1334 flags &= ~SCF_DO_STCLASS;
1338 else if (strchr((char*)PL_simple,OP(scan))) {
1341 if (flags & SCF_DO_SUBSTR) {
1342 scan_commit(pRExC_state,data);
1346 if (flags & SCF_DO_STCLASS) {
1347 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1349 /* Some of the logic below assumes that switching
1350 locale on will only add false positives. */
1351 switch (PL_regkind[(U8)OP(scan)]) {
1355 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1356 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1357 cl_anything(pRExC_state, data->start_class);
1360 if (OP(scan) == SANY)
1362 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1363 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1364 || (data->start_class->flags & ANYOF_CLASS));
1365 cl_anything(pRExC_state, data->start_class);
1367 if (flags & SCF_DO_STCLASS_AND || !value)
1368 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1371 if (flags & SCF_DO_STCLASS_AND)
1372 cl_and(data->start_class,
1373 (struct regnode_charclass_class*)scan);
1375 cl_or(pRExC_state, data->start_class,
1376 (struct regnode_charclass_class*)scan);
1379 if (flags & SCF_DO_STCLASS_AND) {
1380 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1381 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1382 for (value = 0; value < 256; value++)
1383 if (!isALNUM(value))
1384 ANYOF_BITMAP_CLEAR(data->start_class, value);
1388 if (data->start_class->flags & ANYOF_LOCALE)
1389 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1391 for (value = 0; value < 256; value++)
1393 ANYOF_BITMAP_SET(data->start_class, value);
1398 if (flags & SCF_DO_STCLASS_AND) {
1399 if (data->start_class->flags & ANYOF_LOCALE)
1400 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1403 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1404 data->start_class->flags |= ANYOF_LOCALE;
1408 if (flags & SCF_DO_STCLASS_AND) {
1409 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1410 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1411 for (value = 0; value < 256; value++)
1413 ANYOF_BITMAP_CLEAR(data->start_class, value);
1417 if (data->start_class->flags & ANYOF_LOCALE)
1418 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1420 for (value = 0; value < 256; value++)
1421 if (!isALNUM(value))
1422 ANYOF_BITMAP_SET(data->start_class, value);
1427 if (flags & SCF_DO_STCLASS_AND) {
1428 if (data->start_class->flags & ANYOF_LOCALE)
1429 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1432 data->start_class->flags |= ANYOF_LOCALE;
1433 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1437 if (flags & SCF_DO_STCLASS_AND) {
1438 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1439 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1440 for (value = 0; value < 256; value++)
1441 if (!isSPACE(value))
1442 ANYOF_BITMAP_CLEAR(data->start_class, value);
1446 if (data->start_class->flags & ANYOF_LOCALE)
1447 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1449 for (value = 0; value < 256; value++)
1451 ANYOF_BITMAP_SET(data->start_class, value);
1456 if (flags & SCF_DO_STCLASS_AND) {
1457 if (data->start_class->flags & ANYOF_LOCALE)
1458 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1461 data->start_class->flags |= ANYOF_LOCALE;
1462 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1466 if (flags & SCF_DO_STCLASS_AND) {
1467 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1468 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1469 for (value = 0; value < 256; value++)
1471 ANYOF_BITMAP_CLEAR(data->start_class, value);
1475 if (data->start_class->flags & ANYOF_LOCALE)
1476 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1478 for (value = 0; value < 256; value++)
1479 if (!isSPACE(value))
1480 ANYOF_BITMAP_SET(data->start_class, value);
1485 if (flags & SCF_DO_STCLASS_AND) {
1486 if (data->start_class->flags & ANYOF_LOCALE) {
1487 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1488 for (value = 0; value < 256; value++)
1489 if (!isSPACE(value))
1490 ANYOF_BITMAP_CLEAR(data->start_class, value);
1494 data->start_class->flags |= ANYOF_LOCALE;
1495 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1499 if (flags & SCF_DO_STCLASS_AND) {
1500 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1501 for (value = 0; value < 256; value++)
1502 if (!isDIGIT(value))
1503 ANYOF_BITMAP_CLEAR(data->start_class, value);
1506 if (data->start_class->flags & ANYOF_LOCALE)
1507 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1509 for (value = 0; value < 256; value++)
1511 ANYOF_BITMAP_SET(data->start_class, value);
1516 if (flags & SCF_DO_STCLASS_AND) {
1517 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1518 for (value = 0; value < 256; value++)
1520 ANYOF_BITMAP_CLEAR(data->start_class, value);
1523 if (data->start_class->flags & ANYOF_LOCALE)
1524 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1526 for (value = 0; value < 256; value++)
1527 if (!isDIGIT(value))
1528 ANYOF_BITMAP_SET(data->start_class, value);
1533 if (flags & SCF_DO_STCLASS_OR)
1534 cl_and(data->start_class, &and_with);
1535 flags &= ~SCF_DO_STCLASS;
1538 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1539 data->flags |= (OP(scan) == MEOL
1543 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1544 /* Lookbehind, or need to calculate parens/evals/stclass: */
1545 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1546 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1547 /* Lookahead/lookbehind */
1548 I32 deltanext, minnext, fake = 0;
1550 struct regnode_charclass_class intrnl;
1553 data_fake.flags = 0;
1555 data_fake.whilem_c = data->whilem_c;
1556 data_fake.last_closep = data->last_closep;
1559 data_fake.last_closep = &fake;
1560 if ( flags & SCF_DO_STCLASS && !scan->flags
1561 && OP(scan) == IFMATCH ) { /* Lookahead */
1562 cl_init(pRExC_state, &intrnl);
1563 data_fake.start_class = &intrnl;
1564 f |= SCF_DO_STCLASS_AND;
1566 if (flags & SCF_WHILEM_VISITED_POS)
1567 f |= SCF_WHILEM_VISITED_POS;
1568 next = regnext(scan);
1569 nscan = NEXTOPER(NEXTOPER(scan));
1570 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1573 vFAIL("Variable length lookbehind not implemented");
1575 else if (minnext > U8_MAX) {
1576 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1578 scan->flags = (U8)minnext;
1580 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1582 if (data && (data_fake.flags & SF_HAS_EVAL))
1583 data->flags |= SF_HAS_EVAL;
1585 data->whilem_c = data_fake.whilem_c;
1586 if (f & SCF_DO_STCLASS_AND) {
1587 int was = (data->start_class->flags & ANYOF_EOS);
1589 cl_and(data->start_class, &intrnl);
1591 data->start_class->flags |= ANYOF_EOS;
1594 else if (OP(scan) == OPEN) {
1597 else if (OP(scan) == CLOSE) {
1598 if ((I32)ARG(scan) == is_par) {
1599 next = regnext(scan);
1601 if ( next && (OP(next) != WHILEM) && next < last)
1602 is_par = 0; /* Disable optimization */
1605 *(data->last_closep) = ARG(scan);
1607 else if (OP(scan) == EVAL) {
1609 data->flags |= SF_HAS_EVAL;
1611 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1612 if (flags & SCF_DO_SUBSTR) {
1613 scan_commit(pRExC_state,data);
1614 data->longest = &(data->longest_float);
1616 is_inf = is_inf_internal = 1;
1617 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1618 cl_anything(pRExC_state, data->start_class);
1619 flags &= ~SCF_DO_STCLASS;
1621 /* Else: zero-length, ignore. */
1622 scan = regnext(scan);
1627 *deltap = is_inf_internal ? I32_MAX : delta;
1628 if (flags & SCF_DO_SUBSTR && is_inf)
1629 data->pos_delta = I32_MAX - data->pos_min;
1630 if (is_par > U8_MAX)
1632 if (is_par && pars==1 && data) {
1633 data->flags |= SF_IN_PAR;
1634 data->flags &= ~SF_HAS_PAR;
1636 else if (pars && data) {
1637 data->flags |= SF_HAS_PAR;
1638 data->flags &= ~SF_IN_PAR;
1640 if (flags & SCF_DO_STCLASS_OR)
1641 cl_and(data->start_class, &and_with);
1646 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1648 if (RExC_rx->data) {
1649 Renewc(RExC_rx->data,
1650 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1651 char, struct reg_data);
1652 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1653 RExC_rx->data->count += n;
1656 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1657 char, struct reg_data);
1658 New(1208, RExC_rx->data->what, n, U8);
1659 RExC_rx->data->count = n;
1661 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1662 return RExC_rx->data->count - n;
1666 Perl_reginitcolors(pTHX)
1669 char *s = PerlEnv_getenv("PERL_RE_COLORS");
1672 PL_colors[0] = s = savepv(s);
1674 s = strchr(s, '\t');
1680 PL_colors[i] = s = "";
1684 PL_colors[i++] = "";
1691 - pregcomp - compile a regular expression into internal code
1693 * We can't allocate space until we know how big the compiled form will be,
1694 * but we can't compile it (and thus know how big it is) until we've got a
1695 * place to put the code. So we cheat: we compile it twice, once with code
1696 * generation turned off and size counting turned on, and once "for real".
1697 * This also means that we don't allocate space until we are sure that the
1698 * thing really will compile successfully, and we never have to move the
1699 * code and thus invalidate pointers into it. (Note that it has to be in
1700 * one piece because free() must be able to free it all.) [NB: not true in perl]
1702 * Beware that the optimization-preparation code in here knows about some
1703 * of the structure of the compiled regexp. [I'll say.]
1706 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1716 RExC_state_t RExC_state;
1717 RExC_state_t *pRExC_state = &RExC_state;
1720 FAIL("NULL regexp argument");
1722 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1726 if (!PL_colorset) reginitcolors();
1727 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1728 PL_colors[4],PL_colors[5],PL_colors[0],
1729 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1731 RExC_flags = pm->op_pmflags;
1735 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1736 RExC_seen_evals = 0;
1739 /* First pass: determine size, legality. */
1746 RExC_emit = &PL_regdummy;
1747 RExC_whilem_seen = 0;
1748 #if 0 /* REGC() is (currently) a NOP at the first pass.
1749 * Clever compilers notice this and complain. --jhi */
1750 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1752 if (reg(pRExC_state, 0, &flags) == NULL) {
1753 RExC_precomp = Nullch;
1756 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1758 /* Small enough for pointer-storage convention?
1759 If extralen==0, this means that we will not need long jumps. */
1760 if (RExC_size >= 0x10000L && RExC_extralen)
1761 RExC_size += RExC_extralen;
1764 if (RExC_whilem_seen > 15)
1765 RExC_whilem_seen = 15;
1767 /* Allocate space and initialize. */
1768 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1771 FAIL("Regexp out of space");
1774 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1775 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1778 r->prelen = xend - exp;
1779 r->precomp = savepvn(RExC_precomp, r->prelen);
1781 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1782 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1784 r->substrs = 0; /* Useful during FAIL. */
1785 r->startp = 0; /* Useful during FAIL. */
1786 r->endp = 0; /* Useful during FAIL. */
1788 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1790 r->offsets[0] = RExC_size;
1792 DEBUG_r(PerlIO_printf(Perl_debug_log,
1793 "%s %"UVuf" bytes for offset annotations.\n",
1794 r->offsets ? "Got" : "Couldn't get",
1795 (UV)((2*RExC_size+1) * sizeof(U32))));
1799 /* Second pass: emit code. */
1800 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
1805 RExC_emit_start = r->program;
1806 RExC_emit = r->program;
1807 /* Store the count of eval-groups for security checks: */
1808 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1809 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1811 if (reg(pRExC_state, 0, &flags) == NULL)
1814 /* Dig out information for optimizations. */
1815 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1816 pm->op_pmflags = RExC_flags;
1818 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1819 r->regstclass = NULL;
1820 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1821 r->reganch |= ROPT_NAUGHTY;
1822 scan = r->program + 1; /* First BRANCH. */
1824 /* XXXX To minimize changes to RE engine we always allocate
1825 3-units-long substrs field. */
1826 Newz(1004, r->substrs, 1, struct reg_substr_data);
1828 StructCopy(&zero_scan_data, &data, scan_data_t);
1829 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1830 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1832 STRLEN longest_float_length, longest_fixed_length;
1833 struct regnode_charclass_class ch_class;
1838 /* Skip introductions and multiplicators >= 1. */
1839 while ((OP(first) == OPEN && (sawopen = 1)) ||
1840 /* An OR of *one* alternative - should not happen now. */
1841 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1842 (OP(first) == PLUS) ||
1843 (OP(first) == MINMOD) ||
1844 /* An {n,m} with n>0 */
1845 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1846 if (OP(first) == PLUS)
1849 first += regarglen[(U8)OP(first)];
1850 first = NEXTOPER(first);
1853 /* Starting-point info. */
1855 if (PL_regkind[(U8)OP(first)] == EXACT) {
1856 if (OP(first) == EXACT)
1857 ; /* Empty, get anchored substr later. */
1858 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1859 r->regstclass = first;
1861 else if (strchr((char*)PL_simple,OP(first)))
1862 r->regstclass = first;
1863 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1864 PL_regkind[(U8)OP(first)] == NBOUND)
1865 r->regstclass = first;
1866 else if (PL_regkind[(U8)OP(first)] == BOL) {
1867 r->reganch |= (OP(first) == MBOL
1869 : (OP(first) == SBOL
1872 first = NEXTOPER(first);
1875 else if (OP(first) == GPOS) {
1876 r->reganch |= ROPT_ANCH_GPOS;
1877 first = NEXTOPER(first);
1880 else if (!sawopen && (OP(first) == STAR &&
1881 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1882 !(r->reganch & ROPT_ANCH) )
1884 /* turn .* into ^.* with an implied $*=1 */
1885 int type = OP(NEXTOPER(first));
1887 if (type == REG_ANY)
1888 type = ROPT_ANCH_MBOL;
1890 type = ROPT_ANCH_SBOL;
1892 r->reganch |= type | ROPT_IMPLICIT;
1893 first = NEXTOPER(first);
1896 if (sawplus && (!sawopen || !RExC_sawback)
1897 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1898 /* x+ must match at the 1st pos of run of x's */
1899 r->reganch |= ROPT_SKIP;
1901 /* Scan is after the zeroth branch, first is atomic matcher. */
1902 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1903 (IV)(first - scan + 1)));
1905 * If there's something expensive in the r.e., find the
1906 * longest literal string that must appear and make it the
1907 * regmust. Resolve ties in favor of later strings, since
1908 * the regstart check works with the beginning of the r.e.
1909 * and avoiding duplication strengthens checking. Not a
1910 * strong reason, but sufficient in the absence of others.
1911 * [Now we resolve ties in favor of the earlier string if
1912 * it happens that c_offset_min has been invalidated, since the
1913 * earlier string may buy us something the later one won't.]
1917 data.longest_fixed = newSVpvn("",0);
1918 data.longest_float = newSVpvn("",0);
1919 data.last_found = newSVpvn("",0);
1920 data.longest = &(data.longest_fixed);
1922 if (!r->regstclass) {
1923 cl_init(pRExC_state, &ch_class);
1924 data.start_class = &ch_class;
1925 stclass_flag = SCF_DO_STCLASS_AND;
1926 } else /* XXXX Check for BOUND? */
1928 data.last_closep = &last_close;
1930 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1931 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1932 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1933 && data.last_start_min == 0 && data.last_end > 0
1934 && !RExC_seen_zerolen
1935 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1936 r->reganch |= ROPT_CHECK_ALL;
1937 scan_commit(pRExC_state, &data);
1938 SvREFCNT_dec(data.last_found);
1940 longest_float_length = CHR_SVLEN(data.longest_float);
1941 if (longest_float_length
1942 || (data.flags & SF_FL_BEFORE_EOL
1943 && (!(data.flags & SF_FL_BEFORE_MEOL)
1944 || (RExC_flags & PMf_MULTILINE)))) {
1947 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1948 && data.offset_fixed == data.offset_float_min
1949 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1950 goto remove_float; /* As in (a)+. */
1952 if (SvUTF8(data.longest_float)) {
1953 r->float_utf8 = data.longest_float;
1954 r->float_substr = Nullsv;
1956 r->float_substr = data.longest_float;
1957 r->float_utf8 = Nullsv;
1959 r->float_min_offset = data.offset_float_min;
1960 r->float_max_offset = data.offset_float_max;
1961 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1962 && (!(data.flags & SF_FL_BEFORE_MEOL)
1963 || (RExC_flags & PMf_MULTILINE)));
1964 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1968 r->float_substr = r->float_utf8 = Nullsv;
1969 SvREFCNT_dec(data.longest_float);
1970 longest_float_length = 0;
1973 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1974 if (longest_fixed_length
1975 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1976 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1977 || (RExC_flags & PMf_MULTILINE)))) {
1980 if (SvUTF8(data.longest_fixed)) {
1981 r->anchored_utf8 = data.longest_fixed;
1982 r->anchored_substr = Nullsv;
1984 r->anchored_substr = data.longest_fixed;
1985 r->anchored_utf8 = Nullsv;
1987 r->anchored_offset = data.offset_fixed;
1988 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1989 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1990 || (RExC_flags & PMf_MULTILINE)));
1991 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
1994 r->anchored_substr = r->anchored_utf8 = Nullsv;
1995 SvREFCNT_dec(data.longest_fixed);
1996 longest_fixed_length = 0;
1999 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2000 r->regstclass = NULL;
2001 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2003 && !(data.start_class->flags & ANYOF_EOS)
2004 && !cl_is_anything(data.start_class))
2006 I32 n = add_data(pRExC_state, 1, "f");
2008 New(1006, RExC_rx->data->data[n], 1,
2009 struct regnode_charclass_class);
2010 StructCopy(data.start_class,
2011 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2012 struct regnode_charclass_class);
2013 r->regstclass = (regnode*)RExC_rx->data->data[n];
2014 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2015 PL_regdata = r->data; /* for regprop() */
2016 DEBUG_r({ SV *sv = sv_newmortal();
2017 regprop(sv, (regnode*)data.start_class);
2018 PerlIO_printf(Perl_debug_log,
2019 "synthetic stclass `%s'.\n",
2023 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2024 if (longest_fixed_length > longest_float_length) {
2025 r->check_substr = r->anchored_substr;
2026 r->check_utf8 = r->anchored_utf8;
2027 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2028 if (r->reganch & ROPT_ANCH_SINGLE)
2029 r->reganch |= ROPT_NOSCAN;
2032 r->check_substr = r->float_substr;
2033 r->check_utf8 = r->float_utf8;
2034 r->check_offset_min = data.offset_float_min;
2035 r->check_offset_max = data.offset_float_max;
2037 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2038 This should be changed ASAP! */
2039 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2040 r->reganch |= RE_USE_INTUIT;
2041 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2042 r->reganch |= RE_INTUIT_TAIL;
2046 /* Several toplevels. Best we can is to set minlen. */
2048 struct regnode_charclass_class ch_class;
2051 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2052 scan = r->program + 1;
2053 cl_init(pRExC_state, &ch_class);
2054 data.start_class = &ch_class;
2055 data.last_closep = &last_close;
2056 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2057 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2058 = r->float_substr = r->float_utf8 = Nullsv;
2059 if (!(data.start_class->flags & ANYOF_EOS)
2060 && !cl_is_anything(data.start_class))
2062 I32 n = add_data(pRExC_state, 1, "f");
2064 New(1006, RExC_rx->data->data[n], 1,
2065 struct regnode_charclass_class);
2066 StructCopy(data.start_class,
2067 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2068 struct regnode_charclass_class);
2069 r->regstclass = (regnode*)RExC_rx->data->data[n];
2070 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2071 DEBUG_r({ SV* sv = sv_newmortal();
2072 regprop(sv, (regnode*)data.start_class);
2073 PerlIO_printf(Perl_debug_log,
2074 "synthetic stclass `%s'.\n",
2080 if (RExC_seen & REG_SEEN_GPOS)
2081 r->reganch |= ROPT_GPOS_SEEN;
2082 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2083 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2084 if (RExC_seen & REG_SEEN_EVAL)
2085 r->reganch |= ROPT_EVAL_SEEN;
2086 if (RExC_seen & REG_SEEN_CANY)
2087 r->reganch |= ROPT_CANY_SEEN;
2088 Newz(1002, r->startp, RExC_npar, I32);
2089 Newz(1002, r->endp, RExC_npar, I32);
2090 PL_regdata = r->data; /* for regprop() */
2091 DEBUG_r(regdump(r));
2096 - reg - regular expression, i.e. main body or parenthesized thing
2098 * Caller must absorb opening parenthesis.
2100 * Combining parenthesis handling with the base level of regular expression
2101 * is a trifle forced, but the need to tie the tails of the branches to what
2102 * follows makes it hard to avoid.
2105 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2106 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2108 register regnode *ret; /* Will be the head of the group. */
2109 register regnode *br;
2110 register regnode *lastbr;
2111 register regnode *ender = 0;
2112 register I32 parno = 0;
2113 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2115 /* for (?g), (?gc), and (?o) warnings; warning
2116 about (?c) will warn about (?g) -- japhy */
2118 I32 wastedflags = 0x00,
2121 wasted_gc = 0x02 | 0x04,
2124 char * parse_start = RExC_parse; /* MJD */
2125 char *oregcomp_parse = RExC_parse;
2128 *flagp = 0; /* Tentatively. */
2131 /* Make an OPEN node, if parenthesized. */
2133 if (*RExC_parse == '?') { /* (?...) */
2134 U32 posflags = 0, negflags = 0;
2135 U32 *flagsp = &posflags;
2137 char *seqstart = RExC_parse;
2140 paren = *RExC_parse++;
2141 ret = NULL; /* For look-ahead/behind. */
2143 case '<': /* (?<...) */
2144 RExC_seen |= REG_SEEN_LOOKBEHIND;
2145 if (*RExC_parse == '!')
2147 if (*RExC_parse != '=' && *RExC_parse != '!')
2150 case '=': /* (?=...) */
2151 case '!': /* (?!...) */
2152 RExC_seen_zerolen++;
2153 case ':': /* (?:...) */
2154 case '>': /* (?>...) */
2156 case '$': /* (?$...) */
2157 case '@': /* (?@...) */
2158 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2160 case '#': /* (?#...) */
2161 while (*RExC_parse && *RExC_parse != ')')
2163 if (*RExC_parse != ')')
2164 FAIL("Sequence (?#... not terminated");
2165 nextchar(pRExC_state);
2168 case 'p': /* (?p...) */
2169 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2170 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2172 case '?': /* (??...) */
2174 if (*RExC_parse != '{')
2176 paren = *RExC_parse++;
2178 case '{': /* (?{...}) */
2180 I32 count = 1, n = 0;
2182 char *s = RExC_parse;
2184 OP_4tree *sop, *rop;
2186 RExC_seen_zerolen++;
2187 RExC_seen |= REG_SEEN_EVAL;
2188 while (count && (c = *RExC_parse)) {
2189 if (c == '\\' && RExC_parse[1])
2197 if (*RExC_parse != ')')
2200 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2205 if (RExC_parse - 1 - s)
2206 sv = newSVpvn(s, RExC_parse - 1 - s);
2208 sv = newSVpvn("", 0);
2211 Perl_save_re_context(aTHX);
2212 rop = sv_compile_2op(sv, &sop, "re", &pad);
2213 sop->op_private |= OPpREFCOUNTED;
2214 /* re_dup will OpREFCNT_inc */
2215 OpREFCNT_set(sop, 1);
2218 n = add_data(pRExC_state, 3, "nop");
2219 RExC_rx->data->data[n] = (void*)rop;
2220 RExC_rx->data->data[n+1] = (void*)sop;
2221 RExC_rx->data->data[n+2] = (void*)pad;
2224 else { /* First pass */
2225 if (PL_reginterp_cnt < ++RExC_seen_evals
2226 && PL_curcop != &PL_compiling)
2227 /* No compiled RE interpolated, has runtime
2228 components ===> unsafe. */
2229 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2230 if (PL_tainting && PL_tainted)
2231 FAIL("Eval-group in insecure regular expression");
2234 nextchar(pRExC_state);
2236 ret = reg_node(pRExC_state, LOGICAL);
2239 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2240 /* deal with the length of this later - MJD */
2243 ret = reganode(pRExC_state, EVAL, n);
2244 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2245 Set_Node_Offset(ret, parse_start);
2248 case '(': /* (?(?{...})...) and (?(?=...)...) */
2250 if (RExC_parse[0] == '?') { /* (?(?...)) */
2251 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2252 || RExC_parse[1] == '<'
2253 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2256 ret = reg_node(pRExC_state, LOGICAL);
2259 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2263 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2265 parno = atoi(RExC_parse++);
2267 while (isDIGIT(*RExC_parse))
2269 ret = reganode(pRExC_state, GROUPP, parno);
2271 if ((c = *nextchar(pRExC_state)) != ')')
2272 vFAIL("Switch condition not recognized");
2274 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2275 br = regbranch(pRExC_state, &flags, 1);
2277 br = reganode(pRExC_state, LONGJMP, 0);
2279 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2280 c = *nextchar(pRExC_state);
2284 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2285 regbranch(pRExC_state, &flags, 1);
2286 regtail(pRExC_state, ret, lastbr);
2289 c = *nextchar(pRExC_state);
2294 vFAIL("Switch (?(condition)... contains too many branches");
2295 ender = reg_node(pRExC_state, TAIL);
2296 regtail(pRExC_state, br, ender);
2298 regtail(pRExC_state, lastbr, ender);
2299 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2302 regtail(pRExC_state, ret, ender);
2306 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2310 RExC_parse--; /* for vFAIL to print correctly */
2311 vFAIL("Sequence (? incomplete");
2315 parse_flags: /* (?i) */
2316 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2317 /* (?g), (?gc) and (?o) are useless here
2318 and must be globally applied -- japhy */
2320 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2321 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2322 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2323 if (! (wastedflags & wflagbit) ) {
2324 wastedflags |= wflagbit;
2327 "Useless (%s%c) - %suse /%c modifier",
2328 flagsp == &negflags ? "?-" : "?",
2330 flagsp == &negflags ? "don't " : "",
2336 else if (*RExC_parse == 'c') {
2337 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2338 if (! (wastedflags & wasted_c) ) {
2339 wastedflags |= wasted_gc;
2342 "Useless (%sc) - %suse /gc modifier",
2343 flagsp == &negflags ? "?-" : "?",
2344 flagsp == &negflags ? "don't " : ""
2349 else { pmflag(flagsp, *RExC_parse); }
2353 if (*RExC_parse == '-') {
2355 wastedflags = 0; /* reset so (?g-c) warns twice */
2359 RExC_flags |= posflags;
2360 RExC_flags &= ~negflags;
2361 if (*RExC_parse == ':') {
2367 if (*RExC_parse != ')') {
2369 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2371 nextchar(pRExC_state);
2379 ret = reganode(pRExC_state, OPEN, parno);
2380 Set_Node_Length(ret, 1); /* MJD */
2381 Set_Node_Offset(ret, RExC_parse); /* MJD */
2388 /* Pick up the branches, linking them together. */
2389 parse_start = RExC_parse; /* MJD */
2390 br = regbranch(pRExC_state, &flags, 1);
2391 /* branch_len = (paren != 0); */
2395 if (*RExC_parse == '|') {
2396 if (!SIZE_ONLY && RExC_extralen) {
2397 reginsert(pRExC_state, BRANCHJ, br);
2400 reginsert(pRExC_state, BRANCH, br);
2401 Set_Node_Length(br, paren != 0);
2402 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2406 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2408 else if (paren == ':') {
2409 *flagp |= flags&SIMPLE;
2411 if (open) { /* Starts with OPEN. */
2412 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2414 else if (paren != '?') /* Not Conditional */
2416 *flagp |= flags & (SPSTART | HASWIDTH);
2418 while (*RExC_parse == '|') {
2419 if (!SIZE_ONLY && RExC_extralen) {
2420 ender = reganode(pRExC_state, LONGJMP,0);
2421 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2424 RExC_extralen += 2; /* Account for LONGJMP. */
2425 nextchar(pRExC_state);
2426 br = regbranch(pRExC_state, &flags, 0);
2430 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2434 *flagp |= flags&SPSTART;
2437 if (have_branch || paren != ':') {
2438 /* Make a closing node, and hook it on the end. */
2441 ender = reg_node(pRExC_state, TAIL);
2444 ender = reganode(pRExC_state, CLOSE, parno);
2445 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2446 Set_Node_Length(ender,1); /* MJD */
2452 *flagp &= ~HASWIDTH;
2455 ender = reg_node(pRExC_state, SUCCEED);
2458 ender = reg_node(pRExC_state, END);
2461 regtail(pRExC_state, lastbr, ender);
2464 /* Hook the tails of the branches to the closing node. */
2465 for (br = ret; br != NULL; br = regnext(br)) {
2466 regoptail(pRExC_state, br, ender);
2473 static char parens[] = "=!<,>";
2475 if (paren && (p = strchr(parens, paren))) {
2476 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2477 int flag = (p - parens) > 1;
2480 node = SUSPEND, flag = 0;
2481 reginsert(pRExC_state, node,ret);
2482 Set_Node_Offset(ret, oregcomp_parse);
2483 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 2);
2485 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2489 /* Check for proper termination. */
2491 RExC_flags = oregflags;
2492 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2493 RExC_parse = oregcomp_parse;
2494 vFAIL("Unmatched (");
2497 else if (!paren && RExC_parse < RExC_end) {
2498 if (*RExC_parse == ')') {
2500 vFAIL("Unmatched )");
2503 FAIL("Junk on end of regexp"); /* "Can't happen". */
2511 - regbranch - one alternative of an | operator
2513 * Implements the concatenation operator.
2516 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2518 register regnode *ret;
2519 register regnode *chain = NULL;
2520 register regnode *latest;
2521 I32 flags = 0, c = 0;
2526 if (!SIZE_ONLY && RExC_extralen)
2527 ret = reganode(pRExC_state, BRANCHJ,0);
2529 ret = reg_node(pRExC_state, BRANCH);
2530 Set_Node_Length(ret, 1);
2534 if (!first && SIZE_ONLY)
2535 RExC_extralen += 1; /* BRANCHJ */
2537 *flagp = WORST; /* Tentatively. */
2540 nextchar(pRExC_state);
2541 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2543 latest = regpiece(pRExC_state, &flags);
2544 if (latest == NULL) {
2545 if (flags & TRYAGAIN)
2549 else if (ret == NULL)
2551 *flagp |= flags&HASWIDTH;
2552 if (chain == NULL) /* First piece. */
2553 *flagp |= flags&SPSTART;
2556 regtail(pRExC_state, chain, latest);
2561 if (chain == NULL) { /* Loop ran zero times. */
2562 chain = reg_node(pRExC_state, NOTHING);
2567 *flagp |= flags&SIMPLE;
2574 - regpiece - something followed by possible [*+?]
2576 * Note that the branching code sequences used for ? and the general cases
2577 * of * and + are somewhat optimized: they use the same NOTHING node as
2578 * both the endmarker for their branch list and the body of the last branch.
2579 * It might seem that this node could be dispensed with entirely, but the
2580 * endmarker role is not redundant.
2583 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2585 register regnode *ret;
2587 register char *next;
2589 char *origparse = RExC_parse;
2592 I32 max = REG_INFTY;
2595 ret = regatom(pRExC_state, &flags);
2597 if (flags & TRYAGAIN)
2604 if (op == '{' && regcurly(RExC_parse)) {
2605 parse_start = RExC_parse; /* MJD */
2606 next = RExC_parse + 1;
2608 while (isDIGIT(*next) || *next == ',') {
2617 if (*next == '}') { /* got one */
2621 min = atoi(RExC_parse);
2625 maxpos = RExC_parse;
2627 if (!max && *maxpos != '0')
2628 max = REG_INFTY; /* meaning "infinity" */
2629 else if (max >= REG_INFTY)
2630 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2632 nextchar(pRExC_state);
2635 if ((flags&SIMPLE)) {
2636 RExC_naughty += 2 + RExC_naughty / 2;
2637 reginsert(pRExC_state, CURLY, ret);
2638 Set_Node_Offset(ret, parse_start+1); /* MJD */
2639 Set_Node_Cur_Length(ret);
2642 regnode *w = reg_node(pRExC_state, WHILEM);
2645 regtail(pRExC_state, ret, w);
2646 if (!SIZE_ONLY && RExC_extralen) {
2647 reginsert(pRExC_state, LONGJMP,ret);
2648 reginsert(pRExC_state, NOTHING,ret);
2649 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2651 reginsert(pRExC_state, CURLYX,ret);
2653 Set_Node_Offset(ret, parse_start+1);
2654 Set_Node_Length(ret,
2655 op == '{' ? (RExC_parse - parse_start) : 1);
2657 if (!SIZE_ONLY && RExC_extralen)
2658 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2659 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2661 RExC_whilem_seen++, RExC_extralen += 3;
2662 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2670 if (max && max < min)
2671 vFAIL("Can't do {n,m} with n > m");
2673 ARG1_SET(ret, (U16)min);
2674 ARG2_SET(ret, (U16)max);
2686 #if 0 /* Now runtime fix should be reliable. */
2688 /* if this is reinstated, don't forget to put this back into perldiag:
2690 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2692 (F) The part of the regexp subject to either the * or + quantifier
2693 could match an empty string. The {#} shows in the regular
2694 expression about where the problem was discovered.
2698 if (!(flags&HASWIDTH) && op != '?')
2699 vFAIL("Regexp *+ operand could be empty");
2702 parse_start = RExC_parse;
2703 nextchar(pRExC_state);
2705 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2707 if (op == '*' && (flags&SIMPLE)) {
2708 reginsert(pRExC_state, STAR, ret);
2712 else if (op == '*') {
2716 else if (op == '+' && (flags&SIMPLE)) {
2717 reginsert(pRExC_state, PLUS, ret);
2721 else if (op == '+') {
2725 else if (op == '?') {
2730 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2732 "%.*s matches null string many times",
2733 RExC_parse - origparse,
2737 if (*RExC_parse == '?') {
2738 nextchar(pRExC_state);
2739 reginsert(pRExC_state, MINMOD, ret);
2740 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2742 if (ISMULT2(RExC_parse)) {
2744 vFAIL("Nested quantifiers");
2751 - regatom - the lowest level
2753 * Optimization: gobbles an entire sequence of ordinary characters so that
2754 * it can turn them into a single node, which is smaller to store and
2755 * faster to run. Backslashed characters are exceptions, each becoming a
2756 * separate node; the code is simpler that way and it's not worth fixing.
2758 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2760 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2762 register regnode *ret = 0;
2764 char *parse_start = 0;
2766 *flagp = WORST; /* Tentatively. */
2769 switch (*RExC_parse) {
2771 RExC_seen_zerolen++;
2772 nextchar(pRExC_state);
2773 if (RExC_flags & PMf_MULTILINE)
2774 ret = reg_node(pRExC_state, MBOL);
2775 else if (RExC_flags & PMf_SINGLELINE)
2776 ret = reg_node(pRExC_state, SBOL);
2778 ret = reg_node(pRExC_state, BOL);
2779 Set_Node_Length(ret, 1); /* MJD */
2782 nextchar(pRExC_state);
2784 RExC_seen_zerolen++;
2785 if (RExC_flags & PMf_MULTILINE)
2786 ret = reg_node(pRExC_state, MEOL);
2787 else if (RExC_flags & PMf_SINGLELINE)
2788 ret = reg_node(pRExC_state, SEOL);
2790 ret = reg_node(pRExC_state, EOL);
2791 Set_Node_Length(ret, 1); /* MJD */
2794 nextchar(pRExC_state);
2795 if (RExC_flags & PMf_SINGLELINE)
2796 ret = reg_node(pRExC_state, SANY);
2798 ret = reg_node(pRExC_state, REG_ANY);
2799 *flagp |= HASWIDTH|SIMPLE;
2801 Set_Node_Length(ret, 1); /* MJD */
2805 char *oregcomp_parse = ++RExC_parse;
2806 ret = regclass(pRExC_state);
2807 if (*RExC_parse != ']') {
2808 RExC_parse = oregcomp_parse;
2809 vFAIL("Unmatched [");
2811 nextchar(pRExC_state);
2812 *flagp |= HASWIDTH|SIMPLE;
2813 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2817 nextchar(pRExC_state);
2818 ret = reg(pRExC_state, 1, &flags);
2820 if (flags & TRYAGAIN) {
2821 if (RExC_parse == RExC_end) {
2822 /* Make parent create an empty node if needed. */
2830 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2834 if (flags & TRYAGAIN) {
2838 vFAIL("Internal urp");
2839 /* Supposed to be caught earlier. */
2842 if (!regcurly(RExC_parse)) {
2851 vFAIL("Quantifier follows nothing");
2854 switch (*++RExC_parse) {
2856 RExC_seen_zerolen++;
2857 ret = reg_node(pRExC_state, SBOL);
2859 nextchar(pRExC_state);
2860 Set_Node_Length(ret, 2); /* MJD */
2863 ret = reg_node(pRExC_state, GPOS);
2864 RExC_seen |= REG_SEEN_GPOS;
2866 nextchar(pRExC_state);
2867 Set_Node_Length(ret, 2); /* MJD */
2870 ret = reg_node(pRExC_state, SEOL);
2872 RExC_seen_zerolen++; /* Do not optimize RE away */
2873 nextchar(pRExC_state);
2876 ret = reg_node(pRExC_state, EOS);
2878 RExC_seen_zerolen++; /* Do not optimize RE away */
2879 nextchar(pRExC_state);
2880 Set_Node_Length(ret, 2); /* MJD */
2883 ret = reg_node(pRExC_state, CANY);
2884 RExC_seen |= REG_SEEN_CANY;
2885 *flagp |= HASWIDTH|SIMPLE;
2886 nextchar(pRExC_state);
2887 Set_Node_Length(ret, 2); /* MJD */
2890 ret = reg_node(pRExC_state, CLUMP);
2892 nextchar(pRExC_state);
2893 Set_Node_Length(ret, 2); /* MJD */
2896 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
2897 *flagp |= HASWIDTH|SIMPLE;
2898 nextchar(pRExC_state);
2899 Set_Node_Length(ret, 2); /* MJD */
2902 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
2903 *flagp |= HASWIDTH|SIMPLE;
2904 nextchar(pRExC_state);
2905 Set_Node_Length(ret, 2); /* MJD */
2908 RExC_seen_zerolen++;
2909 RExC_seen |= REG_SEEN_LOOKBEHIND;
2910 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
2912 nextchar(pRExC_state);
2913 Set_Node_Length(ret, 2); /* MJD */
2916 RExC_seen_zerolen++;
2917 RExC_seen |= REG_SEEN_LOOKBEHIND;
2918 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
2920 nextchar(pRExC_state);
2921 Set_Node_Length(ret, 2); /* MJD */
2924 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
2925 *flagp |= HASWIDTH|SIMPLE;
2926 nextchar(pRExC_state);
2927 Set_Node_Length(ret, 2); /* MJD */
2930 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
2931 *flagp |= HASWIDTH|SIMPLE;
2932 nextchar(pRExC_state);
2933 Set_Node_Length(ret, 2); /* MJD */
2936 ret = reg_node(pRExC_state, DIGIT);
2937 *flagp |= HASWIDTH|SIMPLE;
2938 nextchar(pRExC_state);
2939 Set_Node_Length(ret, 2); /* MJD */
2942 ret = reg_node(pRExC_state, NDIGIT);
2943 *flagp |= HASWIDTH|SIMPLE;
2944 nextchar(pRExC_state);
2945 Set_Node_Length(ret, 2); /* MJD */
2950 char* oldregxend = RExC_end;
2951 char* parse_start = RExC_parse - 2;
2953 if (RExC_parse[1] == '{') {
2954 /* a lovely hack--pretend we saw [\pX] instead */
2955 RExC_end = strchr(RExC_parse, '}');
2957 U8 c = (U8)*RExC_parse;
2959 RExC_end = oldregxend;
2960 vFAIL2("Missing right brace on \\%c{}", c);
2965 RExC_end = RExC_parse + 2;
2966 if (RExC_end > oldregxend)
2967 RExC_end = oldregxend;
2971 ret = regclass(pRExC_state);
2973 RExC_end = oldregxend;
2976 Set_Node_Offset(ret, parse_start + 2);
2977 Set_Node_Cur_Length(ret);
2978 nextchar(pRExC_state);
2979 *flagp |= HASWIDTH|SIMPLE;
2992 case '1': case '2': case '3': case '4':
2993 case '5': case '6': case '7': case '8': case '9':
2995 I32 num = atoi(RExC_parse);
2997 if (num > 9 && num >= RExC_npar)
3000 char * parse_start = RExC_parse - 1; /* MJD */
3001 while (isDIGIT(*RExC_parse))
3004 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3005 vFAIL("Reference to nonexistent group");
3007 ret = reganode(pRExC_state,
3008 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3012 /* override incorrect value set in reganode MJD */
3013 Set_Node_Offset(ret, parse_start+1);
3014 Set_Node_Cur_Length(ret); /* MJD */
3016 nextchar(pRExC_state);
3021 if (RExC_parse >= RExC_end)
3022 FAIL("Trailing \\");
3025 /* Do not generate `unrecognized' warnings here, we fall
3026 back into the quick-grab loop below */
3032 if (RExC_flags & PMf_EXTENDED) {
3033 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3034 if (RExC_parse < RExC_end)
3040 register STRLEN len;
3046 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3048 parse_start = RExC_parse - 1;
3054 ret = reg_node(pRExC_state,
3055 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3057 for (len = 0, p = RExC_parse - 1;
3058 len < 127 && p < RExC_end;
3063 if (RExC_flags & PMf_EXTENDED)
3064 p = regwhite(p, RExC_end);
3111 ender = ASCII_TO_NATIVE('\033');
3115 ender = ASCII_TO_NATIVE('\007');
3120 char* e = strchr(p, '}');
3124 vFAIL("Missing right brace on \\x{}");
3127 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3128 | PERL_SCAN_DISALLOW_PREFIX;
3130 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3133 /* numlen is generous */
3134 if (numlen + len >= 127) {
3142 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3144 ender = grok_hex(p, &numlen, &flags, NULL);
3150 ender = UCHARAT(p++);
3151 ender = toCTRL(ender);
3153 case '0': case '1': case '2': case '3':case '4':
3154 case '5': case '6': case '7': case '8':case '9':
3156 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3159 ender = grok_oct(p, &numlen, &flags, NULL);
3169 FAIL("Trailing \\");
3172 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3173 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3174 goto normal_default;
3179 if (UTF8_IS_START(*p) && UTF) {
3180 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3188 if (RExC_flags & PMf_EXTENDED)
3189 p = regwhite(p, RExC_end);
3191 /* Prime the casefolded buffer. */
3192 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3194 if (ISMULT2(p)) { /* Back off on ?+*. */
3201 /* Emit all the Unicode characters. */
3202 for (foldbuf = tmpbuf;
3204 foldlen -= numlen) {
3205 ender = utf8_to_uvchr(foldbuf, &numlen);
3207 reguni(pRExC_state, ender, s, &unilen);
3210 /* In EBCDIC the numlen
3211 * and unilen can differ. */
3213 if (numlen >= foldlen)
3217 break; /* "Can't happen." */
3221 reguni(pRExC_state, ender, s, &unilen);
3230 REGC((char)ender, s++);
3238 /* Emit all the Unicode characters. */
3239 for (foldbuf = tmpbuf;
3241 foldlen -= numlen) {
3242 ender = utf8_to_uvchr(foldbuf, &numlen);
3244 reguni(pRExC_state, ender, s, &unilen);
3247 /* In EBCDIC the numlen
3248 * and unilen can differ. */
3250 if (numlen >= foldlen)
3258 reguni(pRExC_state, ender, s, &unilen);
3267 REGC((char)ender, s++);
3271 Set_Node_Cur_Length(ret); /* MJD */
3272 nextchar(pRExC_state);
3274 /* len is STRLEN which is unsigned, need to copy to signed */
3277 vFAIL("Internal disaster");
3286 RExC_size += STR_SZ(len);
3288 RExC_emit += STR_SZ(len);
3293 /* If the encoding pragma is in effect recode the text of
3294 * any EXACT-kind nodes. */
3295 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3296 STRLEN oldlen = STR_LEN(ret);
3297 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3301 if (sv_utf8_downgrade(sv, TRUE)) {
3302 char *s = sv_recode_to_utf8(sv, PL_encoding);
3303 STRLEN newlen = SvCUR(sv);
3308 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3309 (int)oldlen, STRING(ret),
3311 Copy(s, STRING(ret), newlen, char);
3312 STR_LEN(ret) += newlen - oldlen;
3313 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3315 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3323 S_regwhite(pTHX_ char *p, char *e)
3328 else if (*p == '#') {
3331 } while (p < e && *p != '\n');
3339 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3340 Character classes ([:foo:]) can also be negated ([:^foo:]).
3341 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3342 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3343 but trigger failures because they are currently unimplemented. */
3345 #define POSIXCC_DONE(c) ((c) == ':')
3346 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3347 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3350 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3353 I32 namedclass = OOB_NAMEDCLASS;
3355 if (value == '[' && RExC_parse + 1 < RExC_end &&
3356 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3357 POSIXCC(UCHARAT(RExC_parse))) {
3358 char c = UCHARAT(RExC_parse);
3359 char* s = RExC_parse++;
3361 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3363 if (RExC_parse == RExC_end)
3364 /* Grandfather lone [:, [=, [. */
3367 char* t = RExC_parse++; /* skip over the c */
3369 if (UCHARAT(RExC_parse) == ']') {
3370 RExC_parse++; /* skip over the ending ] */
3373 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3374 I32 skip = 5; /* the most common skip */
3378 if (strnEQ(posixcc, "alnum", 5))
3380 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3381 else if (strnEQ(posixcc, "alpha", 5))
3383 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3384 else if (strnEQ(posixcc, "ascii", 5))
3386 complement ? ANYOF_NASCII : ANYOF_ASCII;
3389 if (strnEQ(posixcc, "blank", 5))
3391 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3394 if (strnEQ(posixcc, "cntrl", 5))
3396 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3399 if (strnEQ(posixcc, "digit", 5))
3401 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3404 if (strnEQ(posixcc, "graph", 5))
3406 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3409 if (strnEQ(posixcc, "lower", 5))
3411 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3414 if (strnEQ(posixcc, "print", 5))
3416 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3417 else if (strnEQ(posixcc, "punct", 5))
3419 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3422 if (strnEQ(posixcc, "space", 5))
3424 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3427 if (strnEQ(posixcc, "upper", 5))
3429 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3431 case 'w': /* this is not POSIX, this is the Perl \w */
3432 if (strnEQ(posixcc, "word", 4)) {
3434 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3439 if (strnEQ(posixcc, "xdigit", 6)) {
3441 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3446 if (namedclass == OOB_NAMEDCLASS ||
3447 posixcc[skip] != ':' ||
3448 posixcc[skip+1] != ']')
3450 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3453 } else if (!SIZE_ONLY) {
3454 /* [[=foo=]] and [[.foo.]] are still future. */
3456 /* adjust RExC_parse so the warning shows after
3458 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3460 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3463 /* Maternal grandfather:
3464 * "[:" ending in ":" but not in ":]" */
3474 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3476 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3477 char *s = RExC_parse;
3480 while(*s && isALNUM(*s))
3482 if (*s && c == *s && s[1] == ']') {
3483 if (ckWARN(WARN_REGEXP))
3485 "POSIX syntax [%c %c] belongs inside character classes",
3488 /* [[=foo=]] and [[.foo.]] are still future. */
3489 if (POSIXCC_NOTYET(c)) {
3490 /* adjust RExC_parse so the error shows after
3492 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3494 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3501 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3504 register UV nextvalue;
3505 register IV prevvalue = OOB_UNICODE;
3506 register IV range = 0;
3507 register regnode *ret;
3510 char *rangebegin = 0;
3511 bool need_class = 0;
3512 SV *listsv = Nullsv;
3515 bool optimize_invert = TRUE;
3516 AV* unicode_alternate = 0;
3518 UV literal_endpoint = 0;
3521 ret = reganode(pRExC_state, ANYOF, 0);
3524 ANYOF_FLAGS(ret) = 0;
3526 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3530 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3534 RExC_size += ANYOF_SKIP;
3536 RExC_emit += ANYOF_SKIP;
3538 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3540 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3541 ANYOF_BITMAP_ZERO(ret);
3542 listsv = newSVpvn("# comment\n", 10);
3545 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3547 if (!SIZE_ONLY && POSIXCC(nextvalue))
3548 checkposixcc(pRExC_state);
3550 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3551 if (UCHARAT(RExC_parse) == ']')
3554 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3558 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3561 rangebegin = RExC_parse;
3563 value = utf8n_to_uvchr((U8*)RExC_parse,
3564 RExC_end - RExC_parse,
3566 RExC_parse += numlen;
3569 value = UCHARAT(RExC_parse++);
3570 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3571 if (value == '[' && POSIXCC(nextvalue))
3572 namedclass = regpposixcc(pRExC_state, value);
3573 else if (value == '\\') {
3575 value = utf8n_to_uvchr((U8*)RExC_parse,
3576 RExC_end - RExC_parse,
3578 RExC_parse += numlen;
3581 value = UCHARAT(RExC_parse++);
3582 /* Some compilers cannot handle switching on 64-bit integer
3583 * values, therefore value cannot be an UV. Yes, this will
3584 * be a problem later if we want switch on Unicode.
3585 * A similar issue a little bit later when switching on
3586 * namedclass. --jhi */
3587 switch ((I32)value) {
3588 case 'w': namedclass = ANYOF_ALNUM; break;
3589 case 'W': namedclass = ANYOF_NALNUM; break;
3590 case 's': namedclass = ANYOF_SPACE; break;
3591 case 'S': namedclass = ANYOF_NSPACE; break;
3592 case 'd': namedclass = ANYOF_DIGIT; break;
3593 case 'D': namedclass = ANYOF_NDIGIT; break;
3596 if (RExC_parse >= RExC_end)
3597 vFAIL2("Empty \\%c{}", (U8)value);
3598 if (*RExC_parse == '{') {
3600 e = strchr(RExC_parse++, '}');
3602 vFAIL2("Missing right brace on \\%c{}", c);
3603 while (isSPACE(UCHARAT(RExC_parse)))
3605 if (e == RExC_parse)
3606 vFAIL2("Empty \\%c{}", c);
3608 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3616 if (UCHARAT(RExC_parse) == '^') {
3619 value = value == 'p' ? 'P' : 'p'; /* toggle */
3620 while (isSPACE(UCHARAT(RExC_parse))) {
3626 Perl_sv_catpvf(aTHX_ listsv,
3627 "+utf8::%.*s\n", (int)n, RExC_parse);
3629 Perl_sv_catpvf(aTHX_ listsv,
3630 "!utf8::%.*s\n", (int)n, RExC_parse);
3633 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3635 case 'n': value = '\n'; break;
3636 case 'r': value = '\r'; break;
3637 case 't': value = '\t'; break;
3638 case 'f': value = '\f'; break;
3639 case 'b': value = '\b'; break;
3640 case 'e': value = ASCII_TO_NATIVE('\033');break;
3641 case 'a': value = ASCII_TO_NATIVE('\007');break;
3643 if (*RExC_parse == '{') {
3644 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3645 | PERL_SCAN_DISALLOW_PREFIX;
3646 e = strchr(RExC_parse++, '}');
3648 vFAIL("Missing right brace on \\x{}");
3650 numlen = e - RExC_parse;
3651 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3655 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3657 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3658 RExC_parse += numlen;
3662 value = UCHARAT(RExC_parse++);
3663 value = toCTRL(value);
3665 case '0': case '1': case '2': case '3': case '4':
3666 case '5': case '6': case '7': case '8': case '9':
3670 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3671 RExC_parse += numlen;
3675 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3677 "Unrecognized escape \\%c in character class passed through",
3681 } /* end of \blah */
3687 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3689 if (!SIZE_ONLY && !need_class)
3690 ANYOF_CLASS_ZERO(ret);
3694 /* a bad range like a-\d, a-[:digit:] ? */
3697 if (ckWARN(WARN_REGEXP))
3699 "False [] range \"%*.*s\"",
3700 RExC_parse - rangebegin,
3701 RExC_parse - rangebegin,
3703 if (prevvalue < 256) {
3704 ANYOF_BITMAP_SET(ret, prevvalue);
3705 ANYOF_BITMAP_SET(ret, '-');
3708 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3709 Perl_sv_catpvf(aTHX_ listsv,
3710 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3714 range = 0; /* this was not a true range */
3718 if (namedclass > OOB_NAMEDCLASS)
3719 optimize_invert = FALSE;
3720 /* Possible truncation here but in some 64-bit environments
3721 * the compiler gets heartburn about switch on 64-bit values.
3722 * A similar issue a little earlier when switching on value.
3724 switch ((I32)namedclass) {
3727 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3729 for (value = 0; value < 256; value++)
3731 ANYOF_BITMAP_SET(ret, value);
3733 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
3737 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3739 for (value = 0; value < 256; value++)
3740 if (!isALNUM(value))
3741 ANYOF_BITMAP_SET(ret, value);
3743 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3747 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3749 for (value = 0; value < 256; value++)
3750 if (isALNUMC(value))
3751 ANYOF_BITMAP_SET(ret, value);
3753 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3757 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3759 for (value = 0; value < 256; value++)
3760 if (!isALNUMC(value))
3761 ANYOF_BITMAP_SET(ret, value);
3763 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3767 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3769 for (value = 0; value < 256; value++)
3771 ANYOF_BITMAP_SET(ret, value);
3773 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3777 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3779 for (value = 0; value < 256; value++)
3780 if (!isALPHA(value))
3781 ANYOF_BITMAP_SET(ret, value);
3783 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3787 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3790 for (value = 0; value < 128; value++)
3791 ANYOF_BITMAP_SET(ret, value);
3793 for (value = 0; value < 256; value++) {
3795 ANYOF_BITMAP_SET(ret, value);
3799 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3803 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3806 for (value = 128; value < 256; value++)
3807 ANYOF_BITMAP_SET(ret, value);
3809 for (value = 0; value < 256; value++) {
3810 if (!isASCII(value))
3811 ANYOF_BITMAP_SET(ret, value);
3815 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3819 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3821 for (value = 0; value < 256; value++)
3823 ANYOF_BITMAP_SET(ret, value);
3825 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3829 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3831 for (value = 0; value < 256; value++)
3832 if (!isBLANK(value))
3833 ANYOF_BITMAP_SET(ret, value);
3835 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3839 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3841 for (value = 0; value < 256; value++)
3843 ANYOF_BITMAP_SET(ret, value);
3845 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3849 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3851 for (value = 0; value < 256; value++)
3852 if (!isCNTRL(value))
3853 ANYOF_BITMAP_SET(ret, value);
3855 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3859 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3861 /* consecutive digits assumed */
3862 for (value = '0'; value <= '9'; value++)
3863 ANYOF_BITMAP_SET(ret, value);
3865 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3869 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3871 /* consecutive digits assumed */
3872 for (value = 0; value < '0'; value++)
3873 ANYOF_BITMAP_SET(ret, value);
3874 for (value = '9' + 1; value < 256; value++)
3875 ANYOF_BITMAP_SET(ret, value);
3877 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3881 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3883 for (value = 0; value < 256; value++)
3885 ANYOF_BITMAP_SET(ret, value);
3887 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3891 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3893 for (value = 0; value < 256; value++)
3894 if (!isGRAPH(value))
3895 ANYOF_BITMAP_SET(ret, value);
3897 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3901 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3903 for (value = 0; value < 256; value++)
3905 ANYOF_BITMAP_SET(ret, value);
3907 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3911 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3913 for (value = 0; value < 256; value++)
3914 if (!isLOWER(value))
3915 ANYOF_BITMAP_SET(ret, value);
3917 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3921 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3923 for (value = 0; value < 256; value++)
3925 ANYOF_BITMAP_SET(ret, value);
3927 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3931 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3933 for (value = 0; value < 256; value++)
3934 if (!isPRINT(value))
3935 ANYOF_BITMAP_SET(ret, value);
3937 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3941 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3943 for (value = 0; value < 256; value++)
3944 if (isPSXSPC(value))
3945 ANYOF_BITMAP_SET(ret, value);
3947 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3951 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3953 for (value = 0; value < 256; value++)
3954 if (!isPSXSPC(value))
3955 ANYOF_BITMAP_SET(ret, value);
3957 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3961 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3963 for (value = 0; value < 256; value++)
3965 ANYOF_BITMAP_SET(ret, value);
3967 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3971 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3973 for (value = 0; value < 256; value++)
3974 if (!isPUNCT(value))
3975 ANYOF_BITMAP_SET(ret, value);
3977 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3981 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3983 for (value = 0; value < 256; value++)
3985 ANYOF_BITMAP_SET(ret, value);
3987 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3991 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3993 for (value = 0; value < 256; value++)
3994 if (!isSPACE(value))
3995 ANYOF_BITMAP_SET(ret, value);
3997 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4001 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4003 for (value = 0; value < 256; value++)
4005 ANYOF_BITMAP_SET(ret, value);
4007 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4011 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4013 for (value = 0; value < 256; value++)
4014 if (!isUPPER(value))
4015 ANYOF_BITMAP_SET(ret, value);
4017 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4021 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4023 for (value = 0; value < 256; value++)
4024 if (isXDIGIT(value))
4025 ANYOF_BITMAP_SET(ret, value);
4027 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4031 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4033 for (value = 0; value < 256; value++)
4034 if (!isXDIGIT(value))
4035 ANYOF_BITMAP_SET(ret, value);
4037 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4040 vFAIL("Invalid [::] class");
4044 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4047 } /* end of namedclass \blah */
4050 if (prevvalue > (IV)value) /* b-a */ {
4051 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4052 RExC_parse - rangebegin,
4053 RExC_parse - rangebegin,
4055 range = 0; /* not a valid range */
4059 prevvalue = value; /* save the beginning of the range */
4060 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4061 RExC_parse[1] != ']') {
4064 /* a bad range like \w-, [:word:]- ? */
4065 if (namedclass > OOB_NAMEDCLASS) {
4066 if (ckWARN(WARN_REGEXP))
4068 "False [] range \"%*.*s\"",
4069 RExC_parse - rangebegin,
4070 RExC_parse - rangebegin,
4073 ANYOF_BITMAP_SET(ret, '-');
4075 range = 1; /* yeah, it's a range! */
4076 continue; /* but do it the next time */
4080 /* now is the next time */
4084 if (prevvalue < 256) {
4085 IV ceilvalue = value < 256 ? value : 255;
4088 /* In EBCDIC [\x89-\x91] should include
4089 * the \x8e but [i-j] should not. */
4090 if (literal_endpoint == 2 &&
4091 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4092 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4094 if (isLOWER(prevvalue)) {
4095 for (i = prevvalue; i <= ceilvalue; i++)
4097 ANYOF_BITMAP_SET(ret, i);
4099 for (i = prevvalue; i <= ceilvalue; i++)
4101 ANYOF_BITMAP_SET(ret, i);
4106 for (i = prevvalue; i <= ceilvalue; i++)
4107 ANYOF_BITMAP_SET(ret, i);
4109 if (value > 255 || UTF) {
4110 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4111 UV natvalue = NATIVE_TO_UNI(value);
4113 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4114 if (prevnatvalue < natvalue) { /* what about > ? */
4115 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4116 prevnatvalue, natvalue);
4118 else if (prevnatvalue == natvalue) {
4119 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4121 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4123 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4125 /* If folding and foldable and a single
4126 * character, insert also the folded version
4127 * to the charclass. */
4129 if (foldlen == (STRLEN)UNISKIP(f))
4130 Perl_sv_catpvf(aTHX_ listsv,
4133 /* Any multicharacter foldings
4134 * require the following transform:
4135 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4136 * where E folds into "pq" and F folds
4137 * into "rst", all other characters
4138 * fold to single characters. We save
4139 * away these multicharacter foldings,
4140 * to be later saved as part of the
4141 * additional "s" data. */
4144 if (!unicode_alternate)
4145 unicode_alternate = newAV();
4146 sv = newSVpvn((char*)foldbuf, foldlen);
4148 av_push(unicode_alternate, sv);
4152 /* If folding and the value is one of the Greek
4153 * sigmas insert a few more sigmas to make the
4154 * folding rules of the sigmas to work right.
4155 * Note that not all the possible combinations
4156 * are handled here: some of them are handled
4157 * by the standard folding rules, and some of
4158 * them (literal or EXACTF cases) are handled
4159 * during runtime in regexec.c:S_find_byclass(). */
4160 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4161 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4162 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4163 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4164 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4166 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4167 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4168 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4173 literal_endpoint = 0;
4177 range = 0; /* this range (if it was one) is done now */
4181 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4183 RExC_size += ANYOF_CLASS_ADD_SKIP;
4185 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4188 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4190 /* If the only flag is folding (plus possibly inversion). */
4191 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4193 for (value = 0; value < 256; ++value) {
4194 if (ANYOF_BITMAP_TEST(ret, value)) {
4195 UV fold = PL_fold[value];
4198 ANYOF_BITMAP_SET(ret, fold);
4201 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4204 /* optimize inverted simple patterns (e.g. [^a-z]) */
4205 if (!SIZE_ONLY && optimize_invert &&
4206 /* If the only flag is inversion. */
4207 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4208 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4209 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4210 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4217 /* The 0th element stores the character class description
4218 * in its textual form: used later (regexec.c:Perl_regclass_swash())
4219 * to initialize the appropriate swash (which gets stored in
4220 * the 1st element), and also useful for dumping the regnode.
4221 * The 2nd element stores the multicharacter foldings,
4222 * used later (regexec.c:S_reginclass()). */
4223 av_store(av, 0, listsv);
4224 av_store(av, 1, NULL);
4225 av_store(av, 2, (SV*)unicode_alternate);
4226 rv = newRV_noinc((SV*)av);
4227 n = add_data(pRExC_state, 1, "s");
4228 RExC_rx->data->data[n] = (void*)rv;
4236 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4238 char* retval = RExC_parse++;
4241 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4242 RExC_parse[2] == '#') {
4243 while (*RExC_parse && *RExC_parse != ')')
4248 if (RExC_flags & PMf_EXTENDED) {
4249 if (isSPACE(*RExC_parse)) {
4253 else if (*RExC_parse == '#') {
4254 while (*RExC_parse && *RExC_parse != '\n')
4265 - reg_node - emit a node
4267 STATIC regnode * /* Location. */
4268 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4270 register regnode *ret;
4271 register regnode *ptr;
4275 SIZE_ALIGN(RExC_size);
4280 NODE_ALIGN_FILL(ret);
4282 FILL_ADVANCE_NODE(ptr, op);
4283 if (RExC_offsets) { /* MJD */
4284 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4285 "reg_node", __LINE__,
4287 RExC_emit - RExC_emit_start > RExC_offsets[0]
4288 ? "Overwriting end of array!\n" : "OK",
4289 RExC_emit - RExC_emit_start,
4290 RExC_parse - RExC_start,
4292 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4301 - reganode - emit a node with an argument
4303 STATIC regnode * /* Location. */
4304 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4306 register regnode *ret;
4307 register regnode *ptr;
4311 SIZE_ALIGN(RExC_size);
4316 NODE_ALIGN_FILL(ret);
4318 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4319 if (RExC_offsets) { /* MJD */
4320 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4324 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4325 "Overwriting end of array!\n" : "OK",
4326 RExC_emit - RExC_emit_start,
4327 RExC_parse - RExC_start,
4329 Set_Cur_Node_Offset;
4338 - reguni - emit (if appropriate) a Unicode character
4341 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4343 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4347 - reginsert - insert an operator in front of already-emitted operand
4349 * Means relocating the operand.
4352 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4354 register regnode *src;
4355 register regnode *dst;
4356 register regnode *place;
4357 register int offset = regarglen[(U8)op];
4359 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4362 RExC_size += NODE_STEP_REGNODE + offset;
4367 RExC_emit += NODE_STEP_REGNODE + offset;
4369 while (src > opnd) {
4370 StructCopy(--src, --dst, regnode);
4371 if (RExC_offsets) { /* MJD 20010112 */
4372 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4376 dst - RExC_emit_start > RExC_offsets[0]
4377 ? "Overwriting end of array!\n" : "OK",
4378 src - RExC_emit_start,
4379 dst - RExC_emit_start,
4381 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4382 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4387 place = opnd; /* Op node, where operand used to be. */
4388 if (RExC_offsets) { /* MJD */
4389 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4393 place - RExC_emit_start > RExC_offsets[0]
4394 ? "Overwriting end of array!\n" : "OK",
4395 place - RExC_emit_start,
4396 RExC_parse - RExC_start,
4398 Set_Node_Offset(place, RExC_parse);
4400 src = NEXTOPER(place);
4401 FILL_ADVANCE_NODE(place, op);
4402 Zero(src, offset, regnode);
4406 - regtail - set the next-pointer at the end of a node chain of p to val.
4409 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4411 register regnode *scan;
4412 register regnode *temp;
4417 /* Find last node. */
4420 temp = regnext(scan);
4426 if (reg_off_by_arg[OP(scan)]) {
4427 ARG_SET(scan, val - scan);
4430 NEXT_OFF(scan) = val - scan;
4435 - regoptail - regtail on operand of first argument; nop if operandless
4438 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4440 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4441 if (p == NULL || SIZE_ONLY)
4443 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4444 regtail(pRExC_state, NEXTOPER(p), val);
4446 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4447 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4454 - regcurly - a little FSA that accepts {\d+,?\d*}
4457 S_regcurly(pTHX_ register char *s)
4478 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4480 register U8 op = EXACT; /* Arbitrary non-END op. */
4481 register regnode *next;
4483 while (op != END && (!last || node < last)) {
4484 /* While that wasn't END last time... */
4490 next = regnext(node);
4492 if (OP(node) == OPTIMIZED)
4495 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4496 (int)(2*l + 1), "", SvPVX(sv));
4497 if (next == NULL) /* Next ptr. */
4498 PerlIO_printf(Perl_debug_log, "(0)");
4500 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4501 (void)PerlIO_putc(Perl_debug_log, '\n');
4503 if (PL_regkind[(U8)op] == BRANCHJ) {
4504 register regnode *nnode = (OP(next) == LONGJMP
4507 if (last && nnode > last)
4509 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4511 else if (PL_regkind[(U8)op] == BRANCH) {
4512 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4514 else if ( op == CURLY) { /* `next' might be very big: optimizer */
4515 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4516 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4518 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4519 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4522 else if ( op == PLUS || op == STAR) {
4523 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4525 else if (op == ANYOF) {
4526 /* arglen 1 + class block */
4527 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4528 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4529 node = NEXTOPER(node);
4531 else if (PL_regkind[(U8)op] == EXACT) {
4532 /* Literal string, where present. */
4533 node += NODE_SZ_STR(node) - 1;
4534 node = NEXTOPER(node);
4537 node = NEXTOPER(node);
4538 node += regarglen[(U8)op];
4540 if (op == CURLYX || op == OPEN)
4542 else if (op == WHILEM)
4548 #endif /* DEBUGGING */
4551 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4554 Perl_regdump(pTHX_ regexp *r)
4557 SV *sv = sv_newmortal();
4559 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4561 /* Header fields of interest. */
4562 if (r->anchored_substr)
4563 PerlIO_printf(Perl_debug_log,
4564 "anchored `%s%.*s%s'%s at %"IVdf" ",
4566 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4567 SvPVX(r->anchored_substr),
4569 SvTAIL(r->anchored_substr) ? "$" : "",
4570 (IV)r->anchored_offset);
4571 else if (r->anchored_utf8)
4572 PerlIO_printf(Perl_debug_log,
4573 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4575 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4576 SvPVX(r->anchored_utf8),
4578 SvTAIL(r->anchored_utf8) ? "$" : "",
4579 (IV)r->anchored_offset);
4580 if (r->float_substr)
4581 PerlIO_printf(Perl_debug_log,
4582 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4584 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4585 SvPVX(r->float_substr),
4587 SvTAIL(r->float_substr) ? "$" : "",
4588 (IV)r->float_min_offset, (UV)r->float_max_offset);
4589 else if (r->float_utf8)
4590 PerlIO_printf(Perl_debug_log,
4591 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4593 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4594 SvPVX(r->float_utf8),
4596 SvTAIL(r->float_utf8) ? "$" : "",
4597 (IV)r->float_min_offset, (UV)r->float_max_offset);
4598 if (r->check_substr || r->check_utf8)
4599 PerlIO_printf(Perl_debug_log,
4600 r->check_substr == r->float_substr
4601 && r->check_utf8 == r->float_utf8
4602 ? "(checking floating" : "(checking anchored");
4603 if (r->reganch & ROPT_NOSCAN)
4604 PerlIO_printf(Perl_debug_log, " noscan");
4605 if (r->reganch & ROPT_CHECK_ALL)
4606 PerlIO_printf(Perl_debug_log, " isall");
4607 if (r->check_substr || r->check_utf8)
4608 PerlIO_printf(Perl_debug_log, ") ");
4610 if (r->regstclass) {
4611 regprop(sv, r->regstclass);
4612 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4614 if (r->reganch & ROPT_ANCH) {
4615 PerlIO_printf(Perl_debug_log, "anchored");
4616 if (r->reganch & ROPT_ANCH_BOL)
4617 PerlIO_printf(Perl_debug_log, "(BOL)");
4618 if (r->reganch & ROPT_ANCH_MBOL)
4619 PerlIO_printf(Perl_debug_log, "(MBOL)");
4620 if (r->reganch & ROPT_ANCH_SBOL)
4621 PerlIO_printf(Perl_debug_log, "(SBOL)");
4622 if (r->reganch & ROPT_ANCH_GPOS)
4623 PerlIO_printf(Perl_debug_log, "(GPOS)");
4624 PerlIO_putc(Perl_debug_log, ' ');
4626 if (r->reganch & ROPT_GPOS_SEEN)
4627 PerlIO_printf(Perl_debug_log, "GPOS ");
4628 if (r->reganch & ROPT_SKIP)
4629 PerlIO_printf(Perl_debug_log, "plus ");
4630 if (r->reganch & ROPT_IMPLICIT)
4631 PerlIO_printf(Perl_debug_log, "implicit ");
4632 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4633 if (r->reganch & ROPT_EVAL_SEEN)
4634 PerlIO_printf(Perl_debug_log, "with eval ");
4635 PerlIO_printf(Perl_debug_log, "\n");
4638 U32 len = r->offsets[0];
4639 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4640 for (i = 1; i <= len; i++)
4641 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4642 (UV)r->offsets[i*2-1],
4643 (UV)r->offsets[i*2]);
4644 PerlIO_printf(Perl_debug_log, "\n");
4646 #endif /* DEBUGGING */
4652 S_put_byte(pTHX_ SV *sv, int c)
4654 if (isCNTRL(c) || c == 255 || !isPRINT(c))
4655 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4656 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4657 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4659 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4662 #endif /* DEBUGGING */
4665 - regprop - printable representation of opcode
4668 Perl_regprop(pTHX_ SV *sv, regnode *o)
4673 sv_setpvn(sv, "", 0);
4674 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4675 /* It would be nice to FAIL() here, but this may be called from
4676 regexec.c, and it would be hard to supply pRExC_state. */
4677 Perl_croak(aTHX_ "Corrupted regexp opcode");
4678 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4680 k = PL_regkind[(U8)OP(o)];
4683 SV *dsv = sv_2mortal(newSVpvn("", 0));
4684 /* Using is_utf8_string() is a crude hack but it may
4685 * be the best for now since we have no flag "this EXACTish
4686 * node was UTF-8" --jhi */
4687 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4689 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4690 UNI_DISPLAY_REGEX) :
4695 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4700 else if (k == CURLY) {
4701 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4702 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4703 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4705 else if (k == WHILEM && o->flags) /* Ordinal/of */
4706 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4707 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4708 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4709 else if (k == LOGICAL)
4710 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4711 else if (k == ANYOF) {
4712 int i, rangestart = -1;
4713 U8 flags = ANYOF_FLAGS(o);
4714 const char * const anyofs[] = { /* Should be syncronized with
4715 * ANYOF_ #xdefines in regcomp.h */
4748 if (flags & ANYOF_LOCALE)
4749 sv_catpv(sv, "{loc}");
4750 if (flags & ANYOF_FOLD)
4751 sv_catpv(sv, "{i}");
4752 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4753 if (flags & ANYOF_INVERT)
4755 for (i = 0; i <= 256; i++) {
4756 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4757 if (rangestart == -1)
4759 } else if (rangestart != -1) {
4760 if (i <= rangestart + 3)
4761 for (; rangestart < i; rangestart++)
4762 put_byte(sv, rangestart);
4764 put_byte(sv, rangestart);
4766 put_byte(sv, i - 1);
4772 if (o->flags & ANYOF_CLASS)
4773 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4774 if (ANYOF_CLASS_TEST(o,i))
4775 sv_catpv(sv, anyofs[i]);
4777 if (flags & ANYOF_UNICODE)
4778 sv_catpv(sv, "{unicode}");
4779 else if (flags & ANYOF_UNICODE_ALL)
4780 sv_catpv(sv, "{unicode_all}");
4784 SV *sw = regclass_swash(o, FALSE, &lv, 0);
4788 U8 s[UTF8_MAXLEN+1];
4790 for (i = 0; i <= 256; i++) { /* just the first 256 */
4791 U8 *e = uvchr_to_utf8(s, i);
4793 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4794 if (rangestart == -1)
4796 } else if (rangestart != -1) {
4799 if (i <= rangestart + 3)
4800 for (; rangestart < i; rangestart++) {
4801 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4805 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4808 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4815 sv_catpv(sv, "..."); /* et cetera */
4819 char *s = savepv(SvPVX(lv));
4822 while(*s && *s != '\n') s++;
4843 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4845 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4846 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4847 #endif /* DEBUGGING */
4851 Perl_re_intuit_string(pTHX_ regexp *prog)
4852 { /* Assume that RE_INTUIT is set */
4855 char *s = SvPV(prog->check_substr
4856 ? prog->check_substr : prog->check_utf8, n_a);
4858 if (!PL_colorset) reginitcolors();
4859 PerlIO_printf(Perl_debug_log,
4860 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4862 prog->check_substr ? "" : "utf8 ",
4863 PL_colors[5],PL_colors[0],
4866 (strlen(s) > 60 ? "..." : ""));
4869 return prog->check_substr ? prog->check_substr : prog->check_utf8;
4873 Perl_pregfree(pTHX_ struct regexp *r)
4876 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4879 if (!r || (--r->refcnt > 0))
4885 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4886 r->prelen, 60, UNI_DISPLAY_REGEX)
4887 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4891 PerlIO_printf(Perl_debug_log,
4892 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4893 PL_colors[4],PL_colors[5],PL_colors[0],
4896 len > 60 ? "..." : "");
4900 Safefree(r->precomp);
4901 if (r->offsets) /* 20010421 MJD */
4902 Safefree(r->offsets);
4903 if (RX_MATCH_COPIED(r))
4904 Safefree(r->subbeg);
4906 if (r->anchored_substr)
4907 SvREFCNT_dec(r->anchored_substr);
4908 if (r->anchored_utf8)
4909 SvREFCNT_dec(r->anchored_utf8);
4910 if (r->float_substr)
4911 SvREFCNT_dec(r->float_substr);
4913 SvREFCNT_dec(r->float_utf8);
4914 Safefree(r->substrs);
4917 int n = r->data->count;
4918 PAD* new_comppad = NULL;
4922 /* If you add a ->what type here, update the comment in regcomp.h */
4923 switch (r->data->what[n]) {
4925 SvREFCNT_dec((SV*)r->data->data[n]);
4928 Safefree(r->data->data[n]);
4931 new_comppad = (AV*)r->data->data[n];
4934 if (new_comppad == NULL)
4935 Perl_croak(aTHX_ "panic: pregfree comppad");
4936 PAD_SAVE_LOCAL(old_comppad,
4937 /* Watch out for global destruction's random ordering. */
4938 (SvTYPE(new_comppad) == SVt_PVAV) ?
4939 new_comppad : Null(PAD *)
4941 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4942 op_free((OP_4tree*)r->data->data[n]);
4945 PAD_RESTORE_LOCAL(old_comppad);
4946 SvREFCNT_dec((SV*)new_comppad);
4952 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4955 Safefree(r->data->what);
4958 Safefree(r->startp);
4964 - regnext - dig the "next" pointer out of a node
4966 * [Note, when REGALIGN is defined there are two places in regmatch()
4967 * that bypass this code for speed.]
4970 Perl_regnext(pTHX_ register regnode *p)
4972 register I32 offset;
4974 if (p == &PL_regdummy)
4977 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4985 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
4988 STRLEN l1 = strlen(pat1);
4989 STRLEN l2 = strlen(pat2);
4998 Copy(pat1, buf, l1 , char);
4999 Copy(pat2, buf + l1, l2 , char);
5000 buf[l1 + l2] = '\n';
5001 buf[l1 + l2 + 1] = '\0';
5003 /* ANSI variant takes additional second argument */
5004 va_start(args, pat2);
5008 msv = vmess(buf, &args);
5010 message = SvPV(msv,l1);
5013 Copy(message, buf, l1 , char);
5014 buf[l1] = '\0'; /* Overwrite \n */
5015 Perl_croak(aTHX_ "%s", buf);
5018 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5021 Perl_save_re_context(pTHX)
5023 SAVEI32(PL_reg_flags); /* from regexec.c */
5025 SAVEPPTR(PL_reginput); /* String-input pointer. */
5026 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5027 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
5028 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5029 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5030 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
5031 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
5032 SAVEPPTR(PL_regtill); /* How far we are required to go. */
5033 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
5034 PL_reg_start_tmp = 0;
5035 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5036 PL_reg_start_tmpl = 0;
5037 SAVEVPTR(PL_regdata);
5038 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5039 SAVEI32(PL_regnarrate); /* from regexec.c */
5040 SAVEVPTR(PL_regprogram); /* from regexec.c */
5041 SAVEINT(PL_regindent); /* from regexec.c */
5042 SAVEVPTR(PL_regcc); /* from regexec.c */
5043 SAVEVPTR(PL_curcop);
5044 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5045 SAVEVPTR(PL_reg_re); /* from regexec.c */
5046 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5047 SAVESPTR(PL_reg_sv); /* from regexec.c */
5048 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
5049 SAVEVPTR(PL_reg_magic); /* from regexec.c */
5050 SAVEI32(PL_reg_oldpos); /* from regexec.c */
5051 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5052 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5053 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
5054 PL_reg_oldsaved = Nullch;
5055 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
5056 PL_reg_oldsavedlen = 0;
5057 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
5059 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
5060 PL_reg_leftiter = 0;
5061 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
5062 PL_reg_poscache = Nullch;
5063 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
5064 PL_reg_poscache_size = 0;
5065 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5066 SAVEI32(PL_regnpar); /* () count. */
5067 SAVEI32(PL_regsize); /* from regexec.c */
5070 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5076 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5077 for (i = 1; i <= rx->nparens; i++) {
5078 sprintf(digits, "%lu", (long)i);
5079 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5086 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
5091 clear_re(pTHX_ void *r)
5093 ReREFCNT_dec((regexp *)r);