5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_pregcomp my_regcomp
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_pregfree my_regfree
39 # define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 # define Perl_regnext my_regnext
42 # define Perl_save_re_context my_save_re_context
43 # define Perl_reginitcolors my_reginitcolors
45 # define PERL_NO_GET_CONTEXT
50 * pregcomp and pregexec -- regsub and regerror are not used in perl
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (c) 1991-2003, Larry Wall
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
83 #define PERL_IN_REGCOMP_C
86 #ifndef PERL_IN_XSUB_RE
98 # if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
107 #define STATIC static
110 typedef struct RExC_state_t {
111 U32 flags; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
114 char *start; /* Start of input for compile */
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
118 regnode *emit_start; /* Start of emitted-code area */
119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
130 char *starttry; /* -Dr: where regtry was called. */
131 #define RExC_starttry (pRExC_state->starttry)
135 #define RExC_flags (pRExC_state->flags)
136 #define RExC_precomp (pRExC_state->precomp)
137 #define RExC_rx (pRExC_state->rx)
138 #define RExC_start (pRExC_state->start)
139 #define RExC_end (pRExC_state->end)
140 #define RExC_parse (pRExC_state->parse)
141 #define RExC_whilem_seen (pRExC_state->whilem_seen)
142 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty (pRExC_state->naughty)
146 #define RExC_sawback (pRExC_state->sawback)
147 #define RExC_seen (pRExC_state->seen)
148 #define RExC_size (pRExC_state->size)
149 #define RExC_npar (pRExC_state->npar)
150 #define RExC_extralen (pRExC_state->extralen)
151 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8 (pRExC_state->utf8)
155 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
160 #undef SPSTART /* dratted cpp namespace... */
163 * Flags to be passed up and down.
165 #define WORST 0 /* Worst case. */
166 #define HASWIDTH 0x1 /* Known to match non-null strings. */
167 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART 0x4 /* Starts with * or +. */
169 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
171 /* Length of a variant. */
173 typedef struct scan_data_t {
179 I32 last_end; /* min value, <0 unless valid. */
182 SV **longest; /* Either &l_fixed, or &l_float. */
186 I32 offset_float_min;
187 I32 offset_float_max;
191 struct regnode_charclass_class *start_class;
195 * Forward declarations for pregcomp()'s friends.
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL 0x1
203 #define SF_BEFORE_MEOL 0x2
204 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
208 # define SF_FIX_SHIFT_EOL (0+2)
209 # define SF_FL_SHIFT_EOL (0+4)
211 # define SF_FIX_SHIFT_EOL (+2)
212 # define SF_FL_SHIFT_EOL (+4)
215 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
218 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF 0x40
221 #define SF_HAS_PAR 0x80
222 #define SF_IN_PAR 0x100
223 #define SF_HAS_EVAL 0x200
224 #define SCF_DO_SUBSTR 0x400
225 #define SCF_DO_STCLASS_AND 0x0800
226 #define SCF_DO_STCLASS_OR 0x1000
227 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS 0x2000
230 #define UTF (RExC_utf8 != 0)
231 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
232 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
234 #define OOB_UNICODE 12345678
235 #define OOB_NAMEDCLASS -1
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
249 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
259 #define FAIL(msg) STMT_START { \
260 char *ellipses = ""; \
261 IV len = RExC_end - RExC_precomp; \
264 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
265 if (len > RegexLengthToShowInErrorMessages) { \
266 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
267 len = RegexLengthToShowInErrorMessages - 10; \
270 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
271 msg, (int)len, RExC_precomp, ellipses); \
275 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
276 * args. Show regex, up to a maximum length. If it's too long, chop and add
279 #define FAIL2(pat,msg) STMT_START { \
280 char *ellipses = ""; \
281 IV len = RExC_end - RExC_precomp; \
284 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
285 if (len > RegexLengthToShowInErrorMessages) { \
286 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
287 len = RegexLengthToShowInErrorMessages - 10; \
290 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
291 msg, (int)len, RExC_precomp, ellipses); \
296 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
298 #define Simple_vFAIL(m) STMT_START { \
299 IV offset = RExC_parse - RExC_precomp; \
300 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
301 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
305 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
307 #define vFAIL(m) STMT_START { \
309 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
314 * Like Simple_vFAIL(), but accepts two arguments.
316 #define Simple_vFAIL2(m,a1) STMT_START { \
317 IV offset = RExC_parse - RExC_precomp; \
318 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
319 (int)offset, RExC_precomp, RExC_precomp + offset); \
323 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
325 #define vFAIL2(m,a1) STMT_START { \
327 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
328 Simple_vFAIL2(m, a1); \
333 * Like Simple_vFAIL(), but accepts three arguments.
335 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
336 IV offset = RExC_parse - RExC_precomp; \
337 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
338 (int)offset, RExC_precomp, RExC_precomp + offset); \
342 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
344 #define vFAIL3(m,a1,a2) STMT_START { \
346 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
347 Simple_vFAIL3(m, a1, a2); \
351 * Like Simple_vFAIL(), but accepts four arguments.
353 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
354 IV offset = RExC_parse - RExC_precomp; \
355 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
356 (int)offset, RExC_precomp, RExC_precomp + offset); \
360 * Like Simple_vFAIL(), but accepts five arguments.
362 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
363 IV offset = RExC_parse - RExC_precomp; \
364 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
365 (int)offset, RExC_precomp, RExC_precomp + offset); \
369 #define vWARN(loc,m) STMT_START { \
370 IV offset = loc - RExC_precomp; \
371 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
372 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
375 #define vWARNdep(loc,m) STMT_START { \
376 IV offset = loc - RExC_precomp; \
377 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
378 "%s" REPORT_LOCATION, \
379 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
383 #define vWARN2(loc, m, a1) STMT_START { \
384 IV offset = loc - RExC_precomp; \
385 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
386 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
389 #define vWARN3(loc, m, a1, a2) STMT_START { \
390 IV offset = loc - RExC_precomp; \
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
395 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
396 IV offset = loc - RExC_precomp; \
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
401 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
402 IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
408 /* Allow for side effects in s */
409 #define REGC(c,s) STMT_START { \
410 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
413 /* Macros for recording node offsets. 20001227 mjd@plover.com
414 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
415 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
416 * Element 0 holds the number n.
419 #define MJD_OFFSET_DEBUG(x)
420 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
423 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
425 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
426 __LINE__, (node), (byte))); \
428 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
430 RExC_offsets[2*(node)-1] = (byte); \
435 #define Set_Node_Offset(node,byte) \
436 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
439 #define Set_Node_Length_To_R(node,len) STMT_START { \
441 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
442 __LINE__, (node), (len))); \
444 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
446 RExC_offsets[2*(node)] = (len); \
451 #define Set_Node_Length(node,len) \
452 Set_Node_Length_To_R((node)-RExC_emit_start, len)
453 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454 #define Set_Node_Cur_Length(node) \
455 Set_Node_Length(node, RExC_parse - parse_start)
457 /* Get offsets and lengths */
458 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
461 static void clear_re(pTHX_ void *r);
463 /* Mark that we cannot extend a found fixed substring at this point.
464 Updata the longest found anchored substring and the longest found
465 floating substrings if needed. */
468 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
470 STRLEN l = CHR_SVLEN(data->last_found);
471 STRLEN old_l = CHR_SVLEN(*data->longest);
473 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
474 sv_setsv(*data->longest, data->last_found);
475 if (*data->longest == data->longest_fixed) {
476 data->offset_fixed = l ? data->last_start_min : data->pos_min;
477 if (data->flags & SF_BEFORE_EOL)
479 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
481 data->flags &= ~SF_FIX_BEFORE_EOL;
484 data->offset_float_min = l ? data->last_start_min : data->pos_min;
485 data->offset_float_max = (l
486 ? data->last_start_max
487 : data->pos_min + data->pos_delta);
488 if ((U32)data->offset_float_max > (U32)I32_MAX)
489 data->offset_float_max = I32_MAX;
490 if (data->flags & SF_BEFORE_EOL)
492 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
494 data->flags &= ~SF_FL_BEFORE_EOL;
497 SvCUR_set(data->last_found, 0);
499 SV * sv = data->last_found;
501 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
502 if (mg && mg->mg_len > 0)
506 data->flags &= ~SF_BEFORE_EOL;
509 /* Can match anything (initialization) */
511 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
513 ANYOF_CLASS_ZERO(cl);
514 ANYOF_BITMAP_SETALL(cl);
515 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
517 cl->flags |= ANYOF_LOCALE;
520 /* Can match anything (initialization) */
522 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
526 for (value = 0; value <= ANYOF_MAX; value += 2)
527 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
529 if (!(cl->flags & ANYOF_UNICODE_ALL))
531 if (!ANYOF_BITMAP_TESTALLSET(cl))
536 /* Can match anything (initialization) */
538 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
540 Zero(cl, 1, struct regnode_charclass_class);
542 cl_anything(pRExC_state, cl);
546 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
548 Zero(cl, 1, struct regnode_charclass_class);
550 cl_anything(pRExC_state, cl);
552 cl->flags |= ANYOF_LOCALE;
555 /* 'And' a given class with another one. Can create false positives */
556 /* We assume that cl is not inverted */
558 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
559 struct regnode_charclass_class *and_with)
561 if (!(and_with->flags & ANYOF_CLASS)
562 && !(cl->flags & ANYOF_CLASS)
563 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
564 && !(and_with->flags & ANYOF_FOLD)
565 && !(cl->flags & ANYOF_FOLD)) {
568 if (and_with->flags & ANYOF_INVERT)
569 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
570 cl->bitmap[i] &= ~and_with->bitmap[i];
572 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
573 cl->bitmap[i] &= and_with->bitmap[i];
574 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
575 if (!(and_with->flags & ANYOF_EOS))
576 cl->flags &= ~ANYOF_EOS;
578 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
579 cl->flags &= ~ANYOF_UNICODE_ALL;
580 cl->flags |= ANYOF_UNICODE;
581 ARG_SET(cl, ARG(and_with));
583 if (!(and_with->flags & ANYOF_UNICODE_ALL))
584 cl->flags &= ~ANYOF_UNICODE_ALL;
585 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
586 cl->flags &= ~ANYOF_UNICODE;
589 /* 'OR' a given class with another one. Can create false positives */
590 /* We assume that cl is not inverted */
592 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
594 if (or_with->flags & ANYOF_INVERT) {
596 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
597 * <= (B1 | !B2) | (CL1 | !CL2)
598 * which is wasteful if CL2 is small, but we ignore CL2:
599 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
600 * XXXX Can we handle case-fold? Unclear:
601 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
602 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
604 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
605 && !(or_with->flags & ANYOF_FOLD)
606 && !(cl->flags & ANYOF_FOLD) ) {
609 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
610 cl->bitmap[i] |= ~or_with->bitmap[i];
611 } /* XXXX: logic is complicated otherwise */
613 cl_anything(pRExC_state, cl);
616 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
617 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
618 && (!(or_with->flags & ANYOF_FOLD)
619 || (cl->flags & ANYOF_FOLD)) ) {
622 /* OR char bitmap and class bitmap separately */
623 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624 cl->bitmap[i] |= or_with->bitmap[i];
625 if (or_with->flags & ANYOF_CLASS) {
626 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
627 cl->classflags[i] |= or_with->classflags[i];
628 cl->flags |= ANYOF_CLASS;
631 else { /* XXXX: logic is complicated, leave it along for a moment. */
632 cl_anything(pRExC_state, cl);
635 if (or_with->flags & ANYOF_EOS)
636 cl->flags |= ANYOF_EOS;
638 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
639 ARG(cl) != ARG(or_with)) {
640 cl->flags |= ANYOF_UNICODE_ALL;
641 cl->flags &= ~ANYOF_UNICODE;
643 if (or_with->flags & ANYOF_UNICODE_ALL) {
644 cl->flags |= ANYOF_UNICODE_ALL;
645 cl->flags &= ~ANYOF_UNICODE;
650 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
651 * These need to be revisited when a newer toolchain becomes available.
653 #if defined(__sparc64__) && defined(__GNUC__)
654 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
655 # undef SPARC64_GCC_WORKAROUND
656 # define SPARC64_GCC_WORKAROUND 1
660 /* REx optimizer. Converts nodes into quickier variants "in place".
661 Finds fixed substrings. */
663 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
664 to the position after last scanned or to NULL. */
667 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
668 /* scanp: Start here (read-write). */
669 /* deltap: Write maxlen-minlen here. */
670 /* last: Stop before this one. */
672 I32 min = 0, pars = 0, code;
673 regnode *scan = *scanp, *next;
675 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
676 int is_inf_internal = 0; /* The studied chunk is infinite */
677 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
678 scan_data_t data_fake;
679 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
681 while (scan && OP(scan) != END && scan < last) {
682 /* Peephole optimizer: */
684 if (PL_regkind[(U8)OP(scan)] == EXACT) {
685 /* Merge several consecutive EXACTish nodes into one. */
686 regnode *n = regnext(scan);
689 regnode *stop = scan;
692 next = scan + NODE_SZ_STR(scan);
693 /* Skip NOTHING, merge EXACT*. */
695 ( PL_regkind[(U8)OP(n)] == NOTHING ||
696 (stringok && (OP(n) == OP(scan))))
698 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
699 if (OP(n) == TAIL || n > next)
701 if (PL_regkind[(U8)OP(n)] == NOTHING) {
702 NEXT_OFF(scan) += NEXT_OFF(n);
703 next = n + NODE_STEP_REGNODE;
711 int oldl = STR_LEN(scan);
712 regnode *nnext = regnext(n);
714 if (oldl + STR_LEN(n) > U8_MAX)
716 NEXT_OFF(scan) += NEXT_OFF(n);
717 STR_LEN(scan) += STR_LEN(n);
718 next = n + NODE_SZ_STR(n);
719 /* Now we can overwrite *n : */
720 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
728 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
730 Two problematic code points in Unicode casefolding of EXACT nodes:
732 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
733 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
739 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
740 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
742 This means that in case-insensitive matching (or "loose matching",
743 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
744 length of the above casefolded versions) can match a target string
745 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
746 This would rather mess up the minimum length computation.
748 What we'll do is to look for the tail four bytes, and then peek
749 at the preceding two bytes to see whether we need to decrease
750 the minimum length by four (six minus two).
752 Thanks to the design of UTF-8, there cannot be false matches:
753 A sequence of valid UTF-8 bytes cannot be a subsequence of
754 another valid sequence of UTF-8 bytes.
757 char *s0 = STRING(scan), *s, *t;
758 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
759 char *t0 = "\xcc\x88\xcc\x81";
763 s < s2 && (t = ninstr(s, s1, t0, t1));
765 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
766 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
773 n = scan + NODE_SZ_STR(scan);
775 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
783 /* Follow the next-chain of the current node and optimize
784 away all the NOTHINGs from it. */
785 if (OP(scan) != CURLYX) {
786 int max = (reg_off_by_arg[OP(scan)]
788 /* I32 may be smaller than U16 on CRAYs! */
789 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
790 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
794 /* Skip NOTHING and LONGJMP. */
795 while ((n = regnext(n))
796 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
797 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
800 if (reg_off_by_arg[OP(scan)])
803 NEXT_OFF(scan) = off;
805 /* The principal pseudo-switch. Cannot be a switch, since we
806 look into several different things. */
807 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
808 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
809 next = regnext(scan);
812 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
813 I32 max1 = 0, min1 = I32_MAX, num = 0;
814 struct regnode_charclass_class accum;
816 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
817 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
818 if (flags & SCF_DO_STCLASS)
819 cl_init_zero(pRExC_state, &accum);
820 while (OP(scan) == code) {
821 I32 deltanext, minnext, f = 0, fake;
822 struct regnode_charclass_class this_class;
827 data_fake.whilem_c = data->whilem_c;
828 data_fake.last_closep = data->last_closep;
831 data_fake.last_closep = &fake;
832 next = regnext(scan);
833 scan = NEXTOPER(scan);
835 scan = NEXTOPER(scan);
836 if (flags & SCF_DO_STCLASS) {
837 cl_init(pRExC_state, &this_class);
838 data_fake.start_class = &this_class;
839 f = SCF_DO_STCLASS_AND;
841 if (flags & SCF_WHILEM_VISITED_POS)
842 f |= SCF_WHILEM_VISITED_POS;
843 /* we suppose the run is continuous, last=next...*/
844 minnext = study_chunk(pRExC_state, &scan, &deltanext,
845 next, &data_fake, f);
848 if (max1 < minnext + deltanext)
849 max1 = minnext + deltanext;
850 if (deltanext == I32_MAX)
851 is_inf = is_inf_internal = 1;
853 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
855 if (data && (data_fake.flags & SF_HAS_EVAL))
856 data->flags |= SF_HAS_EVAL;
858 data->whilem_c = data_fake.whilem_c;
859 if (flags & SCF_DO_STCLASS)
860 cl_or(pRExC_state, &accum, &this_class);
864 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
866 if (flags & SCF_DO_SUBSTR) {
867 data->pos_min += min1;
868 data->pos_delta += max1 - min1;
869 if (max1 != min1 || is_inf)
870 data->longest = &(data->longest_float);
873 delta += max1 - min1;
874 if (flags & SCF_DO_STCLASS_OR) {
875 cl_or(pRExC_state, data->start_class, &accum);
877 cl_and(data->start_class, &and_with);
878 flags &= ~SCF_DO_STCLASS;
881 else if (flags & SCF_DO_STCLASS_AND) {
883 cl_and(data->start_class, &accum);
884 flags &= ~SCF_DO_STCLASS;
887 /* Switch to OR mode: cache the old value of
888 * data->start_class */
889 StructCopy(data->start_class, &and_with,
890 struct regnode_charclass_class);
891 flags &= ~SCF_DO_STCLASS_AND;
892 StructCopy(&accum, data->start_class,
893 struct regnode_charclass_class);
894 flags |= SCF_DO_STCLASS_OR;
895 data->start_class->flags |= ANYOF_EOS;
899 else if (code == BRANCHJ) /* single branch is optimized. */
900 scan = NEXTOPER(NEXTOPER(scan));
901 else /* single branch is optimized. */
902 scan = NEXTOPER(scan);
905 else if (OP(scan) == EXACT) {
906 I32 l = STR_LEN(scan);
907 UV uc = *((U8*)STRING(scan));
909 U8 *s = (U8*)STRING(scan);
910 l = utf8_length(s, s + l);
911 uc = utf8_to_uvchr(s, NULL);
914 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
915 /* The code below prefers earlier match for fixed
916 offset, later match for variable offset. */
917 if (data->last_end == -1) { /* Update the start info. */
918 data->last_start_min = data->pos_min;
919 data->last_start_max = is_inf
920 ? I32_MAX : data->pos_min + data->pos_delta;
922 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
924 SV * sv = data->last_found;
925 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
926 mg_find(sv, PERL_MAGIC_utf8) : NULL;
927 if (mg && mg->mg_len >= 0)
928 mg->mg_len += utf8_length((U8*)STRING(scan),
929 (U8*)STRING(scan)+STR_LEN(scan));
932 SvUTF8_on(data->last_found);
933 data->last_end = data->pos_min + l;
934 data->pos_min += l; /* As in the first entry. */
935 data->flags &= ~SF_BEFORE_EOL;
937 if (flags & SCF_DO_STCLASS_AND) {
938 /* Check whether it is compatible with what we know already! */
942 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
943 && !ANYOF_BITMAP_TEST(data->start_class, uc)
944 && (!(data->start_class->flags & ANYOF_FOLD)
945 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
948 ANYOF_CLASS_ZERO(data->start_class);
949 ANYOF_BITMAP_ZERO(data->start_class);
951 ANYOF_BITMAP_SET(data->start_class, uc);
952 data->start_class->flags &= ~ANYOF_EOS;
954 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
956 else if (flags & SCF_DO_STCLASS_OR) {
957 /* false positive possible if the class is case-folded */
959 ANYOF_BITMAP_SET(data->start_class, uc);
961 data->start_class->flags |= ANYOF_UNICODE_ALL;
962 data->start_class->flags &= ~ANYOF_EOS;
963 cl_and(data->start_class, &and_with);
965 flags &= ~SCF_DO_STCLASS;
967 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
968 I32 l = STR_LEN(scan);
969 UV uc = *((U8*)STRING(scan));
971 /* Search for fixed substrings supports EXACT only. */
972 if (flags & SCF_DO_SUBSTR)
973 scan_commit(pRExC_state, data);
975 U8 *s = (U8 *)STRING(scan);
976 l = utf8_length(s, s + l);
977 uc = utf8_to_uvchr(s, NULL);
980 if (data && (flags & SCF_DO_SUBSTR))
982 if (flags & SCF_DO_STCLASS_AND) {
983 /* Check whether it is compatible with what we know already! */
987 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
988 && !ANYOF_BITMAP_TEST(data->start_class, uc)
989 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
991 ANYOF_CLASS_ZERO(data->start_class);
992 ANYOF_BITMAP_ZERO(data->start_class);
994 ANYOF_BITMAP_SET(data->start_class, uc);
995 data->start_class->flags &= ~ANYOF_EOS;
996 data->start_class->flags |= ANYOF_FOLD;
997 if (OP(scan) == EXACTFL)
998 data->start_class->flags |= ANYOF_LOCALE;
1001 else if (flags & SCF_DO_STCLASS_OR) {
1002 if (data->start_class->flags & ANYOF_FOLD) {
1003 /* false positive possible if the class is case-folded.
1004 Assume that the locale settings are the same... */
1006 ANYOF_BITMAP_SET(data->start_class, uc);
1007 data->start_class->flags &= ~ANYOF_EOS;
1009 cl_and(data->start_class, &and_with);
1011 flags &= ~SCF_DO_STCLASS;
1013 else if (strchr((char*)PL_varies,OP(scan))) {
1014 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1015 I32 f = flags, pos_before = 0;
1016 regnode *oscan = scan;
1017 struct regnode_charclass_class this_class;
1018 struct regnode_charclass_class *oclass = NULL;
1019 I32 next_is_eval = 0;
1021 switch (PL_regkind[(U8)OP(scan)]) {
1022 case WHILEM: /* End of (?:...)* . */
1023 scan = NEXTOPER(scan);
1026 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1027 next = NEXTOPER(scan);
1028 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1030 maxcount = REG_INFTY;
1031 next = regnext(scan);
1032 scan = NEXTOPER(scan);
1036 if (flags & SCF_DO_SUBSTR)
1041 if (flags & SCF_DO_STCLASS) {
1043 maxcount = REG_INFTY;
1044 next = regnext(scan);
1045 scan = NEXTOPER(scan);
1048 is_inf = is_inf_internal = 1;
1049 scan = regnext(scan);
1050 if (flags & SCF_DO_SUBSTR) {
1051 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1052 data->longest = &(data->longest_float);
1054 goto optimize_curly_tail;
1056 mincount = ARG1(scan);
1057 maxcount = ARG2(scan);
1058 next = regnext(scan);
1059 if (OP(scan) == CURLYX) {
1060 I32 lp = (data ? *(data->last_closep) : 0);
1062 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1064 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1065 next_is_eval = (OP(scan) == EVAL);
1067 if (flags & SCF_DO_SUBSTR) {
1068 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1069 pos_before = data->pos_min;
1073 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1075 data->flags |= SF_IS_INF;
1077 if (flags & SCF_DO_STCLASS) {
1078 cl_init(pRExC_state, &this_class);
1079 oclass = data->start_class;
1080 data->start_class = &this_class;
1081 f |= SCF_DO_STCLASS_AND;
1082 f &= ~SCF_DO_STCLASS_OR;
1084 /* These are the cases when once a subexpression
1085 fails at a particular position, it cannot succeed
1086 even after backtracking at the enclosing scope.
1088 XXXX what if minimal match and we are at the
1089 initial run of {n,m}? */
1090 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1091 f &= ~SCF_WHILEM_VISITED_POS;
1093 /* This will finish on WHILEM, setting scan, or on NULL: */
1094 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1096 ? (f & ~SCF_DO_SUBSTR) : f);
1098 if (flags & SCF_DO_STCLASS)
1099 data->start_class = oclass;
1100 if (mincount == 0 || minnext == 0) {
1101 if (flags & SCF_DO_STCLASS_OR) {
1102 cl_or(pRExC_state, data->start_class, &this_class);
1104 else if (flags & SCF_DO_STCLASS_AND) {
1105 /* Switch to OR mode: cache the old value of
1106 * data->start_class */
1107 StructCopy(data->start_class, &and_with,
1108 struct regnode_charclass_class);
1109 flags &= ~SCF_DO_STCLASS_AND;
1110 StructCopy(&this_class, data->start_class,
1111 struct regnode_charclass_class);
1112 flags |= SCF_DO_STCLASS_OR;
1113 data->start_class->flags |= ANYOF_EOS;
1115 } else { /* Non-zero len */
1116 if (flags & SCF_DO_STCLASS_OR) {
1117 cl_or(pRExC_state, data->start_class, &this_class);
1118 cl_and(data->start_class, &and_with);
1120 else if (flags & SCF_DO_STCLASS_AND)
1121 cl_and(data->start_class, &this_class);
1122 flags &= ~SCF_DO_STCLASS;
1124 if (!scan) /* It was not CURLYX, but CURLY. */
1126 if (ckWARN(WARN_REGEXP)
1127 /* ? quantifier ok, except for (?{ ... }) */
1128 && (next_is_eval || !(mincount == 0 && maxcount == 1))
1129 && (minnext == 0) && (deltanext == 0)
1130 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1131 && maxcount <= REG_INFTY/3) /* Complement check for big count */
1134 "Quantifier unexpected on zero-length expression");
1137 min += minnext * mincount;
1138 is_inf_internal |= ((maxcount == REG_INFTY
1139 && (minnext + deltanext) > 0)
1140 || deltanext == I32_MAX);
1141 is_inf |= is_inf_internal;
1142 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1144 /* Try powerful optimization CURLYX => CURLYN. */
1145 if ( OP(oscan) == CURLYX && data
1146 && data->flags & SF_IN_PAR
1147 && !(data->flags & SF_HAS_EVAL)
1148 && !deltanext && minnext == 1 ) {
1149 /* Try to optimize to CURLYN. */
1150 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1151 regnode *nxt1 = nxt;
1158 if (!strchr((char*)PL_simple,OP(nxt))
1159 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1160 && STR_LEN(nxt) == 1))
1166 if (OP(nxt) != CLOSE)
1168 /* Now we know that nxt2 is the only contents: */
1169 oscan->flags = (U8)ARG(nxt);
1171 OP(nxt1) = NOTHING; /* was OPEN. */
1173 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1174 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1175 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1176 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1177 OP(nxt + 1) = OPTIMIZED; /* was count. */
1178 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1183 /* Try optimization CURLYX => CURLYM. */
1184 if ( OP(oscan) == CURLYX && data
1185 && !(data->flags & SF_HAS_PAR)
1186 && !(data->flags & SF_HAS_EVAL)
1188 /* XXXX How to optimize if data == 0? */
1189 /* Optimize to a simpler form. */
1190 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1194 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1195 && (OP(nxt2) != WHILEM))
1197 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1198 /* Need to optimize away parenths. */
1199 if (data->flags & SF_IN_PAR) {
1200 /* Set the parenth number. */
1201 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1203 if (OP(nxt) != CLOSE)
1204 FAIL("Panic opt close");
1205 oscan->flags = (U8)ARG(nxt);
1206 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1207 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1209 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1210 OP(nxt + 1) = OPTIMIZED; /* was count. */
1211 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1212 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1215 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1216 regnode *nnxt = regnext(nxt1);
1219 if (reg_off_by_arg[OP(nxt1)])
1220 ARG_SET(nxt1, nxt2 - nxt1);
1221 else if (nxt2 - nxt1 < U16_MAX)
1222 NEXT_OFF(nxt1) = nxt2 - nxt1;
1224 OP(nxt) = NOTHING; /* Cannot beautify */
1229 /* Optimize again: */
1230 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1236 else if ((OP(oscan) == CURLYX)
1237 && (flags & SCF_WHILEM_VISITED_POS)
1238 /* See the comment on a similar expression above.
1239 However, this time it not a subexpression
1240 we care about, but the expression itself. */
1241 && (maxcount == REG_INFTY)
1242 && data && ++data->whilem_c < 16) {
1243 /* This stays as CURLYX, we can put the count/of pair. */
1244 /* Find WHILEM (as in regexec.c) */
1245 regnode *nxt = oscan + NEXT_OFF(oscan);
1247 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1249 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1250 | (RExC_whilem_seen << 4)); /* On WHILEM */
1252 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1254 if (flags & SCF_DO_SUBSTR) {
1255 SV *last_str = Nullsv;
1256 int counted = mincount != 0;
1258 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1259 #if defined(SPARC64_GCC_WORKAROUND)
1265 if (pos_before >= data->last_start_min)
1268 b = data->last_start_min;
1271 s = SvPV(data->last_found, l);
1272 old = b - data->last_start_min;
1275 I32 b = pos_before >= data->last_start_min
1276 ? pos_before : data->last_start_min;
1278 char *s = SvPV(data->last_found, l);
1279 I32 old = b - data->last_start_min;
1283 old = utf8_hop((U8*)s, old) - (U8*)s;
1286 /* Get the added string: */
1287 last_str = newSVpvn(s + old, l);
1289 SvUTF8_on(last_str);
1290 if (deltanext == 0 && pos_before == b) {
1291 /* What was added is a constant string */
1293 SvGROW(last_str, (mincount * l) + 1);
1294 repeatcpy(SvPVX(last_str) + l,
1295 SvPVX(last_str), l, mincount - 1);
1296 SvCUR(last_str) *= mincount;
1297 /* Add additional parts. */
1298 SvCUR_set(data->last_found,
1299 SvCUR(data->last_found) - l);
1300 sv_catsv(data->last_found, last_str);
1302 SV * sv = data->last_found;
1304 SvUTF8(sv) && SvMAGICAL(sv) ?
1305 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1306 if (mg && mg->mg_len >= 0)
1307 mg->mg_len += CHR_SVLEN(last_str);
1309 data->last_end += l * (mincount - 1);
1312 /* start offset must point into the last copy */
1313 data->last_start_min += minnext * (mincount - 1);
1314 data->last_start_max += is_inf ? I32_MAX
1315 : (maxcount - 1) * (minnext + data->pos_delta);
1318 /* It is counted once already... */
1319 data->pos_min += minnext * (mincount - counted);
1320 data->pos_delta += - counted * deltanext +
1321 (minnext + deltanext) * maxcount - minnext * mincount;
1322 if (mincount != maxcount) {
1323 /* Cannot extend fixed substrings found inside
1325 scan_commit(pRExC_state,data);
1326 if (mincount && last_str) {
1327 sv_setsv(data->last_found, last_str);
1328 data->last_end = data->pos_min;
1329 data->last_start_min =
1330 data->pos_min - CHR_SVLEN(last_str);
1331 data->last_start_max = is_inf
1333 : data->pos_min + data->pos_delta
1334 - CHR_SVLEN(last_str);
1336 data->longest = &(data->longest_float);
1338 SvREFCNT_dec(last_str);
1340 if (data && (fl & SF_HAS_EVAL))
1341 data->flags |= SF_HAS_EVAL;
1342 optimize_curly_tail:
1343 if (OP(oscan) != CURLYX) {
1344 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1346 NEXT_OFF(oscan) += NEXT_OFF(next);
1349 default: /* REF and CLUMP only? */
1350 if (flags & SCF_DO_SUBSTR) {
1351 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1352 data->longest = &(data->longest_float);
1354 is_inf = is_inf_internal = 1;
1355 if (flags & SCF_DO_STCLASS_OR)
1356 cl_anything(pRExC_state, data->start_class);
1357 flags &= ~SCF_DO_STCLASS;
1361 else if (strchr((char*)PL_simple,OP(scan))) {
1364 if (flags & SCF_DO_SUBSTR) {
1365 scan_commit(pRExC_state,data);
1369 if (flags & SCF_DO_STCLASS) {
1370 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1372 /* Some of the logic below assumes that switching
1373 locale on will only add false positives. */
1374 switch (PL_regkind[(U8)OP(scan)]) {
1378 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1379 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1380 cl_anything(pRExC_state, data->start_class);
1383 if (OP(scan) == SANY)
1385 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1386 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1387 || (data->start_class->flags & ANYOF_CLASS));
1388 cl_anything(pRExC_state, data->start_class);
1390 if (flags & SCF_DO_STCLASS_AND || !value)
1391 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1394 if (flags & SCF_DO_STCLASS_AND)
1395 cl_and(data->start_class,
1396 (struct regnode_charclass_class*)scan);
1398 cl_or(pRExC_state, data->start_class,
1399 (struct regnode_charclass_class*)scan);
1402 if (flags & SCF_DO_STCLASS_AND) {
1403 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1404 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1405 for (value = 0; value < 256; value++)
1406 if (!isALNUM(value))
1407 ANYOF_BITMAP_CLEAR(data->start_class, value);
1411 if (data->start_class->flags & ANYOF_LOCALE)
1412 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1414 for (value = 0; value < 256; value++)
1416 ANYOF_BITMAP_SET(data->start_class, value);
1421 if (flags & SCF_DO_STCLASS_AND) {
1422 if (data->start_class->flags & ANYOF_LOCALE)
1423 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1426 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1427 data->start_class->flags |= ANYOF_LOCALE;
1431 if (flags & SCF_DO_STCLASS_AND) {
1432 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1433 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1434 for (value = 0; value < 256; value++)
1436 ANYOF_BITMAP_CLEAR(data->start_class, value);
1440 if (data->start_class->flags & ANYOF_LOCALE)
1441 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1443 for (value = 0; value < 256; value++)
1444 if (!isALNUM(value))
1445 ANYOF_BITMAP_SET(data->start_class, value);
1450 if (flags & SCF_DO_STCLASS_AND) {
1451 if (data->start_class->flags & ANYOF_LOCALE)
1452 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1455 data->start_class->flags |= ANYOF_LOCALE;
1456 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1460 if (flags & SCF_DO_STCLASS_AND) {
1461 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1462 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1463 for (value = 0; value < 256; value++)
1464 if (!isSPACE(value))
1465 ANYOF_BITMAP_CLEAR(data->start_class, value);
1469 if (data->start_class->flags & ANYOF_LOCALE)
1470 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1472 for (value = 0; value < 256; value++)
1474 ANYOF_BITMAP_SET(data->start_class, value);
1479 if (flags & SCF_DO_STCLASS_AND) {
1480 if (data->start_class->flags & ANYOF_LOCALE)
1481 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1484 data->start_class->flags |= ANYOF_LOCALE;
1485 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1489 if (flags & SCF_DO_STCLASS_AND) {
1490 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1491 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1492 for (value = 0; value < 256; value++)
1494 ANYOF_BITMAP_CLEAR(data->start_class, value);
1498 if (data->start_class->flags & ANYOF_LOCALE)
1499 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1501 for (value = 0; value < 256; value++)
1502 if (!isSPACE(value))
1503 ANYOF_BITMAP_SET(data->start_class, value);
1508 if (flags & SCF_DO_STCLASS_AND) {
1509 if (data->start_class->flags & ANYOF_LOCALE) {
1510 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1511 for (value = 0; value < 256; value++)
1512 if (!isSPACE(value))
1513 ANYOF_BITMAP_CLEAR(data->start_class, value);
1517 data->start_class->flags |= ANYOF_LOCALE;
1518 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1522 if (flags & SCF_DO_STCLASS_AND) {
1523 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1524 for (value = 0; value < 256; value++)
1525 if (!isDIGIT(value))
1526 ANYOF_BITMAP_CLEAR(data->start_class, value);
1529 if (data->start_class->flags & ANYOF_LOCALE)
1530 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1532 for (value = 0; value < 256; value++)
1534 ANYOF_BITMAP_SET(data->start_class, value);
1539 if (flags & SCF_DO_STCLASS_AND) {
1540 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1541 for (value = 0; value < 256; value++)
1543 ANYOF_BITMAP_CLEAR(data->start_class, value);
1546 if (data->start_class->flags & ANYOF_LOCALE)
1547 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1549 for (value = 0; value < 256; value++)
1550 if (!isDIGIT(value))
1551 ANYOF_BITMAP_SET(data->start_class, value);
1556 if (flags & SCF_DO_STCLASS_OR)
1557 cl_and(data->start_class, &and_with);
1558 flags &= ~SCF_DO_STCLASS;
1561 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1562 data->flags |= (OP(scan) == MEOL
1566 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1567 /* Lookbehind, or need to calculate parens/evals/stclass: */
1568 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1569 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1570 /* Lookahead/lookbehind */
1571 I32 deltanext, minnext, fake = 0;
1573 struct regnode_charclass_class intrnl;
1576 data_fake.flags = 0;
1578 data_fake.whilem_c = data->whilem_c;
1579 data_fake.last_closep = data->last_closep;
1582 data_fake.last_closep = &fake;
1583 if ( flags & SCF_DO_STCLASS && !scan->flags
1584 && OP(scan) == IFMATCH ) { /* Lookahead */
1585 cl_init(pRExC_state, &intrnl);
1586 data_fake.start_class = &intrnl;
1587 f |= SCF_DO_STCLASS_AND;
1589 if (flags & SCF_WHILEM_VISITED_POS)
1590 f |= SCF_WHILEM_VISITED_POS;
1591 next = regnext(scan);
1592 nscan = NEXTOPER(NEXTOPER(scan));
1593 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1596 vFAIL("Variable length lookbehind not implemented");
1598 else if (minnext > U8_MAX) {
1599 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1601 scan->flags = (U8)minnext;
1603 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1605 if (data && (data_fake.flags & SF_HAS_EVAL))
1606 data->flags |= SF_HAS_EVAL;
1608 data->whilem_c = data_fake.whilem_c;
1609 if (f & SCF_DO_STCLASS_AND) {
1610 int was = (data->start_class->flags & ANYOF_EOS);
1612 cl_and(data->start_class, &intrnl);
1614 data->start_class->flags |= ANYOF_EOS;
1617 else if (OP(scan) == OPEN) {
1620 else if (OP(scan) == CLOSE) {
1621 if ((I32)ARG(scan) == is_par) {
1622 next = regnext(scan);
1624 if ( next && (OP(next) != WHILEM) && next < last)
1625 is_par = 0; /* Disable optimization */
1628 *(data->last_closep) = ARG(scan);
1630 else if (OP(scan) == EVAL) {
1632 data->flags |= SF_HAS_EVAL;
1634 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1635 if (flags & SCF_DO_SUBSTR) {
1636 scan_commit(pRExC_state,data);
1637 data->longest = &(data->longest_float);
1639 is_inf = is_inf_internal = 1;
1640 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1641 cl_anything(pRExC_state, data->start_class);
1642 flags &= ~SCF_DO_STCLASS;
1644 /* Else: zero-length, ignore. */
1645 scan = regnext(scan);
1650 *deltap = is_inf_internal ? I32_MAX : delta;
1651 if (flags & SCF_DO_SUBSTR && is_inf)
1652 data->pos_delta = I32_MAX - data->pos_min;
1653 if (is_par > U8_MAX)
1655 if (is_par && pars==1 && data) {
1656 data->flags |= SF_IN_PAR;
1657 data->flags &= ~SF_HAS_PAR;
1659 else if (pars && data) {
1660 data->flags |= SF_HAS_PAR;
1661 data->flags &= ~SF_IN_PAR;
1663 if (flags & SCF_DO_STCLASS_OR)
1664 cl_and(data->start_class, &and_with);
1669 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1671 if (RExC_rx->data) {
1672 Renewc(RExC_rx->data,
1673 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1674 char, struct reg_data);
1675 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1676 RExC_rx->data->count += n;
1679 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1680 char, struct reg_data);
1681 New(1208, RExC_rx->data->what, n, U8);
1682 RExC_rx->data->count = n;
1684 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1685 return RExC_rx->data->count - n;
1689 Perl_reginitcolors(pTHX)
1692 char *s = PerlEnv_getenv("PERL_RE_COLORS");
1695 PL_colors[0] = s = savepv(s);
1697 s = strchr(s, '\t');
1703 PL_colors[i] = s = "";
1707 PL_colors[i++] = "";
1714 - pregcomp - compile a regular expression into internal code
1716 * We can't allocate space until we know how big the compiled form will be,
1717 * but we can't compile it (and thus know how big it is) until we've got a
1718 * place to put the code. So we cheat: we compile it twice, once with code
1719 * generation turned off and size counting turned on, and once "for real".
1720 * This also means that we don't allocate space until we are sure that the
1721 * thing really will compile successfully, and we never have to move the
1722 * code and thus invalidate pointers into it. (Note that it has to be in
1723 * one piece because free() must be able to free it all.) [NB: not true in perl]
1725 * Beware that the optimization-preparation code in here knows about some
1726 * of the structure of the compiled regexp. [I'll say.]
1729 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1739 RExC_state_t RExC_state;
1740 RExC_state_t *pRExC_state = &RExC_state;
1743 FAIL("NULL regexp argument");
1745 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1749 if (!PL_colorset) reginitcolors();
1750 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1751 PL_colors[4],PL_colors[5],PL_colors[0],
1752 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1754 RExC_flags = pm->op_pmflags;
1758 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1759 RExC_seen_evals = 0;
1762 /* First pass: determine size, legality. */
1769 RExC_emit = &PL_regdummy;
1770 RExC_whilem_seen = 0;
1771 #if 0 /* REGC() is (currently) a NOP at the first pass.
1772 * Clever compilers notice this and complain. --jhi */
1773 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1775 if (reg(pRExC_state, 0, &flags) == NULL) {
1776 RExC_precomp = Nullch;
1779 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1781 /* Small enough for pointer-storage convention?
1782 If extralen==0, this means that we will not need long jumps. */
1783 if (RExC_size >= 0x10000L && RExC_extralen)
1784 RExC_size += RExC_extralen;
1787 if (RExC_whilem_seen > 15)
1788 RExC_whilem_seen = 15;
1790 /* Allocate space and initialize. */
1791 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1794 FAIL("Regexp out of space");
1797 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1798 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1801 r->prelen = xend - exp;
1802 r->precomp = savepvn(RExC_precomp, r->prelen);
1804 #ifdef PERL_COPY_ON_WRITE
1805 r->saved_copy = Nullsv;
1807 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1808 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1810 r->substrs = 0; /* Useful during FAIL. */
1811 r->startp = 0; /* Useful during FAIL. */
1812 r->endp = 0; /* Useful during FAIL. */
1814 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1816 r->offsets[0] = RExC_size;
1818 DEBUG_r(PerlIO_printf(Perl_debug_log,
1819 "%s %"UVuf" bytes for offset annotations.\n",
1820 r->offsets ? "Got" : "Couldn't get",
1821 (UV)((2*RExC_size+1) * sizeof(U32))));
1825 /* Second pass: emit code. */
1826 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
1831 RExC_emit_start = r->program;
1832 RExC_emit = r->program;
1833 /* Store the count of eval-groups for security checks: */
1834 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1835 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1837 if (reg(pRExC_state, 0, &flags) == NULL)
1840 /* Dig out information for optimizations. */
1841 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1842 pm->op_pmflags = RExC_flags;
1844 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1845 r->regstclass = NULL;
1846 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1847 r->reganch |= ROPT_NAUGHTY;
1848 scan = r->program + 1; /* First BRANCH. */
1850 /* XXXX To minimize changes to RE engine we always allocate
1851 3-units-long substrs field. */
1852 Newz(1004, r->substrs, 1, struct reg_substr_data);
1854 StructCopy(&zero_scan_data, &data, scan_data_t);
1855 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1856 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1858 STRLEN longest_float_length, longest_fixed_length;
1859 struct regnode_charclass_class ch_class;
1864 /* Skip introductions and multiplicators >= 1. */
1865 while ((OP(first) == OPEN && (sawopen = 1)) ||
1866 /* An OR of *one* alternative - should not happen now. */
1867 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1868 (OP(first) == PLUS) ||
1869 (OP(first) == MINMOD) ||
1870 /* An {n,m} with n>0 */
1871 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1872 if (OP(first) == PLUS)
1875 first += regarglen[(U8)OP(first)];
1876 first = NEXTOPER(first);
1879 /* Starting-point info. */
1881 if (PL_regkind[(U8)OP(first)] == EXACT) {
1882 if (OP(first) == EXACT)
1883 ; /* Empty, get anchored substr later. */
1884 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1885 r->regstclass = first;
1887 else if (strchr((char*)PL_simple,OP(first)))
1888 r->regstclass = first;
1889 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1890 PL_regkind[(U8)OP(first)] == NBOUND)
1891 r->regstclass = first;
1892 else if (PL_regkind[(U8)OP(first)] == BOL) {
1893 r->reganch |= (OP(first) == MBOL
1895 : (OP(first) == SBOL
1898 first = NEXTOPER(first);
1901 else if (OP(first) == GPOS) {
1902 r->reganch |= ROPT_ANCH_GPOS;
1903 first = NEXTOPER(first);
1906 else if (!sawopen && (OP(first) == STAR &&
1907 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1908 !(r->reganch & ROPT_ANCH) )
1910 /* turn .* into ^.* with an implied $*=1 */
1911 int type = OP(NEXTOPER(first));
1913 if (type == REG_ANY)
1914 type = ROPT_ANCH_MBOL;
1916 type = ROPT_ANCH_SBOL;
1918 r->reganch |= type | ROPT_IMPLICIT;
1919 first = NEXTOPER(first);
1922 if (sawplus && (!sawopen || !RExC_sawback)
1923 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1924 /* x+ must match at the 1st pos of run of x's */
1925 r->reganch |= ROPT_SKIP;
1927 /* Scan is after the zeroth branch, first is atomic matcher. */
1928 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1929 (IV)(first - scan + 1)));
1931 * If there's something expensive in the r.e., find the
1932 * longest literal string that must appear and make it the
1933 * regmust. Resolve ties in favor of later strings, since
1934 * the regstart check works with the beginning of the r.e.
1935 * and avoiding duplication strengthens checking. Not a
1936 * strong reason, but sufficient in the absence of others.
1937 * [Now we resolve ties in favor of the earlier string if
1938 * it happens that c_offset_min has been invalidated, since the
1939 * earlier string may buy us something the later one won't.]
1943 data.longest_fixed = newSVpvn("",0);
1944 data.longest_float = newSVpvn("",0);
1945 data.last_found = newSVpvn("",0);
1946 data.longest = &(data.longest_fixed);
1948 if (!r->regstclass) {
1949 cl_init(pRExC_state, &ch_class);
1950 data.start_class = &ch_class;
1951 stclass_flag = SCF_DO_STCLASS_AND;
1952 } else /* XXXX Check for BOUND? */
1954 data.last_closep = &last_close;
1956 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1957 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1958 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1959 && data.last_start_min == 0 && data.last_end > 0
1960 && !RExC_seen_zerolen
1961 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1962 r->reganch |= ROPT_CHECK_ALL;
1963 scan_commit(pRExC_state, &data);
1964 SvREFCNT_dec(data.last_found);
1966 longest_float_length = CHR_SVLEN(data.longest_float);
1967 if (longest_float_length
1968 || (data.flags & SF_FL_BEFORE_EOL
1969 && (!(data.flags & SF_FL_BEFORE_MEOL)
1970 || (RExC_flags & PMf_MULTILINE)))) {
1973 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1974 && data.offset_fixed == data.offset_float_min
1975 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1976 goto remove_float; /* As in (a)+. */
1978 if (SvUTF8(data.longest_float)) {
1979 r->float_utf8 = data.longest_float;
1980 r->float_substr = Nullsv;
1982 r->float_substr = data.longest_float;
1983 r->float_utf8 = Nullsv;
1985 r->float_min_offset = data.offset_float_min;
1986 r->float_max_offset = data.offset_float_max;
1987 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1988 && (!(data.flags & SF_FL_BEFORE_MEOL)
1989 || (RExC_flags & PMf_MULTILINE)));
1990 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1994 r->float_substr = r->float_utf8 = Nullsv;
1995 SvREFCNT_dec(data.longest_float);
1996 longest_float_length = 0;
1999 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2000 if (longest_fixed_length
2001 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2002 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2003 || (RExC_flags & PMf_MULTILINE)))) {
2006 if (SvUTF8(data.longest_fixed)) {
2007 r->anchored_utf8 = data.longest_fixed;
2008 r->anchored_substr = Nullsv;
2010 r->anchored_substr = data.longest_fixed;
2011 r->anchored_utf8 = Nullsv;
2013 r->anchored_offset = data.offset_fixed;
2014 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2015 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2016 || (RExC_flags & PMf_MULTILINE)));
2017 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2020 r->anchored_substr = r->anchored_utf8 = Nullsv;
2021 SvREFCNT_dec(data.longest_fixed);
2022 longest_fixed_length = 0;
2025 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2026 r->regstclass = NULL;
2027 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2029 && !(data.start_class->flags & ANYOF_EOS)
2030 && !cl_is_anything(data.start_class))
2032 I32 n = add_data(pRExC_state, 1, "f");
2034 New(1006, RExC_rx->data->data[n], 1,
2035 struct regnode_charclass_class);
2036 StructCopy(data.start_class,
2037 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2038 struct regnode_charclass_class);
2039 r->regstclass = (regnode*)RExC_rx->data->data[n];
2040 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2041 PL_regdata = r->data; /* for regprop() */
2042 DEBUG_r({ SV *sv = sv_newmortal();
2043 regprop(sv, (regnode*)data.start_class);
2044 PerlIO_printf(Perl_debug_log,
2045 "synthetic stclass `%s'.\n",
2049 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2050 if (longest_fixed_length > longest_float_length) {
2051 r->check_substr = r->anchored_substr;
2052 r->check_utf8 = r->anchored_utf8;
2053 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2054 if (r->reganch & ROPT_ANCH_SINGLE)
2055 r->reganch |= ROPT_NOSCAN;
2058 r->check_substr = r->float_substr;
2059 r->check_utf8 = r->float_utf8;
2060 r->check_offset_min = data.offset_float_min;
2061 r->check_offset_max = data.offset_float_max;
2063 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2064 This should be changed ASAP! */
2065 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2066 r->reganch |= RE_USE_INTUIT;
2067 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2068 r->reganch |= RE_INTUIT_TAIL;
2072 /* Several toplevels. Best we can is to set minlen. */
2074 struct regnode_charclass_class ch_class;
2077 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2078 scan = r->program + 1;
2079 cl_init(pRExC_state, &ch_class);
2080 data.start_class = &ch_class;
2081 data.last_closep = &last_close;
2082 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2083 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2084 = r->float_substr = r->float_utf8 = Nullsv;
2085 if (!(data.start_class->flags & ANYOF_EOS)
2086 && !cl_is_anything(data.start_class))
2088 I32 n = add_data(pRExC_state, 1, "f");
2090 New(1006, RExC_rx->data->data[n], 1,
2091 struct regnode_charclass_class);
2092 StructCopy(data.start_class,
2093 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2094 struct regnode_charclass_class);
2095 r->regstclass = (regnode*)RExC_rx->data->data[n];
2096 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2097 DEBUG_r({ SV* sv = sv_newmortal();
2098 regprop(sv, (regnode*)data.start_class);
2099 PerlIO_printf(Perl_debug_log,
2100 "synthetic stclass `%s'.\n",
2106 if (RExC_seen & REG_SEEN_GPOS)
2107 r->reganch |= ROPT_GPOS_SEEN;
2108 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2109 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2110 if (RExC_seen & REG_SEEN_EVAL)
2111 r->reganch |= ROPT_EVAL_SEEN;
2112 if (RExC_seen & REG_SEEN_CANY)
2113 r->reganch |= ROPT_CANY_SEEN;
2114 Newz(1002, r->startp, RExC_npar, I32);
2115 Newz(1002, r->endp, RExC_npar, I32);
2116 PL_regdata = r->data; /* for regprop() */
2117 DEBUG_r(regdump(r));
2122 - reg - regular expression, i.e. main body or parenthesized thing
2124 * Caller must absorb opening parenthesis.
2126 * Combining parenthesis handling with the base level of regular expression
2127 * is a trifle forced, but the need to tie the tails of the branches to what
2128 * follows makes it hard to avoid.
2131 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2132 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2134 register regnode *ret; /* Will be the head of the group. */
2135 register regnode *br;
2136 register regnode *lastbr;
2137 register regnode *ender = 0;
2138 register I32 parno = 0;
2139 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2141 /* for (?g), (?gc), and (?o) warnings; warning
2142 about (?c) will warn about (?g) -- japhy */
2144 I32 wastedflags = 0x00,
2147 wasted_gc = 0x02 | 0x04,
2150 char * parse_start = RExC_parse; /* MJD */
2151 char *oregcomp_parse = RExC_parse;
2154 *flagp = 0; /* Tentatively. */
2157 /* Make an OPEN node, if parenthesized. */
2159 if (*RExC_parse == '?') { /* (?...) */
2160 U32 posflags = 0, negflags = 0;
2161 U32 *flagsp = &posflags;
2163 char *seqstart = RExC_parse;
2166 paren = *RExC_parse++;
2167 ret = NULL; /* For look-ahead/behind. */
2169 case '<': /* (?<...) */
2170 RExC_seen |= REG_SEEN_LOOKBEHIND;
2171 if (*RExC_parse == '!')
2173 if (*RExC_parse != '=' && *RExC_parse != '!')
2176 case '=': /* (?=...) */
2177 case '!': /* (?!...) */
2178 RExC_seen_zerolen++;
2179 case ':': /* (?:...) */
2180 case '>': /* (?>...) */
2182 case '$': /* (?$...) */
2183 case '@': /* (?@...) */
2184 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2186 case '#': /* (?#...) */
2187 while (*RExC_parse && *RExC_parse != ')')
2189 if (*RExC_parse != ')')
2190 FAIL("Sequence (?#... not terminated");
2191 nextchar(pRExC_state);
2194 case 'p': /* (?p...) */
2195 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2196 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2198 case '?': /* (??...) */
2200 if (*RExC_parse != '{')
2202 paren = *RExC_parse++;
2204 case '{': /* (?{...}) */
2206 I32 count = 1, n = 0;
2208 char *s = RExC_parse;
2210 OP_4tree *sop, *rop;
2212 RExC_seen_zerolen++;
2213 RExC_seen |= REG_SEEN_EVAL;
2214 while (count && (c = *RExC_parse)) {
2215 if (c == '\\' && RExC_parse[1])
2223 if (*RExC_parse != ')')
2226 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2231 if (RExC_parse - 1 - s)
2232 sv = newSVpvn(s, RExC_parse - 1 - s);
2234 sv = newSVpvn("", 0);
2237 Perl_save_re_context(aTHX);
2238 rop = sv_compile_2op(sv, &sop, "re", &pad);
2239 sop->op_private |= OPpREFCOUNTED;
2240 /* re_dup will OpREFCNT_inc */
2241 OpREFCNT_set(sop, 1);
2244 n = add_data(pRExC_state, 3, "nop");
2245 RExC_rx->data->data[n] = (void*)rop;
2246 RExC_rx->data->data[n+1] = (void*)sop;
2247 RExC_rx->data->data[n+2] = (void*)pad;
2250 else { /* First pass */
2251 if (PL_reginterp_cnt < ++RExC_seen_evals
2252 && PL_curcop != &PL_compiling)
2253 /* No compiled RE interpolated, has runtime
2254 components ===> unsafe. */
2255 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2256 if (PL_tainting && PL_tainted)
2257 FAIL("Eval-group in insecure regular expression");
2260 nextchar(pRExC_state);
2262 ret = reg_node(pRExC_state, LOGICAL);
2265 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2266 /* deal with the length of this later - MJD */
2269 ret = reganode(pRExC_state, EVAL, n);
2270 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2271 Set_Node_Offset(ret, parse_start);
2274 case '(': /* (?(?{...})...) and (?(?=...)...) */
2276 if (RExC_parse[0] == '?') { /* (?(?...)) */
2277 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2278 || RExC_parse[1] == '<'
2279 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2282 ret = reg_node(pRExC_state, LOGICAL);
2285 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2289 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2291 parno = atoi(RExC_parse++);
2293 while (isDIGIT(*RExC_parse))
2295 ret = reganode(pRExC_state, GROUPP, parno);
2297 if ((c = *nextchar(pRExC_state)) != ')')
2298 vFAIL("Switch condition not recognized");
2300 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2301 br = regbranch(pRExC_state, &flags, 1);
2303 br = reganode(pRExC_state, LONGJMP, 0);
2305 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2306 c = *nextchar(pRExC_state);
2310 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2311 regbranch(pRExC_state, &flags, 1);
2312 regtail(pRExC_state, ret, lastbr);
2315 c = *nextchar(pRExC_state);
2320 vFAIL("Switch (?(condition)... contains too many branches");
2321 ender = reg_node(pRExC_state, TAIL);
2322 regtail(pRExC_state, br, ender);
2324 regtail(pRExC_state, lastbr, ender);
2325 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2328 regtail(pRExC_state, ret, ender);
2332 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2336 RExC_parse--; /* for vFAIL to print correctly */
2337 vFAIL("Sequence (? incomplete");
2341 parse_flags: /* (?i) */
2342 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2343 /* (?g), (?gc) and (?o) are useless here
2344 and must be globally applied -- japhy */
2346 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2347 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2348 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2349 if (! (wastedflags & wflagbit) ) {
2350 wastedflags |= wflagbit;
2353 "Useless (%s%c) - %suse /%c modifier",
2354 flagsp == &negflags ? "?-" : "?",
2356 flagsp == &negflags ? "don't " : "",
2362 else if (*RExC_parse == 'c') {
2363 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2364 if (! (wastedflags & wasted_c) ) {
2365 wastedflags |= wasted_gc;
2368 "Useless (%sc) - %suse /gc modifier",
2369 flagsp == &negflags ? "?-" : "?",
2370 flagsp == &negflags ? "don't " : ""
2375 else { pmflag(flagsp, *RExC_parse); }
2379 if (*RExC_parse == '-') {
2381 wastedflags = 0; /* reset so (?g-c) warns twice */
2385 RExC_flags |= posflags;
2386 RExC_flags &= ~negflags;
2387 if (*RExC_parse == ':') {
2393 if (*RExC_parse != ')') {
2395 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2397 nextchar(pRExC_state);
2405 ret = reganode(pRExC_state, OPEN, parno);
2406 Set_Node_Length(ret, 1); /* MJD */
2407 Set_Node_Offset(ret, RExC_parse); /* MJD */
2414 /* Pick up the branches, linking them together. */
2415 parse_start = RExC_parse; /* MJD */
2416 br = regbranch(pRExC_state, &flags, 1);
2417 /* branch_len = (paren != 0); */
2421 if (*RExC_parse == '|') {
2422 if (!SIZE_ONLY && RExC_extralen) {
2423 reginsert(pRExC_state, BRANCHJ, br);
2426 reginsert(pRExC_state, BRANCH, br);
2427 Set_Node_Length(br, paren != 0);
2428 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2432 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2434 else if (paren == ':') {
2435 *flagp |= flags&SIMPLE;
2437 if (open) { /* Starts with OPEN. */
2438 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2440 else if (paren != '?') /* Not Conditional */
2442 *flagp |= flags & (SPSTART | HASWIDTH);
2444 while (*RExC_parse == '|') {
2445 if (!SIZE_ONLY && RExC_extralen) {
2446 ender = reganode(pRExC_state, LONGJMP,0);
2447 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2450 RExC_extralen += 2; /* Account for LONGJMP. */
2451 nextchar(pRExC_state);
2452 br = regbranch(pRExC_state, &flags, 0);
2456 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2460 *flagp |= flags&SPSTART;
2463 if (have_branch || paren != ':') {
2464 /* Make a closing node, and hook it on the end. */
2467 ender = reg_node(pRExC_state, TAIL);
2470 ender = reganode(pRExC_state, CLOSE, parno);
2471 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2472 Set_Node_Length(ender,1); /* MJD */
2478 *flagp &= ~HASWIDTH;
2481 ender = reg_node(pRExC_state, SUCCEED);
2484 ender = reg_node(pRExC_state, END);
2487 regtail(pRExC_state, lastbr, ender);
2490 /* Hook the tails of the branches to the closing node. */
2491 for (br = ret; br != NULL; br = regnext(br)) {
2492 regoptail(pRExC_state, br, ender);
2499 static char parens[] = "=!<,>";
2501 if (paren && (p = strchr(parens, paren))) {
2502 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2503 int flag = (p - parens) > 1;
2506 node = SUSPEND, flag = 0;
2507 reginsert(pRExC_state, node,ret);
2508 Set_Node_Offset(ret, oregcomp_parse);
2509 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 2);
2511 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2515 /* Check for proper termination. */
2517 RExC_flags = oregflags;
2518 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2519 RExC_parse = oregcomp_parse;
2520 vFAIL("Unmatched (");
2523 else if (!paren && RExC_parse < RExC_end) {
2524 if (*RExC_parse == ')') {
2526 vFAIL("Unmatched )");
2529 FAIL("Junk on end of regexp"); /* "Can't happen". */
2537 - regbranch - one alternative of an | operator
2539 * Implements the concatenation operator.
2542 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2544 register regnode *ret;
2545 register regnode *chain = NULL;
2546 register regnode *latest;
2547 I32 flags = 0, c = 0;
2552 if (!SIZE_ONLY && RExC_extralen)
2553 ret = reganode(pRExC_state, BRANCHJ,0);
2555 ret = reg_node(pRExC_state, BRANCH);
2556 Set_Node_Length(ret, 1);
2560 if (!first && SIZE_ONLY)
2561 RExC_extralen += 1; /* BRANCHJ */
2563 *flagp = WORST; /* Tentatively. */
2566 nextchar(pRExC_state);
2567 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2569 latest = regpiece(pRExC_state, &flags);
2570 if (latest == NULL) {
2571 if (flags & TRYAGAIN)
2575 else if (ret == NULL)
2577 *flagp |= flags&HASWIDTH;
2578 if (chain == NULL) /* First piece. */
2579 *flagp |= flags&SPSTART;
2582 regtail(pRExC_state, chain, latest);
2587 if (chain == NULL) { /* Loop ran zero times. */
2588 chain = reg_node(pRExC_state, NOTHING);
2593 *flagp |= flags&SIMPLE;
2600 - regpiece - something followed by possible [*+?]
2602 * Note that the branching code sequences used for ? and the general cases
2603 * of * and + are somewhat optimized: they use the same NOTHING node as
2604 * both the endmarker for their branch list and the body of the last branch.
2605 * It might seem that this node could be dispensed with entirely, but the
2606 * endmarker role is not redundant.
2609 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2611 register regnode *ret;
2613 register char *next;
2615 char *origparse = RExC_parse;
2618 I32 max = REG_INFTY;
2621 ret = regatom(pRExC_state, &flags);
2623 if (flags & TRYAGAIN)
2630 if (op == '{' && regcurly(RExC_parse)) {
2631 parse_start = RExC_parse; /* MJD */
2632 next = RExC_parse + 1;
2634 while (isDIGIT(*next) || *next == ',') {
2643 if (*next == '}') { /* got one */
2647 min = atoi(RExC_parse);
2651 maxpos = RExC_parse;
2653 if (!max && *maxpos != '0')
2654 max = REG_INFTY; /* meaning "infinity" */
2655 else if (max >= REG_INFTY)
2656 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2658 nextchar(pRExC_state);
2661 if ((flags&SIMPLE)) {
2662 RExC_naughty += 2 + RExC_naughty / 2;
2663 reginsert(pRExC_state, CURLY, ret);
2664 Set_Node_Offset(ret, parse_start+1); /* MJD */
2665 Set_Node_Cur_Length(ret);
2668 regnode *w = reg_node(pRExC_state, WHILEM);
2671 regtail(pRExC_state, ret, w);
2672 if (!SIZE_ONLY && RExC_extralen) {
2673 reginsert(pRExC_state, LONGJMP,ret);
2674 reginsert(pRExC_state, NOTHING,ret);
2675 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2677 reginsert(pRExC_state, CURLYX,ret);
2679 Set_Node_Offset(ret, parse_start+1);
2680 Set_Node_Length(ret,
2681 op == '{' ? (RExC_parse - parse_start) : 1);
2683 if (!SIZE_ONLY && RExC_extralen)
2684 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2685 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2687 RExC_whilem_seen++, RExC_extralen += 3;
2688 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2696 if (max && max < min)
2697 vFAIL("Can't do {n,m} with n > m");
2699 ARG1_SET(ret, (U16)min);
2700 ARG2_SET(ret, (U16)max);
2712 #if 0 /* Now runtime fix should be reliable. */
2714 /* if this is reinstated, don't forget to put this back into perldiag:
2716 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2718 (F) The part of the regexp subject to either the * or + quantifier
2719 could match an empty string. The {#} shows in the regular
2720 expression about where the problem was discovered.
2724 if (!(flags&HASWIDTH) && op != '?')
2725 vFAIL("Regexp *+ operand could be empty");
2728 parse_start = RExC_parse;
2729 nextchar(pRExC_state);
2731 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2733 if (op == '*' && (flags&SIMPLE)) {
2734 reginsert(pRExC_state, STAR, ret);
2738 else if (op == '*') {
2742 else if (op == '+' && (flags&SIMPLE)) {
2743 reginsert(pRExC_state, PLUS, ret);
2747 else if (op == '+') {
2751 else if (op == '?') {
2756 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2758 "%.*s matches null string many times",
2759 RExC_parse - origparse,
2763 if (*RExC_parse == '?') {
2764 nextchar(pRExC_state);
2765 reginsert(pRExC_state, MINMOD, ret);
2766 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2768 if (ISMULT2(RExC_parse)) {
2770 vFAIL("Nested quantifiers");
2777 - regatom - the lowest level
2779 * Optimization: gobbles an entire sequence of ordinary characters so that
2780 * it can turn them into a single node, which is smaller to store and
2781 * faster to run. Backslashed characters are exceptions, each becoming a
2782 * separate node; the code is simpler that way and it's not worth fixing.
2784 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2786 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2788 register regnode *ret = 0;
2790 char *parse_start = 0;
2792 *flagp = WORST; /* Tentatively. */
2795 switch (*RExC_parse) {
2797 RExC_seen_zerolen++;
2798 nextchar(pRExC_state);
2799 if (RExC_flags & PMf_MULTILINE)
2800 ret = reg_node(pRExC_state, MBOL);
2801 else if (RExC_flags & PMf_SINGLELINE)
2802 ret = reg_node(pRExC_state, SBOL);
2804 ret = reg_node(pRExC_state, BOL);
2805 Set_Node_Length(ret, 1); /* MJD */
2808 nextchar(pRExC_state);
2810 RExC_seen_zerolen++;
2811 if (RExC_flags & PMf_MULTILINE)
2812 ret = reg_node(pRExC_state, MEOL);
2813 else if (RExC_flags & PMf_SINGLELINE)
2814 ret = reg_node(pRExC_state, SEOL);
2816 ret = reg_node(pRExC_state, EOL);
2817 Set_Node_Length(ret, 1); /* MJD */
2820 nextchar(pRExC_state);
2821 if (RExC_flags & PMf_SINGLELINE)
2822 ret = reg_node(pRExC_state, SANY);
2824 ret = reg_node(pRExC_state, REG_ANY);
2825 *flagp |= HASWIDTH|SIMPLE;
2827 Set_Node_Length(ret, 1); /* MJD */
2831 char *oregcomp_parse = ++RExC_parse;
2832 ret = regclass(pRExC_state);
2833 if (*RExC_parse != ']') {
2834 RExC_parse = oregcomp_parse;
2835 vFAIL("Unmatched [");
2837 nextchar(pRExC_state);
2838 *flagp |= HASWIDTH|SIMPLE;
2839 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2843 nextchar(pRExC_state);
2844 ret = reg(pRExC_state, 1, &flags);
2846 if (flags & TRYAGAIN) {
2847 if (RExC_parse == RExC_end) {
2848 /* Make parent create an empty node if needed. */
2856 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2860 if (flags & TRYAGAIN) {
2864 vFAIL("Internal urp");
2865 /* Supposed to be caught earlier. */
2868 if (!regcurly(RExC_parse)) {
2877 vFAIL("Quantifier follows nothing");
2880 switch (*++RExC_parse) {
2882 RExC_seen_zerolen++;
2883 ret = reg_node(pRExC_state, SBOL);
2885 nextchar(pRExC_state);
2886 Set_Node_Length(ret, 2); /* MJD */
2889 ret = reg_node(pRExC_state, GPOS);
2890 RExC_seen |= REG_SEEN_GPOS;
2892 nextchar(pRExC_state);
2893 Set_Node_Length(ret, 2); /* MJD */
2896 ret = reg_node(pRExC_state, SEOL);
2898 RExC_seen_zerolen++; /* Do not optimize RE away */
2899 nextchar(pRExC_state);
2902 ret = reg_node(pRExC_state, EOS);
2904 RExC_seen_zerolen++; /* Do not optimize RE away */
2905 nextchar(pRExC_state);
2906 Set_Node_Length(ret, 2); /* MJD */
2909 ret = reg_node(pRExC_state, CANY);
2910 RExC_seen |= REG_SEEN_CANY;
2911 *flagp |= HASWIDTH|SIMPLE;
2912 nextchar(pRExC_state);
2913 Set_Node_Length(ret, 2); /* MJD */
2916 ret = reg_node(pRExC_state, CLUMP);
2918 nextchar(pRExC_state);
2919 Set_Node_Length(ret, 2); /* MJD */
2922 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
2923 *flagp |= HASWIDTH|SIMPLE;
2924 nextchar(pRExC_state);
2925 Set_Node_Length(ret, 2); /* MJD */
2928 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
2929 *flagp |= HASWIDTH|SIMPLE;
2930 nextchar(pRExC_state);
2931 Set_Node_Length(ret, 2); /* MJD */
2934 RExC_seen_zerolen++;
2935 RExC_seen |= REG_SEEN_LOOKBEHIND;
2936 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
2938 nextchar(pRExC_state);
2939 Set_Node_Length(ret, 2); /* MJD */
2942 RExC_seen_zerolen++;
2943 RExC_seen |= REG_SEEN_LOOKBEHIND;
2944 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
2946 nextchar(pRExC_state);
2947 Set_Node_Length(ret, 2); /* MJD */
2950 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
2951 *flagp |= HASWIDTH|SIMPLE;
2952 nextchar(pRExC_state);
2953 Set_Node_Length(ret, 2); /* MJD */
2956 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
2957 *flagp |= HASWIDTH|SIMPLE;
2958 nextchar(pRExC_state);
2959 Set_Node_Length(ret, 2); /* MJD */
2962 ret = reg_node(pRExC_state, DIGIT);
2963 *flagp |= HASWIDTH|SIMPLE;
2964 nextchar(pRExC_state);
2965 Set_Node_Length(ret, 2); /* MJD */
2968 ret = reg_node(pRExC_state, NDIGIT);
2969 *flagp |= HASWIDTH|SIMPLE;
2970 nextchar(pRExC_state);
2971 Set_Node_Length(ret, 2); /* MJD */
2976 char* oldregxend = RExC_end;
2977 char* parse_start = RExC_parse - 2;
2979 if (RExC_parse[1] == '{') {
2980 /* a lovely hack--pretend we saw [\pX] instead */
2981 RExC_end = strchr(RExC_parse, '}');
2983 U8 c = (U8)*RExC_parse;
2985 RExC_end = oldregxend;
2986 vFAIL2("Missing right brace on \\%c{}", c);
2991 RExC_end = RExC_parse + 2;
2992 if (RExC_end > oldregxend)
2993 RExC_end = oldregxend;
2997 ret = regclass(pRExC_state);
2999 RExC_end = oldregxend;
3002 Set_Node_Offset(ret, parse_start + 2);
3003 Set_Node_Cur_Length(ret);
3004 nextchar(pRExC_state);
3005 *flagp |= HASWIDTH|SIMPLE;
3018 case '1': case '2': case '3': case '4':
3019 case '5': case '6': case '7': case '8': case '9':
3021 I32 num = atoi(RExC_parse);
3023 if (num > 9 && num >= RExC_npar)
3026 char * parse_start = RExC_parse - 1; /* MJD */
3027 while (isDIGIT(*RExC_parse))
3030 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3031 vFAIL("Reference to nonexistent group");
3033 ret = reganode(pRExC_state,
3034 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3038 /* override incorrect value set in reganode MJD */
3039 Set_Node_Offset(ret, parse_start+1);
3040 Set_Node_Cur_Length(ret); /* MJD */
3042 nextchar(pRExC_state);
3047 if (RExC_parse >= RExC_end)
3048 FAIL("Trailing \\");
3051 /* Do not generate `unrecognized' warnings here, we fall
3052 back into the quick-grab loop below */
3058 if (RExC_flags & PMf_EXTENDED) {
3059 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3060 if (RExC_parse < RExC_end)
3066 register STRLEN len;
3072 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3074 parse_start = RExC_parse - 1;
3080 ret = reg_node(pRExC_state,
3081 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3083 for (len = 0, p = RExC_parse - 1;
3084 len < 127 && p < RExC_end;
3089 if (RExC_flags & PMf_EXTENDED)
3090 p = regwhite(p, RExC_end);
3137 ender = ASCII_TO_NATIVE('\033');
3141 ender = ASCII_TO_NATIVE('\007');
3146 char* e = strchr(p, '}');
3150 vFAIL("Missing right brace on \\x{}");
3153 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3154 | PERL_SCAN_DISALLOW_PREFIX;
3156 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3159 /* numlen is generous */
3160 if (numlen + len >= 127) {
3168 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3170 ender = grok_hex(p, &numlen, &flags, NULL);
3176 ender = UCHARAT(p++);
3177 ender = toCTRL(ender);
3179 case '0': case '1': case '2': case '3':case '4':
3180 case '5': case '6': case '7': case '8':case '9':
3182 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3185 ender = grok_oct(p, &numlen, &flags, NULL);
3195 FAIL("Trailing \\");
3198 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3199 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3200 goto normal_default;
3205 if (UTF8_IS_START(*p) && UTF) {
3206 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3214 if (RExC_flags & PMf_EXTENDED)
3215 p = regwhite(p, RExC_end);
3217 /* Prime the casefolded buffer. */
3218 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3220 if (ISMULT2(p)) { /* Back off on ?+*. */
3227 /* Emit all the Unicode characters. */
3228 for (foldbuf = tmpbuf;
3230 foldlen -= numlen) {
3231 ender = utf8_to_uvchr(foldbuf, &numlen);
3233 reguni(pRExC_state, ender, s, &unilen);
3236 /* In EBCDIC the numlen
3237 * and unilen can differ. */
3239 if (numlen >= foldlen)
3243 break; /* "Can't happen." */
3247 reguni(pRExC_state, ender, s, &unilen);
3256 REGC((char)ender, s++);
3264 /* Emit all the Unicode characters. */
3265 for (foldbuf = tmpbuf;
3267 foldlen -= numlen) {
3268 ender = utf8_to_uvchr(foldbuf, &numlen);
3270 reguni(pRExC_state, ender, s, &unilen);
3273 /* In EBCDIC the numlen
3274 * and unilen can differ. */
3276 if (numlen >= foldlen)
3284 reguni(pRExC_state, ender, s, &unilen);
3293 REGC((char)ender, s++);
3297 Set_Node_Cur_Length(ret); /* MJD */
3298 nextchar(pRExC_state);
3300 /* len is STRLEN which is unsigned, need to copy to signed */
3303 vFAIL("Internal disaster");
3312 RExC_size += STR_SZ(len);
3314 RExC_emit += STR_SZ(len);
3319 /* If the encoding pragma is in effect recode the text of
3320 * any EXACT-kind nodes. */
3321 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3322 STRLEN oldlen = STR_LEN(ret);
3323 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3327 if (sv_utf8_downgrade(sv, TRUE)) {
3328 char *s = sv_recode_to_utf8(sv, PL_encoding);
3329 STRLEN newlen = SvCUR(sv);
3334 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3335 (int)oldlen, STRING(ret),
3337 Copy(s, STRING(ret), newlen, char);
3338 STR_LEN(ret) += newlen - oldlen;
3339 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3341 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3349 S_regwhite(pTHX_ char *p, char *e)
3354 else if (*p == '#') {
3357 } while (p < e && *p != '\n');
3365 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3366 Character classes ([:foo:]) can also be negated ([:^foo:]).
3367 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3368 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3369 but trigger failures because they are currently unimplemented. */
3371 #define POSIXCC_DONE(c) ((c) == ':')
3372 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3373 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3376 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3379 I32 namedclass = OOB_NAMEDCLASS;
3381 if (value == '[' && RExC_parse + 1 < RExC_end &&
3382 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3383 POSIXCC(UCHARAT(RExC_parse))) {
3384 char c = UCHARAT(RExC_parse);
3385 char* s = RExC_parse++;
3387 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3389 if (RExC_parse == RExC_end)
3390 /* Grandfather lone [:, [=, [. */
3393 char* t = RExC_parse++; /* skip over the c */
3395 if (UCHARAT(RExC_parse) == ']') {
3396 RExC_parse++; /* skip over the ending ] */
3399 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3400 I32 skip = 5; /* the most common skip */
3404 if (strnEQ(posixcc, "alnum", 5))
3406 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3407 else if (strnEQ(posixcc, "alpha", 5))
3409 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3410 else if (strnEQ(posixcc, "ascii", 5))
3412 complement ? ANYOF_NASCII : ANYOF_ASCII;
3415 if (strnEQ(posixcc, "blank", 5))
3417 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3420 if (strnEQ(posixcc, "cntrl", 5))
3422 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3425 if (strnEQ(posixcc, "digit", 5))
3427 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3430 if (strnEQ(posixcc, "graph", 5))
3432 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3435 if (strnEQ(posixcc, "lower", 5))
3437 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3440 if (strnEQ(posixcc, "print", 5))
3442 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3443 else if (strnEQ(posixcc, "punct", 5))
3445 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3448 if (strnEQ(posixcc, "space", 5))
3450 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3453 if (strnEQ(posixcc, "upper", 5))
3455 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3457 case 'w': /* this is not POSIX, this is the Perl \w */
3458 if (strnEQ(posixcc, "word", 4)) {
3460 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3465 if (strnEQ(posixcc, "xdigit", 6)) {
3467 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3472 if (namedclass == OOB_NAMEDCLASS ||
3473 posixcc[skip] != ':' ||
3474 posixcc[skip+1] != ']')
3476 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3479 } else if (!SIZE_ONLY) {
3480 /* [[=foo=]] and [[.foo.]] are still future. */
3482 /* adjust RExC_parse so the warning shows after
3484 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3486 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3489 /* Maternal grandfather:
3490 * "[:" ending in ":" but not in ":]" */
3500 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3502 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3503 char *s = RExC_parse;
3506 while(*s && isALNUM(*s))
3508 if (*s && c == *s && s[1] == ']') {
3509 if (ckWARN(WARN_REGEXP))
3511 "POSIX syntax [%c %c] belongs inside character classes",
3514 /* [[=foo=]] and [[.foo.]] are still future. */
3515 if (POSIXCC_NOTYET(c)) {
3516 /* adjust RExC_parse so the error shows after
3518 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3520 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3527 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3530 register UV nextvalue;
3531 register IV prevvalue = OOB_UNICODE;
3532 register IV range = 0;
3533 register regnode *ret;
3536 char *rangebegin = 0;
3537 bool need_class = 0;
3538 SV *listsv = Nullsv;
3541 bool optimize_invert = TRUE;
3542 AV* unicode_alternate = 0;
3544 UV literal_endpoint = 0;
3547 ret = reganode(pRExC_state, ANYOF, 0);
3550 ANYOF_FLAGS(ret) = 0;
3552 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3556 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3560 RExC_size += ANYOF_SKIP;
3562 RExC_emit += ANYOF_SKIP;
3564 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3566 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3567 ANYOF_BITMAP_ZERO(ret);
3568 listsv = newSVpvn("# comment\n", 10);
3571 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3573 if (!SIZE_ONLY && POSIXCC(nextvalue))
3574 checkposixcc(pRExC_state);
3576 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3577 if (UCHARAT(RExC_parse) == ']')
3580 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3584 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3587 rangebegin = RExC_parse;
3589 value = utf8n_to_uvchr((U8*)RExC_parse,
3590 RExC_end - RExC_parse,
3592 RExC_parse += numlen;
3595 value = UCHARAT(RExC_parse++);
3596 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3597 if (value == '[' && POSIXCC(nextvalue))
3598 namedclass = regpposixcc(pRExC_state, value);
3599 else if (value == '\\') {
3601 value = utf8n_to_uvchr((U8*)RExC_parse,
3602 RExC_end - RExC_parse,
3604 RExC_parse += numlen;
3607 value = UCHARAT(RExC_parse++);
3608 /* Some compilers cannot handle switching on 64-bit integer
3609 * values, therefore value cannot be an UV. Yes, this will
3610 * be a problem later if we want switch on Unicode.
3611 * A similar issue a little bit later when switching on
3612 * namedclass. --jhi */
3613 switch ((I32)value) {
3614 case 'w': namedclass = ANYOF_ALNUM; break;
3615 case 'W': namedclass = ANYOF_NALNUM; break;
3616 case 's': namedclass = ANYOF_SPACE; break;
3617 case 'S': namedclass = ANYOF_NSPACE; break;
3618 case 'd': namedclass = ANYOF_DIGIT; break;
3619 case 'D': namedclass = ANYOF_NDIGIT; break;
3622 if (RExC_parse >= RExC_end)
3623 vFAIL2("Empty \\%c{}", (U8)value);
3624 if (*RExC_parse == '{') {
3626 e = strchr(RExC_parse++, '}');
3628 vFAIL2("Missing right brace on \\%c{}", c);
3629 while (isSPACE(UCHARAT(RExC_parse)))
3631 if (e == RExC_parse)
3632 vFAIL2("Empty \\%c{}", c);
3634 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3642 if (UCHARAT(RExC_parse) == '^') {
3645 value = value == 'p' ? 'P' : 'p'; /* toggle */
3646 while (isSPACE(UCHARAT(RExC_parse))) {
3652 Perl_sv_catpvf(aTHX_ listsv,
3653 "+utf8::%.*s\n", (int)n, RExC_parse);
3655 Perl_sv_catpvf(aTHX_ listsv,
3656 "!utf8::%.*s\n", (int)n, RExC_parse);
3659 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3661 case 'n': value = '\n'; break;
3662 case 'r': value = '\r'; break;
3663 case 't': value = '\t'; break;
3664 case 'f': value = '\f'; break;
3665 case 'b': value = '\b'; break;
3666 case 'e': value = ASCII_TO_NATIVE('\033');break;
3667 case 'a': value = ASCII_TO_NATIVE('\007');break;
3669 if (*RExC_parse == '{') {
3670 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3671 | PERL_SCAN_DISALLOW_PREFIX;
3672 e = strchr(RExC_parse++, '}');
3674 vFAIL("Missing right brace on \\x{}");
3676 numlen = e - RExC_parse;
3677 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3681 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3683 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3684 RExC_parse += numlen;
3688 value = UCHARAT(RExC_parse++);
3689 value = toCTRL(value);
3691 case '0': case '1': case '2': case '3': case '4':
3692 case '5': case '6': case '7': case '8': case '9':
3696 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3697 RExC_parse += numlen;
3701 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3703 "Unrecognized escape \\%c in character class passed through",
3707 } /* end of \blah */
3713 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3715 if (!SIZE_ONLY && !need_class)
3716 ANYOF_CLASS_ZERO(ret);
3720 /* a bad range like a-\d, a-[:digit:] ? */
3723 if (ckWARN(WARN_REGEXP))
3725 "False [] range \"%*.*s\"",
3726 RExC_parse - rangebegin,
3727 RExC_parse - rangebegin,
3729 if (prevvalue < 256) {
3730 ANYOF_BITMAP_SET(ret, prevvalue);
3731 ANYOF_BITMAP_SET(ret, '-');
3734 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3735 Perl_sv_catpvf(aTHX_ listsv,
3736 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3740 range = 0; /* this was not a true range */
3744 if (namedclass > OOB_NAMEDCLASS)
3745 optimize_invert = FALSE;
3746 /* Possible truncation here but in some 64-bit environments
3747 * the compiler gets heartburn about switch on 64-bit values.
3748 * A similar issue a little earlier when switching on value.
3750 switch ((I32)namedclass) {
3753 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3755 for (value = 0; value < 256; value++)
3757 ANYOF_BITMAP_SET(ret, value);
3759 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
3763 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3765 for (value = 0; value < 256; value++)
3766 if (!isALNUM(value))
3767 ANYOF_BITMAP_SET(ret, value);
3769 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3773 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3775 for (value = 0; value < 256; value++)
3776 if (isALNUMC(value))
3777 ANYOF_BITMAP_SET(ret, value);
3779 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3783 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3785 for (value = 0; value < 256; value++)
3786 if (!isALNUMC(value))
3787 ANYOF_BITMAP_SET(ret, value);
3789 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3793 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3795 for (value = 0; value < 256; value++)
3797 ANYOF_BITMAP_SET(ret, value);
3799 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3803 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3805 for (value = 0; value < 256; value++)
3806 if (!isALPHA(value))
3807 ANYOF_BITMAP_SET(ret, value);
3809 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3813 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3816 for (value = 0; value < 128; value++)
3817 ANYOF_BITMAP_SET(ret, value);
3819 for (value = 0; value < 256; value++) {
3821 ANYOF_BITMAP_SET(ret, value);
3825 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3829 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3832 for (value = 128; value < 256; value++)
3833 ANYOF_BITMAP_SET(ret, value);
3835 for (value = 0; value < 256; value++) {
3836 if (!isASCII(value))
3837 ANYOF_BITMAP_SET(ret, value);
3841 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3845 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3847 for (value = 0; value < 256; value++)
3849 ANYOF_BITMAP_SET(ret, value);
3851 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3855 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3857 for (value = 0; value < 256; value++)
3858 if (!isBLANK(value))
3859 ANYOF_BITMAP_SET(ret, value);
3861 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3865 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3867 for (value = 0; value < 256; value++)
3869 ANYOF_BITMAP_SET(ret, value);
3871 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3875 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3877 for (value = 0; value < 256; value++)
3878 if (!isCNTRL(value))
3879 ANYOF_BITMAP_SET(ret, value);
3881 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3885 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3887 /* consecutive digits assumed */
3888 for (value = '0'; value <= '9'; value++)
3889 ANYOF_BITMAP_SET(ret, value);
3891 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3895 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3897 /* consecutive digits assumed */
3898 for (value = 0; value < '0'; value++)
3899 ANYOF_BITMAP_SET(ret, value);
3900 for (value = '9' + 1; value < 256; value++)
3901 ANYOF_BITMAP_SET(ret, value);
3903 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3907 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3909 for (value = 0; value < 256; value++)
3911 ANYOF_BITMAP_SET(ret, value);
3913 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3917 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3919 for (value = 0; value < 256; value++)
3920 if (!isGRAPH(value))
3921 ANYOF_BITMAP_SET(ret, value);
3923 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3927 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3929 for (value = 0; value < 256; value++)
3931 ANYOF_BITMAP_SET(ret, value);
3933 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3937 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3939 for (value = 0; value < 256; value++)
3940 if (!isLOWER(value))
3941 ANYOF_BITMAP_SET(ret, value);
3943 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3947 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3949 for (value = 0; value < 256; value++)
3951 ANYOF_BITMAP_SET(ret, value);
3953 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3957 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3959 for (value = 0; value < 256; value++)
3960 if (!isPRINT(value))
3961 ANYOF_BITMAP_SET(ret, value);
3963 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3967 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3969 for (value = 0; value < 256; value++)
3970 if (isPSXSPC(value))
3971 ANYOF_BITMAP_SET(ret, value);
3973 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3977 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3979 for (value = 0; value < 256; value++)
3980 if (!isPSXSPC(value))
3981 ANYOF_BITMAP_SET(ret, value);
3983 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3987 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3989 for (value = 0; value < 256; value++)
3991 ANYOF_BITMAP_SET(ret, value);
3993 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3997 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3999 for (value = 0; value < 256; value++)
4000 if (!isPUNCT(value))
4001 ANYOF_BITMAP_SET(ret, value);
4003 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
4007 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4009 for (value = 0; value < 256; value++)
4011 ANYOF_BITMAP_SET(ret, value);
4013 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
4017 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4019 for (value = 0; value < 256; value++)
4020 if (!isSPACE(value))
4021 ANYOF_BITMAP_SET(ret, value);
4023 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4027 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4029 for (value = 0; value < 256; value++)
4031 ANYOF_BITMAP_SET(ret, value);
4033 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4037 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4039 for (value = 0; value < 256; value++)
4040 if (!isUPPER(value))
4041 ANYOF_BITMAP_SET(ret, value);
4043 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4047 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4049 for (value = 0; value < 256; value++)
4050 if (isXDIGIT(value))
4051 ANYOF_BITMAP_SET(ret, value);
4053 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4057 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4059 for (value = 0; value < 256; value++)
4060 if (!isXDIGIT(value))
4061 ANYOF_BITMAP_SET(ret, value);
4063 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4066 vFAIL("Invalid [::] class");
4070 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4073 } /* end of namedclass \blah */
4076 if (prevvalue > (IV)value) /* b-a */ {
4077 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4078 RExC_parse - rangebegin,
4079 RExC_parse - rangebegin,
4081 range = 0; /* not a valid range */
4085 prevvalue = value; /* save the beginning of the range */
4086 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4087 RExC_parse[1] != ']') {
4090 /* a bad range like \w-, [:word:]- ? */
4091 if (namedclass > OOB_NAMEDCLASS) {
4092 if (ckWARN(WARN_REGEXP))
4094 "False [] range \"%*.*s\"",
4095 RExC_parse - rangebegin,
4096 RExC_parse - rangebegin,
4099 ANYOF_BITMAP_SET(ret, '-');
4101 range = 1; /* yeah, it's a range! */
4102 continue; /* but do it the next time */
4106 /* now is the next time */
4110 if (prevvalue < 256) {
4111 IV ceilvalue = value < 256 ? value : 255;
4114 /* In EBCDIC [\x89-\x91] should include
4115 * the \x8e but [i-j] should not. */
4116 if (literal_endpoint == 2 &&
4117 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4118 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4120 if (isLOWER(prevvalue)) {
4121 for (i = prevvalue; i <= ceilvalue; i++)
4123 ANYOF_BITMAP_SET(ret, i);
4125 for (i = prevvalue; i <= ceilvalue; i++)
4127 ANYOF_BITMAP_SET(ret, i);
4132 for (i = prevvalue; i <= ceilvalue; i++)
4133 ANYOF_BITMAP_SET(ret, i);
4135 if (value > 255 || UTF) {
4136 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4137 UV natvalue = NATIVE_TO_UNI(value);
4139 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4140 if (prevnatvalue < natvalue) { /* what about > ? */
4141 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4142 prevnatvalue, natvalue);
4144 else if (prevnatvalue == natvalue) {
4145 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4147 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4149 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4151 /* If folding and foldable and a single
4152 * character, insert also the folded version
4153 * to the charclass. */
4155 if (foldlen == (STRLEN)UNISKIP(f))
4156 Perl_sv_catpvf(aTHX_ listsv,
4159 /* Any multicharacter foldings
4160 * require the following transform:
4161 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4162 * where E folds into "pq" and F folds
4163 * into "rst", all other characters
4164 * fold to single characters. We save
4165 * away these multicharacter foldings,
4166 * to be later saved as part of the
4167 * additional "s" data. */
4170 if (!unicode_alternate)
4171 unicode_alternate = newAV();
4172 sv = newSVpvn((char*)foldbuf, foldlen);
4174 av_push(unicode_alternate, sv);
4178 /* If folding and the value is one of the Greek
4179 * sigmas insert a few more sigmas to make the
4180 * folding rules of the sigmas to work right.
4181 * Note that not all the possible combinations
4182 * are handled here: some of them are handled
4183 * by the standard folding rules, and some of
4184 * them (literal or EXACTF cases) are handled
4185 * during runtime in regexec.c:S_find_byclass(). */
4186 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4187 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4188 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4189 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4190 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4192 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4193 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4194 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4199 literal_endpoint = 0;
4203 range = 0; /* this range (if it was one) is done now */
4207 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4209 RExC_size += ANYOF_CLASS_ADD_SKIP;
4211 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4214 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4216 /* If the only flag is folding (plus possibly inversion). */
4217 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4219 for (value = 0; value < 256; ++value) {
4220 if (ANYOF_BITMAP_TEST(ret, value)) {
4221 UV fold = PL_fold[value];
4224 ANYOF_BITMAP_SET(ret, fold);
4227 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4230 /* optimize inverted simple patterns (e.g. [^a-z]) */
4231 if (!SIZE_ONLY && optimize_invert &&
4232 /* If the only flag is inversion. */
4233 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4234 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4235 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4236 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4243 /* The 0th element stores the character class description
4244 * in its textual form: used later (regexec.c:Perl_regclass_swash())
4245 * to initialize the appropriate swash (which gets stored in
4246 * the 1st element), and also useful for dumping the regnode.
4247 * The 2nd element stores the multicharacter foldings,
4248 * used later (regexec.c:S_reginclass()). */
4249 av_store(av, 0, listsv);
4250 av_store(av, 1, NULL);
4251 av_store(av, 2, (SV*)unicode_alternate);
4252 rv = newRV_noinc((SV*)av);
4253 n = add_data(pRExC_state, 1, "s");
4254 RExC_rx->data->data[n] = (void*)rv;
4262 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4264 char* retval = RExC_parse++;
4267 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4268 RExC_parse[2] == '#') {
4269 while (*RExC_parse && *RExC_parse != ')')
4274 if (RExC_flags & PMf_EXTENDED) {
4275 if (isSPACE(*RExC_parse)) {
4279 else if (*RExC_parse == '#') {
4280 while (*RExC_parse && *RExC_parse != '\n')
4291 - reg_node - emit a node
4293 STATIC regnode * /* Location. */
4294 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4296 register regnode *ret;
4297 register regnode *ptr;
4301 SIZE_ALIGN(RExC_size);
4306 NODE_ALIGN_FILL(ret);
4308 FILL_ADVANCE_NODE(ptr, op);
4309 if (RExC_offsets) { /* MJD */
4310 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4311 "reg_node", __LINE__,
4313 RExC_emit - RExC_emit_start > RExC_offsets[0]
4314 ? "Overwriting end of array!\n" : "OK",
4315 RExC_emit - RExC_emit_start,
4316 RExC_parse - RExC_start,
4318 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4327 - reganode - emit a node with an argument
4329 STATIC regnode * /* Location. */
4330 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4332 register regnode *ret;
4333 register regnode *ptr;
4337 SIZE_ALIGN(RExC_size);
4342 NODE_ALIGN_FILL(ret);
4344 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4345 if (RExC_offsets) { /* MJD */
4346 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4350 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4351 "Overwriting end of array!\n" : "OK",
4352 RExC_emit - RExC_emit_start,
4353 RExC_parse - RExC_start,
4355 Set_Cur_Node_Offset;
4364 - reguni - emit (if appropriate) a Unicode character
4367 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4369 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4373 - reginsert - insert an operator in front of already-emitted operand
4375 * Means relocating the operand.
4378 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4380 register regnode *src;
4381 register regnode *dst;
4382 register regnode *place;
4383 register int offset = regarglen[(U8)op];
4385 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4388 RExC_size += NODE_STEP_REGNODE + offset;
4393 RExC_emit += NODE_STEP_REGNODE + offset;
4395 while (src > opnd) {
4396 StructCopy(--src, --dst, regnode);
4397 if (RExC_offsets) { /* MJD 20010112 */
4398 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4402 dst - RExC_emit_start > RExC_offsets[0]
4403 ? "Overwriting end of array!\n" : "OK",
4404 src - RExC_emit_start,
4405 dst - RExC_emit_start,
4407 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4408 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4413 place = opnd; /* Op node, where operand used to be. */
4414 if (RExC_offsets) { /* MJD */
4415 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4419 place - RExC_emit_start > RExC_offsets[0]
4420 ? "Overwriting end of array!\n" : "OK",
4421 place - RExC_emit_start,
4422 RExC_parse - RExC_start,
4424 Set_Node_Offset(place, RExC_parse);
4426 src = NEXTOPER(place);
4427 FILL_ADVANCE_NODE(place, op);
4428 Zero(src, offset, regnode);
4432 - regtail - set the next-pointer at the end of a node chain of p to val.
4435 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4437 register regnode *scan;
4438 register regnode *temp;
4443 /* Find last node. */
4446 temp = regnext(scan);
4452 if (reg_off_by_arg[OP(scan)]) {
4453 ARG_SET(scan, val - scan);
4456 NEXT_OFF(scan) = val - scan;
4461 - regoptail - regtail on operand of first argument; nop if operandless
4464 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4466 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4467 if (p == NULL || SIZE_ONLY)
4469 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4470 regtail(pRExC_state, NEXTOPER(p), val);
4472 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4473 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4480 - regcurly - a little FSA that accepts {\d+,?\d*}
4483 S_regcurly(pTHX_ register char *s)
4504 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4506 register U8 op = EXACT; /* Arbitrary non-END op. */
4507 register regnode *next;
4509 while (op != END && (!last || node < last)) {
4510 /* While that wasn't END last time... */
4516 next = regnext(node);
4518 if (OP(node) == OPTIMIZED)
4521 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4522 (int)(2*l + 1), "", SvPVX(sv));
4523 if (next == NULL) /* Next ptr. */
4524 PerlIO_printf(Perl_debug_log, "(0)");
4526 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4527 (void)PerlIO_putc(Perl_debug_log, '\n');
4529 if (PL_regkind[(U8)op] == BRANCHJ) {
4530 register regnode *nnode = (OP(next) == LONGJMP
4533 if (last && nnode > last)
4535 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4537 else if (PL_regkind[(U8)op] == BRANCH) {
4538 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4540 else if ( op == CURLY) { /* `next' might be very big: optimizer */
4541 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4542 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4544 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4545 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4548 else if ( op == PLUS || op == STAR) {
4549 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4551 else if (op == ANYOF) {
4552 /* arglen 1 + class block */
4553 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4554 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4555 node = NEXTOPER(node);
4557 else if (PL_regkind[(U8)op] == EXACT) {
4558 /* Literal string, where present. */
4559 node += NODE_SZ_STR(node) - 1;
4560 node = NEXTOPER(node);
4563 node = NEXTOPER(node);
4564 node += regarglen[(U8)op];
4566 if (op == CURLYX || op == OPEN)
4568 else if (op == WHILEM)
4574 #endif /* DEBUGGING */
4577 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4580 Perl_regdump(pTHX_ regexp *r)
4583 SV *sv = sv_newmortal();
4585 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4587 /* Header fields of interest. */
4588 if (r->anchored_substr)
4589 PerlIO_printf(Perl_debug_log,
4590 "anchored `%s%.*s%s'%s at %"IVdf" ",
4592 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4593 SvPVX(r->anchored_substr),
4595 SvTAIL(r->anchored_substr) ? "$" : "",
4596 (IV)r->anchored_offset);
4597 else if (r->anchored_utf8)
4598 PerlIO_printf(Perl_debug_log,
4599 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4601 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4602 SvPVX(r->anchored_utf8),
4604 SvTAIL(r->anchored_utf8) ? "$" : "",
4605 (IV)r->anchored_offset);
4606 if (r->float_substr)
4607 PerlIO_printf(Perl_debug_log,
4608 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4610 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4611 SvPVX(r->float_substr),
4613 SvTAIL(r->float_substr) ? "$" : "",
4614 (IV)r->float_min_offset, (UV)r->float_max_offset);
4615 else if (r->float_utf8)
4616 PerlIO_printf(Perl_debug_log,
4617 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4619 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4620 SvPVX(r->float_utf8),
4622 SvTAIL(r->float_utf8) ? "$" : "",
4623 (IV)r->float_min_offset, (UV)r->float_max_offset);
4624 if (r->check_substr || r->check_utf8)
4625 PerlIO_printf(Perl_debug_log,
4626 r->check_substr == r->float_substr
4627 && r->check_utf8 == r->float_utf8
4628 ? "(checking floating" : "(checking anchored");
4629 if (r->reganch & ROPT_NOSCAN)
4630 PerlIO_printf(Perl_debug_log, " noscan");
4631 if (r->reganch & ROPT_CHECK_ALL)
4632 PerlIO_printf(Perl_debug_log, " isall");
4633 if (r->check_substr || r->check_utf8)
4634 PerlIO_printf(Perl_debug_log, ") ");
4636 if (r->regstclass) {
4637 regprop(sv, r->regstclass);
4638 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4640 if (r->reganch & ROPT_ANCH) {
4641 PerlIO_printf(Perl_debug_log, "anchored");
4642 if (r->reganch & ROPT_ANCH_BOL)
4643 PerlIO_printf(Perl_debug_log, "(BOL)");
4644 if (r->reganch & ROPT_ANCH_MBOL)
4645 PerlIO_printf(Perl_debug_log, "(MBOL)");
4646 if (r->reganch & ROPT_ANCH_SBOL)
4647 PerlIO_printf(Perl_debug_log, "(SBOL)");
4648 if (r->reganch & ROPT_ANCH_GPOS)
4649 PerlIO_printf(Perl_debug_log, "(GPOS)");
4650 PerlIO_putc(Perl_debug_log, ' ');
4652 if (r->reganch & ROPT_GPOS_SEEN)
4653 PerlIO_printf(Perl_debug_log, "GPOS ");
4654 if (r->reganch & ROPT_SKIP)
4655 PerlIO_printf(Perl_debug_log, "plus ");
4656 if (r->reganch & ROPT_IMPLICIT)
4657 PerlIO_printf(Perl_debug_log, "implicit ");
4658 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4659 if (r->reganch & ROPT_EVAL_SEEN)
4660 PerlIO_printf(Perl_debug_log, "with eval ");
4661 PerlIO_printf(Perl_debug_log, "\n");
4664 U32 len = r->offsets[0];
4665 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4666 for (i = 1; i <= len; i++)
4667 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4668 (UV)r->offsets[i*2-1],
4669 (UV)r->offsets[i*2]);
4670 PerlIO_printf(Perl_debug_log, "\n");
4672 #endif /* DEBUGGING */
4678 S_put_byte(pTHX_ SV *sv, int c)
4680 if (isCNTRL(c) || c == 255 || !isPRINT(c))
4681 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4682 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4683 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4685 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4688 #endif /* DEBUGGING */
4691 - regprop - printable representation of opcode
4694 Perl_regprop(pTHX_ SV *sv, regnode *o)
4699 sv_setpvn(sv, "", 0);
4700 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4701 /* It would be nice to FAIL() here, but this may be called from
4702 regexec.c, and it would be hard to supply pRExC_state. */
4703 Perl_croak(aTHX_ "Corrupted regexp opcode");
4704 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4706 k = PL_regkind[(U8)OP(o)];
4709 SV *dsv = sv_2mortal(newSVpvn("", 0));
4710 /* Using is_utf8_string() is a crude hack but it may
4711 * be the best for now since we have no flag "this EXACTish
4712 * node was UTF-8" --jhi */
4713 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4715 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4716 UNI_DISPLAY_REGEX) :
4721 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4726 else if (k == CURLY) {
4727 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4728 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4729 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4731 else if (k == WHILEM && o->flags) /* Ordinal/of */
4732 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4733 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4734 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4735 else if (k == LOGICAL)
4736 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4737 else if (k == ANYOF) {
4738 int i, rangestart = -1;
4739 U8 flags = ANYOF_FLAGS(o);
4740 const char * const anyofs[] = { /* Should be synchronized with
4741 * ANYOF_ #xdefines in regcomp.h */
4774 if (flags & ANYOF_LOCALE)
4775 sv_catpv(sv, "{loc}");
4776 if (flags & ANYOF_FOLD)
4777 sv_catpv(sv, "{i}");
4778 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4779 if (flags & ANYOF_INVERT)
4781 for (i = 0; i <= 256; i++) {
4782 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4783 if (rangestart == -1)
4785 } else if (rangestart != -1) {
4786 if (i <= rangestart + 3)
4787 for (; rangestart < i; rangestart++)
4788 put_byte(sv, rangestart);
4790 put_byte(sv, rangestart);
4792 put_byte(sv, i - 1);
4798 if (o->flags & ANYOF_CLASS)
4799 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4800 if (ANYOF_CLASS_TEST(o,i))
4801 sv_catpv(sv, anyofs[i]);
4803 if (flags & ANYOF_UNICODE)
4804 sv_catpv(sv, "{unicode}");
4805 else if (flags & ANYOF_UNICODE_ALL)
4806 sv_catpv(sv, "{unicode_all}");
4810 SV *sw = regclass_swash(o, FALSE, &lv, 0);
4814 U8 s[UTF8_MAXLEN+1];
4816 for (i = 0; i <= 256; i++) { /* just the first 256 */
4817 U8 *e = uvchr_to_utf8(s, i);
4819 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4820 if (rangestart == -1)
4822 } else if (rangestart != -1) {
4825 if (i <= rangestart + 3)
4826 for (; rangestart < i; rangestart++) {
4827 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4831 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4834 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4841 sv_catpv(sv, "..."); /* et cetera */
4845 char *s = savepv(SvPVX(lv));
4848 while(*s && *s != '\n') s++;
4869 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4871 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4872 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4873 #endif /* DEBUGGING */
4877 Perl_re_intuit_string(pTHX_ regexp *prog)
4878 { /* Assume that RE_INTUIT is set */
4881 char *s = SvPV(prog->check_substr
4882 ? prog->check_substr : prog->check_utf8, n_a);
4884 if (!PL_colorset) reginitcolors();
4885 PerlIO_printf(Perl_debug_log,
4886 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4888 prog->check_substr ? "" : "utf8 ",
4889 PL_colors[5],PL_colors[0],
4892 (strlen(s) > 60 ? "..." : ""));
4895 return prog->check_substr ? prog->check_substr : prog->check_utf8;
4899 Perl_pregfree(pTHX_ struct regexp *r)
4902 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4905 if (!r || (--r->refcnt > 0))
4911 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4912 r->prelen, 60, UNI_DISPLAY_REGEX)
4913 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4917 PerlIO_printf(Perl_debug_log,
4918 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4919 PL_colors[4],PL_colors[5],PL_colors[0],
4922 len > 60 ? "..." : "");
4926 Safefree(r->precomp);
4927 if (r->offsets) /* 20010421 MJD */
4928 Safefree(r->offsets);
4929 RX_MATCH_COPY_FREE(r);
4930 #ifdef PERL_COPY_ON_WRITE
4932 SvREFCNT_dec(r->saved_copy);
4935 if (r->anchored_substr)
4936 SvREFCNT_dec(r->anchored_substr);
4937 if (r->anchored_utf8)
4938 SvREFCNT_dec(r->anchored_utf8);
4939 if (r->float_substr)
4940 SvREFCNT_dec(r->float_substr);
4942 SvREFCNT_dec(r->float_utf8);
4943 Safefree(r->substrs);
4946 int n = r->data->count;
4947 PAD* new_comppad = NULL;
4951 /* If you add a ->what type here, update the comment in regcomp.h */
4952 switch (r->data->what[n]) {
4954 SvREFCNT_dec((SV*)r->data->data[n]);
4957 Safefree(r->data->data[n]);
4960 new_comppad = (AV*)r->data->data[n];
4963 if (new_comppad == NULL)
4964 Perl_croak(aTHX_ "panic: pregfree comppad");
4965 PAD_SAVE_LOCAL(old_comppad,
4966 /* Watch out for global destruction's random ordering. */
4967 (SvTYPE(new_comppad) == SVt_PVAV) ?
4968 new_comppad : Null(PAD *)
4970 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4971 op_free((OP_4tree*)r->data->data[n]);
4974 PAD_RESTORE_LOCAL(old_comppad);
4975 SvREFCNT_dec((SV*)new_comppad);
4981 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4984 Safefree(r->data->what);
4987 Safefree(r->startp);
4993 - regnext - dig the "next" pointer out of a node
4995 * [Note, when REGALIGN is defined there are two places in regmatch()
4996 * that bypass this code for speed.]
4999 Perl_regnext(pTHX_ register regnode *p)
5001 register I32 offset;
5003 if (p == &PL_regdummy)
5006 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5014 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
5017 STRLEN l1 = strlen(pat1);
5018 STRLEN l2 = strlen(pat2);
5027 Copy(pat1, buf, l1 , char);
5028 Copy(pat2, buf + l1, l2 , char);
5029 buf[l1 + l2] = '\n';
5030 buf[l1 + l2 + 1] = '\0';
5032 /* ANSI variant takes additional second argument */
5033 va_start(args, pat2);
5037 msv = vmess(buf, &args);
5039 message = SvPV(msv,l1);
5042 Copy(message, buf, l1 , char);
5043 buf[l1] = '\0'; /* Overwrite \n */
5044 Perl_croak(aTHX_ "%s", buf);
5047 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5050 Perl_save_re_context(pTHX)
5052 SAVEI32(PL_reg_flags); /* from regexec.c */
5054 SAVEPPTR(PL_reginput); /* String-input pointer. */
5055 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5056 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
5057 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5058 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5059 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
5060 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
5061 SAVEPPTR(PL_regtill); /* How far we are required to go. */
5062 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
5063 PL_reg_start_tmp = 0;
5064 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5065 PL_reg_start_tmpl = 0;
5066 SAVEVPTR(PL_regdata);
5067 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5068 SAVEI32(PL_regnarrate); /* from regexec.c */
5069 SAVEVPTR(PL_regprogram); /* from regexec.c */
5070 SAVEINT(PL_regindent); /* from regexec.c */
5071 SAVEVPTR(PL_regcc); /* from regexec.c */
5072 SAVEVPTR(PL_curcop);
5073 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5074 SAVEVPTR(PL_reg_re); /* from regexec.c */
5075 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5076 SAVESPTR(PL_reg_sv); /* from regexec.c */
5077 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
5078 SAVEVPTR(PL_reg_magic); /* from regexec.c */
5079 SAVEI32(PL_reg_oldpos); /* from regexec.c */
5080 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5081 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5082 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
5083 PL_reg_oldsaved = Nullch;
5084 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
5085 PL_reg_oldsavedlen = 0;
5086 #ifdef PERL_COPY_ON_WRITE
5090 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
5092 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
5093 PL_reg_leftiter = 0;
5094 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
5095 PL_reg_poscache = Nullch;
5096 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
5097 PL_reg_poscache_size = 0;
5098 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5099 SAVEI32(PL_regnpar); /* () count. */
5100 SAVEI32(PL_regsize); /* from regexec.c */
5103 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5109 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5110 for (i = 1; i <= rx->nparens; i++) {
5111 sprintf(digits, "%lu", (long)i);
5112 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5119 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
5124 clear_re(pTHX_ void *r)
5126 ReREFCNT_dec((regexp *)r);