5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_pregcomp my_regcomp
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_pregfree my_regfree
39 # define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 # define Perl_regnext my_regnext
42 # define Perl_save_re_context my_save_re_context
43 # define Perl_reginitcolors my_reginitcolors
45 # define PERL_NO_GET_CONTEXT
50 * pregcomp and pregexec -- regsub and regerror are not used in perl
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (c) 1991-2002, Larry Wall
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
83 #define PERL_IN_REGCOMP_C
86 #ifndef PERL_IN_XSUB_RE
98 # if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
107 #define STATIC static
110 typedef struct RExC_state_t {
111 U32 flags; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
114 char *start; /* Start of input for compile */
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
118 regnode *emit_start; /* Start of emitted-code area */
119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
130 char *starttry; /* -Dr: where regtry was called. */
131 #define RExC_starttry (pRExC_state->starttry)
135 #define RExC_flags (pRExC_state->flags)
136 #define RExC_precomp (pRExC_state->precomp)
137 #define RExC_rx (pRExC_state->rx)
138 #define RExC_start (pRExC_state->start)
139 #define RExC_end (pRExC_state->end)
140 #define RExC_parse (pRExC_state->parse)
141 #define RExC_whilem_seen (pRExC_state->whilem_seen)
142 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty (pRExC_state->naughty)
146 #define RExC_sawback (pRExC_state->sawback)
147 #define RExC_seen (pRExC_state->seen)
148 #define RExC_size (pRExC_state->size)
149 #define RExC_npar (pRExC_state->npar)
150 #define RExC_extralen (pRExC_state->extralen)
151 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8 (pRExC_state->utf8)
155 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
160 #undef SPSTART /* dratted cpp namespace... */
163 * Flags to be passed up and down.
165 #define WORST 0 /* Worst case. */
166 #define HASWIDTH 0x1 /* Known to match non-null strings. */
167 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART 0x4 /* Starts with * or +. */
169 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
171 /* Length of a variant. */
173 typedef struct scan_data_t {
179 I32 last_end; /* min value, <0 unless valid. */
182 SV **longest; /* Either &l_fixed, or &l_float. */
186 I32 offset_float_min;
187 I32 offset_float_max;
191 struct regnode_charclass_class *start_class;
195 * Forward declarations for pregcomp()'s friends.
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL 0x1
203 #define SF_BEFORE_MEOL 0x2
204 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
208 # define SF_FIX_SHIFT_EOL (0+2)
209 # define SF_FL_SHIFT_EOL (0+4)
211 # define SF_FIX_SHIFT_EOL (+2)
212 # define SF_FL_SHIFT_EOL (+4)
215 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
218 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF 0x40
221 #define SF_HAS_PAR 0x80
222 #define SF_IN_PAR 0x100
223 #define SF_HAS_EVAL 0x200
224 #define SCF_DO_SUBSTR 0x400
225 #define SCF_DO_STCLASS_AND 0x0800
226 #define SCF_DO_STCLASS_OR 0x1000
227 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS 0x2000
230 #define UTF (RExC_utf8 != 0)
231 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
232 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
234 #define OOB_UNICODE 12345678
235 #define OOB_NAMEDCLASS -1
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
249 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
259 #define FAIL(msg) STMT_START { \
260 char *ellipses = ""; \
261 IV len = RExC_end - RExC_precomp; \
264 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
265 if (len > RegexLengthToShowInErrorMessages) { \
266 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
267 len = RegexLengthToShowInErrorMessages - 10; \
270 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
271 msg, (int)len, RExC_precomp, ellipses); \
275 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
276 * args. Show regex, up to a maximum length. If it's too long, chop and add
279 #define FAIL2(pat,msg) STMT_START { \
280 char *ellipses = ""; \
281 IV len = RExC_end - RExC_precomp; \
284 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
285 if (len > RegexLengthToShowInErrorMessages) { \
286 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
287 len = RegexLengthToShowInErrorMessages - 10; \
290 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
291 msg, (int)len, RExC_precomp, ellipses); \
296 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
298 #define Simple_vFAIL(m) STMT_START { \
299 IV offset = RExC_parse - RExC_precomp; \
300 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
301 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
305 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
307 #define vFAIL(m) STMT_START { \
309 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
314 * Like Simple_vFAIL(), but accepts two arguments.
316 #define Simple_vFAIL2(m,a1) STMT_START { \
317 IV offset = RExC_parse - RExC_precomp; \
318 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
319 (int)offset, RExC_precomp, RExC_precomp + offset); \
323 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
325 #define vFAIL2(m,a1) STMT_START { \
327 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
328 Simple_vFAIL2(m, a1); \
333 * Like Simple_vFAIL(), but accepts three arguments.
335 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
336 IV offset = RExC_parse - RExC_precomp; \
337 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
338 (int)offset, RExC_precomp, RExC_precomp + offset); \
342 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
344 #define vFAIL3(m,a1,a2) STMT_START { \
346 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
347 Simple_vFAIL3(m, a1, a2); \
351 * Like Simple_vFAIL(), but accepts four arguments.
353 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
354 IV offset = RExC_parse - RExC_precomp; \
355 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
356 (int)offset, RExC_precomp, RExC_precomp + offset); \
360 * Like Simple_vFAIL(), but accepts five arguments.
362 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
363 IV offset = RExC_parse - RExC_precomp; \
364 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
365 (int)offset, RExC_precomp, RExC_precomp + offset); \
369 #define vWARN(loc,m) STMT_START { \
370 IV offset = loc - RExC_precomp; \
371 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
372 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
375 #define vWARNdep(loc,m) STMT_START { \
376 IV offset = loc - RExC_precomp; \
377 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
378 "%s" REPORT_LOCATION, \
379 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
383 #define vWARN2(loc, m, a1) STMT_START { \
384 IV offset = loc - RExC_precomp; \
385 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
386 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
389 #define vWARN3(loc, m, a1, a2) STMT_START { \
390 IV offset = loc - RExC_precomp; \
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
395 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
396 IV offset = loc - RExC_precomp; \
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
401 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
402 IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
408 /* Allow for side effects in s */
409 #define REGC(c,s) STMT_START { \
410 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
413 /* Macros for recording node offsets. 20001227 mjd@plover.com
414 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
415 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
416 * Element 0 holds the number n.
419 #define MJD_OFFSET_DEBUG(x)
420 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
423 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
425 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
426 __LINE__, (node), (byte))); \
428 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
430 RExC_offsets[2*(node)-1] = (byte); \
435 #define Set_Node_Offset(node,byte) \
436 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
439 #define Set_Node_Length_To_R(node,len) STMT_START { \
441 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
442 __LINE__, (node), (len))); \
444 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
446 RExC_offsets[2*(node)] = (len); \
451 #define Set_Node_Length(node,len) \
452 Set_Node_Length_To_R((node)-RExC_emit_start, len)
453 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454 #define Set_Node_Cur_Length(node) \
455 Set_Node_Length(node, RExC_parse - parse_start)
457 /* Get offsets and lengths */
458 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
461 static void clear_re(pTHX_ void *r);
463 /* Mark that we cannot extend a found fixed substring at this point.
464 Updata the longest found anchored substring and the longest found
465 floating substrings if needed. */
468 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
470 STRLEN l = CHR_SVLEN(data->last_found);
471 STRLEN old_l = CHR_SVLEN(*data->longest);
473 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
474 sv_setsv(*data->longest, data->last_found);
475 if (*data->longest == data->longest_fixed) {
476 data->offset_fixed = l ? data->last_start_min : data->pos_min;
477 if (data->flags & SF_BEFORE_EOL)
479 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
481 data->flags &= ~SF_FIX_BEFORE_EOL;
484 data->offset_float_min = l ? data->last_start_min : data->pos_min;
485 data->offset_float_max = (l
486 ? data->last_start_max
487 : data->pos_min + data->pos_delta);
488 if ((U32)data->offset_float_max > (U32)I32_MAX)
489 data->offset_float_max = I32_MAX;
490 if (data->flags & SF_BEFORE_EOL)
492 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
494 data->flags &= ~SF_FL_BEFORE_EOL;
497 SvCUR_set(data->last_found, 0);
499 data->flags &= ~SF_BEFORE_EOL;
502 /* Can match anything (initialization) */
504 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
506 ANYOF_CLASS_ZERO(cl);
507 ANYOF_BITMAP_SETALL(cl);
508 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
510 cl->flags |= ANYOF_LOCALE;
513 /* Can match anything (initialization) */
515 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
519 for (value = 0; value <= ANYOF_MAX; value += 2)
520 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
522 if (!(cl->flags & ANYOF_UNICODE_ALL))
524 if (!ANYOF_BITMAP_TESTALLSET(cl))
529 /* Can match anything (initialization) */
531 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
533 Zero(cl, 1, struct regnode_charclass_class);
535 cl_anything(pRExC_state, cl);
539 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
541 Zero(cl, 1, struct regnode_charclass_class);
543 cl_anything(pRExC_state, cl);
545 cl->flags |= ANYOF_LOCALE;
548 /* 'And' a given class with another one. Can create false positives */
549 /* We assume that cl is not inverted */
551 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
552 struct regnode_charclass_class *and_with)
554 if (!(and_with->flags & ANYOF_CLASS)
555 && !(cl->flags & ANYOF_CLASS)
556 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
557 && !(and_with->flags & ANYOF_FOLD)
558 && !(cl->flags & ANYOF_FOLD)) {
561 if (and_with->flags & ANYOF_INVERT)
562 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
563 cl->bitmap[i] &= ~and_with->bitmap[i];
565 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
566 cl->bitmap[i] &= and_with->bitmap[i];
567 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
568 if (!(and_with->flags & ANYOF_EOS))
569 cl->flags &= ~ANYOF_EOS;
571 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
572 cl->flags &= ~ANYOF_UNICODE_ALL;
573 cl->flags |= ANYOF_UNICODE;
574 ARG_SET(cl, ARG(and_with));
576 if (!(and_with->flags & ANYOF_UNICODE_ALL))
577 cl->flags &= ~ANYOF_UNICODE_ALL;
578 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
579 cl->flags &= ~ANYOF_UNICODE;
582 /* 'OR' a given class with another one. Can create false positives */
583 /* We assume that cl is not inverted */
585 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
587 if (or_with->flags & ANYOF_INVERT) {
589 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
590 * <= (B1 | !B2) | (CL1 | !CL2)
591 * which is wasteful if CL2 is small, but we ignore CL2:
592 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
593 * XXXX Can we handle case-fold? Unclear:
594 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
595 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
597 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
598 && !(or_with->flags & ANYOF_FOLD)
599 && !(cl->flags & ANYOF_FOLD) ) {
602 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
603 cl->bitmap[i] |= ~or_with->bitmap[i];
604 } /* XXXX: logic is complicated otherwise */
606 cl_anything(pRExC_state, cl);
609 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
610 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
611 && (!(or_with->flags & ANYOF_FOLD)
612 || (cl->flags & ANYOF_FOLD)) ) {
615 /* OR char bitmap and class bitmap separately */
616 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
617 cl->bitmap[i] |= or_with->bitmap[i];
618 if (or_with->flags & ANYOF_CLASS) {
619 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
620 cl->classflags[i] |= or_with->classflags[i];
621 cl->flags |= ANYOF_CLASS;
624 else { /* XXXX: logic is complicated, leave it along for a moment. */
625 cl_anything(pRExC_state, cl);
628 if (or_with->flags & ANYOF_EOS)
629 cl->flags |= ANYOF_EOS;
631 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
632 ARG(cl) != ARG(or_with)) {
633 cl->flags |= ANYOF_UNICODE_ALL;
634 cl->flags &= ~ANYOF_UNICODE;
636 if (or_with->flags & ANYOF_UNICODE_ALL) {
637 cl->flags |= ANYOF_UNICODE_ALL;
638 cl->flags &= ~ANYOF_UNICODE;
643 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
644 * These need to be revisited when a newer toolchain becomes available.
646 #if defined(__sparc64__) && defined(__GNUC__)
647 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
648 # undef SPARC64_GCC_WORKAROUND
649 # define SPARC64_GCC_WORKAROUND 1
653 /* REx optimizer. Converts nodes into quickier variants "in place".
654 Finds fixed substrings. */
656 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
657 to the position after last scanned or to NULL. */
660 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
661 /* scanp: Start here (read-write). */
662 /* deltap: Write maxlen-minlen here. */
663 /* last: Stop before this one. */
665 I32 min = 0, pars = 0, code;
666 regnode *scan = *scanp, *next;
668 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
669 int is_inf_internal = 0; /* The studied chunk is infinite */
670 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
671 scan_data_t data_fake;
672 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
674 while (scan && OP(scan) != END && scan < last) {
675 /* Peephole optimizer: */
677 if (PL_regkind[(U8)OP(scan)] == EXACT) {
678 /* Merge several consecutive EXACTish nodes into one. */
679 regnode *n = regnext(scan);
682 regnode *stop = scan;
685 next = scan + NODE_SZ_STR(scan);
686 /* Skip NOTHING, merge EXACT*. */
688 ( PL_regkind[(U8)OP(n)] == NOTHING ||
689 (stringok && (OP(n) == OP(scan))))
691 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
692 if (OP(n) == TAIL || n > next)
694 if (PL_regkind[(U8)OP(n)] == NOTHING) {
695 NEXT_OFF(scan) += NEXT_OFF(n);
696 next = n + NODE_STEP_REGNODE;
704 int oldl = STR_LEN(scan);
705 regnode *nnext = regnext(n);
707 if (oldl + STR_LEN(n) > U8_MAX)
709 NEXT_OFF(scan) += NEXT_OFF(n);
710 STR_LEN(scan) += STR_LEN(n);
711 next = n + NODE_SZ_STR(n);
712 /* Now we can overwrite *n : */
713 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
721 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
723 Two problematic code points in Unicode casefolding of EXACT nodes:
725 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
726 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
732 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
733 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
735 This means that in case-insensitive matching (or "loose matching",
736 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
737 length of the above casefolded versions) can match a target string
738 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
739 This would rather mess up the minimum length computation.
741 What we'll do is to look for the tail four bytes, and then peek
742 at the preceding two bytes to see whether we need to decrease
743 the minimum length by four (six minus two).
745 Thanks to the design of UTF-8, there cannot be false matches:
746 A sequence of valid UTF-8 bytes cannot be a subsequence of
747 another valid sequence of UTF-8 bytes.
750 char *s0 = STRING(scan), *s, *t;
751 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
752 char *t0 = "\xcc\x88\xcc\x81";
756 s < s2 && (t = ninstr(s, s1, t0, t1));
758 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
759 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
766 n = scan + NODE_SZ_STR(scan);
768 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
776 /* Follow the next-chain of the current node and optimize
777 away all the NOTHINGs from it. */
778 if (OP(scan) != CURLYX) {
779 int max = (reg_off_by_arg[OP(scan)]
781 /* I32 may be smaller than U16 on CRAYs! */
782 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
783 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
787 /* Skip NOTHING and LONGJMP. */
788 while ((n = regnext(n))
789 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
790 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
793 if (reg_off_by_arg[OP(scan)])
796 NEXT_OFF(scan) = off;
798 /* The principal pseudo-switch. Cannot be a switch, since we
799 look into several different things. */
800 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
801 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
802 next = regnext(scan);
805 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
806 I32 max1 = 0, min1 = I32_MAX, num = 0;
807 struct regnode_charclass_class accum;
809 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
810 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
811 if (flags & SCF_DO_STCLASS)
812 cl_init_zero(pRExC_state, &accum);
813 while (OP(scan) == code) {
814 I32 deltanext, minnext, f = 0, fake;
815 struct regnode_charclass_class this_class;
820 data_fake.whilem_c = data->whilem_c;
821 data_fake.last_closep = data->last_closep;
824 data_fake.last_closep = &fake;
825 next = regnext(scan);
826 scan = NEXTOPER(scan);
828 scan = NEXTOPER(scan);
829 if (flags & SCF_DO_STCLASS) {
830 cl_init(pRExC_state, &this_class);
831 data_fake.start_class = &this_class;
832 f = SCF_DO_STCLASS_AND;
834 if (flags & SCF_WHILEM_VISITED_POS)
835 f |= SCF_WHILEM_VISITED_POS;
836 /* we suppose the run is continuous, last=next...*/
837 minnext = study_chunk(pRExC_state, &scan, &deltanext,
838 next, &data_fake, f);
841 if (max1 < minnext + deltanext)
842 max1 = minnext + deltanext;
843 if (deltanext == I32_MAX)
844 is_inf = is_inf_internal = 1;
846 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
848 if (data && (data_fake.flags & SF_HAS_EVAL))
849 data->flags |= SF_HAS_EVAL;
851 data->whilem_c = data_fake.whilem_c;
852 if (flags & SCF_DO_STCLASS)
853 cl_or(pRExC_state, &accum, &this_class);
857 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
859 if (flags & SCF_DO_SUBSTR) {
860 data->pos_min += min1;
861 data->pos_delta += max1 - min1;
862 if (max1 != min1 || is_inf)
863 data->longest = &(data->longest_float);
866 delta += max1 - min1;
867 if (flags & SCF_DO_STCLASS_OR) {
868 cl_or(pRExC_state, data->start_class, &accum);
870 cl_and(data->start_class, &and_with);
871 flags &= ~SCF_DO_STCLASS;
874 else if (flags & SCF_DO_STCLASS_AND) {
876 cl_and(data->start_class, &accum);
877 flags &= ~SCF_DO_STCLASS;
880 /* Switch to OR mode: cache the old value of
881 * data->start_class */
882 StructCopy(data->start_class, &and_with,
883 struct regnode_charclass_class);
884 flags &= ~SCF_DO_STCLASS_AND;
885 StructCopy(&accum, data->start_class,
886 struct regnode_charclass_class);
887 flags |= SCF_DO_STCLASS_OR;
888 data->start_class->flags |= ANYOF_EOS;
892 else if (code == BRANCHJ) /* single branch is optimized. */
893 scan = NEXTOPER(NEXTOPER(scan));
894 else /* single branch is optimized. */
895 scan = NEXTOPER(scan);
898 else if (OP(scan) == EXACT) {
899 I32 l = STR_LEN(scan);
900 UV uc = *((U8*)STRING(scan));
902 U8 *s = (U8*)STRING(scan);
903 l = utf8_length(s, s + l);
904 uc = utf8_to_uvchr(s, NULL);
907 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
908 /* The code below prefers earlier match for fixed
909 offset, later match for variable offset. */
910 if (data->last_end == -1) { /* Update the start info. */
911 data->last_start_min = data->pos_min;
912 data->last_start_max = is_inf
913 ? I32_MAX : data->pos_min + data->pos_delta;
915 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
917 SvUTF8_on(data->last_found);
918 data->last_end = data->pos_min + l;
919 data->pos_min += l; /* As in the first entry. */
920 data->flags &= ~SF_BEFORE_EOL;
922 if (flags & SCF_DO_STCLASS_AND) {
923 /* Check whether it is compatible with what we know already! */
927 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
928 && !ANYOF_BITMAP_TEST(data->start_class, uc)
929 && (!(data->start_class->flags & ANYOF_FOLD)
930 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
933 ANYOF_CLASS_ZERO(data->start_class);
934 ANYOF_BITMAP_ZERO(data->start_class);
936 ANYOF_BITMAP_SET(data->start_class, uc);
937 data->start_class->flags &= ~ANYOF_EOS;
939 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
941 else if (flags & SCF_DO_STCLASS_OR) {
942 /* false positive possible if the class is case-folded */
944 ANYOF_BITMAP_SET(data->start_class, uc);
946 data->start_class->flags |= ANYOF_UNICODE_ALL;
947 data->start_class->flags &= ~ANYOF_EOS;
948 cl_and(data->start_class, &and_with);
950 flags &= ~SCF_DO_STCLASS;
952 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
953 I32 l = STR_LEN(scan);
954 UV uc = *((U8*)STRING(scan));
956 /* Search for fixed substrings supports EXACT only. */
957 if (flags & SCF_DO_SUBSTR)
958 scan_commit(pRExC_state, data);
960 U8 *s = (U8 *)STRING(scan);
961 l = utf8_length(s, s + l);
962 uc = utf8_to_uvchr(s, NULL);
965 if (data && (flags & SCF_DO_SUBSTR))
967 if (flags & SCF_DO_STCLASS_AND) {
968 /* Check whether it is compatible with what we know already! */
972 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
973 && !ANYOF_BITMAP_TEST(data->start_class, uc)
974 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
976 ANYOF_CLASS_ZERO(data->start_class);
977 ANYOF_BITMAP_ZERO(data->start_class);
979 ANYOF_BITMAP_SET(data->start_class, uc);
980 data->start_class->flags &= ~ANYOF_EOS;
981 data->start_class->flags |= ANYOF_FOLD;
982 if (OP(scan) == EXACTFL)
983 data->start_class->flags |= ANYOF_LOCALE;
986 else if (flags & SCF_DO_STCLASS_OR) {
987 if (data->start_class->flags & ANYOF_FOLD) {
988 /* false positive possible if the class is case-folded.
989 Assume that the locale settings are the same... */
991 ANYOF_BITMAP_SET(data->start_class, uc);
992 data->start_class->flags &= ~ANYOF_EOS;
994 cl_and(data->start_class, &and_with);
996 flags &= ~SCF_DO_STCLASS;
998 else if (strchr((char*)PL_varies,OP(scan))) {
999 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1000 I32 f = flags, pos_before = 0;
1001 regnode *oscan = scan;
1002 struct regnode_charclass_class this_class;
1003 struct regnode_charclass_class *oclass = NULL;
1004 I32 next_is_eval = 0;
1006 switch (PL_regkind[(U8)OP(scan)]) {
1007 case WHILEM: /* End of (?:...)* . */
1008 scan = NEXTOPER(scan);
1011 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1012 next = NEXTOPER(scan);
1013 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1015 maxcount = REG_INFTY;
1016 next = regnext(scan);
1017 scan = NEXTOPER(scan);
1021 if (flags & SCF_DO_SUBSTR)
1026 if (flags & SCF_DO_STCLASS) {
1028 maxcount = REG_INFTY;
1029 next = regnext(scan);
1030 scan = NEXTOPER(scan);
1033 is_inf = is_inf_internal = 1;
1034 scan = regnext(scan);
1035 if (flags & SCF_DO_SUBSTR) {
1036 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1037 data->longest = &(data->longest_float);
1039 goto optimize_curly_tail;
1041 mincount = ARG1(scan);
1042 maxcount = ARG2(scan);
1043 next = regnext(scan);
1044 if (OP(scan) == CURLYX) {
1045 I32 lp = (data ? *(data->last_closep) : 0);
1047 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1049 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1050 next_is_eval = (OP(scan) == EVAL);
1052 if (flags & SCF_DO_SUBSTR) {
1053 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1054 pos_before = data->pos_min;
1058 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1060 data->flags |= SF_IS_INF;
1062 if (flags & SCF_DO_STCLASS) {
1063 cl_init(pRExC_state, &this_class);
1064 oclass = data->start_class;
1065 data->start_class = &this_class;
1066 f |= SCF_DO_STCLASS_AND;
1067 f &= ~SCF_DO_STCLASS_OR;
1069 /* These are the cases when once a subexpression
1070 fails at a particular position, it cannot succeed
1071 even after backtracking at the enclosing scope.
1073 XXXX what if minimal match and we are at the
1074 initial run of {n,m}? */
1075 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1076 f &= ~SCF_WHILEM_VISITED_POS;
1078 /* This will finish on WHILEM, setting scan, or on NULL: */
1079 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1081 ? (f & ~SCF_DO_SUBSTR) : f);
1083 if (flags & SCF_DO_STCLASS)
1084 data->start_class = oclass;
1085 if (mincount == 0 || minnext == 0) {
1086 if (flags & SCF_DO_STCLASS_OR) {
1087 cl_or(pRExC_state, data->start_class, &this_class);
1089 else if (flags & SCF_DO_STCLASS_AND) {
1090 /* Switch to OR mode: cache the old value of
1091 * data->start_class */
1092 StructCopy(data->start_class, &and_with,
1093 struct regnode_charclass_class);
1094 flags &= ~SCF_DO_STCLASS_AND;
1095 StructCopy(&this_class, data->start_class,
1096 struct regnode_charclass_class);
1097 flags |= SCF_DO_STCLASS_OR;
1098 data->start_class->flags |= ANYOF_EOS;
1100 } else { /* Non-zero len */
1101 if (flags & SCF_DO_STCLASS_OR) {
1102 cl_or(pRExC_state, data->start_class, &this_class);
1103 cl_and(data->start_class, &and_with);
1105 else if (flags & SCF_DO_STCLASS_AND)
1106 cl_and(data->start_class, &this_class);
1107 flags &= ~SCF_DO_STCLASS;
1109 if (!scan) /* It was not CURLYX, but CURLY. */
1111 if (ckWARN(WARN_REGEXP)
1112 /* ? quantifier ok, except for (?{ ... }) */
1113 && (next_is_eval || !(mincount == 0 && maxcount == 1))
1114 && (minnext == 0) && (deltanext == 0)
1115 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1116 && maxcount <= REG_INFTY/3) /* Complement check for big count */
1119 "Quantifier unexpected on zero-length expression");
1122 min += minnext * mincount;
1123 is_inf_internal |= ((maxcount == REG_INFTY
1124 && (minnext + deltanext) > 0)
1125 || deltanext == I32_MAX);
1126 is_inf |= is_inf_internal;
1127 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1129 /* Try powerful optimization CURLYX => CURLYN. */
1130 if ( OP(oscan) == CURLYX && data
1131 && data->flags & SF_IN_PAR
1132 && !(data->flags & SF_HAS_EVAL)
1133 && !deltanext && minnext == 1 ) {
1134 /* Try to optimize to CURLYN. */
1135 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1136 regnode *nxt1 = nxt;
1143 if (!strchr((char*)PL_simple,OP(nxt))
1144 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1145 && STR_LEN(nxt) == 1))
1151 if (OP(nxt) != CLOSE)
1153 /* Now we know that nxt2 is the only contents: */
1154 oscan->flags = (U8)ARG(nxt);
1156 OP(nxt1) = NOTHING; /* was OPEN. */
1158 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1159 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1160 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1161 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1162 OP(nxt + 1) = OPTIMIZED; /* was count. */
1163 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1168 /* Try optimization CURLYX => CURLYM. */
1169 if ( OP(oscan) == CURLYX && data
1170 && !(data->flags & SF_HAS_PAR)
1171 && !(data->flags & SF_HAS_EVAL)
1173 /* XXXX How to optimize if data == 0? */
1174 /* Optimize to a simpler form. */
1175 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1179 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1180 && (OP(nxt2) != WHILEM))
1182 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1183 /* Need to optimize away parenths. */
1184 if (data->flags & SF_IN_PAR) {
1185 /* Set the parenth number. */
1186 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1188 if (OP(nxt) != CLOSE)
1189 FAIL("Panic opt close");
1190 oscan->flags = (U8)ARG(nxt);
1191 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1192 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1194 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1195 OP(nxt + 1) = OPTIMIZED; /* was count. */
1196 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1197 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1200 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1201 regnode *nnxt = regnext(nxt1);
1204 if (reg_off_by_arg[OP(nxt1)])
1205 ARG_SET(nxt1, nxt2 - nxt1);
1206 else if (nxt2 - nxt1 < U16_MAX)
1207 NEXT_OFF(nxt1) = nxt2 - nxt1;
1209 OP(nxt) = NOTHING; /* Cannot beautify */
1214 /* Optimize again: */
1215 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1221 else if ((OP(oscan) == CURLYX)
1222 && (flags & SCF_WHILEM_VISITED_POS)
1223 /* See the comment on a similar expression above.
1224 However, this time it not a subexpression
1225 we care about, but the expression itself. */
1226 && (maxcount == REG_INFTY)
1227 && data && ++data->whilem_c < 16) {
1228 /* This stays as CURLYX, we can put the count/of pair. */
1229 /* Find WHILEM (as in regexec.c) */
1230 regnode *nxt = oscan + NEXT_OFF(oscan);
1232 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1234 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1235 | (RExC_whilem_seen << 4)); /* On WHILEM */
1237 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1239 if (flags & SCF_DO_SUBSTR) {
1240 SV *last_str = Nullsv;
1241 int counted = mincount != 0;
1243 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1244 #if defined(SPARC64_GCC_WORKAROUND)
1250 if (pos_before >= data->last_start_min)
1253 b = data->last_start_min;
1256 s = SvPV(data->last_found, l);
1257 old = b - data->last_start_min;
1260 I32 b = pos_before >= data->last_start_min
1261 ? pos_before : data->last_start_min;
1263 char *s = SvPV(data->last_found, l);
1264 I32 old = b - data->last_start_min;
1268 old = utf8_hop((U8*)s, old) - (U8*)s;
1271 /* Get the added string: */
1272 last_str = newSVpvn(s + old, l);
1274 SvUTF8_on(last_str);
1275 if (deltanext == 0 && pos_before == b) {
1276 /* What was added is a constant string */
1278 SvGROW(last_str, (mincount * l) + 1);
1279 repeatcpy(SvPVX(last_str) + l,
1280 SvPVX(last_str), l, mincount - 1);
1281 SvCUR(last_str) *= mincount;
1282 /* Add additional parts. */
1283 SvCUR_set(data->last_found,
1284 SvCUR(data->last_found) - l);
1285 sv_catsv(data->last_found, last_str);
1286 data->last_end += l * (mincount - 1);
1289 /* start offset must point into the last copy */
1290 data->last_start_min += minnext * (mincount - 1);
1291 data->last_start_max += is_inf ? I32_MAX
1292 : (maxcount - 1) * (minnext + data->pos_delta);
1295 /* It is counted once already... */
1296 data->pos_min += minnext * (mincount - counted);
1297 data->pos_delta += - counted * deltanext +
1298 (minnext + deltanext) * maxcount - minnext * mincount;
1299 if (mincount != maxcount) {
1300 /* Cannot extend fixed substrings found inside
1302 scan_commit(pRExC_state,data);
1303 if (mincount && last_str) {
1304 sv_setsv(data->last_found, last_str);
1305 data->last_end = data->pos_min;
1306 data->last_start_min =
1307 data->pos_min - CHR_SVLEN(last_str);
1308 data->last_start_max = is_inf
1310 : data->pos_min + data->pos_delta
1311 - CHR_SVLEN(last_str);
1313 data->longest = &(data->longest_float);
1315 SvREFCNT_dec(last_str);
1317 if (data && (fl & SF_HAS_EVAL))
1318 data->flags |= SF_HAS_EVAL;
1319 optimize_curly_tail:
1320 if (OP(oscan) != CURLYX) {
1321 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1323 NEXT_OFF(oscan) += NEXT_OFF(next);
1326 default: /* REF and CLUMP only? */
1327 if (flags & SCF_DO_SUBSTR) {
1328 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1329 data->longest = &(data->longest_float);
1331 is_inf = is_inf_internal = 1;
1332 if (flags & SCF_DO_STCLASS_OR)
1333 cl_anything(pRExC_state, data->start_class);
1334 flags &= ~SCF_DO_STCLASS;
1338 else if (strchr((char*)PL_simple,OP(scan))) {
1341 if (flags & SCF_DO_SUBSTR) {
1342 scan_commit(pRExC_state,data);
1346 if (flags & SCF_DO_STCLASS) {
1347 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1349 /* Some of the logic below assumes that switching
1350 locale on will only add false positives. */
1351 switch (PL_regkind[(U8)OP(scan)]) {
1355 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1356 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1357 cl_anything(pRExC_state, data->start_class);
1360 if (OP(scan) == SANY)
1362 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1363 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1364 || (data->start_class->flags & ANYOF_CLASS));
1365 cl_anything(pRExC_state, data->start_class);
1367 if (flags & SCF_DO_STCLASS_AND || !value)
1368 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1371 if (flags & SCF_DO_STCLASS_AND)
1372 cl_and(data->start_class,
1373 (struct regnode_charclass_class*)scan);
1375 cl_or(pRExC_state, data->start_class,
1376 (struct regnode_charclass_class*)scan);
1379 if (flags & SCF_DO_STCLASS_AND) {
1380 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1381 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1382 for (value = 0; value < 256; value++)
1383 if (!isALNUM(value))
1384 ANYOF_BITMAP_CLEAR(data->start_class, value);
1388 if (data->start_class->flags & ANYOF_LOCALE)
1389 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1391 for (value = 0; value < 256; value++)
1393 ANYOF_BITMAP_SET(data->start_class, value);
1398 if (flags & SCF_DO_STCLASS_AND) {
1399 if (data->start_class->flags & ANYOF_LOCALE)
1400 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1403 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1404 data->start_class->flags |= ANYOF_LOCALE;
1408 if (flags & SCF_DO_STCLASS_AND) {
1409 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1410 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1411 for (value = 0; value < 256; value++)
1413 ANYOF_BITMAP_CLEAR(data->start_class, value);
1417 if (data->start_class->flags & ANYOF_LOCALE)
1418 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1420 for (value = 0; value < 256; value++)
1421 if (!isALNUM(value))
1422 ANYOF_BITMAP_SET(data->start_class, value);
1427 if (flags & SCF_DO_STCLASS_AND) {
1428 if (data->start_class->flags & ANYOF_LOCALE)
1429 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1432 data->start_class->flags |= ANYOF_LOCALE;
1433 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1437 if (flags & SCF_DO_STCLASS_AND) {
1438 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1439 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1440 for (value = 0; value < 256; value++)
1441 if (!isSPACE(value))
1442 ANYOF_BITMAP_CLEAR(data->start_class, value);
1446 if (data->start_class->flags & ANYOF_LOCALE)
1447 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1449 for (value = 0; value < 256; value++)
1451 ANYOF_BITMAP_SET(data->start_class, value);
1456 if (flags & SCF_DO_STCLASS_AND) {
1457 if (data->start_class->flags & ANYOF_LOCALE)
1458 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1461 data->start_class->flags |= ANYOF_LOCALE;
1462 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1466 if (flags & SCF_DO_STCLASS_AND) {
1467 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1468 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1469 for (value = 0; value < 256; value++)
1471 ANYOF_BITMAP_CLEAR(data->start_class, value);
1475 if (data->start_class->flags & ANYOF_LOCALE)
1476 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1478 for (value = 0; value < 256; value++)
1479 if (!isSPACE(value))
1480 ANYOF_BITMAP_SET(data->start_class, value);
1485 if (flags & SCF_DO_STCLASS_AND) {
1486 if (data->start_class->flags & ANYOF_LOCALE) {
1487 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1488 for (value = 0; value < 256; value++)
1489 if (!isSPACE(value))
1490 ANYOF_BITMAP_CLEAR(data->start_class, value);
1494 data->start_class->flags |= ANYOF_LOCALE;
1495 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1499 if (flags & SCF_DO_STCLASS_AND) {
1500 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1501 for (value = 0; value < 256; value++)
1502 if (!isDIGIT(value))
1503 ANYOF_BITMAP_CLEAR(data->start_class, value);
1506 if (data->start_class->flags & ANYOF_LOCALE)
1507 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1509 for (value = 0; value < 256; value++)
1511 ANYOF_BITMAP_SET(data->start_class, value);
1516 if (flags & SCF_DO_STCLASS_AND) {
1517 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1518 for (value = 0; value < 256; value++)
1520 ANYOF_BITMAP_CLEAR(data->start_class, value);
1523 if (data->start_class->flags & ANYOF_LOCALE)
1524 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1526 for (value = 0; value < 256; value++)
1527 if (!isDIGIT(value))
1528 ANYOF_BITMAP_SET(data->start_class, value);
1533 if (flags & SCF_DO_STCLASS_OR)
1534 cl_and(data->start_class, &and_with);
1535 flags &= ~SCF_DO_STCLASS;
1538 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1539 data->flags |= (OP(scan) == MEOL
1543 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1544 /* Lookbehind, or need to calculate parens/evals/stclass: */
1545 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1546 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1547 /* Lookahead/lookbehind */
1548 I32 deltanext, minnext, fake = 0;
1550 struct regnode_charclass_class intrnl;
1553 data_fake.flags = 0;
1555 data_fake.whilem_c = data->whilem_c;
1556 data_fake.last_closep = data->last_closep;
1559 data_fake.last_closep = &fake;
1560 if ( flags & SCF_DO_STCLASS && !scan->flags
1561 && OP(scan) == IFMATCH ) { /* Lookahead */
1562 cl_init(pRExC_state, &intrnl);
1563 data_fake.start_class = &intrnl;
1564 f |= SCF_DO_STCLASS_AND;
1566 if (flags & SCF_WHILEM_VISITED_POS)
1567 f |= SCF_WHILEM_VISITED_POS;
1568 next = regnext(scan);
1569 nscan = NEXTOPER(NEXTOPER(scan));
1570 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1573 vFAIL("Variable length lookbehind not implemented");
1575 else if (minnext > U8_MAX) {
1576 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1578 scan->flags = (U8)minnext;
1580 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1582 if (data && (data_fake.flags & SF_HAS_EVAL))
1583 data->flags |= SF_HAS_EVAL;
1585 data->whilem_c = data_fake.whilem_c;
1586 if (f & SCF_DO_STCLASS_AND) {
1587 int was = (data->start_class->flags & ANYOF_EOS);
1589 cl_and(data->start_class, &intrnl);
1591 data->start_class->flags |= ANYOF_EOS;
1594 else if (OP(scan) == OPEN) {
1597 else if (OP(scan) == CLOSE) {
1598 if ((I32)ARG(scan) == is_par) {
1599 next = regnext(scan);
1601 if ( next && (OP(next) != WHILEM) && next < last)
1602 is_par = 0; /* Disable optimization */
1605 *(data->last_closep) = ARG(scan);
1607 else if (OP(scan) == EVAL) {
1609 data->flags |= SF_HAS_EVAL;
1611 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1612 if (flags & SCF_DO_SUBSTR) {
1613 scan_commit(pRExC_state,data);
1614 data->longest = &(data->longest_float);
1616 is_inf = is_inf_internal = 1;
1617 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1618 cl_anything(pRExC_state, data->start_class);
1619 flags &= ~SCF_DO_STCLASS;
1621 /* Else: zero-length, ignore. */
1622 scan = regnext(scan);
1627 *deltap = is_inf_internal ? I32_MAX : delta;
1628 if (flags & SCF_DO_SUBSTR && is_inf)
1629 data->pos_delta = I32_MAX - data->pos_min;
1630 if (is_par > U8_MAX)
1632 if (is_par && pars==1 && data) {
1633 data->flags |= SF_IN_PAR;
1634 data->flags &= ~SF_HAS_PAR;
1636 else if (pars && data) {
1637 data->flags |= SF_HAS_PAR;
1638 data->flags &= ~SF_IN_PAR;
1640 if (flags & SCF_DO_STCLASS_OR)
1641 cl_and(data->start_class, &and_with);
1646 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1648 if (RExC_rx->data) {
1649 Renewc(RExC_rx->data,
1650 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1651 char, struct reg_data);
1652 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1653 RExC_rx->data->count += n;
1656 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1657 char, struct reg_data);
1658 New(1208, RExC_rx->data->what, n, U8);
1659 RExC_rx->data->count = n;
1661 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1662 return RExC_rx->data->count - n;
1666 Perl_reginitcolors(pTHX)
1669 char *s = PerlEnv_getenv("PERL_RE_COLORS");
1672 PL_colors[0] = s = savepv(s);
1674 s = strchr(s, '\t');
1680 PL_colors[i] = s = "";
1684 PL_colors[i++] = "";
1691 - pregcomp - compile a regular expression into internal code
1693 * We can't allocate space until we know how big the compiled form will be,
1694 * but we can't compile it (and thus know how big it is) until we've got a
1695 * place to put the code. So we cheat: we compile it twice, once with code
1696 * generation turned off and size counting turned on, and once "for real".
1697 * This also means that we don't allocate space until we are sure that the
1698 * thing really will compile successfully, and we never have to move the
1699 * code and thus invalidate pointers into it. (Note that it has to be in
1700 * one piece because free() must be able to free it all.) [NB: not true in perl]
1702 * Beware that the optimization-preparation code in here knows about some
1703 * of the structure of the compiled regexp. [I'll say.]
1706 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1716 RExC_state_t RExC_state;
1717 RExC_state_t *pRExC_state = &RExC_state;
1720 FAIL("NULL regexp argument");
1722 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1726 if (!PL_colorset) reginitcolors();
1727 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1728 PL_colors[4],PL_colors[5],PL_colors[0],
1729 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1731 RExC_flags = pm->op_pmflags;
1735 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1736 RExC_seen_evals = 0;
1739 /* First pass: determine size, legality. */
1746 RExC_emit = &PL_regdummy;
1747 RExC_whilem_seen = 0;
1748 #if 0 /* REGC() is (currently) a NOP at the first pass.
1749 * Clever compilers notice this and complain. --jhi */
1750 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1752 if (reg(pRExC_state, 0, &flags) == NULL) {
1753 RExC_precomp = Nullch;
1756 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1758 /* Small enough for pointer-storage convention?
1759 If extralen==0, this means that we will not need long jumps. */
1760 if (RExC_size >= 0x10000L && RExC_extralen)
1761 RExC_size += RExC_extralen;
1764 if (RExC_whilem_seen > 15)
1765 RExC_whilem_seen = 15;
1767 /* Allocate space and initialize. */
1768 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1771 FAIL("Regexp out of space");
1774 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1775 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1778 r->prelen = xend - exp;
1779 r->precomp = savepvn(RExC_precomp, r->prelen);
1781 #ifdef PERL_COPY_ON_WRITE
1782 r->saved_copy = Nullsv;
1784 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1785 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1787 r->substrs = 0; /* Useful during FAIL. */
1788 r->startp = 0; /* Useful during FAIL. */
1789 r->endp = 0; /* Useful during FAIL. */
1791 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1793 r->offsets[0] = RExC_size;
1795 DEBUG_r(PerlIO_printf(Perl_debug_log,
1796 "%s %"UVuf" bytes for offset annotations.\n",
1797 r->offsets ? "Got" : "Couldn't get",
1798 (UV)((2*RExC_size+1) * sizeof(U32))));
1802 /* Second pass: emit code. */
1803 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
1808 RExC_emit_start = r->program;
1809 RExC_emit = r->program;
1810 /* Store the count of eval-groups for security checks: */
1811 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1812 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1814 if (reg(pRExC_state, 0, &flags) == NULL)
1817 /* Dig out information for optimizations. */
1818 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1819 pm->op_pmflags = RExC_flags;
1821 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1822 r->regstclass = NULL;
1823 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1824 r->reganch |= ROPT_NAUGHTY;
1825 scan = r->program + 1; /* First BRANCH. */
1827 /* XXXX To minimize changes to RE engine we always allocate
1828 3-units-long substrs field. */
1829 Newz(1004, r->substrs, 1, struct reg_substr_data);
1831 StructCopy(&zero_scan_data, &data, scan_data_t);
1832 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1833 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1835 STRLEN longest_float_length, longest_fixed_length;
1836 struct regnode_charclass_class ch_class;
1841 /* Skip introductions and multiplicators >= 1. */
1842 while ((OP(first) == OPEN && (sawopen = 1)) ||
1843 /* An OR of *one* alternative - should not happen now. */
1844 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1845 (OP(first) == PLUS) ||
1846 (OP(first) == MINMOD) ||
1847 /* An {n,m} with n>0 */
1848 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1849 if (OP(first) == PLUS)
1852 first += regarglen[(U8)OP(first)];
1853 first = NEXTOPER(first);
1856 /* Starting-point info. */
1858 if (PL_regkind[(U8)OP(first)] == EXACT) {
1859 if (OP(first) == EXACT)
1860 ; /* Empty, get anchored substr later. */
1861 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1862 r->regstclass = first;
1864 else if (strchr((char*)PL_simple,OP(first)))
1865 r->regstclass = first;
1866 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1867 PL_regkind[(U8)OP(first)] == NBOUND)
1868 r->regstclass = first;
1869 else if (PL_regkind[(U8)OP(first)] == BOL) {
1870 r->reganch |= (OP(first) == MBOL
1872 : (OP(first) == SBOL
1875 first = NEXTOPER(first);
1878 else if (OP(first) == GPOS) {
1879 r->reganch |= ROPT_ANCH_GPOS;
1880 first = NEXTOPER(first);
1883 else if (!sawopen && (OP(first) == STAR &&
1884 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1885 !(r->reganch & ROPT_ANCH) )
1887 /* turn .* into ^.* with an implied $*=1 */
1888 int type = OP(NEXTOPER(first));
1890 if (type == REG_ANY)
1891 type = ROPT_ANCH_MBOL;
1893 type = ROPT_ANCH_SBOL;
1895 r->reganch |= type | ROPT_IMPLICIT;
1896 first = NEXTOPER(first);
1899 if (sawplus && (!sawopen || !RExC_sawback)
1900 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1901 /* x+ must match at the 1st pos of run of x's */
1902 r->reganch |= ROPT_SKIP;
1904 /* Scan is after the zeroth branch, first is atomic matcher. */
1905 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1906 (IV)(first - scan + 1)));
1908 * If there's something expensive in the r.e., find the
1909 * longest literal string that must appear and make it the
1910 * regmust. Resolve ties in favor of later strings, since
1911 * the regstart check works with the beginning of the r.e.
1912 * and avoiding duplication strengthens checking. Not a
1913 * strong reason, but sufficient in the absence of others.
1914 * [Now we resolve ties in favor of the earlier string if
1915 * it happens that c_offset_min has been invalidated, since the
1916 * earlier string may buy us something the later one won't.]
1920 data.longest_fixed = newSVpvn("",0);
1921 data.longest_float = newSVpvn("",0);
1922 data.last_found = newSVpvn("",0);
1923 data.longest = &(data.longest_fixed);
1925 if (!r->regstclass) {
1926 cl_init(pRExC_state, &ch_class);
1927 data.start_class = &ch_class;
1928 stclass_flag = SCF_DO_STCLASS_AND;
1929 } else /* XXXX Check for BOUND? */
1931 data.last_closep = &last_close;
1933 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1934 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1935 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1936 && data.last_start_min == 0 && data.last_end > 0
1937 && !RExC_seen_zerolen
1938 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1939 r->reganch |= ROPT_CHECK_ALL;
1940 scan_commit(pRExC_state, &data);
1941 SvREFCNT_dec(data.last_found);
1943 longest_float_length = CHR_SVLEN(data.longest_float);
1944 if (longest_float_length
1945 || (data.flags & SF_FL_BEFORE_EOL
1946 && (!(data.flags & SF_FL_BEFORE_MEOL)
1947 || (RExC_flags & PMf_MULTILINE)))) {
1950 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1951 && data.offset_fixed == data.offset_float_min
1952 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1953 goto remove_float; /* As in (a)+. */
1955 if (SvUTF8(data.longest_float)) {
1956 r->float_utf8 = data.longest_float;
1957 r->float_substr = Nullsv;
1959 r->float_substr = data.longest_float;
1960 r->float_utf8 = Nullsv;
1962 r->float_min_offset = data.offset_float_min;
1963 r->float_max_offset = data.offset_float_max;
1964 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1965 && (!(data.flags & SF_FL_BEFORE_MEOL)
1966 || (RExC_flags & PMf_MULTILINE)));
1967 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1971 r->float_substr = r->float_utf8 = Nullsv;
1972 SvREFCNT_dec(data.longest_float);
1973 longest_float_length = 0;
1976 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1977 if (longest_fixed_length
1978 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1979 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1980 || (RExC_flags & PMf_MULTILINE)))) {
1983 if (SvUTF8(data.longest_fixed)) {
1984 r->anchored_utf8 = data.longest_fixed;
1985 r->anchored_substr = Nullsv;
1987 r->anchored_substr = data.longest_fixed;
1988 r->anchored_utf8 = Nullsv;
1990 r->anchored_offset = data.offset_fixed;
1991 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1992 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1993 || (RExC_flags & PMf_MULTILINE)));
1994 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
1997 r->anchored_substr = r->anchored_utf8 = Nullsv;
1998 SvREFCNT_dec(data.longest_fixed);
1999 longest_fixed_length = 0;
2002 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2003 r->regstclass = NULL;
2004 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2006 && !(data.start_class->flags & ANYOF_EOS)
2007 && !cl_is_anything(data.start_class))
2009 I32 n = add_data(pRExC_state, 1, "f");
2011 New(1006, RExC_rx->data->data[n], 1,
2012 struct regnode_charclass_class);
2013 StructCopy(data.start_class,
2014 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2015 struct regnode_charclass_class);
2016 r->regstclass = (regnode*)RExC_rx->data->data[n];
2017 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2018 PL_regdata = r->data; /* for regprop() */
2019 DEBUG_r({ SV *sv = sv_newmortal();
2020 regprop(sv, (regnode*)data.start_class);
2021 PerlIO_printf(Perl_debug_log,
2022 "synthetic stclass `%s'.\n",
2026 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2027 if (longest_fixed_length > longest_float_length) {
2028 r->check_substr = r->anchored_substr;
2029 r->check_utf8 = r->anchored_utf8;
2030 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2031 if (r->reganch & ROPT_ANCH_SINGLE)
2032 r->reganch |= ROPT_NOSCAN;
2035 r->check_substr = r->float_substr;
2036 r->check_utf8 = r->float_utf8;
2037 r->check_offset_min = data.offset_float_min;
2038 r->check_offset_max = data.offset_float_max;
2040 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2041 This should be changed ASAP! */
2042 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2043 r->reganch |= RE_USE_INTUIT;
2044 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2045 r->reganch |= RE_INTUIT_TAIL;
2049 /* Several toplevels. Best we can is to set minlen. */
2051 struct regnode_charclass_class ch_class;
2054 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2055 scan = r->program + 1;
2056 cl_init(pRExC_state, &ch_class);
2057 data.start_class = &ch_class;
2058 data.last_closep = &last_close;
2059 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2060 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2061 = r->float_substr = r->float_utf8 = Nullsv;
2062 if (!(data.start_class->flags & ANYOF_EOS)
2063 && !cl_is_anything(data.start_class))
2065 I32 n = add_data(pRExC_state, 1, "f");
2067 New(1006, RExC_rx->data->data[n], 1,
2068 struct regnode_charclass_class);
2069 StructCopy(data.start_class,
2070 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2071 struct regnode_charclass_class);
2072 r->regstclass = (regnode*)RExC_rx->data->data[n];
2073 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2074 DEBUG_r({ SV* sv = sv_newmortal();
2075 regprop(sv, (regnode*)data.start_class);
2076 PerlIO_printf(Perl_debug_log,
2077 "synthetic stclass `%s'.\n",
2083 if (RExC_seen & REG_SEEN_GPOS)
2084 r->reganch |= ROPT_GPOS_SEEN;
2085 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2086 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2087 if (RExC_seen & REG_SEEN_EVAL)
2088 r->reganch |= ROPT_EVAL_SEEN;
2089 if (RExC_seen & REG_SEEN_CANY)
2090 r->reganch |= ROPT_CANY_SEEN;
2091 Newz(1002, r->startp, RExC_npar, I32);
2092 Newz(1002, r->endp, RExC_npar, I32);
2093 PL_regdata = r->data; /* for regprop() */
2094 DEBUG_r(regdump(r));
2099 - reg - regular expression, i.e. main body or parenthesized thing
2101 * Caller must absorb opening parenthesis.
2103 * Combining parenthesis handling with the base level of regular expression
2104 * is a trifle forced, but the need to tie the tails of the branches to what
2105 * follows makes it hard to avoid.
2108 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2109 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2111 register regnode *ret; /* Will be the head of the group. */
2112 register regnode *br;
2113 register regnode *lastbr;
2114 register regnode *ender = 0;
2115 register I32 parno = 0;
2116 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2118 /* for (?g), (?gc), and (?o) warnings; warning
2119 about (?c) will warn about (?g) -- japhy */
2121 I32 wastedflags = 0x00,
2124 wasted_gc = 0x02 | 0x04,
2127 char * parse_start = RExC_parse; /* MJD */
2128 char *oregcomp_parse = RExC_parse;
2131 *flagp = 0; /* Tentatively. */
2134 /* Make an OPEN node, if parenthesized. */
2136 if (*RExC_parse == '?') { /* (?...) */
2137 U32 posflags = 0, negflags = 0;
2138 U32 *flagsp = &posflags;
2140 char *seqstart = RExC_parse;
2143 paren = *RExC_parse++;
2144 ret = NULL; /* For look-ahead/behind. */
2146 case '<': /* (?<...) */
2147 RExC_seen |= REG_SEEN_LOOKBEHIND;
2148 if (*RExC_parse == '!')
2150 if (*RExC_parse != '=' && *RExC_parse != '!')
2153 case '=': /* (?=...) */
2154 case '!': /* (?!...) */
2155 RExC_seen_zerolen++;
2156 case ':': /* (?:...) */
2157 case '>': /* (?>...) */
2159 case '$': /* (?$...) */
2160 case '@': /* (?@...) */
2161 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2163 case '#': /* (?#...) */
2164 while (*RExC_parse && *RExC_parse != ')')
2166 if (*RExC_parse != ')')
2167 FAIL("Sequence (?#... not terminated");
2168 nextchar(pRExC_state);
2171 case 'p': /* (?p...) */
2172 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2173 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2175 case '?': /* (??...) */
2177 if (*RExC_parse != '{')
2179 paren = *RExC_parse++;
2181 case '{': /* (?{...}) */
2183 I32 count = 1, n = 0;
2185 char *s = RExC_parse;
2187 OP_4tree *sop, *rop;
2189 RExC_seen_zerolen++;
2190 RExC_seen |= REG_SEEN_EVAL;
2191 while (count && (c = *RExC_parse)) {
2192 if (c == '\\' && RExC_parse[1])
2200 if (*RExC_parse != ')')
2203 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2208 if (RExC_parse - 1 - s)
2209 sv = newSVpvn(s, RExC_parse - 1 - s);
2211 sv = newSVpvn("", 0);
2214 Perl_save_re_context(aTHX);
2215 rop = sv_compile_2op(sv, &sop, "re", &pad);
2216 sop->op_private |= OPpREFCOUNTED;
2217 /* re_dup will OpREFCNT_inc */
2218 OpREFCNT_set(sop, 1);
2221 n = add_data(pRExC_state, 3, "nop");
2222 RExC_rx->data->data[n] = (void*)rop;
2223 RExC_rx->data->data[n+1] = (void*)sop;
2224 RExC_rx->data->data[n+2] = (void*)pad;
2227 else { /* First pass */
2228 if (PL_reginterp_cnt < ++RExC_seen_evals
2229 && PL_curcop != &PL_compiling)
2230 /* No compiled RE interpolated, has runtime
2231 components ===> unsafe. */
2232 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2233 if (PL_tainting && PL_tainted)
2234 FAIL("Eval-group in insecure regular expression");
2237 nextchar(pRExC_state);
2239 ret = reg_node(pRExC_state, LOGICAL);
2242 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2243 /* deal with the length of this later - MJD */
2246 ret = reganode(pRExC_state, EVAL, n);
2247 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2248 Set_Node_Offset(ret, parse_start);
2251 case '(': /* (?(?{...})...) and (?(?=...)...) */
2253 if (RExC_parse[0] == '?') { /* (?(?...)) */
2254 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2255 || RExC_parse[1] == '<'
2256 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2259 ret = reg_node(pRExC_state, LOGICAL);
2262 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2266 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2268 parno = atoi(RExC_parse++);
2270 while (isDIGIT(*RExC_parse))
2272 ret = reganode(pRExC_state, GROUPP, parno);
2274 if ((c = *nextchar(pRExC_state)) != ')')
2275 vFAIL("Switch condition not recognized");
2277 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2278 br = regbranch(pRExC_state, &flags, 1);
2280 br = reganode(pRExC_state, LONGJMP, 0);
2282 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2283 c = *nextchar(pRExC_state);
2287 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2288 regbranch(pRExC_state, &flags, 1);
2289 regtail(pRExC_state, ret, lastbr);
2292 c = *nextchar(pRExC_state);
2297 vFAIL("Switch (?(condition)... contains too many branches");
2298 ender = reg_node(pRExC_state, TAIL);
2299 regtail(pRExC_state, br, ender);
2301 regtail(pRExC_state, lastbr, ender);
2302 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2305 regtail(pRExC_state, ret, ender);
2309 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2313 RExC_parse--; /* for vFAIL to print correctly */
2314 vFAIL("Sequence (? incomplete");
2318 parse_flags: /* (?i) */
2319 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2320 /* (?g), (?gc) and (?o) are useless here
2321 and must be globally applied -- japhy */
2323 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2324 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2325 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2326 if (! (wastedflags & wflagbit) ) {
2327 wastedflags |= wflagbit;
2330 "Useless (%s%c) - %suse /%c modifier",
2331 flagsp == &negflags ? "?-" : "?",
2333 flagsp == &negflags ? "don't " : "",
2339 else if (*RExC_parse == 'c') {
2340 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2341 if (! (wastedflags & wasted_c) ) {
2342 wastedflags |= wasted_gc;
2345 "Useless (%sc) - %suse /gc modifier",
2346 flagsp == &negflags ? "?-" : "?",
2347 flagsp == &negflags ? "don't " : ""
2352 else { pmflag(flagsp, *RExC_parse); }
2356 if (*RExC_parse == '-') {
2358 wastedflags = 0; /* reset so (?g-c) warns twice */
2362 RExC_flags |= posflags;
2363 RExC_flags &= ~negflags;
2364 if (*RExC_parse == ':') {
2370 if (*RExC_parse != ')') {
2372 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2374 nextchar(pRExC_state);
2382 ret = reganode(pRExC_state, OPEN, parno);
2383 Set_Node_Length(ret, 1); /* MJD */
2384 Set_Node_Offset(ret, RExC_parse); /* MJD */
2391 /* Pick up the branches, linking them together. */
2392 parse_start = RExC_parse; /* MJD */
2393 br = regbranch(pRExC_state, &flags, 1);
2394 /* branch_len = (paren != 0); */
2398 if (*RExC_parse == '|') {
2399 if (!SIZE_ONLY && RExC_extralen) {
2400 reginsert(pRExC_state, BRANCHJ, br);
2403 reginsert(pRExC_state, BRANCH, br);
2404 Set_Node_Length(br, paren != 0);
2405 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2409 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2411 else if (paren == ':') {
2412 *flagp |= flags&SIMPLE;
2414 if (open) { /* Starts with OPEN. */
2415 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2417 else if (paren != '?') /* Not Conditional */
2419 *flagp |= flags & (SPSTART | HASWIDTH);
2421 while (*RExC_parse == '|') {
2422 if (!SIZE_ONLY && RExC_extralen) {
2423 ender = reganode(pRExC_state, LONGJMP,0);
2424 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2427 RExC_extralen += 2; /* Account for LONGJMP. */
2428 nextchar(pRExC_state);
2429 br = regbranch(pRExC_state, &flags, 0);
2433 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2437 *flagp |= flags&SPSTART;
2440 if (have_branch || paren != ':') {
2441 /* Make a closing node, and hook it on the end. */
2444 ender = reg_node(pRExC_state, TAIL);
2447 ender = reganode(pRExC_state, CLOSE, parno);
2448 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2449 Set_Node_Length(ender,1); /* MJD */
2455 *flagp &= ~HASWIDTH;
2458 ender = reg_node(pRExC_state, SUCCEED);
2461 ender = reg_node(pRExC_state, END);
2464 regtail(pRExC_state, lastbr, ender);
2467 /* Hook the tails of the branches to the closing node. */
2468 for (br = ret; br != NULL; br = regnext(br)) {
2469 regoptail(pRExC_state, br, ender);
2476 static char parens[] = "=!<,>";
2478 if (paren && (p = strchr(parens, paren))) {
2479 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2480 int flag = (p - parens) > 1;
2483 node = SUSPEND, flag = 0;
2484 reginsert(pRExC_state, node,ret);
2485 Set_Node_Offset(ret, oregcomp_parse);
2486 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 2);
2488 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2492 /* Check for proper termination. */
2494 RExC_flags = oregflags;
2495 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2496 RExC_parse = oregcomp_parse;
2497 vFAIL("Unmatched (");
2500 else if (!paren && RExC_parse < RExC_end) {
2501 if (*RExC_parse == ')') {
2503 vFAIL("Unmatched )");
2506 FAIL("Junk on end of regexp"); /* "Can't happen". */
2514 - regbranch - one alternative of an | operator
2516 * Implements the concatenation operator.
2519 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2521 register regnode *ret;
2522 register regnode *chain = NULL;
2523 register regnode *latest;
2524 I32 flags = 0, c = 0;
2529 if (!SIZE_ONLY && RExC_extralen)
2530 ret = reganode(pRExC_state, BRANCHJ,0);
2532 ret = reg_node(pRExC_state, BRANCH);
2533 Set_Node_Length(ret, 1);
2537 if (!first && SIZE_ONLY)
2538 RExC_extralen += 1; /* BRANCHJ */
2540 *flagp = WORST; /* Tentatively. */
2543 nextchar(pRExC_state);
2544 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2546 latest = regpiece(pRExC_state, &flags);
2547 if (latest == NULL) {
2548 if (flags & TRYAGAIN)
2552 else if (ret == NULL)
2554 *flagp |= flags&HASWIDTH;
2555 if (chain == NULL) /* First piece. */
2556 *flagp |= flags&SPSTART;
2559 regtail(pRExC_state, chain, latest);
2564 if (chain == NULL) { /* Loop ran zero times. */
2565 chain = reg_node(pRExC_state, NOTHING);
2570 *flagp |= flags&SIMPLE;
2577 - regpiece - something followed by possible [*+?]
2579 * Note that the branching code sequences used for ? and the general cases
2580 * of * and + are somewhat optimized: they use the same NOTHING node as
2581 * both the endmarker for their branch list and the body of the last branch.
2582 * It might seem that this node could be dispensed with entirely, but the
2583 * endmarker role is not redundant.
2586 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2588 register regnode *ret;
2590 register char *next;
2592 char *origparse = RExC_parse;
2595 I32 max = REG_INFTY;
2598 ret = regatom(pRExC_state, &flags);
2600 if (flags & TRYAGAIN)
2607 if (op == '{' && regcurly(RExC_parse)) {
2608 parse_start = RExC_parse; /* MJD */
2609 next = RExC_parse + 1;
2611 while (isDIGIT(*next) || *next == ',') {
2620 if (*next == '}') { /* got one */
2624 min = atoi(RExC_parse);
2628 maxpos = RExC_parse;
2630 if (!max && *maxpos != '0')
2631 max = REG_INFTY; /* meaning "infinity" */
2632 else if (max >= REG_INFTY)
2633 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2635 nextchar(pRExC_state);
2638 if ((flags&SIMPLE)) {
2639 RExC_naughty += 2 + RExC_naughty / 2;
2640 reginsert(pRExC_state, CURLY, ret);
2641 Set_Node_Offset(ret, parse_start+1); /* MJD */
2642 Set_Node_Cur_Length(ret);
2645 regnode *w = reg_node(pRExC_state, WHILEM);
2648 regtail(pRExC_state, ret, w);
2649 if (!SIZE_ONLY && RExC_extralen) {
2650 reginsert(pRExC_state, LONGJMP,ret);
2651 reginsert(pRExC_state, NOTHING,ret);
2652 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2654 reginsert(pRExC_state, CURLYX,ret);
2656 Set_Node_Offset(ret, parse_start+1);
2657 Set_Node_Length(ret,
2658 op == '{' ? (RExC_parse - parse_start) : 1);
2660 if (!SIZE_ONLY && RExC_extralen)
2661 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2662 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2664 RExC_whilem_seen++, RExC_extralen += 3;
2665 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2673 if (max && max < min)
2674 vFAIL("Can't do {n,m} with n > m");
2676 ARG1_SET(ret, (U16)min);
2677 ARG2_SET(ret, (U16)max);
2689 #if 0 /* Now runtime fix should be reliable. */
2691 /* if this is reinstated, don't forget to put this back into perldiag:
2693 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2695 (F) The part of the regexp subject to either the * or + quantifier
2696 could match an empty string. The {#} shows in the regular
2697 expression about where the problem was discovered.
2701 if (!(flags&HASWIDTH) && op != '?')
2702 vFAIL("Regexp *+ operand could be empty");
2705 parse_start = RExC_parse;
2706 nextchar(pRExC_state);
2708 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2710 if (op == '*' && (flags&SIMPLE)) {
2711 reginsert(pRExC_state, STAR, ret);
2715 else if (op == '*') {
2719 else if (op == '+' && (flags&SIMPLE)) {
2720 reginsert(pRExC_state, PLUS, ret);
2724 else if (op == '+') {
2728 else if (op == '?') {
2733 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2735 "%.*s matches null string many times",
2736 RExC_parse - origparse,
2740 if (*RExC_parse == '?') {
2741 nextchar(pRExC_state);
2742 reginsert(pRExC_state, MINMOD, ret);
2743 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2745 if (ISMULT2(RExC_parse)) {
2747 vFAIL("Nested quantifiers");
2754 - regatom - the lowest level
2756 * Optimization: gobbles an entire sequence of ordinary characters so that
2757 * it can turn them into a single node, which is smaller to store and
2758 * faster to run. Backslashed characters are exceptions, each becoming a
2759 * separate node; the code is simpler that way and it's not worth fixing.
2761 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2763 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2765 register regnode *ret = 0;
2767 char *parse_start = 0;
2769 *flagp = WORST; /* Tentatively. */
2772 switch (*RExC_parse) {
2774 RExC_seen_zerolen++;
2775 nextchar(pRExC_state);
2776 if (RExC_flags & PMf_MULTILINE)
2777 ret = reg_node(pRExC_state, MBOL);
2778 else if (RExC_flags & PMf_SINGLELINE)
2779 ret = reg_node(pRExC_state, SBOL);
2781 ret = reg_node(pRExC_state, BOL);
2782 Set_Node_Length(ret, 1); /* MJD */
2785 nextchar(pRExC_state);
2787 RExC_seen_zerolen++;
2788 if (RExC_flags & PMf_MULTILINE)
2789 ret = reg_node(pRExC_state, MEOL);
2790 else if (RExC_flags & PMf_SINGLELINE)
2791 ret = reg_node(pRExC_state, SEOL);
2793 ret = reg_node(pRExC_state, EOL);
2794 Set_Node_Length(ret, 1); /* MJD */
2797 nextchar(pRExC_state);
2798 if (RExC_flags & PMf_SINGLELINE)
2799 ret = reg_node(pRExC_state, SANY);
2801 ret = reg_node(pRExC_state, REG_ANY);
2802 *flagp |= HASWIDTH|SIMPLE;
2804 Set_Node_Length(ret, 1); /* MJD */
2808 char *oregcomp_parse = ++RExC_parse;
2809 ret = regclass(pRExC_state);
2810 if (*RExC_parse != ']') {
2811 RExC_parse = oregcomp_parse;
2812 vFAIL("Unmatched [");
2814 nextchar(pRExC_state);
2815 *flagp |= HASWIDTH|SIMPLE;
2816 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2820 nextchar(pRExC_state);
2821 ret = reg(pRExC_state, 1, &flags);
2823 if (flags & TRYAGAIN) {
2824 if (RExC_parse == RExC_end) {
2825 /* Make parent create an empty node if needed. */
2833 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2837 if (flags & TRYAGAIN) {
2841 vFAIL("Internal urp");
2842 /* Supposed to be caught earlier. */
2845 if (!regcurly(RExC_parse)) {
2854 vFAIL("Quantifier follows nothing");
2857 switch (*++RExC_parse) {
2859 RExC_seen_zerolen++;
2860 ret = reg_node(pRExC_state, SBOL);
2862 nextchar(pRExC_state);
2863 Set_Node_Length(ret, 2); /* MJD */
2866 ret = reg_node(pRExC_state, GPOS);
2867 RExC_seen |= REG_SEEN_GPOS;
2869 nextchar(pRExC_state);
2870 Set_Node_Length(ret, 2); /* MJD */
2873 ret = reg_node(pRExC_state, SEOL);
2875 RExC_seen_zerolen++; /* Do not optimize RE away */
2876 nextchar(pRExC_state);
2879 ret = reg_node(pRExC_state, EOS);
2881 RExC_seen_zerolen++; /* Do not optimize RE away */
2882 nextchar(pRExC_state);
2883 Set_Node_Length(ret, 2); /* MJD */
2886 ret = reg_node(pRExC_state, CANY);
2887 RExC_seen |= REG_SEEN_CANY;
2888 *flagp |= HASWIDTH|SIMPLE;
2889 nextchar(pRExC_state);
2890 Set_Node_Length(ret, 2); /* MJD */
2893 ret = reg_node(pRExC_state, CLUMP);
2895 nextchar(pRExC_state);
2896 Set_Node_Length(ret, 2); /* MJD */
2899 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
2900 *flagp |= HASWIDTH|SIMPLE;
2901 nextchar(pRExC_state);
2902 Set_Node_Length(ret, 2); /* MJD */
2905 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
2906 *flagp |= HASWIDTH|SIMPLE;
2907 nextchar(pRExC_state);
2908 Set_Node_Length(ret, 2); /* MJD */
2911 RExC_seen_zerolen++;
2912 RExC_seen |= REG_SEEN_LOOKBEHIND;
2913 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
2915 nextchar(pRExC_state);
2916 Set_Node_Length(ret, 2); /* MJD */
2919 RExC_seen_zerolen++;
2920 RExC_seen |= REG_SEEN_LOOKBEHIND;
2921 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
2923 nextchar(pRExC_state);
2924 Set_Node_Length(ret, 2); /* MJD */
2927 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
2928 *flagp |= HASWIDTH|SIMPLE;
2929 nextchar(pRExC_state);
2930 Set_Node_Length(ret, 2); /* MJD */
2933 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
2934 *flagp |= HASWIDTH|SIMPLE;
2935 nextchar(pRExC_state);
2936 Set_Node_Length(ret, 2); /* MJD */
2939 ret = reg_node(pRExC_state, DIGIT);
2940 *flagp |= HASWIDTH|SIMPLE;
2941 nextchar(pRExC_state);
2942 Set_Node_Length(ret, 2); /* MJD */
2945 ret = reg_node(pRExC_state, NDIGIT);
2946 *flagp |= HASWIDTH|SIMPLE;
2947 nextchar(pRExC_state);
2948 Set_Node_Length(ret, 2); /* MJD */
2953 char* oldregxend = RExC_end;
2954 char* parse_start = RExC_parse - 2;
2956 if (RExC_parse[1] == '{') {
2957 /* a lovely hack--pretend we saw [\pX] instead */
2958 RExC_end = strchr(RExC_parse, '}');
2960 U8 c = (U8)*RExC_parse;
2962 RExC_end = oldregxend;
2963 vFAIL2("Missing right brace on \\%c{}", c);
2968 RExC_end = RExC_parse + 2;
2969 if (RExC_end > oldregxend)
2970 RExC_end = oldregxend;
2974 ret = regclass(pRExC_state);
2976 RExC_end = oldregxend;
2979 Set_Node_Offset(ret, parse_start + 2);
2980 Set_Node_Cur_Length(ret);
2981 nextchar(pRExC_state);
2982 *flagp |= HASWIDTH|SIMPLE;
2995 case '1': case '2': case '3': case '4':
2996 case '5': case '6': case '7': case '8': case '9':
2998 I32 num = atoi(RExC_parse);
3000 if (num > 9 && num >= RExC_npar)
3003 char * parse_start = RExC_parse - 1; /* MJD */
3004 while (isDIGIT(*RExC_parse))
3007 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3008 vFAIL("Reference to nonexistent group");
3010 ret = reganode(pRExC_state,
3011 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3015 /* override incorrect value set in reganode MJD */
3016 Set_Node_Offset(ret, parse_start+1);
3017 Set_Node_Cur_Length(ret); /* MJD */
3019 nextchar(pRExC_state);
3024 if (RExC_parse >= RExC_end)
3025 FAIL("Trailing \\");
3028 /* Do not generate `unrecognized' warnings here, we fall
3029 back into the quick-grab loop below */
3035 if (RExC_flags & PMf_EXTENDED) {
3036 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3037 if (RExC_parse < RExC_end)
3043 register STRLEN len;
3049 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3051 parse_start = RExC_parse - 1;
3057 ret = reg_node(pRExC_state,
3058 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3060 for (len = 0, p = RExC_parse - 1;
3061 len < 127 && p < RExC_end;
3066 if (RExC_flags & PMf_EXTENDED)
3067 p = regwhite(p, RExC_end);
3114 ender = ASCII_TO_NATIVE('\033');
3118 ender = ASCII_TO_NATIVE('\007');
3123 char* e = strchr(p, '}');
3127 vFAIL("Missing right brace on \\x{}");
3130 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3131 | PERL_SCAN_DISALLOW_PREFIX;
3133 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3136 /* numlen is generous */
3137 if (numlen + len >= 127) {
3145 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3147 ender = grok_hex(p, &numlen, &flags, NULL);
3153 ender = UCHARAT(p++);
3154 ender = toCTRL(ender);
3156 case '0': case '1': case '2': case '3':case '4':
3157 case '5': case '6': case '7': case '8':case '9':
3159 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3162 ender = grok_oct(p, &numlen, &flags, NULL);
3172 FAIL("Trailing \\");
3175 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3176 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3177 goto normal_default;
3182 if (UTF8_IS_START(*p) && UTF) {
3183 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3191 if (RExC_flags & PMf_EXTENDED)
3192 p = regwhite(p, RExC_end);
3194 /* Prime the casefolded buffer. */
3195 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3197 if (ISMULT2(p)) { /* Back off on ?+*. */
3204 /* Emit all the Unicode characters. */
3205 for (foldbuf = tmpbuf;
3207 foldlen -= numlen) {
3208 ender = utf8_to_uvchr(foldbuf, &numlen);
3210 reguni(pRExC_state, ender, s, &unilen);
3213 /* In EBCDIC the numlen
3214 * and unilen can differ. */
3216 if (numlen >= foldlen)
3220 break; /* "Can't happen." */
3224 reguni(pRExC_state, ender, s, &unilen);
3233 REGC((char)ender, s++);
3241 /* Emit all the Unicode characters. */
3242 for (foldbuf = tmpbuf;
3244 foldlen -= numlen) {
3245 ender = utf8_to_uvchr(foldbuf, &numlen);
3247 reguni(pRExC_state, ender, s, &unilen);
3250 /* In EBCDIC the numlen
3251 * and unilen can differ. */
3253 if (numlen >= foldlen)
3261 reguni(pRExC_state, ender, s, &unilen);
3270 REGC((char)ender, s++);
3274 Set_Node_Cur_Length(ret); /* MJD */
3275 nextchar(pRExC_state);
3277 /* len is STRLEN which is unsigned, need to copy to signed */
3280 vFAIL("Internal disaster");
3289 RExC_size += STR_SZ(len);
3291 RExC_emit += STR_SZ(len);
3296 /* If the encoding pragma is in effect recode the text of
3297 * any EXACT-kind nodes. */
3298 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3299 STRLEN oldlen = STR_LEN(ret);
3300 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3304 if (sv_utf8_downgrade(sv, TRUE)) {
3305 char *s = sv_recode_to_utf8(sv, PL_encoding);
3306 STRLEN newlen = SvCUR(sv);
3311 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3312 (int)oldlen, STRING(ret),
3314 Copy(s, STRING(ret), newlen, char);
3315 STR_LEN(ret) += newlen - oldlen;
3316 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3318 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3326 S_regwhite(pTHX_ char *p, char *e)
3331 else if (*p == '#') {
3334 } while (p < e && *p != '\n');
3342 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3343 Character classes ([:foo:]) can also be negated ([:^foo:]).
3344 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3345 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3346 but trigger failures because they are currently unimplemented. */
3348 #define POSIXCC_DONE(c) ((c) == ':')
3349 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3350 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3353 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3356 I32 namedclass = OOB_NAMEDCLASS;
3358 if (value == '[' && RExC_parse + 1 < RExC_end &&
3359 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3360 POSIXCC(UCHARAT(RExC_parse))) {
3361 char c = UCHARAT(RExC_parse);
3362 char* s = RExC_parse++;
3364 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3366 if (RExC_parse == RExC_end)
3367 /* Grandfather lone [:, [=, [. */
3370 char* t = RExC_parse++; /* skip over the c */
3372 if (UCHARAT(RExC_parse) == ']') {
3373 RExC_parse++; /* skip over the ending ] */
3376 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3377 I32 skip = 5; /* the most common skip */
3381 if (strnEQ(posixcc, "alnum", 5))
3383 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3384 else if (strnEQ(posixcc, "alpha", 5))
3386 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3387 else if (strnEQ(posixcc, "ascii", 5))
3389 complement ? ANYOF_NASCII : ANYOF_ASCII;
3392 if (strnEQ(posixcc, "blank", 5))
3394 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3397 if (strnEQ(posixcc, "cntrl", 5))
3399 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3402 if (strnEQ(posixcc, "digit", 5))
3404 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3407 if (strnEQ(posixcc, "graph", 5))
3409 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3412 if (strnEQ(posixcc, "lower", 5))
3414 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3417 if (strnEQ(posixcc, "print", 5))
3419 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3420 else if (strnEQ(posixcc, "punct", 5))
3422 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3425 if (strnEQ(posixcc, "space", 5))
3427 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3430 if (strnEQ(posixcc, "upper", 5))
3432 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3434 case 'w': /* this is not POSIX, this is the Perl \w */
3435 if (strnEQ(posixcc, "word", 4)) {
3437 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3442 if (strnEQ(posixcc, "xdigit", 6)) {
3444 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3449 if (namedclass == OOB_NAMEDCLASS ||
3450 posixcc[skip] != ':' ||
3451 posixcc[skip+1] != ']')
3453 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3456 } else if (!SIZE_ONLY) {
3457 /* [[=foo=]] and [[.foo.]] are still future. */
3459 /* adjust RExC_parse so the warning shows after
3461 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3463 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3466 /* Maternal grandfather:
3467 * "[:" ending in ":" but not in ":]" */
3477 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3479 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3480 char *s = RExC_parse;
3483 while(*s && isALNUM(*s))
3485 if (*s && c == *s && s[1] == ']') {
3486 if (ckWARN(WARN_REGEXP))
3488 "POSIX syntax [%c %c] belongs inside character classes",
3491 /* [[=foo=]] and [[.foo.]] are still future. */
3492 if (POSIXCC_NOTYET(c)) {
3493 /* adjust RExC_parse so the error shows after
3495 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3497 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3504 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3507 register UV nextvalue;
3508 register IV prevvalue = OOB_UNICODE;
3509 register IV range = 0;
3510 register regnode *ret;
3513 char *rangebegin = 0;
3514 bool need_class = 0;
3515 SV *listsv = Nullsv;
3518 bool optimize_invert = TRUE;
3519 AV* unicode_alternate = 0;
3521 UV literal_endpoint = 0;
3524 ret = reganode(pRExC_state, ANYOF, 0);
3527 ANYOF_FLAGS(ret) = 0;
3529 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3533 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3537 RExC_size += ANYOF_SKIP;
3539 RExC_emit += ANYOF_SKIP;
3541 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3543 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3544 ANYOF_BITMAP_ZERO(ret);
3545 listsv = newSVpvn("# comment\n", 10);
3548 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3550 if (!SIZE_ONLY && POSIXCC(nextvalue))
3551 checkposixcc(pRExC_state);
3553 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3554 if (UCHARAT(RExC_parse) == ']')
3557 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3561 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3564 rangebegin = RExC_parse;
3566 value = utf8n_to_uvchr((U8*)RExC_parse,
3567 RExC_end - RExC_parse,
3569 RExC_parse += numlen;
3572 value = UCHARAT(RExC_parse++);
3573 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3574 if (value == '[' && POSIXCC(nextvalue))
3575 namedclass = regpposixcc(pRExC_state, value);
3576 else if (value == '\\') {
3578 value = utf8n_to_uvchr((U8*)RExC_parse,
3579 RExC_end - RExC_parse,
3581 RExC_parse += numlen;
3584 value = UCHARAT(RExC_parse++);
3585 /* Some compilers cannot handle switching on 64-bit integer
3586 * values, therefore value cannot be an UV. Yes, this will
3587 * be a problem later if we want switch on Unicode.
3588 * A similar issue a little bit later when switching on
3589 * namedclass. --jhi */
3590 switch ((I32)value) {
3591 case 'w': namedclass = ANYOF_ALNUM; break;
3592 case 'W': namedclass = ANYOF_NALNUM; break;
3593 case 's': namedclass = ANYOF_SPACE; break;
3594 case 'S': namedclass = ANYOF_NSPACE; break;
3595 case 'd': namedclass = ANYOF_DIGIT; break;
3596 case 'D': namedclass = ANYOF_NDIGIT; break;
3599 if (RExC_parse >= RExC_end)
3600 vFAIL2("Empty \\%c{}", (U8)value);
3601 if (*RExC_parse == '{') {
3603 e = strchr(RExC_parse++, '}');
3605 vFAIL2("Missing right brace on \\%c{}", c);
3606 while (isSPACE(UCHARAT(RExC_parse)))
3608 if (e == RExC_parse)
3609 vFAIL2("Empty \\%c{}", c);
3611 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3619 if (UCHARAT(RExC_parse) == '^') {
3622 value = value == 'p' ? 'P' : 'p'; /* toggle */
3623 while (isSPACE(UCHARAT(RExC_parse))) {
3629 Perl_sv_catpvf(aTHX_ listsv,
3630 "+utf8::%.*s\n", (int)n, RExC_parse);
3632 Perl_sv_catpvf(aTHX_ listsv,
3633 "!utf8::%.*s\n", (int)n, RExC_parse);
3636 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3638 case 'n': value = '\n'; break;
3639 case 'r': value = '\r'; break;
3640 case 't': value = '\t'; break;
3641 case 'f': value = '\f'; break;
3642 case 'b': value = '\b'; break;
3643 case 'e': value = ASCII_TO_NATIVE('\033');break;
3644 case 'a': value = ASCII_TO_NATIVE('\007');break;
3646 if (*RExC_parse == '{') {
3647 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3648 | PERL_SCAN_DISALLOW_PREFIX;
3649 e = strchr(RExC_parse++, '}');
3651 vFAIL("Missing right brace on \\x{}");
3653 numlen = e - RExC_parse;
3654 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3658 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3660 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3661 RExC_parse += numlen;
3665 value = UCHARAT(RExC_parse++);
3666 value = toCTRL(value);
3668 case '0': case '1': case '2': case '3': case '4':
3669 case '5': case '6': case '7': case '8': case '9':
3673 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3674 RExC_parse += numlen;
3678 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3680 "Unrecognized escape \\%c in character class passed through",
3684 } /* end of \blah */
3690 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3692 if (!SIZE_ONLY && !need_class)
3693 ANYOF_CLASS_ZERO(ret);
3697 /* a bad range like a-\d, a-[:digit:] ? */
3700 if (ckWARN(WARN_REGEXP))
3702 "False [] range \"%*.*s\"",
3703 RExC_parse - rangebegin,
3704 RExC_parse - rangebegin,
3706 if (prevvalue < 256) {
3707 ANYOF_BITMAP_SET(ret, prevvalue);
3708 ANYOF_BITMAP_SET(ret, '-');
3711 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3712 Perl_sv_catpvf(aTHX_ listsv,
3713 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3717 range = 0; /* this was not a true range */
3721 if (namedclass > OOB_NAMEDCLASS)
3722 optimize_invert = FALSE;
3723 /* Possible truncation here but in some 64-bit environments
3724 * the compiler gets heartburn about switch on 64-bit values.
3725 * A similar issue a little earlier when switching on value.
3727 switch ((I32)namedclass) {
3730 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3732 for (value = 0; value < 256; value++)
3734 ANYOF_BITMAP_SET(ret, value);
3736 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
3740 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3742 for (value = 0; value < 256; value++)
3743 if (!isALNUM(value))
3744 ANYOF_BITMAP_SET(ret, value);
3746 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3750 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3752 for (value = 0; value < 256; value++)
3753 if (isALNUMC(value))
3754 ANYOF_BITMAP_SET(ret, value);
3756 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3760 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3762 for (value = 0; value < 256; value++)
3763 if (!isALNUMC(value))
3764 ANYOF_BITMAP_SET(ret, value);
3766 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3770 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3772 for (value = 0; value < 256; value++)
3774 ANYOF_BITMAP_SET(ret, value);
3776 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3780 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3782 for (value = 0; value < 256; value++)
3783 if (!isALPHA(value))
3784 ANYOF_BITMAP_SET(ret, value);
3786 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3790 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3793 for (value = 0; value < 128; value++)
3794 ANYOF_BITMAP_SET(ret, value);
3796 for (value = 0; value < 256; value++) {
3798 ANYOF_BITMAP_SET(ret, value);
3802 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3806 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3809 for (value = 128; value < 256; value++)
3810 ANYOF_BITMAP_SET(ret, value);
3812 for (value = 0; value < 256; value++) {
3813 if (!isASCII(value))
3814 ANYOF_BITMAP_SET(ret, value);
3818 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3822 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3824 for (value = 0; value < 256; value++)
3826 ANYOF_BITMAP_SET(ret, value);
3828 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3832 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3834 for (value = 0; value < 256; value++)
3835 if (!isBLANK(value))
3836 ANYOF_BITMAP_SET(ret, value);
3838 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3842 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3844 for (value = 0; value < 256; value++)
3846 ANYOF_BITMAP_SET(ret, value);
3848 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3852 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3854 for (value = 0; value < 256; value++)
3855 if (!isCNTRL(value))
3856 ANYOF_BITMAP_SET(ret, value);
3858 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3862 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3864 /* consecutive digits assumed */
3865 for (value = '0'; value <= '9'; value++)
3866 ANYOF_BITMAP_SET(ret, value);
3868 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3872 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3874 /* consecutive digits assumed */
3875 for (value = 0; value < '0'; value++)
3876 ANYOF_BITMAP_SET(ret, value);
3877 for (value = '9' + 1; value < 256; value++)
3878 ANYOF_BITMAP_SET(ret, value);
3880 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3884 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3886 for (value = 0; value < 256; value++)
3888 ANYOF_BITMAP_SET(ret, value);
3890 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3894 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3896 for (value = 0; value < 256; value++)
3897 if (!isGRAPH(value))
3898 ANYOF_BITMAP_SET(ret, value);
3900 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3904 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3906 for (value = 0; value < 256; value++)
3908 ANYOF_BITMAP_SET(ret, value);
3910 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3914 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3916 for (value = 0; value < 256; value++)
3917 if (!isLOWER(value))
3918 ANYOF_BITMAP_SET(ret, value);
3920 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3924 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3926 for (value = 0; value < 256; value++)
3928 ANYOF_BITMAP_SET(ret, value);
3930 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3934 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3936 for (value = 0; value < 256; value++)
3937 if (!isPRINT(value))
3938 ANYOF_BITMAP_SET(ret, value);
3940 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3944 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3946 for (value = 0; value < 256; value++)
3947 if (isPSXSPC(value))
3948 ANYOF_BITMAP_SET(ret, value);
3950 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3954 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3956 for (value = 0; value < 256; value++)
3957 if (!isPSXSPC(value))
3958 ANYOF_BITMAP_SET(ret, value);
3960 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3964 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3966 for (value = 0; value < 256; value++)
3968 ANYOF_BITMAP_SET(ret, value);
3970 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3974 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3976 for (value = 0; value < 256; value++)
3977 if (!isPUNCT(value))
3978 ANYOF_BITMAP_SET(ret, value);
3980 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3984 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3986 for (value = 0; value < 256; value++)
3988 ANYOF_BITMAP_SET(ret, value);
3990 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3994 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3996 for (value = 0; value < 256; value++)
3997 if (!isSPACE(value))
3998 ANYOF_BITMAP_SET(ret, value);
4000 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4004 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4006 for (value = 0; value < 256; value++)
4008 ANYOF_BITMAP_SET(ret, value);
4010 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4014 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4016 for (value = 0; value < 256; value++)
4017 if (!isUPPER(value))
4018 ANYOF_BITMAP_SET(ret, value);
4020 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4024 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4026 for (value = 0; value < 256; value++)
4027 if (isXDIGIT(value))
4028 ANYOF_BITMAP_SET(ret, value);
4030 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4034 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4036 for (value = 0; value < 256; value++)
4037 if (!isXDIGIT(value))
4038 ANYOF_BITMAP_SET(ret, value);
4040 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4043 vFAIL("Invalid [::] class");
4047 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4050 } /* end of namedclass \blah */
4053 if (prevvalue > (IV)value) /* b-a */ {
4054 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4055 RExC_parse - rangebegin,
4056 RExC_parse - rangebegin,
4058 range = 0; /* not a valid range */
4062 prevvalue = value; /* save the beginning of the range */
4063 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4064 RExC_parse[1] != ']') {
4067 /* a bad range like \w-, [:word:]- ? */
4068 if (namedclass > OOB_NAMEDCLASS) {
4069 if (ckWARN(WARN_REGEXP))
4071 "False [] range \"%*.*s\"",
4072 RExC_parse - rangebegin,
4073 RExC_parse - rangebegin,
4076 ANYOF_BITMAP_SET(ret, '-');
4078 range = 1; /* yeah, it's a range! */
4079 continue; /* but do it the next time */
4083 /* now is the next time */
4087 if (prevvalue < 256) {
4088 IV ceilvalue = value < 256 ? value : 255;
4091 /* In EBCDIC [\x89-\x91] should include
4092 * the \x8e but [i-j] should not. */
4093 if (literal_endpoint == 2 &&
4094 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4095 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4097 if (isLOWER(prevvalue)) {
4098 for (i = prevvalue; i <= ceilvalue; i++)
4100 ANYOF_BITMAP_SET(ret, i);
4102 for (i = prevvalue; i <= ceilvalue; i++)
4104 ANYOF_BITMAP_SET(ret, i);
4109 for (i = prevvalue; i <= ceilvalue; i++)
4110 ANYOF_BITMAP_SET(ret, i);
4112 if (value > 255 || UTF) {
4113 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4114 UV natvalue = NATIVE_TO_UNI(value);
4116 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4117 if (prevnatvalue < natvalue) { /* what about > ? */
4118 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4119 prevnatvalue, natvalue);
4121 else if (prevnatvalue == natvalue) {
4122 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4124 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4126 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4128 /* If folding and foldable and a single
4129 * character, insert also the folded version
4130 * to the charclass. */
4132 if (foldlen == (STRLEN)UNISKIP(f))
4133 Perl_sv_catpvf(aTHX_ listsv,
4136 /* Any multicharacter foldings
4137 * require the following transform:
4138 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4139 * where E folds into "pq" and F folds
4140 * into "rst", all other characters
4141 * fold to single characters. We save
4142 * away these multicharacter foldings,
4143 * to be later saved as part of the
4144 * additional "s" data. */
4147 if (!unicode_alternate)
4148 unicode_alternate = newAV();
4149 sv = newSVpvn((char*)foldbuf, foldlen);
4151 av_push(unicode_alternate, sv);
4155 /* If folding and the value is one of the Greek
4156 * sigmas insert a few more sigmas to make the
4157 * folding rules of the sigmas to work right.
4158 * Note that not all the possible combinations
4159 * are handled here: some of them are handled
4160 * by the standard folding rules, and some of
4161 * them (literal or EXACTF cases) are handled
4162 * during runtime in regexec.c:S_find_byclass(). */
4163 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4164 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4165 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4166 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4167 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4169 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4170 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4171 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4176 literal_endpoint = 0;
4180 range = 0; /* this range (if it was one) is done now */
4184 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4186 RExC_size += ANYOF_CLASS_ADD_SKIP;
4188 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4191 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4193 /* If the only flag is folding (plus possibly inversion). */
4194 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4196 for (value = 0; value < 256; ++value) {
4197 if (ANYOF_BITMAP_TEST(ret, value)) {
4198 UV fold = PL_fold[value];
4201 ANYOF_BITMAP_SET(ret, fold);
4204 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4207 /* optimize inverted simple patterns (e.g. [^a-z]) */
4208 if (!SIZE_ONLY && optimize_invert &&
4209 /* If the only flag is inversion. */
4210 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4211 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4212 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4213 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4220 /* The 0th element stores the character class description
4221 * in its textual form: used later (regexec.c:Perl_regclass_swash())
4222 * to initialize the appropriate swash (which gets stored in
4223 * the 1st element), and also useful for dumping the regnode.
4224 * The 2nd element stores the multicharacter foldings,
4225 * used later (regexec.c:S_reginclass()). */
4226 av_store(av, 0, listsv);
4227 av_store(av, 1, NULL);
4228 av_store(av, 2, (SV*)unicode_alternate);
4229 rv = newRV_noinc((SV*)av);
4230 n = add_data(pRExC_state, 1, "s");
4231 RExC_rx->data->data[n] = (void*)rv;
4239 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4241 char* retval = RExC_parse++;
4244 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4245 RExC_parse[2] == '#') {
4246 while (*RExC_parse && *RExC_parse != ')')
4251 if (RExC_flags & PMf_EXTENDED) {
4252 if (isSPACE(*RExC_parse)) {
4256 else if (*RExC_parse == '#') {
4257 while (*RExC_parse && *RExC_parse != '\n')
4268 - reg_node - emit a node
4270 STATIC regnode * /* Location. */
4271 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4273 register regnode *ret;
4274 register regnode *ptr;
4278 SIZE_ALIGN(RExC_size);
4283 NODE_ALIGN_FILL(ret);
4285 FILL_ADVANCE_NODE(ptr, op);
4286 if (RExC_offsets) { /* MJD */
4287 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4288 "reg_node", __LINE__,
4290 RExC_emit - RExC_emit_start > RExC_offsets[0]
4291 ? "Overwriting end of array!\n" : "OK",
4292 RExC_emit - RExC_emit_start,
4293 RExC_parse - RExC_start,
4295 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4304 - reganode - emit a node with an argument
4306 STATIC regnode * /* Location. */
4307 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4309 register regnode *ret;
4310 register regnode *ptr;
4314 SIZE_ALIGN(RExC_size);
4319 NODE_ALIGN_FILL(ret);
4321 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4322 if (RExC_offsets) { /* MJD */
4323 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4327 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4328 "Overwriting end of array!\n" : "OK",
4329 RExC_emit - RExC_emit_start,
4330 RExC_parse - RExC_start,
4332 Set_Cur_Node_Offset;
4341 - reguni - emit (if appropriate) a Unicode character
4344 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4346 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4350 - reginsert - insert an operator in front of already-emitted operand
4352 * Means relocating the operand.
4355 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4357 register regnode *src;
4358 register regnode *dst;
4359 register regnode *place;
4360 register int offset = regarglen[(U8)op];
4362 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4365 RExC_size += NODE_STEP_REGNODE + offset;
4370 RExC_emit += NODE_STEP_REGNODE + offset;
4372 while (src > opnd) {
4373 StructCopy(--src, --dst, regnode);
4374 if (RExC_offsets) { /* MJD 20010112 */
4375 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4379 dst - RExC_emit_start > RExC_offsets[0]
4380 ? "Overwriting end of array!\n" : "OK",
4381 src - RExC_emit_start,
4382 dst - RExC_emit_start,
4384 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4385 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4390 place = opnd; /* Op node, where operand used to be. */
4391 if (RExC_offsets) { /* MJD */
4392 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4396 place - RExC_emit_start > RExC_offsets[0]
4397 ? "Overwriting end of array!\n" : "OK",
4398 place - RExC_emit_start,
4399 RExC_parse - RExC_start,
4401 Set_Node_Offset(place, RExC_parse);
4403 src = NEXTOPER(place);
4404 FILL_ADVANCE_NODE(place, op);
4405 Zero(src, offset, regnode);
4409 - regtail - set the next-pointer at the end of a node chain of p to val.
4412 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4414 register regnode *scan;
4415 register regnode *temp;
4420 /* Find last node. */
4423 temp = regnext(scan);
4429 if (reg_off_by_arg[OP(scan)]) {
4430 ARG_SET(scan, val - scan);
4433 NEXT_OFF(scan) = val - scan;
4438 - regoptail - regtail on operand of first argument; nop if operandless
4441 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4443 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4444 if (p == NULL || SIZE_ONLY)
4446 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4447 regtail(pRExC_state, NEXTOPER(p), val);
4449 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4450 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4457 - regcurly - a little FSA that accepts {\d+,?\d*}
4460 S_regcurly(pTHX_ register char *s)
4481 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4483 register U8 op = EXACT; /* Arbitrary non-END op. */
4484 register regnode *next;
4486 while (op != END && (!last || node < last)) {
4487 /* While that wasn't END last time... */
4493 next = regnext(node);
4495 if (OP(node) == OPTIMIZED)
4498 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4499 (int)(2*l + 1), "", SvPVX(sv));
4500 if (next == NULL) /* Next ptr. */
4501 PerlIO_printf(Perl_debug_log, "(0)");
4503 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4504 (void)PerlIO_putc(Perl_debug_log, '\n');
4506 if (PL_regkind[(U8)op] == BRANCHJ) {
4507 register regnode *nnode = (OP(next) == LONGJMP
4510 if (last && nnode > last)
4512 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4514 else if (PL_regkind[(U8)op] == BRANCH) {
4515 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4517 else if ( op == CURLY) { /* `next' might be very big: optimizer */
4518 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4519 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4521 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4522 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4525 else if ( op == PLUS || op == STAR) {
4526 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4528 else if (op == ANYOF) {
4529 /* arglen 1 + class block */
4530 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4531 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4532 node = NEXTOPER(node);
4534 else if (PL_regkind[(U8)op] == EXACT) {
4535 /* Literal string, where present. */
4536 node += NODE_SZ_STR(node) - 1;
4537 node = NEXTOPER(node);
4540 node = NEXTOPER(node);
4541 node += regarglen[(U8)op];
4543 if (op == CURLYX || op == OPEN)
4545 else if (op == WHILEM)
4551 #endif /* DEBUGGING */
4554 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4557 Perl_regdump(pTHX_ regexp *r)
4560 SV *sv = sv_newmortal();
4562 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4564 /* Header fields of interest. */
4565 if (r->anchored_substr)
4566 PerlIO_printf(Perl_debug_log,
4567 "anchored `%s%.*s%s'%s at %"IVdf" ",
4569 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4570 SvPVX(r->anchored_substr),
4572 SvTAIL(r->anchored_substr) ? "$" : "",
4573 (IV)r->anchored_offset);
4574 else if (r->anchored_utf8)
4575 PerlIO_printf(Perl_debug_log,
4576 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4578 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4579 SvPVX(r->anchored_utf8),
4581 SvTAIL(r->anchored_utf8) ? "$" : "",
4582 (IV)r->anchored_offset);
4583 if (r->float_substr)
4584 PerlIO_printf(Perl_debug_log,
4585 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4587 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4588 SvPVX(r->float_substr),
4590 SvTAIL(r->float_substr) ? "$" : "",
4591 (IV)r->float_min_offset, (UV)r->float_max_offset);
4592 else if (r->float_utf8)
4593 PerlIO_printf(Perl_debug_log,
4594 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4596 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4597 SvPVX(r->float_utf8),
4599 SvTAIL(r->float_utf8) ? "$" : "",
4600 (IV)r->float_min_offset, (UV)r->float_max_offset);
4601 if (r->check_substr || r->check_utf8)
4602 PerlIO_printf(Perl_debug_log,
4603 r->check_substr == r->float_substr
4604 && r->check_utf8 == r->float_utf8
4605 ? "(checking floating" : "(checking anchored");
4606 if (r->reganch & ROPT_NOSCAN)
4607 PerlIO_printf(Perl_debug_log, " noscan");
4608 if (r->reganch & ROPT_CHECK_ALL)
4609 PerlIO_printf(Perl_debug_log, " isall");
4610 if (r->check_substr || r->check_utf8)
4611 PerlIO_printf(Perl_debug_log, ") ");
4613 if (r->regstclass) {
4614 regprop(sv, r->regstclass);
4615 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4617 if (r->reganch & ROPT_ANCH) {
4618 PerlIO_printf(Perl_debug_log, "anchored");
4619 if (r->reganch & ROPT_ANCH_BOL)
4620 PerlIO_printf(Perl_debug_log, "(BOL)");
4621 if (r->reganch & ROPT_ANCH_MBOL)
4622 PerlIO_printf(Perl_debug_log, "(MBOL)");
4623 if (r->reganch & ROPT_ANCH_SBOL)
4624 PerlIO_printf(Perl_debug_log, "(SBOL)");
4625 if (r->reganch & ROPT_ANCH_GPOS)
4626 PerlIO_printf(Perl_debug_log, "(GPOS)");
4627 PerlIO_putc(Perl_debug_log, ' ');
4629 if (r->reganch & ROPT_GPOS_SEEN)
4630 PerlIO_printf(Perl_debug_log, "GPOS ");
4631 if (r->reganch & ROPT_SKIP)
4632 PerlIO_printf(Perl_debug_log, "plus ");
4633 if (r->reganch & ROPT_IMPLICIT)
4634 PerlIO_printf(Perl_debug_log, "implicit ");
4635 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4636 if (r->reganch & ROPT_EVAL_SEEN)
4637 PerlIO_printf(Perl_debug_log, "with eval ");
4638 PerlIO_printf(Perl_debug_log, "\n");
4641 U32 len = r->offsets[0];
4642 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4643 for (i = 1; i <= len; i++)
4644 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4645 (UV)r->offsets[i*2-1],
4646 (UV)r->offsets[i*2]);
4647 PerlIO_printf(Perl_debug_log, "\n");
4649 #endif /* DEBUGGING */
4655 S_put_byte(pTHX_ SV *sv, int c)
4657 if (isCNTRL(c) || c == 255 || !isPRINT(c))
4658 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4659 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4660 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4662 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4665 #endif /* DEBUGGING */
4668 - regprop - printable representation of opcode
4671 Perl_regprop(pTHX_ SV *sv, regnode *o)
4676 sv_setpvn(sv, "", 0);
4677 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4678 /* It would be nice to FAIL() here, but this may be called from
4679 regexec.c, and it would be hard to supply pRExC_state. */
4680 Perl_croak(aTHX_ "Corrupted regexp opcode");
4681 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4683 k = PL_regkind[(U8)OP(o)];
4686 SV *dsv = sv_2mortal(newSVpvn("", 0));
4687 /* Using is_utf8_string() is a crude hack but it may
4688 * be the best for now since we have no flag "this EXACTish
4689 * node was UTF-8" --jhi */
4690 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4692 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4693 UNI_DISPLAY_REGEX) :
4698 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4703 else if (k == CURLY) {
4704 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4705 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4706 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4708 else if (k == WHILEM && o->flags) /* Ordinal/of */
4709 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4710 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4711 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4712 else if (k == LOGICAL)
4713 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4714 else if (k == ANYOF) {
4715 int i, rangestart = -1;
4716 U8 flags = ANYOF_FLAGS(o);
4717 const char * const anyofs[] = { /* Should be syncronized with
4718 * ANYOF_ #xdefines in regcomp.h */
4751 if (flags & ANYOF_LOCALE)
4752 sv_catpv(sv, "{loc}");
4753 if (flags & ANYOF_FOLD)
4754 sv_catpv(sv, "{i}");
4755 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4756 if (flags & ANYOF_INVERT)
4758 for (i = 0; i <= 256; i++) {
4759 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4760 if (rangestart == -1)
4762 } else if (rangestart != -1) {
4763 if (i <= rangestart + 3)
4764 for (; rangestart < i; rangestart++)
4765 put_byte(sv, rangestart);
4767 put_byte(sv, rangestart);
4769 put_byte(sv, i - 1);
4775 if (o->flags & ANYOF_CLASS)
4776 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4777 if (ANYOF_CLASS_TEST(o,i))
4778 sv_catpv(sv, anyofs[i]);
4780 if (flags & ANYOF_UNICODE)
4781 sv_catpv(sv, "{unicode}");
4782 else if (flags & ANYOF_UNICODE_ALL)
4783 sv_catpv(sv, "{unicode_all}");
4787 SV *sw = regclass_swash(o, FALSE, &lv, 0);
4791 U8 s[UTF8_MAXLEN+1];
4793 for (i = 0; i <= 256; i++) { /* just the first 256 */
4794 U8 *e = uvchr_to_utf8(s, i);
4796 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4797 if (rangestart == -1)
4799 } else if (rangestart != -1) {
4802 if (i <= rangestart + 3)
4803 for (; rangestart < i; rangestart++) {
4804 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4808 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4811 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4818 sv_catpv(sv, "..."); /* et cetera */
4822 char *s = savepv(SvPVX(lv));
4825 while(*s && *s != '\n') s++;
4846 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4848 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4849 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4850 #endif /* DEBUGGING */
4854 Perl_re_intuit_string(pTHX_ regexp *prog)
4855 { /* Assume that RE_INTUIT is set */
4858 char *s = SvPV(prog->check_substr
4859 ? prog->check_substr : prog->check_utf8, n_a);
4861 if (!PL_colorset) reginitcolors();
4862 PerlIO_printf(Perl_debug_log,
4863 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4865 prog->check_substr ? "" : "utf8 ",
4866 PL_colors[5],PL_colors[0],
4869 (strlen(s) > 60 ? "..." : ""));
4872 return prog->check_substr ? prog->check_substr : prog->check_utf8;
4876 Perl_pregfree(pTHX_ struct regexp *r)
4879 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4882 if (!r || (--r->refcnt > 0))
4888 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4889 r->prelen, 60, UNI_DISPLAY_REGEX)
4890 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4894 PerlIO_printf(Perl_debug_log,
4895 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4896 PL_colors[4],PL_colors[5],PL_colors[0],
4899 len > 60 ? "..." : "");
4903 Safefree(r->precomp);
4904 if (r->offsets) /* 20010421 MJD */
4905 Safefree(r->offsets);
4906 RX_MATCH_COPY_FREE(r);
4907 #ifdef PERL_COPY_ON_WRITE
4909 SvREFCNT_dec(r->saved_copy);
4912 if (r->anchored_substr)
4913 SvREFCNT_dec(r->anchored_substr);
4914 if (r->anchored_utf8)
4915 SvREFCNT_dec(r->anchored_utf8);
4916 if (r->float_substr)
4917 SvREFCNT_dec(r->float_substr);
4919 SvREFCNT_dec(r->float_utf8);
4920 Safefree(r->substrs);
4923 int n = r->data->count;
4924 PAD* new_comppad = NULL;
4928 /* If you add a ->what type here, update the comment in regcomp.h */
4929 switch (r->data->what[n]) {
4931 SvREFCNT_dec((SV*)r->data->data[n]);
4934 Safefree(r->data->data[n]);
4937 new_comppad = (AV*)r->data->data[n];
4940 if (new_comppad == NULL)
4941 Perl_croak(aTHX_ "panic: pregfree comppad");
4942 PAD_SAVE_LOCAL(old_comppad,
4943 /* Watch out for global destruction's random ordering. */
4944 (SvTYPE(new_comppad) == SVt_PVAV) ?
4945 new_comppad : Null(PAD *)
4947 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4948 op_free((OP_4tree*)r->data->data[n]);
4951 PAD_RESTORE_LOCAL(old_comppad);
4952 SvREFCNT_dec((SV*)new_comppad);
4958 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4961 Safefree(r->data->what);
4964 Safefree(r->startp);
4970 - regnext - dig the "next" pointer out of a node
4972 * [Note, when REGALIGN is defined there are two places in regmatch()
4973 * that bypass this code for speed.]
4976 Perl_regnext(pTHX_ register regnode *p)
4978 register I32 offset;
4980 if (p == &PL_regdummy)
4983 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4991 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
4994 STRLEN l1 = strlen(pat1);
4995 STRLEN l2 = strlen(pat2);
5004 Copy(pat1, buf, l1 , char);
5005 Copy(pat2, buf + l1, l2 , char);
5006 buf[l1 + l2] = '\n';
5007 buf[l1 + l2 + 1] = '\0';
5009 /* ANSI variant takes additional second argument */
5010 va_start(args, pat2);
5014 msv = vmess(buf, &args);
5016 message = SvPV(msv,l1);
5019 Copy(message, buf, l1 , char);
5020 buf[l1] = '\0'; /* Overwrite \n */
5021 Perl_croak(aTHX_ "%s", buf);
5024 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5027 Perl_save_re_context(pTHX)
5029 SAVEI32(PL_reg_flags); /* from regexec.c */
5031 SAVEPPTR(PL_reginput); /* String-input pointer. */
5032 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5033 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
5034 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5035 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5036 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
5037 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
5038 SAVEPPTR(PL_regtill); /* How far we are required to go. */
5039 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
5040 PL_reg_start_tmp = 0;
5041 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5042 PL_reg_start_tmpl = 0;
5043 SAVEVPTR(PL_regdata);
5044 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5045 SAVEI32(PL_regnarrate); /* from regexec.c */
5046 SAVEVPTR(PL_regprogram); /* from regexec.c */
5047 SAVEINT(PL_regindent); /* from regexec.c */
5048 SAVEVPTR(PL_regcc); /* from regexec.c */
5049 SAVEVPTR(PL_curcop);
5050 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5051 SAVEVPTR(PL_reg_re); /* from regexec.c */
5052 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5053 SAVESPTR(PL_reg_sv); /* from regexec.c */
5054 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
5055 SAVEVPTR(PL_reg_magic); /* from regexec.c */
5056 SAVEI32(PL_reg_oldpos); /* from regexec.c */
5057 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5058 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5059 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
5060 PL_reg_oldsaved = Nullch;
5061 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
5062 PL_reg_oldsavedlen = 0;
5063 #ifdef PERL_COPY_ON_WRITE
5067 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
5069 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
5070 PL_reg_leftiter = 0;
5071 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
5072 PL_reg_poscache = Nullch;
5073 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
5074 PL_reg_poscache_size = 0;
5075 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5076 SAVEI32(PL_regnpar); /* () count. */
5077 SAVEI32(PL_regsize); /* from regexec.c */
5080 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5086 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5087 for (i = 1; i <= rx->nparens; i++) {
5088 sprintf(digits, "%lu", (long)i);
5089 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5096 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
5101 clear_re(pTHX_ void *r)
5103 ReREFCNT_dec((regexp *)r);