5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_pregcomp my_regcomp
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_pregfree my_regfree
39 # define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 # define Perl_regnext my_regnext
42 # define Perl_save_re_context my_save_re_context
43 # define Perl_reginitcolors my_reginitcolors
45 # define PERL_NO_GET_CONTEXT
50 * pregcomp and pregexec -- regsub and regerror are not used in perl
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73 **** 2000, 2001, 2002, 2003, by Larry Wall and others
75 **** You may distribute under the terms of either the GNU General Public
76 **** License or the Artistic License, as specified in the README file.
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
84 #define PERL_IN_REGCOMP_C
87 #ifndef PERL_IN_XSUB_RE
99 # if defined(BUGGY_MSC6)
100 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
101 # pragma optimize("a",off)
102 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
103 # pragma optimize("w",on )
104 # endif /* BUGGY_MSC6 */
108 #define STATIC static
111 typedef struct RExC_state_t {
112 U32 flags; /* are we folding, multilining? */
113 char *precomp; /* uncompiled string. */
115 char *start; /* Start of input for compile */
116 char *end; /* End of input for compile */
117 char *parse; /* Input-scan pointer. */
118 I32 whilem_seen; /* number of WHILEM in this expr */
119 regnode *emit_start; /* Start of emitted-code area */
120 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
121 I32 naughty; /* How bad is this pattern? */
122 I32 sawback; /* Did we see \1, ...? */
124 I32 size; /* Code size. */
125 I32 npar; /* () count. */
131 char *starttry; /* -Dr: where regtry was called. */
132 #define RExC_starttry (pRExC_state->starttry)
136 #define RExC_flags (pRExC_state->flags)
137 #define RExC_precomp (pRExC_state->precomp)
138 #define RExC_rx (pRExC_state->rx)
139 #define RExC_start (pRExC_state->start)
140 #define RExC_end (pRExC_state->end)
141 #define RExC_parse (pRExC_state->parse)
142 #define RExC_whilem_seen (pRExC_state->whilem_seen)
143 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
144 #define RExC_emit (pRExC_state->emit)
145 #define RExC_emit_start (pRExC_state->emit_start)
146 #define RExC_naughty (pRExC_state->naughty)
147 #define RExC_sawback (pRExC_state->sawback)
148 #define RExC_seen (pRExC_state->seen)
149 #define RExC_size (pRExC_state->size)
150 #define RExC_npar (pRExC_state->npar)
151 #define RExC_extralen (pRExC_state->extralen)
152 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
153 #define RExC_seen_evals (pRExC_state->seen_evals)
154 #define RExC_utf8 (pRExC_state->utf8)
156 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
157 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
158 ((*s) == '{' && regcurly(s)))
161 #undef SPSTART /* dratted cpp namespace... */
164 * Flags to be passed up and down.
166 #define WORST 0 /* Worst case. */
167 #define HASWIDTH 0x1 /* Known to match non-null strings. */
168 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
169 #define SPSTART 0x4 /* Starts with * or +. */
170 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
172 /* Length of a variant. */
174 typedef struct scan_data_t {
180 I32 last_end; /* min value, <0 unless valid. */
183 SV **longest; /* Either &l_fixed, or &l_float. */
187 I32 offset_float_min;
188 I32 offset_float_max;
192 struct regnode_charclass_class *start_class;
196 * Forward declarations for pregcomp()'s friends.
199 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
202 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
203 #define SF_BEFORE_SEOL 0x1
204 #define SF_BEFORE_MEOL 0x2
205 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
206 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
209 # define SF_FIX_SHIFT_EOL (0+2)
210 # define SF_FL_SHIFT_EOL (0+4)
212 # define SF_FIX_SHIFT_EOL (+2)
213 # define SF_FL_SHIFT_EOL (+4)
216 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
217 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
219 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
220 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
221 #define SF_IS_INF 0x40
222 #define SF_HAS_PAR 0x80
223 #define SF_IN_PAR 0x100
224 #define SF_HAS_EVAL 0x200
225 #define SCF_DO_SUBSTR 0x400
226 #define SCF_DO_STCLASS_AND 0x0800
227 #define SCF_DO_STCLASS_OR 0x1000
228 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
229 #define SCF_WHILEM_VISITED_POS 0x2000
231 #define UTF (RExC_utf8 != 0)
232 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
233 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
235 #define OOB_UNICODE 12345678
236 #define OOB_NAMEDCLASS -1
238 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
239 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
242 /* length of regex to show in messages that don't mark a position within */
243 #define RegexLengthToShowInErrorMessages 127
246 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
247 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
248 * op/pragma/warn/regcomp.
250 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
251 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
253 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
256 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
257 * arg. Show regex, up to a maximum length. If it's too long, chop and add
260 #define FAIL(msg) STMT_START { \
261 char *ellipses = ""; \
262 IV len = RExC_end - RExC_precomp; \
265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
266 if (len > RegexLengthToShowInErrorMessages) { \
267 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
268 len = RegexLengthToShowInErrorMessages - 10; \
271 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
272 msg, (int)len, RExC_precomp, ellipses); \
276 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
277 * args. Show regex, up to a maximum length. If it's too long, chop and add
280 #define FAIL2(pat,msg) STMT_START { \
281 char *ellipses = ""; \
282 IV len = RExC_end - RExC_precomp; \
285 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
286 if (len > RegexLengthToShowInErrorMessages) { \
287 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
288 len = RegexLengthToShowInErrorMessages - 10; \
291 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
292 msg, (int)len, RExC_precomp, ellipses); \
297 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
299 #define Simple_vFAIL(m) STMT_START { \
300 IV offset = RExC_parse - RExC_precomp; \
301 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
302 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
306 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
308 #define vFAIL(m) STMT_START { \
310 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
315 * Like Simple_vFAIL(), but accepts two arguments.
317 #define Simple_vFAIL2(m,a1) STMT_START { \
318 IV offset = RExC_parse - RExC_precomp; \
319 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
320 (int)offset, RExC_precomp, RExC_precomp + offset); \
324 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
326 #define vFAIL2(m,a1) STMT_START { \
328 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
329 Simple_vFAIL2(m, a1); \
334 * Like Simple_vFAIL(), but accepts three arguments.
336 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
337 IV offset = RExC_parse - RExC_precomp; \
338 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
339 (int)offset, RExC_precomp, RExC_precomp + offset); \
343 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
345 #define vFAIL3(m,a1,a2) STMT_START { \
347 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
348 Simple_vFAIL3(m, a1, a2); \
352 * Like Simple_vFAIL(), but accepts four arguments.
354 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
355 IV offset = RExC_parse - RExC_precomp; \
356 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
357 (int)offset, RExC_precomp, RExC_precomp + offset); \
361 * Like Simple_vFAIL(), but accepts five arguments.
363 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
364 IV offset = RExC_parse - RExC_precomp; \
365 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
366 (int)offset, RExC_precomp, RExC_precomp + offset); \
370 #define vWARN(loc,m) STMT_START { \
371 IV offset = loc - RExC_precomp; \
372 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
373 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
376 #define vWARNdep(loc,m) STMT_START { \
377 IV offset = loc - RExC_precomp; \
378 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
379 "%s" REPORT_LOCATION, \
380 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
384 #define vWARN2(loc, m, a1) STMT_START { \
385 IV offset = loc - RExC_precomp; \
386 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
387 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
390 #define vWARN3(loc, m, a1, a2) STMT_START { \
391 IV offset = loc - RExC_precomp; \
392 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
393 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
396 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
397 IV offset = loc - RExC_precomp; \
398 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
399 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
402 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
403 IV offset = loc - RExC_precomp; \
404 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
405 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
409 /* Allow for side effects in s */
410 #define REGC(c,s) STMT_START { \
411 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
414 /* Macros for recording node offsets. 20001227 mjd@plover.com
415 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
416 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
417 * Element 0 holds the number n.
420 #define MJD_OFFSET_DEBUG(x)
421 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
424 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
426 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
427 __LINE__, (node), (byte))); \
429 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
431 RExC_offsets[2*(node)-1] = (byte); \
436 #define Set_Node_Offset(node,byte) \
437 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
438 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
440 #define Set_Node_Length_To_R(node,len) STMT_START { \
442 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
443 __LINE__, (node), (len))); \
445 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
447 RExC_offsets[2*(node)] = (len); \
452 #define Set_Node_Length(node,len) \
453 Set_Node_Length_To_R((node)-RExC_emit_start, len)
454 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
455 #define Set_Node_Cur_Length(node) \
456 Set_Node_Length(node, RExC_parse - parse_start)
458 /* Get offsets and lengths */
459 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
460 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
462 static void clear_re(pTHX_ void *r);
464 /* Mark that we cannot extend a found fixed substring at this point.
465 Updata the longest found anchored substring and the longest found
466 floating substrings if needed. */
469 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
471 STRLEN l = CHR_SVLEN(data->last_found);
472 STRLEN old_l = CHR_SVLEN(*data->longest);
474 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
475 sv_setsv(*data->longest, data->last_found);
476 if (*data->longest == data->longest_fixed) {
477 data->offset_fixed = l ? data->last_start_min : data->pos_min;
478 if (data->flags & SF_BEFORE_EOL)
480 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
482 data->flags &= ~SF_FIX_BEFORE_EOL;
485 data->offset_float_min = l ? data->last_start_min : data->pos_min;
486 data->offset_float_max = (l
487 ? data->last_start_max
488 : data->pos_min + data->pos_delta);
489 if ((U32)data->offset_float_max > (U32)I32_MAX)
490 data->offset_float_max = I32_MAX;
491 if (data->flags & SF_BEFORE_EOL)
493 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
495 data->flags &= ~SF_FL_BEFORE_EOL;
498 SvCUR_set(data->last_found, 0);
500 SV * sv = data->last_found;
502 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
503 if (mg && mg->mg_len > 0)
507 data->flags &= ~SF_BEFORE_EOL;
510 /* Can match anything (initialization) */
512 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
514 ANYOF_CLASS_ZERO(cl);
515 ANYOF_BITMAP_SETALL(cl);
516 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
518 cl->flags |= ANYOF_LOCALE;
521 /* Can match anything (initialization) */
523 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
527 for (value = 0; value <= ANYOF_MAX; value += 2)
528 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
530 if (!(cl->flags & ANYOF_UNICODE_ALL))
532 if (!ANYOF_BITMAP_TESTALLSET(cl))
537 /* Can match anything (initialization) */
539 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
541 Zero(cl, 1, struct regnode_charclass_class);
543 cl_anything(pRExC_state, cl);
547 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
549 Zero(cl, 1, struct regnode_charclass_class);
551 cl_anything(pRExC_state, cl);
553 cl->flags |= ANYOF_LOCALE;
556 /* 'And' a given class with another one. Can create false positives */
557 /* We assume that cl is not inverted */
559 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
560 struct regnode_charclass_class *and_with)
562 if (!(and_with->flags & ANYOF_CLASS)
563 && !(cl->flags & ANYOF_CLASS)
564 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
565 && !(and_with->flags & ANYOF_FOLD)
566 && !(cl->flags & ANYOF_FOLD)) {
569 if (and_with->flags & ANYOF_INVERT)
570 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
571 cl->bitmap[i] &= ~and_with->bitmap[i];
573 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
574 cl->bitmap[i] &= and_with->bitmap[i];
575 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
576 if (!(and_with->flags & ANYOF_EOS))
577 cl->flags &= ~ANYOF_EOS;
579 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
580 cl->flags &= ~ANYOF_UNICODE_ALL;
581 cl->flags |= ANYOF_UNICODE;
582 ARG_SET(cl, ARG(and_with));
584 if (!(and_with->flags & ANYOF_UNICODE_ALL))
585 cl->flags &= ~ANYOF_UNICODE_ALL;
586 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
587 cl->flags &= ~ANYOF_UNICODE;
590 /* 'OR' a given class with another one. Can create false positives */
591 /* We assume that cl is not inverted */
593 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
595 if (or_with->flags & ANYOF_INVERT) {
597 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
598 * <= (B1 | !B2) | (CL1 | !CL2)
599 * which is wasteful if CL2 is small, but we ignore CL2:
600 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
601 * XXXX Can we handle case-fold? Unclear:
602 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
603 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
605 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
606 && !(or_with->flags & ANYOF_FOLD)
607 && !(cl->flags & ANYOF_FOLD) ) {
610 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
611 cl->bitmap[i] |= ~or_with->bitmap[i];
612 } /* XXXX: logic is complicated otherwise */
614 cl_anything(pRExC_state, cl);
617 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
618 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
619 && (!(or_with->flags & ANYOF_FOLD)
620 || (cl->flags & ANYOF_FOLD)) ) {
623 /* OR char bitmap and class bitmap separately */
624 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
625 cl->bitmap[i] |= or_with->bitmap[i];
626 if (or_with->flags & ANYOF_CLASS) {
627 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
628 cl->classflags[i] |= or_with->classflags[i];
629 cl->flags |= ANYOF_CLASS;
632 else { /* XXXX: logic is complicated, leave it along for a moment. */
633 cl_anything(pRExC_state, cl);
636 if (or_with->flags & ANYOF_EOS)
637 cl->flags |= ANYOF_EOS;
639 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
640 ARG(cl) != ARG(or_with)) {
641 cl->flags |= ANYOF_UNICODE_ALL;
642 cl->flags &= ~ANYOF_UNICODE;
644 if (or_with->flags & ANYOF_UNICODE_ALL) {
645 cl->flags |= ANYOF_UNICODE_ALL;
646 cl->flags &= ~ANYOF_UNICODE;
651 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
652 * These need to be revisited when a newer toolchain becomes available.
654 #if defined(__sparc64__) && defined(__GNUC__)
655 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
656 # undef SPARC64_GCC_WORKAROUND
657 # define SPARC64_GCC_WORKAROUND 1
661 /* REx optimizer. Converts nodes into quickier variants "in place".
662 Finds fixed substrings. */
664 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
665 to the position after last scanned or to NULL. */
668 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
669 /* scanp: Start here (read-write). */
670 /* deltap: Write maxlen-minlen here. */
671 /* last: Stop before this one. */
673 I32 min = 0, pars = 0, code;
674 regnode *scan = *scanp, *next;
676 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
677 int is_inf_internal = 0; /* The studied chunk is infinite */
678 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
679 scan_data_t data_fake;
680 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
682 while (scan && OP(scan) != END && scan < last) {
683 /* Peephole optimizer: */
685 if (PL_regkind[(U8)OP(scan)] == EXACT) {
686 /* Merge several consecutive EXACTish nodes into one. */
687 regnode *n = regnext(scan);
690 regnode *stop = scan;
693 next = scan + NODE_SZ_STR(scan);
694 /* Skip NOTHING, merge EXACT*. */
696 ( PL_regkind[(U8)OP(n)] == NOTHING ||
697 (stringok && (OP(n) == OP(scan))))
699 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
700 if (OP(n) == TAIL || n > next)
702 if (PL_regkind[(U8)OP(n)] == NOTHING) {
703 NEXT_OFF(scan) += NEXT_OFF(n);
704 next = n + NODE_STEP_REGNODE;
712 int oldl = STR_LEN(scan);
713 regnode *nnext = regnext(n);
715 if (oldl + STR_LEN(n) > U8_MAX)
717 NEXT_OFF(scan) += NEXT_OFF(n);
718 STR_LEN(scan) += STR_LEN(n);
719 next = n + NODE_SZ_STR(n);
720 /* Now we can overwrite *n : */
721 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
729 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
731 Two problematic code points in Unicode casefolding of EXACT nodes:
733 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
734 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
740 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
741 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
743 This means that in case-insensitive matching (or "loose matching",
744 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
745 length of the above casefolded versions) can match a target string
746 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
747 This would rather mess up the minimum length computation.
749 What we'll do is to look for the tail four bytes, and then peek
750 at the preceding two bytes to see whether we need to decrease
751 the minimum length by four (six minus two).
753 Thanks to the design of UTF-8, there cannot be false matches:
754 A sequence of valid UTF-8 bytes cannot be a subsequence of
755 another valid sequence of UTF-8 bytes.
758 char *s0 = STRING(scan), *s, *t;
759 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
760 char *t0 = "\xcc\x88\xcc\x81";
764 s < s2 && (t = ninstr(s, s1, t0, t1));
766 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
767 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
774 n = scan + NODE_SZ_STR(scan);
776 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
784 /* Follow the next-chain of the current node and optimize
785 away all the NOTHINGs from it. */
786 if (OP(scan) != CURLYX) {
787 int max = (reg_off_by_arg[OP(scan)]
789 /* I32 may be smaller than U16 on CRAYs! */
790 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
791 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
795 /* Skip NOTHING and LONGJMP. */
796 while ((n = regnext(n))
797 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
798 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
801 if (reg_off_by_arg[OP(scan)])
804 NEXT_OFF(scan) = off;
806 /* The principal pseudo-switch. Cannot be a switch, since we
807 look into several different things. */
808 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
809 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
810 next = regnext(scan);
813 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
814 I32 max1 = 0, min1 = I32_MAX, num = 0;
815 struct regnode_charclass_class accum;
817 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
818 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
819 if (flags & SCF_DO_STCLASS)
820 cl_init_zero(pRExC_state, &accum);
821 while (OP(scan) == code) {
822 I32 deltanext, minnext, f = 0, fake;
823 struct regnode_charclass_class this_class;
828 data_fake.whilem_c = data->whilem_c;
829 data_fake.last_closep = data->last_closep;
832 data_fake.last_closep = &fake;
833 next = regnext(scan);
834 scan = NEXTOPER(scan);
836 scan = NEXTOPER(scan);
837 if (flags & SCF_DO_STCLASS) {
838 cl_init(pRExC_state, &this_class);
839 data_fake.start_class = &this_class;
840 f = SCF_DO_STCLASS_AND;
842 if (flags & SCF_WHILEM_VISITED_POS)
843 f |= SCF_WHILEM_VISITED_POS;
844 /* we suppose the run is continuous, last=next...*/
845 minnext = study_chunk(pRExC_state, &scan, &deltanext,
846 next, &data_fake, f);
849 if (max1 < minnext + deltanext)
850 max1 = minnext + deltanext;
851 if (deltanext == I32_MAX)
852 is_inf = is_inf_internal = 1;
854 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
856 if (data && (data_fake.flags & SF_HAS_EVAL))
857 data->flags |= SF_HAS_EVAL;
859 data->whilem_c = data_fake.whilem_c;
860 if (flags & SCF_DO_STCLASS)
861 cl_or(pRExC_state, &accum, &this_class);
865 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
867 if (flags & SCF_DO_SUBSTR) {
868 data->pos_min += min1;
869 data->pos_delta += max1 - min1;
870 if (max1 != min1 || is_inf)
871 data->longest = &(data->longest_float);
874 delta += max1 - min1;
875 if (flags & SCF_DO_STCLASS_OR) {
876 cl_or(pRExC_state, data->start_class, &accum);
878 cl_and(data->start_class, &and_with);
879 flags &= ~SCF_DO_STCLASS;
882 else if (flags & SCF_DO_STCLASS_AND) {
884 cl_and(data->start_class, &accum);
885 flags &= ~SCF_DO_STCLASS;
888 /* Switch to OR mode: cache the old value of
889 * data->start_class */
890 StructCopy(data->start_class, &and_with,
891 struct regnode_charclass_class);
892 flags &= ~SCF_DO_STCLASS_AND;
893 StructCopy(&accum, data->start_class,
894 struct regnode_charclass_class);
895 flags |= SCF_DO_STCLASS_OR;
896 data->start_class->flags |= ANYOF_EOS;
900 else if (code == BRANCHJ) /* single branch is optimized. */
901 scan = NEXTOPER(NEXTOPER(scan));
902 else /* single branch is optimized. */
903 scan = NEXTOPER(scan);
906 else if (OP(scan) == EXACT) {
907 I32 l = STR_LEN(scan);
908 UV uc = *((U8*)STRING(scan));
910 U8 *s = (U8*)STRING(scan);
911 l = utf8_length(s, s + l);
912 uc = utf8_to_uvchr(s, NULL);
915 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
916 /* The code below prefers earlier match for fixed
917 offset, later match for variable offset. */
918 if (data->last_end == -1) { /* Update the start info. */
919 data->last_start_min = data->pos_min;
920 data->last_start_max = is_inf
921 ? I32_MAX : data->pos_min + data->pos_delta;
923 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
925 SV * sv = data->last_found;
926 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
927 mg_find(sv, PERL_MAGIC_utf8) : NULL;
928 if (mg && mg->mg_len >= 0)
929 mg->mg_len += utf8_length((U8*)STRING(scan),
930 (U8*)STRING(scan)+STR_LEN(scan));
933 SvUTF8_on(data->last_found);
934 data->last_end = data->pos_min + l;
935 data->pos_min += l; /* As in the first entry. */
936 data->flags &= ~SF_BEFORE_EOL;
938 if (flags & SCF_DO_STCLASS_AND) {
939 /* Check whether it is compatible with what we know already! */
943 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
944 && !ANYOF_BITMAP_TEST(data->start_class, uc)
945 && (!(data->start_class->flags & ANYOF_FOLD)
946 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
949 ANYOF_CLASS_ZERO(data->start_class);
950 ANYOF_BITMAP_ZERO(data->start_class);
952 ANYOF_BITMAP_SET(data->start_class, uc);
953 data->start_class->flags &= ~ANYOF_EOS;
955 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
957 else if (flags & SCF_DO_STCLASS_OR) {
958 /* false positive possible if the class is case-folded */
960 ANYOF_BITMAP_SET(data->start_class, uc);
962 data->start_class->flags |= ANYOF_UNICODE_ALL;
963 data->start_class->flags &= ~ANYOF_EOS;
964 cl_and(data->start_class, &and_with);
966 flags &= ~SCF_DO_STCLASS;
968 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
969 I32 l = STR_LEN(scan);
970 UV uc = *((U8*)STRING(scan));
972 /* Search for fixed substrings supports EXACT only. */
973 if (flags & SCF_DO_SUBSTR)
974 scan_commit(pRExC_state, data);
976 U8 *s = (U8 *)STRING(scan);
977 l = utf8_length(s, s + l);
978 uc = utf8_to_uvchr(s, NULL);
981 if (data && (flags & SCF_DO_SUBSTR))
983 if (flags & SCF_DO_STCLASS_AND) {
984 /* Check whether it is compatible with what we know already! */
988 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
989 && !ANYOF_BITMAP_TEST(data->start_class, uc)
990 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
992 ANYOF_CLASS_ZERO(data->start_class);
993 ANYOF_BITMAP_ZERO(data->start_class);
995 ANYOF_BITMAP_SET(data->start_class, uc);
996 data->start_class->flags &= ~ANYOF_EOS;
997 data->start_class->flags |= ANYOF_FOLD;
998 if (OP(scan) == EXACTFL)
999 data->start_class->flags |= ANYOF_LOCALE;
1002 else if (flags & SCF_DO_STCLASS_OR) {
1003 if (data->start_class->flags & ANYOF_FOLD) {
1004 /* false positive possible if the class is case-folded.
1005 Assume that the locale settings are the same... */
1007 ANYOF_BITMAP_SET(data->start_class, uc);
1008 data->start_class->flags &= ~ANYOF_EOS;
1010 cl_and(data->start_class, &and_with);
1012 flags &= ~SCF_DO_STCLASS;
1014 else if (strchr((char*)PL_varies,OP(scan))) {
1015 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1016 I32 f = flags, pos_before = 0;
1017 regnode *oscan = scan;
1018 struct regnode_charclass_class this_class;
1019 struct regnode_charclass_class *oclass = NULL;
1020 I32 next_is_eval = 0;
1022 switch (PL_regkind[(U8)OP(scan)]) {
1023 case WHILEM: /* End of (?:...)* . */
1024 scan = NEXTOPER(scan);
1027 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1028 next = NEXTOPER(scan);
1029 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1031 maxcount = REG_INFTY;
1032 next = regnext(scan);
1033 scan = NEXTOPER(scan);
1037 if (flags & SCF_DO_SUBSTR)
1042 if (flags & SCF_DO_STCLASS) {
1044 maxcount = REG_INFTY;
1045 next = regnext(scan);
1046 scan = NEXTOPER(scan);
1049 is_inf = is_inf_internal = 1;
1050 scan = regnext(scan);
1051 if (flags & SCF_DO_SUBSTR) {
1052 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1053 data->longest = &(data->longest_float);
1055 goto optimize_curly_tail;
1057 mincount = ARG1(scan);
1058 maxcount = ARG2(scan);
1059 next = regnext(scan);
1060 if (OP(scan) == CURLYX) {
1061 I32 lp = (data ? *(data->last_closep) : 0);
1063 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1065 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1066 next_is_eval = (OP(scan) == EVAL);
1068 if (flags & SCF_DO_SUBSTR) {
1069 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1070 pos_before = data->pos_min;
1074 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1076 data->flags |= SF_IS_INF;
1078 if (flags & SCF_DO_STCLASS) {
1079 cl_init(pRExC_state, &this_class);
1080 oclass = data->start_class;
1081 data->start_class = &this_class;
1082 f |= SCF_DO_STCLASS_AND;
1083 f &= ~SCF_DO_STCLASS_OR;
1085 /* These are the cases when once a subexpression
1086 fails at a particular position, it cannot succeed
1087 even after backtracking at the enclosing scope.
1089 XXXX what if minimal match and we are at the
1090 initial run of {n,m}? */
1091 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1092 f &= ~SCF_WHILEM_VISITED_POS;
1094 /* This will finish on WHILEM, setting scan, or on NULL: */
1095 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1097 ? (f & ~SCF_DO_SUBSTR) : f);
1099 if (flags & SCF_DO_STCLASS)
1100 data->start_class = oclass;
1101 if (mincount == 0 || minnext == 0) {
1102 if (flags & SCF_DO_STCLASS_OR) {
1103 cl_or(pRExC_state, data->start_class, &this_class);
1105 else if (flags & SCF_DO_STCLASS_AND) {
1106 /* Switch to OR mode: cache the old value of
1107 * data->start_class */
1108 StructCopy(data->start_class, &and_with,
1109 struct regnode_charclass_class);
1110 flags &= ~SCF_DO_STCLASS_AND;
1111 StructCopy(&this_class, data->start_class,
1112 struct regnode_charclass_class);
1113 flags |= SCF_DO_STCLASS_OR;
1114 data->start_class->flags |= ANYOF_EOS;
1116 } else { /* Non-zero len */
1117 if (flags & SCF_DO_STCLASS_OR) {
1118 cl_or(pRExC_state, data->start_class, &this_class);
1119 cl_and(data->start_class, &and_with);
1121 else if (flags & SCF_DO_STCLASS_AND)
1122 cl_and(data->start_class, &this_class);
1123 flags &= ~SCF_DO_STCLASS;
1125 if (!scan) /* It was not CURLYX, but CURLY. */
1127 if (ckWARN(WARN_REGEXP)
1128 /* ? quantifier ok, except for (?{ ... }) */
1129 && (next_is_eval || !(mincount == 0 && maxcount == 1))
1130 && (minnext == 0) && (deltanext == 0)
1131 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1132 && maxcount <= REG_INFTY/3) /* Complement check for big count */
1135 "Quantifier unexpected on zero-length expression");
1138 min += minnext * mincount;
1139 is_inf_internal |= ((maxcount == REG_INFTY
1140 && (minnext + deltanext) > 0)
1141 || deltanext == I32_MAX);
1142 is_inf |= is_inf_internal;
1143 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1145 /* Try powerful optimization CURLYX => CURLYN. */
1146 if ( OP(oscan) == CURLYX && data
1147 && data->flags & SF_IN_PAR
1148 && !(data->flags & SF_HAS_EVAL)
1149 && !deltanext && minnext == 1 ) {
1150 /* Try to optimize to CURLYN. */
1151 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1152 regnode *nxt1 = nxt;
1159 if (!strchr((char*)PL_simple,OP(nxt))
1160 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1161 && STR_LEN(nxt) == 1))
1167 if (OP(nxt) != CLOSE)
1169 /* Now we know that nxt2 is the only contents: */
1170 oscan->flags = (U8)ARG(nxt);
1172 OP(nxt1) = NOTHING; /* was OPEN. */
1174 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1175 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1176 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1177 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1178 OP(nxt + 1) = OPTIMIZED; /* was count. */
1179 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1184 /* Try optimization CURLYX => CURLYM. */
1185 if ( OP(oscan) == CURLYX && data
1186 && !(data->flags & SF_HAS_PAR)
1187 && !(data->flags & SF_HAS_EVAL)
1189 /* XXXX How to optimize if data == 0? */
1190 /* Optimize to a simpler form. */
1191 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1195 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1196 && (OP(nxt2) != WHILEM))
1198 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1199 /* Need to optimize away parenths. */
1200 if (data->flags & SF_IN_PAR) {
1201 /* Set the parenth number. */
1202 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1204 if (OP(nxt) != CLOSE)
1205 FAIL("Panic opt close");
1206 oscan->flags = (U8)ARG(nxt);
1207 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1208 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1210 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1211 OP(nxt + 1) = OPTIMIZED; /* was count. */
1212 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1213 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1216 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1217 regnode *nnxt = regnext(nxt1);
1220 if (reg_off_by_arg[OP(nxt1)])
1221 ARG_SET(nxt1, nxt2 - nxt1);
1222 else if (nxt2 - nxt1 < U16_MAX)
1223 NEXT_OFF(nxt1) = nxt2 - nxt1;
1225 OP(nxt) = NOTHING; /* Cannot beautify */
1230 /* Optimize again: */
1231 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1237 else if ((OP(oscan) == CURLYX)
1238 && (flags & SCF_WHILEM_VISITED_POS)
1239 /* See the comment on a similar expression above.
1240 However, this time it not a subexpression
1241 we care about, but the expression itself. */
1242 && (maxcount == REG_INFTY)
1243 && data && ++data->whilem_c < 16) {
1244 /* This stays as CURLYX, we can put the count/of pair. */
1245 /* Find WHILEM (as in regexec.c) */
1246 regnode *nxt = oscan + NEXT_OFF(oscan);
1248 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1250 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1251 | (RExC_whilem_seen << 4)); /* On WHILEM */
1253 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1255 if (flags & SCF_DO_SUBSTR) {
1256 SV *last_str = Nullsv;
1257 int counted = mincount != 0;
1259 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1260 #if defined(SPARC64_GCC_WORKAROUND)
1266 if (pos_before >= data->last_start_min)
1269 b = data->last_start_min;
1272 s = SvPV(data->last_found, l);
1273 old = b - data->last_start_min;
1276 I32 b = pos_before >= data->last_start_min
1277 ? pos_before : data->last_start_min;
1279 char *s = SvPV(data->last_found, l);
1280 I32 old = b - data->last_start_min;
1284 old = utf8_hop((U8*)s, old) - (U8*)s;
1287 /* Get the added string: */
1288 last_str = newSVpvn(s + old, l);
1290 SvUTF8_on(last_str);
1291 if (deltanext == 0 && pos_before == b) {
1292 /* What was added is a constant string */
1294 SvGROW(last_str, (mincount * l) + 1);
1295 repeatcpy(SvPVX(last_str) + l,
1296 SvPVX(last_str), l, mincount - 1);
1297 SvCUR(last_str) *= mincount;
1298 /* Add additional parts. */
1299 SvCUR_set(data->last_found,
1300 SvCUR(data->last_found) - l);
1301 sv_catsv(data->last_found, last_str);
1303 SV * sv = data->last_found;
1305 SvUTF8(sv) && SvMAGICAL(sv) ?
1306 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1307 if (mg && mg->mg_len >= 0)
1308 mg->mg_len += CHR_SVLEN(last_str);
1310 data->last_end += l * (mincount - 1);
1313 /* start offset must point into the last copy */
1314 data->last_start_min += minnext * (mincount - 1);
1315 data->last_start_max += is_inf ? I32_MAX
1316 : (maxcount - 1) * (minnext + data->pos_delta);
1319 /* It is counted once already... */
1320 data->pos_min += minnext * (mincount - counted);
1321 data->pos_delta += - counted * deltanext +
1322 (minnext + deltanext) * maxcount - minnext * mincount;
1323 if (mincount != maxcount) {
1324 /* Cannot extend fixed substrings found inside
1326 scan_commit(pRExC_state,data);
1327 if (mincount && last_str) {
1328 sv_setsv(data->last_found, last_str);
1329 data->last_end = data->pos_min;
1330 data->last_start_min =
1331 data->pos_min - CHR_SVLEN(last_str);
1332 data->last_start_max = is_inf
1334 : data->pos_min + data->pos_delta
1335 - CHR_SVLEN(last_str);
1337 data->longest = &(data->longest_float);
1339 SvREFCNT_dec(last_str);
1341 if (data && (fl & SF_HAS_EVAL))
1342 data->flags |= SF_HAS_EVAL;
1343 optimize_curly_tail:
1344 if (OP(oscan) != CURLYX) {
1345 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1347 NEXT_OFF(oscan) += NEXT_OFF(next);
1350 default: /* REF and CLUMP only? */
1351 if (flags & SCF_DO_SUBSTR) {
1352 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1353 data->longest = &(data->longest_float);
1355 is_inf = is_inf_internal = 1;
1356 if (flags & SCF_DO_STCLASS_OR)
1357 cl_anything(pRExC_state, data->start_class);
1358 flags &= ~SCF_DO_STCLASS;
1362 else if (strchr((char*)PL_simple,OP(scan))) {
1365 if (flags & SCF_DO_SUBSTR) {
1366 scan_commit(pRExC_state,data);
1370 if (flags & SCF_DO_STCLASS) {
1371 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1373 /* Some of the logic below assumes that switching
1374 locale on will only add false positives. */
1375 switch (PL_regkind[(U8)OP(scan)]) {
1379 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1380 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1381 cl_anything(pRExC_state, data->start_class);
1384 if (OP(scan) == SANY)
1386 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1387 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1388 || (data->start_class->flags & ANYOF_CLASS));
1389 cl_anything(pRExC_state, data->start_class);
1391 if (flags & SCF_DO_STCLASS_AND || !value)
1392 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1395 if (flags & SCF_DO_STCLASS_AND)
1396 cl_and(data->start_class,
1397 (struct regnode_charclass_class*)scan);
1399 cl_or(pRExC_state, data->start_class,
1400 (struct regnode_charclass_class*)scan);
1403 if (flags & SCF_DO_STCLASS_AND) {
1404 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1405 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1406 for (value = 0; value < 256; value++)
1407 if (!isALNUM(value))
1408 ANYOF_BITMAP_CLEAR(data->start_class, value);
1412 if (data->start_class->flags & ANYOF_LOCALE)
1413 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1415 for (value = 0; value < 256; value++)
1417 ANYOF_BITMAP_SET(data->start_class, value);
1422 if (flags & SCF_DO_STCLASS_AND) {
1423 if (data->start_class->flags & ANYOF_LOCALE)
1424 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1427 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1428 data->start_class->flags |= ANYOF_LOCALE;
1432 if (flags & SCF_DO_STCLASS_AND) {
1433 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1434 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1435 for (value = 0; value < 256; value++)
1437 ANYOF_BITMAP_CLEAR(data->start_class, value);
1441 if (data->start_class->flags & ANYOF_LOCALE)
1442 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1444 for (value = 0; value < 256; value++)
1445 if (!isALNUM(value))
1446 ANYOF_BITMAP_SET(data->start_class, value);
1451 if (flags & SCF_DO_STCLASS_AND) {
1452 if (data->start_class->flags & ANYOF_LOCALE)
1453 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1456 data->start_class->flags |= ANYOF_LOCALE;
1457 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1461 if (flags & SCF_DO_STCLASS_AND) {
1462 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1463 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1464 for (value = 0; value < 256; value++)
1465 if (!isSPACE(value))
1466 ANYOF_BITMAP_CLEAR(data->start_class, value);
1470 if (data->start_class->flags & ANYOF_LOCALE)
1471 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1473 for (value = 0; value < 256; value++)
1475 ANYOF_BITMAP_SET(data->start_class, value);
1480 if (flags & SCF_DO_STCLASS_AND) {
1481 if (data->start_class->flags & ANYOF_LOCALE)
1482 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1485 data->start_class->flags |= ANYOF_LOCALE;
1486 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1490 if (flags & SCF_DO_STCLASS_AND) {
1491 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1492 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1493 for (value = 0; value < 256; value++)
1495 ANYOF_BITMAP_CLEAR(data->start_class, value);
1499 if (data->start_class->flags & ANYOF_LOCALE)
1500 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1502 for (value = 0; value < 256; value++)
1503 if (!isSPACE(value))
1504 ANYOF_BITMAP_SET(data->start_class, value);
1509 if (flags & SCF_DO_STCLASS_AND) {
1510 if (data->start_class->flags & ANYOF_LOCALE) {
1511 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1512 for (value = 0; value < 256; value++)
1513 if (!isSPACE(value))
1514 ANYOF_BITMAP_CLEAR(data->start_class, value);
1518 data->start_class->flags |= ANYOF_LOCALE;
1519 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1523 if (flags & SCF_DO_STCLASS_AND) {
1524 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1525 for (value = 0; value < 256; value++)
1526 if (!isDIGIT(value))
1527 ANYOF_BITMAP_CLEAR(data->start_class, value);
1530 if (data->start_class->flags & ANYOF_LOCALE)
1531 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1533 for (value = 0; value < 256; value++)
1535 ANYOF_BITMAP_SET(data->start_class, value);
1540 if (flags & SCF_DO_STCLASS_AND) {
1541 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1542 for (value = 0; value < 256; value++)
1544 ANYOF_BITMAP_CLEAR(data->start_class, value);
1547 if (data->start_class->flags & ANYOF_LOCALE)
1548 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1550 for (value = 0; value < 256; value++)
1551 if (!isDIGIT(value))
1552 ANYOF_BITMAP_SET(data->start_class, value);
1557 if (flags & SCF_DO_STCLASS_OR)
1558 cl_and(data->start_class, &and_with);
1559 flags &= ~SCF_DO_STCLASS;
1562 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1563 data->flags |= (OP(scan) == MEOL
1567 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1568 /* Lookbehind, or need to calculate parens/evals/stclass: */
1569 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1570 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1571 /* Lookahead/lookbehind */
1572 I32 deltanext, minnext, fake = 0;
1574 struct regnode_charclass_class intrnl;
1577 data_fake.flags = 0;
1579 data_fake.whilem_c = data->whilem_c;
1580 data_fake.last_closep = data->last_closep;
1583 data_fake.last_closep = &fake;
1584 if ( flags & SCF_DO_STCLASS && !scan->flags
1585 && OP(scan) == IFMATCH ) { /* Lookahead */
1586 cl_init(pRExC_state, &intrnl);
1587 data_fake.start_class = &intrnl;
1588 f |= SCF_DO_STCLASS_AND;
1590 if (flags & SCF_WHILEM_VISITED_POS)
1591 f |= SCF_WHILEM_VISITED_POS;
1592 next = regnext(scan);
1593 nscan = NEXTOPER(NEXTOPER(scan));
1594 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1597 vFAIL("Variable length lookbehind not implemented");
1599 else if (minnext > U8_MAX) {
1600 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1602 scan->flags = (U8)minnext;
1604 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1606 if (data && (data_fake.flags & SF_HAS_EVAL))
1607 data->flags |= SF_HAS_EVAL;
1609 data->whilem_c = data_fake.whilem_c;
1610 if (f & SCF_DO_STCLASS_AND) {
1611 int was = (data->start_class->flags & ANYOF_EOS);
1613 cl_and(data->start_class, &intrnl);
1615 data->start_class->flags |= ANYOF_EOS;
1618 else if (OP(scan) == OPEN) {
1621 else if (OP(scan) == CLOSE) {
1622 if ((I32)ARG(scan) == is_par) {
1623 next = regnext(scan);
1625 if ( next && (OP(next) != WHILEM) && next < last)
1626 is_par = 0; /* Disable optimization */
1629 *(data->last_closep) = ARG(scan);
1631 else if (OP(scan) == EVAL) {
1633 data->flags |= SF_HAS_EVAL;
1635 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1636 if (flags & SCF_DO_SUBSTR) {
1637 scan_commit(pRExC_state,data);
1638 data->longest = &(data->longest_float);
1640 is_inf = is_inf_internal = 1;
1641 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1642 cl_anything(pRExC_state, data->start_class);
1643 flags &= ~SCF_DO_STCLASS;
1645 /* Else: zero-length, ignore. */
1646 scan = regnext(scan);
1651 *deltap = is_inf_internal ? I32_MAX : delta;
1652 if (flags & SCF_DO_SUBSTR && is_inf)
1653 data->pos_delta = I32_MAX - data->pos_min;
1654 if (is_par > U8_MAX)
1656 if (is_par && pars==1 && data) {
1657 data->flags |= SF_IN_PAR;
1658 data->flags &= ~SF_HAS_PAR;
1660 else if (pars && data) {
1661 data->flags |= SF_HAS_PAR;
1662 data->flags &= ~SF_IN_PAR;
1664 if (flags & SCF_DO_STCLASS_OR)
1665 cl_and(data->start_class, &and_with);
1670 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1672 if (RExC_rx->data) {
1673 Renewc(RExC_rx->data,
1674 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1675 char, struct reg_data);
1676 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1677 RExC_rx->data->count += n;
1680 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1681 char, struct reg_data);
1682 New(1208, RExC_rx->data->what, n, U8);
1683 RExC_rx->data->count = n;
1685 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1686 return RExC_rx->data->count - n;
1690 Perl_reginitcolors(pTHX)
1693 char *s = PerlEnv_getenv("PERL_RE_COLORS");
1696 PL_colors[0] = s = savepv(s);
1698 s = strchr(s, '\t');
1704 PL_colors[i] = s = "";
1708 PL_colors[i++] = "";
1715 - pregcomp - compile a regular expression into internal code
1717 * We can't allocate space until we know how big the compiled form will be,
1718 * but we can't compile it (and thus know how big it is) until we've got a
1719 * place to put the code. So we cheat: we compile it twice, once with code
1720 * generation turned off and size counting turned on, and once "for real".
1721 * This also means that we don't allocate space until we are sure that the
1722 * thing really will compile successfully, and we never have to move the
1723 * code and thus invalidate pointers into it. (Note that it has to be in
1724 * one piece because free() must be able to free it all.) [NB: not true in perl]
1726 * Beware that the optimization-preparation code in here knows about some
1727 * of the structure of the compiled regexp. [I'll say.]
1730 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1740 RExC_state_t RExC_state;
1741 RExC_state_t *pRExC_state = &RExC_state;
1744 FAIL("NULL regexp argument");
1746 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1750 if (!PL_colorset) reginitcolors();
1751 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1752 PL_colors[4],PL_colors[5],PL_colors[0],
1753 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1755 RExC_flags = pm->op_pmflags;
1759 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1760 RExC_seen_evals = 0;
1763 /* First pass: determine size, legality. */
1770 RExC_emit = &PL_regdummy;
1771 RExC_whilem_seen = 0;
1772 #if 0 /* REGC() is (currently) a NOP at the first pass.
1773 * Clever compilers notice this and complain. --jhi */
1774 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1776 if (reg(pRExC_state, 0, &flags) == NULL) {
1777 RExC_precomp = Nullch;
1780 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1782 /* Small enough for pointer-storage convention?
1783 If extralen==0, this means that we will not need long jumps. */
1784 if (RExC_size >= 0x10000L && RExC_extralen)
1785 RExC_size += RExC_extralen;
1788 if (RExC_whilem_seen > 15)
1789 RExC_whilem_seen = 15;
1791 /* Allocate space and initialize. */
1792 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1795 FAIL("Regexp out of space");
1798 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1799 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1802 r->prelen = xend - exp;
1803 r->precomp = savepvn(RExC_precomp, r->prelen);
1805 #ifdef PERL_COPY_ON_WRITE
1806 r->saved_copy = Nullsv;
1808 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1809 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1811 r->substrs = 0; /* Useful during FAIL. */
1812 r->startp = 0; /* Useful during FAIL. */
1813 r->endp = 0; /* Useful during FAIL. */
1815 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1817 r->offsets[0] = RExC_size;
1819 DEBUG_r(PerlIO_printf(Perl_debug_log,
1820 "%s %"UVuf" bytes for offset annotations.\n",
1821 r->offsets ? "Got" : "Couldn't get",
1822 (UV)((2*RExC_size+1) * sizeof(U32))));
1826 /* Second pass: emit code. */
1827 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
1832 RExC_emit_start = r->program;
1833 RExC_emit = r->program;
1834 /* Store the count of eval-groups for security checks: */
1835 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1836 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1838 if (reg(pRExC_state, 0, &flags) == NULL)
1841 /* Dig out information for optimizations. */
1842 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1843 pm->op_pmflags = RExC_flags;
1845 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1846 r->regstclass = NULL;
1847 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1848 r->reganch |= ROPT_NAUGHTY;
1849 scan = r->program + 1; /* First BRANCH. */
1851 /* XXXX To minimize changes to RE engine we always allocate
1852 3-units-long substrs field. */
1853 Newz(1004, r->substrs, 1, struct reg_substr_data);
1855 StructCopy(&zero_scan_data, &data, scan_data_t);
1856 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1857 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1859 STRLEN longest_float_length, longest_fixed_length;
1860 struct regnode_charclass_class ch_class;
1865 /* Skip introductions and multiplicators >= 1. */
1866 while ((OP(first) == OPEN && (sawopen = 1)) ||
1867 /* An OR of *one* alternative - should not happen now. */
1868 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1869 (OP(first) == PLUS) ||
1870 (OP(first) == MINMOD) ||
1871 /* An {n,m} with n>0 */
1872 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1873 if (OP(first) == PLUS)
1876 first += regarglen[(U8)OP(first)];
1877 first = NEXTOPER(first);
1880 /* Starting-point info. */
1882 if (PL_regkind[(U8)OP(first)] == EXACT) {
1883 if (OP(first) == EXACT)
1884 ; /* Empty, get anchored substr later. */
1885 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1886 r->regstclass = first;
1888 else if (strchr((char*)PL_simple,OP(first)))
1889 r->regstclass = first;
1890 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1891 PL_regkind[(U8)OP(first)] == NBOUND)
1892 r->regstclass = first;
1893 else if (PL_regkind[(U8)OP(first)] == BOL) {
1894 r->reganch |= (OP(first) == MBOL
1896 : (OP(first) == SBOL
1899 first = NEXTOPER(first);
1902 else if (OP(first) == GPOS) {
1903 r->reganch |= ROPT_ANCH_GPOS;
1904 first = NEXTOPER(first);
1907 else if (!sawopen && (OP(first) == STAR &&
1908 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1909 !(r->reganch & ROPT_ANCH) )
1911 /* turn .* into ^.* with an implied $*=1 */
1912 int type = OP(NEXTOPER(first));
1914 if (type == REG_ANY)
1915 type = ROPT_ANCH_MBOL;
1917 type = ROPT_ANCH_SBOL;
1919 r->reganch |= type | ROPT_IMPLICIT;
1920 first = NEXTOPER(first);
1923 if (sawplus && (!sawopen || !RExC_sawback)
1924 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1925 /* x+ must match at the 1st pos of run of x's */
1926 r->reganch |= ROPT_SKIP;
1928 /* Scan is after the zeroth branch, first is atomic matcher. */
1929 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1930 (IV)(first - scan + 1)));
1932 * If there's something expensive in the r.e., find the
1933 * longest literal string that must appear and make it the
1934 * regmust. Resolve ties in favor of later strings, since
1935 * the regstart check works with the beginning of the r.e.
1936 * and avoiding duplication strengthens checking. Not a
1937 * strong reason, but sufficient in the absence of others.
1938 * [Now we resolve ties in favor of the earlier string if
1939 * it happens that c_offset_min has been invalidated, since the
1940 * earlier string may buy us something the later one won't.]
1944 data.longest_fixed = newSVpvn("",0);
1945 data.longest_float = newSVpvn("",0);
1946 data.last_found = newSVpvn("",0);
1947 data.longest = &(data.longest_fixed);
1949 if (!r->regstclass) {
1950 cl_init(pRExC_state, &ch_class);
1951 data.start_class = &ch_class;
1952 stclass_flag = SCF_DO_STCLASS_AND;
1953 } else /* XXXX Check for BOUND? */
1955 data.last_closep = &last_close;
1957 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1958 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1959 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1960 && data.last_start_min == 0 && data.last_end > 0
1961 && !RExC_seen_zerolen
1962 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1963 r->reganch |= ROPT_CHECK_ALL;
1964 scan_commit(pRExC_state, &data);
1965 SvREFCNT_dec(data.last_found);
1967 longest_float_length = CHR_SVLEN(data.longest_float);
1968 if (longest_float_length
1969 || (data.flags & SF_FL_BEFORE_EOL
1970 && (!(data.flags & SF_FL_BEFORE_MEOL)
1971 || (RExC_flags & PMf_MULTILINE)))) {
1974 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1975 && data.offset_fixed == data.offset_float_min
1976 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1977 goto remove_float; /* As in (a)+. */
1979 if (SvUTF8(data.longest_float)) {
1980 r->float_utf8 = data.longest_float;
1981 r->float_substr = Nullsv;
1983 r->float_substr = data.longest_float;
1984 r->float_utf8 = Nullsv;
1986 r->float_min_offset = data.offset_float_min;
1987 r->float_max_offset = data.offset_float_max;
1988 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1989 && (!(data.flags & SF_FL_BEFORE_MEOL)
1990 || (RExC_flags & PMf_MULTILINE)));
1991 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1995 r->float_substr = r->float_utf8 = Nullsv;
1996 SvREFCNT_dec(data.longest_float);
1997 longest_float_length = 0;
2000 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2001 if (longest_fixed_length
2002 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2003 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2004 || (RExC_flags & PMf_MULTILINE)))) {
2007 if (SvUTF8(data.longest_fixed)) {
2008 r->anchored_utf8 = data.longest_fixed;
2009 r->anchored_substr = Nullsv;
2011 r->anchored_substr = data.longest_fixed;
2012 r->anchored_utf8 = Nullsv;
2014 r->anchored_offset = data.offset_fixed;
2015 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2016 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2017 || (RExC_flags & PMf_MULTILINE)));
2018 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2021 r->anchored_substr = r->anchored_utf8 = Nullsv;
2022 SvREFCNT_dec(data.longest_fixed);
2023 longest_fixed_length = 0;
2026 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2027 r->regstclass = NULL;
2028 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2030 && !(data.start_class->flags & ANYOF_EOS)
2031 && !cl_is_anything(data.start_class))
2033 I32 n = add_data(pRExC_state, 1, "f");
2035 New(1006, RExC_rx->data->data[n], 1,
2036 struct regnode_charclass_class);
2037 StructCopy(data.start_class,
2038 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2039 struct regnode_charclass_class);
2040 r->regstclass = (regnode*)RExC_rx->data->data[n];
2041 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2042 PL_regdata = r->data; /* for regprop() */
2043 DEBUG_r({ SV *sv = sv_newmortal();
2044 regprop(sv, (regnode*)data.start_class);
2045 PerlIO_printf(Perl_debug_log,
2046 "synthetic stclass `%s'.\n",
2050 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2051 if (longest_fixed_length > longest_float_length) {
2052 r->check_substr = r->anchored_substr;
2053 r->check_utf8 = r->anchored_utf8;
2054 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2055 if (r->reganch & ROPT_ANCH_SINGLE)
2056 r->reganch |= ROPT_NOSCAN;
2059 r->check_substr = r->float_substr;
2060 r->check_utf8 = r->float_utf8;
2061 r->check_offset_min = data.offset_float_min;
2062 r->check_offset_max = data.offset_float_max;
2064 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2065 This should be changed ASAP! */
2066 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2067 r->reganch |= RE_USE_INTUIT;
2068 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2069 r->reganch |= RE_INTUIT_TAIL;
2073 /* Several toplevels. Best we can is to set minlen. */
2075 struct regnode_charclass_class ch_class;
2078 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2079 scan = r->program + 1;
2080 cl_init(pRExC_state, &ch_class);
2081 data.start_class = &ch_class;
2082 data.last_closep = &last_close;
2083 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2084 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2085 = r->float_substr = r->float_utf8 = Nullsv;
2086 if (!(data.start_class->flags & ANYOF_EOS)
2087 && !cl_is_anything(data.start_class))
2089 I32 n = add_data(pRExC_state, 1, "f");
2091 New(1006, RExC_rx->data->data[n], 1,
2092 struct regnode_charclass_class);
2093 StructCopy(data.start_class,
2094 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2095 struct regnode_charclass_class);
2096 r->regstclass = (regnode*)RExC_rx->data->data[n];
2097 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2098 DEBUG_r({ SV* sv = sv_newmortal();
2099 regprop(sv, (regnode*)data.start_class);
2100 PerlIO_printf(Perl_debug_log,
2101 "synthetic stclass `%s'.\n",
2107 if (RExC_seen & REG_SEEN_GPOS)
2108 r->reganch |= ROPT_GPOS_SEEN;
2109 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2110 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2111 if (RExC_seen & REG_SEEN_EVAL)
2112 r->reganch |= ROPT_EVAL_SEEN;
2113 if (RExC_seen & REG_SEEN_CANY)
2114 r->reganch |= ROPT_CANY_SEEN;
2115 Newz(1002, r->startp, RExC_npar, I32);
2116 Newz(1002, r->endp, RExC_npar, I32);
2117 PL_regdata = r->data; /* for regprop() */
2118 DEBUG_r(regdump(r));
2123 - reg - regular expression, i.e. main body or parenthesized thing
2125 * Caller must absorb opening parenthesis.
2127 * Combining parenthesis handling with the base level of regular expression
2128 * is a trifle forced, but the need to tie the tails of the branches to what
2129 * follows makes it hard to avoid.
2132 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2133 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2135 register regnode *ret; /* Will be the head of the group. */
2136 register regnode *br;
2137 register regnode *lastbr;
2138 register regnode *ender = 0;
2139 register I32 parno = 0;
2140 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2142 /* for (?g), (?gc), and (?o) warnings; warning
2143 about (?c) will warn about (?g) -- japhy */
2145 I32 wastedflags = 0x00,
2148 wasted_gc = 0x02 | 0x04,
2151 char * parse_start = RExC_parse; /* MJD */
2152 char *oregcomp_parse = RExC_parse;
2155 *flagp = 0; /* Tentatively. */
2158 /* Make an OPEN node, if parenthesized. */
2160 if (*RExC_parse == '?') { /* (?...) */
2161 U32 posflags = 0, negflags = 0;
2162 U32 *flagsp = &posflags;
2164 char *seqstart = RExC_parse;
2167 paren = *RExC_parse++;
2168 ret = NULL; /* For look-ahead/behind. */
2170 case '<': /* (?<...) */
2171 RExC_seen |= REG_SEEN_LOOKBEHIND;
2172 if (*RExC_parse == '!')
2174 if (*RExC_parse != '=' && *RExC_parse != '!')
2177 case '=': /* (?=...) */
2178 case '!': /* (?!...) */
2179 RExC_seen_zerolen++;
2180 case ':': /* (?:...) */
2181 case '>': /* (?>...) */
2183 case '$': /* (?$...) */
2184 case '@': /* (?@...) */
2185 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2187 case '#': /* (?#...) */
2188 while (*RExC_parse && *RExC_parse != ')')
2190 if (*RExC_parse != ')')
2191 FAIL("Sequence (?#... not terminated");
2192 nextchar(pRExC_state);
2195 case 'p': /* (?p...) */
2196 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2197 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2199 case '?': /* (??...) */
2201 if (*RExC_parse != '{')
2203 paren = *RExC_parse++;
2205 case '{': /* (?{...}) */
2207 I32 count = 1, n = 0;
2209 char *s = RExC_parse;
2211 OP_4tree *sop, *rop;
2213 RExC_seen_zerolen++;
2214 RExC_seen |= REG_SEEN_EVAL;
2215 while (count && (c = *RExC_parse)) {
2216 if (c == '\\' && RExC_parse[1])
2224 if (*RExC_parse != ')')
2227 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2232 if (RExC_parse - 1 - s)
2233 sv = newSVpvn(s, RExC_parse - 1 - s);
2235 sv = newSVpvn("", 0);
2238 Perl_save_re_context(aTHX);
2239 rop = sv_compile_2op(sv, &sop, "re", &pad);
2240 sop->op_private |= OPpREFCOUNTED;
2241 /* re_dup will OpREFCNT_inc */
2242 OpREFCNT_set(sop, 1);
2245 n = add_data(pRExC_state, 3, "nop");
2246 RExC_rx->data->data[n] = (void*)rop;
2247 RExC_rx->data->data[n+1] = (void*)sop;
2248 RExC_rx->data->data[n+2] = (void*)pad;
2251 else { /* First pass */
2252 if (PL_reginterp_cnt < ++RExC_seen_evals
2253 && PL_curcop != &PL_compiling)
2254 /* No compiled RE interpolated, has runtime
2255 components ===> unsafe. */
2256 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2257 if (PL_tainting && PL_tainted)
2258 FAIL("Eval-group in insecure regular expression");
2261 nextchar(pRExC_state);
2263 ret = reg_node(pRExC_state, LOGICAL);
2266 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2267 /* deal with the length of this later - MJD */
2270 ret = reganode(pRExC_state, EVAL, n);
2271 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2272 Set_Node_Offset(ret, parse_start);
2275 case '(': /* (?(?{...})...) and (?(?=...)...) */
2277 if (RExC_parse[0] == '?') { /* (?(?...)) */
2278 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2279 || RExC_parse[1] == '<'
2280 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2283 ret = reg_node(pRExC_state, LOGICAL);
2286 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2290 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2292 parno = atoi(RExC_parse++);
2294 while (isDIGIT(*RExC_parse))
2296 ret = reganode(pRExC_state, GROUPP, parno);
2298 if ((c = *nextchar(pRExC_state)) != ')')
2299 vFAIL("Switch condition not recognized");
2301 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2302 br = regbranch(pRExC_state, &flags, 1);
2304 br = reganode(pRExC_state, LONGJMP, 0);
2306 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2307 c = *nextchar(pRExC_state);
2311 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2312 regbranch(pRExC_state, &flags, 1);
2313 regtail(pRExC_state, ret, lastbr);
2316 c = *nextchar(pRExC_state);
2321 vFAIL("Switch (?(condition)... contains too many branches");
2322 ender = reg_node(pRExC_state, TAIL);
2323 regtail(pRExC_state, br, ender);
2325 regtail(pRExC_state, lastbr, ender);
2326 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2329 regtail(pRExC_state, ret, ender);
2333 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2337 RExC_parse--; /* for vFAIL to print correctly */
2338 vFAIL("Sequence (? incomplete");
2342 parse_flags: /* (?i) */
2343 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2344 /* (?g), (?gc) and (?o) are useless here
2345 and must be globally applied -- japhy */
2347 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2348 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2349 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2350 if (! (wastedflags & wflagbit) ) {
2351 wastedflags |= wflagbit;
2354 "Useless (%s%c) - %suse /%c modifier",
2355 flagsp == &negflags ? "?-" : "?",
2357 flagsp == &negflags ? "don't " : "",
2363 else if (*RExC_parse == 'c') {
2364 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2365 if (! (wastedflags & wasted_c) ) {
2366 wastedflags |= wasted_gc;
2369 "Useless (%sc) - %suse /gc modifier",
2370 flagsp == &negflags ? "?-" : "?",
2371 flagsp == &negflags ? "don't " : ""
2376 else { pmflag(flagsp, *RExC_parse); }
2380 if (*RExC_parse == '-') {
2382 wastedflags = 0; /* reset so (?g-c) warns twice */
2386 RExC_flags |= posflags;
2387 RExC_flags &= ~negflags;
2388 if (*RExC_parse == ':') {
2394 if (*RExC_parse != ')') {
2396 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2398 nextchar(pRExC_state);
2406 ret = reganode(pRExC_state, OPEN, parno);
2407 Set_Node_Length(ret, 1); /* MJD */
2408 Set_Node_Offset(ret, RExC_parse); /* MJD */
2415 /* Pick up the branches, linking them together. */
2416 parse_start = RExC_parse; /* MJD */
2417 br = regbranch(pRExC_state, &flags, 1);
2418 /* branch_len = (paren != 0); */
2422 if (*RExC_parse == '|') {
2423 if (!SIZE_ONLY && RExC_extralen) {
2424 reginsert(pRExC_state, BRANCHJ, br);
2427 reginsert(pRExC_state, BRANCH, br);
2428 Set_Node_Length(br, paren != 0);
2429 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2433 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2435 else if (paren == ':') {
2436 *flagp |= flags&SIMPLE;
2438 if (open) { /* Starts with OPEN. */
2439 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2441 else if (paren != '?') /* Not Conditional */
2443 *flagp |= flags & (SPSTART | HASWIDTH);
2445 while (*RExC_parse == '|') {
2446 if (!SIZE_ONLY && RExC_extralen) {
2447 ender = reganode(pRExC_state, LONGJMP,0);
2448 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2451 RExC_extralen += 2; /* Account for LONGJMP. */
2452 nextchar(pRExC_state);
2453 br = regbranch(pRExC_state, &flags, 0);
2457 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2461 *flagp |= flags&SPSTART;
2464 if (have_branch || paren != ':') {
2465 /* Make a closing node, and hook it on the end. */
2468 ender = reg_node(pRExC_state, TAIL);
2471 ender = reganode(pRExC_state, CLOSE, parno);
2472 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2473 Set_Node_Length(ender,1); /* MJD */
2479 *flagp &= ~HASWIDTH;
2482 ender = reg_node(pRExC_state, SUCCEED);
2485 ender = reg_node(pRExC_state, END);
2488 regtail(pRExC_state, lastbr, ender);
2491 /* Hook the tails of the branches to the closing node. */
2492 for (br = ret; br != NULL; br = regnext(br)) {
2493 regoptail(pRExC_state, br, ender);
2500 static char parens[] = "=!<,>";
2502 if (paren && (p = strchr(parens, paren))) {
2503 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2504 int flag = (p - parens) > 1;
2507 node = SUSPEND, flag = 0;
2508 reginsert(pRExC_state, node,ret);
2509 Set_Node_Offset(ret, oregcomp_parse);
2510 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 2);
2512 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2516 /* Check for proper termination. */
2518 RExC_flags = oregflags;
2519 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2520 RExC_parse = oregcomp_parse;
2521 vFAIL("Unmatched (");
2524 else if (!paren && RExC_parse < RExC_end) {
2525 if (*RExC_parse == ')') {
2527 vFAIL("Unmatched )");
2530 FAIL("Junk on end of regexp"); /* "Can't happen". */
2538 - regbranch - one alternative of an | operator
2540 * Implements the concatenation operator.
2543 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2545 register regnode *ret;
2546 register regnode *chain = NULL;
2547 register regnode *latest;
2548 I32 flags = 0, c = 0;
2553 if (!SIZE_ONLY && RExC_extralen)
2554 ret = reganode(pRExC_state, BRANCHJ,0);
2556 ret = reg_node(pRExC_state, BRANCH);
2557 Set_Node_Length(ret, 1);
2561 if (!first && SIZE_ONLY)
2562 RExC_extralen += 1; /* BRANCHJ */
2564 *flagp = WORST; /* Tentatively. */
2567 nextchar(pRExC_state);
2568 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2570 latest = regpiece(pRExC_state, &flags);
2571 if (latest == NULL) {
2572 if (flags & TRYAGAIN)
2576 else if (ret == NULL)
2578 *flagp |= flags&HASWIDTH;
2579 if (chain == NULL) /* First piece. */
2580 *flagp |= flags&SPSTART;
2583 regtail(pRExC_state, chain, latest);
2588 if (chain == NULL) { /* Loop ran zero times. */
2589 chain = reg_node(pRExC_state, NOTHING);
2594 *flagp |= flags&SIMPLE;
2601 - regpiece - something followed by possible [*+?]
2603 * Note that the branching code sequences used for ? and the general cases
2604 * of * and + are somewhat optimized: they use the same NOTHING node as
2605 * both the endmarker for their branch list and the body of the last branch.
2606 * It might seem that this node could be dispensed with entirely, but the
2607 * endmarker role is not redundant.
2610 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2612 register regnode *ret;
2614 register char *next;
2616 char *origparse = RExC_parse;
2619 I32 max = REG_INFTY;
2622 ret = regatom(pRExC_state, &flags);
2624 if (flags & TRYAGAIN)
2631 if (op == '{' && regcurly(RExC_parse)) {
2632 parse_start = RExC_parse; /* MJD */
2633 next = RExC_parse + 1;
2635 while (isDIGIT(*next) || *next == ',') {
2644 if (*next == '}') { /* got one */
2648 min = atoi(RExC_parse);
2652 maxpos = RExC_parse;
2654 if (!max && *maxpos != '0')
2655 max = REG_INFTY; /* meaning "infinity" */
2656 else if (max >= REG_INFTY)
2657 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2659 nextchar(pRExC_state);
2662 if ((flags&SIMPLE)) {
2663 RExC_naughty += 2 + RExC_naughty / 2;
2664 reginsert(pRExC_state, CURLY, ret);
2665 Set_Node_Offset(ret, parse_start+1); /* MJD */
2666 Set_Node_Cur_Length(ret);
2669 regnode *w = reg_node(pRExC_state, WHILEM);
2672 regtail(pRExC_state, ret, w);
2673 if (!SIZE_ONLY && RExC_extralen) {
2674 reginsert(pRExC_state, LONGJMP,ret);
2675 reginsert(pRExC_state, NOTHING,ret);
2676 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2678 reginsert(pRExC_state, CURLYX,ret);
2680 Set_Node_Offset(ret, parse_start+1);
2681 Set_Node_Length(ret,
2682 op == '{' ? (RExC_parse - parse_start) : 1);
2684 if (!SIZE_ONLY && RExC_extralen)
2685 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2686 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2688 RExC_whilem_seen++, RExC_extralen += 3;
2689 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2697 if (max && max < min)
2698 vFAIL("Can't do {n,m} with n > m");
2700 ARG1_SET(ret, (U16)min);
2701 ARG2_SET(ret, (U16)max);
2713 #if 0 /* Now runtime fix should be reliable. */
2715 /* if this is reinstated, don't forget to put this back into perldiag:
2717 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2719 (F) The part of the regexp subject to either the * or + quantifier
2720 could match an empty string. The {#} shows in the regular
2721 expression about where the problem was discovered.
2725 if (!(flags&HASWIDTH) && op != '?')
2726 vFAIL("Regexp *+ operand could be empty");
2729 parse_start = RExC_parse;
2730 nextchar(pRExC_state);
2732 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2734 if (op == '*' && (flags&SIMPLE)) {
2735 reginsert(pRExC_state, STAR, ret);
2739 else if (op == '*') {
2743 else if (op == '+' && (flags&SIMPLE)) {
2744 reginsert(pRExC_state, PLUS, ret);
2748 else if (op == '+') {
2752 else if (op == '?') {
2757 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2759 "%.*s matches null string many times",
2760 RExC_parse - origparse,
2764 if (*RExC_parse == '?') {
2765 nextchar(pRExC_state);
2766 reginsert(pRExC_state, MINMOD, ret);
2767 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2769 if (ISMULT2(RExC_parse)) {
2771 vFAIL("Nested quantifiers");
2778 - regatom - the lowest level
2780 * Optimization: gobbles an entire sequence of ordinary characters so that
2781 * it can turn them into a single node, which is smaller to store and
2782 * faster to run. Backslashed characters are exceptions, each becoming a
2783 * separate node; the code is simpler that way and it's not worth fixing.
2785 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2787 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2789 register regnode *ret = 0;
2791 char *parse_start = 0;
2793 *flagp = WORST; /* Tentatively. */
2796 switch (*RExC_parse) {
2798 RExC_seen_zerolen++;
2799 nextchar(pRExC_state);
2800 if (RExC_flags & PMf_MULTILINE)
2801 ret = reg_node(pRExC_state, MBOL);
2802 else if (RExC_flags & PMf_SINGLELINE)
2803 ret = reg_node(pRExC_state, SBOL);
2805 ret = reg_node(pRExC_state, BOL);
2806 Set_Node_Length(ret, 1); /* MJD */
2809 nextchar(pRExC_state);
2811 RExC_seen_zerolen++;
2812 if (RExC_flags & PMf_MULTILINE)
2813 ret = reg_node(pRExC_state, MEOL);
2814 else if (RExC_flags & PMf_SINGLELINE)
2815 ret = reg_node(pRExC_state, SEOL);
2817 ret = reg_node(pRExC_state, EOL);
2818 Set_Node_Length(ret, 1); /* MJD */
2821 nextchar(pRExC_state);
2822 if (RExC_flags & PMf_SINGLELINE)
2823 ret = reg_node(pRExC_state, SANY);
2825 ret = reg_node(pRExC_state, REG_ANY);
2826 *flagp |= HASWIDTH|SIMPLE;
2828 Set_Node_Length(ret, 1); /* MJD */
2832 char *oregcomp_parse = ++RExC_parse;
2833 ret = regclass(pRExC_state);
2834 if (*RExC_parse != ']') {
2835 RExC_parse = oregcomp_parse;
2836 vFAIL("Unmatched [");
2838 nextchar(pRExC_state);
2839 *flagp |= HASWIDTH|SIMPLE;
2840 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2844 nextchar(pRExC_state);
2845 ret = reg(pRExC_state, 1, &flags);
2847 if (flags & TRYAGAIN) {
2848 if (RExC_parse == RExC_end) {
2849 /* Make parent create an empty node if needed. */
2857 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2861 if (flags & TRYAGAIN) {
2865 vFAIL("Internal urp");
2866 /* Supposed to be caught earlier. */
2869 if (!regcurly(RExC_parse)) {
2878 vFAIL("Quantifier follows nothing");
2881 switch (*++RExC_parse) {
2883 RExC_seen_zerolen++;
2884 ret = reg_node(pRExC_state, SBOL);
2886 nextchar(pRExC_state);
2887 Set_Node_Length(ret, 2); /* MJD */
2890 ret = reg_node(pRExC_state, GPOS);
2891 RExC_seen |= REG_SEEN_GPOS;
2893 nextchar(pRExC_state);
2894 Set_Node_Length(ret, 2); /* MJD */
2897 ret = reg_node(pRExC_state, SEOL);
2899 RExC_seen_zerolen++; /* Do not optimize RE away */
2900 nextchar(pRExC_state);
2903 ret = reg_node(pRExC_state, EOS);
2905 RExC_seen_zerolen++; /* Do not optimize RE away */
2906 nextchar(pRExC_state);
2907 Set_Node_Length(ret, 2); /* MJD */
2910 ret = reg_node(pRExC_state, CANY);
2911 RExC_seen |= REG_SEEN_CANY;
2912 *flagp |= HASWIDTH|SIMPLE;
2913 nextchar(pRExC_state);
2914 Set_Node_Length(ret, 2); /* MJD */
2917 ret = reg_node(pRExC_state, CLUMP);
2919 nextchar(pRExC_state);
2920 Set_Node_Length(ret, 2); /* MJD */
2923 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
2924 *flagp |= HASWIDTH|SIMPLE;
2925 nextchar(pRExC_state);
2926 Set_Node_Length(ret, 2); /* MJD */
2929 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
2930 *flagp |= HASWIDTH|SIMPLE;
2931 nextchar(pRExC_state);
2932 Set_Node_Length(ret, 2); /* MJD */
2935 RExC_seen_zerolen++;
2936 RExC_seen |= REG_SEEN_LOOKBEHIND;
2937 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
2939 nextchar(pRExC_state);
2940 Set_Node_Length(ret, 2); /* MJD */
2943 RExC_seen_zerolen++;
2944 RExC_seen |= REG_SEEN_LOOKBEHIND;
2945 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
2947 nextchar(pRExC_state);
2948 Set_Node_Length(ret, 2); /* MJD */
2951 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
2952 *flagp |= HASWIDTH|SIMPLE;
2953 nextchar(pRExC_state);
2954 Set_Node_Length(ret, 2); /* MJD */
2957 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
2958 *flagp |= HASWIDTH|SIMPLE;
2959 nextchar(pRExC_state);
2960 Set_Node_Length(ret, 2); /* MJD */
2963 ret = reg_node(pRExC_state, DIGIT);
2964 *flagp |= HASWIDTH|SIMPLE;
2965 nextchar(pRExC_state);
2966 Set_Node_Length(ret, 2); /* MJD */
2969 ret = reg_node(pRExC_state, NDIGIT);
2970 *flagp |= HASWIDTH|SIMPLE;
2971 nextchar(pRExC_state);
2972 Set_Node_Length(ret, 2); /* MJD */
2977 char* oldregxend = RExC_end;
2978 char* parse_start = RExC_parse - 2;
2980 if (RExC_parse[1] == '{') {
2981 /* a lovely hack--pretend we saw [\pX] instead */
2982 RExC_end = strchr(RExC_parse, '}');
2984 U8 c = (U8)*RExC_parse;
2986 RExC_end = oldregxend;
2987 vFAIL2("Missing right brace on \\%c{}", c);
2992 RExC_end = RExC_parse + 2;
2993 if (RExC_end > oldregxend)
2994 RExC_end = oldregxend;
2998 ret = regclass(pRExC_state);
3000 RExC_end = oldregxend;
3003 Set_Node_Offset(ret, parse_start + 2);
3004 Set_Node_Cur_Length(ret);
3005 nextchar(pRExC_state);
3006 *flagp |= HASWIDTH|SIMPLE;
3019 case '1': case '2': case '3': case '4':
3020 case '5': case '6': case '7': case '8': case '9':
3022 I32 num = atoi(RExC_parse);
3024 if (num > 9 && num >= RExC_npar)
3027 char * parse_start = RExC_parse - 1; /* MJD */
3028 while (isDIGIT(*RExC_parse))
3031 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3032 vFAIL("Reference to nonexistent group");
3034 ret = reganode(pRExC_state,
3035 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3039 /* override incorrect value set in reganode MJD */
3040 Set_Node_Offset(ret, parse_start+1);
3041 Set_Node_Cur_Length(ret); /* MJD */
3043 nextchar(pRExC_state);
3048 if (RExC_parse >= RExC_end)
3049 FAIL("Trailing \\");
3052 /* Do not generate `unrecognized' warnings here, we fall
3053 back into the quick-grab loop below */
3059 if (RExC_flags & PMf_EXTENDED) {
3060 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3061 if (RExC_parse < RExC_end)
3067 register STRLEN len;
3073 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3075 parse_start = RExC_parse - 1;
3081 ret = reg_node(pRExC_state,
3082 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3084 for (len = 0, p = RExC_parse - 1;
3085 len < 127 && p < RExC_end;
3090 if (RExC_flags & PMf_EXTENDED)
3091 p = regwhite(p, RExC_end);
3138 ender = ASCII_TO_NATIVE('\033');
3142 ender = ASCII_TO_NATIVE('\007');
3147 char* e = strchr(p, '}');
3151 vFAIL("Missing right brace on \\x{}");
3154 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3155 | PERL_SCAN_DISALLOW_PREFIX;
3157 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3160 /* numlen is generous */
3161 if (numlen + len >= 127) {
3169 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3171 ender = grok_hex(p, &numlen, &flags, NULL);
3177 ender = UCHARAT(p++);
3178 ender = toCTRL(ender);
3180 case '0': case '1': case '2': case '3':case '4':
3181 case '5': case '6': case '7': case '8':case '9':
3183 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3186 ender = grok_oct(p, &numlen, &flags, NULL);
3196 FAIL("Trailing \\");
3199 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3200 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3201 goto normal_default;
3206 if (UTF8_IS_START(*p) && UTF) {
3207 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3215 if (RExC_flags & PMf_EXTENDED)
3216 p = regwhite(p, RExC_end);
3218 /* Prime the casefolded buffer. */
3219 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3221 if (ISMULT2(p)) { /* Back off on ?+*. */
3228 /* Emit all the Unicode characters. */
3229 for (foldbuf = tmpbuf;
3231 foldlen -= numlen) {
3232 ender = utf8_to_uvchr(foldbuf, &numlen);
3234 reguni(pRExC_state, ender, s, &unilen);
3237 /* In EBCDIC the numlen
3238 * and unilen can differ. */
3240 if (numlen >= foldlen)
3244 break; /* "Can't happen." */
3248 reguni(pRExC_state, ender, s, &unilen);
3257 REGC((char)ender, s++);
3265 /* Emit all the Unicode characters. */
3266 for (foldbuf = tmpbuf;
3268 foldlen -= numlen) {
3269 ender = utf8_to_uvchr(foldbuf, &numlen);
3271 reguni(pRExC_state, ender, s, &unilen);
3274 /* In EBCDIC the numlen
3275 * and unilen can differ. */
3277 if (numlen >= foldlen)
3285 reguni(pRExC_state, ender, s, &unilen);
3294 REGC((char)ender, s++);
3298 Set_Node_Cur_Length(ret); /* MJD */
3299 nextchar(pRExC_state);
3301 /* len is STRLEN which is unsigned, need to copy to signed */
3304 vFAIL("Internal disaster");
3313 RExC_size += STR_SZ(len);
3315 RExC_emit += STR_SZ(len);
3320 /* If the encoding pragma is in effect recode the text of
3321 * any EXACT-kind nodes. */
3322 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3323 STRLEN oldlen = STR_LEN(ret);
3324 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3328 if (sv_utf8_downgrade(sv, TRUE)) {
3329 char *s = sv_recode_to_utf8(sv, PL_encoding);
3330 STRLEN newlen = SvCUR(sv);
3335 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3336 (int)oldlen, STRING(ret),
3338 Copy(s, STRING(ret), newlen, char);
3339 STR_LEN(ret) += newlen - oldlen;
3340 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3342 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3350 S_regwhite(pTHX_ char *p, char *e)
3355 else if (*p == '#') {
3358 } while (p < e && *p != '\n');
3366 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3367 Character classes ([:foo:]) can also be negated ([:^foo:]).
3368 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3369 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3370 but trigger failures because they are currently unimplemented. */
3372 #define POSIXCC_DONE(c) ((c) == ':')
3373 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3374 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3377 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3380 I32 namedclass = OOB_NAMEDCLASS;
3382 if (value == '[' && RExC_parse + 1 < RExC_end &&
3383 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3384 POSIXCC(UCHARAT(RExC_parse))) {
3385 char c = UCHARAT(RExC_parse);
3386 char* s = RExC_parse++;
3388 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3390 if (RExC_parse == RExC_end)
3391 /* Grandfather lone [:, [=, [. */
3394 char* t = RExC_parse++; /* skip over the c */
3396 if (UCHARAT(RExC_parse) == ']') {
3397 RExC_parse++; /* skip over the ending ] */
3400 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3401 I32 skip = 5; /* the most common skip */
3405 if (strnEQ(posixcc, "alnum", 5))
3407 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3408 else if (strnEQ(posixcc, "alpha", 5))
3410 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3411 else if (strnEQ(posixcc, "ascii", 5))
3413 complement ? ANYOF_NASCII : ANYOF_ASCII;
3416 if (strnEQ(posixcc, "blank", 5))
3418 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3421 if (strnEQ(posixcc, "cntrl", 5))
3423 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3426 if (strnEQ(posixcc, "digit", 5))
3428 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3431 if (strnEQ(posixcc, "graph", 5))
3433 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3436 if (strnEQ(posixcc, "lower", 5))
3438 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3441 if (strnEQ(posixcc, "print", 5))
3443 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3444 else if (strnEQ(posixcc, "punct", 5))
3446 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3449 if (strnEQ(posixcc, "space", 5))
3451 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3454 if (strnEQ(posixcc, "upper", 5))
3456 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3458 case 'w': /* this is not POSIX, this is the Perl \w */
3459 if (strnEQ(posixcc, "word", 4)) {
3461 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3466 if (strnEQ(posixcc, "xdigit", 6)) {
3468 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3473 if (namedclass == OOB_NAMEDCLASS ||
3474 posixcc[skip] != ':' ||
3475 posixcc[skip+1] != ']')
3477 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3480 } else if (!SIZE_ONLY) {
3481 /* [[=foo=]] and [[.foo.]] are still future. */
3483 /* adjust RExC_parse so the warning shows after
3485 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3487 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3490 /* Maternal grandfather:
3491 * "[:" ending in ":" but not in ":]" */
3501 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3503 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3504 char *s = RExC_parse;
3507 while(*s && isALNUM(*s))
3509 if (*s && c == *s && s[1] == ']') {
3510 if (ckWARN(WARN_REGEXP))
3512 "POSIX syntax [%c %c] belongs inside character classes",
3515 /* [[=foo=]] and [[.foo.]] are still future. */
3516 if (POSIXCC_NOTYET(c)) {
3517 /* adjust RExC_parse so the error shows after
3519 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3521 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3528 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3531 register UV nextvalue;
3532 register IV prevvalue = OOB_UNICODE;
3533 register IV range = 0;
3534 register regnode *ret;
3537 char *rangebegin = 0;
3538 bool need_class = 0;
3539 SV *listsv = Nullsv;
3542 bool optimize_invert = TRUE;
3543 AV* unicode_alternate = 0;
3545 UV literal_endpoint = 0;
3548 ret = reganode(pRExC_state, ANYOF, 0);
3551 ANYOF_FLAGS(ret) = 0;
3553 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3557 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3561 RExC_size += ANYOF_SKIP;
3563 RExC_emit += ANYOF_SKIP;
3565 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3567 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3568 ANYOF_BITMAP_ZERO(ret);
3569 listsv = newSVpvn("# comment\n", 10);
3572 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3574 if (!SIZE_ONLY && POSIXCC(nextvalue))
3575 checkposixcc(pRExC_state);
3577 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3578 if (UCHARAT(RExC_parse) == ']')
3581 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3585 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3588 rangebegin = RExC_parse;
3590 value = utf8n_to_uvchr((U8*)RExC_parse,
3591 RExC_end - RExC_parse,
3593 RExC_parse += numlen;
3596 value = UCHARAT(RExC_parse++);
3597 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3598 if (value == '[' && POSIXCC(nextvalue))
3599 namedclass = regpposixcc(pRExC_state, value);
3600 else if (value == '\\') {
3602 value = utf8n_to_uvchr((U8*)RExC_parse,
3603 RExC_end - RExC_parse,
3605 RExC_parse += numlen;
3608 value = UCHARAT(RExC_parse++);
3609 /* Some compilers cannot handle switching on 64-bit integer
3610 * values, therefore value cannot be an UV. Yes, this will
3611 * be a problem later if we want switch on Unicode.
3612 * A similar issue a little bit later when switching on
3613 * namedclass. --jhi */
3614 switch ((I32)value) {
3615 case 'w': namedclass = ANYOF_ALNUM; break;
3616 case 'W': namedclass = ANYOF_NALNUM; break;
3617 case 's': namedclass = ANYOF_SPACE; break;
3618 case 'S': namedclass = ANYOF_NSPACE; break;
3619 case 'd': namedclass = ANYOF_DIGIT; break;
3620 case 'D': namedclass = ANYOF_NDIGIT; break;
3623 if (RExC_parse >= RExC_end)
3624 vFAIL2("Empty \\%c{}", (U8)value);
3625 if (*RExC_parse == '{') {
3627 e = strchr(RExC_parse++, '}');
3629 vFAIL2("Missing right brace on \\%c{}", c);
3630 while (isSPACE(UCHARAT(RExC_parse)))
3632 if (e == RExC_parse)
3633 vFAIL2("Empty \\%c{}", c);
3635 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3643 if (UCHARAT(RExC_parse) == '^') {
3646 value = value == 'p' ? 'P' : 'p'; /* toggle */
3647 while (isSPACE(UCHARAT(RExC_parse))) {
3653 Perl_sv_catpvf(aTHX_ listsv,
3654 "+utf8::%.*s\n", (int)n, RExC_parse);
3656 Perl_sv_catpvf(aTHX_ listsv,
3657 "!utf8::%.*s\n", (int)n, RExC_parse);
3660 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3662 case 'n': value = '\n'; break;
3663 case 'r': value = '\r'; break;
3664 case 't': value = '\t'; break;
3665 case 'f': value = '\f'; break;
3666 case 'b': value = '\b'; break;
3667 case 'e': value = ASCII_TO_NATIVE('\033');break;
3668 case 'a': value = ASCII_TO_NATIVE('\007');break;
3670 if (*RExC_parse == '{') {
3671 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3672 | PERL_SCAN_DISALLOW_PREFIX;
3673 e = strchr(RExC_parse++, '}');
3675 vFAIL("Missing right brace on \\x{}");
3677 numlen = e - RExC_parse;
3678 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3682 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3684 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3685 RExC_parse += numlen;
3689 value = UCHARAT(RExC_parse++);
3690 value = toCTRL(value);
3692 case '0': case '1': case '2': case '3': case '4':
3693 case '5': case '6': case '7': case '8': case '9':
3697 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3698 RExC_parse += numlen;
3702 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3704 "Unrecognized escape \\%c in character class passed through",
3708 } /* end of \blah */
3714 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3716 if (!SIZE_ONLY && !need_class)
3717 ANYOF_CLASS_ZERO(ret);
3721 /* a bad range like a-\d, a-[:digit:] ? */
3724 if (ckWARN(WARN_REGEXP))
3726 "False [] range \"%*.*s\"",
3727 RExC_parse - rangebegin,
3728 RExC_parse - rangebegin,
3730 if (prevvalue < 256) {
3731 ANYOF_BITMAP_SET(ret, prevvalue);
3732 ANYOF_BITMAP_SET(ret, '-');
3735 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3736 Perl_sv_catpvf(aTHX_ listsv,
3737 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3741 range = 0; /* this was not a true range */
3745 if (namedclass > OOB_NAMEDCLASS)
3746 optimize_invert = FALSE;
3747 /* Possible truncation here but in some 64-bit environments
3748 * the compiler gets heartburn about switch on 64-bit values.
3749 * A similar issue a little earlier when switching on value.
3751 switch ((I32)namedclass) {
3754 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3756 for (value = 0; value < 256; value++)
3758 ANYOF_BITMAP_SET(ret, value);
3760 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
3764 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3766 for (value = 0; value < 256; value++)
3767 if (!isALNUM(value))
3768 ANYOF_BITMAP_SET(ret, value);
3770 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3774 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3776 for (value = 0; value < 256; value++)
3777 if (isALNUMC(value))
3778 ANYOF_BITMAP_SET(ret, value);
3780 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3784 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3786 for (value = 0; value < 256; value++)
3787 if (!isALNUMC(value))
3788 ANYOF_BITMAP_SET(ret, value);
3790 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3794 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3796 for (value = 0; value < 256; value++)
3798 ANYOF_BITMAP_SET(ret, value);
3800 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3804 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3806 for (value = 0; value < 256; value++)
3807 if (!isALPHA(value))
3808 ANYOF_BITMAP_SET(ret, value);
3810 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3814 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3817 for (value = 0; value < 128; value++)
3818 ANYOF_BITMAP_SET(ret, value);
3820 for (value = 0; value < 256; value++) {
3822 ANYOF_BITMAP_SET(ret, value);
3826 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3830 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3833 for (value = 128; value < 256; value++)
3834 ANYOF_BITMAP_SET(ret, value);
3836 for (value = 0; value < 256; value++) {
3837 if (!isASCII(value))
3838 ANYOF_BITMAP_SET(ret, value);
3842 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3846 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3848 for (value = 0; value < 256; value++)
3850 ANYOF_BITMAP_SET(ret, value);
3852 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3856 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3858 for (value = 0; value < 256; value++)
3859 if (!isBLANK(value))
3860 ANYOF_BITMAP_SET(ret, value);
3862 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3866 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3868 for (value = 0; value < 256; value++)
3870 ANYOF_BITMAP_SET(ret, value);
3872 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3876 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3878 for (value = 0; value < 256; value++)
3879 if (!isCNTRL(value))
3880 ANYOF_BITMAP_SET(ret, value);
3882 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3886 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3888 /* consecutive digits assumed */
3889 for (value = '0'; value <= '9'; value++)
3890 ANYOF_BITMAP_SET(ret, value);
3892 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3896 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3898 /* consecutive digits assumed */
3899 for (value = 0; value < '0'; value++)
3900 ANYOF_BITMAP_SET(ret, value);
3901 for (value = '9' + 1; value < 256; value++)
3902 ANYOF_BITMAP_SET(ret, value);
3904 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3908 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3910 for (value = 0; value < 256; value++)
3912 ANYOF_BITMAP_SET(ret, value);
3914 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3918 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3920 for (value = 0; value < 256; value++)
3921 if (!isGRAPH(value))
3922 ANYOF_BITMAP_SET(ret, value);
3924 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3928 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3930 for (value = 0; value < 256; value++)
3932 ANYOF_BITMAP_SET(ret, value);
3934 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3938 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3940 for (value = 0; value < 256; value++)
3941 if (!isLOWER(value))
3942 ANYOF_BITMAP_SET(ret, value);
3944 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3948 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3950 for (value = 0; value < 256; value++)
3952 ANYOF_BITMAP_SET(ret, value);
3954 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3958 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3960 for (value = 0; value < 256; value++)
3961 if (!isPRINT(value))
3962 ANYOF_BITMAP_SET(ret, value);
3964 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3968 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3970 for (value = 0; value < 256; value++)
3971 if (isPSXSPC(value))
3972 ANYOF_BITMAP_SET(ret, value);
3974 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3978 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3980 for (value = 0; value < 256; value++)
3981 if (!isPSXSPC(value))
3982 ANYOF_BITMAP_SET(ret, value);
3984 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3988 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3990 for (value = 0; value < 256; value++)
3992 ANYOF_BITMAP_SET(ret, value);
3994 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3998 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
4000 for (value = 0; value < 256; value++)
4001 if (!isPUNCT(value))
4002 ANYOF_BITMAP_SET(ret, value);
4004 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
4008 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4010 for (value = 0; value < 256; value++)
4012 ANYOF_BITMAP_SET(ret, value);
4014 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
4018 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4020 for (value = 0; value < 256; value++)
4021 if (!isSPACE(value))
4022 ANYOF_BITMAP_SET(ret, value);
4024 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4028 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4030 for (value = 0; value < 256; value++)
4032 ANYOF_BITMAP_SET(ret, value);
4034 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4038 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4040 for (value = 0; value < 256; value++)
4041 if (!isUPPER(value))
4042 ANYOF_BITMAP_SET(ret, value);
4044 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4048 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4050 for (value = 0; value < 256; value++)
4051 if (isXDIGIT(value))
4052 ANYOF_BITMAP_SET(ret, value);
4054 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4058 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4060 for (value = 0; value < 256; value++)
4061 if (!isXDIGIT(value))
4062 ANYOF_BITMAP_SET(ret, value);
4064 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4067 vFAIL("Invalid [::] class");
4071 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4074 } /* end of namedclass \blah */
4077 if (prevvalue > (IV)value) /* b-a */ {
4078 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4079 RExC_parse - rangebegin,
4080 RExC_parse - rangebegin,
4082 range = 0; /* not a valid range */
4086 prevvalue = value; /* save the beginning of the range */
4087 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4088 RExC_parse[1] != ']') {
4091 /* a bad range like \w-, [:word:]- ? */
4092 if (namedclass > OOB_NAMEDCLASS) {
4093 if (ckWARN(WARN_REGEXP))
4095 "False [] range \"%*.*s\"",
4096 RExC_parse - rangebegin,
4097 RExC_parse - rangebegin,
4100 ANYOF_BITMAP_SET(ret, '-');
4102 range = 1; /* yeah, it's a range! */
4103 continue; /* but do it the next time */
4107 /* now is the next time */
4111 if (prevvalue < 256) {
4112 IV ceilvalue = value < 256 ? value : 255;
4115 /* In EBCDIC [\x89-\x91] should include
4116 * the \x8e but [i-j] should not. */
4117 if (literal_endpoint == 2 &&
4118 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4119 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4121 if (isLOWER(prevvalue)) {
4122 for (i = prevvalue; i <= ceilvalue; i++)
4124 ANYOF_BITMAP_SET(ret, i);
4126 for (i = prevvalue; i <= ceilvalue; i++)
4128 ANYOF_BITMAP_SET(ret, i);
4133 for (i = prevvalue; i <= ceilvalue; i++)
4134 ANYOF_BITMAP_SET(ret, i);
4136 if (value > 255 || UTF) {
4137 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4138 UV natvalue = NATIVE_TO_UNI(value);
4140 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4141 if (prevnatvalue < natvalue) { /* what about > ? */
4142 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4143 prevnatvalue, natvalue);
4145 else if (prevnatvalue == natvalue) {
4146 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4148 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4150 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4152 /* If folding and foldable and a single
4153 * character, insert also the folded version
4154 * to the charclass. */
4156 if (foldlen == (STRLEN)UNISKIP(f))
4157 Perl_sv_catpvf(aTHX_ listsv,
4160 /* Any multicharacter foldings
4161 * require the following transform:
4162 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4163 * where E folds into "pq" and F folds
4164 * into "rst", all other characters
4165 * fold to single characters. We save
4166 * away these multicharacter foldings,
4167 * to be later saved as part of the
4168 * additional "s" data. */
4171 if (!unicode_alternate)
4172 unicode_alternate = newAV();
4173 sv = newSVpvn((char*)foldbuf, foldlen);
4175 av_push(unicode_alternate, sv);
4179 /* If folding and the value is one of the Greek
4180 * sigmas insert a few more sigmas to make the
4181 * folding rules of the sigmas to work right.
4182 * Note that not all the possible combinations
4183 * are handled here: some of them are handled
4184 * by the standard folding rules, and some of
4185 * them (literal or EXACTF cases) are handled
4186 * during runtime in regexec.c:S_find_byclass(). */
4187 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4188 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4189 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4190 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4191 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4193 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4194 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4195 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4200 literal_endpoint = 0;
4204 range = 0; /* this range (if it was one) is done now */
4208 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4210 RExC_size += ANYOF_CLASS_ADD_SKIP;
4212 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4215 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4217 /* If the only flag is folding (plus possibly inversion). */
4218 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4220 for (value = 0; value < 256; ++value) {
4221 if (ANYOF_BITMAP_TEST(ret, value)) {
4222 UV fold = PL_fold[value];
4225 ANYOF_BITMAP_SET(ret, fold);
4228 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4231 /* optimize inverted simple patterns (e.g. [^a-z]) */
4232 if (!SIZE_ONLY && optimize_invert &&
4233 /* If the only flag is inversion. */
4234 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4235 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4236 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4237 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4244 /* The 0th element stores the character class description
4245 * in its textual form: used later (regexec.c:Perl_regclass_swash())
4246 * to initialize the appropriate swash (which gets stored in
4247 * the 1st element), and also useful for dumping the regnode.
4248 * The 2nd element stores the multicharacter foldings,
4249 * used later (regexec.c:S_reginclass()). */
4250 av_store(av, 0, listsv);
4251 av_store(av, 1, NULL);
4252 av_store(av, 2, (SV*)unicode_alternate);
4253 rv = newRV_noinc((SV*)av);
4254 n = add_data(pRExC_state, 1, "s");
4255 RExC_rx->data->data[n] = (void*)rv;
4263 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4265 char* retval = RExC_parse++;
4268 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4269 RExC_parse[2] == '#') {
4270 while (*RExC_parse && *RExC_parse != ')')
4275 if (RExC_flags & PMf_EXTENDED) {
4276 if (isSPACE(*RExC_parse)) {
4280 else if (*RExC_parse == '#') {
4281 while (*RExC_parse && *RExC_parse != '\n')
4292 - reg_node - emit a node
4294 STATIC regnode * /* Location. */
4295 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4297 register regnode *ret;
4298 register regnode *ptr;
4302 SIZE_ALIGN(RExC_size);
4307 NODE_ALIGN_FILL(ret);
4309 FILL_ADVANCE_NODE(ptr, op);
4310 if (RExC_offsets) { /* MJD */
4311 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4312 "reg_node", __LINE__,
4314 RExC_emit - RExC_emit_start > RExC_offsets[0]
4315 ? "Overwriting end of array!\n" : "OK",
4316 RExC_emit - RExC_emit_start,
4317 RExC_parse - RExC_start,
4319 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4328 - reganode - emit a node with an argument
4330 STATIC regnode * /* Location. */
4331 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4333 register regnode *ret;
4334 register regnode *ptr;
4338 SIZE_ALIGN(RExC_size);
4343 NODE_ALIGN_FILL(ret);
4345 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4346 if (RExC_offsets) { /* MJD */
4347 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4351 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4352 "Overwriting end of array!\n" : "OK",
4353 RExC_emit - RExC_emit_start,
4354 RExC_parse - RExC_start,
4356 Set_Cur_Node_Offset;
4365 - reguni - emit (if appropriate) a Unicode character
4368 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4370 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4374 - reginsert - insert an operator in front of already-emitted operand
4376 * Means relocating the operand.
4379 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4381 register regnode *src;
4382 register regnode *dst;
4383 register regnode *place;
4384 register int offset = regarglen[(U8)op];
4386 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4389 RExC_size += NODE_STEP_REGNODE + offset;
4394 RExC_emit += NODE_STEP_REGNODE + offset;
4396 while (src > opnd) {
4397 StructCopy(--src, --dst, regnode);
4398 if (RExC_offsets) { /* MJD 20010112 */
4399 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4403 dst - RExC_emit_start > RExC_offsets[0]
4404 ? "Overwriting end of array!\n" : "OK",
4405 src - RExC_emit_start,
4406 dst - RExC_emit_start,
4408 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4409 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4414 place = opnd; /* Op node, where operand used to be. */
4415 if (RExC_offsets) { /* MJD */
4416 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4420 place - RExC_emit_start > RExC_offsets[0]
4421 ? "Overwriting end of array!\n" : "OK",
4422 place - RExC_emit_start,
4423 RExC_parse - RExC_start,
4425 Set_Node_Offset(place, RExC_parse);
4427 src = NEXTOPER(place);
4428 FILL_ADVANCE_NODE(place, op);
4429 Zero(src, offset, regnode);
4433 - regtail - set the next-pointer at the end of a node chain of p to val.
4436 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4438 register regnode *scan;
4439 register regnode *temp;
4444 /* Find last node. */
4447 temp = regnext(scan);
4453 if (reg_off_by_arg[OP(scan)]) {
4454 ARG_SET(scan, val - scan);
4457 NEXT_OFF(scan) = val - scan;
4462 - regoptail - regtail on operand of first argument; nop if operandless
4465 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4467 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4468 if (p == NULL || SIZE_ONLY)
4470 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4471 regtail(pRExC_state, NEXTOPER(p), val);
4473 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4474 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4481 - regcurly - a little FSA that accepts {\d+,?\d*}
4484 S_regcurly(pTHX_ register char *s)
4505 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4507 register U8 op = EXACT; /* Arbitrary non-END op. */
4508 register regnode *next;
4510 while (op != END && (!last || node < last)) {
4511 /* While that wasn't END last time... */
4517 next = regnext(node);
4519 if (OP(node) == OPTIMIZED)
4522 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4523 (int)(2*l + 1), "", SvPVX(sv));
4524 if (next == NULL) /* Next ptr. */
4525 PerlIO_printf(Perl_debug_log, "(0)");
4527 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4528 (void)PerlIO_putc(Perl_debug_log, '\n');
4530 if (PL_regkind[(U8)op] == BRANCHJ) {
4531 register regnode *nnode = (OP(next) == LONGJMP
4534 if (last && nnode > last)
4536 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4538 else if (PL_regkind[(U8)op] == BRANCH) {
4539 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4541 else if ( op == CURLY) { /* `next' might be very big: optimizer */
4542 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4543 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4545 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4546 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4549 else if ( op == PLUS || op == STAR) {
4550 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4552 else if (op == ANYOF) {
4553 /* arglen 1 + class block */
4554 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4555 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4556 node = NEXTOPER(node);
4558 else if (PL_regkind[(U8)op] == EXACT) {
4559 /* Literal string, where present. */
4560 node += NODE_SZ_STR(node) - 1;
4561 node = NEXTOPER(node);
4564 node = NEXTOPER(node);
4565 node += regarglen[(U8)op];
4567 if (op == CURLYX || op == OPEN)
4569 else if (op == WHILEM)
4575 #endif /* DEBUGGING */
4578 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4581 Perl_regdump(pTHX_ regexp *r)
4584 SV *sv = sv_newmortal();
4586 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4588 /* Header fields of interest. */
4589 if (r->anchored_substr)
4590 PerlIO_printf(Perl_debug_log,
4591 "anchored `%s%.*s%s'%s at %"IVdf" ",
4593 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4594 SvPVX(r->anchored_substr),
4596 SvTAIL(r->anchored_substr) ? "$" : "",
4597 (IV)r->anchored_offset);
4598 else if (r->anchored_utf8)
4599 PerlIO_printf(Perl_debug_log,
4600 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4602 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4603 SvPVX(r->anchored_utf8),
4605 SvTAIL(r->anchored_utf8) ? "$" : "",
4606 (IV)r->anchored_offset);
4607 if (r->float_substr)
4608 PerlIO_printf(Perl_debug_log,
4609 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4611 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4612 SvPVX(r->float_substr),
4614 SvTAIL(r->float_substr) ? "$" : "",
4615 (IV)r->float_min_offset, (UV)r->float_max_offset);
4616 else if (r->float_utf8)
4617 PerlIO_printf(Perl_debug_log,
4618 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4620 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4621 SvPVX(r->float_utf8),
4623 SvTAIL(r->float_utf8) ? "$" : "",
4624 (IV)r->float_min_offset, (UV)r->float_max_offset);
4625 if (r->check_substr || r->check_utf8)
4626 PerlIO_printf(Perl_debug_log,
4627 r->check_substr == r->float_substr
4628 && r->check_utf8 == r->float_utf8
4629 ? "(checking floating" : "(checking anchored");
4630 if (r->reganch & ROPT_NOSCAN)
4631 PerlIO_printf(Perl_debug_log, " noscan");
4632 if (r->reganch & ROPT_CHECK_ALL)
4633 PerlIO_printf(Perl_debug_log, " isall");
4634 if (r->check_substr || r->check_utf8)
4635 PerlIO_printf(Perl_debug_log, ") ");
4637 if (r->regstclass) {
4638 regprop(sv, r->regstclass);
4639 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4641 if (r->reganch & ROPT_ANCH) {
4642 PerlIO_printf(Perl_debug_log, "anchored");
4643 if (r->reganch & ROPT_ANCH_BOL)
4644 PerlIO_printf(Perl_debug_log, "(BOL)");
4645 if (r->reganch & ROPT_ANCH_MBOL)
4646 PerlIO_printf(Perl_debug_log, "(MBOL)");
4647 if (r->reganch & ROPT_ANCH_SBOL)
4648 PerlIO_printf(Perl_debug_log, "(SBOL)");
4649 if (r->reganch & ROPT_ANCH_GPOS)
4650 PerlIO_printf(Perl_debug_log, "(GPOS)");
4651 PerlIO_putc(Perl_debug_log, ' ');
4653 if (r->reganch & ROPT_GPOS_SEEN)
4654 PerlIO_printf(Perl_debug_log, "GPOS ");
4655 if (r->reganch & ROPT_SKIP)
4656 PerlIO_printf(Perl_debug_log, "plus ");
4657 if (r->reganch & ROPT_IMPLICIT)
4658 PerlIO_printf(Perl_debug_log, "implicit ");
4659 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4660 if (r->reganch & ROPT_EVAL_SEEN)
4661 PerlIO_printf(Perl_debug_log, "with eval ");
4662 PerlIO_printf(Perl_debug_log, "\n");
4665 U32 len = r->offsets[0];
4666 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4667 for (i = 1; i <= len; i++)
4668 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4669 (UV)r->offsets[i*2-1],
4670 (UV)r->offsets[i*2]);
4671 PerlIO_printf(Perl_debug_log, "\n");
4673 #endif /* DEBUGGING */
4679 S_put_byte(pTHX_ SV *sv, int c)
4681 if (isCNTRL(c) || c == 255 || !isPRINT(c))
4682 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4683 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4684 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4686 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4689 #endif /* DEBUGGING */
4692 - regprop - printable representation of opcode
4695 Perl_regprop(pTHX_ SV *sv, regnode *o)
4700 sv_setpvn(sv, "", 0);
4701 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4702 /* It would be nice to FAIL() here, but this may be called from
4703 regexec.c, and it would be hard to supply pRExC_state. */
4704 Perl_croak(aTHX_ "Corrupted regexp opcode");
4705 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4707 k = PL_regkind[(U8)OP(o)];
4710 SV *dsv = sv_2mortal(newSVpvn("", 0));
4711 /* Using is_utf8_string() is a crude hack but it may
4712 * be the best for now since we have no flag "this EXACTish
4713 * node was UTF-8" --jhi */
4714 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4716 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4717 UNI_DISPLAY_REGEX) :
4722 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4727 else if (k == CURLY) {
4728 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4729 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4730 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4732 else if (k == WHILEM && o->flags) /* Ordinal/of */
4733 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4734 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4735 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4736 else if (k == LOGICAL)
4737 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4738 else if (k == ANYOF) {
4739 int i, rangestart = -1;
4740 U8 flags = ANYOF_FLAGS(o);
4741 const char * const anyofs[] = { /* Should be synchronized with
4742 * ANYOF_ #xdefines in regcomp.h */
4775 if (flags & ANYOF_LOCALE)
4776 sv_catpv(sv, "{loc}");
4777 if (flags & ANYOF_FOLD)
4778 sv_catpv(sv, "{i}");
4779 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4780 if (flags & ANYOF_INVERT)
4782 for (i = 0; i <= 256; i++) {
4783 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4784 if (rangestart == -1)
4786 } else if (rangestart != -1) {
4787 if (i <= rangestart + 3)
4788 for (; rangestart < i; rangestart++)
4789 put_byte(sv, rangestart);
4791 put_byte(sv, rangestart);
4793 put_byte(sv, i - 1);
4799 if (o->flags & ANYOF_CLASS)
4800 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4801 if (ANYOF_CLASS_TEST(o,i))
4802 sv_catpv(sv, anyofs[i]);
4804 if (flags & ANYOF_UNICODE)
4805 sv_catpv(sv, "{unicode}");
4806 else if (flags & ANYOF_UNICODE_ALL)
4807 sv_catpv(sv, "{unicode_all}");
4811 SV *sw = regclass_swash(o, FALSE, &lv, 0);
4815 U8 s[UTF8_MAXLEN+1];
4817 for (i = 0; i <= 256; i++) { /* just the first 256 */
4818 U8 *e = uvchr_to_utf8(s, i);
4820 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4821 if (rangestart == -1)
4823 } else if (rangestart != -1) {
4826 if (i <= rangestart + 3)
4827 for (; rangestart < i; rangestart++) {
4828 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4832 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4835 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4842 sv_catpv(sv, "..."); /* et cetera */
4846 char *s = savepv(SvPVX(lv));
4849 while(*s && *s != '\n') s++;
4870 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4872 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4873 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4874 #endif /* DEBUGGING */
4878 Perl_re_intuit_string(pTHX_ regexp *prog)
4879 { /* Assume that RE_INTUIT is set */
4882 char *s = SvPV(prog->check_substr
4883 ? prog->check_substr : prog->check_utf8, n_a);
4885 if (!PL_colorset) reginitcolors();
4886 PerlIO_printf(Perl_debug_log,
4887 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4889 prog->check_substr ? "" : "utf8 ",
4890 PL_colors[5],PL_colors[0],
4893 (strlen(s) > 60 ? "..." : ""));
4896 return prog->check_substr ? prog->check_substr : prog->check_utf8;
4900 Perl_pregfree(pTHX_ struct regexp *r)
4903 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4906 if (!r || (--r->refcnt > 0))
4912 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4913 r->prelen, 60, UNI_DISPLAY_REGEX)
4914 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4918 PerlIO_printf(Perl_debug_log,
4919 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4920 PL_colors[4],PL_colors[5],PL_colors[0],
4923 len > 60 ? "..." : "");
4927 Safefree(r->precomp);
4928 if (r->offsets) /* 20010421 MJD */
4929 Safefree(r->offsets);
4930 RX_MATCH_COPY_FREE(r);
4931 #ifdef PERL_COPY_ON_WRITE
4933 SvREFCNT_dec(r->saved_copy);
4936 if (r->anchored_substr)
4937 SvREFCNT_dec(r->anchored_substr);
4938 if (r->anchored_utf8)
4939 SvREFCNT_dec(r->anchored_utf8);
4940 if (r->float_substr)
4941 SvREFCNT_dec(r->float_substr);
4943 SvREFCNT_dec(r->float_utf8);
4944 Safefree(r->substrs);
4947 int n = r->data->count;
4948 PAD* new_comppad = NULL;
4952 /* If you add a ->what type here, update the comment in regcomp.h */
4953 switch (r->data->what[n]) {
4955 SvREFCNT_dec((SV*)r->data->data[n]);
4958 Safefree(r->data->data[n]);
4961 new_comppad = (AV*)r->data->data[n];
4964 if (new_comppad == NULL)
4965 Perl_croak(aTHX_ "panic: pregfree comppad");
4966 PAD_SAVE_LOCAL(old_comppad,
4967 /* Watch out for global destruction's random ordering. */
4968 (SvTYPE(new_comppad) == SVt_PVAV) ?
4969 new_comppad : Null(PAD *)
4971 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4972 op_free((OP_4tree*)r->data->data[n]);
4975 PAD_RESTORE_LOCAL(old_comppad);
4976 SvREFCNT_dec((SV*)new_comppad);
4982 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4985 Safefree(r->data->what);
4988 Safefree(r->startp);
4994 - regnext - dig the "next" pointer out of a node
4996 * [Note, when REGALIGN is defined there are two places in regmatch()
4997 * that bypass this code for speed.]
5000 Perl_regnext(pTHX_ register regnode *p)
5002 register I32 offset;
5004 if (p == &PL_regdummy)
5007 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5015 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
5018 STRLEN l1 = strlen(pat1);
5019 STRLEN l2 = strlen(pat2);
5028 Copy(pat1, buf, l1 , char);
5029 Copy(pat2, buf + l1, l2 , char);
5030 buf[l1 + l2] = '\n';
5031 buf[l1 + l2 + 1] = '\0';
5033 /* ANSI variant takes additional second argument */
5034 va_start(args, pat2);
5038 msv = vmess(buf, &args);
5040 message = SvPV(msv,l1);
5043 Copy(message, buf, l1 , char);
5044 buf[l1] = '\0'; /* Overwrite \n */
5045 Perl_croak(aTHX_ "%s", buf);
5048 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5051 Perl_save_re_context(pTHX)
5053 SAVEI32(PL_reg_flags); /* from regexec.c */
5055 SAVEPPTR(PL_reginput); /* String-input pointer. */
5056 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5057 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
5058 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5059 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5060 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
5061 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
5062 SAVEPPTR(PL_regtill); /* How far we are required to go. */
5063 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
5064 PL_reg_start_tmp = 0;
5065 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5066 PL_reg_start_tmpl = 0;
5067 SAVEVPTR(PL_regdata);
5068 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5069 SAVEI32(PL_regnarrate); /* from regexec.c */
5070 SAVEVPTR(PL_regprogram); /* from regexec.c */
5071 SAVEINT(PL_regindent); /* from regexec.c */
5072 SAVEVPTR(PL_regcc); /* from regexec.c */
5073 SAVEVPTR(PL_curcop);
5074 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5075 SAVEVPTR(PL_reg_re); /* from regexec.c */
5076 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5077 SAVESPTR(PL_reg_sv); /* from regexec.c */
5078 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
5079 SAVEVPTR(PL_reg_magic); /* from regexec.c */
5080 SAVEI32(PL_reg_oldpos); /* from regexec.c */
5081 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5082 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5083 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
5084 PL_reg_oldsaved = Nullch;
5085 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
5086 PL_reg_oldsavedlen = 0;
5087 #ifdef PERL_COPY_ON_WRITE
5091 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
5093 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
5094 PL_reg_leftiter = 0;
5095 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
5096 PL_reg_poscache = Nullch;
5097 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
5098 PL_reg_poscache_size = 0;
5099 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5100 SAVEI32(PL_regnpar); /* () count. */
5101 SAVEI32(PL_regsize); /* from regexec.c */
5104 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5110 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5111 for (i = 1; i <= rx->nparens; i++) {
5112 sprintf(digits, "%lu", (long)i);
5113 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5120 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
5125 clear_re(pTHX_ void *r)
5127 ReREFCNT_dec((regexp *)r);