5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_pregcomp my_regcomp
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_pregfree my_regfree
39 # define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 # define Perl_regnext my_regnext
42 # define Perl_save_re_context my_save_re_context
43 # define Perl_reginitcolors my_reginitcolors
45 # define PERL_NO_GET_CONTEXT
50 * pregcomp and pregexec -- regsub and regerror are not used in perl
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73 **** 2000, 2001, 2002, 2003, by Larry Wall and others
75 **** You may distribute under the terms of either the GNU General Public
76 **** License or the Artistic License, as specified in the README file.
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
84 #define PERL_IN_REGCOMP_C
87 #ifndef PERL_IN_XSUB_RE
99 # if defined(BUGGY_MSC6)
100 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
101 # pragma optimize("a",off)
102 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
103 # pragma optimize("w",on )
104 # endif /* BUGGY_MSC6 */
108 #define STATIC static
111 typedef struct RExC_state_t {
112 U32 flags; /* are we folding, multilining? */
113 char *precomp; /* uncompiled string. */
115 char *start; /* Start of input for compile */
116 char *end; /* End of input for compile */
117 char *parse; /* Input-scan pointer. */
118 I32 whilem_seen; /* number of WHILEM in this expr */
119 regnode *emit_start; /* Start of emitted-code area */
120 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
121 I32 naughty; /* How bad is this pattern? */
122 I32 sawback; /* Did we see \1, ...? */
124 I32 size; /* Code size. */
125 I32 npar; /* () count. */
131 char *starttry; /* -Dr: where regtry was called. */
132 #define RExC_starttry (pRExC_state->starttry)
136 #define RExC_flags (pRExC_state->flags)
137 #define RExC_precomp (pRExC_state->precomp)
138 #define RExC_rx (pRExC_state->rx)
139 #define RExC_start (pRExC_state->start)
140 #define RExC_end (pRExC_state->end)
141 #define RExC_parse (pRExC_state->parse)
142 #define RExC_whilem_seen (pRExC_state->whilem_seen)
143 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
144 #define RExC_emit (pRExC_state->emit)
145 #define RExC_emit_start (pRExC_state->emit_start)
146 #define RExC_naughty (pRExC_state->naughty)
147 #define RExC_sawback (pRExC_state->sawback)
148 #define RExC_seen (pRExC_state->seen)
149 #define RExC_size (pRExC_state->size)
150 #define RExC_npar (pRExC_state->npar)
151 #define RExC_extralen (pRExC_state->extralen)
152 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
153 #define RExC_seen_evals (pRExC_state->seen_evals)
154 #define RExC_utf8 (pRExC_state->utf8)
156 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
157 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
158 ((*s) == '{' && regcurly(s)))
161 #undef SPSTART /* dratted cpp namespace... */
164 * Flags to be passed up and down.
166 #define WORST 0 /* Worst case. */
167 #define HASWIDTH 0x1 /* Known to match non-null strings. */
168 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
169 #define SPSTART 0x4 /* Starts with * or +. */
170 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
172 /* Length of a variant. */
174 typedef struct scan_data_t {
180 I32 last_end; /* min value, <0 unless valid. */
183 SV **longest; /* Either &l_fixed, or &l_float. */
187 I32 offset_float_min;
188 I32 offset_float_max;
192 struct regnode_charclass_class *start_class;
196 * Forward declarations for pregcomp()'s friends.
199 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
202 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
203 #define SF_BEFORE_SEOL 0x1
204 #define SF_BEFORE_MEOL 0x2
205 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
206 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
209 # define SF_FIX_SHIFT_EOL (0+2)
210 # define SF_FL_SHIFT_EOL (0+4)
212 # define SF_FIX_SHIFT_EOL (+2)
213 # define SF_FL_SHIFT_EOL (+4)
216 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
217 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
219 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
220 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
221 #define SF_IS_INF 0x40
222 #define SF_HAS_PAR 0x80
223 #define SF_IN_PAR 0x100
224 #define SF_HAS_EVAL 0x200
225 #define SCF_DO_SUBSTR 0x400
226 #define SCF_DO_STCLASS_AND 0x0800
227 #define SCF_DO_STCLASS_OR 0x1000
228 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
229 #define SCF_WHILEM_VISITED_POS 0x2000
231 #define UTF (RExC_utf8 != 0)
232 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
233 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
235 #define OOB_UNICODE 12345678
236 #define OOB_NAMEDCLASS -1
238 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
239 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
242 /* length of regex to show in messages that don't mark a position within */
243 #define RegexLengthToShowInErrorMessages 127
246 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
247 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
248 * op/pragma/warn/regcomp.
250 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
251 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
253 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
256 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
257 * arg. Show regex, up to a maximum length. If it's too long, chop and add
260 #define FAIL(msg) STMT_START { \
261 char *ellipses = ""; \
262 IV len = RExC_end - RExC_precomp; \
265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
266 if (len > RegexLengthToShowInErrorMessages) { \
267 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
268 len = RegexLengthToShowInErrorMessages - 10; \
271 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
272 msg, (int)len, RExC_precomp, ellipses); \
276 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
277 * args. Show regex, up to a maximum length. If it's too long, chop and add
280 #define FAIL2(pat,msg) STMT_START { \
281 char *ellipses = ""; \
282 IV len = RExC_end - RExC_precomp; \
285 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
286 if (len > RegexLengthToShowInErrorMessages) { \
287 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
288 len = RegexLengthToShowInErrorMessages - 10; \
291 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
292 msg, (int)len, RExC_precomp, ellipses); \
297 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
299 #define Simple_vFAIL(m) STMT_START { \
300 IV offset = RExC_parse - RExC_precomp; \
301 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
302 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
306 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
308 #define vFAIL(m) STMT_START { \
310 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
315 * Like Simple_vFAIL(), but accepts two arguments.
317 #define Simple_vFAIL2(m,a1) STMT_START { \
318 IV offset = RExC_parse - RExC_precomp; \
319 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
320 (int)offset, RExC_precomp, RExC_precomp + offset); \
324 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
326 #define vFAIL2(m,a1) STMT_START { \
328 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
329 Simple_vFAIL2(m, a1); \
334 * Like Simple_vFAIL(), but accepts three arguments.
336 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
337 IV offset = RExC_parse - RExC_precomp; \
338 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
339 (int)offset, RExC_precomp, RExC_precomp + offset); \
343 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
345 #define vFAIL3(m,a1,a2) STMT_START { \
347 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
348 Simple_vFAIL3(m, a1, a2); \
352 * Like Simple_vFAIL(), but accepts four arguments.
354 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
355 IV offset = RExC_parse - RExC_precomp; \
356 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
357 (int)offset, RExC_precomp, RExC_precomp + offset); \
361 * Like Simple_vFAIL(), but accepts five arguments.
363 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
364 IV offset = RExC_parse - RExC_precomp; \
365 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
366 (int)offset, RExC_precomp, RExC_precomp + offset); \
370 #define vWARN(loc,m) STMT_START { \
371 IV offset = loc - RExC_precomp; \
372 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
373 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
376 #define vWARNdep(loc,m) STMT_START { \
377 IV offset = loc - RExC_precomp; \
378 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
379 "%s" REPORT_LOCATION, \
380 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
384 #define vWARN2(loc, m, a1) STMT_START { \
385 IV offset = loc - RExC_precomp; \
386 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
387 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
390 #define vWARN3(loc, m, a1, a2) STMT_START { \
391 IV offset = loc - RExC_precomp; \
392 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
393 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
396 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
397 IV offset = loc - RExC_precomp; \
398 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
399 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
402 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
403 IV offset = loc - RExC_precomp; \
404 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
405 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
409 /* Allow for side effects in s */
410 #define REGC(c,s) STMT_START { \
411 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
414 /* Macros for recording node offsets. 20001227 mjd@plover.com
415 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
416 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
417 * Element 0 holds the number n.
420 #define MJD_OFFSET_DEBUG(x)
421 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
424 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
426 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
427 __LINE__, (node), (byte))); \
429 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
431 RExC_offsets[2*(node)-1] = (byte); \
436 #define Set_Node_Offset(node,byte) \
437 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
438 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
440 #define Set_Node_Length_To_R(node,len) STMT_START { \
442 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
443 __LINE__, (node), (len))); \
445 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
447 RExC_offsets[2*(node)] = (len); \
452 #define Set_Node_Length(node,len) \
453 Set_Node_Length_To_R((node)-RExC_emit_start, len)
454 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
455 #define Set_Node_Cur_Length(node) \
456 Set_Node_Length(node, RExC_parse - parse_start)
458 /* Get offsets and lengths */
459 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
460 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
462 static void clear_re(pTHX_ void *r);
464 /* Mark that we cannot extend a found fixed substring at this point.
465 Updata the longest found anchored substring and the longest found
466 floating substrings if needed. */
469 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
471 STRLEN l = CHR_SVLEN(data->last_found);
472 STRLEN old_l = CHR_SVLEN(*data->longest);
474 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
475 SvSetMagicSV(*data->longest, data->last_found);
476 if (*data->longest == data->longest_fixed) {
477 data->offset_fixed = l ? data->last_start_min : data->pos_min;
478 if (data->flags & SF_BEFORE_EOL)
480 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
482 data->flags &= ~SF_FIX_BEFORE_EOL;
485 data->offset_float_min = l ? data->last_start_min : data->pos_min;
486 data->offset_float_max = (l
487 ? data->last_start_max
488 : data->pos_min + data->pos_delta);
489 if ((U32)data->offset_float_max > (U32)I32_MAX)
490 data->offset_float_max = I32_MAX;
491 if (data->flags & SF_BEFORE_EOL)
493 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
495 data->flags &= ~SF_FL_BEFORE_EOL;
498 SvCUR_set(data->last_found, 0);
500 SV * sv = data->last_found;
502 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
503 if (mg && mg->mg_len > 0)
507 data->flags &= ~SF_BEFORE_EOL;
510 /* Can match anything (initialization) */
512 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
514 ANYOF_CLASS_ZERO(cl);
515 ANYOF_BITMAP_SETALL(cl);
516 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
518 cl->flags |= ANYOF_LOCALE;
521 /* Can match anything (initialization) */
523 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
527 for (value = 0; value <= ANYOF_MAX; value += 2)
528 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
530 if (!(cl->flags & ANYOF_UNICODE_ALL))
532 if (!ANYOF_BITMAP_TESTALLSET(cl))
537 /* Can match anything (initialization) */
539 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
541 Zero(cl, 1, struct regnode_charclass_class);
543 cl_anything(pRExC_state, cl);
547 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
549 Zero(cl, 1, struct regnode_charclass_class);
551 cl_anything(pRExC_state, cl);
553 cl->flags |= ANYOF_LOCALE;
556 /* 'And' a given class with another one. Can create false positives */
557 /* We assume that cl is not inverted */
559 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
560 struct regnode_charclass_class *and_with)
562 if (!(and_with->flags & ANYOF_CLASS)
563 && !(cl->flags & ANYOF_CLASS)
564 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
565 && !(and_with->flags & ANYOF_FOLD)
566 && !(cl->flags & ANYOF_FOLD)) {
569 if (and_with->flags & ANYOF_INVERT)
570 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
571 cl->bitmap[i] &= ~and_with->bitmap[i];
573 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
574 cl->bitmap[i] &= and_with->bitmap[i];
575 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
576 if (!(and_with->flags & ANYOF_EOS))
577 cl->flags &= ~ANYOF_EOS;
579 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
580 !(and_with->flags & ANYOF_INVERT)) {
581 cl->flags &= ~ANYOF_UNICODE_ALL;
582 cl->flags |= ANYOF_UNICODE;
583 ARG_SET(cl, ARG(and_with));
585 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
586 !(and_with->flags & ANYOF_INVERT))
587 cl->flags &= ~ANYOF_UNICODE_ALL;
588 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
589 !(and_with->flags & ANYOF_INVERT))
590 cl->flags &= ~ANYOF_UNICODE;
593 /* 'OR' a given class with another one. Can create false positives */
594 /* We assume that cl is not inverted */
596 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
598 if (or_with->flags & ANYOF_INVERT) {
600 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
601 * <= (B1 | !B2) | (CL1 | !CL2)
602 * which is wasteful if CL2 is small, but we ignore CL2:
603 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
604 * XXXX Can we handle case-fold? Unclear:
605 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
606 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
608 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
609 && !(or_with->flags & ANYOF_FOLD)
610 && !(cl->flags & ANYOF_FOLD) ) {
613 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
614 cl->bitmap[i] |= ~or_with->bitmap[i];
615 } /* XXXX: logic is complicated otherwise */
617 cl_anything(pRExC_state, cl);
620 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
621 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
622 && (!(or_with->flags & ANYOF_FOLD)
623 || (cl->flags & ANYOF_FOLD)) ) {
626 /* OR char bitmap and class bitmap separately */
627 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
628 cl->bitmap[i] |= or_with->bitmap[i];
629 if (or_with->flags & ANYOF_CLASS) {
630 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
631 cl->classflags[i] |= or_with->classflags[i];
632 cl->flags |= ANYOF_CLASS;
635 else { /* XXXX: logic is complicated, leave it along for a moment. */
636 cl_anything(pRExC_state, cl);
639 if (or_with->flags & ANYOF_EOS)
640 cl->flags |= ANYOF_EOS;
642 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
643 ARG(cl) != ARG(or_with)) {
644 cl->flags |= ANYOF_UNICODE_ALL;
645 cl->flags &= ~ANYOF_UNICODE;
647 if (or_with->flags & ANYOF_UNICODE_ALL) {
648 cl->flags |= ANYOF_UNICODE_ALL;
649 cl->flags &= ~ANYOF_UNICODE;
654 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
655 * These need to be revisited when a newer toolchain becomes available.
657 #if defined(__sparc64__) && defined(__GNUC__)
658 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
659 # undef SPARC64_GCC_WORKAROUND
660 # define SPARC64_GCC_WORKAROUND 1
664 /* REx optimizer. Converts nodes into quickier variants "in place".
665 Finds fixed substrings. */
667 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
668 to the position after last scanned or to NULL. */
671 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
672 /* scanp: Start here (read-write). */
673 /* deltap: Write maxlen-minlen here. */
674 /* last: Stop before this one. */
676 I32 min = 0, pars = 0, code;
677 regnode *scan = *scanp, *next;
679 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
680 int is_inf_internal = 0; /* The studied chunk is infinite */
681 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
682 scan_data_t data_fake;
683 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
685 while (scan && OP(scan) != END && scan < last) {
686 /* Peephole optimizer: */
688 if (PL_regkind[(U8)OP(scan)] == EXACT) {
689 /* Merge several consecutive EXACTish nodes into one. */
690 regnode *n = regnext(scan);
693 regnode *stop = scan;
696 next = scan + NODE_SZ_STR(scan);
697 /* Skip NOTHING, merge EXACT*. */
699 ( PL_regkind[(U8)OP(n)] == NOTHING ||
700 (stringok && (OP(n) == OP(scan))))
702 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
703 if (OP(n) == TAIL || n > next)
705 if (PL_regkind[(U8)OP(n)] == NOTHING) {
706 NEXT_OFF(scan) += NEXT_OFF(n);
707 next = n + NODE_STEP_REGNODE;
715 int oldl = STR_LEN(scan);
716 regnode *nnext = regnext(n);
718 if (oldl + STR_LEN(n) > U8_MAX)
720 NEXT_OFF(scan) += NEXT_OFF(n);
721 STR_LEN(scan) += STR_LEN(n);
722 next = n + NODE_SZ_STR(n);
723 /* Now we can overwrite *n : */
724 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
732 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
734 Two problematic code points in Unicode casefolding of EXACT nodes:
736 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
737 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
743 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
744 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
746 This means that in case-insensitive matching (or "loose matching",
747 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
748 length of the above casefolded versions) can match a target string
749 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
750 This would rather mess up the minimum length computation.
752 What we'll do is to look for the tail four bytes, and then peek
753 at the preceding two bytes to see whether we need to decrease
754 the minimum length by four (six minus two).
756 Thanks to the design of UTF-8, there cannot be false matches:
757 A sequence of valid UTF-8 bytes cannot be a subsequence of
758 another valid sequence of UTF-8 bytes.
761 char *s0 = STRING(scan), *s, *t;
762 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
763 char *t0 = "\xcc\x88\xcc\x81";
767 s < s2 && (t = ninstr(s, s1, t0, t1));
769 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
770 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
777 n = scan + NODE_SZ_STR(scan);
779 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
787 /* Follow the next-chain of the current node and optimize
788 away all the NOTHINGs from it. */
789 if (OP(scan) != CURLYX) {
790 int max = (reg_off_by_arg[OP(scan)]
792 /* I32 may be smaller than U16 on CRAYs! */
793 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
794 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
798 /* Skip NOTHING and LONGJMP. */
799 while ((n = regnext(n))
800 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
801 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
804 if (reg_off_by_arg[OP(scan)])
807 NEXT_OFF(scan) = off;
809 /* The principal pseudo-switch. Cannot be a switch, since we
810 look into several different things. */
811 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
812 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
813 next = regnext(scan);
816 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
817 I32 max1 = 0, min1 = I32_MAX, num = 0;
818 struct regnode_charclass_class accum;
820 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
821 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
822 if (flags & SCF_DO_STCLASS)
823 cl_init_zero(pRExC_state, &accum);
824 while (OP(scan) == code) {
825 I32 deltanext, minnext, f = 0, fake;
826 struct regnode_charclass_class this_class;
831 data_fake.whilem_c = data->whilem_c;
832 data_fake.last_closep = data->last_closep;
835 data_fake.last_closep = &fake;
836 next = regnext(scan);
837 scan = NEXTOPER(scan);
839 scan = NEXTOPER(scan);
840 if (flags & SCF_DO_STCLASS) {
841 cl_init(pRExC_state, &this_class);
842 data_fake.start_class = &this_class;
843 f = SCF_DO_STCLASS_AND;
845 if (flags & SCF_WHILEM_VISITED_POS)
846 f |= SCF_WHILEM_VISITED_POS;
847 /* we suppose the run is continuous, last=next...*/
848 minnext = study_chunk(pRExC_state, &scan, &deltanext,
849 next, &data_fake, f);
852 if (max1 < minnext + deltanext)
853 max1 = minnext + deltanext;
854 if (deltanext == I32_MAX)
855 is_inf = is_inf_internal = 1;
857 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
859 if (data && (data_fake.flags & SF_HAS_EVAL))
860 data->flags |= SF_HAS_EVAL;
862 data->whilem_c = data_fake.whilem_c;
863 if (flags & SCF_DO_STCLASS)
864 cl_or(pRExC_state, &accum, &this_class);
868 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
870 if (flags & SCF_DO_SUBSTR) {
871 data->pos_min += min1;
872 data->pos_delta += max1 - min1;
873 if (max1 != min1 || is_inf)
874 data->longest = &(data->longest_float);
877 delta += max1 - min1;
878 if (flags & SCF_DO_STCLASS_OR) {
879 cl_or(pRExC_state, data->start_class, &accum);
881 cl_and(data->start_class, &and_with);
882 flags &= ~SCF_DO_STCLASS;
885 else if (flags & SCF_DO_STCLASS_AND) {
887 cl_and(data->start_class, &accum);
888 flags &= ~SCF_DO_STCLASS;
891 /* Switch to OR mode: cache the old value of
892 * data->start_class */
893 StructCopy(data->start_class, &and_with,
894 struct regnode_charclass_class);
895 flags &= ~SCF_DO_STCLASS_AND;
896 StructCopy(&accum, data->start_class,
897 struct regnode_charclass_class);
898 flags |= SCF_DO_STCLASS_OR;
899 data->start_class->flags |= ANYOF_EOS;
903 else if (code == BRANCHJ) /* single branch is optimized. */
904 scan = NEXTOPER(NEXTOPER(scan));
905 else /* single branch is optimized. */
906 scan = NEXTOPER(scan);
909 else if (OP(scan) == EXACT) {
910 I32 l = STR_LEN(scan);
911 UV uc = *((U8*)STRING(scan));
913 U8 *s = (U8*)STRING(scan);
914 l = utf8_length(s, s + l);
915 uc = utf8_to_uvchr(s, NULL);
918 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
919 /* The code below prefers earlier match for fixed
920 offset, later match for variable offset. */
921 if (data->last_end == -1) { /* Update the start info. */
922 data->last_start_min = data->pos_min;
923 data->last_start_max = is_inf
924 ? I32_MAX : data->pos_min + data->pos_delta;
926 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
928 SV * sv = data->last_found;
929 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
930 mg_find(sv, PERL_MAGIC_utf8) : NULL;
931 if (mg && mg->mg_len >= 0)
932 mg->mg_len += utf8_length((U8*)STRING(scan),
933 (U8*)STRING(scan)+STR_LEN(scan));
936 SvUTF8_on(data->last_found);
937 data->last_end = data->pos_min + l;
938 data->pos_min += l; /* As in the first entry. */
939 data->flags &= ~SF_BEFORE_EOL;
941 if (flags & SCF_DO_STCLASS_AND) {
942 /* Check whether it is compatible with what we know already! */
946 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
947 && !ANYOF_BITMAP_TEST(data->start_class, uc)
948 && (!(data->start_class->flags & ANYOF_FOLD)
949 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
952 ANYOF_CLASS_ZERO(data->start_class);
953 ANYOF_BITMAP_ZERO(data->start_class);
955 ANYOF_BITMAP_SET(data->start_class, uc);
956 data->start_class->flags &= ~ANYOF_EOS;
958 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
960 else if (flags & SCF_DO_STCLASS_OR) {
961 /* false positive possible if the class is case-folded */
963 ANYOF_BITMAP_SET(data->start_class, uc);
965 data->start_class->flags |= ANYOF_UNICODE_ALL;
966 data->start_class->flags &= ~ANYOF_EOS;
967 cl_and(data->start_class, &and_with);
969 flags &= ~SCF_DO_STCLASS;
971 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
972 I32 l = STR_LEN(scan);
973 UV uc = *((U8*)STRING(scan));
975 /* Search for fixed substrings supports EXACT only. */
976 if (flags & SCF_DO_SUBSTR)
977 scan_commit(pRExC_state, data);
979 U8 *s = (U8 *)STRING(scan);
980 l = utf8_length(s, s + l);
981 uc = utf8_to_uvchr(s, NULL);
984 if (data && (flags & SCF_DO_SUBSTR))
986 if (flags & SCF_DO_STCLASS_AND) {
987 /* Check whether it is compatible with what we know already! */
991 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
992 && !ANYOF_BITMAP_TEST(data->start_class, uc)
993 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
995 ANYOF_CLASS_ZERO(data->start_class);
996 ANYOF_BITMAP_ZERO(data->start_class);
998 ANYOF_BITMAP_SET(data->start_class, uc);
999 data->start_class->flags &= ~ANYOF_EOS;
1000 data->start_class->flags |= ANYOF_FOLD;
1001 if (OP(scan) == EXACTFL)
1002 data->start_class->flags |= ANYOF_LOCALE;
1005 else if (flags & SCF_DO_STCLASS_OR) {
1006 if (data->start_class->flags & ANYOF_FOLD) {
1007 /* false positive possible if the class is case-folded.
1008 Assume that the locale settings are the same... */
1010 ANYOF_BITMAP_SET(data->start_class, uc);
1011 data->start_class->flags &= ~ANYOF_EOS;
1013 cl_and(data->start_class, &and_with);
1015 flags &= ~SCF_DO_STCLASS;
1017 else if (strchr((char*)PL_varies,OP(scan))) {
1018 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1019 I32 f = flags, pos_before = 0;
1020 regnode *oscan = scan;
1021 struct regnode_charclass_class this_class;
1022 struct regnode_charclass_class *oclass = NULL;
1023 I32 next_is_eval = 0;
1025 switch (PL_regkind[(U8)OP(scan)]) {
1026 case WHILEM: /* End of (?:...)* . */
1027 scan = NEXTOPER(scan);
1030 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1031 next = NEXTOPER(scan);
1032 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1034 maxcount = REG_INFTY;
1035 next = regnext(scan);
1036 scan = NEXTOPER(scan);
1040 if (flags & SCF_DO_SUBSTR)
1045 if (flags & SCF_DO_STCLASS) {
1047 maxcount = REG_INFTY;
1048 next = regnext(scan);
1049 scan = NEXTOPER(scan);
1052 is_inf = is_inf_internal = 1;
1053 scan = regnext(scan);
1054 if (flags & SCF_DO_SUBSTR) {
1055 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1056 data->longest = &(data->longest_float);
1058 goto optimize_curly_tail;
1060 mincount = ARG1(scan);
1061 maxcount = ARG2(scan);
1062 next = regnext(scan);
1063 if (OP(scan) == CURLYX) {
1064 I32 lp = (data ? *(data->last_closep) : 0);
1066 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1068 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1069 next_is_eval = (OP(scan) == EVAL);
1071 if (flags & SCF_DO_SUBSTR) {
1072 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1073 pos_before = data->pos_min;
1077 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1079 data->flags |= SF_IS_INF;
1081 if (flags & SCF_DO_STCLASS) {
1082 cl_init(pRExC_state, &this_class);
1083 oclass = data->start_class;
1084 data->start_class = &this_class;
1085 f |= SCF_DO_STCLASS_AND;
1086 f &= ~SCF_DO_STCLASS_OR;
1088 /* These are the cases when once a subexpression
1089 fails at a particular position, it cannot succeed
1090 even after backtracking at the enclosing scope.
1092 XXXX what if minimal match and we are at the
1093 initial run of {n,m}? */
1094 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1095 f &= ~SCF_WHILEM_VISITED_POS;
1097 /* This will finish on WHILEM, setting scan, or on NULL: */
1098 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1100 ? (f & ~SCF_DO_SUBSTR) : f);
1102 if (flags & SCF_DO_STCLASS)
1103 data->start_class = oclass;
1104 if (mincount == 0 || minnext == 0) {
1105 if (flags & SCF_DO_STCLASS_OR) {
1106 cl_or(pRExC_state, data->start_class, &this_class);
1108 else if (flags & SCF_DO_STCLASS_AND) {
1109 /* Switch to OR mode: cache the old value of
1110 * data->start_class */
1111 StructCopy(data->start_class, &and_with,
1112 struct regnode_charclass_class);
1113 flags &= ~SCF_DO_STCLASS_AND;
1114 StructCopy(&this_class, data->start_class,
1115 struct regnode_charclass_class);
1116 flags |= SCF_DO_STCLASS_OR;
1117 data->start_class->flags |= ANYOF_EOS;
1119 } else { /* Non-zero len */
1120 if (flags & SCF_DO_STCLASS_OR) {
1121 cl_or(pRExC_state, data->start_class, &this_class);
1122 cl_and(data->start_class, &and_with);
1124 else if (flags & SCF_DO_STCLASS_AND)
1125 cl_and(data->start_class, &this_class);
1126 flags &= ~SCF_DO_STCLASS;
1128 if (!scan) /* It was not CURLYX, but CURLY. */
1130 if (ckWARN(WARN_REGEXP)
1131 /* ? quantifier ok, except for (?{ ... }) */
1132 && (next_is_eval || !(mincount == 0 && maxcount == 1))
1133 && (minnext == 0) && (deltanext == 0)
1134 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1135 && maxcount <= REG_INFTY/3) /* Complement check for big count */
1138 "Quantifier unexpected on zero-length expression");
1141 min += minnext * mincount;
1142 is_inf_internal |= ((maxcount == REG_INFTY
1143 && (minnext + deltanext) > 0)
1144 || deltanext == I32_MAX);
1145 is_inf |= is_inf_internal;
1146 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1148 /* Try powerful optimization CURLYX => CURLYN. */
1149 if ( OP(oscan) == CURLYX && data
1150 && data->flags & SF_IN_PAR
1151 && !(data->flags & SF_HAS_EVAL)
1152 && !deltanext && minnext == 1 ) {
1153 /* Try to optimize to CURLYN. */
1154 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1155 regnode *nxt1 = nxt;
1162 if (!strchr((char*)PL_simple,OP(nxt))
1163 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1164 && STR_LEN(nxt) == 1))
1170 if (OP(nxt) != CLOSE)
1172 /* Now we know that nxt2 is the only contents: */
1173 oscan->flags = (U8)ARG(nxt);
1175 OP(nxt1) = NOTHING; /* was OPEN. */
1177 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1178 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1179 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1180 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1181 OP(nxt + 1) = OPTIMIZED; /* was count. */
1182 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1187 /* Try optimization CURLYX => CURLYM. */
1188 if ( OP(oscan) == CURLYX && data
1189 && !(data->flags & SF_HAS_PAR)
1190 && !(data->flags & SF_HAS_EVAL)
1191 && !deltanext /* atom is fixed width */
1192 && minnext != 0 /* CURLYM can't handle zero width */
1194 /* XXXX How to optimize if data == 0? */
1195 /* Optimize to a simpler form. */
1196 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1200 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1201 && (OP(nxt2) != WHILEM))
1203 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1204 /* Need to optimize away parenths. */
1205 if (data->flags & SF_IN_PAR) {
1206 /* Set the parenth number. */
1207 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1209 if (OP(nxt) != CLOSE)
1210 FAIL("Panic opt close");
1211 oscan->flags = (U8)ARG(nxt);
1212 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1213 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1215 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1216 OP(nxt + 1) = OPTIMIZED; /* was count. */
1217 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1218 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1221 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1222 regnode *nnxt = regnext(nxt1);
1225 if (reg_off_by_arg[OP(nxt1)])
1226 ARG_SET(nxt1, nxt2 - nxt1);
1227 else if (nxt2 - nxt1 < U16_MAX)
1228 NEXT_OFF(nxt1) = nxt2 - nxt1;
1230 OP(nxt) = NOTHING; /* Cannot beautify */
1235 /* Optimize again: */
1236 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1242 else if ((OP(oscan) == CURLYX)
1243 && (flags & SCF_WHILEM_VISITED_POS)
1244 /* See the comment on a similar expression above.
1245 However, this time it not a subexpression
1246 we care about, but the expression itself. */
1247 && (maxcount == REG_INFTY)
1248 && data && ++data->whilem_c < 16) {
1249 /* This stays as CURLYX, we can put the count/of pair. */
1250 /* Find WHILEM (as in regexec.c) */
1251 regnode *nxt = oscan + NEXT_OFF(oscan);
1253 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1255 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1256 | (RExC_whilem_seen << 4)); /* On WHILEM */
1258 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1260 if (flags & SCF_DO_SUBSTR) {
1261 SV *last_str = Nullsv;
1262 int counted = mincount != 0;
1264 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1265 #if defined(SPARC64_GCC_WORKAROUND)
1271 if (pos_before >= data->last_start_min)
1274 b = data->last_start_min;
1277 s = SvPV(data->last_found, l);
1278 old = b - data->last_start_min;
1281 I32 b = pos_before >= data->last_start_min
1282 ? pos_before : data->last_start_min;
1284 char *s = SvPV(data->last_found, l);
1285 I32 old = b - data->last_start_min;
1289 old = utf8_hop((U8*)s, old) - (U8*)s;
1292 /* Get the added string: */
1293 last_str = newSVpvn(s + old, l);
1295 SvUTF8_on(last_str);
1296 if (deltanext == 0 && pos_before == b) {
1297 /* What was added is a constant string */
1299 SvGROW(last_str, (mincount * l) + 1);
1300 repeatcpy(SvPVX(last_str) + l,
1301 SvPVX(last_str), l, mincount - 1);
1302 SvCUR(last_str) *= mincount;
1303 /* Add additional parts. */
1304 SvCUR_set(data->last_found,
1305 SvCUR(data->last_found) - l);
1306 sv_catsv(data->last_found, last_str);
1308 SV * sv = data->last_found;
1310 SvUTF8(sv) && SvMAGICAL(sv) ?
1311 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1312 if (mg && mg->mg_len >= 0)
1313 mg->mg_len += CHR_SVLEN(last_str);
1315 data->last_end += l * (mincount - 1);
1318 /* start offset must point into the last copy */
1319 data->last_start_min += minnext * (mincount - 1);
1320 data->last_start_max += is_inf ? I32_MAX
1321 : (maxcount - 1) * (minnext + data->pos_delta);
1324 /* It is counted once already... */
1325 data->pos_min += minnext * (mincount - counted);
1326 data->pos_delta += - counted * deltanext +
1327 (minnext + deltanext) * maxcount - minnext * mincount;
1328 if (mincount != maxcount) {
1329 /* Cannot extend fixed substrings found inside
1331 scan_commit(pRExC_state,data);
1332 if (mincount && last_str) {
1333 sv_setsv(data->last_found, last_str);
1334 data->last_end = data->pos_min;
1335 data->last_start_min =
1336 data->pos_min - CHR_SVLEN(last_str);
1337 data->last_start_max = is_inf
1339 : data->pos_min + data->pos_delta
1340 - CHR_SVLEN(last_str);
1342 data->longest = &(data->longest_float);
1344 SvREFCNT_dec(last_str);
1346 if (data && (fl & SF_HAS_EVAL))
1347 data->flags |= SF_HAS_EVAL;
1348 optimize_curly_tail:
1349 if (OP(oscan) != CURLYX) {
1350 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1352 NEXT_OFF(oscan) += NEXT_OFF(next);
1355 default: /* REF and CLUMP only? */
1356 if (flags & SCF_DO_SUBSTR) {
1357 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1358 data->longest = &(data->longest_float);
1360 is_inf = is_inf_internal = 1;
1361 if (flags & SCF_DO_STCLASS_OR)
1362 cl_anything(pRExC_state, data->start_class);
1363 flags &= ~SCF_DO_STCLASS;
1367 else if (strchr((char*)PL_simple,OP(scan))) {
1370 if (flags & SCF_DO_SUBSTR) {
1371 scan_commit(pRExC_state,data);
1375 if (flags & SCF_DO_STCLASS) {
1376 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1378 /* Some of the logic below assumes that switching
1379 locale on will only add false positives. */
1380 switch (PL_regkind[(U8)OP(scan)]) {
1384 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1385 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1386 cl_anything(pRExC_state, data->start_class);
1389 if (OP(scan) == SANY)
1391 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1392 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1393 || (data->start_class->flags & ANYOF_CLASS));
1394 cl_anything(pRExC_state, data->start_class);
1396 if (flags & SCF_DO_STCLASS_AND || !value)
1397 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1400 if (flags & SCF_DO_STCLASS_AND)
1401 cl_and(data->start_class,
1402 (struct regnode_charclass_class*)scan);
1404 cl_or(pRExC_state, data->start_class,
1405 (struct regnode_charclass_class*)scan);
1408 if (flags & SCF_DO_STCLASS_AND) {
1409 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1410 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1411 for (value = 0; value < 256; value++)
1412 if (!isALNUM(value))
1413 ANYOF_BITMAP_CLEAR(data->start_class, value);
1417 if (data->start_class->flags & ANYOF_LOCALE)
1418 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1420 for (value = 0; value < 256; value++)
1422 ANYOF_BITMAP_SET(data->start_class, value);
1427 if (flags & SCF_DO_STCLASS_AND) {
1428 if (data->start_class->flags & ANYOF_LOCALE)
1429 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1432 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1433 data->start_class->flags |= ANYOF_LOCALE;
1437 if (flags & SCF_DO_STCLASS_AND) {
1438 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1439 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1440 for (value = 0; value < 256; value++)
1442 ANYOF_BITMAP_CLEAR(data->start_class, value);
1446 if (data->start_class->flags & ANYOF_LOCALE)
1447 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1449 for (value = 0; value < 256; value++)
1450 if (!isALNUM(value))
1451 ANYOF_BITMAP_SET(data->start_class, value);
1456 if (flags & SCF_DO_STCLASS_AND) {
1457 if (data->start_class->flags & ANYOF_LOCALE)
1458 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1461 data->start_class->flags |= ANYOF_LOCALE;
1462 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1466 if (flags & SCF_DO_STCLASS_AND) {
1467 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1468 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1469 for (value = 0; value < 256; value++)
1470 if (!isSPACE(value))
1471 ANYOF_BITMAP_CLEAR(data->start_class, value);
1475 if (data->start_class->flags & ANYOF_LOCALE)
1476 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1478 for (value = 0; value < 256; value++)
1480 ANYOF_BITMAP_SET(data->start_class, value);
1485 if (flags & SCF_DO_STCLASS_AND) {
1486 if (data->start_class->flags & ANYOF_LOCALE)
1487 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1490 data->start_class->flags |= ANYOF_LOCALE;
1491 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1495 if (flags & SCF_DO_STCLASS_AND) {
1496 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1497 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1498 for (value = 0; value < 256; value++)
1500 ANYOF_BITMAP_CLEAR(data->start_class, value);
1504 if (data->start_class->flags & ANYOF_LOCALE)
1505 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1507 for (value = 0; value < 256; value++)
1508 if (!isSPACE(value))
1509 ANYOF_BITMAP_SET(data->start_class, value);
1514 if (flags & SCF_DO_STCLASS_AND) {
1515 if (data->start_class->flags & ANYOF_LOCALE) {
1516 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1517 for (value = 0; value < 256; value++)
1518 if (!isSPACE(value))
1519 ANYOF_BITMAP_CLEAR(data->start_class, value);
1523 data->start_class->flags |= ANYOF_LOCALE;
1524 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1528 if (flags & SCF_DO_STCLASS_AND) {
1529 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1530 for (value = 0; value < 256; value++)
1531 if (!isDIGIT(value))
1532 ANYOF_BITMAP_CLEAR(data->start_class, value);
1535 if (data->start_class->flags & ANYOF_LOCALE)
1536 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1538 for (value = 0; value < 256; value++)
1540 ANYOF_BITMAP_SET(data->start_class, value);
1545 if (flags & SCF_DO_STCLASS_AND) {
1546 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1547 for (value = 0; value < 256; value++)
1549 ANYOF_BITMAP_CLEAR(data->start_class, value);
1552 if (data->start_class->flags & ANYOF_LOCALE)
1553 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1555 for (value = 0; value < 256; value++)
1556 if (!isDIGIT(value))
1557 ANYOF_BITMAP_SET(data->start_class, value);
1562 if (flags & SCF_DO_STCLASS_OR)
1563 cl_and(data->start_class, &and_with);
1564 flags &= ~SCF_DO_STCLASS;
1567 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1568 data->flags |= (OP(scan) == MEOL
1572 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1573 /* Lookbehind, or need to calculate parens/evals/stclass: */
1574 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1575 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1576 /* Lookahead/lookbehind */
1577 I32 deltanext, minnext, fake = 0;
1579 struct regnode_charclass_class intrnl;
1582 data_fake.flags = 0;
1584 data_fake.whilem_c = data->whilem_c;
1585 data_fake.last_closep = data->last_closep;
1588 data_fake.last_closep = &fake;
1589 if ( flags & SCF_DO_STCLASS && !scan->flags
1590 && OP(scan) == IFMATCH ) { /* Lookahead */
1591 cl_init(pRExC_state, &intrnl);
1592 data_fake.start_class = &intrnl;
1593 f |= SCF_DO_STCLASS_AND;
1595 if (flags & SCF_WHILEM_VISITED_POS)
1596 f |= SCF_WHILEM_VISITED_POS;
1597 next = regnext(scan);
1598 nscan = NEXTOPER(NEXTOPER(scan));
1599 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1602 vFAIL("Variable length lookbehind not implemented");
1604 else if (minnext > U8_MAX) {
1605 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1607 scan->flags = (U8)minnext;
1609 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1611 if (data && (data_fake.flags & SF_HAS_EVAL))
1612 data->flags |= SF_HAS_EVAL;
1614 data->whilem_c = data_fake.whilem_c;
1615 if (f & SCF_DO_STCLASS_AND) {
1616 int was = (data->start_class->flags & ANYOF_EOS);
1618 cl_and(data->start_class, &intrnl);
1620 data->start_class->flags |= ANYOF_EOS;
1623 else if (OP(scan) == OPEN) {
1626 else if (OP(scan) == CLOSE) {
1627 if ((I32)ARG(scan) == is_par) {
1628 next = regnext(scan);
1630 if ( next && (OP(next) != WHILEM) && next < last)
1631 is_par = 0; /* Disable optimization */
1634 *(data->last_closep) = ARG(scan);
1636 else if (OP(scan) == EVAL) {
1638 data->flags |= SF_HAS_EVAL;
1640 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1641 if (flags & SCF_DO_SUBSTR) {
1642 scan_commit(pRExC_state,data);
1643 data->longest = &(data->longest_float);
1645 is_inf = is_inf_internal = 1;
1646 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1647 cl_anything(pRExC_state, data->start_class);
1648 flags &= ~SCF_DO_STCLASS;
1650 /* Else: zero-length, ignore. */
1651 scan = regnext(scan);
1656 *deltap = is_inf_internal ? I32_MAX : delta;
1657 if (flags & SCF_DO_SUBSTR && is_inf)
1658 data->pos_delta = I32_MAX - data->pos_min;
1659 if (is_par > U8_MAX)
1661 if (is_par && pars==1 && data) {
1662 data->flags |= SF_IN_PAR;
1663 data->flags &= ~SF_HAS_PAR;
1665 else if (pars && data) {
1666 data->flags |= SF_HAS_PAR;
1667 data->flags &= ~SF_IN_PAR;
1669 if (flags & SCF_DO_STCLASS_OR)
1670 cl_and(data->start_class, &and_with);
1675 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1677 if (RExC_rx->data) {
1678 Renewc(RExC_rx->data,
1679 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1680 char, struct reg_data);
1681 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1682 RExC_rx->data->count += n;
1685 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1686 char, struct reg_data);
1687 New(1208, RExC_rx->data->what, n, U8);
1688 RExC_rx->data->count = n;
1690 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1691 return RExC_rx->data->count - n;
1695 Perl_reginitcolors(pTHX)
1698 char *s = PerlEnv_getenv("PERL_RE_COLORS");
1701 PL_colors[0] = s = savepv(s);
1703 s = strchr(s, '\t');
1709 PL_colors[i] = s = "";
1713 PL_colors[i++] = "";
1720 - pregcomp - compile a regular expression into internal code
1722 * We can't allocate space until we know how big the compiled form will be,
1723 * but we can't compile it (and thus know how big it is) until we've got a
1724 * place to put the code. So we cheat: we compile it twice, once with code
1725 * generation turned off and size counting turned on, and once "for real".
1726 * This also means that we don't allocate space until we are sure that the
1727 * thing really will compile successfully, and we never have to move the
1728 * code and thus invalidate pointers into it. (Note that it has to be in
1729 * one piece because free() must be able to free it all.) [NB: not true in perl]
1731 * Beware that the optimization-preparation code in here knows about some
1732 * of the structure of the compiled regexp. [I'll say.]
1735 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1745 RExC_state_t RExC_state;
1746 RExC_state_t *pRExC_state = &RExC_state;
1749 FAIL("NULL regexp argument");
1751 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1755 if (!PL_colorset) reginitcolors();
1756 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1757 PL_colors[4],PL_colors[5],PL_colors[0],
1758 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1760 RExC_flags = pm->op_pmflags;
1764 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1765 RExC_seen_evals = 0;
1768 /* First pass: determine size, legality. */
1775 RExC_emit = &PL_regdummy;
1776 RExC_whilem_seen = 0;
1777 #if 0 /* REGC() is (currently) a NOP at the first pass.
1778 * Clever compilers notice this and complain. --jhi */
1779 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1781 if (reg(pRExC_state, 0, &flags) == NULL) {
1782 RExC_precomp = Nullch;
1785 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1787 /* Small enough for pointer-storage convention?
1788 If extralen==0, this means that we will not need long jumps. */
1789 if (RExC_size >= 0x10000L && RExC_extralen)
1790 RExC_size += RExC_extralen;
1793 if (RExC_whilem_seen > 15)
1794 RExC_whilem_seen = 15;
1796 /* Allocate space and initialize. */
1797 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1800 FAIL("Regexp out of space");
1803 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1804 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1807 r->prelen = xend - exp;
1808 r->precomp = savepvn(RExC_precomp, r->prelen);
1810 #ifdef PERL_COPY_ON_WRITE
1811 r->saved_copy = Nullsv;
1813 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1814 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1816 r->substrs = 0; /* Useful during FAIL. */
1817 r->startp = 0; /* Useful during FAIL. */
1818 r->endp = 0; /* Useful during FAIL. */
1820 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1822 r->offsets[0] = RExC_size;
1824 DEBUG_r(PerlIO_printf(Perl_debug_log,
1825 "%s %"UVuf" bytes for offset annotations.\n",
1826 r->offsets ? "Got" : "Couldn't get",
1827 (UV)((2*RExC_size+1) * sizeof(U32))));
1831 /* Second pass: emit code. */
1832 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
1837 RExC_emit_start = r->program;
1838 RExC_emit = r->program;
1839 /* Store the count of eval-groups for security checks: */
1840 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1841 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1843 if (reg(pRExC_state, 0, &flags) == NULL)
1846 /* Dig out information for optimizations. */
1847 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1848 pm->op_pmflags = RExC_flags;
1850 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1851 r->regstclass = NULL;
1852 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1853 r->reganch |= ROPT_NAUGHTY;
1854 scan = r->program + 1; /* First BRANCH. */
1856 /* XXXX To minimize changes to RE engine we always allocate
1857 3-units-long substrs field. */
1858 Newz(1004, r->substrs, 1, struct reg_substr_data);
1860 StructCopy(&zero_scan_data, &data, scan_data_t);
1861 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1862 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1864 STRLEN longest_float_length, longest_fixed_length;
1865 struct regnode_charclass_class ch_class;
1870 /* Skip introductions and multiplicators >= 1. */
1871 while ((OP(first) == OPEN && (sawopen = 1)) ||
1872 /* An OR of *one* alternative - should not happen now. */
1873 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1874 (OP(first) == PLUS) ||
1875 (OP(first) == MINMOD) ||
1876 /* An {n,m} with n>0 */
1877 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1878 if (OP(first) == PLUS)
1881 first += regarglen[(U8)OP(first)];
1882 first = NEXTOPER(first);
1885 /* Starting-point info. */
1887 if (PL_regkind[(U8)OP(first)] == EXACT) {
1888 if (OP(first) == EXACT)
1889 ; /* Empty, get anchored substr later. */
1890 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1891 r->regstclass = first;
1893 else if (strchr((char*)PL_simple,OP(first)))
1894 r->regstclass = first;
1895 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1896 PL_regkind[(U8)OP(first)] == NBOUND)
1897 r->regstclass = first;
1898 else if (PL_regkind[(U8)OP(first)] == BOL) {
1899 r->reganch |= (OP(first) == MBOL
1901 : (OP(first) == SBOL
1904 first = NEXTOPER(first);
1907 else if (OP(first) == GPOS) {
1908 r->reganch |= ROPT_ANCH_GPOS;
1909 first = NEXTOPER(first);
1912 else if (!sawopen && (OP(first) == STAR &&
1913 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1914 !(r->reganch & ROPT_ANCH) )
1916 /* turn .* into ^.* with an implied $*=1 */
1917 int type = OP(NEXTOPER(first));
1919 if (type == REG_ANY)
1920 type = ROPT_ANCH_MBOL;
1922 type = ROPT_ANCH_SBOL;
1924 r->reganch |= type | ROPT_IMPLICIT;
1925 first = NEXTOPER(first);
1928 if (sawplus && (!sawopen || !RExC_sawback)
1929 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1930 /* x+ must match at the 1st pos of run of x's */
1931 r->reganch |= ROPT_SKIP;
1933 /* Scan is after the zeroth branch, first is atomic matcher. */
1934 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1935 (IV)(first - scan + 1)));
1937 * If there's something expensive in the r.e., find the
1938 * longest literal string that must appear and make it the
1939 * regmust. Resolve ties in favor of later strings, since
1940 * the regstart check works with the beginning of the r.e.
1941 * and avoiding duplication strengthens checking. Not a
1942 * strong reason, but sufficient in the absence of others.
1943 * [Now we resolve ties in favor of the earlier string if
1944 * it happens that c_offset_min has been invalidated, since the
1945 * earlier string may buy us something the later one won't.]
1949 data.longest_fixed = newSVpvn("",0);
1950 data.longest_float = newSVpvn("",0);
1951 data.last_found = newSVpvn("",0);
1952 data.longest = &(data.longest_fixed);
1954 if (!r->regstclass) {
1955 cl_init(pRExC_state, &ch_class);
1956 data.start_class = &ch_class;
1957 stclass_flag = SCF_DO_STCLASS_AND;
1958 } else /* XXXX Check for BOUND? */
1960 data.last_closep = &last_close;
1962 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1963 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1964 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1965 && data.last_start_min == 0 && data.last_end > 0
1966 && !RExC_seen_zerolen
1967 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1968 r->reganch |= ROPT_CHECK_ALL;
1969 scan_commit(pRExC_state, &data);
1970 SvREFCNT_dec(data.last_found);
1972 longest_float_length = CHR_SVLEN(data.longest_float);
1973 if (longest_float_length
1974 || (data.flags & SF_FL_BEFORE_EOL
1975 && (!(data.flags & SF_FL_BEFORE_MEOL)
1976 || (RExC_flags & PMf_MULTILINE)))) {
1979 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1980 && data.offset_fixed == data.offset_float_min
1981 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1982 goto remove_float; /* As in (a)+. */
1984 if (SvUTF8(data.longest_float)) {
1985 r->float_utf8 = data.longest_float;
1986 r->float_substr = Nullsv;
1988 r->float_substr = data.longest_float;
1989 r->float_utf8 = Nullsv;
1991 r->float_min_offset = data.offset_float_min;
1992 r->float_max_offset = data.offset_float_max;
1993 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1994 && (!(data.flags & SF_FL_BEFORE_MEOL)
1995 || (RExC_flags & PMf_MULTILINE)));
1996 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
2000 r->float_substr = r->float_utf8 = Nullsv;
2001 SvREFCNT_dec(data.longest_float);
2002 longest_float_length = 0;
2005 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2006 if (longest_fixed_length
2007 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2008 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2009 || (RExC_flags & PMf_MULTILINE)))) {
2012 if (SvUTF8(data.longest_fixed)) {
2013 r->anchored_utf8 = data.longest_fixed;
2014 r->anchored_substr = Nullsv;
2016 r->anchored_substr = data.longest_fixed;
2017 r->anchored_utf8 = Nullsv;
2019 r->anchored_offset = data.offset_fixed;
2020 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2021 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2022 || (RExC_flags & PMf_MULTILINE)));
2023 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2026 r->anchored_substr = r->anchored_utf8 = Nullsv;
2027 SvREFCNT_dec(data.longest_fixed);
2028 longest_fixed_length = 0;
2031 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2032 r->regstclass = NULL;
2033 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2035 && !(data.start_class->flags & ANYOF_EOS)
2036 && !cl_is_anything(data.start_class))
2038 I32 n = add_data(pRExC_state, 1, "f");
2040 New(1006, RExC_rx->data->data[n], 1,
2041 struct regnode_charclass_class);
2042 StructCopy(data.start_class,
2043 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2044 struct regnode_charclass_class);
2045 r->regstclass = (regnode*)RExC_rx->data->data[n];
2046 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2047 PL_regdata = r->data; /* for regprop() */
2048 DEBUG_r({ SV *sv = sv_newmortal();
2049 regprop(sv, (regnode*)data.start_class);
2050 PerlIO_printf(Perl_debug_log,
2051 "synthetic stclass `%s'.\n",
2055 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2056 if (longest_fixed_length > longest_float_length) {
2057 r->check_substr = r->anchored_substr;
2058 r->check_utf8 = r->anchored_utf8;
2059 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2060 if (r->reganch & ROPT_ANCH_SINGLE)
2061 r->reganch |= ROPT_NOSCAN;
2064 r->check_substr = r->float_substr;
2065 r->check_utf8 = r->float_utf8;
2066 r->check_offset_min = data.offset_float_min;
2067 r->check_offset_max = data.offset_float_max;
2069 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2070 This should be changed ASAP! */
2071 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2072 r->reganch |= RE_USE_INTUIT;
2073 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2074 r->reganch |= RE_INTUIT_TAIL;
2078 /* Several toplevels. Best we can is to set minlen. */
2080 struct regnode_charclass_class ch_class;
2083 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2084 scan = r->program + 1;
2085 cl_init(pRExC_state, &ch_class);
2086 data.start_class = &ch_class;
2087 data.last_closep = &last_close;
2088 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2089 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2090 = r->float_substr = r->float_utf8 = Nullsv;
2091 if (!(data.start_class->flags & ANYOF_EOS)
2092 && !cl_is_anything(data.start_class))
2094 I32 n = add_data(pRExC_state, 1, "f");
2096 New(1006, RExC_rx->data->data[n], 1,
2097 struct regnode_charclass_class);
2098 StructCopy(data.start_class,
2099 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2100 struct regnode_charclass_class);
2101 r->regstclass = (regnode*)RExC_rx->data->data[n];
2102 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2103 DEBUG_r({ SV* sv = sv_newmortal();
2104 regprop(sv, (regnode*)data.start_class);
2105 PerlIO_printf(Perl_debug_log,
2106 "synthetic stclass `%s'.\n",
2112 if (RExC_seen & REG_SEEN_GPOS)
2113 r->reganch |= ROPT_GPOS_SEEN;
2114 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2115 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2116 if (RExC_seen & REG_SEEN_EVAL)
2117 r->reganch |= ROPT_EVAL_SEEN;
2118 if (RExC_seen & REG_SEEN_CANY)
2119 r->reganch |= ROPT_CANY_SEEN;
2120 Newz(1002, r->startp, RExC_npar, I32);
2121 Newz(1002, r->endp, RExC_npar, I32);
2122 PL_regdata = r->data; /* for regprop() */
2123 DEBUG_r(regdump(r));
2128 - reg - regular expression, i.e. main body or parenthesized thing
2130 * Caller must absorb opening parenthesis.
2132 * Combining parenthesis handling with the base level of regular expression
2133 * is a trifle forced, but the need to tie the tails of the branches to what
2134 * follows makes it hard to avoid.
2137 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2138 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2140 register regnode *ret; /* Will be the head of the group. */
2141 register regnode *br;
2142 register regnode *lastbr;
2143 register regnode *ender = 0;
2144 register I32 parno = 0;
2145 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2147 /* for (?g), (?gc), and (?o) warnings; warning
2148 about (?c) will warn about (?g) -- japhy */
2150 I32 wastedflags = 0x00,
2153 wasted_gc = 0x02 | 0x04,
2156 char * parse_start = RExC_parse; /* MJD */
2157 char *oregcomp_parse = RExC_parse;
2160 *flagp = 0; /* Tentatively. */
2163 /* Make an OPEN node, if parenthesized. */
2165 if (*RExC_parse == '?') { /* (?...) */
2166 U32 posflags = 0, negflags = 0;
2167 U32 *flagsp = &posflags;
2169 char *seqstart = RExC_parse;
2172 paren = *RExC_parse++;
2173 ret = NULL; /* For look-ahead/behind. */
2175 case '<': /* (?<...) */
2176 RExC_seen |= REG_SEEN_LOOKBEHIND;
2177 if (*RExC_parse == '!')
2179 if (*RExC_parse != '=' && *RExC_parse != '!')
2182 case '=': /* (?=...) */
2183 case '!': /* (?!...) */
2184 RExC_seen_zerolen++;
2185 case ':': /* (?:...) */
2186 case '>': /* (?>...) */
2188 case '$': /* (?$...) */
2189 case '@': /* (?@...) */
2190 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2192 case '#': /* (?#...) */
2193 while (*RExC_parse && *RExC_parse != ')')
2195 if (*RExC_parse != ')')
2196 FAIL("Sequence (?#... not terminated");
2197 nextchar(pRExC_state);
2200 case 'p': /* (?p...) */
2201 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2202 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2204 case '?': /* (??...) */
2206 if (*RExC_parse != '{')
2208 paren = *RExC_parse++;
2210 case '{': /* (?{...}) */
2212 I32 count = 1, n = 0;
2214 char *s = RExC_parse;
2216 OP_4tree *sop, *rop;
2218 RExC_seen_zerolen++;
2219 RExC_seen |= REG_SEEN_EVAL;
2220 while (count && (c = *RExC_parse)) {
2221 if (c == '\\' && RExC_parse[1])
2229 if (*RExC_parse != ')')
2232 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2237 if (RExC_parse - 1 - s)
2238 sv = newSVpvn(s, RExC_parse - 1 - s);
2240 sv = newSVpvn("", 0);
2243 Perl_save_re_context(aTHX);
2244 rop = sv_compile_2op(sv, &sop, "re", &pad);
2245 sop->op_private |= OPpREFCOUNTED;
2246 /* re_dup will OpREFCNT_inc */
2247 OpREFCNT_set(sop, 1);
2250 n = add_data(pRExC_state, 3, "nop");
2251 RExC_rx->data->data[n] = (void*)rop;
2252 RExC_rx->data->data[n+1] = (void*)sop;
2253 RExC_rx->data->data[n+2] = (void*)pad;
2256 else { /* First pass */
2257 if (PL_reginterp_cnt < ++RExC_seen_evals
2259 /* No compiled RE interpolated, has runtime
2260 components ===> unsafe. */
2261 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2262 if (PL_tainting && PL_tainted)
2263 FAIL("Eval-group in insecure regular expression");
2264 if (IN_PERL_COMPILETIME)
2268 nextchar(pRExC_state);
2270 ret = reg_node(pRExC_state, LOGICAL);
2273 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2274 /* deal with the length of this later - MJD */
2277 ret = reganode(pRExC_state, EVAL, n);
2278 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2279 Set_Node_Offset(ret, parse_start);
2282 case '(': /* (?(?{...})...) and (?(?=...)...) */
2284 if (RExC_parse[0] == '?') { /* (?(?...)) */
2285 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2286 || RExC_parse[1] == '<'
2287 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2290 ret = reg_node(pRExC_state, LOGICAL);
2293 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2297 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2299 parno = atoi(RExC_parse++);
2301 while (isDIGIT(*RExC_parse))
2303 ret = reganode(pRExC_state, GROUPP, parno);
2305 if ((c = *nextchar(pRExC_state)) != ')')
2306 vFAIL("Switch condition not recognized");
2308 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2309 br = regbranch(pRExC_state, &flags, 1);
2311 br = reganode(pRExC_state, LONGJMP, 0);
2313 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2314 c = *nextchar(pRExC_state);
2318 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2319 regbranch(pRExC_state, &flags, 1);
2320 regtail(pRExC_state, ret, lastbr);
2323 c = *nextchar(pRExC_state);
2328 vFAIL("Switch (?(condition)... contains too many branches");
2329 ender = reg_node(pRExC_state, TAIL);
2330 regtail(pRExC_state, br, ender);
2332 regtail(pRExC_state, lastbr, ender);
2333 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2336 regtail(pRExC_state, ret, ender);
2340 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2344 RExC_parse--; /* for vFAIL to print correctly */
2345 vFAIL("Sequence (? incomplete");
2349 parse_flags: /* (?i) */
2350 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2351 /* (?g), (?gc) and (?o) are useless here
2352 and must be globally applied -- japhy */
2354 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2355 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2356 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2357 if (! (wastedflags & wflagbit) ) {
2358 wastedflags |= wflagbit;
2361 "Useless (%s%c) - %suse /%c modifier",
2362 flagsp == &negflags ? "?-" : "?",
2364 flagsp == &negflags ? "don't " : "",
2370 else if (*RExC_parse == 'c') {
2371 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2372 if (! (wastedflags & wasted_c) ) {
2373 wastedflags |= wasted_gc;
2376 "Useless (%sc) - %suse /gc modifier",
2377 flagsp == &negflags ? "?-" : "?",
2378 flagsp == &negflags ? "don't " : ""
2383 else { pmflag(flagsp, *RExC_parse); }
2387 if (*RExC_parse == '-') {
2389 wastedflags = 0; /* reset so (?g-c) warns twice */
2393 RExC_flags |= posflags;
2394 RExC_flags &= ~negflags;
2395 if (*RExC_parse == ':') {
2401 if (*RExC_parse != ')') {
2403 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2405 nextchar(pRExC_state);
2413 ret = reganode(pRExC_state, OPEN, parno);
2414 Set_Node_Length(ret, 1); /* MJD */
2415 Set_Node_Offset(ret, RExC_parse); /* MJD */
2422 /* Pick up the branches, linking them together. */
2423 parse_start = RExC_parse; /* MJD */
2424 br = regbranch(pRExC_state, &flags, 1);
2425 /* branch_len = (paren != 0); */
2429 if (*RExC_parse == '|') {
2430 if (!SIZE_ONLY && RExC_extralen) {
2431 reginsert(pRExC_state, BRANCHJ, br);
2434 reginsert(pRExC_state, BRANCH, br);
2435 Set_Node_Length(br, paren != 0);
2436 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2440 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2442 else if (paren == ':') {
2443 *flagp |= flags&SIMPLE;
2445 if (open) { /* Starts with OPEN. */
2446 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2448 else if (paren != '?') /* Not Conditional */
2450 *flagp |= flags & (SPSTART | HASWIDTH);
2452 while (*RExC_parse == '|') {
2453 if (!SIZE_ONLY && RExC_extralen) {
2454 ender = reganode(pRExC_state, LONGJMP,0);
2455 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2458 RExC_extralen += 2; /* Account for LONGJMP. */
2459 nextchar(pRExC_state);
2460 br = regbranch(pRExC_state, &flags, 0);
2464 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2468 *flagp |= flags&SPSTART;
2471 if (have_branch || paren != ':') {
2472 /* Make a closing node, and hook it on the end. */
2475 ender = reg_node(pRExC_state, TAIL);
2478 ender = reganode(pRExC_state, CLOSE, parno);
2479 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2480 Set_Node_Length(ender,1); /* MJD */
2486 *flagp &= ~HASWIDTH;
2489 ender = reg_node(pRExC_state, SUCCEED);
2492 ender = reg_node(pRExC_state, END);
2495 regtail(pRExC_state, lastbr, ender);
2498 /* Hook the tails of the branches to the closing node. */
2499 for (br = ret; br != NULL; br = regnext(br)) {
2500 regoptail(pRExC_state, br, ender);
2507 static char parens[] = "=!<,>";
2509 if (paren && (p = strchr(parens, paren))) {
2510 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2511 int flag = (p - parens) > 1;
2514 node = SUSPEND, flag = 0;
2515 reginsert(pRExC_state, node,ret);
2516 Set_Node_Cur_Length(ret);
2517 Set_Node_Offset(ret, parse_start + 1);
2519 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2523 /* Check for proper termination. */
2525 RExC_flags = oregflags;
2526 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2527 RExC_parse = oregcomp_parse;
2528 vFAIL("Unmatched (");
2531 else if (!paren && RExC_parse < RExC_end) {
2532 if (*RExC_parse == ')') {
2534 vFAIL("Unmatched )");
2537 FAIL("Junk on end of regexp"); /* "Can't happen". */
2545 - regbranch - one alternative of an | operator
2547 * Implements the concatenation operator.
2550 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2552 register regnode *ret;
2553 register regnode *chain = NULL;
2554 register regnode *latest;
2555 I32 flags = 0, c = 0;
2560 if (!SIZE_ONLY && RExC_extralen)
2561 ret = reganode(pRExC_state, BRANCHJ,0);
2563 ret = reg_node(pRExC_state, BRANCH);
2564 Set_Node_Length(ret, 1);
2568 if (!first && SIZE_ONLY)
2569 RExC_extralen += 1; /* BRANCHJ */
2571 *flagp = WORST; /* Tentatively. */
2574 nextchar(pRExC_state);
2575 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2577 latest = regpiece(pRExC_state, &flags);
2578 if (latest == NULL) {
2579 if (flags & TRYAGAIN)
2583 else if (ret == NULL)
2585 *flagp |= flags&HASWIDTH;
2586 if (chain == NULL) /* First piece. */
2587 *flagp |= flags&SPSTART;
2590 regtail(pRExC_state, chain, latest);
2595 if (chain == NULL) { /* Loop ran zero times. */
2596 chain = reg_node(pRExC_state, NOTHING);
2601 *flagp |= flags&SIMPLE;
2608 - regpiece - something followed by possible [*+?]
2610 * Note that the branching code sequences used for ? and the general cases
2611 * of * and + are somewhat optimized: they use the same NOTHING node as
2612 * both the endmarker for their branch list and the body of the last branch.
2613 * It might seem that this node could be dispensed with entirely, but the
2614 * endmarker role is not redundant.
2617 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2619 register regnode *ret;
2621 register char *next;
2623 char *origparse = RExC_parse;
2626 I32 max = REG_INFTY;
2629 ret = regatom(pRExC_state, &flags);
2631 if (flags & TRYAGAIN)
2638 if (op == '{' && regcurly(RExC_parse)) {
2639 parse_start = RExC_parse; /* MJD */
2640 next = RExC_parse + 1;
2642 while (isDIGIT(*next) || *next == ',') {
2651 if (*next == '}') { /* got one */
2655 min = atoi(RExC_parse);
2659 maxpos = RExC_parse;
2661 if (!max && *maxpos != '0')
2662 max = REG_INFTY; /* meaning "infinity" */
2663 else if (max >= REG_INFTY)
2664 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2666 nextchar(pRExC_state);
2669 if ((flags&SIMPLE)) {
2670 RExC_naughty += 2 + RExC_naughty / 2;
2671 reginsert(pRExC_state, CURLY, ret);
2672 Set_Node_Offset(ret, parse_start+1); /* MJD */
2673 Set_Node_Cur_Length(ret);
2676 regnode *w = reg_node(pRExC_state, WHILEM);
2679 regtail(pRExC_state, ret, w);
2680 if (!SIZE_ONLY && RExC_extralen) {
2681 reginsert(pRExC_state, LONGJMP,ret);
2682 reginsert(pRExC_state, NOTHING,ret);
2683 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2685 reginsert(pRExC_state, CURLYX,ret);
2687 Set_Node_Offset(ret, parse_start+1);
2688 Set_Node_Length(ret,
2689 op == '{' ? (RExC_parse - parse_start) : 1);
2691 if (!SIZE_ONLY && RExC_extralen)
2692 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2693 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2695 RExC_whilem_seen++, RExC_extralen += 3;
2696 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2704 if (max && max < min)
2705 vFAIL("Can't do {n,m} with n > m");
2707 ARG1_SET(ret, (U16)min);
2708 ARG2_SET(ret, (U16)max);
2720 #if 0 /* Now runtime fix should be reliable. */
2722 /* if this is reinstated, don't forget to put this back into perldiag:
2724 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2726 (F) The part of the regexp subject to either the * or + quantifier
2727 could match an empty string. The {#} shows in the regular
2728 expression about where the problem was discovered.
2732 if (!(flags&HASWIDTH) && op != '?')
2733 vFAIL("Regexp *+ operand could be empty");
2736 parse_start = RExC_parse;
2737 nextchar(pRExC_state);
2739 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2741 if (op == '*' && (flags&SIMPLE)) {
2742 reginsert(pRExC_state, STAR, ret);
2746 else if (op == '*') {
2750 else if (op == '+' && (flags&SIMPLE)) {
2751 reginsert(pRExC_state, PLUS, ret);
2755 else if (op == '+') {
2759 else if (op == '?') {
2764 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2766 "%.*s matches null string many times",
2767 RExC_parse - origparse,
2771 if (*RExC_parse == '?') {
2772 nextchar(pRExC_state);
2773 reginsert(pRExC_state, MINMOD, ret);
2774 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2776 if (ISMULT2(RExC_parse)) {
2778 vFAIL("Nested quantifiers");
2785 - regatom - the lowest level
2787 * Optimization: gobbles an entire sequence of ordinary characters so that
2788 * it can turn them into a single node, which is smaller to store and
2789 * faster to run. Backslashed characters are exceptions, each becoming a
2790 * separate node; the code is simpler that way and it's not worth fixing.
2792 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2794 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2796 register regnode *ret = 0;
2798 char *parse_start = RExC_parse;
2800 *flagp = WORST; /* Tentatively. */
2803 switch (*RExC_parse) {
2805 RExC_seen_zerolen++;
2806 nextchar(pRExC_state);
2807 if (RExC_flags & PMf_MULTILINE)
2808 ret = reg_node(pRExC_state, MBOL);
2809 else if (RExC_flags & PMf_SINGLELINE)
2810 ret = reg_node(pRExC_state, SBOL);
2812 ret = reg_node(pRExC_state, BOL);
2813 Set_Node_Length(ret, 1); /* MJD */
2816 nextchar(pRExC_state);
2818 RExC_seen_zerolen++;
2819 if (RExC_flags & PMf_MULTILINE)
2820 ret = reg_node(pRExC_state, MEOL);
2821 else if (RExC_flags & PMf_SINGLELINE)
2822 ret = reg_node(pRExC_state, SEOL);
2824 ret = reg_node(pRExC_state, EOL);
2825 Set_Node_Length(ret, 1); /* MJD */
2828 nextchar(pRExC_state);
2829 if (RExC_flags & PMf_SINGLELINE)
2830 ret = reg_node(pRExC_state, SANY);
2832 ret = reg_node(pRExC_state, REG_ANY);
2833 *flagp |= HASWIDTH|SIMPLE;
2835 Set_Node_Length(ret, 1); /* MJD */
2839 char *oregcomp_parse = ++RExC_parse;
2840 ret = regclass(pRExC_state);
2841 if (*RExC_parse != ']') {
2842 RExC_parse = oregcomp_parse;
2843 vFAIL("Unmatched [");
2845 nextchar(pRExC_state);
2846 *flagp |= HASWIDTH|SIMPLE;
2847 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2851 nextchar(pRExC_state);
2852 ret = reg(pRExC_state, 1, &flags);
2854 if (flags & TRYAGAIN) {
2855 if (RExC_parse == RExC_end) {
2856 /* Make parent create an empty node if needed. */
2864 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2868 if (flags & TRYAGAIN) {
2872 vFAIL("Internal urp");
2873 /* Supposed to be caught earlier. */
2876 if (!regcurly(RExC_parse)) {
2885 vFAIL("Quantifier follows nothing");
2888 switch (*++RExC_parse) {
2890 RExC_seen_zerolen++;
2891 ret = reg_node(pRExC_state, SBOL);
2893 nextchar(pRExC_state);
2894 Set_Node_Length(ret, 2); /* MJD */
2897 ret = reg_node(pRExC_state, GPOS);
2898 RExC_seen |= REG_SEEN_GPOS;
2900 nextchar(pRExC_state);
2901 Set_Node_Length(ret, 2); /* MJD */
2904 ret = reg_node(pRExC_state, SEOL);
2906 RExC_seen_zerolen++; /* Do not optimize RE away */
2907 nextchar(pRExC_state);
2910 ret = reg_node(pRExC_state, EOS);
2912 RExC_seen_zerolen++; /* Do not optimize RE away */
2913 nextchar(pRExC_state);
2914 Set_Node_Length(ret, 2); /* MJD */
2917 ret = reg_node(pRExC_state, CANY);
2918 RExC_seen |= REG_SEEN_CANY;
2919 *flagp |= HASWIDTH|SIMPLE;
2920 nextchar(pRExC_state);
2921 Set_Node_Length(ret, 2); /* MJD */
2924 ret = reg_node(pRExC_state, CLUMP);
2926 nextchar(pRExC_state);
2927 Set_Node_Length(ret, 2); /* MJD */
2930 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
2931 *flagp |= HASWIDTH|SIMPLE;
2932 nextchar(pRExC_state);
2933 Set_Node_Length(ret, 2); /* MJD */
2936 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
2937 *flagp |= HASWIDTH|SIMPLE;
2938 nextchar(pRExC_state);
2939 Set_Node_Length(ret, 2); /* MJD */
2942 RExC_seen_zerolen++;
2943 RExC_seen |= REG_SEEN_LOOKBEHIND;
2944 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
2946 nextchar(pRExC_state);
2947 Set_Node_Length(ret, 2); /* MJD */
2950 RExC_seen_zerolen++;
2951 RExC_seen |= REG_SEEN_LOOKBEHIND;
2952 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
2954 nextchar(pRExC_state);
2955 Set_Node_Length(ret, 2); /* MJD */
2958 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
2959 *flagp |= HASWIDTH|SIMPLE;
2960 nextchar(pRExC_state);
2961 Set_Node_Length(ret, 2); /* MJD */
2964 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
2965 *flagp |= HASWIDTH|SIMPLE;
2966 nextchar(pRExC_state);
2967 Set_Node_Length(ret, 2); /* MJD */
2970 ret = reg_node(pRExC_state, DIGIT);
2971 *flagp |= HASWIDTH|SIMPLE;
2972 nextchar(pRExC_state);
2973 Set_Node_Length(ret, 2); /* MJD */
2976 ret = reg_node(pRExC_state, NDIGIT);
2977 *flagp |= HASWIDTH|SIMPLE;
2978 nextchar(pRExC_state);
2979 Set_Node_Length(ret, 2); /* MJD */
2984 char* oldregxend = RExC_end;
2985 char* parse_start = RExC_parse - 2;
2987 if (RExC_parse[1] == '{') {
2988 /* a lovely hack--pretend we saw [\pX] instead */
2989 RExC_end = strchr(RExC_parse, '}');
2991 U8 c = (U8)*RExC_parse;
2993 RExC_end = oldregxend;
2994 vFAIL2("Missing right brace on \\%c{}", c);
2999 RExC_end = RExC_parse + 2;
3000 if (RExC_end > oldregxend)
3001 RExC_end = oldregxend;
3005 ret = regclass(pRExC_state);
3007 RExC_end = oldregxend;
3010 Set_Node_Offset(ret, parse_start + 2);
3011 Set_Node_Cur_Length(ret);
3012 nextchar(pRExC_state);
3013 *flagp |= HASWIDTH|SIMPLE;
3026 case '1': case '2': case '3': case '4':
3027 case '5': case '6': case '7': case '8': case '9':
3029 I32 num = atoi(RExC_parse);
3031 if (num > 9 && num >= RExC_npar)
3034 char * parse_start = RExC_parse - 1; /* MJD */
3035 while (isDIGIT(*RExC_parse))
3038 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3039 vFAIL("Reference to nonexistent group");
3041 ret = reganode(pRExC_state,
3042 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3046 /* override incorrect value set in reganode MJD */
3047 Set_Node_Offset(ret, parse_start+1);
3048 Set_Node_Cur_Length(ret); /* MJD */
3050 nextchar(pRExC_state);
3055 if (RExC_parse >= RExC_end)
3056 FAIL("Trailing \\");
3059 /* Do not generate `unrecognized' warnings here, we fall
3060 back into the quick-grab loop below */
3067 if (RExC_flags & PMf_EXTENDED) {
3068 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3069 if (RExC_parse < RExC_end)
3075 register STRLEN len;
3081 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3083 parse_start = RExC_parse - 1;
3089 ret = reg_node(pRExC_state,
3090 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3092 for (len = 0, p = RExC_parse - 1;
3093 len < 127 && p < RExC_end;
3098 if (RExC_flags & PMf_EXTENDED)
3099 p = regwhite(p, RExC_end);
3146 ender = ASCII_TO_NATIVE('\033');
3150 ender = ASCII_TO_NATIVE('\007');
3155 char* e = strchr(p, '}');
3159 vFAIL("Missing right brace on \\x{}");
3162 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3163 | PERL_SCAN_DISALLOW_PREFIX;
3165 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3172 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3174 ender = grok_hex(p, &numlen, &flags, NULL);
3180 ender = UCHARAT(p++);
3181 ender = toCTRL(ender);
3183 case '0': case '1': case '2': case '3':case '4':
3184 case '5': case '6': case '7': case '8':case '9':
3186 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3189 ender = grok_oct(p, &numlen, &flags, NULL);
3199 FAIL("Trailing \\");
3202 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3203 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3204 goto normal_default;
3209 if (UTF8_IS_START(*p) && UTF) {
3210 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3218 if (RExC_flags & PMf_EXTENDED)
3219 p = regwhite(p, RExC_end);
3221 /* Prime the casefolded buffer. */
3222 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3224 if (ISMULT2(p)) { /* Back off on ?+*. */
3231 /* Emit all the Unicode characters. */
3232 for (foldbuf = tmpbuf;
3234 foldlen -= numlen) {
3235 ender = utf8_to_uvchr(foldbuf, &numlen);
3237 reguni(pRExC_state, ender, s, &unilen);
3240 /* In EBCDIC the numlen
3241 * and unilen can differ. */
3243 if (numlen >= foldlen)
3247 break; /* "Can't happen." */
3251 reguni(pRExC_state, ender, s, &unilen);
3260 REGC((char)ender, s++);
3268 /* Emit all the Unicode characters. */
3269 for (foldbuf = tmpbuf;
3271 foldlen -= numlen) {
3272 ender = utf8_to_uvchr(foldbuf, &numlen);
3274 reguni(pRExC_state, ender, s, &unilen);
3277 /* In EBCDIC the numlen
3278 * and unilen can differ. */
3280 if (numlen >= foldlen)
3288 reguni(pRExC_state, ender, s, &unilen);
3297 REGC((char)ender, s++);
3301 Set_Node_Cur_Length(ret); /* MJD */
3302 nextchar(pRExC_state);
3304 /* len is STRLEN which is unsigned, need to copy to signed */
3307 vFAIL("Internal disaster");
3311 if (len == 1 && UNI_IS_INVARIANT(ender))
3316 RExC_size += STR_SZ(len);
3318 RExC_emit += STR_SZ(len);
3323 /* If the encoding pragma is in effect recode the text of
3324 * any EXACT-kind nodes. */
3325 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3326 STRLEN oldlen = STR_LEN(ret);
3327 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3331 if (sv_utf8_downgrade(sv, TRUE)) {
3332 char *s = sv_recode_to_utf8(sv, PL_encoding);
3333 STRLEN newlen = SvCUR(sv);
3338 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3339 (int)oldlen, STRING(ret),
3341 Copy(s, STRING(ret), newlen, char);
3342 STR_LEN(ret) += newlen - oldlen;
3343 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3345 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3353 S_regwhite(pTHX_ char *p, char *e)
3358 else if (*p == '#') {
3361 } while (p < e && *p != '\n');
3369 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3370 Character classes ([:foo:]) can also be negated ([:^foo:]).
3371 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3372 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3373 but trigger failures because they are currently unimplemented. */
3375 #define POSIXCC_DONE(c) ((c) == ':')
3376 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3377 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3380 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3383 I32 namedclass = OOB_NAMEDCLASS;
3385 if (value == '[' && RExC_parse + 1 < RExC_end &&
3386 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3387 POSIXCC(UCHARAT(RExC_parse))) {
3388 char c = UCHARAT(RExC_parse);
3389 char* s = RExC_parse++;
3391 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3393 if (RExC_parse == RExC_end)
3394 /* Grandfather lone [:, [=, [. */
3397 char* t = RExC_parse++; /* skip over the c */
3399 if (UCHARAT(RExC_parse) == ']') {
3400 RExC_parse++; /* skip over the ending ] */
3403 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3404 I32 skip = 5; /* the most common skip */
3408 if (strnEQ(posixcc, "alnum", 5))
3410 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3411 else if (strnEQ(posixcc, "alpha", 5))
3413 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3414 else if (strnEQ(posixcc, "ascii", 5))
3416 complement ? ANYOF_NASCII : ANYOF_ASCII;
3419 if (strnEQ(posixcc, "blank", 5))
3421 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3424 if (strnEQ(posixcc, "cntrl", 5))
3426 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3429 if (strnEQ(posixcc, "digit", 5))
3431 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3434 if (strnEQ(posixcc, "graph", 5))
3436 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3439 if (strnEQ(posixcc, "lower", 5))
3441 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3444 if (strnEQ(posixcc, "print", 5))
3446 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3447 else if (strnEQ(posixcc, "punct", 5))
3449 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3452 if (strnEQ(posixcc, "space", 5))
3454 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3457 if (strnEQ(posixcc, "upper", 5))
3459 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3461 case 'w': /* this is not POSIX, this is the Perl \w */
3462 if (strnEQ(posixcc, "word", 4)) {
3464 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3469 if (strnEQ(posixcc, "xdigit", 6)) {
3471 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3476 if (namedclass == OOB_NAMEDCLASS ||
3477 posixcc[skip] != ':' ||
3478 posixcc[skip+1] != ']')
3480 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3483 } else if (!SIZE_ONLY) {
3484 /* [[=foo=]] and [[.foo.]] are still future. */
3486 /* adjust RExC_parse so the warning shows after
3488 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3490 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3493 /* Maternal grandfather:
3494 * "[:" ending in ":" but not in ":]" */
3504 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3506 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3507 char *s = RExC_parse;
3510 while(*s && isALNUM(*s))
3512 if (*s && c == *s && s[1] == ']') {
3513 if (ckWARN(WARN_REGEXP))
3515 "POSIX syntax [%c %c] belongs inside character classes",
3518 /* [[=foo=]] and [[.foo.]] are still future. */
3519 if (POSIXCC_NOTYET(c)) {
3520 /* adjust RExC_parse so the error shows after
3522 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3524 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3531 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3534 register UV nextvalue;
3535 register IV prevvalue = OOB_UNICODE;
3536 register IV range = 0;
3537 register regnode *ret;
3540 char *rangebegin = 0;
3541 bool need_class = 0;
3542 SV *listsv = Nullsv;
3545 bool optimize_invert = TRUE;
3546 AV* unicode_alternate = 0;
3548 UV literal_endpoint = 0;
3551 ret = reganode(pRExC_state, ANYOF, 0);
3554 ANYOF_FLAGS(ret) = 0;
3556 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3560 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3564 RExC_size += ANYOF_SKIP;
3566 RExC_emit += ANYOF_SKIP;
3568 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3570 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3571 ANYOF_BITMAP_ZERO(ret);
3572 listsv = newSVpvn("# comment\n", 10);
3575 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3577 if (!SIZE_ONLY && POSIXCC(nextvalue))
3578 checkposixcc(pRExC_state);
3580 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3581 if (UCHARAT(RExC_parse) == ']')
3584 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3588 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3591 rangebegin = RExC_parse;
3593 value = utf8n_to_uvchr((U8*)RExC_parse,
3594 RExC_end - RExC_parse,
3596 RExC_parse += numlen;
3599 value = UCHARAT(RExC_parse++);
3600 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3601 if (value == '[' && POSIXCC(nextvalue))
3602 namedclass = regpposixcc(pRExC_state, value);
3603 else if (value == '\\') {
3605 value = utf8n_to_uvchr((U8*)RExC_parse,
3606 RExC_end - RExC_parse,
3608 RExC_parse += numlen;
3611 value = UCHARAT(RExC_parse++);
3612 /* Some compilers cannot handle switching on 64-bit integer
3613 * values, therefore value cannot be an UV. Yes, this will
3614 * be a problem later if we want switch on Unicode.
3615 * A similar issue a little bit later when switching on
3616 * namedclass. --jhi */
3617 switch ((I32)value) {
3618 case 'w': namedclass = ANYOF_ALNUM; break;
3619 case 'W': namedclass = ANYOF_NALNUM; break;
3620 case 's': namedclass = ANYOF_SPACE; break;
3621 case 'S': namedclass = ANYOF_NSPACE; break;
3622 case 'd': namedclass = ANYOF_DIGIT; break;
3623 case 'D': namedclass = ANYOF_NDIGIT; break;
3626 if (RExC_parse >= RExC_end)
3627 vFAIL2("Empty \\%c{}", (U8)value);
3628 if (*RExC_parse == '{') {
3630 e = strchr(RExC_parse++, '}');
3632 vFAIL2("Missing right brace on \\%c{}", c);
3633 while (isSPACE(UCHARAT(RExC_parse)))
3635 if (e == RExC_parse)
3636 vFAIL2("Empty \\%c{}", c);
3638 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3646 if (UCHARAT(RExC_parse) == '^') {
3649 value = value == 'p' ? 'P' : 'p'; /* toggle */
3650 while (isSPACE(UCHARAT(RExC_parse))) {
3656 Perl_sv_catpvf(aTHX_ listsv,
3657 "+utf8::%.*s\n", (int)n, RExC_parse);
3659 Perl_sv_catpvf(aTHX_ listsv,
3660 "!utf8::%.*s\n", (int)n, RExC_parse);
3663 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3664 namedclass = ANYOF_MAX; /* no official name, but it's named */
3666 case 'n': value = '\n'; break;
3667 case 'r': value = '\r'; break;
3668 case 't': value = '\t'; break;
3669 case 'f': value = '\f'; break;
3670 case 'b': value = '\b'; break;
3671 case 'e': value = ASCII_TO_NATIVE('\033');break;
3672 case 'a': value = ASCII_TO_NATIVE('\007');break;
3674 if (*RExC_parse == '{') {
3675 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3676 | PERL_SCAN_DISALLOW_PREFIX;
3677 e = strchr(RExC_parse++, '}');
3679 vFAIL("Missing right brace on \\x{}");
3681 numlen = e - RExC_parse;
3682 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3686 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3688 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3689 RExC_parse += numlen;
3693 value = UCHARAT(RExC_parse++);
3694 value = toCTRL(value);
3696 case '0': case '1': case '2': case '3': case '4':
3697 case '5': case '6': case '7': case '8': case '9':
3701 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3702 RExC_parse += numlen;
3706 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3708 "Unrecognized escape \\%c in character class passed through",
3712 } /* end of \blah */
3718 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3720 if (!SIZE_ONLY && !need_class)
3721 ANYOF_CLASS_ZERO(ret);
3725 /* a bad range like a-\d, a-[:digit:] ? */
3728 if (ckWARN(WARN_REGEXP))
3730 "False [] range \"%*.*s\"",
3731 RExC_parse - rangebegin,
3732 RExC_parse - rangebegin,
3734 if (prevvalue < 256) {
3735 ANYOF_BITMAP_SET(ret, prevvalue);
3736 ANYOF_BITMAP_SET(ret, '-');
3739 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3740 Perl_sv_catpvf(aTHX_ listsv,
3741 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3745 range = 0; /* this was not a true range */
3749 if (namedclass > OOB_NAMEDCLASS)
3750 optimize_invert = FALSE;
3751 /* Possible truncation here but in some 64-bit environments
3752 * the compiler gets heartburn about switch on 64-bit values.
3753 * A similar issue a little earlier when switching on value.
3755 switch ((I32)namedclass) {
3758 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3760 for (value = 0; value < 256; value++)
3762 ANYOF_BITMAP_SET(ret, value);
3764 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
3768 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3770 for (value = 0; value < 256; value++)
3771 if (!isALNUM(value))
3772 ANYOF_BITMAP_SET(ret, value);
3774 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3778 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3780 for (value = 0; value < 256; value++)
3781 if (isALNUMC(value))
3782 ANYOF_BITMAP_SET(ret, value);
3784 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3788 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3790 for (value = 0; value < 256; value++)
3791 if (!isALNUMC(value))
3792 ANYOF_BITMAP_SET(ret, value);
3794 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3798 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3800 for (value = 0; value < 256; value++)
3802 ANYOF_BITMAP_SET(ret, value);
3804 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3808 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3810 for (value = 0; value < 256; value++)
3811 if (!isALPHA(value))
3812 ANYOF_BITMAP_SET(ret, value);
3814 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3818 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3821 for (value = 0; value < 128; value++)
3822 ANYOF_BITMAP_SET(ret, value);
3824 for (value = 0; value < 256; value++) {
3826 ANYOF_BITMAP_SET(ret, value);
3830 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3834 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3837 for (value = 128; value < 256; value++)
3838 ANYOF_BITMAP_SET(ret, value);
3840 for (value = 0; value < 256; value++) {
3841 if (!isASCII(value))
3842 ANYOF_BITMAP_SET(ret, value);
3846 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3850 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3852 for (value = 0; value < 256; value++)
3854 ANYOF_BITMAP_SET(ret, value);
3856 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3860 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3862 for (value = 0; value < 256; value++)
3863 if (!isBLANK(value))
3864 ANYOF_BITMAP_SET(ret, value);
3866 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3870 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3872 for (value = 0; value < 256; value++)
3874 ANYOF_BITMAP_SET(ret, value);
3876 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3880 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3882 for (value = 0; value < 256; value++)
3883 if (!isCNTRL(value))
3884 ANYOF_BITMAP_SET(ret, value);
3886 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3890 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3892 /* consecutive digits assumed */
3893 for (value = '0'; value <= '9'; value++)
3894 ANYOF_BITMAP_SET(ret, value);
3896 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3900 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3902 /* consecutive digits assumed */
3903 for (value = 0; value < '0'; value++)
3904 ANYOF_BITMAP_SET(ret, value);
3905 for (value = '9' + 1; value < 256; value++)
3906 ANYOF_BITMAP_SET(ret, value);
3908 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3912 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3914 for (value = 0; value < 256; value++)
3916 ANYOF_BITMAP_SET(ret, value);
3918 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3922 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3924 for (value = 0; value < 256; value++)
3925 if (!isGRAPH(value))
3926 ANYOF_BITMAP_SET(ret, value);
3928 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3932 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3934 for (value = 0; value < 256; value++)
3936 ANYOF_BITMAP_SET(ret, value);
3938 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3942 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3944 for (value = 0; value < 256; value++)
3945 if (!isLOWER(value))
3946 ANYOF_BITMAP_SET(ret, value);
3948 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3952 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3954 for (value = 0; value < 256; value++)
3956 ANYOF_BITMAP_SET(ret, value);
3958 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3962 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3964 for (value = 0; value < 256; value++)
3965 if (!isPRINT(value))
3966 ANYOF_BITMAP_SET(ret, value);
3968 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3972 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3974 for (value = 0; value < 256; value++)
3975 if (isPSXSPC(value))
3976 ANYOF_BITMAP_SET(ret, value);
3978 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3982 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3984 for (value = 0; value < 256; value++)
3985 if (!isPSXSPC(value))
3986 ANYOF_BITMAP_SET(ret, value);
3988 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3992 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3994 for (value = 0; value < 256; value++)
3996 ANYOF_BITMAP_SET(ret, value);
3998 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
4002 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
4004 for (value = 0; value < 256; value++)
4005 if (!isPUNCT(value))
4006 ANYOF_BITMAP_SET(ret, value);
4008 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
4012 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4014 for (value = 0; value < 256; value++)
4016 ANYOF_BITMAP_SET(ret, value);
4018 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
4022 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4024 for (value = 0; value < 256; value++)
4025 if (!isSPACE(value))
4026 ANYOF_BITMAP_SET(ret, value);
4028 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4032 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4034 for (value = 0; value < 256; value++)
4036 ANYOF_BITMAP_SET(ret, value);
4038 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4042 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4044 for (value = 0; value < 256; value++)
4045 if (!isUPPER(value))
4046 ANYOF_BITMAP_SET(ret, value);
4048 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4052 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4054 for (value = 0; value < 256; value++)
4055 if (isXDIGIT(value))
4056 ANYOF_BITMAP_SET(ret, value);
4058 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4062 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4064 for (value = 0; value < 256; value++)
4065 if (!isXDIGIT(value))
4066 ANYOF_BITMAP_SET(ret, value);
4068 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4071 /* this is to handle \p and \P */
4074 vFAIL("Invalid [::] class");
4078 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4081 } /* end of namedclass \blah */
4084 if (prevvalue > (IV)value) /* b-a */ {
4085 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4086 RExC_parse - rangebegin,
4087 RExC_parse - rangebegin,
4089 range = 0; /* not a valid range */
4093 prevvalue = value; /* save the beginning of the range */
4094 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4095 RExC_parse[1] != ']') {
4098 /* a bad range like \w-, [:word:]- ? */
4099 if (namedclass > OOB_NAMEDCLASS) {
4100 if (ckWARN(WARN_REGEXP))
4102 "False [] range \"%*.*s\"",
4103 RExC_parse - rangebegin,
4104 RExC_parse - rangebegin,
4107 ANYOF_BITMAP_SET(ret, '-');
4109 range = 1; /* yeah, it's a range! */
4110 continue; /* but do it the next time */
4114 /* now is the next time */
4118 if (prevvalue < 256) {
4119 IV ceilvalue = value < 256 ? value : 255;
4122 /* In EBCDIC [\x89-\x91] should include
4123 * the \x8e but [i-j] should not. */
4124 if (literal_endpoint == 2 &&
4125 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4126 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4128 if (isLOWER(prevvalue)) {
4129 for (i = prevvalue; i <= ceilvalue; i++)
4131 ANYOF_BITMAP_SET(ret, i);
4133 for (i = prevvalue; i <= ceilvalue; i++)
4135 ANYOF_BITMAP_SET(ret, i);
4140 for (i = prevvalue; i <= ceilvalue; i++)
4141 ANYOF_BITMAP_SET(ret, i);
4143 if (value > 255 || UTF) {
4144 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4145 UV natvalue = NATIVE_TO_UNI(value);
4147 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4148 if (prevnatvalue < natvalue) { /* what about > ? */
4149 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4150 prevnatvalue, natvalue);
4152 else if (prevnatvalue == natvalue) {
4153 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4155 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4157 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4159 /* If folding and foldable and a single
4160 * character, insert also the folded version
4161 * to the charclass. */
4163 if (foldlen == (STRLEN)UNISKIP(f))
4164 Perl_sv_catpvf(aTHX_ listsv,
4167 /* Any multicharacter foldings
4168 * require the following transform:
4169 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4170 * where E folds into "pq" and F folds
4171 * into "rst", all other characters
4172 * fold to single characters. We save
4173 * away these multicharacter foldings,
4174 * to be later saved as part of the
4175 * additional "s" data. */
4178 if (!unicode_alternate)
4179 unicode_alternate = newAV();
4180 sv = newSVpvn((char*)foldbuf, foldlen);
4182 av_push(unicode_alternate, sv);
4186 /* If folding and the value is one of the Greek
4187 * sigmas insert a few more sigmas to make the
4188 * folding rules of the sigmas to work right.
4189 * Note that not all the possible combinations
4190 * are handled here: some of them are handled
4191 * by the standard folding rules, and some of
4192 * them (literal or EXACTF cases) are handled
4193 * during runtime in regexec.c:S_find_byclass(). */
4194 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4195 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4196 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4197 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4198 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4200 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4201 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4202 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4207 literal_endpoint = 0;
4211 range = 0; /* this range (if it was one) is done now */
4215 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4217 RExC_size += ANYOF_CLASS_ADD_SKIP;
4219 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4222 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4224 /* If the only flag is folding (plus possibly inversion). */
4225 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4227 for (value = 0; value < 256; ++value) {
4228 if (ANYOF_BITMAP_TEST(ret, value)) {
4229 UV fold = PL_fold[value];
4232 ANYOF_BITMAP_SET(ret, fold);
4235 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4238 /* optimize inverted simple patterns (e.g. [^a-z]) */
4239 if (!SIZE_ONLY && optimize_invert &&
4240 /* If the only flag is inversion. */
4241 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4242 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4243 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4244 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4251 /* The 0th element stores the character class description
4252 * in its textual form: used later (regexec.c:Perl_regclass_swash())
4253 * to initialize the appropriate swash (which gets stored in
4254 * the 1st element), and also useful for dumping the regnode.
4255 * The 2nd element stores the multicharacter foldings,
4256 * used later (regexec.c:S_reginclass()). */
4257 av_store(av, 0, listsv);
4258 av_store(av, 1, NULL);
4259 av_store(av, 2, (SV*)unicode_alternate);
4260 rv = newRV_noinc((SV*)av);
4261 n = add_data(pRExC_state, 1, "s");
4262 RExC_rx->data->data[n] = (void*)rv;
4270 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4272 char* retval = RExC_parse++;
4275 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4276 RExC_parse[2] == '#') {
4277 while (*RExC_parse != ')') {
4278 if (RExC_parse == RExC_end)
4279 FAIL("Sequence (?#... not terminated");
4285 if (RExC_flags & PMf_EXTENDED) {
4286 if (isSPACE(*RExC_parse)) {
4290 else if (*RExC_parse == '#') {
4291 while (RExC_parse < RExC_end)
4292 if (*RExC_parse++ == '\n') break;
4301 - reg_node - emit a node
4303 STATIC regnode * /* Location. */
4304 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4306 register regnode *ret;
4307 register regnode *ptr;
4311 SIZE_ALIGN(RExC_size);
4316 NODE_ALIGN_FILL(ret);
4318 FILL_ADVANCE_NODE(ptr, op);
4319 if (RExC_offsets) { /* MJD */
4320 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4321 "reg_node", __LINE__,
4323 RExC_emit - RExC_emit_start > RExC_offsets[0]
4324 ? "Overwriting end of array!\n" : "OK",
4325 RExC_emit - RExC_emit_start,
4326 RExC_parse - RExC_start,
4328 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4337 - reganode - emit a node with an argument
4339 STATIC regnode * /* Location. */
4340 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4342 register regnode *ret;
4343 register regnode *ptr;
4347 SIZE_ALIGN(RExC_size);
4352 NODE_ALIGN_FILL(ret);
4354 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4355 if (RExC_offsets) { /* MJD */
4356 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4360 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4361 "Overwriting end of array!\n" : "OK",
4362 RExC_emit - RExC_emit_start,
4363 RExC_parse - RExC_start,
4365 Set_Cur_Node_Offset;
4374 - reguni - emit (if appropriate) a Unicode character
4377 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4379 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4383 - reginsert - insert an operator in front of already-emitted operand
4385 * Means relocating the operand.
4388 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4390 register regnode *src;
4391 register regnode *dst;
4392 register regnode *place;
4393 register int offset = regarglen[(U8)op];
4395 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4398 RExC_size += NODE_STEP_REGNODE + offset;
4403 RExC_emit += NODE_STEP_REGNODE + offset;
4405 while (src > opnd) {
4406 StructCopy(--src, --dst, regnode);
4407 if (RExC_offsets) { /* MJD 20010112 */
4408 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4412 dst - RExC_emit_start > RExC_offsets[0]
4413 ? "Overwriting end of array!\n" : "OK",
4414 src - RExC_emit_start,
4415 dst - RExC_emit_start,
4417 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4418 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4423 place = opnd; /* Op node, where operand used to be. */
4424 if (RExC_offsets) { /* MJD */
4425 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4429 place - RExC_emit_start > RExC_offsets[0]
4430 ? "Overwriting end of array!\n" : "OK",
4431 place - RExC_emit_start,
4432 RExC_parse - RExC_start,
4434 Set_Node_Offset(place, RExC_parse);
4435 Set_Node_Length(place, 1);
4437 src = NEXTOPER(place);
4438 FILL_ADVANCE_NODE(place, op);
4439 Zero(src, offset, regnode);
4443 - regtail - set the next-pointer at the end of a node chain of p to val.
4446 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4448 register regnode *scan;
4449 register regnode *temp;
4454 /* Find last node. */
4457 temp = regnext(scan);
4463 if (reg_off_by_arg[OP(scan)]) {
4464 ARG_SET(scan, val - scan);
4467 NEXT_OFF(scan) = val - scan;
4472 - regoptail - regtail on operand of first argument; nop if operandless
4475 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4477 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4478 if (p == NULL || SIZE_ONLY)
4480 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4481 regtail(pRExC_state, NEXTOPER(p), val);
4483 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4484 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4491 - regcurly - a little FSA that accepts {\d+,?\d*}
4494 S_regcurly(pTHX_ register char *s)
4515 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4517 register U8 op = EXACT; /* Arbitrary non-END op. */
4518 register regnode *next;
4520 while (op != END && (!last || node < last)) {
4521 /* While that wasn't END last time... */
4527 next = regnext(node);
4529 if (OP(node) == OPTIMIZED)
4532 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4533 (int)(2*l + 1), "", SvPVX(sv));
4534 if (next == NULL) /* Next ptr. */
4535 PerlIO_printf(Perl_debug_log, "(0)");
4537 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4538 (void)PerlIO_putc(Perl_debug_log, '\n');
4540 if (PL_regkind[(U8)op] == BRANCHJ) {
4541 register regnode *nnode = (OP(next) == LONGJMP
4544 if (last && nnode > last)
4546 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4548 else if (PL_regkind[(U8)op] == BRANCH) {
4549 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4551 else if ( op == CURLY) { /* `next' might be very big: optimizer */
4552 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4553 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4555 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4556 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4559 else if ( op == PLUS || op == STAR) {
4560 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4562 else if (op == ANYOF) {
4563 /* arglen 1 + class block */
4564 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4565 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4566 node = NEXTOPER(node);
4568 else if (PL_regkind[(U8)op] == EXACT) {
4569 /* Literal string, where present. */
4570 node += NODE_SZ_STR(node) - 1;
4571 node = NEXTOPER(node);
4574 node = NEXTOPER(node);
4575 node += regarglen[(U8)op];
4577 if (op == CURLYX || op == OPEN)
4579 else if (op == WHILEM)
4585 #endif /* DEBUGGING */
4588 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4591 Perl_regdump(pTHX_ regexp *r)
4594 SV *sv = sv_newmortal();
4596 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4598 /* Header fields of interest. */
4599 if (r->anchored_substr)
4600 PerlIO_printf(Perl_debug_log,
4601 "anchored `%s%.*s%s'%s at %"IVdf" ",
4603 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4604 SvPVX(r->anchored_substr),
4606 SvTAIL(r->anchored_substr) ? "$" : "",
4607 (IV)r->anchored_offset);
4608 else if (r->anchored_utf8)
4609 PerlIO_printf(Perl_debug_log,
4610 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4612 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4613 SvPVX(r->anchored_utf8),
4615 SvTAIL(r->anchored_utf8) ? "$" : "",
4616 (IV)r->anchored_offset);
4617 if (r->float_substr)
4618 PerlIO_printf(Perl_debug_log,
4619 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4621 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4622 SvPVX(r->float_substr),
4624 SvTAIL(r->float_substr) ? "$" : "",
4625 (IV)r->float_min_offset, (UV)r->float_max_offset);
4626 else if (r->float_utf8)
4627 PerlIO_printf(Perl_debug_log,
4628 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4630 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4631 SvPVX(r->float_utf8),
4633 SvTAIL(r->float_utf8) ? "$" : "",
4634 (IV)r->float_min_offset, (UV)r->float_max_offset);
4635 if (r->check_substr || r->check_utf8)
4636 PerlIO_printf(Perl_debug_log,
4637 r->check_substr == r->float_substr
4638 && r->check_utf8 == r->float_utf8
4639 ? "(checking floating" : "(checking anchored");
4640 if (r->reganch & ROPT_NOSCAN)
4641 PerlIO_printf(Perl_debug_log, " noscan");
4642 if (r->reganch & ROPT_CHECK_ALL)
4643 PerlIO_printf(Perl_debug_log, " isall");
4644 if (r->check_substr || r->check_utf8)
4645 PerlIO_printf(Perl_debug_log, ") ");
4647 if (r->regstclass) {
4648 regprop(sv, r->regstclass);
4649 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4651 if (r->reganch & ROPT_ANCH) {
4652 PerlIO_printf(Perl_debug_log, "anchored");
4653 if (r->reganch & ROPT_ANCH_BOL)
4654 PerlIO_printf(Perl_debug_log, "(BOL)");
4655 if (r->reganch & ROPT_ANCH_MBOL)
4656 PerlIO_printf(Perl_debug_log, "(MBOL)");
4657 if (r->reganch & ROPT_ANCH_SBOL)
4658 PerlIO_printf(Perl_debug_log, "(SBOL)");
4659 if (r->reganch & ROPT_ANCH_GPOS)
4660 PerlIO_printf(Perl_debug_log, "(GPOS)");
4661 PerlIO_putc(Perl_debug_log, ' ');
4663 if (r->reganch & ROPT_GPOS_SEEN)
4664 PerlIO_printf(Perl_debug_log, "GPOS ");
4665 if (r->reganch & ROPT_SKIP)
4666 PerlIO_printf(Perl_debug_log, "plus ");
4667 if (r->reganch & ROPT_IMPLICIT)
4668 PerlIO_printf(Perl_debug_log, "implicit ");
4669 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4670 if (r->reganch & ROPT_EVAL_SEEN)
4671 PerlIO_printf(Perl_debug_log, "with eval ");
4672 PerlIO_printf(Perl_debug_log, "\n");
4675 U32 len = r->offsets[0];
4676 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4677 for (i = 1; i <= len; i++)
4678 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4679 (UV)r->offsets[i*2-1],
4680 (UV)r->offsets[i*2]);
4681 PerlIO_printf(Perl_debug_log, "\n");
4683 #endif /* DEBUGGING */
4689 S_put_byte(pTHX_ SV *sv, int c)
4691 if (isCNTRL(c) || c == 255 || !isPRINT(c))
4692 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4693 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4694 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4696 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4699 #endif /* DEBUGGING */
4702 - regprop - printable representation of opcode
4705 Perl_regprop(pTHX_ SV *sv, regnode *o)
4710 sv_setpvn(sv, "", 0);
4711 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4712 /* It would be nice to FAIL() here, but this may be called from
4713 regexec.c, and it would be hard to supply pRExC_state. */
4714 Perl_croak(aTHX_ "Corrupted regexp opcode");
4715 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4717 k = PL_regkind[(U8)OP(o)];
4720 SV *dsv = sv_2mortal(newSVpvn("", 0));
4721 /* Using is_utf8_string() is a crude hack but it may
4722 * be the best for now since we have no flag "this EXACTish
4723 * node was UTF-8" --jhi */
4724 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4726 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4727 UNI_DISPLAY_REGEX) :
4732 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4737 else if (k == CURLY) {
4738 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4739 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4740 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4742 else if (k == WHILEM && o->flags) /* Ordinal/of */
4743 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4744 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4745 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4746 else if (k == LOGICAL)
4747 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4748 else if (k == ANYOF) {
4749 int i, rangestart = -1;
4750 U8 flags = ANYOF_FLAGS(o);
4751 const char * const anyofs[] = { /* Should be synchronized with
4752 * ANYOF_ #xdefines in regcomp.h */
4785 if (flags & ANYOF_LOCALE)
4786 sv_catpv(sv, "{loc}");
4787 if (flags & ANYOF_FOLD)
4788 sv_catpv(sv, "{i}");
4789 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4790 if (flags & ANYOF_INVERT)
4792 for (i = 0; i <= 256; i++) {
4793 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4794 if (rangestart == -1)
4796 } else if (rangestart != -1) {
4797 if (i <= rangestart + 3)
4798 for (; rangestart < i; rangestart++)
4799 put_byte(sv, rangestart);
4801 put_byte(sv, rangestart);
4803 put_byte(sv, i - 1);
4809 if (o->flags & ANYOF_CLASS)
4810 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4811 if (ANYOF_CLASS_TEST(o,i))
4812 sv_catpv(sv, anyofs[i]);
4814 if (flags & ANYOF_UNICODE)
4815 sv_catpv(sv, "{unicode}");
4816 else if (flags & ANYOF_UNICODE_ALL)
4817 sv_catpv(sv, "{unicode_all}");
4821 SV *sw = regclass_swash(o, FALSE, &lv, 0);
4825 U8 s[UTF8_MAXLEN+1];
4827 for (i = 0; i <= 256; i++) { /* just the first 256 */
4828 U8 *e = uvchr_to_utf8(s, i);
4830 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4831 if (rangestart == -1)
4833 } else if (rangestart != -1) {
4836 if (i <= rangestart + 3)
4837 for (; rangestart < i; rangestart++) {
4838 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4842 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4845 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4852 sv_catpv(sv, "..."); /* et cetera */
4856 char *s = savepv(SvPVX(lv));
4859 while(*s && *s != '\n') s++;
4880 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4882 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4883 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4884 #endif /* DEBUGGING */
4888 Perl_re_intuit_string(pTHX_ regexp *prog)
4889 { /* Assume that RE_INTUIT is set */
4892 char *s = SvPV(prog->check_substr
4893 ? prog->check_substr : prog->check_utf8, n_a);
4895 if (!PL_colorset) reginitcolors();
4896 PerlIO_printf(Perl_debug_log,
4897 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4899 prog->check_substr ? "" : "utf8 ",
4900 PL_colors[5],PL_colors[0],
4903 (strlen(s) > 60 ? "..." : ""));
4906 return prog->check_substr ? prog->check_substr : prog->check_utf8;
4910 Perl_pregfree(pTHX_ struct regexp *r)
4913 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4916 if (!r || (--r->refcnt > 0))
4922 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4923 r->prelen, 60, UNI_DISPLAY_REGEX)
4924 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4928 PerlIO_printf(Perl_debug_log,
4929 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4930 PL_colors[4],PL_colors[5],PL_colors[0],
4933 len > 60 ? "..." : "");
4937 Safefree(r->precomp);
4938 if (r->offsets) /* 20010421 MJD */
4939 Safefree(r->offsets);
4940 RX_MATCH_COPY_FREE(r);
4941 #ifdef PERL_COPY_ON_WRITE
4943 SvREFCNT_dec(r->saved_copy);
4946 if (r->anchored_substr)
4947 SvREFCNT_dec(r->anchored_substr);
4948 if (r->anchored_utf8)
4949 SvREFCNT_dec(r->anchored_utf8);
4950 if (r->float_substr)
4951 SvREFCNT_dec(r->float_substr);
4953 SvREFCNT_dec(r->float_utf8);
4954 Safefree(r->substrs);
4957 int n = r->data->count;
4958 PAD* new_comppad = NULL;
4962 /* If you add a ->what type here, update the comment in regcomp.h */
4963 switch (r->data->what[n]) {
4965 SvREFCNT_dec((SV*)r->data->data[n]);
4968 Safefree(r->data->data[n]);
4971 new_comppad = (AV*)r->data->data[n];
4974 if (new_comppad == NULL)
4975 Perl_croak(aTHX_ "panic: pregfree comppad");
4976 PAD_SAVE_LOCAL(old_comppad,
4977 /* Watch out for global destruction's random ordering. */
4978 (SvTYPE(new_comppad) == SVt_PVAV) ?
4979 new_comppad : Null(PAD *)
4981 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4982 op_free((OP_4tree*)r->data->data[n]);
4985 PAD_RESTORE_LOCAL(old_comppad);
4986 SvREFCNT_dec((SV*)new_comppad);
4992 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4995 Safefree(r->data->what);
4998 Safefree(r->startp);
5004 - regnext - dig the "next" pointer out of a node
5006 * [Note, when REGALIGN is defined there are two places in regmatch()
5007 * that bypass this code for speed.]
5010 Perl_regnext(pTHX_ register regnode *p)
5012 register I32 offset;
5014 if (p == &PL_regdummy)
5017 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5025 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
5028 STRLEN l1 = strlen(pat1);
5029 STRLEN l2 = strlen(pat2);
5038 Copy(pat1, buf, l1 , char);
5039 Copy(pat2, buf + l1, l2 , char);
5040 buf[l1 + l2] = '\n';
5041 buf[l1 + l2 + 1] = '\0';
5043 /* ANSI variant takes additional second argument */
5044 va_start(args, pat2);
5048 msv = vmess(buf, &args);
5050 message = SvPV(msv,l1);
5053 Copy(message, buf, l1 , char);
5054 buf[l1-1] = '\0'; /* Overwrite \n */
5055 Perl_croak(aTHX_ "%s", buf);
5058 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5061 Perl_save_re_context(pTHX)
5063 SAVEI32(PL_reg_flags); /* from regexec.c */
5065 SAVEPPTR(PL_reginput); /* String-input pointer. */
5066 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5067 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
5068 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5069 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5070 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
5071 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
5072 SAVEPPTR(PL_regtill); /* How far we are required to go. */
5073 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
5074 PL_reg_start_tmp = 0;
5075 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5076 PL_reg_start_tmpl = 0;
5077 SAVEVPTR(PL_regdata);
5078 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5079 SAVEI32(PL_regnarrate); /* from regexec.c */
5080 SAVEVPTR(PL_regprogram); /* from regexec.c */
5081 SAVEINT(PL_regindent); /* from regexec.c */
5082 SAVEVPTR(PL_regcc); /* from regexec.c */
5083 SAVEVPTR(PL_curcop);
5084 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5085 SAVEVPTR(PL_reg_re); /* from regexec.c */
5086 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5087 SAVESPTR(PL_reg_sv); /* from regexec.c */
5088 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
5089 SAVEVPTR(PL_reg_magic); /* from regexec.c */
5090 SAVEI32(PL_reg_oldpos); /* from regexec.c */
5091 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5092 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5093 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
5094 PL_reg_oldsaved = Nullch;
5095 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
5096 PL_reg_oldsavedlen = 0;
5097 #ifdef PERL_COPY_ON_WRITE
5101 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
5103 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
5104 PL_reg_leftiter = 0;
5105 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
5106 PL_reg_poscache = Nullch;
5107 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
5108 PL_reg_poscache_size = 0;
5109 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5110 SAVEI32(PL_regnpar); /* () count. */
5111 SAVEI32(PL_regsize); /* from regexec.c */
5114 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5120 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5121 for (i = 1; i <= rx->nparens; i++) {
5122 sprintf(digits, "%lu", (long)i);
5123 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5130 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
5135 clear_re(pTHX_ void *r)
5137 ReREFCNT_dec((regexp *)r);