5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_pregcomp my_regcomp
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_pregfree my_regfree
39 # define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 # define Perl_regnext my_regnext
42 # define Perl_save_re_context my_save_re_context
43 # define Perl_reginitcolors my_reginitcolors
45 # define PERL_NO_GET_CONTEXT
50 * pregcomp and pregexec -- regsub and regerror are not used in perl
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73 **** 2000, 2001, 2002, 2003, by Larry Wall and others
75 **** You may distribute under the terms of either the GNU General Public
76 **** License or the Artistic License, as specified in the README file.
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
84 #define PERL_IN_REGCOMP_C
87 #ifndef PERL_IN_XSUB_RE
99 # if defined(BUGGY_MSC6)
100 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
101 # pragma optimize("a",off)
102 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
103 # pragma optimize("w",on )
104 # endif /* BUGGY_MSC6 */
108 #define STATIC static
111 typedef struct RExC_state_t {
112 U32 flags; /* are we folding, multilining? */
113 char *precomp; /* uncompiled string. */
115 char *start; /* Start of input for compile */
116 char *end; /* End of input for compile */
117 char *parse; /* Input-scan pointer. */
118 I32 whilem_seen; /* number of WHILEM in this expr */
119 regnode *emit_start; /* Start of emitted-code area */
120 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
121 I32 naughty; /* How bad is this pattern? */
122 I32 sawback; /* Did we see \1, ...? */
124 I32 size; /* Code size. */
125 I32 npar; /* () count. */
131 char *starttry; /* -Dr: where regtry was called. */
132 #define RExC_starttry (pRExC_state->starttry)
136 #define RExC_flags (pRExC_state->flags)
137 #define RExC_precomp (pRExC_state->precomp)
138 #define RExC_rx (pRExC_state->rx)
139 #define RExC_start (pRExC_state->start)
140 #define RExC_end (pRExC_state->end)
141 #define RExC_parse (pRExC_state->parse)
142 #define RExC_whilem_seen (pRExC_state->whilem_seen)
143 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
144 #define RExC_emit (pRExC_state->emit)
145 #define RExC_emit_start (pRExC_state->emit_start)
146 #define RExC_naughty (pRExC_state->naughty)
147 #define RExC_sawback (pRExC_state->sawback)
148 #define RExC_seen (pRExC_state->seen)
149 #define RExC_size (pRExC_state->size)
150 #define RExC_npar (pRExC_state->npar)
151 #define RExC_extralen (pRExC_state->extralen)
152 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
153 #define RExC_seen_evals (pRExC_state->seen_evals)
154 #define RExC_utf8 (pRExC_state->utf8)
156 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
157 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
158 ((*s) == '{' && regcurly(s)))
161 #undef SPSTART /* dratted cpp namespace... */
164 * Flags to be passed up and down.
166 #define WORST 0 /* Worst case. */
167 #define HASWIDTH 0x1 /* Known to match non-null strings. */
168 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
169 #define SPSTART 0x4 /* Starts with * or +. */
170 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
172 /* Length of a variant. */
174 typedef struct scan_data_t {
180 I32 last_end; /* min value, <0 unless valid. */
183 SV **longest; /* Either &l_fixed, or &l_float. */
187 I32 offset_float_min;
188 I32 offset_float_max;
192 struct regnode_charclass_class *start_class;
196 * Forward declarations for pregcomp()'s friends.
199 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
202 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
203 #define SF_BEFORE_SEOL 0x1
204 #define SF_BEFORE_MEOL 0x2
205 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
206 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
209 # define SF_FIX_SHIFT_EOL (0+2)
210 # define SF_FL_SHIFT_EOL (0+4)
212 # define SF_FIX_SHIFT_EOL (+2)
213 # define SF_FL_SHIFT_EOL (+4)
216 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
217 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
219 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
220 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
221 #define SF_IS_INF 0x40
222 #define SF_HAS_PAR 0x80
223 #define SF_IN_PAR 0x100
224 #define SF_HAS_EVAL 0x200
225 #define SCF_DO_SUBSTR 0x400
226 #define SCF_DO_STCLASS_AND 0x0800
227 #define SCF_DO_STCLASS_OR 0x1000
228 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
229 #define SCF_WHILEM_VISITED_POS 0x2000
231 #define UTF (RExC_utf8 != 0)
232 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
233 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
235 #define OOB_UNICODE 12345678
236 #define OOB_NAMEDCLASS -1
238 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
239 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
242 /* length of regex to show in messages that don't mark a position within */
243 #define RegexLengthToShowInErrorMessages 127
246 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
247 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
248 * op/pragma/warn/regcomp.
250 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
251 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
253 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
256 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
257 * arg. Show regex, up to a maximum length. If it's too long, chop and add
260 #define FAIL(msg) STMT_START { \
261 char *ellipses = ""; \
262 IV len = RExC_end - RExC_precomp; \
265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
266 if (len > RegexLengthToShowInErrorMessages) { \
267 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
268 len = RegexLengthToShowInErrorMessages - 10; \
271 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
272 msg, (int)len, RExC_precomp, ellipses); \
276 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
277 * args. Show regex, up to a maximum length. If it's too long, chop and add
280 #define FAIL2(pat,msg) STMT_START { \
281 char *ellipses = ""; \
282 IV len = RExC_end - RExC_precomp; \
285 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
286 if (len > RegexLengthToShowInErrorMessages) { \
287 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
288 len = RegexLengthToShowInErrorMessages - 10; \
291 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
292 msg, (int)len, RExC_precomp, ellipses); \
297 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
299 #define Simple_vFAIL(m) STMT_START { \
300 IV offset = RExC_parse - RExC_precomp; \
301 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
302 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
306 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
308 #define vFAIL(m) STMT_START { \
310 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
315 * Like Simple_vFAIL(), but accepts two arguments.
317 #define Simple_vFAIL2(m,a1) STMT_START { \
318 IV offset = RExC_parse - RExC_precomp; \
319 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
320 (int)offset, RExC_precomp, RExC_precomp + offset); \
324 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
326 #define vFAIL2(m,a1) STMT_START { \
328 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
329 Simple_vFAIL2(m, a1); \
334 * Like Simple_vFAIL(), but accepts three arguments.
336 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
337 IV offset = RExC_parse - RExC_precomp; \
338 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
339 (int)offset, RExC_precomp, RExC_precomp + offset); \
343 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
345 #define vFAIL3(m,a1,a2) STMT_START { \
347 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
348 Simple_vFAIL3(m, a1, a2); \
352 * Like Simple_vFAIL(), but accepts four arguments.
354 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
355 IV offset = RExC_parse - RExC_precomp; \
356 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
357 (int)offset, RExC_precomp, RExC_precomp + offset); \
361 * Like Simple_vFAIL(), but accepts five arguments.
363 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
364 IV offset = RExC_parse - RExC_precomp; \
365 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
366 (int)offset, RExC_precomp, RExC_precomp + offset); \
370 #define vWARN(loc,m) STMT_START { \
371 IV offset = loc - RExC_precomp; \
372 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
373 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
376 #define vWARNdep(loc,m) STMT_START { \
377 IV offset = loc - RExC_precomp; \
378 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
379 "%s" REPORT_LOCATION, \
380 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
384 #define vWARN2(loc, m, a1) STMT_START { \
385 IV offset = loc - RExC_precomp; \
386 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
387 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
390 #define vWARN3(loc, m, a1, a2) STMT_START { \
391 IV offset = loc - RExC_precomp; \
392 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
393 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
396 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
397 IV offset = loc - RExC_precomp; \
398 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
399 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
402 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
403 IV offset = loc - RExC_precomp; \
404 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
405 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
409 /* Allow for side effects in s */
410 #define REGC(c,s) STMT_START { \
411 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
414 /* Macros for recording node offsets. 20001227 mjd@plover.com
415 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
416 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
417 * Element 0 holds the number n.
420 #define MJD_OFFSET_DEBUG(x)
421 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
424 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
426 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
427 __LINE__, (node), (byte))); \
429 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
431 RExC_offsets[2*(node)-1] = (byte); \
436 #define Set_Node_Offset(node,byte) \
437 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
438 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
440 #define Set_Node_Length_To_R(node,len) STMT_START { \
442 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
443 __LINE__, (node), (len))); \
445 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
447 RExC_offsets[2*(node)] = (len); \
452 #define Set_Node_Length(node,len) \
453 Set_Node_Length_To_R((node)-RExC_emit_start, len)
454 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
455 #define Set_Node_Cur_Length(node) \
456 Set_Node_Length(node, RExC_parse - parse_start)
458 /* Get offsets and lengths */
459 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
460 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
462 static void clear_re(pTHX_ void *r);
464 /* Mark that we cannot extend a found fixed substring at this point.
465 Updata the longest found anchored substring and the longest found
466 floating substrings if needed. */
469 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
471 STRLEN l = CHR_SVLEN(data->last_found);
472 STRLEN old_l = CHR_SVLEN(*data->longest);
474 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
475 sv_setsv(*data->longest, data->last_found);
476 if (*data->longest == data->longest_fixed) {
477 data->offset_fixed = l ? data->last_start_min : data->pos_min;
478 if (data->flags & SF_BEFORE_EOL)
480 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
482 data->flags &= ~SF_FIX_BEFORE_EOL;
485 data->offset_float_min = l ? data->last_start_min : data->pos_min;
486 data->offset_float_max = (l
487 ? data->last_start_max
488 : data->pos_min + data->pos_delta);
489 if ((U32)data->offset_float_max > (U32)I32_MAX)
490 data->offset_float_max = I32_MAX;
491 if (data->flags & SF_BEFORE_EOL)
493 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
495 data->flags &= ~SF_FL_BEFORE_EOL;
498 SvCUR_set(data->last_found, 0);
500 SV * sv = data->last_found;
502 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
503 if (mg && mg->mg_len > 0)
507 data->flags &= ~SF_BEFORE_EOL;
510 /* Can match anything (initialization) */
512 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
514 ANYOF_CLASS_ZERO(cl);
515 ANYOF_BITMAP_SETALL(cl);
516 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
518 cl->flags |= ANYOF_LOCALE;
521 /* Can match anything (initialization) */
523 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
527 for (value = 0; value <= ANYOF_MAX; value += 2)
528 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
530 if (!(cl->flags & ANYOF_UNICODE_ALL))
532 if (!ANYOF_BITMAP_TESTALLSET(cl))
537 /* Can match anything (initialization) */
539 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
541 Zero(cl, 1, struct regnode_charclass_class);
543 cl_anything(pRExC_state, cl);
547 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
549 Zero(cl, 1, struct regnode_charclass_class);
551 cl_anything(pRExC_state, cl);
553 cl->flags |= ANYOF_LOCALE;
556 /* 'And' a given class with another one. Can create false positives */
557 /* We assume that cl is not inverted */
559 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
560 struct regnode_charclass_class *and_with)
562 if (!(and_with->flags & ANYOF_CLASS)
563 && !(cl->flags & ANYOF_CLASS)
564 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
565 && !(and_with->flags & ANYOF_FOLD)
566 && !(cl->flags & ANYOF_FOLD)) {
569 if (and_with->flags & ANYOF_INVERT)
570 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
571 cl->bitmap[i] &= ~and_with->bitmap[i];
573 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
574 cl->bitmap[i] &= and_with->bitmap[i];
575 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
576 if (!(and_with->flags & ANYOF_EOS))
577 cl->flags &= ~ANYOF_EOS;
579 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
580 !(and_with->flags & ANYOF_INVERT)) {
581 cl->flags &= ~ANYOF_UNICODE_ALL;
582 cl->flags |= ANYOF_UNICODE;
583 ARG_SET(cl, ARG(and_with));
585 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
586 !(and_with->flags & ANYOF_INVERT))
587 cl->flags &= ~ANYOF_UNICODE_ALL;
588 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
589 !(and_with->flags & ANYOF_INVERT))
590 cl->flags &= ~ANYOF_UNICODE;
593 /* 'OR' a given class with another one. Can create false positives */
594 /* We assume that cl is not inverted */
596 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
598 if (or_with->flags & ANYOF_INVERT) {
600 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
601 * <= (B1 | !B2) | (CL1 | !CL2)
602 * which is wasteful if CL2 is small, but we ignore CL2:
603 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
604 * XXXX Can we handle case-fold? Unclear:
605 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
606 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
608 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
609 && !(or_with->flags & ANYOF_FOLD)
610 && !(cl->flags & ANYOF_FOLD) ) {
613 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
614 cl->bitmap[i] |= ~or_with->bitmap[i];
615 } /* XXXX: logic is complicated otherwise */
617 cl_anything(pRExC_state, cl);
620 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
621 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
622 && (!(or_with->flags & ANYOF_FOLD)
623 || (cl->flags & ANYOF_FOLD)) ) {
626 /* OR char bitmap and class bitmap separately */
627 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
628 cl->bitmap[i] |= or_with->bitmap[i];
629 if (or_with->flags & ANYOF_CLASS) {
630 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
631 cl->classflags[i] |= or_with->classflags[i];
632 cl->flags |= ANYOF_CLASS;
635 else { /* XXXX: logic is complicated, leave it along for a moment. */
636 cl_anything(pRExC_state, cl);
639 if (or_with->flags & ANYOF_EOS)
640 cl->flags |= ANYOF_EOS;
642 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
643 ARG(cl) != ARG(or_with)) {
644 cl->flags |= ANYOF_UNICODE_ALL;
645 cl->flags &= ~ANYOF_UNICODE;
647 if (or_with->flags & ANYOF_UNICODE_ALL) {
648 cl->flags |= ANYOF_UNICODE_ALL;
649 cl->flags &= ~ANYOF_UNICODE;
654 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
655 * These need to be revisited when a newer toolchain becomes available.
657 #if defined(__sparc64__) && defined(__GNUC__)
658 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
659 # undef SPARC64_GCC_WORKAROUND
660 # define SPARC64_GCC_WORKAROUND 1
664 /* REx optimizer. Converts nodes into quickier variants "in place".
665 Finds fixed substrings. */
667 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
668 to the position after last scanned or to NULL. */
671 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
672 /* scanp: Start here (read-write). */
673 /* deltap: Write maxlen-minlen here. */
674 /* last: Stop before this one. */
676 I32 min = 0, pars = 0, code;
677 regnode *scan = *scanp, *next;
679 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
680 int is_inf_internal = 0; /* The studied chunk is infinite */
681 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
682 scan_data_t data_fake;
683 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
685 while (scan && OP(scan) != END && scan < last) {
686 /* Peephole optimizer: */
688 if (PL_regkind[(U8)OP(scan)] == EXACT) {
689 /* Merge several consecutive EXACTish nodes into one. */
690 regnode *n = regnext(scan);
693 regnode *stop = scan;
696 next = scan + NODE_SZ_STR(scan);
697 /* Skip NOTHING, merge EXACT*. */
699 ( PL_regkind[(U8)OP(n)] == NOTHING ||
700 (stringok && (OP(n) == OP(scan))))
702 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
703 if (OP(n) == TAIL || n > next)
705 if (PL_regkind[(U8)OP(n)] == NOTHING) {
706 NEXT_OFF(scan) += NEXT_OFF(n);
707 next = n + NODE_STEP_REGNODE;
715 int oldl = STR_LEN(scan);
716 regnode *nnext = regnext(n);
718 if (oldl + STR_LEN(n) > U8_MAX)
720 NEXT_OFF(scan) += NEXT_OFF(n);
721 STR_LEN(scan) += STR_LEN(n);
722 next = n + NODE_SZ_STR(n);
723 /* Now we can overwrite *n : */
724 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
732 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
734 Two problematic code points in Unicode casefolding of EXACT nodes:
736 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
737 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
743 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
744 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
746 This means that in case-insensitive matching (or "loose matching",
747 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
748 length of the above casefolded versions) can match a target string
749 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
750 This would rather mess up the minimum length computation.
752 What we'll do is to look for the tail four bytes, and then peek
753 at the preceding two bytes to see whether we need to decrease
754 the minimum length by four (six minus two).
756 Thanks to the design of UTF-8, there cannot be false matches:
757 A sequence of valid UTF-8 bytes cannot be a subsequence of
758 another valid sequence of UTF-8 bytes.
761 char *s0 = STRING(scan), *s, *t;
762 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
763 char *t0 = "\xcc\x88\xcc\x81";
767 s < s2 && (t = ninstr(s, s1, t0, t1));
769 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
770 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
777 n = scan + NODE_SZ_STR(scan);
779 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
787 /* Follow the next-chain of the current node and optimize
788 away all the NOTHINGs from it. */
789 if (OP(scan) != CURLYX) {
790 int max = (reg_off_by_arg[OP(scan)]
792 /* I32 may be smaller than U16 on CRAYs! */
793 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
794 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
798 /* Skip NOTHING and LONGJMP. */
799 while ((n = regnext(n))
800 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
801 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
804 if (reg_off_by_arg[OP(scan)])
807 NEXT_OFF(scan) = off;
809 /* The principal pseudo-switch. Cannot be a switch, since we
810 look into several different things. */
811 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
812 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
813 next = regnext(scan);
816 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
817 I32 max1 = 0, min1 = I32_MAX, num = 0;
818 struct regnode_charclass_class accum;
820 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
821 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
822 if (flags & SCF_DO_STCLASS)
823 cl_init_zero(pRExC_state, &accum);
824 while (OP(scan) == code) {
825 I32 deltanext, minnext, f = 0, fake;
826 struct regnode_charclass_class this_class;
831 data_fake.whilem_c = data->whilem_c;
832 data_fake.last_closep = data->last_closep;
835 data_fake.last_closep = &fake;
836 next = regnext(scan);
837 scan = NEXTOPER(scan);
839 scan = NEXTOPER(scan);
840 if (flags & SCF_DO_STCLASS) {
841 cl_init(pRExC_state, &this_class);
842 data_fake.start_class = &this_class;
843 f = SCF_DO_STCLASS_AND;
845 if (flags & SCF_WHILEM_VISITED_POS)
846 f |= SCF_WHILEM_VISITED_POS;
847 /* we suppose the run is continuous, last=next...*/
848 minnext = study_chunk(pRExC_state, &scan, &deltanext,
849 next, &data_fake, f);
852 if (max1 < minnext + deltanext)
853 max1 = minnext + deltanext;
854 if (deltanext == I32_MAX)
855 is_inf = is_inf_internal = 1;
857 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
859 if (data && (data_fake.flags & SF_HAS_EVAL))
860 data->flags |= SF_HAS_EVAL;
862 data->whilem_c = data_fake.whilem_c;
863 if (flags & SCF_DO_STCLASS)
864 cl_or(pRExC_state, &accum, &this_class);
868 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
870 if (flags & SCF_DO_SUBSTR) {
871 data->pos_min += min1;
872 data->pos_delta += max1 - min1;
873 if (max1 != min1 || is_inf)
874 data->longest = &(data->longest_float);
877 delta += max1 - min1;
878 if (flags & SCF_DO_STCLASS_OR) {
879 cl_or(pRExC_state, data->start_class, &accum);
881 cl_and(data->start_class, &and_with);
882 flags &= ~SCF_DO_STCLASS;
885 else if (flags & SCF_DO_STCLASS_AND) {
887 cl_and(data->start_class, &accum);
888 flags &= ~SCF_DO_STCLASS;
891 /* Switch to OR mode: cache the old value of
892 * data->start_class */
893 StructCopy(data->start_class, &and_with,
894 struct regnode_charclass_class);
895 flags &= ~SCF_DO_STCLASS_AND;
896 StructCopy(&accum, data->start_class,
897 struct regnode_charclass_class);
898 flags |= SCF_DO_STCLASS_OR;
899 data->start_class->flags |= ANYOF_EOS;
903 else if (code == BRANCHJ) /* single branch is optimized. */
904 scan = NEXTOPER(NEXTOPER(scan));
905 else /* single branch is optimized. */
906 scan = NEXTOPER(scan);
909 else if (OP(scan) == EXACT) {
910 I32 l = STR_LEN(scan);
911 UV uc = *((U8*)STRING(scan));
913 U8 *s = (U8*)STRING(scan);
914 l = utf8_length(s, s + l);
915 uc = utf8_to_uvchr(s, NULL);
918 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
919 /* The code below prefers earlier match for fixed
920 offset, later match for variable offset. */
921 if (data->last_end == -1) { /* Update the start info. */
922 data->last_start_min = data->pos_min;
923 data->last_start_max = is_inf
924 ? I32_MAX : data->pos_min + data->pos_delta;
926 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
928 SV * sv = data->last_found;
929 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
930 mg_find(sv, PERL_MAGIC_utf8) : NULL;
931 if (mg && mg->mg_len >= 0)
932 mg->mg_len += utf8_length((U8*)STRING(scan),
933 (U8*)STRING(scan)+STR_LEN(scan));
936 SvUTF8_on(data->last_found);
937 data->last_end = data->pos_min + l;
938 data->pos_min += l; /* As in the first entry. */
939 data->flags &= ~SF_BEFORE_EOL;
941 if (flags & SCF_DO_STCLASS_AND) {
942 /* Check whether it is compatible with what we know already! */
946 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
947 && !ANYOF_BITMAP_TEST(data->start_class, uc)
948 && (!(data->start_class->flags & ANYOF_FOLD)
949 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
952 ANYOF_CLASS_ZERO(data->start_class);
953 ANYOF_BITMAP_ZERO(data->start_class);
955 ANYOF_BITMAP_SET(data->start_class, uc);
956 data->start_class->flags &= ~ANYOF_EOS;
958 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
960 else if (flags & SCF_DO_STCLASS_OR) {
961 /* false positive possible if the class is case-folded */
963 ANYOF_BITMAP_SET(data->start_class, uc);
965 data->start_class->flags |= ANYOF_UNICODE_ALL;
966 data->start_class->flags &= ~ANYOF_EOS;
967 cl_and(data->start_class, &and_with);
969 flags &= ~SCF_DO_STCLASS;
971 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
972 I32 l = STR_LEN(scan);
973 UV uc = *((U8*)STRING(scan));
975 /* Search for fixed substrings supports EXACT only. */
976 if (flags & SCF_DO_SUBSTR)
977 scan_commit(pRExC_state, data);
979 U8 *s = (U8 *)STRING(scan);
980 l = utf8_length(s, s + l);
981 uc = utf8_to_uvchr(s, NULL);
984 if (data && (flags & SCF_DO_SUBSTR))
986 if (flags & SCF_DO_STCLASS_AND) {
987 /* Check whether it is compatible with what we know already! */
991 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
992 && !ANYOF_BITMAP_TEST(data->start_class, uc)
993 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
995 ANYOF_CLASS_ZERO(data->start_class);
996 ANYOF_BITMAP_ZERO(data->start_class);
998 ANYOF_BITMAP_SET(data->start_class, uc);
999 data->start_class->flags &= ~ANYOF_EOS;
1000 data->start_class->flags |= ANYOF_FOLD;
1001 if (OP(scan) == EXACTFL)
1002 data->start_class->flags |= ANYOF_LOCALE;
1005 else if (flags & SCF_DO_STCLASS_OR) {
1006 if (data->start_class->flags & ANYOF_FOLD) {
1007 /* false positive possible if the class is case-folded.
1008 Assume that the locale settings are the same... */
1010 ANYOF_BITMAP_SET(data->start_class, uc);
1011 data->start_class->flags &= ~ANYOF_EOS;
1013 cl_and(data->start_class, &and_with);
1015 flags &= ~SCF_DO_STCLASS;
1017 else if (strchr((char*)PL_varies,OP(scan))) {
1018 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1019 I32 f = flags, pos_before = 0;
1020 regnode *oscan = scan;
1021 struct regnode_charclass_class this_class;
1022 struct regnode_charclass_class *oclass = NULL;
1023 I32 next_is_eval = 0;
1025 switch (PL_regkind[(U8)OP(scan)]) {
1026 case WHILEM: /* End of (?:...)* . */
1027 scan = NEXTOPER(scan);
1030 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1031 next = NEXTOPER(scan);
1032 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1034 maxcount = REG_INFTY;
1035 next = regnext(scan);
1036 scan = NEXTOPER(scan);
1040 if (flags & SCF_DO_SUBSTR)
1045 if (flags & SCF_DO_STCLASS) {
1047 maxcount = REG_INFTY;
1048 next = regnext(scan);
1049 scan = NEXTOPER(scan);
1052 is_inf = is_inf_internal = 1;
1053 scan = regnext(scan);
1054 if (flags & SCF_DO_SUBSTR) {
1055 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1056 data->longest = &(data->longest_float);
1058 goto optimize_curly_tail;
1060 mincount = ARG1(scan);
1061 maxcount = ARG2(scan);
1062 next = regnext(scan);
1063 if (OP(scan) == CURLYX) {
1064 I32 lp = (data ? *(data->last_closep) : 0);
1066 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1068 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1069 next_is_eval = (OP(scan) == EVAL);
1071 if (flags & SCF_DO_SUBSTR) {
1072 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1073 pos_before = data->pos_min;
1077 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1079 data->flags |= SF_IS_INF;
1081 if (flags & SCF_DO_STCLASS) {
1082 cl_init(pRExC_state, &this_class);
1083 oclass = data->start_class;
1084 data->start_class = &this_class;
1085 f |= SCF_DO_STCLASS_AND;
1086 f &= ~SCF_DO_STCLASS_OR;
1088 /* These are the cases when once a subexpression
1089 fails at a particular position, it cannot succeed
1090 even after backtracking at the enclosing scope.
1092 XXXX what if minimal match and we are at the
1093 initial run of {n,m}? */
1094 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1095 f &= ~SCF_WHILEM_VISITED_POS;
1097 /* This will finish on WHILEM, setting scan, or on NULL: */
1098 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1100 ? (f & ~SCF_DO_SUBSTR) : f);
1102 if (flags & SCF_DO_STCLASS)
1103 data->start_class = oclass;
1104 if (mincount == 0 || minnext == 0) {
1105 if (flags & SCF_DO_STCLASS_OR) {
1106 cl_or(pRExC_state, data->start_class, &this_class);
1108 else if (flags & SCF_DO_STCLASS_AND) {
1109 /* Switch to OR mode: cache the old value of
1110 * data->start_class */
1111 StructCopy(data->start_class, &and_with,
1112 struct regnode_charclass_class);
1113 flags &= ~SCF_DO_STCLASS_AND;
1114 StructCopy(&this_class, data->start_class,
1115 struct regnode_charclass_class);
1116 flags |= SCF_DO_STCLASS_OR;
1117 data->start_class->flags |= ANYOF_EOS;
1119 } else { /* Non-zero len */
1120 if (flags & SCF_DO_STCLASS_OR) {
1121 cl_or(pRExC_state, data->start_class, &this_class);
1122 cl_and(data->start_class, &and_with);
1124 else if (flags & SCF_DO_STCLASS_AND)
1125 cl_and(data->start_class, &this_class);
1126 flags &= ~SCF_DO_STCLASS;
1128 if (!scan) /* It was not CURLYX, but CURLY. */
1130 if (ckWARN(WARN_REGEXP)
1131 /* ? quantifier ok, except for (?{ ... }) */
1132 && (next_is_eval || !(mincount == 0 && maxcount == 1))
1133 && (minnext == 0) && (deltanext == 0)
1134 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1135 && maxcount <= REG_INFTY/3) /* Complement check for big count */
1138 "Quantifier unexpected on zero-length expression");
1141 min += minnext * mincount;
1142 is_inf_internal |= ((maxcount == REG_INFTY
1143 && (minnext + deltanext) > 0)
1144 || deltanext == I32_MAX);
1145 is_inf |= is_inf_internal;
1146 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1148 /* Try powerful optimization CURLYX => CURLYN. */
1149 if ( OP(oscan) == CURLYX && data
1150 && data->flags & SF_IN_PAR
1151 && !(data->flags & SF_HAS_EVAL)
1152 && !deltanext && minnext == 1 ) {
1153 /* Try to optimize to CURLYN. */
1154 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1155 regnode *nxt1 = nxt;
1162 if (!strchr((char*)PL_simple,OP(nxt))
1163 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1164 && STR_LEN(nxt) == 1))
1170 if (OP(nxt) != CLOSE)
1172 /* Now we know that nxt2 is the only contents: */
1173 oscan->flags = (U8)ARG(nxt);
1175 OP(nxt1) = NOTHING; /* was OPEN. */
1177 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1178 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1179 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1180 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1181 OP(nxt + 1) = OPTIMIZED; /* was count. */
1182 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1187 /* Try optimization CURLYX => CURLYM. */
1188 if ( OP(oscan) == CURLYX && data
1189 && !(data->flags & SF_HAS_PAR)
1190 && !(data->flags & SF_HAS_EVAL)
1192 /* XXXX How to optimize if data == 0? */
1193 /* Optimize to a simpler form. */
1194 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1198 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1199 && (OP(nxt2) != WHILEM))
1201 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1202 /* Need to optimize away parenths. */
1203 if (data->flags & SF_IN_PAR) {
1204 /* Set the parenth number. */
1205 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1207 if (OP(nxt) != CLOSE)
1208 FAIL("Panic opt close");
1209 oscan->flags = (U8)ARG(nxt);
1210 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1211 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1213 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1214 OP(nxt + 1) = OPTIMIZED; /* was count. */
1215 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1216 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1219 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1220 regnode *nnxt = regnext(nxt1);
1223 if (reg_off_by_arg[OP(nxt1)])
1224 ARG_SET(nxt1, nxt2 - nxt1);
1225 else if (nxt2 - nxt1 < U16_MAX)
1226 NEXT_OFF(nxt1) = nxt2 - nxt1;
1228 OP(nxt) = NOTHING; /* Cannot beautify */
1233 /* Optimize again: */
1234 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1240 else if ((OP(oscan) == CURLYX)
1241 && (flags & SCF_WHILEM_VISITED_POS)
1242 /* See the comment on a similar expression above.
1243 However, this time it not a subexpression
1244 we care about, but the expression itself. */
1245 && (maxcount == REG_INFTY)
1246 && data && ++data->whilem_c < 16) {
1247 /* This stays as CURLYX, we can put the count/of pair. */
1248 /* Find WHILEM (as in regexec.c) */
1249 regnode *nxt = oscan + NEXT_OFF(oscan);
1251 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1253 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1254 | (RExC_whilem_seen << 4)); /* On WHILEM */
1256 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1258 if (flags & SCF_DO_SUBSTR) {
1259 SV *last_str = Nullsv;
1260 int counted = mincount != 0;
1262 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1263 #if defined(SPARC64_GCC_WORKAROUND)
1269 if (pos_before >= data->last_start_min)
1272 b = data->last_start_min;
1275 s = SvPV(data->last_found, l);
1276 old = b - data->last_start_min;
1279 I32 b = pos_before >= data->last_start_min
1280 ? pos_before : data->last_start_min;
1282 char *s = SvPV(data->last_found, l);
1283 I32 old = b - data->last_start_min;
1287 old = utf8_hop((U8*)s, old) - (U8*)s;
1290 /* Get the added string: */
1291 last_str = newSVpvn(s + old, l);
1293 SvUTF8_on(last_str);
1294 if (deltanext == 0 && pos_before == b) {
1295 /* What was added is a constant string */
1297 SvGROW(last_str, (mincount * l) + 1);
1298 repeatcpy(SvPVX(last_str) + l,
1299 SvPVX(last_str), l, mincount - 1);
1300 SvCUR(last_str) *= mincount;
1301 /* Add additional parts. */
1302 SvCUR_set(data->last_found,
1303 SvCUR(data->last_found) - l);
1304 sv_catsv(data->last_found, last_str);
1306 SV * sv = data->last_found;
1308 SvUTF8(sv) && SvMAGICAL(sv) ?
1309 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1310 if (mg && mg->mg_len >= 0)
1311 mg->mg_len += CHR_SVLEN(last_str);
1313 data->last_end += l * (mincount - 1);
1316 /* start offset must point into the last copy */
1317 data->last_start_min += minnext * (mincount - 1);
1318 data->last_start_max += is_inf ? I32_MAX
1319 : (maxcount - 1) * (minnext + data->pos_delta);
1322 /* It is counted once already... */
1323 data->pos_min += minnext * (mincount - counted);
1324 data->pos_delta += - counted * deltanext +
1325 (minnext + deltanext) * maxcount - minnext * mincount;
1326 if (mincount != maxcount) {
1327 /* Cannot extend fixed substrings found inside
1329 scan_commit(pRExC_state,data);
1330 if (mincount && last_str) {
1331 sv_setsv(data->last_found, last_str);
1332 data->last_end = data->pos_min;
1333 data->last_start_min =
1334 data->pos_min - CHR_SVLEN(last_str);
1335 data->last_start_max = is_inf
1337 : data->pos_min + data->pos_delta
1338 - CHR_SVLEN(last_str);
1340 data->longest = &(data->longest_float);
1342 SvREFCNT_dec(last_str);
1344 if (data && (fl & SF_HAS_EVAL))
1345 data->flags |= SF_HAS_EVAL;
1346 optimize_curly_tail:
1347 if (OP(oscan) != CURLYX) {
1348 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1350 NEXT_OFF(oscan) += NEXT_OFF(next);
1353 default: /* REF and CLUMP only? */
1354 if (flags & SCF_DO_SUBSTR) {
1355 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1356 data->longest = &(data->longest_float);
1358 is_inf = is_inf_internal = 1;
1359 if (flags & SCF_DO_STCLASS_OR)
1360 cl_anything(pRExC_state, data->start_class);
1361 flags &= ~SCF_DO_STCLASS;
1365 else if (strchr((char*)PL_simple,OP(scan))) {
1368 if (flags & SCF_DO_SUBSTR) {
1369 scan_commit(pRExC_state,data);
1373 if (flags & SCF_DO_STCLASS) {
1374 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1376 /* Some of the logic below assumes that switching
1377 locale on will only add false positives. */
1378 switch (PL_regkind[(U8)OP(scan)]) {
1382 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1383 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1384 cl_anything(pRExC_state, data->start_class);
1387 if (OP(scan) == SANY)
1389 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1390 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1391 || (data->start_class->flags & ANYOF_CLASS));
1392 cl_anything(pRExC_state, data->start_class);
1394 if (flags & SCF_DO_STCLASS_AND || !value)
1395 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1398 if (flags & SCF_DO_STCLASS_AND)
1399 cl_and(data->start_class,
1400 (struct regnode_charclass_class*)scan);
1402 cl_or(pRExC_state, data->start_class,
1403 (struct regnode_charclass_class*)scan);
1406 if (flags & SCF_DO_STCLASS_AND) {
1407 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1408 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1409 for (value = 0; value < 256; value++)
1410 if (!isALNUM(value))
1411 ANYOF_BITMAP_CLEAR(data->start_class, value);
1415 if (data->start_class->flags & ANYOF_LOCALE)
1416 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1418 for (value = 0; value < 256; value++)
1420 ANYOF_BITMAP_SET(data->start_class, value);
1425 if (flags & SCF_DO_STCLASS_AND) {
1426 if (data->start_class->flags & ANYOF_LOCALE)
1427 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1430 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1431 data->start_class->flags |= ANYOF_LOCALE;
1435 if (flags & SCF_DO_STCLASS_AND) {
1436 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1437 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1438 for (value = 0; value < 256; value++)
1440 ANYOF_BITMAP_CLEAR(data->start_class, value);
1444 if (data->start_class->flags & ANYOF_LOCALE)
1445 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1447 for (value = 0; value < 256; value++)
1448 if (!isALNUM(value))
1449 ANYOF_BITMAP_SET(data->start_class, value);
1454 if (flags & SCF_DO_STCLASS_AND) {
1455 if (data->start_class->flags & ANYOF_LOCALE)
1456 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1459 data->start_class->flags |= ANYOF_LOCALE;
1460 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1464 if (flags & SCF_DO_STCLASS_AND) {
1465 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1466 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1467 for (value = 0; value < 256; value++)
1468 if (!isSPACE(value))
1469 ANYOF_BITMAP_CLEAR(data->start_class, value);
1473 if (data->start_class->flags & ANYOF_LOCALE)
1474 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1476 for (value = 0; value < 256; value++)
1478 ANYOF_BITMAP_SET(data->start_class, value);
1483 if (flags & SCF_DO_STCLASS_AND) {
1484 if (data->start_class->flags & ANYOF_LOCALE)
1485 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1488 data->start_class->flags |= ANYOF_LOCALE;
1489 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1493 if (flags & SCF_DO_STCLASS_AND) {
1494 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1495 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1496 for (value = 0; value < 256; value++)
1498 ANYOF_BITMAP_CLEAR(data->start_class, value);
1502 if (data->start_class->flags & ANYOF_LOCALE)
1503 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1505 for (value = 0; value < 256; value++)
1506 if (!isSPACE(value))
1507 ANYOF_BITMAP_SET(data->start_class, value);
1512 if (flags & SCF_DO_STCLASS_AND) {
1513 if (data->start_class->flags & ANYOF_LOCALE) {
1514 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1515 for (value = 0; value < 256; value++)
1516 if (!isSPACE(value))
1517 ANYOF_BITMAP_CLEAR(data->start_class, value);
1521 data->start_class->flags |= ANYOF_LOCALE;
1522 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1526 if (flags & SCF_DO_STCLASS_AND) {
1527 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1528 for (value = 0; value < 256; value++)
1529 if (!isDIGIT(value))
1530 ANYOF_BITMAP_CLEAR(data->start_class, value);
1533 if (data->start_class->flags & ANYOF_LOCALE)
1534 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1536 for (value = 0; value < 256; value++)
1538 ANYOF_BITMAP_SET(data->start_class, value);
1543 if (flags & SCF_DO_STCLASS_AND) {
1544 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1545 for (value = 0; value < 256; value++)
1547 ANYOF_BITMAP_CLEAR(data->start_class, value);
1550 if (data->start_class->flags & ANYOF_LOCALE)
1551 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1553 for (value = 0; value < 256; value++)
1554 if (!isDIGIT(value))
1555 ANYOF_BITMAP_SET(data->start_class, value);
1560 if (flags & SCF_DO_STCLASS_OR)
1561 cl_and(data->start_class, &and_with);
1562 flags &= ~SCF_DO_STCLASS;
1565 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1566 data->flags |= (OP(scan) == MEOL
1570 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1571 /* Lookbehind, or need to calculate parens/evals/stclass: */
1572 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1573 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1574 /* Lookahead/lookbehind */
1575 I32 deltanext, minnext, fake = 0;
1577 struct regnode_charclass_class intrnl;
1580 data_fake.flags = 0;
1582 data_fake.whilem_c = data->whilem_c;
1583 data_fake.last_closep = data->last_closep;
1586 data_fake.last_closep = &fake;
1587 if ( flags & SCF_DO_STCLASS && !scan->flags
1588 && OP(scan) == IFMATCH ) { /* Lookahead */
1589 cl_init(pRExC_state, &intrnl);
1590 data_fake.start_class = &intrnl;
1591 f |= SCF_DO_STCLASS_AND;
1593 if (flags & SCF_WHILEM_VISITED_POS)
1594 f |= SCF_WHILEM_VISITED_POS;
1595 next = regnext(scan);
1596 nscan = NEXTOPER(NEXTOPER(scan));
1597 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1600 vFAIL("Variable length lookbehind not implemented");
1602 else if (minnext > U8_MAX) {
1603 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1605 scan->flags = (U8)minnext;
1607 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1609 if (data && (data_fake.flags & SF_HAS_EVAL))
1610 data->flags |= SF_HAS_EVAL;
1612 data->whilem_c = data_fake.whilem_c;
1613 if (f & SCF_DO_STCLASS_AND) {
1614 int was = (data->start_class->flags & ANYOF_EOS);
1616 cl_and(data->start_class, &intrnl);
1618 data->start_class->flags |= ANYOF_EOS;
1621 else if (OP(scan) == OPEN) {
1624 else if (OP(scan) == CLOSE) {
1625 if ((I32)ARG(scan) == is_par) {
1626 next = regnext(scan);
1628 if ( next && (OP(next) != WHILEM) && next < last)
1629 is_par = 0; /* Disable optimization */
1632 *(data->last_closep) = ARG(scan);
1634 else if (OP(scan) == EVAL) {
1636 data->flags |= SF_HAS_EVAL;
1638 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1639 if (flags & SCF_DO_SUBSTR) {
1640 scan_commit(pRExC_state,data);
1641 data->longest = &(data->longest_float);
1643 is_inf = is_inf_internal = 1;
1644 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1645 cl_anything(pRExC_state, data->start_class);
1646 flags &= ~SCF_DO_STCLASS;
1648 /* Else: zero-length, ignore. */
1649 scan = regnext(scan);
1654 *deltap = is_inf_internal ? I32_MAX : delta;
1655 if (flags & SCF_DO_SUBSTR && is_inf)
1656 data->pos_delta = I32_MAX - data->pos_min;
1657 if (is_par > U8_MAX)
1659 if (is_par && pars==1 && data) {
1660 data->flags |= SF_IN_PAR;
1661 data->flags &= ~SF_HAS_PAR;
1663 else if (pars && data) {
1664 data->flags |= SF_HAS_PAR;
1665 data->flags &= ~SF_IN_PAR;
1667 if (flags & SCF_DO_STCLASS_OR)
1668 cl_and(data->start_class, &and_with);
1673 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1675 if (RExC_rx->data) {
1676 Renewc(RExC_rx->data,
1677 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1678 char, struct reg_data);
1679 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1680 RExC_rx->data->count += n;
1683 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1684 char, struct reg_data);
1685 New(1208, RExC_rx->data->what, n, U8);
1686 RExC_rx->data->count = n;
1688 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1689 return RExC_rx->data->count - n;
1693 Perl_reginitcolors(pTHX)
1696 char *s = PerlEnv_getenv("PERL_RE_COLORS");
1699 PL_colors[0] = s = savepv(s);
1701 s = strchr(s, '\t');
1707 PL_colors[i] = s = "";
1711 PL_colors[i++] = "";
1718 - pregcomp - compile a regular expression into internal code
1720 * We can't allocate space until we know how big the compiled form will be,
1721 * but we can't compile it (and thus know how big it is) until we've got a
1722 * place to put the code. So we cheat: we compile it twice, once with code
1723 * generation turned off and size counting turned on, and once "for real".
1724 * This also means that we don't allocate space until we are sure that the
1725 * thing really will compile successfully, and we never have to move the
1726 * code and thus invalidate pointers into it. (Note that it has to be in
1727 * one piece because free() must be able to free it all.) [NB: not true in perl]
1729 * Beware that the optimization-preparation code in here knows about some
1730 * of the structure of the compiled regexp. [I'll say.]
1733 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1743 RExC_state_t RExC_state;
1744 RExC_state_t *pRExC_state = &RExC_state;
1747 FAIL("NULL regexp argument");
1749 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1753 if (!PL_colorset) reginitcolors();
1754 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1755 PL_colors[4],PL_colors[5],PL_colors[0],
1756 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1758 RExC_flags = pm->op_pmflags;
1762 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1763 RExC_seen_evals = 0;
1766 /* First pass: determine size, legality. */
1773 RExC_emit = &PL_regdummy;
1774 RExC_whilem_seen = 0;
1775 #if 0 /* REGC() is (currently) a NOP at the first pass.
1776 * Clever compilers notice this and complain. --jhi */
1777 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1779 if (reg(pRExC_state, 0, &flags) == NULL) {
1780 RExC_precomp = Nullch;
1783 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1785 /* Small enough for pointer-storage convention?
1786 If extralen==0, this means that we will not need long jumps. */
1787 if (RExC_size >= 0x10000L && RExC_extralen)
1788 RExC_size += RExC_extralen;
1791 if (RExC_whilem_seen > 15)
1792 RExC_whilem_seen = 15;
1794 /* Allocate space and initialize. */
1795 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1798 FAIL("Regexp out of space");
1801 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1802 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1805 r->prelen = xend - exp;
1806 r->precomp = savepvn(RExC_precomp, r->prelen);
1808 #ifdef PERL_COPY_ON_WRITE
1809 r->saved_copy = Nullsv;
1811 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1812 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1814 r->substrs = 0; /* Useful during FAIL. */
1815 r->startp = 0; /* Useful during FAIL. */
1816 r->endp = 0; /* Useful during FAIL. */
1818 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1820 r->offsets[0] = RExC_size;
1822 DEBUG_r(PerlIO_printf(Perl_debug_log,
1823 "%s %"UVuf" bytes for offset annotations.\n",
1824 r->offsets ? "Got" : "Couldn't get",
1825 (UV)((2*RExC_size+1) * sizeof(U32))));
1829 /* Second pass: emit code. */
1830 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
1835 RExC_emit_start = r->program;
1836 RExC_emit = r->program;
1837 /* Store the count of eval-groups for security checks: */
1838 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1839 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1841 if (reg(pRExC_state, 0, &flags) == NULL)
1844 /* Dig out information for optimizations. */
1845 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1846 pm->op_pmflags = RExC_flags;
1848 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1849 r->regstclass = NULL;
1850 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1851 r->reganch |= ROPT_NAUGHTY;
1852 scan = r->program + 1; /* First BRANCH. */
1854 /* XXXX To minimize changes to RE engine we always allocate
1855 3-units-long substrs field. */
1856 Newz(1004, r->substrs, 1, struct reg_substr_data);
1858 StructCopy(&zero_scan_data, &data, scan_data_t);
1859 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1860 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1862 STRLEN longest_float_length, longest_fixed_length;
1863 struct regnode_charclass_class ch_class;
1868 /* Skip introductions and multiplicators >= 1. */
1869 while ((OP(first) == OPEN && (sawopen = 1)) ||
1870 /* An OR of *one* alternative - should not happen now. */
1871 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1872 (OP(first) == PLUS) ||
1873 (OP(first) == MINMOD) ||
1874 /* An {n,m} with n>0 */
1875 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1876 if (OP(first) == PLUS)
1879 first += regarglen[(U8)OP(first)];
1880 first = NEXTOPER(first);
1883 /* Starting-point info. */
1885 if (PL_regkind[(U8)OP(first)] == EXACT) {
1886 if (OP(first) == EXACT)
1887 ; /* Empty, get anchored substr later. */
1888 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1889 r->regstclass = first;
1891 else if (strchr((char*)PL_simple,OP(first)))
1892 r->regstclass = first;
1893 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1894 PL_regkind[(U8)OP(first)] == NBOUND)
1895 r->regstclass = first;
1896 else if (PL_regkind[(U8)OP(first)] == BOL) {
1897 r->reganch |= (OP(first) == MBOL
1899 : (OP(first) == SBOL
1902 first = NEXTOPER(first);
1905 else if (OP(first) == GPOS) {
1906 r->reganch |= ROPT_ANCH_GPOS;
1907 first = NEXTOPER(first);
1910 else if (!sawopen && (OP(first) == STAR &&
1911 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1912 !(r->reganch & ROPT_ANCH) )
1914 /* turn .* into ^.* with an implied $*=1 */
1915 int type = OP(NEXTOPER(first));
1917 if (type == REG_ANY)
1918 type = ROPT_ANCH_MBOL;
1920 type = ROPT_ANCH_SBOL;
1922 r->reganch |= type | ROPT_IMPLICIT;
1923 first = NEXTOPER(first);
1926 if (sawplus && (!sawopen || !RExC_sawback)
1927 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1928 /* x+ must match at the 1st pos of run of x's */
1929 r->reganch |= ROPT_SKIP;
1931 /* Scan is after the zeroth branch, first is atomic matcher. */
1932 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1933 (IV)(first - scan + 1)));
1935 * If there's something expensive in the r.e., find the
1936 * longest literal string that must appear and make it the
1937 * regmust. Resolve ties in favor of later strings, since
1938 * the regstart check works with the beginning of the r.e.
1939 * and avoiding duplication strengthens checking. Not a
1940 * strong reason, but sufficient in the absence of others.
1941 * [Now we resolve ties in favor of the earlier string if
1942 * it happens that c_offset_min has been invalidated, since the
1943 * earlier string may buy us something the later one won't.]
1947 data.longest_fixed = newSVpvn("",0);
1948 data.longest_float = newSVpvn("",0);
1949 data.last_found = newSVpvn("",0);
1950 data.longest = &(data.longest_fixed);
1952 if (!r->regstclass) {
1953 cl_init(pRExC_state, &ch_class);
1954 data.start_class = &ch_class;
1955 stclass_flag = SCF_DO_STCLASS_AND;
1956 } else /* XXXX Check for BOUND? */
1958 data.last_closep = &last_close;
1960 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1961 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1962 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1963 && data.last_start_min == 0 && data.last_end > 0
1964 && !RExC_seen_zerolen
1965 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1966 r->reganch |= ROPT_CHECK_ALL;
1967 scan_commit(pRExC_state, &data);
1968 SvREFCNT_dec(data.last_found);
1970 longest_float_length = CHR_SVLEN(data.longest_float);
1971 if (longest_float_length
1972 || (data.flags & SF_FL_BEFORE_EOL
1973 && (!(data.flags & SF_FL_BEFORE_MEOL)
1974 || (RExC_flags & PMf_MULTILINE)))) {
1977 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1978 && data.offset_fixed == data.offset_float_min
1979 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1980 goto remove_float; /* As in (a)+. */
1982 if (SvUTF8(data.longest_float)) {
1983 r->float_utf8 = data.longest_float;
1984 r->float_substr = Nullsv;
1986 r->float_substr = data.longest_float;
1987 r->float_utf8 = Nullsv;
1989 r->float_min_offset = data.offset_float_min;
1990 r->float_max_offset = data.offset_float_max;
1991 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1992 && (!(data.flags & SF_FL_BEFORE_MEOL)
1993 || (RExC_flags & PMf_MULTILINE)));
1994 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1998 r->float_substr = r->float_utf8 = Nullsv;
1999 SvREFCNT_dec(data.longest_float);
2000 longest_float_length = 0;
2003 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2004 if (longest_fixed_length
2005 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2006 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2007 || (RExC_flags & PMf_MULTILINE)))) {
2010 if (SvUTF8(data.longest_fixed)) {
2011 r->anchored_utf8 = data.longest_fixed;
2012 r->anchored_substr = Nullsv;
2014 r->anchored_substr = data.longest_fixed;
2015 r->anchored_utf8 = Nullsv;
2017 r->anchored_offset = data.offset_fixed;
2018 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2019 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2020 || (RExC_flags & PMf_MULTILINE)));
2021 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2024 r->anchored_substr = r->anchored_utf8 = Nullsv;
2025 SvREFCNT_dec(data.longest_fixed);
2026 longest_fixed_length = 0;
2029 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2030 r->regstclass = NULL;
2031 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2033 && !(data.start_class->flags & ANYOF_EOS)
2034 && !cl_is_anything(data.start_class))
2036 I32 n = add_data(pRExC_state, 1, "f");
2038 New(1006, RExC_rx->data->data[n], 1,
2039 struct regnode_charclass_class);
2040 StructCopy(data.start_class,
2041 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2042 struct regnode_charclass_class);
2043 r->regstclass = (regnode*)RExC_rx->data->data[n];
2044 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2045 PL_regdata = r->data; /* for regprop() */
2046 DEBUG_r({ SV *sv = sv_newmortal();
2047 regprop(sv, (regnode*)data.start_class);
2048 PerlIO_printf(Perl_debug_log,
2049 "synthetic stclass `%s'.\n",
2053 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2054 if (longest_fixed_length > longest_float_length) {
2055 r->check_substr = r->anchored_substr;
2056 r->check_utf8 = r->anchored_utf8;
2057 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2058 if (r->reganch & ROPT_ANCH_SINGLE)
2059 r->reganch |= ROPT_NOSCAN;
2062 r->check_substr = r->float_substr;
2063 r->check_utf8 = r->float_utf8;
2064 r->check_offset_min = data.offset_float_min;
2065 r->check_offset_max = data.offset_float_max;
2067 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2068 This should be changed ASAP! */
2069 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2070 r->reganch |= RE_USE_INTUIT;
2071 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2072 r->reganch |= RE_INTUIT_TAIL;
2076 /* Several toplevels. Best we can is to set minlen. */
2078 struct regnode_charclass_class ch_class;
2081 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2082 scan = r->program + 1;
2083 cl_init(pRExC_state, &ch_class);
2084 data.start_class = &ch_class;
2085 data.last_closep = &last_close;
2086 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2087 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2088 = r->float_substr = r->float_utf8 = Nullsv;
2089 if (!(data.start_class->flags & ANYOF_EOS)
2090 && !cl_is_anything(data.start_class))
2092 I32 n = add_data(pRExC_state, 1, "f");
2094 New(1006, RExC_rx->data->data[n], 1,
2095 struct regnode_charclass_class);
2096 StructCopy(data.start_class,
2097 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2098 struct regnode_charclass_class);
2099 r->regstclass = (regnode*)RExC_rx->data->data[n];
2100 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2101 DEBUG_r({ SV* sv = sv_newmortal();
2102 regprop(sv, (regnode*)data.start_class);
2103 PerlIO_printf(Perl_debug_log,
2104 "synthetic stclass `%s'.\n",
2110 if (RExC_seen & REG_SEEN_GPOS)
2111 r->reganch |= ROPT_GPOS_SEEN;
2112 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2113 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2114 if (RExC_seen & REG_SEEN_EVAL)
2115 r->reganch |= ROPT_EVAL_SEEN;
2116 if (RExC_seen & REG_SEEN_CANY)
2117 r->reganch |= ROPT_CANY_SEEN;
2118 Newz(1002, r->startp, RExC_npar, I32);
2119 Newz(1002, r->endp, RExC_npar, I32);
2120 PL_regdata = r->data; /* for regprop() */
2121 DEBUG_r(regdump(r));
2126 - reg - regular expression, i.e. main body or parenthesized thing
2128 * Caller must absorb opening parenthesis.
2130 * Combining parenthesis handling with the base level of regular expression
2131 * is a trifle forced, but the need to tie the tails of the branches to what
2132 * follows makes it hard to avoid.
2135 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2136 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2138 register regnode *ret; /* Will be the head of the group. */
2139 register regnode *br;
2140 register regnode *lastbr;
2141 register regnode *ender = 0;
2142 register I32 parno = 0;
2143 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2145 /* for (?g), (?gc), and (?o) warnings; warning
2146 about (?c) will warn about (?g) -- japhy */
2148 I32 wastedflags = 0x00,
2151 wasted_gc = 0x02 | 0x04,
2154 char * parse_start = RExC_parse; /* MJD */
2155 char *oregcomp_parse = RExC_parse;
2158 *flagp = 0; /* Tentatively. */
2161 /* Make an OPEN node, if parenthesized. */
2163 if (*RExC_parse == '?') { /* (?...) */
2164 U32 posflags = 0, negflags = 0;
2165 U32 *flagsp = &posflags;
2167 char *seqstart = RExC_parse;
2170 paren = *RExC_parse++;
2171 ret = NULL; /* For look-ahead/behind. */
2173 case '<': /* (?<...) */
2174 RExC_seen |= REG_SEEN_LOOKBEHIND;
2175 if (*RExC_parse == '!')
2177 if (*RExC_parse != '=' && *RExC_parse != '!')
2180 case '=': /* (?=...) */
2181 case '!': /* (?!...) */
2182 RExC_seen_zerolen++;
2183 case ':': /* (?:...) */
2184 case '>': /* (?>...) */
2186 case '$': /* (?$...) */
2187 case '@': /* (?@...) */
2188 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2190 case '#': /* (?#...) */
2191 while (*RExC_parse && *RExC_parse != ')')
2193 if (*RExC_parse != ')')
2194 FAIL("Sequence (?#... not terminated");
2195 nextchar(pRExC_state);
2198 case 'p': /* (?p...) */
2199 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2200 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2202 case '?': /* (??...) */
2204 if (*RExC_parse != '{')
2206 paren = *RExC_parse++;
2208 case '{': /* (?{...}) */
2210 I32 count = 1, n = 0;
2212 char *s = RExC_parse;
2214 OP_4tree *sop, *rop;
2216 RExC_seen_zerolen++;
2217 RExC_seen |= REG_SEEN_EVAL;
2218 while (count && (c = *RExC_parse)) {
2219 if (c == '\\' && RExC_parse[1])
2227 if (*RExC_parse != ')')
2230 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2235 if (RExC_parse - 1 - s)
2236 sv = newSVpvn(s, RExC_parse - 1 - s);
2238 sv = newSVpvn("", 0);
2241 Perl_save_re_context(aTHX);
2242 rop = sv_compile_2op(sv, &sop, "re", &pad);
2243 sop->op_private |= OPpREFCOUNTED;
2244 /* re_dup will OpREFCNT_inc */
2245 OpREFCNT_set(sop, 1);
2248 n = add_data(pRExC_state, 3, "nop");
2249 RExC_rx->data->data[n] = (void*)rop;
2250 RExC_rx->data->data[n+1] = (void*)sop;
2251 RExC_rx->data->data[n+2] = (void*)pad;
2254 else { /* First pass */
2255 if (PL_reginterp_cnt < ++RExC_seen_evals
2256 && PL_curcop != &PL_compiling)
2257 /* No compiled RE interpolated, has runtime
2258 components ===> unsafe. */
2259 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2260 if (PL_tainting && PL_tainted)
2261 FAIL("Eval-group in insecure regular expression");
2264 nextchar(pRExC_state);
2266 ret = reg_node(pRExC_state, LOGICAL);
2269 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2270 /* deal with the length of this later - MJD */
2273 ret = reganode(pRExC_state, EVAL, n);
2274 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2275 Set_Node_Offset(ret, parse_start);
2278 case '(': /* (?(?{...})...) and (?(?=...)...) */
2280 if (RExC_parse[0] == '?') { /* (?(?...)) */
2281 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2282 || RExC_parse[1] == '<'
2283 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2286 ret = reg_node(pRExC_state, LOGICAL);
2289 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2293 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2295 parno = atoi(RExC_parse++);
2297 while (isDIGIT(*RExC_parse))
2299 ret = reganode(pRExC_state, GROUPP, parno);
2301 if ((c = *nextchar(pRExC_state)) != ')')
2302 vFAIL("Switch condition not recognized");
2304 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2305 br = regbranch(pRExC_state, &flags, 1);
2307 br = reganode(pRExC_state, LONGJMP, 0);
2309 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2310 c = *nextchar(pRExC_state);
2314 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2315 regbranch(pRExC_state, &flags, 1);
2316 regtail(pRExC_state, ret, lastbr);
2319 c = *nextchar(pRExC_state);
2324 vFAIL("Switch (?(condition)... contains too many branches");
2325 ender = reg_node(pRExC_state, TAIL);
2326 regtail(pRExC_state, br, ender);
2328 regtail(pRExC_state, lastbr, ender);
2329 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2332 regtail(pRExC_state, ret, ender);
2336 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2340 RExC_parse--; /* for vFAIL to print correctly */
2341 vFAIL("Sequence (? incomplete");
2345 parse_flags: /* (?i) */
2346 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2347 /* (?g), (?gc) and (?o) are useless here
2348 and must be globally applied -- japhy */
2350 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2351 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2352 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2353 if (! (wastedflags & wflagbit) ) {
2354 wastedflags |= wflagbit;
2357 "Useless (%s%c) - %suse /%c modifier",
2358 flagsp == &negflags ? "?-" : "?",
2360 flagsp == &negflags ? "don't " : "",
2366 else if (*RExC_parse == 'c') {
2367 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2368 if (! (wastedflags & wasted_c) ) {
2369 wastedflags |= wasted_gc;
2372 "Useless (%sc) - %suse /gc modifier",
2373 flagsp == &negflags ? "?-" : "?",
2374 flagsp == &negflags ? "don't " : ""
2379 else { pmflag(flagsp, *RExC_parse); }
2383 if (*RExC_parse == '-') {
2385 wastedflags = 0; /* reset so (?g-c) warns twice */
2389 RExC_flags |= posflags;
2390 RExC_flags &= ~negflags;
2391 if (*RExC_parse == ':') {
2397 if (*RExC_parse != ')') {
2399 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2401 nextchar(pRExC_state);
2409 ret = reganode(pRExC_state, OPEN, parno);
2410 Set_Node_Length(ret, 1); /* MJD */
2411 Set_Node_Offset(ret, RExC_parse); /* MJD */
2418 /* Pick up the branches, linking them together. */
2419 parse_start = RExC_parse; /* MJD */
2420 br = regbranch(pRExC_state, &flags, 1);
2421 /* branch_len = (paren != 0); */
2425 if (*RExC_parse == '|') {
2426 if (!SIZE_ONLY && RExC_extralen) {
2427 reginsert(pRExC_state, BRANCHJ, br);
2430 reginsert(pRExC_state, BRANCH, br);
2431 Set_Node_Length(br, paren != 0);
2432 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2436 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2438 else if (paren == ':') {
2439 *flagp |= flags&SIMPLE;
2441 if (open) { /* Starts with OPEN. */
2442 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2444 else if (paren != '?') /* Not Conditional */
2446 *flagp |= flags & (SPSTART | HASWIDTH);
2448 while (*RExC_parse == '|') {
2449 if (!SIZE_ONLY && RExC_extralen) {
2450 ender = reganode(pRExC_state, LONGJMP,0);
2451 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2454 RExC_extralen += 2; /* Account for LONGJMP. */
2455 nextchar(pRExC_state);
2456 br = regbranch(pRExC_state, &flags, 0);
2460 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2464 *flagp |= flags&SPSTART;
2467 if (have_branch || paren != ':') {
2468 /* Make a closing node, and hook it on the end. */
2471 ender = reg_node(pRExC_state, TAIL);
2474 ender = reganode(pRExC_state, CLOSE, parno);
2475 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2476 Set_Node_Length(ender,1); /* MJD */
2482 *flagp &= ~HASWIDTH;
2485 ender = reg_node(pRExC_state, SUCCEED);
2488 ender = reg_node(pRExC_state, END);
2491 regtail(pRExC_state, lastbr, ender);
2494 /* Hook the tails of the branches to the closing node. */
2495 for (br = ret; br != NULL; br = regnext(br)) {
2496 regoptail(pRExC_state, br, ender);
2503 static char parens[] = "=!<,>";
2505 if (paren && (p = strchr(parens, paren))) {
2506 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2507 int flag = (p - parens) > 1;
2510 node = SUSPEND, flag = 0;
2511 reginsert(pRExC_state, node,ret);
2512 Set_Node_Offset(ret, oregcomp_parse);
2513 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 2);
2515 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2519 /* Check for proper termination. */
2521 RExC_flags = oregflags;
2522 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2523 RExC_parse = oregcomp_parse;
2524 vFAIL("Unmatched (");
2527 else if (!paren && RExC_parse < RExC_end) {
2528 if (*RExC_parse == ')') {
2530 vFAIL("Unmatched )");
2533 FAIL("Junk on end of regexp"); /* "Can't happen". */
2541 - regbranch - one alternative of an | operator
2543 * Implements the concatenation operator.
2546 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2548 register regnode *ret;
2549 register regnode *chain = NULL;
2550 register regnode *latest;
2551 I32 flags = 0, c = 0;
2556 if (!SIZE_ONLY && RExC_extralen)
2557 ret = reganode(pRExC_state, BRANCHJ,0);
2559 ret = reg_node(pRExC_state, BRANCH);
2560 Set_Node_Length(ret, 1);
2564 if (!first && SIZE_ONLY)
2565 RExC_extralen += 1; /* BRANCHJ */
2567 *flagp = WORST; /* Tentatively. */
2570 nextchar(pRExC_state);
2571 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2573 latest = regpiece(pRExC_state, &flags);
2574 if (latest == NULL) {
2575 if (flags & TRYAGAIN)
2579 else if (ret == NULL)
2581 *flagp |= flags&HASWIDTH;
2582 if (chain == NULL) /* First piece. */
2583 *flagp |= flags&SPSTART;
2586 regtail(pRExC_state, chain, latest);
2591 if (chain == NULL) { /* Loop ran zero times. */
2592 chain = reg_node(pRExC_state, NOTHING);
2597 *flagp |= flags&SIMPLE;
2604 - regpiece - something followed by possible [*+?]
2606 * Note that the branching code sequences used for ? and the general cases
2607 * of * and + are somewhat optimized: they use the same NOTHING node as
2608 * both the endmarker for their branch list and the body of the last branch.
2609 * It might seem that this node could be dispensed with entirely, but the
2610 * endmarker role is not redundant.
2613 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2615 register regnode *ret;
2617 register char *next;
2619 char *origparse = RExC_parse;
2622 I32 max = REG_INFTY;
2625 ret = regatom(pRExC_state, &flags);
2627 if (flags & TRYAGAIN)
2634 if (op == '{' && regcurly(RExC_parse)) {
2635 parse_start = RExC_parse; /* MJD */
2636 next = RExC_parse + 1;
2638 while (isDIGIT(*next) || *next == ',') {
2647 if (*next == '}') { /* got one */
2651 min = atoi(RExC_parse);
2655 maxpos = RExC_parse;
2657 if (!max && *maxpos != '0')
2658 max = REG_INFTY; /* meaning "infinity" */
2659 else if (max >= REG_INFTY)
2660 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2662 nextchar(pRExC_state);
2665 if ((flags&SIMPLE)) {
2666 RExC_naughty += 2 + RExC_naughty / 2;
2667 reginsert(pRExC_state, CURLY, ret);
2668 Set_Node_Offset(ret, parse_start+1); /* MJD */
2669 Set_Node_Cur_Length(ret);
2672 regnode *w = reg_node(pRExC_state, WHILEM);
2675 regtail(pRExC_state, ret, w);
2676 if (!SIZE_ONLY && RExC_extralen) {
2677 reginsert(pRExC_state, LONGJMP,ret);
2678 reginsert(pRExC_state, NOTHING,ret);
2679 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2681 reginsert(pRExC_state, CURLYX,ret);
2683 Set_Node_Offset(ret, parse_start+1);
2684 Set_Node_Length(ret,
2685 op == '{' ? (RExC_parse - parse_start) : 1);
2687 if (!SIZE_ONLY && RExC_extralen)
2688 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2689 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2691 RExC_whilem_seen++, RExC_extralen += 3;
2692 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2700 if (max && max < min)
2701 vFAIL("Can't do {n,m} with n > m");
2703 ARG1_SET(ret, (U16)min);
2704 ARG2_SET(ret, (U16)max);
2716 #if 0 /* Now runtime fix should be reliable. */
2718 /* if this is reinstated, don't forget to put this back into perldiag:
2720 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2722 (F) The part of the regexp subject to either the * or + quantifier
2723 could match an empty string. The {#} shows in the regular
2724 expression about where the problem was discovered.
2728 if (!(flags&HASWIDTH) && op != '?')
2729 vFAIL("Regexp *+ operand could be empty");
2732 parse_start = RExC_parse;
2733 nextchar(pRExC_state);
2735 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2737 if (op == '*' && (flags&SIMPLE)) {
2738 reginsert(pRExC_state, STAR, ret);
2742 else if (op == '*') {
2746 else if (op == '+' && (flags&SIMPLE)) {
2747 reginsert(pRExC_state, PLUS, ret);
2751 else if (op == '+') {
2755 else if (op == '?') {
2760 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2762 "%.*s matches null string many times",
2763 RExC_parse - origparse,
2767 if (*RExC_parse == '?') {
2768 nextchar(pRExC_state);
2769 reginsert(pRExC_state, MINMOD, ret);
2770 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2772 if (ISMULT2(RExC_parse)) {
2774 vFAIL("Nested quantifiers");
2781 - regatom - the lowest level
2783 * Optimization: gobbles an entire sequence of ordinary characters so that
2784 * it can turn them into a single node, which is smaller to store and
2785 * faster to run. Backslashed characters are exceptions, each becoming a
2786 * separate node; the code is simpler that way and it's not worth fixing.
2788 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2790 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2792 register regnode *ret = 0;
2794 char *parse_start = 0;
2796 *flagp = WORST; /* Tentatively. */
2799 switch (*RExC_parse) {
2801 RExC_seen_zerolen++;
2802 nextchar(pRExC_state);
2803 if (RExC_flags & PMf_MULTILINE)
2804 ret = reg_node(pRExC_state, MBOL);
2805 else if (RExC_flags & PMf_SINGLELINE)
2806 ret = reg_node(pRExC_state, SBOL);
2808 ret = reg_node(pRExC_state, BOL);
2809 Set_Node_Length(ret, 1); /* MJD */
2812 nextchar(pRExC_state);
2814 RExC_seen_zerolen++;
2815 if (RExC_flags & PMf_MULTILINE)
2816 ret = reg_node(pRExC_state, MEOL);
2817 else if (RExC_flags & PMf_SINGLELINE)
2818 ret = reg_node(pRExC_state, SEOL);
2820 ret = reg_node(pRExC_state, EOL);
2821 Set_Node_Length(ret, 1); /* MJD */
2824 nextchar(pRExC_state);
2825 if (RExC_flags & PMf_SINGLELINE)
2826 ret = reg_node(pRExC_state, SANY);
2828 ret = reg_node(pRExC_state, REG_ANY);
2829 *flagp |= HASWIDTH|SIMPLE;
2831 Set_Node_Length(ret, 1); /* MJD */
2835 char *oregcomp_parse = ++RExC_parse;
2836 ret = regclass(pRExC_state);
2837 if (*RExC_parse != ']') {
2838 RExC_parse = oregcomp_parse;
2839 vFAIL("Unmatched [");
2841 nextchar(pRExC_state);
2842 *flagp |= HASWIDTH|SIMPLE;
2843 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2847 nextchar(pRExC_state);
2848 ret = reg(pRExC_state, 1, &flags);
2850 if (flags & TRYAGAIN) {
2851 if (RExC_parse == RExC_end) {
2852 /* Make parent create an empty node if needed. */
2860 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2864 if (flags & TRYAGAIN) {
2868 vFAIL("Internal urp");
2869 /* Supposed to be caught earlier. */
2872 if (!regcurly(RExC_parse)) {
2881 vFAIL("Quantifier follows nothing");
2884 switch (*++RExC_parse) {
2886 RExC_seen_zerolen++;
2887 ret = reg_node(pRExC_state, SBOL);
2889 nextchar(pRExC_state);
2890 Set_Node_Length(ret, 2); /* MJD */
2893 ret = reg_node(pRExC_state, GPOS);
2894 RExC_seen |= REG_SEEN_GPOS;
2896 nextchar(pRExC_state);
2897 Set_Node_Length(ret, 2); /* MJD */
2900 ret = reg_node(pRExC_state, SEOL);
2902 RExC_seen_zerolen++; /* Do not optimize RE away */
2903 nextchar(pRExC_state);
2906 ret = reg_node(pRExC_state, EOS);
2908 RExC_seen_zerolen++; /* Do not optimize RE away */
2909 nextchar(pRExC_state);
2910 Set_Node_Length(ret, 2); /* MJD */
2913 ret = reg_node(pRExC_state, CANY);
2914 RExC_seen |= REG_SEEN_CANY;
2915 *flagp |= HASWIDTH|SIMPLE;
2916 nextchar(pRExC_state);
2917 Set_Node_Length(ret, 2); /* MJD */
2920 ret = reg_node(pRExC_state, CLUMP);
2922 nextchar(pRExC_state);
2923 Set_Node_Length(ret, 2); /* MJD */
2926 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
2927 *flagp |= HASWIDTH|SIMPLE;
2928 nextchar(pRExC_state);
2929 Set_Node_Length(ret, 2); /* MJD */
2932 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
2933 *flagp |= HASWIDTH|SIMPLE;
2934 nextchar(pRExC_state);
2935 Set_Node_Length(ret, 2); /* MJD */
2938 RExC_seen_zerolen++;
2939 RExC_seen |= REG_SEEN_LOOKBEHIND;
2940 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
2942 nextchar(pRExC_state);
2943 Set_Node_Length(ret, 2); /* MJD */
2946 RExC_seen_zerolen++;
2947 RExC_seen |= REG_SEEN_LOOKBEHIND;
2948 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
2950 nextchar(pRExC_state);
2951 Set_Node_Length(ret, 2); /* MJD */
2954 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
2955 *flagp |= HASWIDTH|SIMPLE;
2956 nextchar(pRExC_state);
2957 Set_Node_Length(ret, 2); /* MJD */
2960 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
2961 *flagp |= HASWIDTH|SIMPLE;
2962 nextchar(pRExC_state);
2963 Set_Node_Length(ret, 2); /* MJD */
2966 ret = reg_node(pRExC_state, DIGIT);
2967 *flagp |= HASWIDTH|SIMPLE;
2968 nextchar(pRExC_state);
2969 Set_Node_Length(ret, 2); /* MJD */
2972 ret = reg_node(pRExC_state, NDIGIT);
2973 *flagp |= HASWIDTH|SIMPLE;
2974 nextchar(pRExC_state);
2975 Set_Node_Length(ret, 2); /* MJD */
2980 char* oldregxend = RExC_end;
2981 char* parse_start = RExC_parse - 2;
2983 if (RExC_parse[1] == '{') {
2984 /* a lovely hack--pretend we saw [\pX] instead */
2985 RExC_end = strchr(RExC_parse, '}');
2987 U8 c = (U8)*RExC_parse;
2989 RExC_end = oldregxend;
2990 vFAIL2("Missing right brace on \\%c{}", c);
2995 RExC_end = RExC_parse + 2;
2996 if (RExC_end > oldregxend)
2997 RExC_end = oldregxend;
3001 ret = regclass(pRExC_state);
3003 RExC_end = oldregxend;
3006 Set_Node_Offset(ret, parse_start + 2);
3007 Set_Node_Cur_Length(ret);
3008 nextchar(pRExC_state);
3009 *flagp |= HASWIDTH|SIMPLE;
3022 case '1': case '2': case '3': case '4':
3023 case '5': case '6': case '7': case '8': case '9':
3025 I32 num = atoi(RExC_parse);
3027 if (num > 9 && num >= RExC_npar)
3030 char * parse_start = RExC_parse - 1; /* MJD */
3031 while (isDIGIT(*RExC_parse))
3034 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3035 vFAIL("Reference to nonexistent group");
3037 ret = reganode(pRExC_state,
3038 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3042 /* override incorrect value set in reganode MJD */
3043 Set_Node_Offset(ret, parse_start+1);
3044 Set_Node_Cur_Length(ret); /* MJD */
3046 nextchar(pRExC_state);
3051 if (RExC_parse >= RExC_end)
3052 FAIL("Trailing \\");
3055 /* Do not generate `unrecognized' warnings here, we fall
3056 back into the quick-grab loop below */
3062 if (RExC_flags & PMf_EXTENDED) {
3063 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3064 if (RExC_parse < RExC_end)
3070 register STRLEN len;
3076 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3078 parse_start = RExC_parse - 1;
3084 ret = reg_node(pRExC_state,
3085 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3087 for (len = 0, p = RExC_parse - 1;
3088 len < 127 && p < RExC_end;
3093 if (RExC_flags & PMf_EXTENDED)
3094 p = regwhite(p, RExC_end);
3141 ender = ASCII_TO_NATIVE('\033');
3145 ender = ASCII_TO_NATIVE('\007');
3150 char* e = strchr(p, '}');
3154 vFAIL("Missing right brace on \\x{}");
3157 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3158 | PERL_SCAN_DISALLOW_PREFIX;
3160 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3163 /* numlen is generous */
3164 if (numlen + len >= 127) {
3172 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3174 ender = grok_hex(p, &numlen, &flags, NULL);
3180 ender = UCHARAT(p++);
3181 ender = toCTRL(ender);
3183 case '0': case '1': case '2': case '3':case '4':
3184 case '5': case '6': case '7': case '8':case '9':
3186 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3189 ender = grok_oct(p, &numlen, &flags, NULL);
3199 FAIL("Trailing \\");
3202 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3203 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3204 goto normal_default;
3209 if (UTF8_IS_START(*p) && UTF) {
3210 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3218 if (RExC_flags & PMf_EXTENDED)
3219 p = regwhite(p, RExC_end);
3221 /* Prime the casefolded buffer. */
3222 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3224 if (ISMULT2(p)) { /* Back off on ?+*. */
3231 /* Emit all the Unicode characters. */
3232 for (foldbuf = tmpbuf;
3234 foldlen -= numlen) {
3235 ender = utf8_to_uvchr(foldbuf, &numlen);
3237 reguni(pRExC_state, ender, s, &unilen);
3240 /* In EBCDIC the numlen
3241 * and unilen can differ. */
3243 if (numlen >= foldlen)
3247 break; /* "Can't happen." */
3251 reguni(pRExC_state, ender, s, &unilen);
3260 REGC((char)ender, s++);
3268 /* Emit all the Unicode characters. */
3269 for (foldbuf = tmpbuf;
3271 foldlen -= numlen) {
3272 ender = utf8_to_uvchr(foldbuf, &numlen);
3274 reguni(pRExC_state, ender, s, &unilen);
3277 /* In EBCDIC the numlen
3278 * and unilen can differ. */
3280 if (numlen >= foldlen)
3288 reguni(pRExC_state, ender, s, &unilen);
3297 REGC((char)ender, s++);
3301 Set_Node_Cur_Length(ret); /* MJD */
3302 nextchar(pRExC_state);
3304 /* len is STRLEN which is unsigned, need to copy to signed */
3307 vFAIL("Internal disaster");
3316 RExC_size += STR_SZ(len);
3318 RExC_emit += STR_SZ(len);
3323 /* If the encoding pragma is in effect recode the text of
3324 * any EXACT-kind nodes. */
3325 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3326 STRLEN oldlen = STR_LEN(ret);
3327 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3331 if (sv_utf8_downgrade(sv, TRUE)) {
3332 char *s = sv_recode_to_utf8(sv, PL_encoding);
3333 STRLEN newlen = SvCUR(sv);
3338 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3339 (int)oldlen, STRING(ret),
3341 Copy(s, STRING(ret), newlen, char);
3342 STR_LEN(ret) += newlen - oldlen;
3343 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3345 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3353 S_regwhite(pTHX_ char *p, char *e)
3358 else if (*p == '#') {
3361 } while (p < e && *p != '\n');
3369 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3370 Character classes ([:foo:]) can also be negated ([:^foo:]).
3371 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3372 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3373 but trigger failures because they are currently unimplemented. */
3375 #define POSIXCC_DONE(c) ((c) == ':')
3376 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3377 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3380 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3383 I32 namedclass = OOB_NAMEDCLASS;
3385 if (value == '[' && RExC_parse + 1 < RExC_end &&
3386 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3387 POSIXCC(UCHARAT(RExC_parse))) {
3388 char c = UCHARAT(RExC_parse);
3389 char* s = RExC_parse++;
3391 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3393 if (RExC_parse == RExC_end)
3394 /* Grandfather lone [:, [=, [. */
3397 char* t = RExC_parse++; /* skip over the c */
3399 if (UCHARAT(RExC_parse) == ']') {
3400 RExC_parse++; /* skip over the ending ] */
3403 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3404 I32 skip = 5; /* the most common skip */
3408 if (strnEQ(posixcc, "alnum", 5))
3410 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3411 else if (strnEQ(posixcc, "alpha", 5))
3413 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3414 else if (strnEQ(posixcc, "ascii", 5))
3416 complement ? ANYOF_NASCII : ANYOF_ASCII;
3419 if (strnEQ(posixcc, "blank", 5))
3421 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3424 if (strnEQ(posixcc, "cntrl", 5))
3426 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3429 if (strnEQ(posixcc, "digit", 5))
3431 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3434 if (strnEQ(posixcc, "graph", 5))
3436 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3439 if (strnEQ(posixcc, "lower", 5))
3441 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3444 if (strnEQ(posixcc, "print", 5))
3446 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3447 else if (strnEQ(posixcc, "punct", 5))
3449 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3452 if (strnEQ(posixcc, "space", 5))
3454 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3457 if (strnEQ(posixcc, "upper", 5))
3459 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3461 case 'w': /* this is not POSIX, this is the Perl \w */
3462 if (strnEQ(posixcc, "word", 4)) {
3464 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3469 if (strnEQ(posixcc, "xdigit", 6)) {
3471 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3476 if (namedclass == OOB_NAMEDCLASS ||
3477 posixcc[skip] != ':' ||
3478 posixcc[skip+1] != ']')
3480 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3483 } else if (!SIZE_ONLY) {
3484 /* [[=foo=]] and [[.foo.]] are still future. */
3486 /* adjust RExC_parse so the warning shows after
3488 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3490 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3493 /* Maternal grandfather:
3494 * "[:" ending in ":" but not in ":]" */
3504 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3506 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3507 char *s = RExC_parse;
3510 while(*s && isALNUM(*s))
3512 if (*s && c == *s && s[1] == ']') {
3513 if (ckWARN(WARN_REGEXP))
3515 "POSIX syntax [%c %c] belongs inside character classes",
3518 /* [[=foo=]] and [[.foo.]] are still future. */
3519 if (POSIXCC_NOTYET(c)) {
3520 /* adjust RExC_parse so the error shows after
3522 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3524 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3531 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3534 register UV nextvalue;
3535 register IV prevvalue = OOB_UNICODE;
3536 register IV range = 0;
3537 register regnode *ret;
3540 char *rangebegin = 0;
3541 bool need_class = 0;
3542 SV *listsv = Nullsv;
3545 bool optimize_invert = TRUE;
3546 AV* unicode_alternate = 0;
3548 UV literal_endpoint = 0;
3551 ret = reganode(pRExC_state, ANYOF, 0);
3554 ANYOF_FLAGS(ret) = 0;
3556 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3560 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3564 RExC_size += ANYOF_SKIP;
3566 RExC_emit += ANYOF_SKIP;
3568 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3570 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3571 ANYOF_BITMAP_ZERO(ret);
3572 listsv = newSVpvn("# comment\n", 10);
3575 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3577 if (!SIZE_ONLY && POSIXCC(nextvalue))
3578 checkposixcc(pRExC_state);
3580 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3581 if (UCHARAT(RExC_parse) == ']')
3584 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3588 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3591 rangebegin = RExC_parse;
3593 value = utf8n_to_uvchr((U8*)RExC_parse,
3594 RExC_end - RExC_parse,
3596 RExC_parse += numlen;
3599 value = UCHARAT(RExC_parse++);
3600 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3601 if (value == '[' && POSIXCC(nextvalue))
3602 namedclass = regpposixcc(pRExC_state, value);
3603 else if (value == '\\') {
3605 value = utf8n_to_uvchr((U8*)RExC_parse,
3606 RExC_end - RExC_parse,
3608 RExC_parse += numlen;
3611 value = UCHARAT(RExC_parse++);
3612 /* Some compilers cannot handle switching on 64-bit integer
3613 * values, therefore value cannot be an UV. Yes, this will
3614 * be a problem later if we want switch on Unicode.
3615 * A similar issue a little bit later when switching on
3616 * namedclass. --jhi */
3617 switch ((I32)value) {
3618 case 'w': namedclass = ANYOF_ALNUM; break;
3619 case 'W': namedclass = ANYOF_NALNUM; break;
3620 case 's': namedclass = ANYOF_SPACE; break;
3621 case 'S': namedclass = ANYOF_NSPACE; break;
3622 case 'd': namedclass = ANYOF_DIGIT; break;
3623 case 'D': namedclass = ANYOF_NDIGIT; break;
3626 if (RExC_parse >= RExC_end)
3627 vFAIL2("Empty \\%c{}", (U8)value);
3628 if (*RExC_parse == '{') {
3630 e = strchr(RExC_parse++, '}');
3632 vFAIL2("Missing right brace on \\%c{}", c);
3633 while (isSPACE(UCHARAT(RExC_parse)))
3635 if (e == RExC_parse)
3636 vFAIL2("Empty \\%c{}", c);
3638 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3646 if (UCHARAT(RExC_parse) == '^') {
3649 value = value == 'p' ? 'P' : 'p'; /* toggle */
3650 while (isSPACE(UCHARAT(RExC_parse))) {
3656 Perl_sv_catpvf(aTHX_ listsv,
3657 "+utf8::%.*s\n", (int)n, RExC_parse);
3659 Perl_sv_catpvf(aTHX_ listsv,
3660 "!utf8::%.*s\n", (int)n, RExC_parse);
3663 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3665 case 'n': value = '\n'; break;
3666 case 'r': value = '\r'; break;
3667 case 't': value = '\t'; break;
3668 case 'f': value = '\f'; break;
3669 case 'b': value = '\b'; break;
3670 case 'e': value = ASCII_TO_NATIVE('\033');break;
3671 case 'a': value = ASCII_TO_NATIVE('\007');break;
3673 if (*RExC_parse == '{') {
3674 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3675 | PERL_SCAN_DISALLOW_PREFIX;
3676 e = strchr(RExC_parse++, '}');
3678 vFAIL("Missing right brace on \\x{}");
3680 numlen = e - RExC_parse;
3681 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3685 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3687 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3688 RExC_parse += numlen;
3692 value = UCHARAT(RExC_parse++);
3693 value = toCTRL(value);
3695 case '0': case '1': case '2': case '3': case '4':
3696 case '5': case '6': case '7': case '8': case '9':
3700 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3701 RExC_parse += numlen;
3705 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3707 "Unrecognized escape \\%c in character class passed through",
3711 } /* end of \blah */
3717 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3719 if (!SIZE_ONLY && !need_class)
3720 ANYOF_CLASS_ZERO(ret);
3724 /* a bad range like a-\d, a-[:digit:] ? */
3727 if (ckWARN(WARN_REGEXP))
3729 "False [] range \"%*.*s\"",
3730 RExC_parse - rangebegin,
3731 RExC_parse - rangebegin,
3733 if (prevvalue < 256) {
3734 ANYOF_BITMAP_SET(ret, prevvalue);
3735 ANYOF_BITMAP_SET(ret, '-');
3738 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3739 Perl_sv_catpvf(aTHX_ listsv,
3740 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3744 range = 0; /* this was not a true range */
3748 if (namedclass > OOB_NAMEDCLASS)
3749 optimize_invert = FALSE;
3750 /* Possible truncation here but in some 64-bit environments
3751 * the compiler gets heartburn about switch on 64-bit values.
3752 * A similar issue a little earlier when switching on value.
3754 switch ((I32)namedclass) {
3757 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3759 for (value = 0; value < 256; value++)
3761 ANYOF_BITMAP_SET(ret, value);
3763 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
3767 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3769 for (value = 0; value < 256; value++)
3770 if (!isALNUM(value))
3771 ANYOF_BITMAP_SET(ret, value);
3773 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3777 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3779 for (value = 0; value < 256; value++)
3780 if (isALNUMC(value))
3781 ANYOF_BITMAP_SET(ret, value);
3783 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3787 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3789 for (value = 0; value < 256; value++)
3790 if (!isALNUMC(value))
3791 ANYOF_BITMAP_SET(ret, value);
3793 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3797 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3799 for (value = 0; value < 256; value++)
3801 ANYOF_BITMAP_SET(ret, value);
3803 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3807 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3809 for (value = 0; value < 256; value++)
3810 if (!isALPHA(value))
3811 ANYOF_BITMAP_SET(ret, value);
3813 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3817 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3820 for (value = 0; value < 128; value++)
3821 ANYOF_BITMAP_SET(ret, value);
3823 for (value = 0; value < 256; value++) {
3825 ANYOF_BITMAP_SET(ret, value);
3829 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3833 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3836 for (value = 128; value < 256; value++)
3837 ANYOF_BITMAP_SET(ret, value);
3839 for (value = 0; value < 256; value++) {
3840 if (!isASCII(value))
3841 ANYOF_BITMAP_SET(ret, value);
3845 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3849 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3851 for (value = 0; value < 256; value++)
3853 ANYOF_BITMAP_SET(ret, value);
3855 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3859 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3861 for (value = 0; value < 256; value++)
3862 if (!isBLANK(value))
3863 ANYOF_BITMAP_SET(ret, value);
3865 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3869 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3871 for (value = 0; value < 256; value++)
3873 ANYOF_BITMAP_SET(ret, value);
3875 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3879 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3881 for (value = 0; value < 256; value++)
3882 if (!isCNTRL(value))
3883 ANYOF_BITMAP_SET(ret, value);
3885 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3889 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3891 /* consecutive digits assumed */
3892 for (value = '0'; value <= '9'; value++)
3893 ANYOF_BITMAP_SET(ret, value);
3895 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3899 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3901 /* consecutive digits assumed */
3902 for (value = 0; value < '0'; value++)
3903 ANYOF_BITMAP_SET(ret, value);
3904 for (value = '9' + 1; value < 256; value++)
3905 ANYOF_BITMAP_SET(ret, value);
3907 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3911 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3913 for (value = 0; value < 256; value++)
3915 ANYOF_BITMAP_SET(ret, value);
3917 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3921 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3923 for (value = 0; value < 256; value++)
3924 if (!isGRAPH(value))
3925 ANYOF_BITMAP_SET(ret, value);
3927 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3931 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3933 for (value = 0; value < 256; value++)
3935 ANYOF_BITMAP_SET(ret, value);
3937 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3941 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3943 for (value = 0; value < 256; value++)
3944 if (!isLOWER(value))
3945 ANYOF_BITMAP_SET(ret, value);
3947 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3951 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3953 for (value = 0; value < 256; value++)
3955 ANYOF_BITMAP_SET(ret, value);
3957 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3961 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3963 for (value = 0; value < 256; value++)
3964 if (!isPRINT(value))
3965 ANYOF_BITMAP_SET(ret, value);
3967 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3971 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3973 for (value = 0; value < 256; value++)
3974 if (isPSXSPC(value))
3975 ANYOF_BITMAP_SET(ret, value);
3977 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3981 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3983 for (value = 0; value < 256; value++)
3984 if (!isPSXSPC(value))
3985 ANYOF_BITMAP_SET(ret, value);
3987 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3991 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3993 for (value = 0; value < 256; value++)
3995 ANYOF_BITMAP_SET(ret, value);
3997 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
4001 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
4003 for (value = 0; value < 256; value++)
4004 if (!isPUNCT(value))
4005 ANYOF_BITMAP_SET(ret, value);
4007 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
4011 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4013 for (value = 0; value < 256; value++)
4015 ANYOF_BITMAP_SET(ret, value);
4017 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
4021 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4023 for (value = 0; value < 256; value++)
4024 if (!isSPACE(value))
4025 ANYOF_BITMAP_SET(ret, value);
4027 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4031 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4033 for (value = 0; value < 256; value++)
4035 ANYOF_BITMAP_SET(ret, value);
4037 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4041 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4043 for (value = 0; value < 256; value++)
4044 if (!isUPPER(value))
4045 ANYOF_BITMAP_SET(ret, value);
4047 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4051 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4053 for (value = 0; value < 256; value++)
4054 if (isXDIGIT(value))
4055 ANYOF_BITMAP_SET(ret, value);
4057 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4061 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4063 for (value = 0; value < 256; value++)
4064 if (!isXDIGIT(value))
4065 ANYOF_BITMAP_SET(ret, value);
4067 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4070 vFAIL("Invalid [::] class");
4074 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4077 } /* end of namedclass \blah */
4080 if (prevvalue > (IV)value) /* b-a */ {
4081 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4082 RExC_parse - rangebegin,
4083 RExC_parse - rangebegin,
4085 range = 0; /* not a valid range */
4089 prevvalue = value; /* save the beginning of the range */
4090 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4091 RExC_parse[1] != ']') {
4094 /* a bad range like \w-, [:word:]- ? */
4095 if (namedclass > OOB_NAMEDCLASS) {
4096 if (ckWARN(WARN_REGEXP))
4098 "False [] range \"%*.*s\"",
4099 RExC_parse - rangebegin,
4100 RExC_parse - rangebegin,
4103 ANYOF_BITMAP_SET(ret, '-');
4105 range = 1; /* yeah, it's a range! */
4106 continue; /* but do it the next time */
4110 /* now is the next time */
4114 if (prevvalue < 256) {
4115 IV ceilvalue = value < 256 ? value : 255;
4118 /* In EBCDIC [\x89-\x91] should include
4119 * the \x8e but [i-j] should not. */
4120 if (literal_endpoint == 2 &&
4121 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4122 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4124 if (isLOWER(prevvalue)) {
4125 for (i = prevvalue; i <= ceilvalue; i++)
4127 ANYOF_BITMAP_SET(ret, i);
4129 for (i = prevvalue; i <= ceilvalue; i++)
4131 ANYOF_BITMAP_SET(ret, i);
4136 for (i = prevvalue; i <= ceilvalue; i++)
4137 ANYOF_BITMAP_SET(ret, i);
4139 if (value > 255 || UTF) {
4140 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4141 UV natvalue = NATIVE_TO_UNI(value);
4143 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4144 if (prevnatvalue < natvalue) { /* what about > ? */
4145 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4146 prevnatvalue, natvalue);
4148 else if (prevnatvalue == natvalue) {
4149 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4151 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4153 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4155 /* If folding and foldable and a single
4156 * character, insert also the folded version
4157 * to the charclass. */
4159 if (foldlen == (STRLEN)UNISKIP(f))
4160 Perl_sv_catpvf(aTHX_ listsv,
4163 /* Any multicharacter foldings
4164 * require the following transform:
4165 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4166 * where E folds into "pq" and F folds
4167 * into "rst", all other characters
4168 * fold to single characters. We save
4169 * away these multicharacter foldings,
4170 * to be later saved as part of the
4171 * additional "s" data. */
4174 if (!unicode_alternate)
4175 unicode_alternate = newAV();
4176 sv = newSVpvn((char*)foldbuf, foldlen);
4178 av_push(unicode_alternate, sv);
4182 /* If folding and the value is one of the Greek
4183 * sigmas insert a few more sigmas to make the
4184 * folding rules of the sigmas to work right.
4185 * Note that not all the possible combinations
4186 * are handled here: some of them are handled
4187 * by the standard folding rules, and some of
4188 * them (literal or EXACTF cases) are handled
4189 * during runtime in regexec.c:S_find_byclass(). */
4190 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4191 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4192 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4193 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4194 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4196 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4197 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4198 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4203 literal_endpoint = 0;
4207 range = 0; /* this range (if it was one) is done now */
4211 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4213 RExC_size += ANYOF_CLASS_ADD_SKIP;
4215 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4218 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4220 /* If the only flag is folding (plus possibly inversion). */
4221 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4223 for (value = 0; value < 256; ++value) {
4224 if (ANYOF_BITMAP_TEST(ret, value)) {
4225 UV fold = PL_fold[value];
4228 ANYOF_BITMAP_SET(ret, fold);
4231 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4234 /* optimize inverted simple patterns (e.g. [^a-z]) */
4235 if (!SIZE_ONLY && optimize_invert &&
4236 /* If the only flag is inversion. */
4237 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4238 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4239 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4240 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4247 /* The 0th element stores the character class description
4248 * in its textual form: used later (regexec.c:Perl_regclass_swash())
4249 * to initialize the appropriate swash (which gets stored in
4250 * the 1st element), and also useful for dumping the regnode.
4251 * The 2nd element stores the multicharacter foldings,
4252 * used later (regexec.c:S_reginclass()). */
4253 av_store(av, 0, listsv);
4254 av_store(av, 1, NULL);
4255 av_store(av, 2, (SV*)unicode_alternate);
4256 rv = newRV_noinc((SV*)av);
4257 n = add_data(pRExC_state, 1, "s");
4258 RExC_rx->data->data[n] = (void*)rv;
4266 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4268 char* retval = RExC_parse++;
4271 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4272 RExC_parse[2] == '#') {
4273 while (*RExC_parse && *RExC_parse != ')')
4278 if (RExC_flags & PMf_EXTENDED) {
4279 if (isSPACE(*RExC_parse)) {
4283 else if (*RExC_parse == '#') {
4284 while (*RExC_parse && *RExC_parse != '\n')
4295 - reg_node - emit a node
4297 STATIC regnode * /* Location. */
4298 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4300 register regnode *ret;
4301 register regnode *ptr;
4305 SIZE_ALIGN(RExC_size);
4310 NODE_ALIGN_FILL(ret);
4312 FILL_ADVANCE_NODE(ptr, op);
4313 if (RExC_offsets) { /* MJD */
4314 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4315 "reg_node", __LINE__,
4317 RExC_emit - RExC_emit_start > RExC_offsets[0]
4318 ? "Overwriting end of array!\n" : "OK",
4319 RExC_emit - RExC_emit_start,
4320 RExC_parse - RExC_start,
4322 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4331 - reganode - emit a node with an argument
4333 STATIC regnode * /* Location. */
4334 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4336 register regnode *ret;
4337 register regnode *ptr;
4341 SIZE_ALIGN(RExC_size);
4346 NODE_ALIGN_FILL(ret);
4348 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4349 if (RExC_offsets) { /* MJD */
4350 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4354 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4355 "Overwriting end of array!\n" : "OK",
4356 RExC_emit - RExC_emit_start,
4357 RExC_parse - RExC_start,
4359 Set_Cur_Node_Offset;
4368 - reguni - emit (if appropriate) a Unicode character
4371 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4373 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4377 - reginsert - insert an operator in front of already-emitted operand
4379 * Means relocating the operand.
4382 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4384 register regnode *src;
4385 register regnode *dst;
4386 register regnode *place;
4387 register int offset = regarglen[(U8)op];
4389 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4392 RExC_size += NODE_STEP_REGNODE + offset;
4397 RExC_emit += NODE_STEP_REGNODE + offset;
4399 while (src > opnd) {
4400 StructCopy(--src, --dst, regnode);
4401 if (RExC_offsets) { /* MJD 20010112 */
4402 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4406 dst - RExC_emit_start > RExC_offsets[0]
4407 ? "Overwriting end of array!\n" : "OK",
4408 src - RExC_emit_start,
4409 dst - RExC_emit_start,
4411 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4412 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4417 place = opnd; /* Op node, where operand used to be. */
4418 if (RExC_offsets) { /* MJD */
4419 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4423 place - RExC_emit_start > RExC_offsets[0]
4424 ? "Overwriting end of array!\n" : "OK",
4425 place - RExC_emit_start,
4426 RExC_parse - RExC_start,
4428 Set_Node_Offset(place, RExC_parse);
4430 src = NEXTOPER(place);
4431 FILL_ADVANCE_NODE(place, op);
4432 Zero(src, offset, regnode);
4436 - regtail - set the next-pointer at the end of a node chain of p to val.
4439 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4441 register regnode *scan;
4442 register regnode *temp;
4447 /* Find last node. */
4450 temp = regnext(scan);
4456 if (reg_off_by_arg[OP(scan)]) {
4457 ARG_SET(scan, val - scan);
4460 NEXT_OFF(scan) = val - scan;
4465 - regoptail - regtail on operand of first argument; nop if operandless
4468 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4470 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4471 if (p == NULL || SIZE_ONLY)
4473 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4474 regtail(pRExC_state, NEXTOPER(p), val);
4476 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4477 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4484 - regcurly - a little FSA that accepts {\d+,?\d*}
4487 S_regcurly(pTHX_ register char *s)
4508 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4510 register U8 op = EXACT; /* Arbitrary non-END op. */
4511 register regnode *next;
4513 while (op != END && (!last || node < last)) {
4514 /* While that wasn't END last time... */
4520 next = regnext(node);
4522 if (OP(node) == OPTIMIZED)
4525 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4526 (int)(2*l + 1), "", SvPVX(sv));
4527 if (next == NULL) /* Next ptr. */
4528 PerlIO_printf(Perl_debug_log, "(0)");
4530 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4531 (void)PerlIO_putc(Perl_debug_log, '\n');
4533 if (PL_regkind[(U8)op] == BRANCHJ) {
4534 register regnode *nnode = (OP(next) == LONGJMP
4537 if (last && nnode > last)
4539 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4541 else if (PL_regkind[(U8)op] == BRANCH) {
4542 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4544 else if ( op == CURLY) { /* `next' might be very big: optimizer */
4545 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4546 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4548 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4549 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4552 else if ( op == PLUS || op == STAR) {
4553 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4555 else if (op == ANYOF) {
4556 /* arglen 1 + class block */
4557 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4558 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4559 node = NEXTOPER(node);
4561 else if (PL_regkind[(U8)op] == EXACT) {
4562 /* Literal string, where present. */
4563 node += NODE_SZ_STR(node) - 1;
4564 node = NEXTOPER(node);
4567 node = NEXTOPER(node);
4568 node += regarglen[(U8)op];
4570 if (op == CURLYX || op == OPEN)
4572 else if (op == WHILEM)
4578 #endif /* DEBUGGING */
4581 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4584 Perl_regdump(pTHX_ regexp *r)
4587 SV *sv = sv_newmortal();
4589 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4591 /* Header fields of interest. */
4592 if (r->anchored_substr)
4593 PerlIO_printf(Perl_debug_log,
4594 "anchored `%s%.*s%s'%s at %"IVdf" ",
4596 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4597 SvPVX(r->anchored_substr),
4599 SvTAIL(r->anchored_substr) ? "$" : "",
4600 (IV)r->anchored_offset);
4601 else if (r->anchored_utf8)
4602 PerlIO_printf(Perl_debug_log,
4603 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4605 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4606 SvPVX(r->anchored_utf8),
4608 SvTAIL(r->anchored_utf8) ? "$" : "",
4609 (IV)r->anchored_offset);
4610 if (r->float_substr)
4611 PerlIO_printf(Perl_debug_log,
4612 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4614 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4615 SvPVX(r->float_substr),
4617 SvTAIL(r->float_substr) ? "$" : "",
4618 (IV)r->float_min_offset, (UV)r->float_max_offset);
4619 else if (r->float_utf8)
4620 PerlIO_printf(Perl_debug_log,
4621 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4623 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4624 SvPVX(r->float_utf8),
4626 SvTAIL(r->float_utf8) ? "$" : "",
4627 (IV)r->float_min_offset, (UV)r->float_max_offset);
4628 if (r->check_substr || r->check_utf8)
4629 PerlIO_printf(Perl_debug_log,
4630 r->check_substr == r->float_substr
4631 && r->check_utf8 == r->float_utf8
4632 ? "(checking floating" : "(checking anchored");
4633 if (r->reganch & ROPT_NOSCAN)
4634 PerlIO_printf(Perl_debug_log, " noscan");
4635 if (r->reganch & ROPT_CHECK_ALL)
4636 PerlIO_printf(Perl_debug_log, " isall");
4637 if (r->check_substr || r->check_utf8)
4638 PerlIO_printf(Perl_debug_log, ") ");
4640 if (r->regstclass) {
4641 regprop(sv, r->regstclass);
4642 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4644 if (r->reganch & ROPT_ANCH) {
4645 PerlIO_printf(Perl_debug_log, "anchored");
4646 if (r->reganch & ROPT_ANCH_BOL)
4647 PerlIO_printf(Perl_debug_log, "(BOL)");
4648 if (r->reganch & ROPT_ANCH_MBOL)
4649 PerlIO_printf(Perl_debug_log, "(MBOL)");
4650 if (r->reganch & ROPT_ANCH_SBOL)
4651 PerlIO_printf(Perl_debug_log, "(SBOL)");
4652 if (r->reganch & ROPT_ANCH_GPOS)
4653 PerlIO_printf(Perl_debug_log, "(GPOS)");
4654 PerlIO_putc(Perl_debug_log, ' ');
4656 if (r->reganch & ROPT_GPOS_SEEN)
4657 PerlIO_printf(Perl_debug_log, "GPOS ");
4658 if (r->reganch & ROPT_SKIP)
4659 PerlIO_printf(Perl_debug_log, "plus ");
4660 if (r->reganch & ROPT_IMPLICIT)
4661 PerlIO_printf(Perl_debug_log, "implicit ");
4662 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4663 if (r->reganch & ROPT_EVAL_SEEN)
4664 PerlIO_printf(Perl_debug_log, "with eval ");
4665 PerlIO_printf(Perl_debug_log, "\n");
4668 U32 len = r->offsets[0];
4669 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4670 for (i = 1; i <= len; i++)
4671 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4672 (UV)r->offsets[i*2-1],
4673 (UV)r->offsets[i*2]);
4674 PerlIO_printf(Perl_debug_log, "\n");
4676 #endif /* DEBUGGING */
4682 S_put_byte(pTHX_ SV *sv, int c)
4684 if (isCNTRL(c) || c == 255 || !isPRINT(c))
4685 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4686 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4687 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4689 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4692 #endif /* DEBUGGING */
4695 - regprop - printable representation of opcode
4698 Perl_regprop(pTHX_ SV *sv, regnode *o)
4703 sv_setpvn(sv, "", 0);
4704 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4705 /* It would be nice to FAIL() here, but this may be called from
4706 regexec.c, and it would be hard to supply pRExC_state. */
4707 Perl_croak(aTHX_ "Corrupted regexp opcode");
4708 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4710 k = PL_regkind[(U8)OP(o)];
4713 SV *dsv = sv_2mortal(newSVpvn("", 0));
4714 /* Using is_utf8_string() is a crude hack but it may
4715 * be the best for now since we have no flag "this EXACTish
4716 * node was UTF-8" --jhi */
4717 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4719 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4720 UNI_DISPLAY_REGEX) :
4725 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4730 else if (k == CURLY) {
4731 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4732 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4733 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4735 else if (k == WHILEM && o->flags) /* Ordinal/of */
4736 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4737 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4738 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4739 else if (k == LOGICAL)
4740 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4741 else if (k == ANYOF) {
4742 int i, rangestart = -1;
4743 U8 flags = ANYOF_FLAGS(o);
4744 const char * const anyofs[] = { /* Should be synchronized with
4745 * ANYOF_ #xdefines in regcomp.h */
4778 if (flags & ANYOF_LOCALE)
4779 sv_catpv(sv, "{loc}");
4780 if (flags & ANYOF_FOLD)
4781 sv_catpv(sv, "{i}");
4782 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4783 if (flags & ANYOF_INVERT)
4785 for (i = 0; i <= 256; i++) {
4786 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4787 if (rangestart == -1)
4789 } else if (rangestart != -1) {
4790 if (i <= rangestart + 3)
4791 for (; rangestart < i; rangestart++)
4792 put_byte(sv, rangestart);
4794 put_byte(sv, rangestart);
4796 put_byte(sv, i - 1);
4802 if (o->flags & ANYOF_CLASS)
4803 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4804 if (ANYOF_CLASS_TEST(o,i))
4805 sv_catpv(sv, anyofs[i]);
4807 if (flags & ANYOF_UNICODE)
4808 sv_catpv(sv, "{unicode}");
4809 else if (flags & ANYOF_UNICODE_ALL)
4810 sv_catpv(sv, "{unicode_all}");
4814 SV *sw = regclass_swash(o, FALSE, &lv, 0);
4818 U8 s[UTF8_MAXLEN+1];
4820 for (i = 0; i <= 256; i++) { /* just the first 256 */
4821 U8 *e = uvchr_to_utf8(s, i);
4823 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4824 if (rangestart == -1)
4826 } else if (rangestart != -1) {
4829 if (i <= rangestart + 3)
4830 for (; rangestart < i; rangestart++) {
4831 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4835 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4838 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4845 sv_catpv(sv, "..."); /* et cetera */
4849 char *s = savepv(SvPVX(lv));
4852 while(*s && *s != '\n') s++;
4873 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4875 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4876 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4877 #endif /* DEBUGGING */
4881 Perl_re_intuit_string(pTHX_ regexp *prog)
4882 { /* Assume that RE_INTUIT is set */
4885 char *s = SvPV(prog->check_substr
4886 ? prog->check_substr : prog->check_utf8, n_a);
4888 if (!PL_colorset) reginitcolors();
4889 PerlIO_printf(Perl_debug_log,
4890 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4892 prog->check_substr ? "" : "utf8 ",
4893 PL_colors[5],PL_colors[0],
4896 (strlen(s) > 60 ? "..." : ""));
4899 return prog->check_substr ? prog->check_substr : prog->check_utf8;
4903 Perl_pregfree(pTHX_ struct regexp *r)
4906 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4909 if (!r || (--r->refcnt > 0))
4915 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4916 r->prelen, 60, UNI_DISPLAY_REGEX)
4917 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4921 PerlIO_printf(Perl_debug_log,
4922 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4923 PL_colors[4],PL_colors[5],PL_colors[0],
4926 len > 60 ? "..." : "");
4930 Safefree(r->precomp);
4931 if (r->offsets) /* 20010421 MJD */
4932 Safefree(r->offsets);
4933 RX_MATCH_COPY_FREE(r);
4934 #ifdef PERL_COPY_ON_WRITE
4936 SvREFCNT_dec(r->saved_copy);
4939 if (r->anchored_substr)
4940 SvREFCNT_dec(r->anchored_substr);
4941 if (r->anchored_utf8)
4942 SvREFCNT_dec(r->anchored_utf8);
4943 if (r->float_substr)
4944 SvREFCNT_dec(r->float_substr);
4946 SvREFCNT_dec(r->float_utf8);
4947 Safefree(r->substrs);
4950 int n = r->data->count;
4951 PAD* new_comppad = NULL;
4955 /* If you add a ->what type here, update the comment in regcomp.h */
4956 switch (r->data->what[n]) {
4958 SvREFCNT_dec((SV*)r->data->data[n]);
4961 Safefree(r->data->data[n]);
4964 new_comppad = (AV*)r->data->data[n];
4967 if (new_comppad == NULL)
4968 Perl_croak(aTHX_ "panic: pregfree comppad");
4969 PAD_SAVE_LOCAL(old_comppad,
4970 /* Watch out for global destruction's random ordering. */
4971 (SvTYPE(new_comppad) == SVt_PVAV) ?
4972 new_comppad : Null(PAD *)
4974 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4975 op_free((OP_4tree*)r->data->data[n]);
4978 PAD_RESTORE_LOCAL(old_comppad);
4979 SvREFCNT_dec((SV*)new_comppad);
4985 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4988 Safefree(r->data->what);
4991 Safefree(r->startp);
4997 - regnext - dig the "next" pointer out of a node
4999 * [Note, when REGALIGN is defined there are two places in regmatch()
5000 * that bypass this code for speed.]
5003 Perl_regnext(pTHX_ register regnode *p)
5005 register I32 offset;
5007 if (p == &PL_regdummy)
5010 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5018 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
5021 STRLEN l1 = strlen(pat1);
5022 STRLEN l2 = strlen(pat2);
5031 Copy(pat1, buf, l1 , char);
5032 Copy(pat2, buf + l1, l2 , char);
5033 buf[l1 + l2] = '\n';
5034 buf[l1 + l2 + 1] = '\0';
5036 /* ANSI variant takes additional second argument */
5037 va_start(args, pat2);
5041 msv = vmess(buf, &args);
5043 message = SvPV(msv,l1);
5046 Copy(message, buf, l1 , char);
5047 buf[l1] = '\0'; /* Overwrite \n */
5048 Perl_croak(aTHX_ "%s", buf);
5051 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5054 Perl_save_re_context(pTHX)
5056 SAVEI32(PL_reg_flags); /* from regexec.c */
5058 SAVEPPTR(PL_reginput); /* String-input pointer. */
5059 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5060 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
5061 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5062 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5063 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
5064 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
5065 SAVEPPTR(PL_regtill); /* How far we are required to go. */
5066 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
5067 PL_reg_start_tmp = 0;
5068 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5069 PL_reg_start_tmpl = 0;
5070 SAVEVPTR(PL_regdata);
5071 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5072 SAVEI32(PL_regnarrate); /* from regexec.c */
5073 SAVEVPTR(PL_regprogram); /* from regexec.c */
5074 SAVEINT(PL_regindent); /* from regexec.c */
5075 SAVEVPTR(PL_regcc); /* from regexec.c */
5076 SAVEVPTR(PL_curcop);
5077 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5078 SAVEVPTR(PL_reg_re); /* from regexec.c */
5079 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5080 SAVESPTR(PL_reg_sv); /* from regexec.c */
5081 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
5082 SAVEVPTR(PL_reg_magic); /* from regexec.c */
5083 SAVEI32(PL_reg_oldpos); /* from regexec.c */
5084 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5085 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5086 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
5087 PL_reg_oldsaved = Nullch;
5088 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
5089 PL_reg_oldsavedlen = 0;
5090 #ifdef PERL_COPY_ON_WRITE
5094 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
5096 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
5097 PL_reg_leftiter = 0;
5098 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
5099 PL_reg_poscache = Nullch;
5100 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
5101 PL_reg_poscache_size = 0;
5102 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5103 SAVEI32(PL_regnpar); /* () count. */
5104 SAVEI32(PL_regsize); /* from regexec.c */
5107 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5113 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5114 for (i = 1; i <= rx->nparens; i++) {
5115 sprintf(digits, "%lu", (long)i);
5116 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5123 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
5128 clear_re(pTHX_ void *r)
5130 ReREFCNT_dec((regexp *)r);