5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
55 # define PERL_NO_GET_CONTEXT
60 * pregcomp and pregexec -- regsub and regerror are not used in perl
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
80 **** Alterations to Henry's code are...
82 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
83 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
85 **** You may distribute under the terms of either the GNU General Public
86 **** License or the Artistic License, as specified in the README file.
89 * Beware that some of this code is subtly aware of the way operator
90 * precedence is structured in regular expressions. Serious changes in
91 * regular-expression syntax might require a total rethink.
94 #define PERL_IN_REGCOMP_C
97 #ifndef PERL_IN_XSUB_RE
109 # if defined(BUGGY_MSC6)
110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
111 # pragma optimize("a",off)
112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
113 # pragma optimize("w",on )
114 # endif /* BUGGY_MSC6 */
118 #define STATIC static
121 typedef struct RExC_state_t {
122 U32 flags; /* are we folding, multilining? */
123 char *precomp; /* uncompiled string. */
125 char *start; /* Start of input for compile */
126 char *end; /* End of input for compile */
127 char *parse; /* Input-scan pointer. */
128 I32 whilem_seen; /* number of WHILEM in this expr */
129 regnode *emit_start; /* Start of emitted-code area */
130 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
131 I32 naughty; /* How bad is this pattern? */
132 I32 sawback; /* Did we see \1, ...? */
134 I32 size; /* Code size. */
135 I32 npar; /* () count. */
141 char *starttry; /* -Dr: where regtry was called. */
142 #define RExC_starttry (pRExC_state->starttry)
146 #define RExC_flags (pRExC_state->flags)
147 #define RExC_precomp (pRExC_state->precomp)
148 #define RExC_rx (pRExC_state->rx)
149 #define RExC_start (pRExC_state->start)
150 #define RExC_end (pRExC_state->end)
151 #define RExC_parse (pRExC_state->parse)
152 #define RExC_whilem_seen (pRExC_state->whilem_seen)
153 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
154 #define RExC_emit (pRExC_state->emit)
155 #define RExC_emit_start (pRExC_state->emit_start)
156 #define RExC_naughty (pRExC_state->naughty)
157 #define RExC_sawback (pRExC_state->sawback)
158 #define RExC_seen (pRExC_state->seen)
159 #define RExC_size (pRExC_state->size)
160 #define RExC_npar (pRExC_state->npar)
161 #define RExC_extralen (pRExC_state->extralen)
162 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163 #define RExC_seen_evals (pRExC_state->seen_evals)
164 #define RExC_utf8 (pRExC_state->utf8)
166 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
167 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
168 ((*s) == '{' && regcurly(s)))
171 #undef SPSTART /* dratted cpp namespace... */
174 * Flags to be passed up and down.
176 #define WORST 0 /* Worst case. */
177 #define HASWIDTH 0x1 /* Known to match non-null strings. */
178 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
179 #define SPSTART 0x4 /* Starts with * or +. */
180 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
182 /* Length of a variant. */
184 typedef struct scan_data_t {
190 I32 last_end; /* min value, <0 unless valid. */
193 SV **longest; /* Either &l_fixed, or &l_float. */
197 I32 offset_float_min;
198 I32 offset_float_max;
202 struct regnode_charclass_class *start_class;
206 * Forward declarations for pregcomp()'s friends.
209 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
212 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
213 #define SF_BEFORE_SEOL 0x1
214 #define SF_BEFORE_MEOL 0x2
215 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
216 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
219 # define SF_FIX_SHIFT_EOL (0+2)
220 # define SF_FL_SHIFT_EOL (0+4)
222 # define SF_FIX_SHIFT_EOL (+2)
223 # define SF_FL_SHIFT_EOL (+4)
226 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
227 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
229 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
230 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
231 #define SF_IS_INF 0x40
232 #define SF_HAS_PAR 0x80
233 #define SF_IN_PAR 0x100
234 #define SF_HAS_EVAL 0x200
235 #define SCF_DO_SUBSTR 0x400
236 #define SCF_DO_STCLASS_AND 0x0800
237 #define SCF_DO_STCLASS_OR 0x1000
238 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
239 #define SCF_WHILEM_VISITED_POS 0x2000
241 #define UTF (RExC_utf8 != 0)
242 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
243 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
245 #define OOB_UNICODE 12345678
246 #define OOB_NAMEDCLASS -1
248 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
249 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
252 /* length of regex to show in messages that don't mark a position within */
253 #define RegexLengthToShowInErrorMessages 127
256 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
257 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
258 * op/pragma/warn/regcomp.
260 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
261 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
263 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
266 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
267 * arg. Show regex, up to a maximum length. If it's too long, chop and add
270 #define FAIL(msg) STMT_START { \
271 char *ellipses = ""; \
272 IV len = RExC_end - RExC_precomp; \
275 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
276 if (len > RegexLengthToShowInErrorMessages) { \
277 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
278 len = RegexLengthToShowInErrorMessages - 10; \
281 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
282 msg, (int)len, RExC_precomp, ellipses); \
286 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
287 * args. Show regex, up to a maximum length. If it's too long, chop and add
290 #define FAIL2(pat,msg) STMT_START { \
291 char *ellipses = ""; \
292 IV len = RExC_end - RExC_precomp; \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 if (len > RegexLengthToShowInErrorMessages) { \
297 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
298 len = RegexLengthToShowInErrorMessages - 10; \
301 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
302 msg, (int)len, RExC_precomp, ellipses); \
307 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
309 #define Simple_vFAIL(m) STMT_START { \
310 IV offset = RExC_parse - RExC_precomp; \
311 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
312 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
316 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
318 #define vFAIL(m) STMT_START { \
320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
325 * Like Simple_vFAIL(), but accepts two arguments.
327 #define Simple_vFAIL2(m,a1) STMT_START { \
328 IV offset = RExC_parse - RExC_precomp; \
329 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
330 (int)offset, RExC_precomp, RExC_precomp + offset); \
334 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
336 #define vFAIL2(m,a1) STMT_START { \
338 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
339 Simple_vFAIL2(m, a1); \
344 * Like Simple_vFAIL(), but accepts three arguments.
346 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
347 IV offset = RExC_parse - RExC_precomp; \
348 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
349 (int)offset, RExC_precomp, RExC_precomp + offset); \
353 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 #define vFAIL3(m,a1,a2) STMT_START { \
357 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
358 Simple_vFAIL3(m, a1, a2); \
362 * Like Simple_vFAIL(), but accepts four arguments.
364 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
365 IV offset = RExC_parse - RExC_precomp; \
366 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
367 (int)offset, RExC_precomp, RExC_precomp + offset); \
371 * Like Simple_vFAIL(), but accepts five arguments.
373 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
374 IV offset = RExC_parse - RExC_precomp; \
375 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
376 (int)offset, RExC_precomp, RExC_precomp + offset); \
380 #define vWARN(loc,m) STMT_START { \
381 IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
383 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
386 #define vWARNdep(loc,m) STMT_START { \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
389 "%s" REPORT_LOCATION, \
390 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
394 #define vWARN2(loc, m, a1) STMT_START { \
395 IV offset = loc - RExC_precomp; \
396 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
397 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
400 #define vWARN3(loc, m, a1, a2) STMT_START { \
401 IV offset = loc - RExC_precomp; \
402 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
403 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
406 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
407 IV offset = loc - RExC_precomp; \
408 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
409 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
412 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
413 IV offset = loc - RExC_precomp; \
414 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
415 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
419 /* Allow for side effects in s */
420 #define REGC(c,s) STMT_START { \
421 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
424 /* Macros for recording node offsets. 20001227 mjd@plover.com
425 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
426 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
427 * Element 0 holds the number n.
430 #define MJD_OFFSET_DEBUG(x)
431 /* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
434 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
436 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
437 __LINE__, (node), (byte))); \
439 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
441 RExC_offsets[2*(node)-1] = (byte); \
446 #define Set_Node_Offset(node,byte) \
447 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
448 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
450 #define Set_Node_Length_To_R(node,len) STMT_START { \
452 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
453 __LINE__, (node), (len))); \
455 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
457 RExC_offsets[2*(node)] = (len); \
462 #define Set_Node_Length(node,len) \
463 Set_Node_Length_To_R((node)-RExC_emit_start, len)
464 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
465 #define Set_Node_Cur_Length(node) \
466 Set_Node_Length(node, RExC_parse - parse_start)
468 /* Get offsets and lengths */
469 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
470 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
472 static void clear_re(pTHX_ void *r);
474 /* Mark that we cannot extend a found fixed substring at this point.
475 Updata the longest found anchored substring and the longest found
476 floating substrings if needed. */
479 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
481 STRLEN l = CHR_SVLEN(data->last_found);
482 STRLEN old_l = CHR_SVLEN(*data->longest);
484 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
485 SvSetMagicSV(*data->longest, data->last_found);
486 if (*data->longest == data->longest_fixed) {
487 data->offset_fixed = l ? data->last_start_min : data->pos_min;
488 if (data->flags & SF_BEFORE_EOL)
490 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
492 data->flags &= ~SF_FIX_BEFORE_EOL;
495 data->offset_float_min = l ? data->last_start_min : data->pos_min;
496 data->offset_float_max = (l
497 ? data->last_start_max
498 : data->pos_min + data->pos_delta);
499 if ((U32)data->offset_float_max > (U32)I32_MAX)
500 data->offset_float_max = I32_MAX;
501 if (data->flags & SF_BEFORE_EOL)
503 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
505 data->flags &= ~SF_FL_BEFORE_EOL;
508 SvCUR_set(data->last_found, 0);
510 SV * sv = data->last_found;
512 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
513 if (mg && mg->mg_len > 0)
517 data->flags &= ~SF_BEFORE_EOL;
520 /* Can match anything (initialization) */
522 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 ANYOF_CLASS_ZERO(cl);
525 ANYOF_BITMAP_SETALL(cl);
526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
528 cl->flags |= ANYOF_LOCALE;
531 /* Can match anything (initialization) */
533 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
537 for (value = 0; value <= ANYOF_MAX; value += 2)
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
547 /* Can match anything (initialization) */
549 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
551 Zero(cl, 1, struct regnode_charclass_class);
553 cl_anything(pRExC_state, cl);
557 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
559 Zero(cl, 1, struct regnode_charclass_class);
561 cl_anything(pRExC_state, cl);
563 cl->flags |= ANYOF_LOCALE;
566 /* 'And' a given class with another one. Can create false positives */
567 /* We assume that cl is not inverted */
569 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
590 !(and_with->flags & ANYOF_INVERT)) {
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
595 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
596 !(and_with->flags & ANYOF_INVERT))
597 cl->flags &= ~ANYOF_UNICODE_ALL;
598 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
599 !(and_with->flags & ANYOF_INVERT))
600 cl->flags &= ~ANYOF_UNICODE;
603 /* 'OR' a given class with another one. Can create false positives */
604 /* We assume that cl is not inverted */
606 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
608 if (or_with->flags & ANYOF_INVERT) {
610 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
611 * <= (B1 | !B2) | (CL1 | !CL2)
612 * which is wasteful if CL2 is small, but we ignore CL2:
613 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
614 * XXXX Can we handle case-fold? Unclear:
615 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
616 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
618 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
619 && !(or_with->flags & ANYOF_FOLD)
620 && !(cl->flags & ANYOF_FOLD) ) {
623 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624 cl->bitmap[i] |= ~or_with->bitmap[i];
625 } /* XXXX: logic is complicated otherwise */
627 cl_anything(pRExC_state, cl);
630 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
631 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
632 && (!(or_with->flags & ANYOF_FOLD)
633 || (cl->flags & ANYOF_FOLD)) ) {
636 /* OR char bitmap and class bitmap separately */
637 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
638 cl->bitmap[i] |= or_with->bitmap[i];
639 if (or_with->flags & ANYOF_CLASS) {
640 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
641 cl->classflags[i] |= or_with->classflags[i];
642 cl->flags |= ANYOF_CLASS;
645 else { /* XXXX: logic is complicated, leave it along for a moment. */
646 cl_anything(pRExC_state, cl);
649 if (or_with->flags & ANYOF_EOS)
650 cl->flags |= ANYOF_EOS;
652 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
653 ARG(cl) != ARG(or_with)) {
654 cl->flags |= ANYOF_UNICODE_ALL;
655 cl->flags &= ~ANYOF_UNICODE;
657 if (or_with->flags & ANYOF_UNICODE_ALL) {
658 cl->flags |= ANYOF_UNICODE_ALL;
659 cl->flags &= ~ANYOF_UNICODE;
665 make_trie(startbranch,first,last,tail,flags)
666 startbranch: the first branch in the whole branch sequence
667 first : start branch of sequence of branch-exact nodes.
668 May be the same as startbranch
669 last : Thing following the last branch.
670 May be the same as tail.
671 tail : item following the branch sequence
672 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
674 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
676 A trie is an N'ary tree where the branches are determined by digital
677 decomposition of the key. IE, at the root node you look up the 1st character and
678 follow that branch repeat until you find the end of the branches. Nodes can be
679 marked as "accepting" meaning they represent a complete word. Eg:
683 would convert into the following structure. Numbers represent states, letters
684 following numbers represent valid transitions on the letter from that state, if
685 the number is in square brackets it represents an accepting state, otherwise it
686 will be in parenthesis.
688 +-h->+-e->[3]-+-r->(8)-+-s->[9]
692 (1) +-i->(6)-+-s->[7]
694 +-s->(3)-+-h->(4)-+-e->[5]
696 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
698 This shows that when matching against the string 'hers' we will begin at state 1
699 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
700 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
701 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
702 single traverse. We store a mapping from accepting to state to which word was
703 matched, and then when we have multiple possibilities we try to complete the
704 rest of the regex in the order in which they occured in the alternation.
706 The only prior NFA like behaviour that would be changed by the TRIE support is
707 the silent ignoring of duplicate alternations which are of the form:
709 / (DUPE|DUPE) X? (?{ ... }) Y /x
711 Thus EVAL blocks follwing a trie may be called a different number of times with
712 and without the optimisation. With the optimisations dupes will be silently
713 ignored. This inconsistant behaviour of EVAL type nodes is well established as
714 the following demonstrates:
716 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
718 which prints out 'word' three times, but
720 'words'=~/(word|word|word)(?{ print $1 })S/
722 which doesnt print it out at all. This is due to other optimisations kicking in.
724 Example of what happens on a structural level:
726 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
728 1: CURLYM[1] {1,32767}(18)
739 This would be optimizable with startbranch=5, first=5, last=16, tail=16
740 and should turn into:
742 1: CURLYM[1] {1,32767}(18)
744 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
752 Cases where tail != last would be like /(?foo|bar)baz/:
762 which would be optimizable with startbranch=1, first=1, last=7, tail=8
763 and would end up looking like:
766 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
775 #define TRIE_DEBUG_CHAR \
776 DEBUG_TRIE_COMPILE_r({ \
779 tmp = newSVpv( "", 0 ); \
780 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
782 tmp = Perl_newSVpvf_nocontext( "%c", uvc ); \
784 av_push( trie->revcharmap, tmp ); \
787 #define TRIE_READ_CHAR STMT_START { \
790 if ( foldlen > 0 ) { \
791 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
796 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags); \
797 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
798 foldlen -= UNISKIP( uvc ); \
799 scan = foldbuf + UNISKIP( uvc ); \
802 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags); \
811 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
812 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
813 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
814 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
816 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
817 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
818 TRIE_LIST_LEN( state ) *= 2; \
819 Renew( trie->states[ state ].trans.list, \
820 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
822 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
823 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
824 TRIE_LIST_CUR( state )++; \
827 #define TRIE_LIST_NEW(state) STMT_START { \
828 Newz( 1023, trie->states[ state ].trans.list, \
829 4, reg_trie_trans_le ); \
830 TRIE_LIST_CUR( state ) = 1; \
831 TRIE_LIST_LEN( state ) = 4; \
835 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
837 /* first pass, loop through and scan words */
840 U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
845 /* we just use folder as a flag in utf8 */
846 const U8 *folder=( flags == EXACTF
854 U32 data_slot = add_data( pRExC_state, 1, "t" );
857 GET_RE_DEBUG_FLAGS_DECL;
859 Newz( 848200, trie, 1, reg_trie_data );
861 RExC_rx->data->data[ data_slot ] = (void*)trie;
862 Newz( 848201, trie->charmap, 256, U16 );
864 trie->words = newAV();
865 trie->revcharmap = newAV();
869 re_trie_maxbuff=get_sv(RE_TRIE_MAXBUFF, 1);
870 if (!SvIOK(re_trie_maxbuff)) {
871 sv_setiv(re_trie_maxbuff, TRIE_SIMPLE_MAX_BUFF);
874 /* -- First loop and Setup --
876 We first traverse the branches and scan each word to determine if it
877 contains widechars, and how many unique chars there are, this is
878 important as we have to build a table with at least as many columns as we
881 We use an array of integers to represent the character codes 0..255
882 (trie->charmap) and we use a an HV* to store unicode characters. We use the
883 native representation of the character value as the key and IV's for the
886 *TODO* If we keep track of how many times each character is used we can
887 remap the columns so that the table compression later on is more
888 efficient in terms of memory by ensuring most common value is in the
889 middle and the least common are on the outside. IMO this would be better
890 than a most to least common mapping as theres a decent chance the most
891 common letter will share a node with the least common, meaning the node
892 will not be compressable. With a middle is most common approach the worst
893 case is when we have the least common nodes twice.
898 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
899 regnode *noper = NEXTOPER( cur );
900 U8 *uc = (U8*)STRING( noper );
901 U8 *e = uc + STR_LEN( noper );
903 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
906 for ( ; uc < e ; uc += len ) {
910 if ( !trie->charmap[ uvc ] ) {
911 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
913 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
918 if ( !trie->widecharmap )
919 trie->widecharmap = newHV();
921 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
924 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%X", uvc );
926 if ( !SvTRUE( *svpp ) ) {
927 sv_setiv( *svpp, ++trie->uniquecharcount );
933 } /* end first pass */
934 DEBUG_TRIE_COMPILE_r(
935 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
936 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
937 trie->charcount, trie->uniquecharcount )
942 We now know what we are dealing with in terms of unique chars and
943 string sizes so we can calculate how much memory a naive
944 representation using a flat table will take. If its over a reasonable
945 limit (as specified by $^RE_TRIE_MAXBUFF) we use a more memory
946 conservative but potentially much slower representation using an array
949 At the end we convert both representations into the same compressed
950 form that will be used in regexec.c for matching with. The latter
951 is a form that cannot be used to construct with but has memory
952 properties similar to the list form and access properties similar
953 to the table form making it both suitable for fast searches and
954 small enough that its feasable to store for the duration of a program.
956 See the comment in the code where the compressed table is produced
957 inplace from the flat tabe representation for an explanation of how
958 the compression works.
963 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
965 Second Pass -- Array Of Lists Representation
967 Each state will be represented by a list of charid:state records
968 (reg_trie_trans_le) the first such element holds the CUR and LEN
969 points of the allocated array. (See defines above).
971 We build the initial structure using the lists, and then convert
972 it into the compressed table form which allows faster lookups
973 (but cant be modified once converted).
979 STRLEN transcount = 1;
981 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
985 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
987 regnode *noper = NEXTOPER( cur );
988 U8 *uc = (U8*)STRING( noper );
989 U8 *e = uc + STR_LEN( noper );
990 U32 state = 1; /* required init */
991 U16 charid = 0; /* sanity init */
992 U8 *scan = (U8*)NULL; /* sanity init */
993 STRLEN foldlen = 0; /* required init */
994 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
997 for ( ; uc < e ; uc += len ) {
1002 charid = trie->charmap[ uvc ];
1004 SV** svpp=(SV**)NULL;
1005 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1009 charid=(U16)SvIV( *svpp );
1018 if ( !trie->states[ state ].trans.list ) {
1019 TRIE_LIST_NEW( state );
1021 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1022 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1023 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1028 newstate = next_alloc++;
1029 TRIE_LIST_PUSH( state, charid, newstate );
1035 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %d", uvc );
1037 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1040 if ( !trie->states[ state ].wordnum ) {
1041 /* we havent inserted this word into the structure yet. */
1042 trie->states[ state ].wordnum = ++curword;
1045 /* store the word for dumping */
1046 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1047 if ( UTF ) SvUTF8_on( tmp );
1048 av_push( trie->words, tmp );
1052 /* Its a dupe. So ignore it. */
1055 } /* end second pass */
1057 trie->laststate = next_alloc;
1058 Renew( trie->states, next_alloc, reg_trie_state );
1060 DEBUG_TRIE_COMPILE_MORE_r({
1065 print out the table precompression.
1068 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1069 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1071 for( state=1 ; state < next_alloc ; state ++ ) {
1073 PerlIO_printf( Perl_debug_log, "\n %04X :", state );
1074 if ( ! trie->states[ state ].wordnum ) {
1075 PerlIO_printf( Perl_debug_log, "%5s| ","");
1077 PerlIO_printf( Perl_debug_log, "W%04X| ",
1078 trie->states[ state ].wordnum
1081 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1082 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1083 PerlIO_printf( Perl_debug_log, "%s:%3X=%04X | ",
1085 TRIE_LIST_ITEM(state,charid).forid,
1086 TRIE_LIST_ITEM(state,charid).newstate
1091 PerlIO_printf( Perl_debug_log, "\n\n" );
1094 Newz( 848203, trie->trans, transcount ,reg_trie_trans );
1102 for( state=1 ; state < next_alloc ; state ++ ) {
1106 DEBUG_TRIE_COMPILE_MORE_r(
1107 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1111 if (trie->states[state].trans.list) {
1112 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1116 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1117 if ( TRIE_LIST_ITEM( state, idx).forid < minid ) {
1118 minid=TRIE_LIST_ITEM( state, idx).forid;
1119 } else if ( TRIE_LIST_ITEM( state, idx).forid > maxid ) {
1120 maxid=TRIE_LIST_ITEM( state, idx).forid;
1123 if ( transcount < tp + maxid - minid + 1) {
1125 Renew( trie->trans, transcount, reg_trie_trans );
1126 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1128 base = trie->uniquecharcount + tp - minid;
1129 if ( maxid == minid ) {
1131 for ( ; zp < tp ; zp++ ) {
1132 if ( ! trie->trans[ zp ].next ) {
1133 base = trie->uniquecharcount + zp - minid;
1134 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1135 trie->trans[ zp ].check = state;
1141 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1142 trie->trans[ tp ].check = state;
1147 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1148 U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1149 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1150 trie->trans[ tid ].check = state;
1152 tp += ( maxid - minid + 1 );
1154 Safefree(trie->states[ state ].trans.list);
1157 DEBUG_TRIE_COMPILE_MORE_r(
1158 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1161 trie->states[ state ].trans.base=base;
1163 Renew( trie->trans, tp + 1, reg_trie_trans );
1168 Second Pass -- Flat Table Representation.
1170 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1171 We know that we will need Charcount+1 trans at most to store the data
1172 (one row per char at worst case) So we preallocate both structures
1173 assuming worst case.
1175 We then construct the trie using only the .next slots of the entry
1178 We use the .check field of the first entry of the node temporarily to
1179 make compression both faster and easier by keeping track of how many non
1180 zero fields are in the node.
1182 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1185 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1186 number representing the first entry of the node, and state as a
1187 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1188 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1189 are 2 entrys per node. eg:
1197 The table is internally in the right hand, idx form. However as we also
1198 have to deal with the states array which is indexed by nodenum we have to
1199 use TRIE_NODENUM() to convert.
1203 Newz( 848203, trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1205 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
1206 next_alloc = trie->uniquecharcount + 1;
1208 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1210 regnode *noper = NEXTOPER( cur );
1211 U8 *uc = (U8*)STRING( noper );
1212 U8 *e = uc + STR_LEN( noper );
1214 U32 state = 1; /* required init */
1216 U16 charid = 0; /* sanity init */
1217 U32 accept_state = 0; /* sanity init */
1218 U8 *scan = (U8*)NULL; /* sanity init */
1220 STRLEN foldlen = 0; /* required init */
1221 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1224 for ( ; uc < e ; uc += len ) {
1229 charid = trie->charmap[ uvc ];
1231 SV** svpp=(SV**)NULL;
1232 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1236 charid=(U16)SvIV( *svpp );
1241 if ( !trie->trans[ state + charid ].next ) {
1242 trie->trans[ state + charid ].next = next_alloc;
1243 trie->trans[ state ].check++;
1244 next_alloc += trie->uniquecharcount;
1246 state = trie->trans[ state + charid ].next;
1248 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %d", uvc );
1250 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1253 accept_state = TRIE_NODENUM( state );
1254 if ( !trie->states[ accept_state ].wordnum ) {
1255 /* we havent inserted this word into the structure yet. */
1256 trie->states[ accept_state ].wordnum = ++curword;
1259 /* store the word for dumping */
1260 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1261 if ( UTF ) SvUTF8_on( tmp );
1262 av_push( trie->words, tmp );
1266 /* Its a dupe. So ignore it. */
1269 } /* end second pass */
1271 DEBUG_TRIE_COMPILE_MORE_r({
1273 print out the table precompression so that we can do a visual check
1274 that they are identical.
1278 PerlIO_printf( Perl_debug_log, "\nChar : " );
1280 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1281 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1283 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1287 PerlIO_printf( Perl_debug_log, "\nState+-" );
1289 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1290 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1293 PerlIO_printf( Perl_debug_log, "\n" );
1295 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1297 PerlIO_printf( Perl_debug_log, "%04X : ", TRIE_NODENUM( state ) );
1299 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1300 PerlIO_printf( Perl_debug_log, "%04X ",
1301 SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1303 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1304 PerlIO_printf( Perl_debug_log, " (%04X)\n", trie->trans[ state ].check );
1306 PerlIO_printf( Perl_debug_log, " (%04X) W%04X\n", trie->trans[ state ].check,
1307 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1310 PerlIO_printf( Perl_debug_log, "\n\n" );
1314 * Inplace compress the table.*
1316 For sparse data sets the table constructed by the trie algorithm will
1317 be mostly 0/FAIL transitions or to put it another way mostly empty.
1318 (Note that leaf nodes will not contain any transitions.)
1320 This algorithm compresses the tables by eliminating most such
1321 transitions, at the cost of a modest bit of extra work during lookup:
1323 - Each states[] entry contains a .base field which indicates the
1324 index in the state[] array wheres its transition data is stored.
1326 - If .base is 0 there are no valid transitions from that node.
1328 - If .base is nonzero then charid is added to it to find an entry in
1331 -If trans[states[state].base+charid].check!=state then the
1332 transition is taken to be a 0/Fail transition. Thus if there are fail
1333 transitions at the front of the node then the .base offset will point
1334 somewhere inside the previous nodes data (or maybe even into a node
1335 even earlier), but the .check field determines if the transition is
1338 The following process inplace converts the table to the compressed
1339 table: We first do not compress the root node 1,and mark its all its
1340 .check pointers as 1 and set its .base pointer as 1 as well. This
1341 allows to do a DFA construction from the compressed table later, and
1342 ensures that any .base pointers we calculate later are greater than
1345 - We set 'pos' to indicate the first entry of the second node.
1347 - We then iterate over the columns of the node, finding the first and
1348 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1349 and set the .check pointers accordingly, and advance pos
1350 appropriately and repreat for the next node. Note that when we copy
1351 the next pointers we have to convert them from the original
1352 NODEIDX form to NODENUM form as the former is not valid post
1355 - If a node has no transitions used we mark its base as 0 and do not
1356 advance the pos pointer.
1358 - If a node only has one transition we use a second pointer into the
1359 structure to fill in allocated fail transitions from other states.
1360 This pointer is independent of the main pointer and scans forward
1361 looking for null transitions that are allocated to a state. When it
1362 finds one it writes the single transition into the "hole". If the
1363 pointer doesnt find one the single transition is appeneded as normal.
1365 - Once compressed we can Renew/realloc the structures to release the
1368 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1369 specifically Fig 3.47 and the associated pseudocode.
1373 U32 laststate = TRIE_NODENUM( next_alloc );
1374 U32 used , state, charid;
1376 trie->laststate = laststate;
1378 for ( state = 1 ; state < laststate ; state++ ) {
1380 U32 stateidx = TRIE_NODEIDX( state );
1381 U32 o_used=trie->trans[ stateidx ].check;
1382 used = trie->trans[ stateidx ].check;
1383 trie->trans[ stateidx ].check = 0;
1385 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1386 if ( flag || trie->trans[ stateidx + charid ].next ) {
1387 if ( trie->trans[ stateidx + charid ].next ) {
1389 for ( ; zp < pos ; zp++ ) {
1390 if ( ! trie->trans[ zp ].next ) {
1394 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1395 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1396 trie->trans[ zp ].check = state;
1397 if ( ++zp > pos ) pos = zp;
1404 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1406 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1407 trie->trans[ pos ].check = state;
1412 Renew( trie->trans, pos + 1, reg_trie_trans);
1413 Renew( trie->states, laststate + 1, reg_trie_state);
1414 DEBUG_TRIE_COMPILE_MORE_r(
1415 PerlIO_printf( Perl_debug_log, " Alloc: %d Orig: %d elements, Final:%d. Savings of %%%5.2f\n",
1416 ( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ), next_alloc, pos,
1417 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1420 } /* end table compress */
1423 DEBUG_TRIE_COMPILE_r({
1426 Now we print it out again, in a slightly different form as there is additional
1427 info we want to be able to see when its compressed. They are close enough for
1428 visual comparison though.
1430 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1432 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1433 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1435 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1438 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
1439 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1440 PerlIO_printf( Perl_debug_log, "-----");
1441 PerlIO_printf( Perl_debug_log, "\n");
1442 for( state = 1 ; state < trie->laststate ; state++ ) {
1443 U32 base = trie->states[ state ].trans.base;
1445 PerlIO_printf( Perl_debug_log, "#%04X ", state);
1447 if ( trie->states[ state ].wordnum ) {
1448 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1450 PerlIO_printf( Perl_debug_log, "%6s", "" );
1453 PerlIO_printf( Perl_debug_log, " @%04X ", base );
1458 while( ( base + ofs - trie->uniquecharcount ) >=0 &&
1459 trie->trans[ base + ofs - trie->uniquecharcount ].check != state )
1462 PerlIO_printf( Perl_debug_log, "+%02X[ ", ofs);
1464 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1465 if ( ( base + ofs - trie->uniquecharcount>=0) &&
1466 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1468 PerlIO_printf( Perl_debug_log, "%04X ",
1469 trie->trans[ base + ofs - trie->uniquecharcount ].next );
1471 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1475 PerlIO_printf( Perl_debug_log, "]", ofs);
1478 PerlIO_printf( Perl_debug_log, "\n" );
1483 /* now finally we "stitch in" the new TRIE node
1484 This means we convert either the first branch or the first Exact,
1485 depending on whether the thing following (in 'last') is a branch
1486 or not and whther first is the startbranch (ie is it a sub part of
1487 the alternation or is it the whole thing.)
1488 Assuming its a sub part we conver the EXACT otherwise we convert
1489 the whole branch sequence, including the first.
1496 if ( first == startbranch && OP( last ) != BRANCH ) {
1499 convert = NEXTOPER( first );
1500 NEXT_OFF( first ) = (U16)(last - first);
1503 OP( convert ) = TRIE + (U8)( flags - EXACT );
1504 NEXT_OFF( convert ) = (U16)(tail - convert);
1505 ARG_SET( convert, data_slot );
1507 /* tells us if we need to handle accept buffers specially */
1508 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1511 /* needed for dumping*/
1513 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1514 /* We now need to mark all of the space originally used by the
1515 branches as optimized away. This keeps the dumpuntil from
1516 throwing a wobbly as it doesnt use regnext() to traverse the
1519 while( optimize < last ) {
1520 OP( optimize ) = OPTIMIZED;
1524 } /* end node insert */
1531 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1532 * These need to be revisited when a newer toolchain becomes available.
1534 #if defined(__sparc64__) && defined(__GNUC__)
1535 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1536 # undef SPARC64_GCC_WORKAROUND
1537 # define SPARC64_GCC_WORKAROUND 1
1541 /* REx optimizer. Converts nodes into quickier variants "in place".
1542 Finds fixed substrings. */
1544 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
1545 to the position after last scanned or to NULL. */
1549 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags, U32 depth)
1550 /* scanp: Start here (read-write). */
1551 /* deltap: Write maxlen-minlen here. */
1552 /* last: Stop before this one. */
1554 I32 min = 0, pars = 0, code;
1555 regnode *scan = *scanp, *next;
1557 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1558 int is_inf_internal = 0; /* The studied chunk is infinite */
1559 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1560 scan_data_t data_fake;
1561 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
1562 SV *re_trie_maxbuff = NULL;
1564 GET_RE_DEBUG_FLAGS_DECL;
1566 while (scan && OP(scan) != END && scan < last) {
1567 /* Peephole optimizer: */
1569 SV *mysv=sv_newmortal();
1570 regprop( mysv, scan);
1571 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08X)\n",depth*2,"",SvPV_nolen(mysv),scan);
1574 if (PL_regkind[(U8)OP(scan)] == EXACT) {
1575 /* Merge several consecutive EXACTish nodes into one. */
1576 regnode *n = regnext(scan);
1579 regnode *stop = scan;
1582 next = scan + NODE_SZ_STR(scan);
1583 /* Skip NOTHING, merge EXACT*. */
1585 ( PL_regkind[(U8)OP(n)] == NOTHING ||
1586 (stringok && (OP(n) == OP(scan))))
1588 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1589 if (OP(n) == TAIL || n > next)
1591 if (PL_regkind[(U8)OP(n)] == NOTHING) {
1592 NEXT_OFF(scan) += NEXT_OFF(n);
1593 next = n + NODE_STEP_REGNODE;
1600 else if (stringok) {
1601 int oldl = STR_LEN(scan);
1602 regnode *nnext = regnext(n);
1604 if (oldl + STR_LEN(n) > U8_MAX)
1606 NEXT_OFF(scan) += NEXT_OFF(n);
1607 STR_LEN(scan) += STR_LEN(n);
1608 next = n + NODE_SZ_STR(n);
1609 /* Now we can overwrite *n : */
1610 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1618 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1620 Two problematic code points in Unicode casefolding of EXACT nodes:
1622 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1623 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1629 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1630 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1632 This means that in case-insensitive matching (or "loose matching",
1633 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1634 length of the above casefolded versions) can match a target string
1635 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1636 This would rather mess up the minimum length computation.
1638 What we'll do is to look for the tail four bytes, and then peek
1639 at the preceding two bytes to see whether we need to decrease
1640 the minimum length by four (six minus two).
1642 Thanks to the design of UTF-8, there cannot be false matches:
1643 A sequence of valid UTF-8 bytes cannot be a subsequence of
1644 another valid sequence of UTF-8 bytes.
1647 char *s0 = STRING(scan), *s, *t;
1648 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
1649 char *t0 = "\xcc\x88\xcc\x81";
1653 s < s2 && (t = ninstr(s, s1, t0, t1));
1655 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1656 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1663 n = scan + NODE_SZ_STR(scan);
1665 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
1676 /* Follow the next-chain of the current node and optimize
1677 away all the NOTHINGs from it. */
1678 if (OP(scan) != CURLYX) {
1679 int max = (reg_off_by_arg[OP(scan)]
1681 /* I32 may be smaller than U16 on CRAYs! */
1682 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
1683 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1687 /* Skip NOTHING and LONGJMP. */
1688 while ((n = regnext(n))
1689 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
1690 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1691 && off + noff < max)
1693 if (reg_off_by_arg[OP(scan)])
1696 NEXT_OFF(scan) = off;
1699 /* The principal pseudo-switch. Cannot be a switch, since we
1700 look into several different things. */
1701 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
1702 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1703 next = regnext(scan);
1705 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
1707 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
1708 I32 max1 = 0, min1 = I32_MAX, num = 0;
1709 struct regnode_charclass_class accum;
1710 regnode *startbranch=scan;
1712 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1713 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
1714 if (flags & SCF_DO_STCLASS)
1715 cl_init_zero(pRExC_state, &accum);
1717 while (OP(scan) == code) {
1718 I32 deltanext, minnext, f = 0, fake;
1719 struct regnode_charclass_class this_class;
1722 data_fake.flags = 0;
1724 data_fake.whilem_c = data->whilem_c;
1725 data_fake.last_closep = data->last_closep;
1728 data_fake.last_closep = &fake;
1729 next = regnext(scan);
1730 scan = NEXTOPER(scan);
1732 scan = NEXTOPER(scan);
1733 if (flags & SCF_DO_STCLASS) {
1734 cl_init(pRExC_state, &this_class);
1735 data_fake.start_class = &this_class;
1736 f = SCF_DO_STCLASS_AND;
1738 if (flags & SCF_WHILEM_VISITED_POS)
1739 f |= SCF_WHILEM_VISITED_POS;
1741 /* we suppose the run is continuous, last=next...*/
1742 minnext = study_chunk(pRExC_state, &scan, &deltanext,
1743 next, &data_fake, f,depth+1);
1746 if (max1 < minnext + deltanext)
1747 max1 = minnext + deltanext;
1748 if (deltanext == I32_MAX)
1749 is_inf = is_inf_internal = 1;
1751 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1753 if (data && (data_fake.flags & SF_HAS_EVAL))
1754 data->flags |= SF_HAS_EVAL;
1756 data->whilem_c = data_fake.whilem_c;
1757 if (flags & SCF_DO_STCLASS)
1758 cl_or(pRExC_state, &accum, &this_class);
1759 if (code == SUSPEND)
1762 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1764 if (flags & SCF_DO_SUBSTR) {
1765 data->pos_min += min1;
1766 data->pos_delta += max1 - min1;
1767 if (max1 != min1 || is_inf)
1768 data->longest = &(data->longest_float);
1771 delta += max1 - min1;
1772 if (flags & SCF_DO_STCLASS_OR) {
1773 cl_or(pRExC_state, data->start_class, &accum);
1775 cl_and(data->start_class, &and_with);
1776 flags &= ~SCF_DO_STCLASS;
1779 else if (flags & SCF_DO_STCLASS_AND) {
1781 cl_and(data->start_class, &accum);
1782 flags &= ~SCF_DO_STCLASS;
1785 /* Switch to OR mode: cache the old value of
1786 * data->start_class */
1787 StructCopy(data->start_class, &and_with,
1788 struct regnode_charclass_class);
1789 flags &= ~SCF_DO_STCLASS_AND;
1790 StructCopy(&accum, data->start_class,
1791 struct regnode_charclass_class);
1792 flags |= SCF_DO_STCLASS_OR;
1793 data->start_class->flags |= ANYOF_EOS;
1799 Assuming this was/is a branch we are dealing with: 'scan' now
1800 points at the item that follows the branch sequence, whatever
1801 it is. We now start at the beginning of the sequence and look
1807 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1809 If we can find such a subseqence we need to turn the first
1810 element into a trie and then add the subsequent branch exact
1811 strings to the trie.
1815 1. patterns where the whole set of branch can be converted to a trie,
1817 2. patterns where only a subset of the alternations can be
1818 converted to a trie.
1820 In case 1 we can replace the whole set with a single regop
1821 for the trie. In case 2 we need to keep the start and end
1824 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1825 becomes BRANCH TRIE; BRANCH X;
1827 Hypthetically when we know the regex isnt anchored we can
1828 turn a case 1 into a DFA and let it rip... Every time it finds a match
1829 it would just call its tail, no WHILEM/CURLY needed.
1833 if (!re_trie_maxbuff) {
1834 re_trie_maxbuff=get_sv(RE_TRIE_MAXBUFF, 1);
1835 if (!SvIOK(re_trie_maxbuff))
1836 sv_setiv(re_trie_maxbuff, TRIE_SIMPLE_MAX_BUFF);
1839 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1841 regnode *first = (regnode *)NULL;
1842 regnode *last = (regnode *)NULL;
1843 regnode *tail = scan;
1848 SV *mysv = sv_newmortal(); /* for dumping */
1850 /* var tail is used because there may be a TAIL
1851 regop in the way. Ie, the exacts will point to the
1852 thing following the TAIL, but the last branch will
1853 point at the TAIL. So we advance tail. If we
1854 have nested (?:) we may have to move through several
1858 while ( OP( tail ) == TAIL ) {
1859 /* this is the TAIL generated by (?:) */
1860 tail = regnext( tail );
1864 regprop( mysv, tail );
1865 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
1866 depth * 2 + 2, "", "Tail node is:", SvPV_nolen( mysv ),
1867 (RExC_seen_evals) ? "[EVAL]" : ""
1872 step through the branches, cur represents each
1873 branch, noper is the first thing to be matched
1874 as part of that branch and noper_next is the
1875 regnext() of that node. if noper is an EXACT
1876 and noper_next is the same as scan (our current
1877 position in the regex) then the EXACT branch is
1878 a possible optimization target. Once we have
1879 two or more consequetive such branches we can
1880 create a trie of the EXACT's contents and stich
1881 it in place. If the sequence represents all of
1882 the branches we eliminate the whole thing and
1883 replace it with a single TRIE. If it is a
1884 subsequence then we need to stitch it in. This
1885 means the first branch has to remain, and needs
1886 to be repointed at the item on the branch chain
1887 following the last branch optimized. This could
1888 be either a BRANCH, in which case the
1889 subsequence is internal, or it could be the
1890 item following the branch sequence in which
1891 case the subsequence is at the end.
1895 /* dont use tail as the end marker for this traverse */
1896 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1897 regnode *noper = NEXTOPER( cur );
1898 regnode *noper_next = regnext( noper );
1902 regprop( mysv, cur);
1903 PerlIO_printf( Perl_debug_log, "%*s%s",
1904 depth * 2 + 2," ", SvPV_nolen( mysv ) );
1906 regprop( mysv, noper);
1907 PerlIO_printf( Perl_debug_log, " -> %s",
1911 regprop( mysv, noper_next );
1912 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1915 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1918 if ( ( first ? OP( noper ) == optype
1919 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1920 && noper_next == tail && count<U16_MAX)
1925 optype = OP( noper );
1929 regprop( mysv, first);
1930 PerlIO_printf( Perl_debug_log, "%*s%s",
1931 depth * 2 + 2, "F:", SvPV_nolen( mysv ) );
1932 regprop( mysv, NEXTOPER(first) );
1933 PerlIO_printf( Perl_debug_log, " -> %s\n",
1934 SvPV_nolen( mysv ) );
1939 regprop( mysv, cur);
1940 PerlIO_printf( Perl_debug_log, "%*s%s",
1941 depth * 2 + 2, "N:", SvPV_nolen( mysv ) );
1942 regprop( mysv, noper );
1943 PerlIO_printf( Perl_debug_log, " -> %s\n",
1944 SvPV_nolen( mysv ) );
1950 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1951 depth * 2 + 2, "E:", "**END**" );
1953 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1955 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1956 && noper_next == tail )
1960 optype = OP( noper );
1970 regprop( mysv, cur);
1971 PerlIO_printf( Perl_debug_log,
1972 "%*s%s\t(0x%p,0x%p,0x%p)\n", depth * 2 + 2,
1973 " ", SvPV_nolen( mysv ), first, last, cur);
1978 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1979 depth * 2 + 2, "E:", "==END==" );
1981 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1986 else if ( code == BRANCHJ ) { /* single branch is optimized. */
1987 scan = NEXTOPER(NEXTOPER(scan));
1988 } else /* single branch is optimized. */
1989 scan = NEXTOPER(scan);
1992 else if (OP(scan) == EXACT) {
1993 I32 l = STR_LEN(scan);
1994 UV uc = *((U8*)STRING(scan));
1996 U8 *s = (U8*)STRING(scan);
1997 l = utf8_length(s, s + l);
1998 uc = utf8_to_uvchr(s, NULL);
2001 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2002 /* The code below prefers earlier match for fixed
2003 offset, later match for variable offset. */
2004 if (data->last_end == -1) { /* Update the start info. */
2005 data->last_start_min = data->pos_min;
2006 data->last_start_max = is_inf
2007 ? I32_MAX : data->pos_min + data->pos_delta;
2009 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2011 SV * sv = data->last_found;
2012 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2013 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2014 if (mg && mg->mg_len >= 0)
2015 mg->mg_len += utf8_length((U8*)STRING(scan),
2016 (U8*)STRING(scan)+STR_LEN(scan));
2019 SvUTF8_on(data->last_found);
2020 data->last_end = data->pos_min + l;
2021 data->pos_min += l; /* As in the first entry. */
2022 data->flags &= ~SF_BEFORE_EOL;
2024 if (flags & SCF_DO_STCLASS_AND) {
2025 /* Check whether it is compatible with what we know already! */
2029 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2030 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2031 && (!(data->start_class->flags & ANYOF_FOLD)
2032 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2035 ANYOF_CLASS_ZERO(data->start_class);
2036 ANYOF_BITMAP_ZERO(data->start_class);
2038 ANYOF_BITMAP_SET(data->start_class, uc);
2039 data->start_class->flags &= ~ANYOF_EOS;
2041 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2043 else if (flags & SCF_DO_STCLASS_OR) {
2044 /* false positive possible if the class is case-folded */
2046 ANYOF_BITMAP_SET(data->start_class, uc);
2048 data->start_class->flags |= ANYOF_UNICODE_ALL;
2049 data->start_class->flags &= ~ANYOF_EOS;
2050 cl_and(data->start_class, &and_with);
2052 flags &= ~SCF_DO_STCLASS;
2054 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
2055 I32 l = STR_LEN(scan);
2056 UV uc = *((U8*)STRING(scan));
2058 /* Search for fixed substrings supports EXACT only. */
2059 if (flags & SCF_DO_SUBSTR)
2060 scan_commit(pRExC_state, data);
2062 U8 *s = (U8 *)STRING(scan);
2063 l = utf8_length(s, s + l);
2064 uc = utf8_to_uvchr(s, NULL);
2067 if (data && (flags & SCF_DO_SUBSTR))
2069 if (flags & SCF_DO_STCLASS_AND) {
2070 /* Check whether it is compatible with what we know already! */
2074 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2075 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2076 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2078 ANYOF_CLASS_ZERO(data->start_class);
2079 ANYOF_BITMAP_ZERO(data->start_class);
2081 ANYOF_BITMAP_SET(data->start_class, uc);
2082 data->start_class->flags &= ~ANYOF_EOS;
2083 data->start_class->flags |= ANYOF_FOLD;
2084 if (OP(scan) == EXACTFL)
2085 data->start_class->flags |= ANYOF_LOCALE;
2088 else if (flags & SCF_DO_STCLASS_OR) {
2089 if (data->start_class->flags & ANYOF_FOLD) {
2090 /* false positive possible if the class is case-folded.
2091 Assume that the locale settings are the same... */
2093 ANYOF_BITMAP_SET(data->start_class, uc);
2094 data->start_class->flags &= ~ANYOF_EOS;
2096 cl_and(data->start_class, &and_with);
2098 flags &= ~SCF_DO_STCLASS;
2100 else if (strchr((char*)PL_varies,OP(scan))) {
2101 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2102 I32 f = flags, pos_before = 0;
2103 regnode *oscan = scan;
2104 struct regnode_charclass_class this_class;
2105 struct regnode_charclass_class *oclass = NULL;
2106 I32 next_is_eval = 0;
2108 switch (PL_regkind[(U8)OP(scan)]) {
2109 case WHILEM: /* End of (?:...)* . */
2110 scan = NEXTOPER(scan);
2113 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2114 next = NEXTOPER(scan);
2115 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2117 maxcount = REG_INFTY;
2118 next = regnext(scan);
2119 scan = NEXTOPER(scan);
2123 if (flags & SCF_DO_SUBSTR)
2128 if (flags & SCF_DO_STCLASS) {
2130 maxcount = REG_INFTY;
2131 next = regnext(scan);
2132 scan = NEXTOPER(scan);
2135 is_inf = is_inf_internal = 1;
2136 scan = regnext(scan);
2137 if (flags & SCF_DO_SUBSTR) {
2138 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2139 data->longest = &(data->longest_float);
2141 goto optimize_curly_tail;
2143 mincount = ARG1(scan);
2144 maxcount = ARG2(scan);
2145 next = regnext(scan);
2146 if (OP(scan) == CURLYX) {
2147 I32 lp = (data ? *(data->last_closep) : 0);
2148 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2150 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2151 next_is_eval = (OP(scan) == EVAL);
2153 if (flags & SCF_DO_SUBSTR) {
2154 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2155 pos_before = data->pos_min;
2159 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2161 data->flags |= SF_IS_INF;
2163 if (flags & SCF_DO_STCLASS) {
2164 cl_init(pRExC_state, &this_class);
2165 oclass = data->start_class;
2166 data->start_class = &this_class;
2167 f |= SCF_DO_STCLASS_AND;
2168 f &= ~SCF_DO_STCLASS_OR;
2170 /* These are the cases when once a subexpression
2171 fails at a particular position, it cannot succeed
2172 even after backtracking at the enclosing scope.
2174 XXXX what if minimal match and we are at the
2175 initial run of {n,m}? */
2176 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2177 f &= ~SCF_WHILEM_VISITED_POS;
2179 /* This will finish on WHILEM, setting scan, or on NULL: */
2180 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2182 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2184 if (flags & SCF_DO_STCLASS)
2185 data->start_class = oclass;
2186 if (mincount == 0 || minnext == 0) {
2187 if (flags & SCF_DO_STCLASS_OR) {
2188 cl_or(pRExC_state, data->start_class, &this_class);
2190 else if (flags & SCF_DO_STCLASS_AND) {
2191 /* Switch to OR mode: cache the old value of
2192 * data->start_class */
2193 StructCopy(data->start_class, &and_with,
2194 struct regnode_charclass_class);
2195 flags &= ~SCF_DO_STCLASS_AND;
2196 StructCopy(&this_class, data->start_class,
2197 struct regnode_charclass_class);
2198 flags |= SCF_DO_STCLASS_OR;
2199 data->start_class->flags |= ANYOF_EOS;
2201 } else { /* Non-zero len */
2202 if (flags & SCF_DO_STCLASS_OR) {
2203 cl_or(pRExC_state, data->start_class, &this_class);
2204 cl_and(data->start_class, &and_with);
2206 else if (flags & SCF_DO_STCLASS_AND)
2207 cl_and(data->start_class, &this_class);
2208 flags &= ~SCF_DO_STCLASS;
2210 if (!scan) /* It was not CURLYX, but CURLY. */
2212 if (ckWARN(WARN_REGEXP)
2213 /* ? quantifier ok, except for (?{ ... }) */
2214 && (next_is_eval || !(mincount == 0 && maxcount == 1))
2215 && (minnext == 0) && (deltanext == 0)
2216 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2217 && maxcount <= REG_INFTY/3) /* Complement check for big count */
2220 "Quantifier unexpected on zero-length expression");
2223 min += minnext * mincount;
2224 is_inf_internal |= ((maxcount == REG_INFTY
2225 && (minnext + deltanext) > 0)
2226 || deltanext == I32_MAX);
2227 is_inf |= is_inf_internal;
2228 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2230 /* Try powerful optimization CURLYX => CURLYN. */
2231 if ( OP(oscan) == CURLYX && data
2232 && data->flags & SF_IN_PAR
2233 && !(data->flags & SF_HAS_EVAL)
2234 && !deltanext && minnext == 1 ) {
2235 /* Try to optimize to CURLYN. */
2236 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2237 regnode *nxt1 = nxt;
2244 if (!strchr((char*)PL_simple,OP(nxt))
2245 && !(PL_regkind[(U8)OP(nxt)] == EXACT
2246 && STR_LEN(nxt) == 1))
2252 if (OP(nxt) != CLOSE)
2254 /* Now we know that nxt2 is the only contents: */
2255 oscan->flags = (U8)ARG(nxt);
2257 OP(nxt1) = NOTHING; /* was OPEN. */
2259 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2260 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2261 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2262 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2263 OP(nxt + 1) = OPTIMIZED; /* was count. */
2264 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2269 /* Try optimization CURLYX => CURLYM. */
2270 if ( OP(oscan) == CURLYX && data
2271 && !(data->flags & SF_HAS_PAR)
2272 && !(data->flags & SF_HAS_EVAL)
2273 && !deltanext /* atom is fixed width */
2274 && minnext != 0 /* CURLYM can't handle zero width */
2276 /* XXXX How to optimize if data == 0? */
2277 /* Optimize to a simpler form. */
2278 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2282 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2283 && (OP(nxt2) != WHILEM))
2285 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2286 /* Need to optimize away parenths. */
2287 if (data->flags & SF_IN_PAR) {
2288 /* Set the parenth number. */
2289 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2291 if (OP(nxt) != CLOSE)
2292 FAIL("Panic opt close");
2293 oscan->flags = (U8)ARG(nxt);
2294 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2295 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2297 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2298 OP(nxt + 1) = OPTIMIZED; /* was count. */
2299 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2300 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2303 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2304 regnode *nnxt = regnext(nxt1);
2307 if (reg_off_by_arg[OP(nxt1)])
2308 ARG_SET(nxt1, nxt2 - nxt1);
2309 else if (nxt2 - nxt1 < U16_MAX)
2310 NEXT_OFF(nxt1) = nxt2 - nxt1;
2312 OP(nxt) = NOTHING; /* Cannot beautify */
2317 /* Optimize again: */
2318 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2324 else if ((OP(oscan) == CURLYX)
2325 && (flags & SCF_WHILEM_VISITED_POS)
2326 /* See the comment on a similar expression above.
2327 However, this time it not a subexpression
2328 we care about, but the expression itself. */
2329 && (maxcount == REG_INFTY)
2330 && data && ++data->whilem_c < 16) {
2331 /* This stays as CURLYX, we can put the count/of pair. */
2332 /* Find WHILEM (as in regexec.c) */
2333 regnode *nxt = oscan + NEXT_OFF(oscan);
2335 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2337 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2338 | (RExC_whilem_seen << 4)); /* On WHILEM */
2340 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2342 if (flags & SCF_DO_SUBSTR) {
2343 SV *last_str = Nullsv;
2344 int counted = mincount != 0;
2346 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2347 #if defined(SPARC64_GCC_WORKAROUND)
2353 if (pos_before >= data->last_start_min)
2356 b = data->last_start_min;
2359 s = SvPV(data->last_found, l);
2360 old = b - data->last_start_min;
2363 I32 b = pos_before >= data->last_start_min
2364 ? pos_before : data->last_start_min;
2366 char *s = SvPV(data->last_found, l);
2367 I32 old = b - data->last_start_min;
2371 old = utf8_hop((U8*)s, old) - (U8*)s;
2374 /* Get the added string: */
2375 last_str = newSVpvn(s + old, l);
2377 SvUTF8_on(last_str);
2378 if (deltanext == 0 && pos_before == b) {
2379 /* What was added is a constant string */
2381 SvGROW(last_str, (mincount * l) + 1);
2382 repeatcpy(SvPVX(last_str) + l,
2383 SvPVX(last_str), l, mincount - 1);
2384 SvCUR(last_str) *= mincount;
2385 /* Add additional parts. */
2386 SvCUR_set(data->last_found,
2387 SvCUR(data->last_found) - l);
2388 sv_catsv(data->last_found, last_str);
2390 SV * sv = data->last_found;
2392 SvUTF8(sv) && SvMAGICAL(sv) ?
2393 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2394 if (mg && mg->mg_len >= 0)
2395 mg->mg_len += CHR_SVLEN(last_str);
2397 data->last_end += l * (mincount - 1);
2400 /* start offset must point into the last copy */
2401 data->last_start_min += minnext * (mincount - 1);
2402 data->last_start_max += is_inf ? I32_MAX
2403 : (maxcount - 1) * (minnext + data->pos_delta);
2406 /* It is counted once already... */
2407 data->pos_min += minnext * (mincount - counted);
2408 data->pos_delta += - counted * deltanext +
2409 (minnext + deltanext) * maxcount - minnext * mincount;
2410 if (mincount != maxcount) {
2411 /* Cannot extend fixed substrings found inside
2413 scan_commit(pRExC_state,data);
2414 if (mincount && last_str) {
2415 sv_setsv(data->last_found, last_str);
2416 data->last_end = data->pos_min;
2417 data->last_start_min =
2418 data->pos_min - CHR_SVLEN(last_str);
2419 data->last_start_max = is_inf
2421 : data->pos_min + data->pos_delta
2422 - CHR_SVLEN(last_str);
2424 data->longest = &(data->longest_float);
2426 SvREFCNT_dec(last_str);
2428 if (data && (fl & SF_HAS_EVAL))
2429 data->flags |= SF_HAS_EVAL;
2430 optimize_curly_tail:
2431 if (OP(oscan) != CURLYX) {
2432 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
2434 NEXT_OFF(oscan) += NEXT_OFF(next);
2437 default: /* REF and CLUMP only? */
2438 if (flags & SCF_DO_SUBSTR) {
2439 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2440 data->longest = &(data->longest_float);
2442 is_inf = is_inf_internal = 1;
2443 if (flags & SCF_DO_STCLASS_OR)
2444 cl_anything(pRExC_state, data->start_class);
2445 flags &= ~SCF_DO_STCLASS;
2449 else if (strchr((char*)PL_simple,OP(scan))) {
2452 if (flags & SCF_DO_SUBSTR) {
2453 scan_commit(pRExC_state,data);
2457 if (flags & SCF_DO_STCLASS) {
2458 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2460 /* Some of the logic below assumes that switching
2461 locale on will only add false positives. */
2462 switch (PL_regkind[(U8)OP(scan)]) {
2466 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2467 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2468 cl_anything(pRExC_state, data->start_class);
2471 if (OP(scan) == SANY)
2473 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2474 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2475 || (data->start_class->flags & ANYOF_CLASS));
2476 cl_anything(pRExC_state, data->start_class);
2478 if (flags & SCF_DO_STCLASS_AND || !value)
2479 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2482 if (flags & SCF_DO_STCLASS_AND)
2483 cl_and(data->start_class,
2484 (struct regnode_charclass_class*)scan);
2486 cl_or(pRExC_state, data->start_class,
2487 (struct regnode_charclass_class*)scan);
2490 if (flags & SCF_DO_STCLASS_AND) {
2491 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2492 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2493 for (value = 0; value < 256; value++)
2494 if (!isALNUM(value))
2495 ANYOF_BITMAP_CLEAR(data->start_class, value);
2499 if (data->start_class->flags & ANYOF_LOCALE)
2500 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2502 for (value = 0; value < 256; value++)
2504 ANYOF_BITMAP_SET(data->start_class, value);
2509 if (flags & SCF_DO_STCLASS_AND) {
2510 if (data->start_class->flags & ANYOF_LOCALE)
2511 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2514 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2515 data->start_class->flags |= ANYOF_LOCALE;
2519 if (flags & SCF_DO_STCLASS_AND) {
2520 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2521 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2522 for (value = 0; value < 256; value++)
2524 ANYOF_BITMAP_CLEAR(data->start_class, value);
2528 if (data->start_class->flags & ANYOF_LOCALE)
2529 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2531 for (value = 0; value < 256; value++)
2532 if (!isALNUM(value))
2533 ANYOF_BITMAP_SET(data->start_class, value);
2538 if (flags & SCF_DO_STCLASS_AND) {
2539 if (data->start_class->flags & ANYOF_LOCALE)
2540 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2543 data->start_class->flags |= ANYOF_LOCALE;
2544 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2548 if (flags & SCF_DO_STCLASS_AND) {
2549 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2550 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2551 for (value = 0; value < 256; value++)
2552 if (!isSPACE(value))
2553 ANYOF_BITMAP_CLEAR(data->start_class, value);
2557 if (data->start_class->flags & ANYOF_LOCALE)
2558 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2560 for (value = 0; value < 256; value++)
2562 ANYOF_BITMAP_SET(data->start_class, value);
2567 if (flags & SCF_DO_STCLASS_AND) {
2568 if (data->start_class->flags & ANYOF_LOCALE)
2569 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2572 data->start_class->flags |= ANYOF_LOCALE;
2573 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2577 if (flags & SCF_DO_STCLASS_AND) {
2578 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2579 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2580 for (value = 0; value < 256; value++)
2582 ANYOF_BITMAP_CLEAR(data->start_class, value);
2586 if (data->start_class->flags & ANYOF_LOCALE)
2587 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2589 for (value = 0; value < 256; value++)
2590 if (!isSPACE(value))
2591 ANYOF_BITMAP_SET(data->start_class, value);
2596 if (flags & SCF_DO_STCLASS_AND) {
2597 if (data->start_class->flags & ANYOF_LOCALE) {
2598 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2599 for (value = 0; value < 256; value++)
2600 if (!isSPACE(value))
2601 ANYOF_BITMAP_CLEAR(data->start_class, value);
2605 data->start_class->flags |= ANYOF_LOCALE;
2606 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2610 if (flags & SCF_DO_STCLASS_AND) {
2611 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2612 for (value = 0; value < 256; value++)
2613 if (!isDIGIT(value))
2614 ANYOF_BITMAP_CLEAR(data->start_class, value);
2617 if (data->start_class->flags & ANYOF_LOCALE)
2618 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2620 for (value = 0; value < 256; value++)
2622 ANYOF_BITMAP_SET(data->start_class, value);
2627 if (flags & SCF_DO_STCLASS_AND) {
2628 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2629 for (value = 0; value < 256; value++)
2631 ANYOF_BITMAP_CLEAR(data->start_class, value);
2634 if (data->start_class->flags & ANYOF_LOCALE)
2635 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2637 for (value = 0; value < 256; value++)
2638 if (!isDIGIT(value))
2639 ANYOF_BITMAP_SET(data->start_class, value);
2644 if (flags & SCF_DO_STCLASS_OR)
2645 cl_and(data->start_class, &and_with);
2646 flags &= ~SCF_DO_STCLASS;
2649 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
2650 data->flags |= (OP(scan) == MEOL
2654 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2655 /* Lookbehind, or need to calculate parens/evals/stclass: */
2656 && (scan->flags || data || (flags & SCF_DO_STCLASS))
2657 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
2658 /* Lookahead/lookbehind */
2659 I32 deltanext, minnext, fake = 0;
2661 struct regnode_charclass_class intrnl;
2664 data_fake.flags = 0;
2666 data_fake.whilem_c = data->whilem_c;
2667 data_fake.last_closep = data->last_closep;
2670 data_fake.last_closep = &fake;
2671 if ( flags & SCF_DO_STCLASS && !scan->flags
2672 && OP(scan) == IFMATCH ) { /* Lookahead */
2673 cl_init(pRExC_state, &intrnl);
2674 data_fake.start_class = &intrnl;
2675 f |= SCF_DO_STCLASS_AND;
2677 if (flags & SCF_WHILEM_VISITED_POS)
2678 f |= SCF_WHILEM_VISITED_POS;
2679 next = regnext(scan);
2680 nscan = NEXTOPER(NEXTOPER(scan));
2681 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
2684 vFAIL("Variable length lookbehind not implemented");
2686 else if (minnext > U8_MAX) {
2687 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
2689 scan->flags = (U8)minnext;
2691 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2693 if (data && (data_fake.flags & SF_HAS_EVAL))
2694 data->flags |= SF_HAS_EVAL;
2696 data->whilem_c = data_fake.whilem_c;
2697 if (f & SCF_DO_STCLASS_AND) {
2698 int was = (data->start_class->flags & ANYOF_EOS);
2700 cl_and(data->start_class, &intrnl);
2702 data->start_class->flags |= ANYOF_EOS;
2705 else if (OP(scan) == OPEN) {
2708 else if (OP(scan) == CLOSE) {
2709 if ((I32)ARG(scan) == is_par) {
2710 next = regnext(scan);
2712 if ( next && (OP(next) != WHILEM) && next < last)
2713 is_par = 0; /* Disable optimization */
2716 *(data->last_closep) = ARG(scan);
2718 else if (OP(scan) == EVAL) {
2720 data->flags |= SF_HAS_EVAL;
2722 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
2723 if (flags & SCF_DO_SUBSTR) {
2724 scan_commit(pRExC_state,data);
2725 data->longest = &(data->longest_float);
2727 is_inf = is_inf_internal = 1;
2728 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2729 cl_anything(pRExC_state, data->start_class);
2730 flags &= ~SCF_DO_STCLASS;
2732 /* Else: zero-length, ignore. */
2733 scan = regnext(scan);
2738 *deltap = is_inf_internal ? I32_MAX : delta;
2739 if (flags & SCF_DO_SUBSTR && is_inf)
2740 data->pos_delta = I32_MAX - data->pos_min;
2741 if (is_par > U8_MAX)
2743 if (is_par && pars==1 && data) {
2744 data->flags |= SF_IN_PAR;
2745 data->flags &= ~SF_HAS_PAR;
2747 else if (pars && data) {
2748 data->flags |= SF_HAS_PAR;
2749 data->flags &= ~SF_IN_PAR;
2751 if (flags & SCF_DO_STCLASS_OR)
2752 cl_and(data->start_class, &and_with);
2757 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
2759 if (RExC_rx->data) {
2760 Renewc(RExC_rx->data,
2761 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
2762 char, struct reg_data);
2763 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2764 RExC_rx->data->count += n;
2767 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
2768 char, struct reg_data);
2769 New(1208, RExC_rx->data->what, n, U8);
2770 RExC_rx->data->count = n;
2772 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2773 return RExC_rx->data->count - n;
2777 Perl_reginitcolors(pTHX)
2780 char *s = PerlEnv_getenv("PERL_RE_COLORS");
2783 PL_colors[0] = s = savepv(s);
2785 s = strchr(s, '\t');
2791 PL_colors[i] = s = "";
2795 PL_colors[i++] = "";
2802 - pregcomp - compile a regular expression into internal code
2804 * We can't allocate space until we know how big the compiled form will be,
2805 * but we can't compile it (and thus know how big it is) until we've got a
2806 * place to put the code. So we cheat: we compile it twice, once with code
2807 * generation turned off and size counting turned on, and once "for real".
2808 * This also means that we don't allocate space until we are sure that the
2809 * thing really will compile successfully, and we never have to move the
2810 * code and thus invalidate pointers into it. (Note that it has to be in
2811 * one piece because free() must be able to free it all.) [NB: not true in perl]
2813 * Beware that the optimization-preparation code in here knows about some
2814 * of the structure of the compiled regexp. [I'll say.]
2817 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
2827 RExC_state_t RExC_state;
2828 RExC_state_t *pRExC_state = &RExC_state;
2830 GET_RE_DEBUG_FLAGS_DECL;
2833 FAIL("NULL regexp argument");
2835 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
2838 DEBUG_r(if (!PL_colorset) reginitcolors());
2840 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
2841 PL_colors[4],PL_colors[5],PL_colors[0],
2842 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2844 RExC_flags = pm->op_pmflags;
2848 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2849 RExC_seen_evals = 0;
2852 /* First pass: determine size, legality. */
2859 RExC_emit = &PL_regdummy;
2860 RExC_whilem_seen = 0;
2861 #if 0 /* REGC() is (currently) a NOP at the first pass.
2862 * Clever compilers notice this and complain. --jhi */
2863 REGC((U8)REG_MAGIC, (char*)RExC_emit);
2865 if (reg(pRExC_state, 0, &flags) == NULL) {
2866 RExC_precomp = Nullch;
2869 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
2871 /* Small enough for pointer-storage convention?
2872 If extralen==0, this means that we will not need long jumps. */
2873 if (RExC_size >= 0x10000L && RExC_extralen)
2874 RExC_size += RExC_extralen;
2877 if (RExC_whilem_seen > 15)
2878 RExC_whilem_seen = 15;
2880 /* Allocate space and initialize. */
2881 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
2884 FAIL("Regexp out of space");
2887 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
2888 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
2891 r->prelen = xend - exp;
2892 r->precomp = savepvn(RExC_precomp, r->prelen);
2894 #ifdef PERL_COPY_ON_WRITE
2895 r->saved_copy = Nullsv;
2897 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
2898 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
2900 r->substrs = 0; /* Useful during FAIL. */
2901 r->startp = 0; /* Useful during FAIL. */
2902 r->endp = 0; /* Useful during FAIL. */
2904 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2906 r->offsets[0] = RExC_size;
2908 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2909 "%s %"UVuf" bytes for offset annotations.\n",
2910 r->offsets ? "Got" : "Couldn't get",
2911 (UV)((2*RExC_size+1) * sizeof(U32))));
2915 /* Second pass: emit code. */
2916 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
2921 RExC_emit_start = r->program;
2922 RExC_emit = r->program;
2923 /* Store the count of eval-groups for security checks: */
2924 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
2925 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
2927 if (reg(pRExC_state, 0, &flags) == NULL)
2931 /* Dig out information for optimizations. */
2932 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
2933 pm->op_pmflags = RExC_flags;
2935 r->reganch |= ROPT_UTF8; /* Unicode in it? */
2936 r->regstclass = NULL;
2937 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
2938 r->reganch |= ROPT_NAUGHTY;
2939 scan = r->program + 1; /* First BRANCH. */
2941 /* XXXX To minimize changes to RE engine we always allocate
2942 3-units-long substrs field. */
2943 Newz(1004, r->substrs, 1, struct reg_substr_data);
2945 StructCopy(&zero_scan_data, &data, scan_data_t);
2946 /* XXXX Should not we check for something else? Usually it is OPEN1... */
2947 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
2949 STRLEN longest_float_length, longest_fixed_length;
2950 struct regnode_charclass_class ch_class;
2955 /* Skip introductions and multiplicators >= 1. */
2956 while ((OP(first) == OPEN && (sawopen = 1)) ||
2957 /* An OR of *one* alternative - should not happen now. */
2958 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2959 (OP(first) == PLUS) ||
2960 (OP(first) == MINMOD) ||
2961 /* An {n,m} with n>0 */
2962 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
2963 if (OP(first) == PLUS)
2966 first += regarglen[(U8)OP(first)];
2967 first = NEXTOPER(first);
2970 /* Starting-point info. */
2972 if (PL_regkind[(U8)OP(first)] == EXACT) {
2973 if (OP(first) == EXACT)
2974 ; /* Empty, get anchored substr later. */
2975 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
2976 r->regstclass = first;
2978 else if (strchr((char*)PL_simple,OP(first)))
2979 r->regstclass = first;
2980 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2981 PL_regkind[(U8)OP(first)] == NBOUND)
2982 r->regstclass = first;
2983 else if (PL_regkind[(U8)OP(first)] == BOL) {
2984 r->reganch |= (OP(first) == MBOL
2986 : (OP(first) == SBOL
2989 first = NEXTOPER(first);
2992 else if (OP(first) == GPOS) {
2993 r->reganch |= ROPT_ANCH_GPOS;
2994 first = NEXTOPER(first);
2997 else if (!sawopen && (OP(first) == STAR &&
2998 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
2999 !(r->reganch & ROPT_ANCH) )
3001 /* turn .* into ^.* with an implied $*=1 */
3002 int type = OP(NEXTOPER(first));
3004 if (type == REG_ANY)
3005 type = ROPT_ANCH_MBOL;
3007 type = ROPT_ANCH_SBOL;
3009 r->reganch |= type | ROPT_IMPLICIT;
3010 first = NEXTOPER(first);
3013 if (sawplus && (!sawopen || !RExC_sawback)
3014 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3015 /* x+ must match at the 1st pos of run of x's */
3016 r->reganch |= ROPT_SKIP;
3018 /* Scan is after the zeroth branch, first is atomic matcher. */
3019 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3020 (IV)(first - scan + 1)));
3022 * If there's something expensive in the r.e., find the
3023 * longest literal string that must appear and make it the
3024 * regmust. Resolve ties in favor of later strings, since
3025 * the regstart check works with the beginning of the r.e.
3026 * and avoiding duplication strengthens checking. Not a
3027 * strong reason, but sufficient in the absence of others.
3028 * [Now we resolve ties in favor of the earlier string if
3029 * it happens that c_offset_min has been invalidated, since the
3030 * earlier string may buy us something the later one won't.]
3034 data.longest_fixed = newSVpvn("",0);
3035 data.longest_float = newSVpvn("",0);
3036 data.last_found = newSVpvn("",0);
3037 data.longest = &(data.longest_fixed);
3039 if (!r->regstclass) {
3040 cl_init(pRExC_state, &ch_class);
3041 data.start_class = &ch_class;
3042 stclass_flag = SCF_DO_STCLASS_AND;
3043 } else /* XXXX Check for BOUND? */
3045 data.last_closep = &last_close;
3047 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3048 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3049 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3050 && data.last_start_min == 0 && data.last_end > 0
3051 && !RExC_seen_zerolen
3052 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3053 r->reganch |= ROPT_CHECK_ALL;
3054 scan_commit(pRExC_state, &data);
3055 SvREFCNT_dec(data.last_found);
3057 longest_float_length = CHR_SVLEN(data.longest_float);
3058 if (longest_float_length
3059 || (data.flags & SF_FL_BEFORE_EOL
3060 && (!(data.flags & SF_FL_BEFORE_MEOL)
3061 || (RExC_flags & PMf_MULTILINE)))) {
3064 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3065 && data.offset_fixed == data.offset_float_min
3066 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3067 goto remove_float; /* As in (a)+. */
3069 if (SvUTF8(data.longest_float)) {
3070 r->float_utf8 = data.longest_float;
3071 r->float_substr = Nullsv;
3073 r->float_substr = data.longest_float;
3074 r->float_utf8 = Nullsv;
3076 r->float_min_offset = data.offset_float_min;
3077 r->float_max_offset = data.offset_float_max;
3078 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3079 && (!(data.flags & SF_FL_BEFORE_MEOL)
3080 || (RExC_flags & PMf_MULTILINE)));
3081 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3085 r->float_substr = r->float_utf8 = Nullsv;
3086 SvREFCNT_dec(data.longest_float);
3087 longest_float_length = 0;
3090 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3091 if (longest_fixed_length
3092 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3093 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3094 || (RExC_flags & PMf_MULTILINE)))) {
3097 if (SvUTF8(data.longest_fixed)) {
3098 r->anchored_utf8 = data.longest_fixed;
3099 r->anchored_substr = Nullsv;
3101 r->anchored_substr = data.longest_fixed;
3102 r->anchored_utf8 = Nullsv;
3104 r->anchored_offset = data.offset_fixed;
3105 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3106 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3107 || (RExC_flags & PMf_MULTILINE)));
3108 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3111 r->anchored_substr = r->anchored_utf8 = Nullsv;
3112 SvREFCNT_dec(data.longest_fixed);
3113 longest_fixed_length = 0;
3116 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3117 r->regstclass = NULL;
3118 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3120 && !(data.start_class->flags & ANYOF_EOS)
3121 && !cl_is_anything(data.start_class))
3123 I32 n = add_data(pRExC_state, 1, "f");
3125 New(1006, RExC_rx->data->data[n], 1,
3126 struct regnode_charclass_class);
3127 StructCopy(data.start_class,
3128 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3129 struct regnode_charclass_class);
3130 r->regstclass = (regnode*)RExC_rx->data->data[n];
3131 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3132 PL_regdata = r->data; /* for regprop() */
3133 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3134 regprop(sv, (regnode*)data.start_class);
3135 PerlIO_printf(Perl_debug_log,
3136 "synthetic stclass `%s'.\n",
3140 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3141 if (longest_fixed_length > longest_float_length) {
3142 r->check_substr = r->anchored_substr;
3143 r->check_utf8 = r->anchored_utf8;
3144 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3145 if (r->reganch & ROPT_ANCH_SINGLE)
3146 r->reganch |= ROPT_NOSCAN;
3149 r->check_substr = r->float_substr;
3150 r->check_utf8 = r->float_utf8;
3151 r->check_offset_min = data.offset_float_min;
3152 r->check_offset_max = data.offset_float_max;
3154 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3155 This should be changed ASAP! */
3156 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3157 r->reganch |= RE_USE_INTUIT;
3158 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3159 r->reganch |= RE_INTUIT_TAIL;
3163 /* Several toplevels. Best we can is to set minlen. */
3165 struct regnode_charclass_class ch_class;
3168 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3169 scan = r->program + 1;
3170 cl_init(pRExC_state, &ch_class);
3171 data.start_class = &ch_class;
3172 data.last_closep = &last_close;
3173 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3174 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3175 = r->float_substr = r->float_utf8 = Nullsv;
3176 if (!(data.start_class->flags & ANYOF_EOS)
3177 && !cl_is_anything(data.start_class))
3179 I32 n = add_data(pRExC_state, 1, "f");
3181 New(1006, RExC_rx->data->data[n], 1,
3182 struct regnode_charclass_class);
3183 StructCopy(data.start_class,
3184 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3185 struct regnode_charclass_class);
3186 r->regstclass = (regnode*)RExC_rx->data->data[n];
3187 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3188 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3189 regprop(sv, (regnode*)data.start_class);
3190 PerlIO_printf(Perl_debug_log,
3191 "synthetic stclass `%s'.\n",
3197 if (RExC_seen & REG_SEEN_GPOS)
3198 r->reganch |= ROPT_GPOS_SEEN;
3199 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3200 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3201 if (RExC_seen & REG_SEEN_EVAL)
3202 r->reganch |= ROPT_EVAL_SEEN;
3203 if (RExC_seen & REG_SEEN_CANY)
3204 r->reganch |= ROPT_CANY_SEEN;
3205 Newz(1002, r->startp, RExC_npar, I32);
3206 Newz(1002, r->endp, RExC_npar, I32);
3207 PL_regdata = r->data; /* for regprop() */
3208 DEBUG_COMPILE_r(regdump(r));
3213 - reg - regular expression, i.e. main body or parenthesized thing
3215 * Caller must absorb opening parenthesis.
3217 * Combining parenthesis handling with the base level of regular expression
3218 * is a trifle forced, but the need to tie the tails of the branches to what
3219 * follows makes it hard to avoid.
3222 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
3223 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3225 register regnode *ret; /* Will be the head of the group. */
3226 register regnode *br;
3227 register regnode *lastbr;
3228 register regnode *ender = 0;
3229 register I32 parno = 0;
3230 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
3232 /* for (?g), (?gc), and (?o) warnings; warning
3233 about (?c) will warn about (?g) -- japhy */
3235 I32 wastedflags = 0x00,
3238 wasted_gc = 0x02 | 0x04,
3241 char * parse_start = RExC_parse; /* MJD */
3242 char *oregcomp_parse = RExC_parse;
3245 *flagp = 0; /* Tentatively. */
3248 /* Make an OPEN node, if parenthesized. */
3250 if (*RExC_parse == '?') { /* (?...) */
3251 U32 posflags = 0, negflags = 0;
3252 U32 *flagsp = &posflags;
3254 char *seqstart = RExC_parse;
3257 paren = *RExC_parse++;
3258 ret = NULL; /* For look-ahead/behind. */
3260 case '<': /* (?<...) */
3261 RExC_seen |= REG_SEEN_LOOKBEHIND;
3262 if (*RExC_parse == '!')
3264 if (*RExC_parse != '=' && *RExC_parse != '!')
3267 case '=': /* (?=...) */
3268 case '!': /* (?!...) */
3269 RExC_seen_zerolen++;
3270 case ':': /* (?:...) */
3271 case '>': /* (?>...) */
3273 case '$': /* (?$...) */
3274 case '@': /* (?@...) */
3275 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3277 case '#': /* (?#...) */
3278 while (*RExC_parse && *RExC_parse != ')')
3280 if (*RExC_parse != ')')
3281 FAIL("Sequence (?#... not terminated");
3282 nextchar(pRExC_state);
3285 case 'p': /* (?p...) */
3286 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3287 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3289 case '?': /* (??...) */
3291 if (*RExC_parse != '{')
3293 paren = *RExC_parse++;
3295 case '{': /* (?{...}) */
3297 I32 count = 1, n = 0;
3299 char *s = RExC_parse;
3301 OP_4tree *sop, *rop;
3303 RExC_seen_zerolen++;
3304 RExC_seen |= REG_SEEN_EVAL;
3305 while (count && (c = *RExC_parse)) {
3306 if (c == '\\' && RExC_parse[1])
3314 if (*RExC_parse != ')')
3317 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3322 if (RExC_parse - 1 - s)
3323 sv = newSVpvn(s, RExC_parse - 1 - s);
3325 sv = newSVpvn("", 0);
3328 Perl_save_re_context(aTHX);
3329 rop = sv_compile_2op(sv, &sop, "re", &pad);
3330 sop->op_private |= OPpREFCOUNTED;
3331 /* re_dup will OpREFCNT_inc */
3332 OpREFCNT_set(sop, 1);
3335 n = add_data(pRExC_state, 3, "nop");
3336 RExC_rx->data->data[n] = (void*)rop;
3337 RExC_rx->data->data[n+1] = (void*)sop;
3338 RExC_rx->data->data[n+2] = (void*)pad;
3341 else { /* First pass */
3342 if (PL_reginterp_cnt < ++RExC_seen_evals
3344 /* No compiled RE interpolated, has runtime
3345 components ===> unsafe. */
3346 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3347 if (PL_tainting && PL_tainted)
3348 FAIL("Eval-group in insecure regular expression");
3349 if (IN_PERL_COMPILETIME)
3353 nextchar(pRExC_state);
3355 ret = reg_node(pRExC_state, LOGICAL);
3358 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3359 /* deal with the length of this later - MJD */
3362 ret = reganode(pRExC_state, EVAL, n);
3363 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3364 Set_Node_Offset(ret, parse_start);
3367 case '(': /* (?(?{...})...) and (?(?=...)...) */
3369 if (RExC_parse[0] == '?') { /* (?(?...)) */
3370 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3371 || RExC_parse[1] == '<'
3372 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3375 ret = reg_node(pRExC_state, LOGICAL);
3378 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
3382 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3384 parno = atoi(RExC_parse++);
3386 while (isDIGIT(*RExC_parse))
3388 ret = reganode(pRExC_state, GROUPP, parno);
3390 if ((c = *nextchar(pRExC_state)) != ')')
3391 vFAIL("Switch condition not recognized");
3393 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3394 br = regbranch(pRExC_state, &flags, 1);
3396 br = reganode(pRExC_state, LONGJMP, 0);
3398 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3399 c = *nextchar(pRExC_state);
3403 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3404 regbranch(pRExC_state, &flags, 1);
3405 regtail(pRExC_state, ret, lastbr);
3408 c = *nextchar(pRExC_state);
3413 vFAIL("Switch (?(condition)... contains too many branches");
3414 ender = reg_node(pRExC_state, TAIL);
3415 regtail(pRExC_state, br, ender);
3417 regtail(pRExC_state, lastbr, ender);
3418 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3421 regtail(pRExC_state, ret, ender);
3425 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3429 RExC_parse--; /* for vFAIL to print correctly */
3430 vFAIL("Sequence (? incomplete");
3434 parse_flags: /* (?i) */
3435 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3436 /* (?g), (?gc) and (?o) are useless here
3437 and must be globally applied -- japhy */
3439 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3440 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3441 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3442 if (! (wastedflags & wflagbit) ) {
3443 wastedflags |= wflagbit;
3446 "Useless (%s%c) - %suse /%c modifier",
3447 flagsp == &negflags ? "?-" : "?",
3449 flagsp == &negflags ? "don't " : "",
3455 else if (*RExC_parse == 'c') {
3456 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3457 if (! (wastedflags & wasted_c) ) {
3458 wastedflags |= wasted_gc;
3461 "Useless (%sc) - %suse /gc modifier",
3462 flagsp == &negflags ? "?-" : "?",
3463 flagsp == &negflags ? "don't " : ""
3468 else { pmflag(flagsp, *RExC_parse); }
3472 if (*RExC_parse == '-') {
3474 wastedflags = 0; /* reset so (?g-c) warns twice */
3478 RExC_flags |= posflags;
3479 RExC_flags &= ~negflags;
3480 if (*RExC_parse == ':') {
3486 if (*RExC_parse != ')') {
3488 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3490 nextchar(pRExC_state);
3498 ret = reganode(pRExC_state, OPEN, parno);
3499 Set_Node_Length(ret, 1); /* MJD */
3500 Set_Node_Offset(ret, RExC_parse); /* MJD */
3507 /* Pick up the branches, linking them together. */
3508 parse_start = RExC_parse; /* MJD */
3509 br = regbranch(pRExC_state, &flags, 1);
3510 /* branch_len = (paren != 0); */
3514 if (*RExC_parse == '|') {
3515 if (!SIZE_ONLY && RExC_extralen) {
3516 reginsert(pRExC_state, BRANCHJ, br);
3519 reginsert(pRExC_state, BRANCH, br);
3520 Set_Node_Length(br, paren != 0);
3521 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3525 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
3527 else if (paren == ':') {
3528 *flagp |= flags&SIMPLE;
3530 if (open) { /* Starts with OPEN. */
3531 regtail(pRExC_state, ret, br); /* OPEN -> first. */
3533 else if (paren != '?') /* Not Conditional */
3535 *flagp |= flags & (SPSTART | HASWIDTH);
3537 while (*RExC_parse == '|') {
3538 if (!SIZE_ONLY && RExC_extralen) {
3539 ender = reganode(pRExC_state, LONGJMP,0);
3540 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
3543 RExC_extralen += 2; /* Account for LONGJMP. */
3544 nextchar(pRExC_state);
3545 br = regbranch(pRExC_state, &flags, 0);
3549 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
3553 *flagp |= flags&SPSTART;
3556 if (have_branch || paren != ':') {
3557 /* Make a closing node, and hook it on the end. */
3560 ender = reg_node(pRExC_state, TAIL);
3563 ender = reganode(pRExC_state, CLOSE, parno);
3564 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3565 Set_Node_Length(ender,1); /* MJD */
3571 *flagp &= ~HASWIDTH;
3574 ender = reg_node(pRExC_state, SUCCEED);
3577 ender = reg_node(pRExC_state, END);
3580 regtail(pRExC_state, lastbr, ender);
3583 /* Hook the tails of the branches to the closing node. */
3584 for (br = ret; br != NULL; br = regnext(br)) {
3585 regoptail(pRExC_state, br, ender);
3592 static char parens[] = "=!<,>";
3594 if (paren && (p = strchr(parens, paren))) {
3595 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
3596 int flag = (p - parens) > 1;
3599 node = SUSPEND, flag = 0;
3600 reginsert(pRExC_state, node,ret);
3601 Set_Node_Cur_Length(ret);
3602 Set_Node_Offset(ret, parse_start + 1);
3604 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
3608 /* Check for proper termination. */
3610 RExC_flags = oregflags;
3611 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3612 RExC_parse = oregcomp_parse;
3613 vFAIL("Unmatched (");
3616 else if (!paren && RExC_parse < RExC_end) {
3617 if (*RExC_parse == ')') {
3619 vFAIL("Unmatched )");
3622 FAIL("Junk on end of regexp"); /* "Can't happen". */
3630 - regbranch - one alternative of an | operator
3632 * Implements the concatenation operator.
3635 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
3637 register regnode *ret;
3638 register regnode *chain = NULL;
3639 register regnode *latest;
3640 I32 flags = 0, c = 0;
3645 if (!SIZE_ONLY && RExC_extralen)
3646 ret = reganode(pRExC_state, BRANCHJ,0);
3648 ret = reg_node(pRExC_state, BRANCH);
3649 Set_Node_Length(ret, 1);
3653 if (!first && SIZE_ONLY)
3654 RExC_extralen += 1; /* BRANCHJ */
3656 *flagp = WORST; /* Tentatively. */
3659 nextchar(pRExC_state);
3660 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
3662 latest = regpiece(pRExC_state, &flags);
3663 if (latest == NULL) {
3664 if (flags & TRYAGAIN)
3668 else if (ret == NULL)
3670 *flagp |= flags&HASWIDTH;
3671 if (chain == NULL) /* First piece. */
3672 *flagp |= flags&SPSTART;
3675 regtail(pRExC_state, chain, latest);
3680 if (chain == NULL) { /* Loop ran zero times. */
3681 chain = reg_node(pRExC_state, NOTHING);
3686 *flagp |= flags&SIMPLE;
3693 - regpiece - something followed by possible [*+?]
3695 * Note that the branching code sequences used for ? and the general cases
3696 * of * and + are somewhat optimized: they use the same NOTHING node as
3697 * both the endmarker for their branch list and the body of the last branch.
3698 * It might seem that this node could be dispensed with entirely, but the
3699 * endmarker role is not redundant.
3702 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3704 register regnode *ret;
3706 register char *next;
3708 char *origparse = RExC_parse;
3711 I32 max = REG_INFTY;
3714 ret = regatom(pRExC_state, &flags);
3716 if (flags & TRYAGAIN)
3723 if (op == '{' && regcurly(RExC_parse)) {
3724 parse_start = RExC_parse; /* MJD */
3725 next = RExC_parse + 1;
3727 while (isDIGIT(*next) || *next == ',') {
3736 if (*next == '}') { /* got one */
3740 min = atoi(RExC_parse);
3744 maxpos = RExC_parse;
3746 if (!max && *maxpos != '0')
3747 max = REG_INFTY; /* meaning "infinity" */
3748 else if (max >= REG_INFTY)
3749 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
3751 nextchar(pRExC_state);
3754 if ((flags&SIMPLE)) {
3755 RExC_naughty += 2 + RExC_naughty / 2;
3756 reginsert(pRExC_state, CURLY, ret);
3757 Set_Node_Offset(ret, parse_start+1); /* MJD */
3758 Set_Node_Cur_Length(ret);
3761 regnode *w = reg_node(pRExC_state, WHILEM);
3764 regtail(pRExC_state, ret, w);
3765 if (!SIZE_ONLY && RExC_extralen) {
3766 reginsert(pRExC_state, LONGJMP,ret);
3767 reginsert(pRExC_state, NOTHING,ret);
3768 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3770 reginsert(pRExC_state, CURLYX,ret);
3772 Set_Node_Offset(ret, parse_start+1);
3773 Set_Node_Length(ret,
3774 op == '{' ? (RExC_parse - parse_start) : 1);
3776 if (!SIZE_ONLY && RExC_extralen)
3777 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3778 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
3780 RExC_whilem_seen++, RExC_extralen += 3;
3781 RExC_naughty += 4 + RExC_naughty; /* compound interest */
3789 if (max && max < min)
3790 vFAIL("Can't do {n,m} with n > m");
3792 ARG1_SET(ret, (U16)min);
3793 ARG2_SET(ret, (U16)max);
3805 #if 0 /* Now runtime fix should be reliable. */
3807 /* if this is reinstated, don't forget to put this back into perldiag:
3809 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3811 (F) The part of the regexp subject to either the * or + quantifier
3812 could match an empty string. The {#} shows in the regular
3813 expression about where the problem was discovered.
3817 if (!(flags&HASWIDTH) && op != '?')
3818 vFAIL("Regexp *+ operand could be empty");
3821 parse_start = RExC_parse;
3822 nextchar(pRExC_state);
3824 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
3826 if (op == '*' && (flags&SIMPLE)) {
3827 reginsert(pRExC_state, STAR, ret);
3831 else if (op == '*') {
3835 else if (op == '+' && (flags&SIMPLE)) {
3836 reginsert(pRExC_state, PLUS, ret);
3840 else if (op == '+') {
3844 else if (op == '?') {
3849 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
3851 "%.*s matches null string many times",
3852 RExC_parse - origparse,
3856 if (*RExC_parse == '?') {
3857 nextchar(pRExC_state);
3858 reginsert(pRExC_state, MINMOD, ret);
3859 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
3861 if (ISMULT2(RExC_parse)) {
3863 vFAIL("Nested quantifiers");
3870 - regatom - the lowest level
3872 * Optimization: gobbles an entire sequence of ordinary characters so that
3873 * it can turn them into a single node, which is smaller to store and
3874 * faster to run. Backslashed characters are exceptions, each becoming a
3875 * separate node; the code is simpler that way and it's not worth fixing.
3877 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
3879 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3881 register regnode *ret = 0;
3883 char *parse_start = RExC_parse;
3885 *flagp = WORST; /* Tentatively. */
3888 switch (*RExC_parse) {
3890 RExC_seen_zerolen++;
3891 nextchar(pRExC_state);
3892 if (RExC_flags & PMf_MULTILINE)
3893 ret = reg_node(pRExC_state, MBOL);
3894 else if (RExC_flags & PMf_SINGLELINE)
3895 ret = reg_node(pRExC_state, SBOL);
3897 ret = reg_node(pRExC_state, BOL);
3898 Set_Node_Length(ret, 1); /* MJD */
3901 nextchar(pRExC_state);
3903 RExC_seen_zerolen++;
3904 if (RExC_flags & PMf_MULTILINE)
3905 ret = reg_node(pRExC_state, MEOL);
3906 else if (RExC_flags & PMf_SINGLELINE)
3907 ret = reg_node(pRExC_state, SEOL);
3909 ret = reg_node(pRExC_state, EOL);
3910 Set_Node_Length(ret, 1); /* MJD */
3913 nextchar(pRExC_state);
3914 if (RExC_flags & PMf_SINGLELINE)
3915 ret = reg_node(pRExC_state, SANY);
3917 ret = reg_node(pRExC_state, REG_ANY);
3918 *flagp |= HASWIDTH|SIMPLE;
3920 Set_Node_Length(ret, 1); /* MJD */
3924 char *oregcomp_parse = ++RExC_parse;
3925 ret = regclass(pRExC_state);
3926 if (*RExC_parse != ']') {
3927 RExC_parse = oregcomp_parse;
3928 vFAIL("Unmatched [");
3930 nextchar(pRExC_state);
3931 *flagp |= HASWIDTH|SIMPLE;
3932 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
3936 nextchar(pRExC_state);
3937 ret = reg(pRExC_state, 1, &flags);
3939 if (flags & TRYAGAIN) {
3940 if (RExC_parse == RExC_end) {
3941 /* Make parent create an empty node if needed. */
3949 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
3953 if (flags & TRYAGAIN) {
3957 vFAIL("Internal urp");
3958 /* Supposed to be caught earlier. */
3961 if (!regcurly(RExC_parse)) {
3970 vFAIL("Quantifier follows nothing");
3973 switch (*++RExC_parse) {
3975 RExC_seen_zerolen++;
3976 ret = reg_node(pRExC_state, SBOL);
3978 nextchar(pRExC_state);
3979 Set_Node_Length(ret, 2); /* MJD */
3982 ret = reg_node(pRExC_state, GPOS);
3983 RExC_seen |= REG_SEEN_GPOS;
3985 nextchar(pRExC_state);
3986 Set_Node_Length(ret, 2); /* MJD */
3989 ret = reg_node(pRExC_state, SEOL);
3991 RExC_seen_zerolen++; /* Do not optimize RE away */
3992 nextchar(pRExC_state);
3995 ret = reg_node(pRExC_state, EOS);
3997 RExC_seen_zerolen++; /* Do not optimize RE away */
3998 nextchar(pRExC_state);
3999 Set_Node_Length(ret, 2); /* MJD */
4002 ret = reg_node(pRExC_state, CANY);
4003 RExC_seen |= REG_SEEN_CANY;
4004 *flagp |= HASWIDTH|SIMPLE;
4005 nextchar(pRExC_state);
4006 Set_Node_Length(ret, 2); /* MJD */
4009 ret = reg_node(pRExC_state, CLUMP);
4011 nextchar(pRExC_state);
4012 Set_Node_Length(ret, 2); /* MJD */
4015 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4016 *flagp |= HASWIDTH|SIMPLE;
4017 nextchar(pRExC_state);
4018 Set_Node_Length(ret, 2); /* MJD */
4021 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4022 *flagp |= HASWIDTH|SIMPLE;
4023 nextchar(pRExC_state);
4024 Set_Node_Length(ret, 2); /* MJD */
4027 RExC_seen_zerolen++;
4028 RExC_seen |= REG_SEEN_LOOKBEHIND;
4029 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4031 nextchar(pRExC_state);
4032 Set_Node_Length(ret, 2); /* MJD */
4035 RExC_seen_zerolen++;
4036 RExC_seen |= REG_SEEN_LOOKBEHIND;
4037 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4039 nextchar(pRExC_state);
4040 Set_Node_Length(ret, 2); /* MJD */
4043 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4044 *flagp |= HASWIDTH|SIMPLE;
4045 nextchar(pRExC_state);
4046 Set_Node_Length(ret, 2); /* MJD */
4049 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4050 *flagp |= HASWIDTH|SIMPLE;
4051 nextchar(pRExC_state);
4052 Set_Node_Length(ret, 2); /* MJD */
4055 ret = reg_node(pRExC_state, DIGIT);
4056 *flagp |= HASWIDTH|SIMPLE;
4057 nextchar(pRExC_state);
4058 Set_Node_Length(ret, 2); /* MJD */
4061 ret = reg_node(pRExC_state, NDIGIT);
4062 *flagp |= HASWIDTH|SIMPLE;
4063 nextchar(pRExC_state);
4064 Set_Node_Length(ret, 2); /* MJD */
4069 char* oldregxend = RExC_end;
4070 char* parse_start = RExC_parse - 2;
4072 if (RExC_parse[1] == '{') {
4073 /* a lovely hack--pretend we saw [\pX] instead */
4074 RExC_end = strchr(RExC_parse, '}');
4076 U8 c = (U8)*RExC_parse;
4078 RExC_end = oldregxend;
4079 vFAIL2("Missing right brace on \\%c{}", c);
4084 RExC_end = RExC_parse + 2;
4085 if (RExC_end > oldregxend)
4086 RExC_end = oldregxend;
4090 ret = regclass(pRExC_state);
4092 RExC_end = oldregxend;
4095 Set_Node_Offset(ret, parse_start + 2);
4096 Set_Node_Cur_Length(ret);
4097 nextchar(pRExC_state);
4098 *flagp |= HASWIDTH|SIMPLE;
4111 case '1': case '2': case '3': case '4':
4112 case '5': case '6': case '7': case '8': case '9':
4114 I32 num = atoi(RExC_parse);
4116 if (num > 9 && num >= RExC_npar)
4119 char * parse_start = RExC_parse - 1; /* MJD */
4120 while (isDIGIT(*RExC_parse))
4123 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4124 vFAIL("Reference to nonexistent group");
4126 ret = reganode(pRExC_state,
4127 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4131 /* override incorrect value set in reganode MJD */
4132 Set_Node_Offset(ret, parse_start+1);
4133 Set_Node_Cur_Length(ret); /* MJD */
4135 nextchar(pRExC_state);
4140 if (RExC_parse >= RExC_end)
4141 FAIL("Trailing \\");
4144 /* Do not generate `unrecognized' warnings here, we fall
4145 back into the quick-grab loop below */
4152 if (RExC_flags & PMf_EXTENDED) {
4153 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4154 if (RExC_parse < RExC_end)
4160 register STRLEN len;
4166 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4168 parse_start = RExC_parse - 1;
4174 ret = reg_node(pRExC_state,
4175 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4177 for (len = 0, p = RExC_parse - 1;
4178 len < 127 && p < RExC_end;
4183 if (RExC_flags & PMf_EXTENDED)
4184 p = regwhite(p, RExC_end);
4231 ender = ASCII_TO_NATIVE('\033');
4235 ender = ASCII_TO_NATIVE('\007');
4240 char* e = strchr(p, '}');
4244 vFAIL("Missing right brace on \\x{}");
4247 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4248 | PERL_SCAN_DISALLOW_PREFIX;
4250 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4257 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4259 ender = grok_hex(p, &numlen, &flags, NULL);
4265 ender = UCHARAT(p++);
4266 ender = toCTRL(ender);
4268 case '0': case '1': case '2': case '3':case '4':
4269 case '5': case '6': case '7': case '8':case '9':
4271 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4274 ender = grok_oct(p, &numlen, &flags, NULL);
4284 FAIL("Trailing \\");
4287 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4288 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4289 goto normal_default;
4294 if (UTF8_IS_START(*p) && UTF) {
4295 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4303 if (RExC_flags & PMf_EXTENDED)
4304 p = regwhite(p, RExC_end);
4306 /* Prime the casefolded buffer. */
4307 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4309 if (ISMULT2(p)) { /* Back off on ?+*. */
4316 /* Emit all the Unicode characters. */
4317 for (foldbuf = tmpbuf;
4319 foldlen -= numlen) {
4320 ender = utf8_to_uvchr(foldbuf, &numlen);
4322 reguni(pRExC_state, ender, s, &unilen);
4325 /* In EBCDIC the numlen
4326 * and unilen can differ. */
4328 if (numlen >= foldlen)
4332 break; /* "Can't happen." */
4336 reguni(pRExC_state, ender, s, &unilen);
4345 REGC((char)ender, s++);
4353 /* Emit all the Unicode characters. */
4354 for (foldbuf = tmpbuf;
4356 foldlen -= numlen) {
4357 ender = utf8_to_uvchr(foldbuf, &numlen);
4359 reguni(pRExC_state, ender, s, &unilen);
4362 /* In EBCDIC the numlen
4363 * and unilen can differ. */
4365 if (numlen >= foldlen)
4373 reguni(pRExC_state, ender, s, &unilen);
4382 REGC((char)ender, s++);
4386 Set_Node_Cur_Length(ret); /* MJD */
4387 nextchar(pRExC_state);
4389 /* len is STRLEN which is unsigned, need to copy to signed */
4392 vFAIL("Internal disaster");
4396 if (len == 1 && UNI_IS_INVARIANT(ender))
4401 RExC_size += STR_SZ(len);
4403 RExC_emit += STR_SZ(len);
4408 /* If the encoding pragma is in effect recode the text of
4409 * any EXACT-kind nodes. */
4410 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
4411 STRLEN oldlen = STR_LEN(ret);
4412 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4416 if (sv_utf8_downgrade(sv, TRUE)) {
4417 char *s = sv_recode_to_utf8(sv, PL_encoding);
4418 STRLEN newlen = SvCUR(sv);
4423 GET_RE_DEBUG_FLAGS_DECL;
4424 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4425 (int)oldlen, STRING(ret),
4427 Copy(s, STRING(ret), newlen, char);
4428 STR_LEN(ret) += newlen - oldlen;
4429 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4431 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4439 S_regwhite(pTHX_ char *p, char *e)
4444 else if (*p == '#') {
4447 } while (p < e && *p != '\n');
4455 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4456 Character classes ([:foo:]) can also be negated ([:^foo:]).
4457 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4458 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
4459 but trigger failures because they are currently unimplemented. */
4461 #define POSIXCC_DONE(c) ((c) == ':')
4462 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4463 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4466 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
4469 I32 namedclass = OOB_NAMEDCLASS;
4471 if (value == '[' && RExC_parse + 1 < RExC_end &&
4472 /* I smell either [: or [= or [. -- POSIX has been here, right? */
4473 POSIXCC(UCHARAT(RExC_parse))) {
4474 char c = UCHARAT(RExC_parse);
4475 char* s = RExC_parse++;
4477 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
4479 if (RExC_parse == RExC_end)
4480 /* Grandfather lone [:, [=, [. */
4483 char* t = RExC_parse++; /* skip over the c */
4487 if (UCHARAT(RExC_parse) == ']') {
4488 RExC_parse++; /* skip over the ending ] */
4491 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4492 I32 skip = t - posixcc;
4494 /* Initially switch on the length of the name. */
4497 if (memEQ(posixcc, "word", 4)) {
4498 /* this is not POSIX, this is the Perl \w */;
4500 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4504 /* Names all of length 5. */
4505 /* alnum alpha ascii blank cntrl digit graph lower
4506 print punct space upper */
4507 /* Offset 4 gives the best switch position. */
4508 switch (posixcc[4]) {
4510 if (memEQ(posixcc, "alph", 4)) {
4513 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4517 if (memEQ(posixcc, "spac", 4)) {
4520 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4524 if (memEQ(posixcc, "grap", 4)) {
4527 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4531 if (memEQ(posixcc, "asci", 4)) {
4534 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4538 if (memEQ(posixcc, "blan", 4)) {
4541 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4545 if (memEQ(posixcc, "cntr", 4)) {
4548 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4552 if (memEQ(posixcc, "alnu", 4)) {
4555 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4559 if (memEQ(posixcc, "lowe", 4)) {
4562 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4564 if (memEQ(posixcc, "uppe", 4)) {
4567 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4571 if (memEQ(posixcc, "digi", 4)) {
4574 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4576 if (memEQ(posixcc, "prin", 4)) {
4579 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4581 if (memEQ(posixcc, "punc", 4)) {
4584 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4590 if (memEQ(posixcc, "xdigit", 6)) {
4592 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
4597 if (namedclass == OOB_NAMEDCLASS)
4599 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4602 assert (posixcc[skip] == ':');
4603 assert (posixcc[skip+1] == ']');
4604 } else if (!SIZE_ONLY) {
4605 /* [[=foo=]] and [[.foo.]] are still future. */
4607 /* adjust RExC_parse so the warning shows after
4609 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
4611 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4614 /* Maternal grandfather:
4615 * "[:" ending in ":" but not in ":]" */
4625 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
4627 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
4628 char *s = RExC_parse;
4631 while(*s && isALNUM(*s))
4633 if (*s && c == *s && s[1] == ']') {
4634 if (ckWARN(WARN_REGEXP))
4636 "POSIX syntax [%c %c] belongs inside character classes",
4639 /* [[=foo=]] and [[.foo.]] are still future. */
4640 if (POSIXCC_NOTYET(c)) {
4641 /* adjust RExC_parse so the error shows after
4643 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
4645 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4652 S_regclass(pTHX_ RExC_state_t *pRExC_state)
4655 register UV nextvalue;
4656 register IV prevvalue = OOB_UNICODE;
4657 register IV range = 0;
4658 register regnode *ret;
4661 char *rangebegin = 0;
4662 bool need_class = 0;
4663 SV *listsv = Nullsv;
4666 bool optimize_invert = TRUE;
4667 AV* unicode_alternate = 0;
4669 UV literal_endpoint = 0;
4672 ret = reganode(pRExC_state, ANYOF, 0);
4675 ANYOF_FLAGS(ret) = 0;
4677 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
4681 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4685 RExC_size += ANYOF_SKIP;
4687 RExC_emit += ANYOF_SKIP;
4689 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4691 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
4692 ANYOF_BITMAP_ZERO(ret);
4693 listsv = newSVpvn("# comment\n", 10);
4696 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4698 if (!SIZE_ONLY && POSIXCC(nextvalue))
4699 checkposixcc(pRExC_state);
4701 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4702 if (UCHARAT(RExC_parse) == ']')
4705 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
4709 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4712 rangebegin = RExC_parse;
4714 value = utf8n_to_uvchr((U8*)RExC_parse,
4715 RExC_end - RExC_parse,
4717 RExC_parse += numlen;
4720 value = UCHARAT(RExC_parse++);
4721 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4722 if (value == '[' && POSIXCC(nextvalue))
4723 namedclass = regpposixcc(pRExC_state, value);
4724 else if (value == '\\') {
4726 value = utf8n_to_uvchr((U8*)RExC_parse,
4727 RExC_end - RExC_parse,
4729 RExC_parse += numlen;
4732 value = UCHARAT(RExC_parse++);
4733 /* Some compilers cannot handle switching on 64-bit integer
4734 * values, therefore value cannot be an UV. Yes, this will
4735 * be a problem later if we want switch on Unicode.
4736 * A similar issue a little bit later when switching on
4737 * namedclass. --jhi */
4738 switch ((I32)value) {
4739 case 'w': namedclass = ANYOF_ALNUM; break;
4740 case 'W': namedclass = ANYOF_NALNUM; break;
4741 case 's': namedclass = ANYOF_SPACE; break;
4742 case 'S': namedclass = ANYOF_NSPACE; break;
4743 case 'd': namedclass = ANYOF_DIGIT; break;
4744 case 'D': namedclass = ANYOF_NDIGIT; break;
4747 if (RExC_parse >= RExC_end)
4748 vFAIL2("Empty \\%c{}", (U8)value);
4749 if (*RExC_parse == '{') {
4751 e = strchr(RExC_parse++, '}');
4753 vFAIL2("Missing right brace on \\%c{}", c);
4754 while (isSPACE(UCHARAT(RExC_parse)))
4756 if (e == RExC_parse)
4757 vFAIL2("Empty \\%c{}", c);
4759 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4767 if (UCHARAT(RExC_parse) == '^') {
4770 value = value == 'p' ? 'P' : 'p'; /* toggle */
4771 while (isSPACE(UCHARAT(RExC_parse))) {
4777 Perl_sv_catpvf(aTHX_ listsv,
4778 "+utf8::%.*s\n", (int)n, RExC_parse);
4780 Perl_sv_catpvf(aTHX_ listsv,
4781 "!utf8::%.*s\n", (int)n, RExC_parse);
4784 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4785 namedclass = ANYOF_MAX; /* no official name, but it's named */
4787 case 'n': value = '\n'; break;
4788 case 'r': value = '\r'; break;
4789 case 't': value = '\t'; break;
4790 case 'f': value = '\f'; break;
4791 case 'b': value = '\b'; break;
4792 case 'e': value = ASCII_TO_NATIVE('\033');break;
4793 case 'a': value = ASCII_TO_NATIVE('\007');break;
4795 if (*RExC_parse == '{') {
4796 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4797 | PERL_SCAN_DISALLOW_PREFIX;
4798 e = strchr(RExC_parse++, '}');
4800 vFAIL("Missing right brace on \\x{}");
4802 numlen = e - RExC_parse;
4803 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4807 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4809 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4810 RExC_parse += numlen;
4814 value = UCHARAT(RExC_parse++);
4815 value = toCTRL(value);
4817 case '0': case '1': case '2': case '3': case '4':
4818 case '5': case '6': case '7': case '8': case '9':
4822 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
4823 RExC_parse += numlen;
4827 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
4829 "Unrecognized escape \\%c in character class passed through",
4833 } /* end of \blah */
4839 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4841 if (!SIZE_ONLY && !need_class)
4842 ANYOF_CLASS_ZERO(ret);
4846 /* a bad range like a-\d, a-[:digit:] ? */
4849 if (ckWARN(WARN_REGEXP))
4851 "False [] range \"%*.*s\"",
4852 RExC_parse - rangebegin,
4853 RExC_parse - rangebegin,
4855 if (prevvalue < 256) {
4856 ANYOF_BITMAP_SET(ret, prevvalue);
4857 ANYOF_BITMAP_SET(ret, '-');
4860 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4861 Perl_sv_catpvf(aTHX_ listsv,
4862 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
4866 range = 0; /* this was not a true range */
4870 const char *what = NULL;
4873 if (namedclass > OOB_NAMEDCLASS)
4874 optimize_invert = FALSE;
4875 /* Possible truncation here but in some 64-bit environments
4876 * the compiler gets heartburn about switch on 64-bit values.
4877 * A similar issue a little earlier when switching on value.
4879 switch ((I32)namedclass) {
4882 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
4884 for (value = 0; value < 256; value++)
4886 ANYOF_BITMAP_SET(ret, value);
4893 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
4895 for (value = 0; value < 256; value++)
4896 if (!isALNUM(value))
4897 ANYOF_BITMAP_SET(ret, value);
4904 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
4906 for (value = 0; value < 256; value++)
4907 if (isALNUMC(value))
4908 ANYOF_BITMAP_SET(ret, value);
4915 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
4917 for (value = 0; value < 256; value++)
4918 if (!isALNUMC(value))
4919 ANYOF_BITMAP_SET(ret, value);
4926 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
4928 for (value = 0; value < 256; value++)
4930 ANYOF_BITMAP_SET(ret, value);
4937 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
4939 for (value = 0; value < 256; value++)
4940 if (!isALPHA(value))
4941 ANYOF_BITMAP_SET(ret, value);
4948 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
4951 for (value = 0; value < 128; value++)
4952 ANYOF_BITMAP_SET(ret, value);
4954 for (value = 0; value < 256; value++) {
4956 ANYOF_BITMAP_SET(ret, value);
4965 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
4968 for (value = 128; value < 256; value++)
4969 ANYOF_BITMAP_SET(ret, value);
4971 for (value = 0; value < 256; value++) {
4972 if (!isASCII(value))
4973 ANYOF_BITMAP_SET(ret, value);
4982 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4984 for (value = 0; value < 256; value++)
4986 ANYOF_BITMAP_SET(ret, value);
4993 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4995 for (value = 0; value < 256; value++)
4996 if (!isBLANK(value))
4997 ANYOF_BITMAP_SET(ret, value);
5004 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
5006 for (value = 0; value < 256; value++)
5008 ANYOF_BITMAP_SET(ret, value);
5015 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5017 for (value = 0; value < 256; value++)
5018 if (!isCNTRL(value))
5019 ANYOF_BITMAP_SET(ret, value);
5026 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5028 /* consecutive digits assumed */
5029 for (value = '0'; value <= '9'; value++)
5030 ANYOF_BITMAP_SET(ret, value);
5037 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5039 /* consecutive digits assumed */
5040 for (value = 0; value < '0'; value++)
5041 ANYOF_BITMAP_SET(ret, value);
5042 for (value = '9' + 1; value < 256; value++)
5043 ANYOF_BITMAP_SET(ret, value);
5050 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5052 for (value = 0; value < 256; value++)
5054 ANYOF_BITMAP_SET(ret, value);
5061 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5063 for (value = 0; value < 256; value++)
5064 if (!isGRAPH(value))
5065 ANYOF_BITMAP_SET(ret, value);
5072 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5074 for (value = 0; value < 256; value++)
5076 ANYOF_BITMAP_SET(ret, value);
5083 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5085 for (value = 0; value < 256; value++)
5086 if (!isLOWER(value))
5087 ANYOF_BITMAP_SET(ret, value);
5094 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5096 for (value = 0; value < 256; value++)
5098 ANYOF_BITMAP_SET(ret, value);
5105 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5107 for (value = 0; value < 256; value++)
5108 if (!isPRINT(value))
5109 ANYOF_BITMAP_SET(ret, value);
5116 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5118 for (value = 0; value < 256; value++)
5119 if (isPSXSPC(value))
5120 ANYOF_BITMAP_SET(ret, value);
5127 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5129 for (value = 0; value < 256; value++)
5130 if (!isPSXSPC(value))
5131 ANYOF_BITMAP_SET(ret, value);
5138 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5140 for (value = 0; value < 256; value++)
5142 ANYOF_BITMAP_SET(ret, value);
5149 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5151 for (value = 0; value < 256; value++)
5152 if (!isPUNCT(value))
5153 ANYOF_BITMAP_SET(ret, value);
5160 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5162 for (value = 0; value < 256; value++)
5164 ANYOF_BITMAP_SET(ret, value);
5171 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5173 for (value = 0; value < 256; value++)
5174 if (!isSPACE(value))
5175 ANYOF_BITMAP_SET(ret, value);
5182 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5184 for (value = 0; value < 256; value++)
5186 ANYOF_BITMAP_SET(ret, value);
5193 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5195 for (value = 0; value < 256; value++)
5196 if (!isUPPER(value))
5197 ANYOF_BITMAP_SET(ret, value);
5204 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5206 for (value = 0; value < 256; value++)
5207 if (isXDIGIT(value))
5208 ANYOF_BITMAP_SET(ret, value);
5215 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5217 for (value = 0; value < 256; value++)
5218 if (!isXDIGIT(value))
5219 ANYOF_BITMAP_SET(ret, value);
5225 /* this is to handle \p and \P */
5228 vFAIL("Invalid [::] class");
5232 /* Strings such as "+utf8::isWord\n" */
5233 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5236 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5239 } /* end of namedclass \blah */
5242 if (prevvalue > (IV)value) /* b-a */ {
5243 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
5244 RExC_parse - rangebegin,
5245 RExC_parse - rangebegin,
5247 range = 0; /* not a valid range */
5251 prevvalue = value; /* save the beginning of the range */
5252 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5253 RExC_parse[1] != ']') {
5256 /* a bad range like \w-, [:word:]- ? */
5257 if (namedclass > OOB_NAMEDCLASS) {
5258 if (ckWARN(WARN_REGEXP))
5260 "False [] range \"%*.*s\"",
5261 RExC_parse - rangebegin,
5262 RExC_parse - rangebegin,
5265 ANYOF_BITMAP_SET(ret, '-');
5267 range = 1; /* yeah, it's a range! */
5268 continue; /* but do it the next time */
5272 /* now is the next time */
5276 if (prevvalue < 256) {
5277 IV ceilvalue = value < 256 ? value : 255;
5280 /* In EBCDIC [\x89-\x91] should include
5281 * the \x8e but [i-j] should not. */
5282 if (literal_endpoint == 2 &&
5283 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5284 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5286 if (isLOWER(prevvalue)) {
5287 for (i = prevvalue; i <= ceilvalue; i++)
5289 ANYOF_BITMAP_SET(ret, i);
5291 for (i = prevvalue; i <= ceilvalue; i++)
5293 ANYOF_BITMAP_SET(ret, i);
5298 for (i = prevvalue; i <= ceilvalue; i++)
5299 ANYOF_BITMAP_SET(ret, i);
5301 if (value > 255 || UTF) {
5302 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5303 UV natvalue = NATIVE_TO_UNI(value);
5305 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5306 if (prevnatvalue < natvalue) { /* what about > ? */
5307 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5308 prevnatvalue, natvalue);
5310 else if (prevnatvalue == natvalue) {
5311 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5313 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5315 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5317 /* If folding and foldable and a single
5318 * character, insert also the folded version
5319 * to the charclass. */
5321 if (foldlen == (STRLEN)UNISKIP(f))
5322 Perl_sv_catpvf(aTHX_ listsv,
5325 /* Any multicharacter foldings
5326 * require the following transform:
5327 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5328 * where E folds into "pq" and F folds
5329 * into "rst", all other characters
5330 * fold to single characters. We save
5331 * away these multicharacter foldings,
5332 * to be later saved as part of the
5333 * additional "s" data. */
5336 if (!unicode_alternate)
5337 unicode_alternate = newAV();
5338 sv = newSVpvn((char*)foldbuf, foldlen);
5340 av_push(unicode_alternate, sv);
5344 /* If folding and the value is one of the Greek
5345 * sigmas insert a few more sigmas to make the
5346 * folding rules of the sigmas to work right.
5347 * Note that not all the possible combinations
5348 * are handled here: some of them are handled
5349 * by the standard folding rules, and some of
5350 * them (literal or EXACTF cases) are handled
5351 * during runtime in regexec.c:S_find_byclass(). */
5352 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5353 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5354 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5355 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5356 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5358 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5359 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5360 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5365 literal_endpoint = 0;
5369 range = 0; /* this range (if it was one) is done now */
5373 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5375 RExC_size += ANYOF_CLASS_ADD_SKIP;
5377 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5380 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5382 /* If the only flag is folding (plus possibly inversion). */
5383 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5385 for (value = 0; value < 256; ++value) {
5386 if (ANYOF_BITMAP_TEST(ret, value)) {
5387 UV fold = PL_fold[value];
5390 ANYOF_BITMAP_SET(ret, fold);
5393 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5396 /* optimize inverted simple patterns (e.g. [^a-z]) */
5397 if (!SIZE_ONLY && optimize_invert &&
5398 /* If the only flag is inversion. */
5399 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5400 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5401 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5402 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5409 /* The 0th element stores the character class description
5410 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5411 * to initialize the appropriate swash (which gets stored in
5412 * the 1st element), and also useful for dumping the regnode.
5413 * The 2nd element stores the multicharacter foldings,
5414 * used later (regexec.c:S_reginclass()). */
5415 av_store(av, 0, listsv);
5416 av_store(av, 1, NULL);
5417 av_store(av, 2, (SV*)unicode_alternate);
5418 rv = newRV_noinc((SV*)av);
5419 n = add_data(pRExC_state, 1, "s");
5420 RExC_rx->data->data[n] = (void*)rv;
5428 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5430 char* retval = RExC_parse++;
5433 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5434 RExC_parse[2] == '#') {
5435 while (*RExC_parse != ')') {
5436 if (RExC_parse == RExC_end)
5437 FAIL("Sequence (?#... not terminated");
5443 if (RExC_flags & PMf_EXTENDED) {
5444 if (isSPACE(*RExC_parse)) {
5448 else if (*RExC_parse == '#') {
5449 while (RExC_parse < RExC_end)
5450 if (*RExC_parse++ == '\n') break;
5459 - reg_node - emit a node
5461 STATIC regnode * /* Location. */
5462 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
5464 register regnode *ret;
5465 register regnode *ptr;
5469 SIZE_ALIGN(RExC_size);
5474 NODE_ALIGN_FILL(ret);
5476 FILL_ADVANCE_NODE(ptr, op);
5477 if (RExC_offsets) { /* MJD */
5478 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
5479 "reg_node", __LINE__,
5481 RExC_emit - RExC_emit_start > RExC_offsets[0]
5482 ? "Overwriting end of array!\n" : "OK",
5483 RExC_emit - RExC_emit_start,
5484 RExC_parse - RExC_start,
5486 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
5495 - reganode - emit a node with an argument
5497 STATIC regnode * /* Location. */
5498 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
5500 register regnode *ret;
5501 register regnode *ptr;
5505 SIZE_ALIGN(RExC_size);
5510 NODE_ALIGN_FILL(ret);
5512 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
5513 if (RExC_offsets) { /* MJD */
5514 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5518 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5519 "Overwriting end of array!\n" : "OK",
5520 RExC_emit - RExC_emit_start,
5521 RExC_parse - RExC_start,
5523 Set_Cur_Node_Offset;
5532 - reguni - emit (if appropriate) a Unicode character
5535 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
5537 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
5541 - reginsert - insert an operator in front of already-emitted operand
5543 * Means relocating the operand.
5546 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
5548 register regnode *src;
5549 register regnode *dst;
5550 register regnode *place;
5551 register int offset = regarglen[(U8)op];
5553 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
5556 RExC_size += NODE_STEP_REGNODE + offset;
5561 RExC_emit += NODE_STEP_REGNODE + offset;
5563 while (src > opnd) {
5564 StructCopy(--src, --dst, regnode);
5565 if (RExC_offsets) { /* MJD 20010112 */
5566 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
5570 dst - RExC_emit_start > RExC_offsets[0]
5571 ? "Overwriting end of array!\n" : "OK",
5572 src - RExC_emit_start,
5573 dst - RExC_emit_start,
5575 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5576 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
5581 place = opnd; /* Op node, where operand used to be. */
5582 if (RExC_offsets) { /* MJD */
5583 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5587 place - RExC_emit_start > RExC_offsets[0]
5588 ? "Overwriting end of array!\n" : "OK",
5589 place - RExC_emit_start,
5590 RExC_parse - RExC_start,
5592 Set_Node_Offset(place, RExC_parse);
5593 Set_Node_Length(place, 1);
5595 src = NEXTOPER(place);
5596 FILL_ADVANCE_NODE(place, op);
5597 Zero(src, offset, regnode);
5601 - regtail - set the next-pointer at the end of a node chain of p to val.
5604 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5606 register regnode *scan;
5607 register regnode *temp;
5612 /* Find last node. */
5615 temp = regnext(scan);
5621 if (reg_off_by_arg[OP(scan)]) {
5622 ARG_SET(scan, val - scan);
5625 NEXT_OFF(scan) = val - scan;
5630 - regoptail - regtail on operand of first argument; nop if operandless
5633 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5635 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
5636 if (p == NULL || SIZE_ONLY)
5638 if (PL_regkind[(U8)OP(p)] == BRANCH) {
5639 regtail(pRExC_state, NEXTOPER(p), val);
5641 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
5642 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
5649 - regcurly - a little FSA that accepts {\d+,?\d*}
5652 S_regcurly(pTHX_ register char *s)
5673 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
5675 register U8 op = EXACT; /* Arbitrary non-END op. */
5676 register regnode *next;
5678 while (op != END && (!last || node < last)) {
5679 /* While that wasn't END last time... */
5685 next = regnext(node);
5687 if (OP(node) == OPTIMIZED)
5690 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
5691 (int)(2*l + 1), "", SvPVX(sv));
5692 if (next == NULL) /* Next ptr. */
5693 PerlIO_printf(Perl_debug_log, "(0)");
5695 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
5696 (void)PerlIO_putc(Perl_debug_log, '\n');
5698 if (PL_regkind[(U8)op] == BRANCHJ) {
5699 register regnode *nnode = (OP(next) == LONGJMP
5702 if (last && nnode > last)
5704 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
5706 else if (PL_regkind[(U8)op] == BRANCH) {
5707 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
5709 else if ( PL_regkind[(U8)op] == TRIE ) {
5711 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5713 I32 arry_len=av_len(trie->words)+1;
5714 PerlIO_printf(Perl_debug_log,
5715 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%d%s]\n",
5719 trie->uniquecharcount,
5721 node->flags ? " EVAL mode" : "");
5723 for (word_idx=0; word_idx < arry_len; word_idx++) {
5724 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
5726 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
5729 SvPV_nolen(*elem_ptr),
5734 PerlIO_printf(Perl_debug_log, "(0)\n");
5736 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
5742 node = NEXTOPER(node);
5743 node += regarglen[(U8)op];
5746 else if ( op == CURLY) { /* `next' might be very big: optimizer */
5747 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5748 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
5750 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
5751 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5754 else if ( op == PLUS || op == STAR) {
5755 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
5757 else if (op == ANYOF) {
5758 /* arglen 1 + class block */
5759 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
5760 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
5761 node = NEXTOPER(node);
5763 else if (PL_regkind[(U8)op] == EXACT) {
5764 /* Literal string, where present. */
5765 node += NODE_SZ_STR(node) - 1;
5766 node = NEXTOPER(node);
5769 node = NEXTOPER(node);
5770 node += regarglen[(U8)op];
5772 if (op == CURLYX || op == OPEN)
5774 else if (op == WHILEM)
5780 #endif /* DEBUGGING */
5783 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
5786 Perl_regdump(pTHX_ regexp *r)
5789 SV *sv = sv_newmortal();
5791 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
5793 /* Header fields of interest. */
5794 if (r->anchored_substr)
5795 PerlIO_printf(Perl_debug_log,
5796 "anchored `%s%.*s%s'%s at %"IVdf" ",
5798 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
5799 SvPVX(r->anchored_substr),
5801 SvTAIL(r->anchored_substr) ? "$" : "",
5802 (IV)r->anchored_offset);
5803 else if (r->anchored_utf8)
5804 PerlIO_printf(Perl_debug_log,
5805 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
5807 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5808 SvPVX(r->anchored_utf8),
5810 SvTAIL(r->anchored_utf8) ? "$" : "",
5811 (IV)r->anchored_offset);
5812 if (r->float_substr)
5813 PerlIO_printf(Perl_debug_log,
5814 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
5816 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
5817 SvPVX(r->float_substr),
5819 SvTAIL(r->float_substr) ? "$" : "",
5820 (IV)r->float_min_offset, (UV)r->float_max_offset);
5821 else if (r->float_utf8)
5822 PerlIO_printf(Perl_debug_log,
5823 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
5825 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
5826 SvPVX(r->float_utf8),
5828 SvTAIL(r->float_utf8) ? "$" : "",
5829 (IV)r->float_min_offset, (UV)r->float_max_offset);
5830 if (r->check_substr || r->check_utf8)
5831 PerlIO_printf(Perl_debug_log,
5832 r->check_substr == r->float_substr
5833 && r->check_utf8 == r->float_utf8
5834 ? "(checking floating" : "(checking anchored");
5835 if (r->reganch & ROPT_NOSCAN)
5836 PerlIO_printf(Perl_debug_log, " noscan");
5837 if (r->reganch & ROPT_CHECK_ALL)
5838 PerlIO_printf(Perl_debug_log, " isall");
5839 if (r->check_substr || r->check_utf8)
5840 PerlIO_printf(Perl_debug_log, ") ");
5842 if (r->regstclass) {
5843 regprop(sv, r->regstclass);
5844 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
5846 if (r->reganch & ROPT_ANCH) {
5847 PerlIO_printf(Perl_debug_log, "anchored");
5848 if (r->reganch & ROPT_ANCH_BOL)
5849 PerlIO_printf(Perl_debug_log, "(BOL)");
5850 if (r->reganch & ROPT_ANCH_MBOL)
5851 PerlIO_printf(Perl_debug_log, "(MBOL)");
5852 if (r->reganch & ROPT_ANCH_SBOL)
5853 PerlIO_printf(Perl_debug_log, "(SBOL)");
5854 if (r->reganch & ROPT_ANCH_GPOS)
5855 PerlIO_printf(Perl_debug_log, "(GPOS)");
5856 PerlIO_putc(Perl_debug_log, ' ');
5858 if (r->reganch & ROPT_GPOS_SEEN)
5859 PerlIO_printf(Perl_debug_log, "GPOS ");
5860 if (r->reganch & ROPT_SKIP)
5861 PerlIO_printf(Perl_debug_log, "plus ");
5862 if (r->reganch & ROPT_IMPLICIT)
5863 PerlIO_printf(Perl_debug_log, "implicit ");
5864 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
5865 if (r->reganch & ROPT_EVAL_SEEN)
5866 PerlIO_printf(Perl_debug_log, "with eval ");
5867 PerlIO_printf(Perl_debug_log, "\n");
5870 U32 len = r->offsets[0];
5871 GET_RE_DEBUG_FLAGS_DECL;
5873 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5874 for (i = 1; i <= len; i++)
5875 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5876 (UV)r->offsets[i*2-1],
5877 (UV)r->offsets[i*2]);
5878 PerlIO_printf(Perl_debug_log, "\n");
5881 #endif /* DEBUGGING */
5887 S_put_byte(pTHX_ SV *sv, int c)
5889 if (isCNTRL(c) || c == 255 || !isPRINT(c))
5890 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5891 else if (c == '-' || c == ']' || c == '\\' || c == '^')
5892 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
5894 Perl_sv_catpvf(aTHX_ sv, "%c", c);
5897 #endif /* DEBUGGING */
5901 - regprop - printable representation of opcode
5904 Perl_regprop(pTHX_ SV *sv, regnode *o)
5909 sv_setpvn(sv, "", 0);
5910 if (OP(o) >= reg_num) /* regnode.type is unsigned */
5911 /* It would be nice to FAIL() here, but this may be called from
5912 regexec.c, and it would be hard to supply pRExC_state. */
5913 Perl_croak(aTHX_ "Corrupted regexp opcode");
5914 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
5916 k = PL_regkind[(U8)OP(o)];
5919 SV *dsv = sv_2mortal(newSVpvn("", 0));
5920 /* Using is_utf8_string() is a crude hack but it may
5921 * be the best for now since we have no flag "this EXACTish
5922 * node was UTF-8" --jhi */
5923 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
5925 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5926 UNI_DISPLAY_REGEX) :
5931 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5935 } else if (k == TRIE) {/*
5936 this isn't always safe, as Pl_regdata may not be for this regex yet
5937 (depending on where its called from) so its being moved to dumpuntil
5939 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5940 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5943 trie->uniquecharcount,
5946 } else if (k == CURLY) {
5947 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
5948 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5949 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
5951 else if (k == WHILEM && o->flags) /* Ordinal/of */
5952 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
5953 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
5954 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5955 else if (k == LOGICAL)
5956 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
5957 else if (k == ANYOF) {
5958 int i, rangestart = -1;
5959 U8 flags = ANYOF_FLAGS(o);
5960 const char * const anyofs[] = { /* Should be synchronized with
5961 * ANYOF_ #xdefines in regcomp.h */
5994 if (flags & ANYOF_LOCALE)
5995 sv_catpv(sv, "{loc}");
5996 if (flags & ANYOF_FOLD)
5997 sv_catpv(sv, "{i}");
5998 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
5999 if (flags & ANYOF_INVERT)
6001 for (i = 0; i <= 256; i++) {
6002 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6003 if (rangestart == -1)
6005 } else if (rangestart != -1) {
6006 if (i <= rangestart + 3)
6007 for (; rangestart < i; rangestart++)
6008 put_byte(sv, rangestart);
6010 put_byte(sv, rangestart);
6012 put_byte(sv, i - 1);
6018 if (o->flags & ANYOF_CLASS)
6019 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
6020 if (ANYOF_CLASS_TEST(o,i))
6021 sv_catpv(sv, anyofs[i]);
6023 if (flags & ANYOF_UNICODE)
6024 sv_catpv(sv, "{unicode}");
6025 else if (flags & ANYOF_UNICODE_ALL)
6026 sv_catpv(sv, "{unicode_all}");
6030 SV *sw = regclass_swash(o, FALSE, &lv, 0);
6034 U8 s[UTF8_MAXBYTES_CASE+1];
6036 for (i = 0; i <= 256; i++) { /* just the first 256 */
6037 U8 *e = uvchr_to_utf8(s, i);
6039 if (i < 256 && swash_fetch(sw, s, TRUE)) {
6040 if (rangestart == -1)
6042 } else if (rangestart != -1) {
6045 if (i <= rangestart + 3)
6046 for (; rangestart < i; rangestart++) {
6047 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
6051 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
6054 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
6061 sv_catpv(sv, "..."); /* et cetera */
6065 char *s = savesvpv(lv);
6068 while(*s && *s != '\n') s++;
6089 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6091 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
6092 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
6093 #endif /* DEBUGGING */
6097 Perl_re_intuit_string(pTHX_ regexp *prog)
6098 { /* Assume that RE_INTUIT is set */
6099 GET_RE_DEBUG_FLAGS_DECL;
6102 char *s = SvPV(prog->check_substr
6103 ? prog->check_substr : prog->check_utf8, n_a);
6105 if (!PL_colorset) reginitcolors();
6106 PerlIO_printf(Perl_debug_log,
6107 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
6109 prog->check_substr ? "" : "utf8 ",
6110 PL_colors[5],PL_colors[0],
6113 (strlen(s) > 60 ? "..." : ""));
6116 return prog->check_substr ? prog->check_substr : prog->check_utf8;
6120 Perl_pregfree(pTHX_ struct regexp *r)
6123 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
6124 SV *re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
6128 if (!r || (--r->refcnt > 0))
6130 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
6134 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
6135 r->prelen, 60, UNI_DISPLAY_REGEX)
6136 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6140 PerlIO_printf(Perl_debug_log,
6141 "%sFreeing REx:%s %s%*.*s%s%s\n",
6142 PL_colors[4],PL_colors[5],PL_colors[0],
6145 len > 60 ? "..." : "");
6149 Safefree(r->precomp);
6150 if (r->offsets) /* 20010421 MJD */
6151 Safefree(r->offsets);
6152 RX_MATCH_COPY_FREE(r);
6153 #ifdef PERL_COPY_ON_WRITE
6155 SvREFCNT_dec(r->saved_copy);
6158 if (r->anchored_substr)
6159 SvREFCNT_dec(r->anchored_substr);
6160 if (r->anchored_utf8)
6161 SvREFCNT_dec(r->anchored_utf8);
6162 if (r->float_substr)
6163 SvREFCNT_dec(r->float_substr);
6165 SvREFCNT_dec(r->float_utf8);
6166 Safefree(r->substrs);
6169 int n = r->data->count;
6170 PAD* new_comppad = NULL;
6175 /* If you add a ->what type here, update the comment in regcomp.h */
6176 switch (r->data->what[n]) {
6178 SvREFCNT_dec((SV*)r->data->data[n]);
6181 Safefree(r->data->data[n]);
6184 new_comppad = (AV*)r->data->data[n];
6187 if (new_comppad == NULL)
6188 Perl_croak(aTHX_ "panic: pregfree comppad");
6189 PAD_SAVE_LOCAL(old_comppad,
6190 /* Watch out for global destruction's random ordering. */
6191 (SvTYPE(new_comppad) == SVt_PVAV) ?
6192 new_comppad : Null(PAD *)
6195 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6198 op_free((OP_4tree*)r->data->data[n]);
6200 PAD_RESTORE_LOCAL(old_comppad);
6201 SvREFCNT_dec((SV*)new_comppad);
6208 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6211 refcount = trie->refcount--;
6215 Safefree(trie->charmap);
6216 if (trie->widecharmap)
6217 SvREFCNT_dec((SV*)trie->widecharmap);
6219 Safefree(trie->states);
6221 Safefree(trie->trans);
6224 SvREFCNT_dec((SV*)trie->words);
6225 if (trie->revcharmap)
6226 SvREFCNT_dec((SV*)trie->revcharmap);
6228 Safefree(r->data->data[n]); /* do this last!!!! */
6233 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6236 Safefree(r->data->what);
6239 Safefree(r->startp);
6245 - regnext - dig the "next" pointer out of a node
6248 Perl_regnext(pTHX_ register regnode *p)
6250 register I32 offset;
6252 if (p == &PL_regdummy)
6255 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6263 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6266 STRLEN l1 = strlen(pat1);
6267 STRLEN l2 = strlen(pat2);
6276 Copy(pat1, buf, l1 , char);
6277 Copy(pat2, buf + l1, l2 , char);
6278 buf[l1 + l2] = '\n';
6279 buf[l1 + l2 + 1] = '\0';
6281 /* ANSI variant takes additional second argument */
6282 va_start(args, pat2);
6286 msv = vmess(buf, &args);
6288 message = SvPV(msv,l1);
6291 Copy(message, buf, l1 , char);
6292 buf[l1-1] = '\0'; /* Overwrite \n */
6293 Perl_croak(aTHX_ "%s", buf);
6296 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6299 Perl_save_re_context(pTHX)
6301 SAVEI32(PL_reg_flags); /* from regexec.c */
6303 SAVEPPTR(PL_reginput); /* String-input pointer. */
6304 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6305 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
6306 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6307 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6308 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
6309 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
6310 SAVEPPTR(PL_regtill); /* How far we are required to go. */
6311 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
6312 PL_reg_start_tmp = 0;
6313 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6314 PL_reg_start_tmpl = 0;
6315 SAVEVPTR(PL_regdata);
6316 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6317 SAVEI32(PL_regnarrate); /* from regexec.c */
6318 SAVEVPTR(PL_regprogram); /* from regexec.c */
6319 SAVEINT(PL_regindent); /* from regexec.c */
6320 SAVEVPTR(PL_regcc); /* from regexec.c */
6321 SAVEVPTR(PL_curcop);
6322 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6323 SAVEVPTR(PL_reg_re); /* from regexec.c */
6324 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6325 SAVESPTR(PL_reg_sv); /* from regexec.c */
6326 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
6327 SAVEVPTR(PL_reg_magic); /* from regexec.c */
6328 SAVEI32(PL_reg_oldpos); /* from regexec.c */
6329 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6330 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
6331 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
6332 PL_reg_oldsaved = Nullch;
6333 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6334 PL_reg_oldsavedlen = 0;
6335 #ifdef PERL_COPY_ON_WRITE
6339 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6341 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6342 PL_reg_leftiter = 0;
6343 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
6344 PL_reg_poscache = Nullch;
6345 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6346 PL_reg_poscache_size = 0;
6347 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
6348 SAVEI32(PL_regnpar); /* () count. */
6349 SAVEI32(PL_regsize); /* from regexec.c */
6352 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6356 char digits[TYPE_CHARS(long)];
6358 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
6359 for (i = 1; i <= rx->nparens; i++) {
6360 sprintf(digits, "%lu", (long)i);
6361 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
6368 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
6373 clear_re(pTHX_ void *r)
6375 ReREFCNT_dec((regexp *)r);
6380 * c-indentation-style: bsd
6382 * indent-tabs-mode: t
6385 * vim: shiftwidth=4: