5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
55 # define PERL_NO_GET_CONTEXT
60 * pregcomp and pregexec -- regsub and regerror are not used in perl
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
80 **** Alterations to Henry's code are...
82 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
83 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
85 **** You may distribute under the terms of either the GNU General Public
86 **** License or the Artistic License, as specified in the README file.
89 * Beware that some of this code is subtly aware of the way operator
90 * precedence is structured in regular expressions. Serious changes in
91 * regular-expression syntax might require a total rethink.
94 #define PERL_IN_REGCOMP_C
97 #ifndef PERL_IN_XSUB_RE
109 # if defined(BUGGY_MSC6)
110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
111 # pragma optimize("a",off)
112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
113 # pragma optimize("w",on )
114 # endif /* BUGGY_MSC6 */
118 #define STATIC static
121 typedef struct RExC_state_t {
122 U32 flags; /* are we folding, multilining? */
123 char *precomp; /* uncompiled string. */
125 char *start; /* Start of input for compile */
126 char *end; /* End of input for compile */
127 char *parse; /* Input-scan pointer. */
128 I32 whilem_seen; /* number of WHILEM in this expr */
129 regnode *emit_start; /* Start of emitted-code area */
130 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
131 I32 naughty; /* How bad is this pattern? */
132 I32 sawback; /* Did we see \1, ...? */
134 I32 size; /* Code size. */
135 I32 npar; /* () count. */
141 char *starttry; /* -Dr: where regtry was called. */
142 #define RExC_starttry (pRExC_state->starttry)
146 #define RExC_flags (pRExC_state->flags)
147 #define RExC_precomp (pRExC_state->precomp)
148 #define RExC_rx (pRExC_state->rx)
149 #define RExC_start (pRExC_state->start)
150 #define RExC_end (pRExC_state->end)
151 #define RExC_parse (pRExC_state->parse)
152 #define RExC_whilem_seen (pRExC_state->whilem_seen)
153 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
154 #define RExC_emit (pRExC_state->emit)
155 #define RExC_emit_start (pRExC_state->emit_start)
156 #define RExC_naughty (pRExC_state->naughty)
157 #define RExC_sawback (pRExC_state->sawback)
158 #define RExC_seen (pRExC_state->seen)
159 #define RExC_size (pRExC_state->size)
160 #define RExC_npar (pRExC_state->npar)
161 #define RExC_extralen (pRExC_state->extralen)
162 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163 #define RExC_seen_evals (pRExC_state->seen_evals)
164 #define RExC_utf8 (pRExC_state->utf8)
166 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
167 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
168 ((*s) == '{' && regcurly(s)))
171 #undef SPSTART /* dratted cpp namespace... */
174 * Flags to be passed up and down.
176 #define WORST 0 /* Worst case. */
177 #define HASWIDTH 0x1 /* Known to match non-null strings. */
178 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
179 #define SPSTART 0x4 /* Starts with * or +. */
180 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
182 /* Length of a variant. */
184 typedef struct scan_data_t {
190 I32 last_end; /* min value, <0 unless valid. */
193 SV **longest; /* Either &l_fixed, or &l_float. */
197 I32 offset_float_min;
198 I32 offset_float_max;
202 struct regnode_charclass_class *start_class;
206 * Forward declarations for pregcomp()'s friends.
209 static const scan_data_t zero_scan_data =
210 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
212 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
213 #define SF_BEFORE_SEOL 0x1
214 #define SF_BEFORE_MEOL 0x2
215 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
216 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
219 # define SF_FIX_SHIFT_EOL (0+2)
220 # define SF_FL_SHIFT_EOL (0+4)
222 # define SF_FIX_SHIFT_EOL (+2)
223 # define SF_FL_SHIFT_EOL (+4)
226 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
227 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
229 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
230 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
231 #define SF_IS_INF 0x40
232 #define SF_HAS_PAR 0x80
233 #define SF_IN_PAR 0x100
234 #define SF_HAS_EVAL 0x200
235 #define SCF_DO_SUBSTR 0x400
236 #define SCF_DO_STCLASS_AND 0x0800
237 #define SCF_DO_STCLASS_OR 0x1000
238 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
239 #define SCF_WHILEM_VISITED_POS 0x2000
241 #define UTF (RExC_utf8 != 0)
242 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
243 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
245 #define OOB_UNICODE 12345678
246 #define OOB_NAMEDCLASS -1
248 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
249 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
252 /* length of regex to show in messages that don't mark a position within */
253 #define RegexLengthToShowInErrorMessages 127
256 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
257 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
258 * op/pragma/warn/regcomp.
260 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
261 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
263 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
266 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
267 * arg. Show regex, up to a maximum length. If it's too long, chop and add
270 #define FAIL(msg) STMT_START { \
271 const char *ellipses = ""; \
272 IV len = RExC_end - RExC_precomp; \
275 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
276 if (len > RegexLengthToShowInErrorMessages) { \
277 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
278 len = RegexLengthToShowInErrorMessages - 10; \
281 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
282 msg, (int)len, RExC_precomp, ellipses); \
286 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
287 * args. Show regex, up to a maximum length. If it's too long, chop and add
290 #define FAIL2(pat,msg) STMT_START { \
291 const char *ellipses = ""; \
292 IV len = RExC_end - RExC_precomp; \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 if (len > RegexLengthToShowInErrorMessages) { \
297 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
298 len = RegexLengthToShowInErrorMessages - 10; \
301 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
302 msg, (int)len, RExC_precomp, ellipses); \
307 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
309 #define Simple_vFAIL(m) STMT_START { \
310 IV offset = RExC_parse - RExC_precomp; \
311 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
312 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
316 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
318 #define vFAIL(m) STMT_START { \
320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
325 * Like Simple_vFAIL(), but accepts two arguments.
327 #define Simple_vFAIL2(m,a1) STMT_START { \
328 IV offset = RExC_parse - RExC_precomp; \
329 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
330 (int)offset, RExC_precomp, RExC_precomp + offset); \
334 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
336 #define vFAIL2(m,a1) STMT_START { \
338 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
339 Simple_vFAIL2(m, a1); \
344 * Like Simple_vFAIL(), but accepts three arguments.
346 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
347 IV offset = RExC_parse - RExC_precomp; \
348 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
349 (int)offset, RExC_precomp, RExC_precomp + offset); \
353 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 #define vFAIL3(m,a1,a2) STMT_START { \
357 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
358 Simple_vFAIL3(m, a1, a2); \
362 * Like Simple_vFAIL(), but accepts four arguments.
364 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
365 IV offset = RExC_parse - RExC_precomp; \
366 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
367 (int)offset, RExC_precomp, RExC_precomp + offset); \
371 * Like Simple_vFAIL(), but accepts five arguments.
373 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
374 IV offset = RExC_parse - RExC_precomp; \
375 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
376 (int)offset, RExC_precomp, RExC_precomp + offset); \
380 #define vWARN(loc,m) STMT_START { \
381 IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
383 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
386 #define vWARNdep(loc,m) STMT_START { \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
389 "%s" REPORT_LOCATION, \
390 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
394 #define vWARN2(loc, m, a1) STMT_START { \
395 IV offset = loc - RExC_precomp; \
396 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
397 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
400 #define vWARN3(loc, m, a1, a2) STMT_START { \
401 IV offset = loc - RExC_precomp; \
402 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
403 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
406 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
407 IV offset = loc - RExC_precomp; \
408 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
409 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
412 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
413 IV offset = loc - RExC_precomp; \
414 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
415 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
419 /* Allow for side effects in s */
420 #define REGC(c,s) STMT_START { \
421 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
424 /* Macros for recording node offsets. 20001227 mjd@plover.com
425 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
426 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
427 * Element 0 holds the number n.
430 #define MJD_OFFSET_DEBUG(x)
431 /* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
434 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
436 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
437 __LINE__, (node), (byte))); \
439 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
441 RExC_offsets[2*(node)-1] = (byte); \
446 #define Set_Node_Offset(node,byte) \
447 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
448 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
450 #define Set_Node_Length_To_R(node,len) STMT_START { \
452 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
453 __LINE__, (node), (len))); \
455 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
457 RExC_offsets[2*(node)] = (len); \
462 #define Set_Node_Length(node,len) \
463 Set_Node_Length_To_R((node)-RExC_emit_start, len)
464 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
465 #define Set_Node_Cur_Length(node) \
466 Set_Node_Length(node, RExC_parse - parse_start)
468 /* Get offsets and lengths */
469 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
470 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
472 static void clear_re(pTHX_ void *r);
474 /* Mark that we cannot extend a found fixed substring at this point.
475 Updata the longest found anchored substring and the longest found
476 floating substrings if needed. */
479 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
481 const STRLEN l = CHR_SVLEN(data->last_found);
482 const STRLEN old_l = CHR_SVLEN(*data->longest);
484 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
485 SvSetMagicSV(*data->longest, data->last_found);
486 if (*data->longest == data->longest_fixed) {
487 data->offset_fixed = l ? data->last_start_min : data->pos_min;
488 if (data->flags & SF_BEFORE_EOL)
490 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
492 data->flags &= ~SF_FIX_BEFORE_EOL;
495 data->offset_float_min = l ? data->last_start_min : data->pos_min;
496 data->offset_float_max = (l
497 ? data->last_start_max
498 : data->pos_min + data->pos_delta);
499 if ((U32)data->offset_float_max > (U32)I32_MAX)
500 data->offset_float_max = I32_MAX;
501 if (data->flags & SF_BEFORE_EOL)
503 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
505 data->flags &= ~SF_FL_BEFORE_EOL;
508 SvCUR_set(data->last_found, 0);
510 SV * sv = data->last_found;
512 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
513 if (mg && mg->mg_len > 0)
517 data->flags &= ~SF_BEFORE_EOL;
520 /* Can match anything (initialization) */
522 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 ANYOF_CLASS_ZERO(cl);
525 ANYOF_BITMAP_SETALL(cl);
526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
528 cl->flags |= ANYOF_LOCALE;
531 /* Can match anything (initialization) */
533 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
537 for (value = 0; value <= ANYOF_MAX; value += 2)
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
547 /* Can match anything (initialization) */
549 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
551 Zero(cl, 1, struct regnode_charclass_class);
553 cl_anything(pRExC_state, cl);
557 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
559 Zero(cl, 1, struct regnode_charclass_class);
561 cl_anything(pRExC_state, cl);
563 cl->flags |= ANYOF_LOCALE;
566 /* 'And' a given class with another one. Can create false positives */
567 /* We assume that cl is not inverted */
569 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
590 !(and_with->flags & ANYOF_INVERT)) {
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
595 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
596 !(and_with->flags & ANYOF_INVERT))
597 cl->flags &= ~ANYOF_UNICODE_ALL;
598 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
599 !(and_with->flags & ANYOF_INVERT))
600 cl->flags &= ~ANYOF_UNICODE;
603 /* 'OR' a given class with another one. Can create false positives */
604 /* We assume that cl is not inverted */
606 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
608 if (or_with->flags & ANYOF_INVERT) {
610 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
611 * <= (B1 | !B2) | (CL1 | !CL2)
612 * which is wasteful if CL2 is small, but we ignore CL2:
613 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
614 * XXXX Can we handle case-fold? Unclear:
615 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
616 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
618 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
619 && !(or_with->flags & ANYOF_FOLD)
620 && !(cl->flags & ANYOF_FOLD) ) {
623 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624 cl->bitmap[i] |= ~or_with->bitmap[i];
625 } /* XXXX: logic is complicated otherwise */
627 cl_anything(pRExC_state, cl);
630 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
631 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
632 && (!(or_with->flags & ANYOF_FOLD)
633 || (cl->flags & ANYOF_FOLD)) ) {
636 /* OR char bitmap and class bitmap separately */
637 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
638 cl->bitmap[i] |= or_with->bitmap[i];
639 if (or_with->flags & ANYOF_CLASS) {
640 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
641 cl->classflags[i] |= or_with->classflags[i];
642 cl->flags |= ANYOF_CLASS;
645 else { /* XXXX: logic is complicated, leave it along for a moment. */
646 cl_anything(pRExC_state, cl);
649 if (or_with->flags & ANYOF_EOS)
650 cl->flags |= ANYOF_EOS;
652 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
653 ARG(cl) != ARG(or_with)) {
654 cl->flags |= ANYOF_UNICODE_ALL;
655 cl->flags &= ~ANYOF_UNICODE;
657 if (or_with->flags & ANYOF_UNICODE_ALL) {
658 cl->flags |= ANYOF_UNICODE_ALL;
659 cl->flags &= ~ANYOF_UNICODE;
665 make_trie(startbranch,first,last,tail,flags)
666 startbranch: the first branch in the whole branch sequence
667 first : start branch of sequence of branch-exact nodes.
668 May be the same as startbranch
669 last : Thing following the last branch.
670 May be the same as tail.
671 tail : item following the branch sequence
672 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
674 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
676 A trie is an N'ary tree where the branches are determined by digital
677 decomposition of the key. IE, at the root node you look up the 1st character and
678 follow that branch repeat until you find the end of the branches. Nodes can be
679 marked as "accepting" meaning they represent a complete word. Eg:
683 would convert into the following structure. Numbers represent states, letters
684 following numbers represent valid transitions on the letter from that state, if
685 the number is in square brackets it represents an accepting state, otherwise it
686 will be in parenthesis.
688 +-h->+-e->[3]-+-r->(8)-+-s->[9]
692 (1) +-i->(6)-+-s->[7]
694 +-s->(3)-+-h->(4)-+-e->[5]
696 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
698 This shows that when matching against the string 'hers' we will begin at state 1
699 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
700 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
701 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
702 single traverse. We store a mapping from accepting to state to which word was
703 matched, and then when we have multiple possibilities we try to complete the
704 rest of the regex in the order in which they occured in the alternation.
706 The only prior NFA like behaviour that would be changed by the TRIE support is
707 the silent ignoring of duplicate alternations which are of the form:
709 / (DUPE|DUPE) X? (?{ ... }) Y /x
711 Thus EVAL blocks follwing a trie may be called a different number of times with
712 and without the optimisation. With the optimisations dupes will be silently
713 ignored. This inconsistant behaviour of EVAL type nodes is well established as
714 the following demonstrates:
716 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
718 which prints out 'word' three times, but
720 'words'=~/(word|word|word)(?{ print $1 })S/
722 which doesnt print it out at all. This is due to other optimisations kicking in.
724 Example of what happens on a structural level:
726 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
728 1: CURLYM[1] {1,32767}(18)
739 This would be optimizable with startbranch=5, first=5, last=16, tail=16
740 and should turn into:
742 1: CURLYM[1] {1,32767}(18)
744 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
752 Cases where tail != last would be like /(?foo|bar)baz/:
762 which would be optimizable with startbranch=1, first=1, last=7, tail=8
763 and would end up looking like:
766 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
775 #define TRIE_DEBUG_CHAR \
776 DEBUG_TRIE_COMPILE_r({ \
779 tmp = newSVpv( "", 0 ); \
780 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
782 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
784 av_push( trie->revcharmap, tmp ); \
787 #define TRIE_READ_CHAR STMT_START { \
790 if ( foldlen > 0 ) { \
791 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
796 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
797 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
798 foldlen -= UNISKIP( uvc ); \
799 scan = foldbuf + UNISKIP( uvc ); \
802 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
811 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
812 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
813 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
814 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
816 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
817 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
818 TRIE_LIST_LEN( state ) *= 2; \
819 Renew( trie->states[ state ].trans.list, \
820 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
822 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
823 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
824 TRIE_LIST_CUR( state )++; \
827 #define TRIE_LIST_NEW(state) STMT_START { \
828 Newz( 1023, trie->states[ state ].trans.list, \
829 4, reg_trie_trans_le ); \
830 TRIE_LIST_CUR( state ) = 1; \
831 TRIE_LIST_LEN( state ) = 4; \
835 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
838 /* first pass, loop through and scan words */
841 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
846 /* we just use folder as a flag in utf8 */
847 const U8 * const folder = ( flags == EXACTF
855 const U32 data_slot = add_data( pRExC_state, 1, "t" );
858 GET_RE_DEBUG_FLAGS_DECL;
860 Newz( 848200, trie, 1, reg_trie_data );
862 RExC_rx->data->data[ data_slot ] = (void*)trie;
863 Newz( 848201, trie->charmap, 256, U16 );
865 trie->words = newAV();
866 trie->revcharmap = newAV();
870 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
871 if (!SvIOK(re_trie_maxbuff)) {
872 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
875 /* -- First loop and Setup --
877 We first traverse the branches and scan each word to determine if it
878 contains widechars, and how many unique chars there are, this is
879 important as we have to build a table with at least as many columns as we
882 We use an array of integers to represent the character codes 0..255
883 (trie->charmap) and we use a an HV* to store unicode characters. We use the
884 native representation of the character value as the key and IV's for the
887 *TODO* If we keep track of how many times each character is used we can
888 remap the columns so that the table compression later on is more
889 efficient in terms of memory by ensuring most common value is in the
890 middle and the least common are on the outside. IMO this would be better
891 than a most to least common mapping as theres a decent chance the most
892 common letter will share a node with the least common, meaning the node
893 will not be compressable. With a middle is most common approach the worst
894 case is when we have the least common nodes twice.
899 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
900 regnode *noper = NEXTOPER( cur );
901 const U8 *uc = (U8*)STRING( noper );
902 const U8 *e = uc + STR_LEN( noper );
904 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
905 const U8 *scan = (U8*)NULL;
907 for ( ; uc < e ; uc += len ) {
911 if ( !trie->charmap[ uvc ] ) {
912 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
914 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
919 if ( !trie->widecharmap )
920 trie->widecharmap = newHV();
922 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
925 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
927 if ( !SvTRUE( *svpp ) ) {
928 sv_setiv( *svpp, ++trie->uniquecharcount );
934 } /* end first pass */
935 DEBUG_TRIE_COMPILE_r(
936 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
937 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
938 (int)trie->charcount, trie->uniquecharcount )
943 We now know what we are dealing with in terms of unique chars and
944 string sizes so we can calculate how much memory a naive
945 representation using a flat table will take. If it's over a reasonable
946 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
947 conservative but potentially much slower representation using an array
950 At the end we convert both representations into the same compressed
951 form that will be used in regexec.c for matching with. The latter
952 is a form that cannot be used to construct with but has memory
953 properties similar to the list form and access properties similar
954 to the table form making it both suitable for fast searches and
955 small enough that its feasable to store for the duration of a program.
957 See the comment in the code where the compressed table is produced
958 inplace from the flat tabe representation for an explanation of how
959 the compression works.
964 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
966 Second Pass -- Array Of Lists Representation
968 Each state will be represented by a list of charid:state records
969 (reg_trie_trans_le) the first such element holds the CUR and LEN
970 points of the allocated array. (See defines above).
972 We build the initial structure using the lists, and then convert
973 it into the compressed table form which allows faster lookups
974 (but cant be modified once converted).
980 STRLEN transcount = 1;
982 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
986 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
988 regnode *noper = NEXTOPER( cur );
989 U8 *uc = (U8*)STRING( noper );
990 U8 *e = uc + STR_LEN( noper );
991 U32 state = 1; /* required init */
992 U16 charid = 0; /* sanity init */
993 U8 *scan = (U8*)NULL; /* sanity init */
994 STRLEN foldlen = 0; /* required init */
995 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
998 for ( ; uc < e ; uc += len ) {
1003 charid = trie->charmap[ uvc ];
1005 SV** svpp=(SV**)NULL;
1006 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1010 charid=(U16)SvIV( *svpp );
1019 if ( !trie->states[ state ].trans.list ) {
1020 TRIE_LIST_NEW( state );
1022 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1023 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1024 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1029 newstate = next_alloc++;
1030 TRIE_LIST_PUSH( state, charid, newstate );
1036 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1038 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1041 if ( !trie->states[ state ].wordnum ) {
1042 /* we havent inserted this word into the structure yet. */
1043 trie->states[ state ].wordnum = ++curword;
1046 /* store the word for dumping */
1047 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1048 if ( UTF ) SvUTF8_on( tmp );
1049 av_push( trie->words, tmp );
1053 /* Its a dupe. So ignore it. */
1056 } /* end second pass */
1058 trie->laststate = next_alloc;
1059 Renew( trie->states, next_alloc, reg_trie_state );
1061 DEBUG_TRIE_COMPILE_MORE_r({
1066 print out the table precompression.
1069 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1070 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1072 for( state=1 ; state < next_alloc ; state ++ ) {
1074 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
1075 if ( ! trie->states[ state ].wordnum ) {
1076 PerlIO_printf( Perl_debug_log, "%5s| ","");
1078 PerlIO_printf( Perl_debug_log, "W%04x| ",
1079 trie->states[ state ].wordnum
1082 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1083 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1084 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
1086 TRIE_LIST_ITEM(state,charid).forid,
1087 (UV)TRIE_LIST_ITEM(state,charid).newstate
1092 PerlIO_printf( Perl_debug_log, "\n\n" );
1095 Newz( 848203, trie->trans, transcount ,reg_trie_trans );
1103 for( state=1 ; state < next_alloc ; state ++ ) {
1107 DEBUG_TRIE_COMPILE_MORE_r(
1108 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1112 if (trie->states[state].trans.list) {
1113 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1117 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1118 if ( TRIE_LIST_ITEM( state, idx).forid < minid ) {
1119 minid=TRIE_LIST_ITEM( state, idx).forid;
1120 } else if ( TRIE_LIST_ITEM( state, idx).forid > maxid ) {
1121 maxid=TRIE_LIST_ITEM( state, idx).forid;
1124 if ( transcount < tp + maxid - minid + 1) {
1126 Renew( trie->trans, transcount, reg_trie_trans );
1127 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1129 base = trie->uniquecharcount + tp - minid;
1130 if ( maxid == minid ) {
1132 for ( ; zp < tp ; zp++ ) {
1133 if ( ! trie->trans[ zp ].next ) {
1134 base = trie->uniquecharcount + zp - minid;
1135 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1136 trie->trans[ zp ].check = state;
1142 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1143 trie->trans[ tp ].check = state;
1148 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1149 U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1150 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1151 trie->trans[ tid ].check = state;
1153 tp += ( maxid - minid + 1 );
1155 Safefree(trie->states[ state ].trans.list);
1158 DEBUG_TRIE_COMPILE_MORE_r(
1159 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1162 trie->states[ state ].trans.base=base;
1164 trie->lasttrans = tp + 1;
1168 Second Pass -- Flat Table Representation.
1170 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1171 We know that we will need Charcount+1 trans at most to store the data
1172 (one row per char at worst case) So we preallocate both structures
1173 assuming worst case.
1175 We then construct the trie using only the .next slots of the entry
1178 We use the .check field of the first entry of the node temporarily to
1179 make compression both faster and easier by keeping track of how many non
1180 zero fields are in the node.
1182 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1185 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1186 number representing the first entry of the node, and state as a
1187 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1188 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1189 are 2 entrys per node. eg:
1197 The table is internally in the right hand, idx form. However as we also
1198 have to deal with the states array which is indexed by nodenum we have to
1199 use TRIE_NODENUM() to convert.
1203 Newz( 848203, trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1205 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
1206 next_alloc = trie->uniquecharcount + 1;
1208 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1210 regnode *noper = NEXTOPER( cur );
1211 U8 *uc = (U8*)STRING( noper );
1212 U8 *e = uc + STR_LEN( noper );
1214 U32 state = 1; /* required init */
1216 U16 charid = 0; /* sanity init */
1217 U32 accept_state = 0; /* sanity init */
1218 U8 *scan = (U8*)NULL; /* sanity init */
1220 STRLEN foldlen = 0; /* required init */
1221 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1224 for ( ; uc < e ; uc += len ) {
1229 charid = trie->charmap[ uvc ];
1231 SV** svpp=(SV**)NULL;
1232 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1236 charid=(U16)SvIV( *svpp );
1241 if ( !trie->trans[ state + charid ].next ) {
1242 trie->trans[ state + charid ].next = next_alloc;
1243 trie->trans[ state ].check++;
1244 next_alloc += trie->uniquecharcount;
1246 state = trie->trans[ state + charid ].next;
1248 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1250 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1253 accept_state = TRIE_NODENUM( state );
1254 if ( !trie->states[ accept_state ].wordnum ) {
1255 /* we havent inserted this word into the structure yet. */
1256 trie->states[ accept_state ].wordnum = ++curword;
1259 /* store the word for dumping */
1260 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1261 if ( UTF ) SvUTF8_on( tmp );
1262 av_push( trie->words, tmp );
1266 /* Its a dupe. So ignore it. */
1269 } /* end second pass */
1271 DEBUG_TRIE_COMPILE_MORE_r({
1273 print out the table precompression so that we can do a visual check
1274 that they are identical.
1278 PerlIO_printf( Perl_debug_log, "\nChar : " );
1280 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1281 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1283 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1287 PerlIO_printf( Perl_debug_log, "\nState+-" );
1289 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1290 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1293 PerlIO_printf( Perl_debug_log, "\n" );
1295 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1297 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
1299 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1300 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1301 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1303 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1304 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
1306 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
1307 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1310 PerlIO_printf( Perl_debug_log, "\n\n" );
1314 * Inplace compress the table.*
1316 For sparse data sets the table constructed by the trie algorithm will
1317 be mostly 0/FAIL transitions or to put it another way mostly empty.
1318 (Note that leaf nodes will not contain any transitions.)
1320 This algorithm compresses the tables by eliminating most such
1321 transitions, at the cost of a modest bit of extra work during lookup:
1323 - Each states[] entry contains a .base field which indicates the
1324 index in the state[] array wheres its transition data is stored.
1326 - If .base is 0 there are no valid transitions from that node.
1328 - If .base is nonzero then charid is added to it to find an entry in
1331 -If trans[states[state].base+charid].check!=state then the
1332 transition is taken to be a 0/Fail transition. Thus if there are fail
1333 transitions at the front of the node then the .base offset will point
1334 somewhere inside the previous nodes data (or maybe even into a node
1335 even earlier), but the .check field determines if the transition is
1338 The following process inplace converts the table to the compressed
1339 table: We first do not compress the root node 1,and mark its all its
1340 .check pointers as 1 and set its .base pointer as 1 as well. This
1341 allows to do a DFA construction from the compressed table later, and
1342 ensures that any .base pointers we calculate later are greater than
1345 - We set 'pos' to indicate the first entry of the second node.
1347 - We then iterate over the columns of the node, finding the first and
1348 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1349 and set the .check pointers accordingly, and advance pos
1350 appropriately and repreat for the next node. Note that when we copy
1351 the next pointers we have to convert them from the original
1352 NODEIDX form to NODENUM form as the former is not valid post
1355 - If a node has no transitions used we mark its base as 0 and do not
1356 advance the pos pointer.
1358 - If a node only has one transition we use a second pointer into the
1359 structure to fill in allocated fail transitions from other states.
1360 This pointer is independent of the main pointer and scans forward
1361 looking for null transitions that are allocated to a state. When it
1362 finds one it writes the single transition into the "hole". If the
1363 pointer doesnt find one the single transition is appeneded as normal.
1365 - Once compressed we can Renew/realloc the structures to release the
1368 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1369 specifically Fig 3.47 and the associated pseudocode.
1373 const U32 laststate = TRIE_NODENUM( next_alloc );
1374 U32 used , state, charid;
1376 trie->laststate = laststate;
1378 for ( state = 1 ; state < laststate ; state++ ) {
1380 U32 stateidx = TRIE_NODEIDX( state );
1381 U32 o_used=trie->trans[ stateidx ].check;
1382 used = trie->trans[ stateidx ].check;
1383 trie->trans[ stateidx ].check = 0;
1385 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1386 if ( flag || trie->trans[ stateidx + charid ].next ) {
1387 if ( trie->trans[ stateidx + charid ].next ) {
1389 for ( ; zp < pos ; zp++ ) {
1390 if ( ! trie->trans[ zp ].next ) {
1394 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1395 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1396 trie->trans[ zp ].check = state;
1397 if ( ++zp > pos ) pos = zp;
1404 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1406 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1407 trie->trans[ pos ].check = state;
1412 trie->lasttrans = pos + 1;
1413 Renew( trie->states, laststate + 1, reg_trie_state);
1414 DEBUG_TRIE_COMPILE_MORE_r(
1415 PerlIO_printf( Perl_debug_log,
1416 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1417 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1420 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1423 } /* end table compress */
1425 /* resize the trans array to remove unused space */
1426 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1428 DEBUG_TRIE_COMPILE_r({
1431 Now we print it out again, in a slightly different form as there is additional
1432 info we want to be able to see when its compressed. They are close enough for
1433 visual comparison though.
1435 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1437 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1438 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1440 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1443 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
1445 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1446 PerlIO_printf( Perl_debug_log, "-----");
1447 PerlIO_printf( Perl_debug_log, "\n");
1449 for( state = 1 ; state < trie->laststate ; state++ ) {
1450 U32 base = trie->states[ state ].trans.base;
1452 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
1454 if ( trie->states[ state ].wordnum ) {
1455 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1457 PerlIO_printf( Perl_debug_log, "%6s", "" );
1460 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
1465 while( ( base + ofs < trie->uniquecharcount ) ||
1466 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1467 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1470 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
1472 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1473 if ( ( base + ofs >= trie->uniquecharcount ) &&
1474 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1475 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1477 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1478 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1480 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1484 PerlIO_printf( Perl_debug_log, "]");
1487 PerlIO_printf( Perl_debug_log, "\n" );
1492 /* now finally we "stitch in" the new TRIE node
1493 This means we convert either the first branch or the first Exact,
1494 depending on whether the thing following (in 'last') is a branch
1495 or not and whther first is the startbranch (ie is it a sub part of
1496 the alternation or is it the whole thing.)
1497 Assuming its a sub part we conver the EXACT otherwise we convert
1498 the whole branch sequence, including the first.
1505 if ( first == startbranch && OP( last ) != BRANCH ) {
1508 convert = NEXTOPER( first );
1509 NEXT_OFF( first ) = (U16)(last - first);
1512 OP( convert ) = TRIE + (U8)( flags - EXACT );
1513 NEXT_OFF( convert ) = (U16)(tail - convert);
1514 ARG_SET( convert, data_slot );
1516 /* tells us if we need to handle accept buffers specially */
1517 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1520 /* needed for dumping*/
1522 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1523 /* We now need to mark all of the space originally used by the
1524 branches as optimized away. This keeps the dumpuntil from
1525 throwing a wobbly as it doesnt use regnext() to traverse the
1528 while( optimize < last ) {
1529 OP( optimize ) = OPTIMIZED;
1533 } /* end node insert */
1540 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1541 * These need to be revisited when a newer toolchain becomes available.
1543 #if defined(__sparc64__) && defined(__GNUC__)
1544 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1545 # undef SPARC64_GCC_WORKAROUND
1546 # define SPARC64_GCC_WORKAROUND 1
1550 /* REx optimizer. Converts nodes into quickier variants "in place".
1551 Finds fixed substrings. */
1553 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1554 to the position after last scanned or to NULL. */
1558 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags, U32 depth)
1559 /* scanp: Start here (read-write). */
1560 /* deltap: Write maxlen-minlen here. */
1561 /* last: Stop before this one. */
1563 I32 min = 0, pars = 0, code;
1564 regnode *scan = *scanp, *next;
1566 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1567 int is_inf_internal = 0; /* The studied chunk is infinite */
1568 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1569 scan_data_t data_fake;
1570 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
1571 SV *re_trie_maxbuff = NULL;
1573 GET_RE_DEBUG_FLAGS_DECL;
1575 while (scan && OP(scan) != END && scan < last) {
1576 /* Peephole optimizer: */
1578 SV *mysv=sv_newmortal();
1579 regprop( mysv, scan);
1580 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
1581 (int)depth*2, "", SvPV_nolen(mysv), PTR2UV(scan));
1584 if (PL_regkind[(U8)OP(scan)] == EXACT) {
1585 /* Merge several consecutive EXACTish nodes into one. */
1586 regnode *n = regnext(scan);
1589 regnode *stop = scan;
1592 next = scan + NODE_SZ_STR(scan);
1593 /* Skip NOTHING, merge EXACT*. */
1595 ( PL_regkind[(U8)OP(n)] == NOTHING ||
1596 (stringok && (OP(n) == OP(scan))))
1598 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1599 if (OP(n) == TAIL || n > next)
1601 if (PL_regkind[(U8)OP(n)] == NOTHING) {
1602 NEXT_OFF(scan) += NEXT_OFF(n);
1603 next = n + NODE_STEP_REGNODE;
1610 else if (stringok) {
1611 const int oldl = STR_LEN(scan);
1612 regnode *nnext = regnext(n);
1614 if (oldl + STR_LEN(n) > U8_MAX)
1616 NEXT_OFF(scan) += NEXT_OFF(n);
1617 STR_LEN(scan) += STR_LEN(n);
1618 next = n + NODE_SZ_STR(n);
1619 /* Now we can overwrite *n : */
1620 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1628 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1630 Two problematic code points in Unicode casefolding of EXACT nodes:
1632 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1633 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1639 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1640 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1642 This means that in case-insensitive matching (or "loose matching",
1643 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1644 length of the above casefolded versions) can match a target string
1645 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1646 This would rather mess up the minimum length computation.
1648 What we'll do is to look for the tail four bytes, and then peek
1649 at the preceding two bytes to see whether we need to decrease
1650 the minimum length by four (six minus two).
1652 Thanks to the design of UTF-8, there cannot be false matches:
1653 A sequence of valid UTF-8 bytes cannot be a subsequence of
1654 another valid sequence of UTF-8 bytes.
1657 char *s0 = STRING(scan), *s, *t;
1658 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
1659 const char *t0 = "\xcc\x88\xcc\x81";
1660 const char *t1 = t0 + 3;
1663 s < s2 && (t = ninstr(s, s1, t0, t1));
1665 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1666 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1673 n = scan + NODE_SZ_STR(scan);
1675 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
1686 /* Follow the next-chain of the current node and optimize
1687 away all the NOTHINGs from it. */
1688 if (OP(scan) != CURLYX) {
1689 const int max = (reg_off_by_arg[OP(scan)]
1691 /* I32 may be smaller than U16 on CRAYs! */
1692 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
1693 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1697 /* Skip NOTHING and LONGJMP. */
1698 while ((n = regnext(n))
1699 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
1700 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1701 && off + noff < max)
1703 if (reg_off_by_arg[OP(scan)])
1706 NEXT_OFF(scan) = off;
1709 /* The principal pseudo-switch. Cannot be a switch, since we
1710 look into several different things. */
1711 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
1712 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1713 next = regnext(scan);
1715 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
1717 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
1718 I32 max1 = 0, min1 = I32_MAX, num = 0;
1719 struct regnode_charclass_class accum;
1720 regnode *startbranch=scan;
1722 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1723 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
1724 if (flags & SCF_DO_STCLASS)
1725 cl_init_zero(pRExC_state, &accum);
1727 while (OP(scan) == code) {
1728 I32 deltanext, minnext, f = 0, fake;
1729 struct regnode_charclass_class this_class;
1732 data_fake.flags = 0;
1734 data_fake.whilem_c = data->whilem_c;
1735 data_fake.last_closep = data->last_closep;
1738 data_fake.last_closep = &fake;
1739 next = regnext(scan);
1740 scan = NEXTOPER(scan);
1742 scan = NEXTOPER(scan);
1743 if (flags & SCF_DO_STCLASS) {
1744 cl_init(pRExC_state, &this_class);
1745 data_fake.start_class = &this_class;
1746 f = SCF_DO_STCLASS_AND;
1748 if (flags & SCF_WHILEM_VISITED_POS)
1749 f |= SCF_WHILEM_VISITED_POS;
1751 /* we suppose the run is continuous, last=next...*/
1752 minnext = study_chunk(pRExC_state, &scan, &deltanext,
1753 next, &data_fake, f,depth+1);
1756 if (max1 < minnext + deltanext)
1757 max1 = minnext + deltanext;
1758 if (deltanext == I32_MAX)
1759 is_inf = is_inf_internal = 1;
1761 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1763 if (data && (data_fake.flags & SF_HAS_EVAL))
1764 data->flags |= SF_HAS_EVAL;
1766 data->whilem_c = data_fake.whilem_c;
1767 if (flags & SCF_DO_STCLASS)
1768 cl_or(pRExC_state, &accum, &this_class);
1769 if (code == SUSPEND)
1772 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1774 if (flags & SCF_DO_SUBSTR) {
1775 data->pos_min += min1;
1776 data->pos_delta += max1 - min1;
1777 if (max1 != min1 || is_inf)
1778 data->longest = &(data->longest_float);
1781 delta += max1 - min1;
1782 if (flags & SCF_DO_STCLASS_OR) {
1783 cl_or(pRExC_state, data->start_class, &accum);
1785 cl_and(data->start_class, &and_with);
1786 flags &= ~SCF_DO_STCLASS;
1789 else if (flags & SCF_DO_STCLASS_AND) {
1791 cl_and(data->start_class, &accum);
1792 flags &= ~SCF_DO_STCLASS;
1795 /* Switch to OR mode: cache the old value of
1796 * data->start_class */
1797 StructCopy(data->start_class, &and_with,
1798 struct regnode_charclass_class);
1799 flags &= ~SCF_DO_STCLASS_AND;
1800 StructCopy(&accum, data->start_class,
1801 struct regnode_charclass_class);
1802 flags |= SCF_DO_STCLASS_OR;
1803 data->start_class->flags |= ANYOF_EOS;
1809 Assuming this was/is a branch we are dealing with: 'scan' now
1810 points at the item that follows the branch sequence, whatever
1811 it is. We now start at the beginning of the sequence and look
1817 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1819 If we can find such a subseqence we need to turn the first
1820 element into a trie and then add the subsequent branch exact
1821 strings to the trie.
1825 1. patterns where the whole set of branch can be converted to a trie,
1827 2. patterns where only a subset of the alternations can be
1828 converted to a trie.
1830 In case 1 we can replace the whole set with a single regop
1831 for the trie. In case 2 we need to keep the start and end
1834 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1835 becomes BRANCH TRIE; BRANCH X;
1837 Hypthetically when we know the regex isnt anchored we can
1838 turn a case 1 into a DFA and let it rip... Every time it finds a match
1839 it would just call its tail, no WHILEM/CURLY needed.
1843 if (!re_trie_maxbuff) {
1844 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1845 if (!SvIOK(re_trie_maxbuff))
1846 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1848 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1850 regnode *first = (regnode *)NULL;
1851 regnode *last = (regnode *)NULL;
1852 regnode *tail = scan;
1857 SV *mysv = sv_newmortal(); /* for dumping */
1859 /* var tail is used because there may be a TAIL
1860 regop in the way. Ie, the exacts will point to the
1861 thing following the TAIL, but the last branch will
1862 point at the TAIL. So we advance tail. If we
1863 have nested (?:) we may have to move through several
1867 while ( OP( tail ) == TAIL ) {
1868 /* this is the TAIL generated by (?:) */
1869 tail = regnext( tail );
1873 regprop( mysv, tail );
1874 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
1875 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen( mysv ),
1876 (RExC_seen_evals) ? "[EVAL]" : ""
1881 step through the branches, cur represents each
1882 branch, noper is the first thing to be matched
1883 as part of that branch and noper_next is the
1884 regnext() of that node. if noper is an EXACT
1885 and noper_next is the same as scan (our current
1886 position in the regex) then the EXACT branch is
1887 a possible optimization target. Once we have
1888 two or more consequetive such branches we can
1889 create a trie of the EXACT's contents and stich
1890 it in place. If the sequence represents all of
1891 the branches we eliminate the whole thing and
1892 replace it with a single TRIE. If it is a
1893 subsequence then we need to stitch it in. This
1894 means the first branch has to remain, and needs
1895 to be repointed at the item on the branch chain
1896 following the last branch optimized. This could
1897 be either a BRANCH, in which case the
1898 subsequence is internal, or it could be the
1899 item following the branch sequence in which
1900 case the subsequence is at the end.
1904 /* dont use tail as the end marker for this traverse */
1905 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1906 regnode *noper = NEXTOPER( cur );
1907 regnode *noper_next = regnext( noper );
1910 regprop( mysv, cur);
1911 PerlIO_printf( Perl_debug_log, "%*s%s",
1912 (int)depth * 2 + 2," ", SvPV_nolen( mysv ) );
1914 regprop( mysv, noper);
1915 PerlIO_printf( Perl_debug_log, " -> %s",
1919 regprop( mysv, noper_next );
1920 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1923 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1926 if ( ( first ? OP( noper ) == optype
1927 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1928 && noper_next == tail && count<U16_MAX)
1933 optype = OP( noper );
1937 regprop( mysv, first);
1938 PerlIO_printf( Perl_debug_log, "%*s%s",
1939 (int)depth * 2 + 2, "F:", SvPV_nolen( mysv ) );
1940 regprop( mysv, NEXTOPER(first) );
1941 PerlIO_printf( Perl_debug_log, " -> %s\n",
1942 SvPV_nolen( mysv ) );
1947 regprop( mysv, cur);
1948 PerlIO_printf( Perl_debug_log, "%*s%s",
1949 (int)depth * 2 + 2, "N:", SvPV_nolen( mysv ) );
1950 regprop( mysv, noper );
1951 PerlIO_printf( Perl_debug_log, " -> %s\n",
1952 SvPV_nolen( mysv ) );
1958 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1959 (int)depth * 2 + 2, "E:", "**END**" );
1961 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1963 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1964 && noper_next == tail )
1968 optype = OP( noper );
1978 regprop( mysv, cur);
1979 PerlIO_printf( Perl_debug_log,
1980 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
1981 " ", SvPV_nolen( mysv ), first, last, cur);
1986 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1987 (int)depth * 2 + 2, "E:", "==END==" );
1989 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1994 else if ( code == BRANCHJ ) { /* single branch is optimized. */
1995 scan = NEXTOPER(NEXTOPER(scan));
1996 } else /* single branch is optimized. */
1997 scan = NEXTOPER(scan);
2000 else if (OP(scan) == EXACT) {
2001 I32 l = STR_LEN(scan);
2002 UV uc = *((U8*)STRING(scan));
2004 const U8 * const s = (U8*)STRING(scan);
2005 l = utf8_length(s, s + l);
2006 uc = utf8_to_uvchr(s, NULL);
2009 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2010 /* The code below prefers earlier match for fixed
2011 offset, later match for variable offset. */
2012 if (data->last_end == -1) { /* Update the start info. */
2013 data->last_start_min = data->pos_min;
2014 data->last_start_max = is_inf
2015 ? I32_MAX : data->pos_min + data->pos_delta;
2017 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2019 SV * sv = data->last_found;
2020 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2021 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2022 if (mg && mg->mg_len >= 0)
2023 mg->mg_len += utf8_length((U8*)STRING(scan),
2024 (U8*)STRING(scan)+STR_LEN(scan));
2027 SvUTF8_on(data->last_found);
2028 data->last_end = data->pos_min + l;
2029 data->pos_min += l; /* As in the first entry. */
2030 data->flags &= ~SF_BEFORE_EOL;
2032 if (flags & SCF_DO_STCLASS_AND) {
2033 /* Check whether it is compatible with what we know already! */
2037 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2038 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2039 && (!(data->start_class->flags & ANYOF_FOLD)
2040 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2043 ANYOF_CLASS_ZERO(data->start_class);
2044 ANYOF_BITMAP_ZERO(data->start_class);
2046 ANYOF_BITMAP_SET(data->start_class, uc);
2047 data->start_class->flags &= ~ANYOF_EOS;
2049 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2051 else if (flags & SCF_DO_STCLASS_OR) {
2052 /* false positive possible if the class is case-folded */
2054 ANYOF_BITMAP_SET(data->start_class, uc);
2056 data->start_class->flags |= ANYOF_UNICODE_ALL;
2057 data->start_class->flags &= ~ANYOF_EOS;
2058 cl_and(data->start_class, &and_with);
2060 flags &= ~SCF_DO_STCLASS;
2062 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
2063 I32 l = STR_LEN(scan);
2064 UV uc = *((U8*)STRING(scan));
2066 /* Search for fixed substrings supports EXACT only. */
2067 if (flags & SCF_DO_SUBSTR)
2068 scan_commit(pRExC_state, data);
2070 U8 *s = (U8 *)STRING(scan);
2071 l = utf8_length(s, s + l);
2072 uc = utf8_to_uvchr(s, NULL);
2075 if (data && (flags & SCF_DO_SUBSTR))
2077 if (flags & SCF_DO_STCLASS_AND) {
2078 /* Check whether it is compatible with what we know already! */
2082 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2083 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2084 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2086 ANYOF_CLASS_ZERO(data->start_class);
2087 ANYOF_BITMAP_ZERO(data->start_class);
2089 ANYOF_BITMAP_SET(data->start_class, uc);
2090 data->start_class->flags &= ~ANYOF_EOS;
2091 data->start_class->flags |= ANYOF_FOLD;
2092 if (OP(scan) == EXACTFL)
2093 data->start_class->flags |= ANYOF_LOCALE;
2096 else if (flags & SCF_DO_STCLASS_OR) {
2097 if (data->start_class->flags & ANYOF_FOLD) {
2098 /* false positive possible if the class is case-folded.
2099 Assume that the locale settings are the same... */
2101 ANYOF_BITMAP_SET(data->start_class, uc);
2102 data->start_class->flags &= ~ANYOF_EOS;
2104 cl_and(data->start_class, &and_with);
2106 flags &= ~SCF_DO_STCLASS;
2108 else if (strchr((const char*)PL_varies,OP(scan))) {
2109 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2110 I32 f = flags, pos_before = 0;
2111 regnode *oscan = scan;
2112 struct regnode_charclass_class this_class;
2113 struct regnode_charclass_class *oclass = NULL;
2114 I32 next_is_eval = 0;
2116 switch (PL_regkind[(U8)OP(scan)]) {
2117 case WHILEM: /* End of (?:...)* . */
2118 scan = NEXTOPER(scan);
2121 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2122 next = NEXTOPER(scan);
2123 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2125 maxcount = REG_INFTY;
2126 next = regnext(scan);
2127 scan = NEXTOPER(scan);
2131 if (flags & SCF_DO_SUBSTR)
2136 if (flags & SCF_DO_STCLASS) {
2138 maxcount = REG_INFTY;
2139 next = regnext(scan);
2140 scan = NEXTOPER(scan);
2143 is_inf = is_inf_internal = 1;
2144 scan = regnext(scan);
2145 if (flags & SCF_DO_SUBSTR) {
2146 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2147 data->longest = &(data->longest_float);
2149 goto optimize_curly_tail;
2151 mincount = ARG1(scan);
2152 maxcount = ARG2(scan);
2153 next = regnext(scan);
2154 if (OP(scan) == CURLYX) {
2155 I32 lp = (data ? *(data->last_closep) : 0);
2156 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2158 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2159 next_is_eval = (OP(scan) == EVAL);
2161 if (flags & SCF_DO_SUBSTR) {
2162 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2163 pos_before = data->pos_min;
2167 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2169 data->flags |= SF_IS_INF;
2171 if (flags & SCF_DO_STCLASS) {
2172 cl_init(pRExC_state, &this_class);
2173 oclass = data->start_class;
2174 data->start_class = &this_class;
2175 f |= SCF_DO_STCLASS_AND;
2176 f &= ~SCF_DO_STCLASS_OR;
2178 /* These are the cases when once a subexpression
2179 fails at a particular position, it cannot succeed
2180 even after backtracking at the enclosing scope.
2182 XXXX what if minimal match and we are at the
2183 initial run of {n,m}? */
2184 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2185 f &= ~SCF_WHILEM_VISITED_POS;
2187 /* This will finish on WHILEM, setting scan, or on NULL: */
2188 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2190 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2192 if (flags & SCF_DO_STCLASS)
2193 data->start_class = oclass;
2194 if (mincount == 0 || minnext == 0) {
2195 if (flags & SCF_DO_STCLASS_OR) {
2196 cl_or(pRExC_state, data->start_class, &this_class);
2198 else if (flags & SCF_DO_STCLASS_AND) {
2199 /* Switch to OR mode: cache the old value of
2200 * data->start_class */
2201 StructCopy(data->start_class, &and_with,
2202 struct regnode_charclass_class);
2203 flags &= ~SCF_DO_STCLASS_AND;
2204 StructCopy(&this_class, data->start_class,
2205 struct regnode_charclass_class);
2206 flags |= SCF_DO_STCLASS_OR;
2207 data->start_class->flags |= ANYOF_EOS;
2209 } else { /* Non-zero len */
2210 if (flags & SCF_DO_STCLASS_OR) {
2211 cl_or(pRExC_state, data->start_class, &this_class);
2212 cl_and(data->start_class, &and_with);
2214 else if (flags & SCF_DO_STCLASS_AND)
2215 cl_and(data->start_class, &this_class);
2216 flags &= ~SCF_DO_STCLASS;
2218 if (!scan) /* It was not CURLYX, but CURLY. */
2220 if (ckWARN(WARN_REGEXP)
2221 /* ? quantifier ok, except for (?{ ... }) */
2222 && (next_is_eval || !(mincount == 0 && maxcount == 1))
2223 && (minnext == 0) && (deltanext == 0)
2224 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2225 && maxcount <= REG_INFTY/3) /* Complement check for big count */
2228 "Quantifier unexpected on zero-length expression");
2231 min += minnext * mincount;
2232 is_inf_internal |= ((maxcount == REG_INFTY
2233 && (minnext + deltanext) > 0)
2234 || deltanext == I32_MAX);
2235 is_inf |= is_inf_internal;
2236 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2238 /* Try powerful optimization CURLYX => CURLYN. */
2239 if ( OP(oscan) == CURLYX && data
2240 && data->flags & SF_IN_PAR
2241 && !(data->flags & SF_HAS_EVAL)
2242 && !deltanext && minnext == 1 ) {
2243 /* Try to optimize to CURLYN. */
2244 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2245 regnode *nxt1 = nxt;
2252 if (!strchr((const char*)PL_simple,OP(nxt))
2253 && !(PL_regkind[(U8)OP(nxt)] == EXACT
2254 && STR_LEN(nxt) == 1))
2260 if (OP(nxt) != CLOSE)
2262 /* Now we know that nxt2 is the only contents: */
2263 oscan->flags = (U8)ARG(nxt);
2265 OP(nxt1) = NOTHING; /* was OPEN. */
2267 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2268 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2269 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2270 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2271 OP(nxt + 1) = OPTIMIZED; /* was count. */
2272 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2277 /* Try optimization CURLYX => CURLYM. */
2278 if ( OP(oscan) == CURLYX && data
2279 && !(data->flags & SF_HAS_PAR)
2280 && !(data->flags & SF_HAS_EVAL)
2281 && !deltanext /* atom is fixed width */
2282 && minnext != 0 /* CURLYM can't handle zero width */
2284 /* XXXX How to optimize if data == 0? */
2285 /* Optimize to a simpler form. */
2286 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2290 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2291 && (OP(nxt2) != WHILEM))
2293 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2294 /* Need to optimize away parenths. */
2295 if (data->flags & SF_IN_PAR) {
2296 /* Set the parenth number. */
2297 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2299 if (OP(nxt) != CLOSE)
2300 FAIL("Panic opt close");
2301 oscan->flags = (U8)ARG(nxt);
2302 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2303 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2305 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2306 OP(nxt + 1) = OPTIMIZED; /* was count. */
2307 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2308 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2311 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2312 regnode *nnxt = regnext(nxt1);
2315 if (reg_off_by_arg[OP(nxt1)])
2316 ARG_SET(nxt1, nxt2 - nxt1);
2317 else if (nxt2 - nxt1 < U16_MAX)
2318 NEXT_OFF(nxt1) = nxt2 - nxt1;
2320 OP(nxt) = NOTHING; /* Cannot beautify */
2325 /* Optimize again: */
2326 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2332 else if ((OP(oscan) == CURLYX)
2333 && (flags & SCF_WHILEM_VISITED_POS)
2334 /* See the comment on a similar expression above.
2335 However, this time it not a subexpression
2336 we care about, but the expression itself. */
2337 && (maxcount == REG_INFTY)
2338 && data && ++data->whilem_c < 16) {
2339 /* This stays as CURLYX, we can put the count/of pair. */
2340 /* Find WHILEM (as in regexec.c) */
2341 regnode *nxt = oscan + NEXT_OFF(oscan);
2343 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2345 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2346 | (RExC_whilem_seen << 4)); /* On WHILEM */
2348 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2350 if (flags & SCF_DO_SUBSTR) {
2351 SV *last_str = Nullsv;
2352 int counted = mincount != 0;
2354 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2355 #if defined(SPARC64_GCC_WORKAROUND)
2361 if (pos_before >= data->last_start_min)
2364 b = data->last_start_min;
2367 s = SvPV(data->last_found, l);
2368 old = b - data->last_start_min;
2371 I32 b = pos_before >= data->last_start_min
2372 ? pos_before : data->last_start_min;
2374 char *s = SvPV(data->last_found, l);
2375 I32 old = b - data->last_start_min;
2379 old = utf8_hop((U8*)s, old) - (U8*)s;
2382 /* Get the added string: */
2383 last_str = newSVpvn(s + old, l);
2385 SvUTF8_on(last_str);
2386 if (deltanext == 0 && pos_before == b) {
2387 /* What was added is a constant string */
2389 SvGROW(last_str, (mincount * l) + 1);
2390 repeatcpy(SvPVX(last_str) + l,
2391 SvPVX(last_str), l, mincount - 1);
2392 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2393 /* Add additional parts. */
2394 SvCUR_set(data->last_found,
2395 SvCUR(data->last_found) - l);
2396 sv_catsv(data->last_found, last_str);
2398 SV * sv = data->last_found;
2400 SvUTF8(sv) && SvMAGICAL(sv) ?
2401 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2402 if (mg && mg->mg_len >= 0)
2403 mg->mg_len += CHR_SVLEN(last_str);
2405 data->last_end += l * (mincount - 1);
2408 /* start offset must point into the last copy */
2409 data->last_start_min += minnext * (mincount - 1);
2410 data->last_start_max += is_inf ? I32_MAX
2411 : (maxcount - 1) * (minnext + data->pos_delta);
2414 /* It is counted once already... */
2415 data->pos_min += minnext * (mincount - counted);
2416 data->pos_delta += - counted * deltanext +
2417 (minnext + deltanext) * maxcount - minnext * mincount;
2418 if (mincount != maxcount) {
2419 /* Cannot extend fixed substrings found inside
2421 scan_commit(pRExC_state,data);
2422 if (mincount && last_str) {
2423 sv_setsv(data->last_found, last_str);
2424 data->last_end = data->pos_min;
2425 data->last_start_min =
2426 data->pos_min - CHR_SVLEN(last_str);
2427 data->last_start_max = is_inf
2429 : data->pos_min + data->pos_delta
2430 - CHR_SVLEN(last_str);
2432 data->longest = &(data->longest_float);
2434 SvREFCNT_dec(last_str);
2436 if (data && (fl & SF_HAS_EVAL))
2437 data->flags |= SF_HAS_EVAL;
2438 optimize_curly_tail:
2439 if (OP(oscan) != CURLYX) {
2440 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
2442 NEXT_OFF(oscan) += NEXT_OFF(next);
2445 default: /* REF and CLUMP only? */
2446 if (flags & SCF_DO_SUBSTR) {
2447 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2448 data->longest = &(data->longest_float);
2450 is_inf = is_inf_internal = 1;
2451 if (flags & SCF_DO_STCLASS_OR)
2452 cl_anything(pRExC_state, data->start_class);
2453 flags &= ~SCF_DO_STCLASS;
2457 else if (strchr((const char*)PL_simple,OP(scan))) {
2460 if (flags & SCF_DO_SUBSTR) {
2461 scan_commit(pRExC_state,data);
2465 if (flags & SCF_DO_STCLASS) {
2466 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2468 /* Some of the logic below assumes that switching
2469 locale on will only add false positives. */
2470 switch (PL_regkind[(U8)OP(scan)]) {
2474 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2475 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2476 cl_anything(pRExC_state, data->start_class);
2479 if (OP(scan) == SANY)
2481 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2482 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2483 || (data->start_class->flags & ANYOF_CLASS));
2484 cl_anything(pRExC_state, data->start_class);
2486 if (flags & SCF_DO_STCLASS_AND || !value)
2487 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2490 if (flags & SCF_DO_STCLASS_AND)
2491 cl_and(data->start_class,
2492 (struct regnode_charclass_class*)scan);
2494 cl_or(pRExC_state, data->start_class,
2495 (struct regnode_charclass_class*)scan);
2498 if (flags & SCF_DO_STCLASS_AND) {
2499 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2500 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2501 for (value = 0; value < 256; value++)
2502 if (!isALNUM(value))
2503 ANYOF_BITMAP_CLEAR(data->start_class, value);
2507 if (data->start_class->flags & ANYOF_LOCALE)
2508 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2510 for (value = 0; value < 256; value++)
2512 ANYOF_BITMAP_SET(data->start_class, value);
2517 if (flags & SCF_DO_STCLASS_AND) {
2518 if (data->start_class->flags & ANYOF_LOCALE)
2519 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2522 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2523 data->start_class->flags |= ANYOF_LOCALE;
2527 if (flags & SCF_DO_STCLASS_AND) {
2528 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2529 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2530 for (value = 0; value < 256; value++)
2532 ANYOF_BITMAP_CLEAR(data->start_class, value);
2536 if (data->start_class->flags & ANYOF_LOCALE)
2537 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2539 for (value = 0; value < 256; value++)
2540 if (!isALNUM(value))
2541 ANYOF_BITMAP_SET(data->start_class, value);
2546 if (flags & SCF_DO_STCLASS_AND) {
2547 if (data->start_class->flags & ANYOF_LOCALE)
2548 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2551 data->start_class->flags |= ANYOF_LOCALE;
2552 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2556 if (flags & SCF_DO_STCLASS_AND) {
2557 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2558 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2559 for (value = 0; value < 256; value++)
2560 if (!isSPACE(value))
2561 ANYOF_BITMAP_CLEAR(data->start_class, value);
2565 if (data->start_class->flags & ANYOF_LOCALE)
2566 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2568 for (value = 0; value < 256; value++)
2570 ANYOF_BITMAP_SET(data->start_class, value);
2575 if (flags & SCF_DO_STCLASS_AND) {
2576 if (data->start_class->flags & ANYOF_LOCALE)
2577 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2580 data->start_class->flags |= ANYOF_LOCALE;
2581 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2585 if (flags & SCF_DO_STCLASS_AND) {
2586 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2587 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2588 for (value = 0; value < 256; value++)
2590 ANYOF_BITMAP_CLEAR(data->start_class, value);
2594 if (data->start_class->flags & ANYOF_LOCALE)
2595 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2597 for (value = 0; value < 256; value++)
2598 if (!isSPACE(value))
2599 ANYOF_BITMAP_SET(data->start_class, value);
2604 if (flags & SCF_DO_STCLASS_AND) {
2605 if (data->start_class->flags & ANYOF_LOCALE) {
2606 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2607 for (value = 0; value < 256; value++)
2608 if (!isSPACE(value))
2609 ANYOF_BITMAP_CLEAR(data->start_class, value);
2613 data->start_class->flags |= ANYOF_LOCALE;
2614 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2618 if (flags & SCF_DO_STCLASS_AND) {
2619 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2620 for (value = 0; value < 256; value++)
2621 if (!isDIGIT(value))
2622 ANYOF_BITMAP_CLEAR(data->start_class, value);
2625 if (data->start_class->flags & ANYOF_LOCALE)
2626 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2628 for (value = 0; value < 256; value++)
2630 ANYOF_BITMAP_SET(data->start_class, value);
2635 if (flags & SCF_DO_STCLASS_AND) {
2636 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2637 for (value = 0; value < 256; value++)
2639 ANYOF_BITMAP_CLEAR(data->start_class, value);
2642 if (data->start_class->flags & ANYOF_LOCALE)
2643 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2645 for (value = 0; value < 256; value++)
2646 if (!isDIGIT(value))
2647 ANYOF_BITMAP_SET(data->start_class, value);
2652 if (flags & SCF_DO_STCLASS_OR)
2653 cl_and(data->start_class, &and_with);
2654 flags &= ~SCF_DO_STCLASS;
2657 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
2658 data->flags |= (OP(scan) == MEOL
2662 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2663 /* Lookbehind, or need to calculate parens/evals/stclass: */
2664 && (scan->flags || data || (flags & SCF_DO_STCLASS))
2665 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
2666 /* Lookahead/lookbehind */
2667 I32 deltanext, minnext, fake = 0;
2669 struct regnode_charclass_class intrnl;
2672 data_fake.flags = 0;
2674 data_fake.whilem_c = data->whilem_c;
2675 data_fake.last_closep = data->last_closep;
2678 data_fake.last_closep = &fake;
2679 if ( flags & SCF_DO_STCLASS && !scan->flags
2680 && OP(scan) == IFMATCH ) { /* Lookahead */
2681 cl_init(pRExC_state, &intrnl);
2682 data_fake.start_class = &intrnl;
2683 f |= SCF_DO_STCLASS_AND;
2685 if (flags & SCF_WHILEM_VISITED_POS)
2686 f |= SCF_WHILEM_VISITED_POS;
2687 next = regnext(scan);
2688 nscan = NEXTOPER(NEXTOPER(scan));
2689 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
2692 vFAIL("Variable length lookbehind not implemented");
2694 else if (minnext > U8_MAX) {
2695 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
2697 scan->flags = (U8)minnext;
2699 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2701 if (data && (data_fake.flags & SF_HAS_EVAL))
2702 data->flags |= SF_HAS_EVAL;
2704 data->whilem_c = data_fake.whilem_c;
2705 if (f & SCF_DO_STCLASS_AND) {
2706 int was = (data->start_class->flags & ANYOF_EOS);
2708 cl_and(data->start_class, &intrnl);
2710 data->start_class->flags |= ANYOF_EOS;
2713 else if (OP(scan) == OPEN) {
2716 else if (OP(scan) == CLOSE) {
2717 if ((I32)ARG(scan) == is_par) {
2718 next = regnext(scan);
2720 if ( next && (OP(next) != WHILEM) && next < last)
2721 is_par = 0; /* Disable optimization */
2724 *(data->last_closep) = ARG(scan);
2726 else if (OP(scan) == EVAL) {
2728 data->flags |= SF_HAS_EVAL;
2730 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
2731 if (flags & SCF_DO_SUBSTR) {
2732 scan_commit(pRExC_state,data);
2733 data->longest = &(data->longest_float);
2735 is_inf = is_inf_internal = 1;
2736 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2737 cl_anything(pRExC_state, data->start_class);
2738 flags &= ~SCF_DO_STCLASS;
2740 /* Else: zero-length, ignore. */
2741 scan = regnext(scan);
2746 *deltap = is_inf_internal ? I32_MAX : delta;
2747 if (flags & SCF_DO_SUBSTR && is_inf)
2748 data->pos_delta = I32_MAX - data->pos_min;
2749 if (is_par > U8_MAX)
2751 if (is_par && pars==1 && data) {
2752 data->flags |= SF_IN_PAR;
2753 data->flags &= ~SF_HAS_PAR;
2755 else if (pars && data) {
2756 data->flags |= SF_HAS_PAR;
2757 data->flags &= ~SF_IN_PAR;
2759 if (flags & SCF_DO_STCLASS_OR)
2760 cl_and(data->start_class, &and_with);
2765 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
2767 if (RExC_rx->data) {
2768 Renewc(RExC_rx->data,
2769 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
2770 char, struct reg_data);
2771 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2772 RExC_rx->data->count += n;
2775 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
2776 char, struct reg_data);
2777 New(1208, RExC_rx->data->what, n, U8);
2778 RExC_rx->data->count = n;
2780 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2781 return RExC_rx->data->count - n;
2785 Perl_reginitcolors(pTHX)
2787 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
2789 char *t = savepv(s);
2793 t = strchr(t, '\t');
2799 PL_colors[i] = t = (char *)"";
2804 PL_colors[i++] = (char *)"";
2811 - pregcomp - compile a regular expression into internal code
2813 * We can't allocate space until we know how big the compiled form will be,
2814 * but we can't compile it (and thus know how big it is) until we've got a
2815 * place to put the code. So we cheat: we compile it twice, once with code
2816 * generation turned off and size counting turned on, and once "for real".
2817 * This also means that we don't allocate space until we are sure that the
2818 * thing really will compile successfully, and we never have to move the
2819 * code and thus invalidate pointers into it. (Note that it has to be in
2820 * one piece because free() must be able to free it all.) [NB: not true in perl]
2822 * Beware that the optimization-preparation code in here knows about some
2823 * of the structure of the compiled regexp. [I'll say.]
2826 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
2836 RExC_state_t RExC_state;
2837 RExC_state_t *pRExC_state = &RExC_state;
2839 GET_RE_DEBUG_FLAGS_DECL;
2842 FAIL("NULL regexp argument");
2844 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
2847 DEBUG_r(if (!PL_colorset) reginitcolors());
2849 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
2850 PL_colors[4],PL_colors[5],PL_colors[0],
2851 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2853 RExC_flags = pm->op_pmflags;
2857 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2858 RExC_seen_evals = 0;
2861 /* First pass: determine size, legality. */
2868 RExC_emit = &PL_regdummy;
2869 RExC_whilem_seen = 0;
2870 #if 0 /* REGC() is (currently) a NOP at the first pass.
2871 * Clever compilers notice this and complain. --jhi */
2872 REGC((U8)REG_MAGIC, (char*)RExC_emit);
2874 if (reg(pRExC_state, 0, &flags) == NULL) {
2875 RExC_precomp = Nullch;
2878 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
2880 /* Small enough for pointer-storage convention?
2881 If extralen==0, this means that we will not need long jumps. */
2882 if (RExC_size >= 0x10000L && RExC_extralen)
2883 RExC_size += RExC_extralen;
2886 if (RExC_whilem_seen > 15)
2887 RExC_whilem_seen = 15;
2889 /* Allocate space and initialize. */
2890 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
2893 FAIL("Regexp out of space");
2896 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
2897 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
2900 r->prelen = xend - exp;
2901 r->precomp = savepvn(RExC_precomp, r->prelen);
2903 #ifdef PERL_COPY_ON_WRITE
2904 r->saved_copy = Nullsv;
2906 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
2907 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
2909 r->substrs = 0; /* Useful during FAIL. */
2910 r->startp = 0; /* Useful during FAIL. */
2911 r->endp = 0; /* Useful during FAIL. */
2913 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2915 r->offsets[0] = RExC_size;
2917 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2918 "%s %"UVuf" bytes for offset annotations.\n",
2919 r->offsets ? "Got" : "Couldn't get",
2920 (UV)((2*RExC_size+1) * sizeof(U32))));
2924 /* Second pass: emit code. */
2925 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
2930 RExC_emit_start = r->program;
2931 RExC_emit = r->program;
2932 /* Store the count of eval-groups for security checks: */
2933 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
2934 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
2936 if (reg(pRExC_state, 0, &flags) == NULL)
2940 /* Dig out information for optimizations. */
2941 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
2942 pm->op_pmflags = RExC_flags;
2944 r->reganch |= ROPT_UTF8; /* Unicode in it? */
2945 r->regstclass = NULL;
2946 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
2947 r->reganch |= ROPT_NAUGHTY;
2948 scan = r->program + 1; /* First BRANCH. */
2950 /* XXXX To minimize changes to RE engine we always allocate
2951 3-units-long substrs field. */
2952 Newz(1004, r->substrs, 1, struct reg_substr_data);
2954 StructCopy(&zero_scan_data, &data, scan_data_t);
2955 /* XXXX Should not we check for something else? Usually it is OPEN1... */
2956 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
2958 STRLEN longest_float_length, longest_fixed_length;
2959 struct regnode_charclass_class ch_class;
2964 /* Skip introductions and multiplicators >= 1. */
2965 while ((OP(first) == OPEN && (sawopen = 1)) ||
2966 /* An OR of *one* alternative - should not happen now. */
2967 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2968 (OP(first) == PLUS) ||
2969 (OP(first) == MINMOD) ||
2970 /* An {n,m} with n>0 */
2971 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
2972 if (OP(first) == PLUS)
2975 first += regarglen[(U8)OP(first)];
2976 first = NEXTOPER(first);
2979 /* Starting-point info. */
2981 if (PL_regkind[(U8)OP(first)] == EXACT) {
2982 if (OP(first) == EXACT)
2983 ; /* Empty, get anchored substr later. */
2984 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
2985 r->regstclass = first;
2987 else if (strchr((const char*)PL_simple,OP(first)))
2988 r->regstclass = first;
2989 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2990 PL_regkind[(U8)OP(first)] == NBOUND)
2991 r->regstclass = first;
2992 else if (PL_regkind[(U8)OP(first)] == BOL) {
2993 r->reganch |= (OP(first) == MBOL
2995 : (OP(first) == SBOL
2998 first = NEXTOPER(first);
3001 else if (OP(first) == GPOS) {
3002 r->reganch |= ROPT_ANCH_GPOS;
3003 first = NEXTOPER(first);
3006 else if (!sawopen && (OP(first) == STAR &&
3007 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
3008 !(r->reganch & ROPT_ANCH) )
3010 /* turn .* into ^.* with an implied $*=1 */
3012 (OP(NEXTOPER(first)) == REG_ANY)
3015 r->reganch |= type | ROPT_IMPLICIT;
3016 first = NEXTOPER(first);
3019 if (sawplus && (!sawopen || !RExC_sawback)
3020 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3021 /* x+ must match at the 1st pos of run of x's */
3022 r->reganch |= ROPT_SKIP;
3024 /* Scan is after the zeroth branch, first is atomic matcher. */
3025 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3026 (IV)(first - scan + 1)));
3028 * If there's something expensive in the r.e., find the
3029 * longest literal string that must appear and make it the
3030 * regmust. Resolve ties in favor of later strings, since
3031 * the regstart check works with the beginning of the r.e.
3032 * and avoiding duplication strengthens checking. Not a
3033 * strong reason, but sufficient in the absence of others.
3034 * [Now we resolve ties in favor of the earlier string if
3035 * it happens that c_offset_min has been invalidated, since the
3036 * earlier string may buy us something the later one won't.]
3040 data.longest_fixed = newSVpvn("",0);
3041 data.longest_float = newSVpvn("",0);
3042 data.last_found = newSVpvn("",0);
3043 data.longest = &(data.longest_fixed);
3045 if (!r->regstclass) {
3046 cl_init(pRExC_state, &ch_class);
3047 data.start_class = &ch_class;
3048 stclass_flag = SCF_DO_STCLASS_AND;
3049 } else /* XXXX Check for BOUND? */
3051 data.last_closep = &last_close;
3053 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3054 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3055 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3056 && data.last_start_min == 0 && data.last_end > 0
3057 && !RExC_seen_zerolen
3058 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3059 r->reganch |= ROPT_CHECK_ALL;
3060 scan_commit(pRExC_state, &data);
3061 SvREFCNT_dec(data.last_found);
3063 longest_float_length = CHR_SVLEN(data.longest_float);
3064 if (longest_float_length
3065 || (data.flags & SF_FL_BEFORE_EOL
3066 && (!(data.flags & SF_FL_BEFORE_MEOL)
3067 || (RExC_flags & PMf_MULTILINE)))) {
3070 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3071 && data.offset_fixed == data.offset_float_min
3072 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3073 goto remove_float; /* As in (a)+. */
3075 if (SvUTF8(data.longest_float)) {
3076 r->float_utf8 = data.longest_float;
3077 r->float_substr = Nullsv;
3079 r->float_substr = data.longest_float;
3080 r->float_utf8 = Nullsv;
3082 r->float_min_offset = data.offset_float_min;
3083 r->float_max_offset = data.offset_float_max;
3084 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3085 && (!(data.flags & SF_FL_BEFORE_MEOL)
3086 || (RExC_flags & PMf_MULTILINE)));
3087 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3091 r->float_substr = r->float_utf8 = Nullsv;
3092 SvREFCNT_dec(data.longest_float);
3093 longest_float_length = 0;
3096 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3097 if (longest_fixed_length
3098 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3099 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3100 || (RExC_flags & PMf_MULTILINE)))) {
3103 if (SvUTF8(data.longest_fixed)) {
3104 r->anchored_utf8 = data.longest_fixed;
3105 r->anchored_substr = Nullsv;
3107 r->anchored_substr = data.longest_fixed;
3108 r->anchored_utf8 = Nullsv;
3110 r->anchored_offset = data.offset_fixed;
3111 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3112 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3113 || (RExC_flags & PMf_MULTILINE)));
3114 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3117 r->anchored_substr = r->anchored_utf8 = Nullsv;
3118 SvREFCNT_dec(data.longest_fixed);
3119 longest_fixed_length = 0;
3122 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3123 r->regstclass = NULL;
3124 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3126 && !(data.start_class->flags & ANYOF_EOS)
3127 && !cl_is_anything(data.start_class))
3129 const I32 n = add_data(pRExC_state, 1, "f");
3131 New(1006, RExC_rx->data->data[n], 1,
3132 struct regnode_charclass_class);
3133 StructCopy(data.start_class,
3134 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3135 struct regnode_charclass_class);
3136 r->regstclass = (regnode*)RExC_rx->data->data[n];
3137 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3138 PL_regdata = r->data; /* for regprop() */
3139 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3140 regprop(sv, (regnode*)data.start_class);
3141 PerlIO_printf(Perl_debug_log,
3142 "synthetic stclass \"%s\".\n",
3146 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3147 if (longest_fixed_length > longest_float_length) {
3148 r->check_substr = r->anchored_substr;
3149 r->check_utf8 = r->anchored_utf8;
3150 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3151 if (r->reganch & ROPT_ANCH_SINGLE)
3152 r->reganch |= ROPT_NOSCAN;
3155 r->check_substr = r->float_substr;
3156 r->check_utf8 = r->float_utf8;
3157 r->check_offset_min = data.offset_float_min;
3158 r->check_offset_max = data.offset_float_max;
3160 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3161 This should be changed ASAP! */
3162 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3163 r->reganch |= RE_USE_INTUIT;
3164 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3165 r->reganch |= RE_INTUIT_TAIL;
3169 /* Several toplevels. Best we can is to set minlen. */
3171 struct regnode_charclass_class ch_class;
3174 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3175 scan = r->program + 1;
3176 cl_init(pRExC_state, &ch_class);
3177 data.start_class = &ch_class;
3178 data.last_closep = &last_close;
3179 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3180 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3181 = r->float_substr = r->float_utf8 = Nullsv;
3182 if (!(data.start_class->flags & ANYOF_EOS)
3183 && !cl_is_anything(data.start_class))
3185 const I32 n = add_data(pRExC_state, 1, "f");
3187 New(1006, RExC_rx->data->data[n], 1,
3188 struct regnode_charclass_class);
3189 StructCopy(data.start_class,
3190 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3191 struct regnode_charclass_class);
3192 r->regstclass = (regnode*)RExC_rx->data->data[n];
3193 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3194 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3195 regprop(sv, (regnode*)data.start_class);
3196 PerlIO_printf(Perl_debug_log,
3197 "synthetic stclass \"%s\".\n",
3203 if (RExC_seen & REG_SEEN_GPOS)
3204 r->reganch |= ROPT_GPOS_SEEN;
3205 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3206 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3207 if (RExC_seen & REG_SEEN_EVAL)
3208 r->reganch |= ROPT_EVAL_SEEN;
3209 if (RExC_seen & REG_SEEN_CANY)
3210 r->reganch |= ROPT_CANY_SEEN;
3211 Newz(1002, r->startp, RExC_npar, I32);
3212 Newz(1002, r->endp, RExC_npar, I32);
3213 PL_regdata = r->data; /* for regprop() */
3214 DEBUG_COMPILE_r(regdump(r));
3219 - reg - regular expression, i.e. main body or parenthesized thing
3221 * Caller must absorb opening parenthesis.
3223 * Combining parenthesis handling with the base level of regular expression
3224 * is a trifle forced, but the need to tie the tails of the branches to what
3225 * follows makes it hard to avoid.
3228 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
3229 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3232 register regnode *ret; /* Will be the head of the group. */
3233 register regnode *br;
3234 register regnode *lastbr;
3235 register regnode *ender = 0;
3236 register I32 parno = 0;
3237 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
3239 /* for (?g), (?gc), and (?o) warnings; warning
3240 about (?c) will warn about (?g) -- japhy */
3242 I32 wastedflags = 0x00,
3245 wasted_gc = 0x02 | 0x04,
3248 char * parse_start = RExC_parse; /* MJD */
3249 char *oregcomp_parse = RExC_parse;
3252 *flagp = 0; /* Tentatively. */
3255 /* Make an OPEN node, if parenthesized. */
3257 if (*RExC_parse == '?') { /* (?...) */
3258 U32 posflags = 0, negflags = 0;
3259 U32 *flagsp = &posflags;
3261 char *seqstart = RExC_parse;
3264 paren = *RExC_parse++;
3265 ret = NULL; /* For look-ahead/behind. */
3267 case '<': /* (?<...) */
3268 RExC_seen |= REG_SEEN_LOOKBEHIND;
3269 if (*RExC_parse == '!')
3271 if (*RExC_parse != '=' && *RExC_parse != '!')
3274 case '=': /* (?=...) */
3275 case '!': /* (?!...) */
3276 RExC_seen_zerolen++;
3277 case ':': /* (?:...) */
3278 case '>': /* (?>...) */
3280 case '$': /* (?$...) */
3281 case '@': /* (?@...) */
3282 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3284 case '#': /* (?#...) */
3285 while (*RExC_parse && *RExC_parse != ')')
3287 if (*RExC_parse != ')')
3288 FAIL("Sequence (?#... not terminated");
3289 nextchar(pRExC_state);
3292 case 'p': /* (?p...) */
3293 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3294 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3296 case '?': /* (??...) */
3298 if (*RExC_parse != '{')
3300 paren = *RExC_parse++;
3302 case '{': /* (?{...}) */
3304 I32 count = 1, n = 0;
3306 char *s = RExC_parse;
3308 OP_4tree *sop, *rop;
3310 RExC_seen_zerolen++;
3311 RExC_seen |= REG_SEEN_EVAL;
3312 while (count && (c = *RExC_parse)) {
3313 if (c == '\\' && RExC_parse[1])
3321 if (*RExC_parse != ')')
3324 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3329 if (RExC_parse - 1 - s)
3330 sv = newSVpvn(s, RExC_parse - 1 - s);
3332 sv = newSVpvn("", 0);
3335 Perl_save_re_context(aTHX);
3336 rop = sv_compile_2op(sv, &sop, "re", &pad);
3337 sop->op_private |= OPpREFCOUNTED;
3338 /* re_dup will OpREFCNT_inc */
3339 OpREFCNT_set(sop, 1);
3342 n = add_data(pRExC_state, 3, "nop");
3343 RExC_rx->data->data[n] = (void*)rop;
3344 RExC_rx->data->data[n+1] = (void*)sop;
3345 RExC_rx->data->data[n+2] = (void*)pad;
3348 else { /* First pass */
3349 if (PL_reginterp_cnt < ++RExC_seen_evals
3351 /* No compiled RE interpolated, has runtime
3352 components ===> unsafe. */
3353 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3354 if (PL_tainting && PL_tainted)
3355 FAIL("Eval-group in insecure regular expression");
3356 if (IN_PERL_COMPILETIME)
3360 nextchar(pRExC_state);
3362 ret = reg_node(pRExC_state, LOGICAL);
3365 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3366 /* deal with the length of this later - MJD */
3369 ret = reganode(pRExC_state, EVAL, n);
3370 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3371 Set_Node_Offset(ret, parse_start);
3374 case '(': /* (?(?{...})...) and (?(?=...)...) */
3376 if (RExC_parse[0] == '?') { /* (?(?...)) */
3377 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3378 || RExC_parse[1] == '<'
3379 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3382 ret = reg_node(pRExC_state, LOGICAL);
3385 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
3389 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3391 parno = atoi(RExC_parse++);
3393 while (isDIGIT(*RExC_parse))
3395 ret = reganode(pRExC_state, GROUPP, parno);
3397 if ((c = *nextchar(pRExC_state)) != ')')
3398 vFAIL("Switch condition not recognized");
3400 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3401 br = regbranch(pRExC_state, &flags, 1);
3403 br = reganode(pRExC_state, LONGJMP, 0);
3405 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3406 c = *nextchar(pRExC_state);
3410 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3411 regbranch(pRExC_state, &flags, 1);
3412 regtail(pRExC_state, ret, lastbr);
3415 c = *nextchar(pRExC_state);
3420 vFAIL("Switch (?(condition)... contains too many branches");
3421 ender = reg_node(pRExC_state, TAIL);
3422 regtail(pRExC_state, br, ender);
3424 regtail(pRExC_state, lastbr, ender);
3425 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3428 regtail(pRExC_state, ret, ender);
3432 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3436 RExC_parse--; /* for vFAIL to print correctly */
3437 vFAIL("Sequence (? incomplete");
3441 parse_flags: /* (?i) */
3442 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3443 /* (?g), (?gc) and (?o) are useless here
3444 and must be globally applied -- japhy */
3446 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3447 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3448 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3449 if (! (wastedflags & wflagbit) ) {
3450 wastedflags |= wflagbit;
3453 "Useless (%s%c) - %suse /%c modifier",
3454 flagsp == &negflags ? "?-" : "?",
3456 flagsp == &negflags ? "don't " : "",
3462 else if (*RExC_parse == 'c') {
3463 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3464 if (! (wastedflags & wasted_c) ) {
3465 wastedflags |= wasted_gc;
3468 "Useless (%sc) - %suse /gc modifier",
3469 flagsp == &negflags ? "?-" : "?",
3470 flagsp == &negflags ? "don't " : ""
3475 else { pmflag(flagsp, *RExC_parse); }
3479 if (*RExC_parse == '-') {
3481 wastedflags = 0; /* reset so (?g-c) warns twice */
3485 RExC_flags |= posflags;
3486 RExC_flags &= ~negflags;
3487 if (*RExC_parse == ':') {
3493 if (*RExC_parse != ')') {
3495 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3497 nextchar(pRExC_state);
3505 ret = reganode(pRExC_state, OPEN, parno);
3506 Set_Node_Length(ret, 1); /* MJD */
3507 Set_Node_Offset(ret, RExC_parse); /* MJD */
3514 /* Pick up the branches, linking them together. */
3515 parse_start = RExC_parse; /* MJD */
3516 br = regbranch(pRExC_state, &flags, 1);
3517 /* branch_len = (paren != 0); */
3521 if (*RExC_parse == '|') {
3522 if (!SIZE_ONLY && RExC_extralen) {
3523 reginsert(pRExC_state, BRANCHJ, br);
3526 reginsert(pRExC_state, BRANCH, br);
3527 Set_Node_Length(br, paren != 0);
3528 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3532 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
3534 else if (paren == ':') {
3535 *flagp |= flags&SIMPLE;
3537 if (open) { /* Starts with OPEN. */
3538 regtail(pRExC_state, ret, br); /* OPEN -> first. */
3540 else if (paren != '?') /* Not Conditional */
3542 *flagp |= flags & (SPSTART | HASWIDTH);
3544 while (*RExC_parse == '|') {
3545 if (!SIZE_ONLY && RExC_extralen) {
3546 ender = reganode(pRExC_state, LONGJMP,0);
3547 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
3550 RExC_extralen += 2; /* Account for LONGJMP. */
3551 nextchar(pRExC_state);
3552 br = regbranch(pRExC_state, &flags, 0);
3556 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
3560 *flagp |= flags&SPSTART;
3563 if (have_branch || paren != ':') {
3564 /* Make a closing node, and hook it on the end. */
3567 ender = reg_node(pRExC_state, TAIL);
3570 ender = reganode(pRExC_state, CLOSE, parno);
3571 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3572 Set_Node_Length(ender,1); /* MJD */
3578 *flagp &= ~HASWIDTH;
3581 ender = reg_node(pRExC_state, SUCCEED);
3584 ender = reg_node(pRExC_state, END);
3587 regtail(pRExC_state, lastbr, ender);
3590 /* Hook the tails of the branches to the closing node. */
3591 for (br = ret; br != NULL; br = regnext(br)) {
3592 regoptail(pRExC_state, br, ender);
3599 static const char parens[] = "=!<,>";
3601 if (paren && (p = strchr(parens, paren))) {
3602 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
3603 int flag = (p - parens) > 1;
3606 node = SUSPEND, flag = 0;
3607 reginsert(pRExC_state, node,ret);
3608 Set_Node_Cur_Length(ret);
3609 Set_Node_Offset(ret, parse_start + 1);
3611 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
3615 /* Check for proper termination. */
3617 RExC_flags = oregflags;
3618 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3619 RExC_parse = oregcomp_parse;
3620 vFAIL("Unmatched (");
3623 else if (!paren && RExC_parse < RExC_end) {
3624 if (*RExC_parse == ')') {
3626 vFAIL("Unmatched )");
3629 FAIL("Junk on end of regexp"); /* "Can't happen". */
3637 - regbranch - one alternative of an | operator
3639 * Implements the concatenation operator.
3642 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
3644 register regnode *ret;
3645 register regnode *chain = NULL;
3646 register regnode *latest;
3647 I32 flags = 0, c = 0;
3652 if (!SIZE_ONLY && RExC_extralen)
3653 ret = reganode(pRExC_state, BRANCHJ,0);
3655 ret = reg_node(pRExC_state, BRANCH);
3656 Set_Node_Length(ret, 1);
3660 if (!first && SIZE_ONLY)
3661 RExC_extralen += 1; /* BRANCHJ */
3663 *flagp = WORST; /* Tentatively. */
3666 nextchar(pRExC_state);
3667 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
3669 latest = regpiece(pRExC_state, &flags);
3670 if (latest == NULL) {
3671 if (flags & TRYAGAIN)
3675 else if (ret == NULL)
3677 *flagp |= flags&HASWIDTH;
3678 if (chain == NULL) /* First piece. */
3679 *flagp |= flags&SPSTART;
3682 regtail(pRExC_state, chain, latest);
3687 if (chain == NULL) { /* Loop ran zero times. */
3688 chain = reg_node(pRExC_state, NOTHING);
3693 *flagp |= flags&SIMPLE;
3700 - regpiece - something followed by possible [*+?]
3702 * Note that the branching code sequences used for ? and the general cases
3703 * of * and + are somewhat optimized: they use the same NOTHING node as
3704 * both the endmarker for their branch list and the body of the last branch.
3705 * It might seem that this node could be dispensed with entirely, but the
3706 * endmarker role is not redundant.
3709 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3711 register regnode *ret;
3713 register char *next;
3715 const char * const origparse = RExC_parse;
3718 I32 max = REG_INFTY;
3721 ret = regatom(pRExC_state, &flags);
3723 if (flags & TRYAGAIN)
3730 if (op == '{' && regcurly(RExC_parse)) {
3731 parse_start = RExC_parse; /* MJD */
3732 next = RExC_parse + 1;
3734 while (isDIGIT(*next) || *next == ',') {
3743 if (*next == '}') { /* got one */
3747 min = atoi(RExC_parse);
3751 maxpos = RExC_parse;
3753 if (!max && *maxpos != '0')
3754 max = REG_INFTY; /* meaning "infinity" */
3755 else if (max >= REG_INFTY)
3756 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
3758 nextchar(pRExC_state);
3761 if ((flags&SIMPLE)) {
3762 RExC_naughty += 2 + RExC_naughty / 2;
3763 reginsert(pRExC_state, CURLY, ret);
3764 Set_Node_Offset(ret, parse_start+1); /* MJD */
3765 Set_Node_Cur_Length(ret);
3768 regnode *w = reg_node(pRExC_state, WHILEM);
3771 regtail(pRExC_state, ret, w);
3772 if (!SIZE_ONLY && RExC_extralen) {
3773 reginsert(pRExC_state, LONGJMP,ret);
3774 reginsert(pRExC_state, NOTHING,ret);
3775 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3777 reginsert(pRExC_state, CURLYX,ret);
3779 Set_Node_Offset(ret, parse_start+1);
3780 Set_Node_Length(ret,
3781 op == '{' ? (RExC_parse - parse_start) : 1);
3783 if (!SIZE_ONLY && RExC_extralen)
3784 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3785 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
3787 RExC_whilem_seen++, RExC_extralen += 3;
3788 RExC_naughty += 4 + RExC_naughty; /* compound interest */
3796 if (max && max < min)
3797 vFAIL("Can't do {n,m} with n > m");
3799 ARG1_SET(ret, (U16)min);
3800 ARG2_SET(ret, (U16)max);
3812 #if 0 /* Now runtime fix should be reliable. */
3814 /* if this is reinstated, don't forget to put this back into perldiag:
3816 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3818 (F) The part of the regexp subject to either the * or + quantifier
3819 could match an empty string. The {#} shows in the regular
3820 expression about where the problem was discovered.
3824 if (!(flags&HASWIDTH) && op != '?')
3825 vFAIL("Regexp *+ operand could be empty");
3828 parse_start = RExC_parse;
3829 nextchar(pRExC_state);
3831 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
3833 if (op == '*' && (flags&SIMPLE)) {
3834 reginsert(pRExC_state, STAR, ret);
3838 else if (op == '*') {
3842 else if (op == '+' && (flags&SIMPLE)) {
3843 reginsert(pRExC_state, PLUS, ret);
3847 else if (op == '+') {
3851 else if (op == '?') {
3856 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
3858 "%.*s matches null string many times",
3859 RExC_parse - origparse,
3863 if (*RExC_parse == '?') {
3864 nextchar(pRExC_state);
3865 reginsert(pRExC_state, MINMOD, ret);
3866 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
3868 if (ISMULT2(RExC_parse)) {
3870 vFAIL("Nested quantifiers");
3877 - regatom - the lowest level
3879 * Optimization: gobbles an entire sequence of ordinary characters so that
3880 * it can turn them into a single node, which is smaller to store and
3881 * faster to run. Backslashed characters are exceptions, each becoming a
3882 * separate node; the code is simpler that way and it's not worth fixing.
3884 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
3886 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3888 register regnode *ret = 0;
3890 char *parse_start = RExC_parse;
3892 *flagp = WORST; /* Tentatively. */
3895 switch (*RExC_parse) {
3897 RExC_seen_zerolen++;
3898 nextchar(pRExC_state);
3899 if (RExC_flags & PMf_MULTILINE)
3900 ret = reg_node(pRExC_state, MBOL);
3901 else if (RExC_flags & PMf_SINGLELINE)
3902 ret = reg_node(pRExC_state, SBOL);
3904 ret = reg_node(pRExC_state, BOL);
3905 Set_Node_Length(ret, 1); /* MJD */
3908 nextchar(pRExC_state);
3910 RExC_seen_zerolen++;
3911 if (RExC_flags & PMf_MULTILINE)
3912 ret = reg_node(pRExC_state, MEOL);
3913 else if (RExC_flags & PMf_SINGLELINE)
3914 ret = reg_node(pRExC_state, SEOL);
3916 ret = reg_node(pRExC_state, EOL);
3917 Set_Node_Length(ret, 1); /* MJD */
3920 nextchar(pRExC_state);
3921 if (RExC_flags & PMf_SINGLELINE)
3922 ret = reg_node(pRExC_state, SANY);
3924 ret = reg_node(pRExC_state, REG_ANY);
3925 *flagp |= HASWIDTH|SIMPLE;
3927 Set_Node_Length(ret, 1); /* MJD */
3931 char *oregcomp_parse = ++RExC_parse;
3932 ret = regclass(pRExC_state);
3933 if (*RExC_parse != ']') {
3934 RExC_parse = oregcomp_parse;
3935 vFAIL("Unmatched [");
3937 nextchar(pRExC_state);
3938 *flagp |= HASWIDTH|SIMPLE;
3939 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
3943 nextchar(pRExC_state);
3944 ret = reg(pRExC_state, 1, &flags);
3946 if (flags & TRYAGAIN) {
3947 if (RExC_parse == RExC_end) {
3948 /* Make parent create an empty node if needed. */
3956 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
3960 if (flags & TRYAGAIN) {
3964 vFAIL("Internal urp");
3965 /* Supposed to be caught earlier. */
3968 if (!regcurly(RExC_parse)) {
3977 vFAIL("Quantifier follows nothing");
3980 switch (*++RExC_parse) {
3982 RExC_seen_zerolen++;
3983 ret = reg_node(pRExC_state, SBOL);
3985 nextchar(pRExC_state);
3986 Set_Node_Length(ret, 2); /* MJD */
3989 ret = reg_node(pRExC_state, GPOS);
3990 RExC_seen |= REG_SEEN_GPOS;
3992 nextchar(pRExC_state);
3993 Set_Node_Length(ret, 2); /* MJD */
3996 ret = reg_node(pRExC_state, SEOL);
3998 RExC_seen_zerolen++; /* Do not optimize RE away */
3999 nextchar(pRExC_state);
4002 ret = reg_node(pRExC_state, EOS);
4004 RExC_seen_zerolen++; /* Do not optimize RE away */
4005 nextchar(pRExC_state);
4006 Set_Node_Length(ret, 2); /* MJD */
4009 ret = reg_node(pRExC_state, CANY);
4010 RExC_seen |= REG_SEEN_CANY;
4011 *flagp |= HASWIDTH|SIMPLE;
4012 nextchar(pRExC_state);
4013 Set_Node_Length(ret, 2); /* MJD */
4016 ret = reg_node(pRExC_state, CLUMP);
4018 nextchar(pRExC_state);
4019 Set_Node_Length(ret, 2); /* MJD */
4022 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4023 *flagp |= HASWIDTH|SIMPLE;
4024 nextchar(pRExC_state);
4025 Set_Node_Length(ret, 2); /* MJD */
4028 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4029 *flagp |= HASWIDTH|SIMPLE;
4030 nextchar(pRExC_state);
4031 Set_Node_Length(ret, 2); /* MJD */
4034 RExC_seen_zerolen++;
4035 RExC_seen |= REG_SEEN_LOOKBEHIND;
4036 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4038 nextchar(pRExC_state);
4039 Set_Node_Length(ret, 2); /* MJD */
4042 RExC_seen_zerolen++;
4043 RExC_seen |= REG_SEEN_LOOKBEHIND;
4044 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4046 nextchar(pRExC_state);
4047 Set_Node_Length(ret, 2); /* MJD */
4050 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4051 *flagp |= HASWIDTH|SIMPLE;
4052 nextchar(pRExC_state);
4053 Set_Node_Length(ret, 2); /* MJD */
4056 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4057 *flagp |= HASWIDTH|SIMPLE;
4058 nextchar(pRExC_state);
4059 Set_Node_Length(ret, 2); /* MJD */
4062 ret = reg_node(pRExC_state, DIGIT);
4063 *flagp |= HASWIDTH|SIMPLE;
4064 nextchar(pRExC_state);
4065 Set_Node_Length(ret, 2); /* MJD */
4068 ret = reg_node(pRExC_state, NDIGIT);
4069 *flagp |= HASWIDTH|SIMPLE;
4070 nextchar(pRExC_state);
4071 Set_Node_Length(ret, 2); /* MJD */
4076 char* oldregxend = RExC_end;
4077 char* parse_start = RExC_parse - 2;
4079 if (RExC_parse[1] == '{') {
4080 /* a lovely hack--pretend we saw [\pX] instead */
4081 RExC_end = strchr(RExC_parse, '}');
4083 U8 c = (U8)*RExC_parse;
4085 RExC_end = oldregxend;
4086 vFAIL2("Missing right brace on \\%c{}", c);
4091 RExC_end = RExC_parse + 2;
4092 if (RExC_end > oldregxend)
4093 RExC_end = oldregxend;
4097 ret = regclass(pRExC_state);
4099 RExC_end = oldregxend;
4102 Set_Node_Offset(ret, parse_start + 2);
4103 Set_Node_Cur_Length(ret);
4104 nextchar(pRExC_state);
4105 *flagp |= HASWIDTH|SIMPLE;
4118 case '1': case '2': case '3': case '4':
4119 case '5': case '6': case '7': case '8': case '9':
4121 const I32 num = atoi(RExC_parse);
4123 if (num > 9 && num >= RExC_npar)
4126 char * parse_start = RExC_parse - 1; /* MJD */
4127 while (isDIGIT(*RExC_parse))
4130 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4131 vFAIL("Reference to nonexistent group");
4133 ret = reganode(pRExC_state,
4134 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4138 /* override incorrect value set in reganode MJD */
4139 Set_Node_Offset(ret, parse_start+1);
4140 Set_Node_Cur_Length(ret); /* MJD */
4142 nextchar(pRExC_state);
4147 if (RExC_parse >= RExC_end)
4148 FAIL("Trailing \\");
4151 /* Do not generate "unrecognized" warnings here, we fall
4152 back into the quick-grab loop below */
4159 if (RExC_flags & PMf_EXTENDED) {
4160 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4161 if (RExC_parse < RExC_end)
4167 register STRLEN len;
4172 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4174 parse_start = RExC_parse - 1;
4180 ret = reg_node(pRExC_state,
4181 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4183 for (len = 0, p = RExC_parse - 1;
4184 len < 127 && p < RExC_end;
4189 if (RExC_flags & PMf_EXTENDED)
4190 p = regwhite(p, RExC_end);
4237 ender = ASCII_TO_NATIVE('\033');
4241 ender = ASCII_TO_NATIVE('\007');
4246 char* const e = strchr(p, '}');
4250 vFAIL("Missing right brace on \\x{}");
4253 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4254 | PERL_SCAN_DISALLOW_PREFIX;
4255 STRLEN numlen = e - p - 1;
4256 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4263 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4265 ender = grok_hex(p, &numlen, &flags, NULL);
4271 ender = UCHARAT(p++);
4272 ender = toCTRL(ender);
4274 case '0': case '1': case '2': case '3':case '4':
4275 case '5': case '6': case '7': case '8':case '9':
4277 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4280 ender = grok_oct(p, &numlen, &flags, NULL);
4290 FAIL("Trailing \\");
4293 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4294 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4295 goto normal_default;
4300 if (UTF8_IS_START(*p) && UTF) {
4302 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4310 if (RExC_flags & PMf_EXTENDED)
4311 p = regwhite(p, RExC_end);
4313 /* Prime the casefolded buffer. */
4314 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4316 if (ISMULT2(p)) { /* Back off on ?+*. */
4323 /* Emit all the Unicode characters. */
4325 for (foldbuf = tmpbuf;
4327 foldlen -= numlen) {
4328 ender = utf8_to_uvchr(foldbuf, &numlen);
4330 reguni(pRExC_state, ender, s, &unilen);
4333 /* In EBCDIC the numlen
4334 * and unilen can differ. */
4336 if (numlen >= foldlen)
4340 break; /* "Can't happen." */
4344 reguni(pRExC_state, ender, s, &unilen);
4353 REGC((char)ender, s++);
4361 /* Emit all the Unicode characters. */
4363 for (foldbuf = tmpbuf;
4365 foldlen -= numlen) {
4366 ender = utf8_to_uvchr(foldbuf, &numlen);
4368 reguni(pRExC_state, ender, s, &unilen);
4371 /* In EBCDIC the numlen
4372 * and unilen can differ. */
4374 if (numlen >= foldlen)
4382 reguni(pRExC_state, ender, s, &unilen);
4391 REGC((char)ender, s++);
4395 Set_Node_Cur_Length(ret); /* MJD */
4396 nextchar(pRExC_state);
4398 /* len is STRLEN which is unsigned, need to copy to signed */
4401 vFAIL("Internal disaster");
4405 if (len == 1 && UNI_IS_INVARIANT(ender))
4410 RExC_size += STR_SZ(len);
4412 RExC_emit += STR_SZ(len);
4417 /* If the encoding pragma is in effect recode the text of
4418 * any EXACT-kind nodes. */
4419 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
4420 STRLEN oldlen = STR_LEN(ret);
4421 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4425 if (sv_utf8_downgrade(sv, TRUE)) {
4426 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4427 const STRLEN newlen = SvCUR(sv);
4432 GET_RE_DEBUG_FLAGS_DECL;
4433 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4434 (int)oldlen, STRING(ret),
4436 Copy(s, STRING(ret), newlen, char);
4437 STR_LEN(ret) += newlen - oldlen;
4438 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4440 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4448 S_regwhite(pTHX_ char *p, const char *e)
4453 else if (*p == '#') {
4456 } while (p < e && *p != '\n');
4464 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4465 Character classes ([:foo:]) can also be negated ([:^foo:]).
4466 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4467 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
4468 but trigger failures because they are currently unimplemented. */
4470 #define POSIXCC_DONE(c) ((c) == ':')
4471 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4472 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4475 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
4478 I32 namedclass = OOB_NAMEDCLASS;
4480 if (value == '[' && RExC_parse + 1 < RExC_end &&
4481 /* I smell either [: or [= or [. -- POSIX has been here, right? */
4482 POSIXCC(UCHARAT(RExC_parse))) {
4483 const char c = UCHARAT(RExC_parse);
4484 char* s = RExC_parse++;
4486 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
4488 if (RExC_parse == RExC_end)
4489 /* Grandfather lone [:, [=, [. */
4492 const char* t = RExC_parse++; /* skip over the c */
4496 if (UCHARAT(RExC_parse) == ']') {
4497 RExC_parse++; /* skip over the ending ] */
4500 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4501 const I32 skip = t - posixcc;
4503 /* Initially switch on the length of the name. */
4506 if (memEQ(posixcc, "word", 4)) {
4507 /* this is not POSIX, this is the Perl \w */;
4509 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4513 /* Names all of length 5. */
4514 /* alnum alpha ascii blank cntrl digit graph lower
4515 print punct space upper */
4516 /* Offset 4 gives the best switch position. */
4517 switch (posixcc[4]) {
4519 if (memEQ(posixcc, "alph", 4)) {
4522 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4526 if (memEQ(posixcc, "spac", 4)) {
4529 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4533 if (memEQ(posixcc, "grap", 4)) {
4536 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4540 if (memEQ(posixcc, "asci", 4)) {
4543 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4547 if (memEQ(posixcc, "blan", 4)) {
4550 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4554 if (memEQ(posixcc, "cntr", 4)) {
4557 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4561 if (memEQ(posixcc, "alnu", 4)) {
4564 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4568 if (memEQ(posixcc, "lowe", 4)) {
4571 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4573 if (memEQ(posixcc, "uppe", 4)) {
4576 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4580 if (memEQ(posixcc, "digi", 4)) {
4583 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4585 if (memEQ(posixcc, "prin", 4)) {
4588 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4590 if (memEQ(posixcc, "punc", 4)) {
4593 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4599 if (memEQ(posixcc, "xdigit", 6)) {
4601 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
4606 if (namedclass == OOB_NAMEDCLASS)
4608 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4611 assert (posixcc[skip] == ':');
4612 assert (posixcc[skip+1] == ']');
4613 } else if (!SIZE_ONLY) {
4614 /* [[=foo=]] and [[.foo.]] are still future. */
4616 /* adjust RExC_parse so the warning shows after
4618 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
4620 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4623 /* Maternal grandfather:
4624 * "[:" ending in ":" but not in ":]" */
4634 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
4636 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
4637 const char *s = RExC_parse;
4638 const char c = *s++;
4640 while(*s && isALNUM(*s))
4642 if (*s && c == *s && s[1] == ']') {
4643 if (ckWARN(WARN_REGEXP))
4645 "POSIX syntax [%c %c] belongs inside character classes",
4648 /* [[=foo=]] and [[.foo.]] are still future. */
4649 if (POSIXCC_NOTYET(c)) {
4650 /* adjust RExC_parse so the error shows after
4652 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
4654 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4661 S_regclass(pTHX_ RExC_state_t *pRExC_state)
4664 register UV nextvalue;
4665 register IV prevvalue = OOB_UNICODE;
4666 register IV range = 0;
4667 register regnode *ret;
4670 char *rangebegin = 0;
4671 bool need_class = 0;
4672 SV *listsv = Nullsv;
4675 bool optimize_invert = TRUE;
4676 AV* unicode_alternate = 0;
4678 UV literal_endpoint = 0;
4681 ret = reganode(pRExC_state, ANYOF, 0);
4684 ANYOF_FLAGS(ret) = 0;
4686 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
4690 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4694 RExC_size += ANYOF_SKIP;
4696 RExC_emit += ANYOF_SKIP;
4698 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4700 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
4701 ANYOF_BITMAP_ZERO(ret);
4702 listsv = newSVpvn("# comment\n", 10);
4705 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4707 if (!SIZE_ONLY && POSIXCC(nextvalue))
4708 checkposixcc(pRExC_state);
4710 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4711 if (UCHARAT(RExC_parse) == ']')
4714 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
4718 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4721 rangebegin = RExC_parse;
4723 value = utf8n_to_uvchr((U8*)RExC_parse,
4724 RExC_end - RExC_parse,
4726 RExC_parse += numlen;
4729 value = UCHARAT(RExC_parse++);
4730 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4731 if (value == '[' && POSIXCC(nextvalue))
4732 namedclass = regpposixcc(pRExC_state, value);
4733 else if (value == '\\') {
4735 value = utf8n_to_uvchr((U8*)RExC_parse,
4736 RExC_end - RExC_parse,
4738 RExC_parse += numlen;
4741 value = UCHARAT(RExC_parse++);
4742 /* Some compilers cannot handle switching on 64-bit integer
4743 * values, therefore value cannot be an UV. Yes, this will
4744 * be a problem later if we want switch on Unicode.
4745 * A similar issue a little bit later when switching on
4746 * namedclass. --jhi */
4747 switch ((I32)value) {
4748 case 'w': namedclass = ANYOF_ALNUM; break;
4749 case 'W': namedclass = ANYOF_NALNUM; break;
4750 case 's': namedclass = ANYOF_SPACE; break;
4751 case 'S': namedclass = ANYOF_NSPACE; break;
4752 case 'd': namedclass = ANYOF_DIGIT; break;
4753 case 'D': namedclass = ANYOF_NDIGIT; break;
4756 if (RExC_parse >= RExC_end)
4757 vFAIL2("Empty \\%c{}", (U8)value);
4758 if (*RExC_parse == '{') {
4759 const U8 c = (U8)value;
4760 e = strchr(RExC_parse++, '}');
4762 vFAIL2("Missing right brace on \\%c{}", c);
4763 while (isSPACE(UCHARAT(RExC_parse)))
4765 if (e == RExC_parse)
4766 vFAIL2("Empty \\%c{}", c);
4768 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4776 if (UCHARAT(RExC_parse) == '^') {
4779 value = value == 'p' ? 'P' : 'p'; /* toggle */
4780 while (isSPACE(UCHARAT(RExC_parse))) {
4786 Perl_sv_catpvf(aTHX_ listsv,
4787 "+utf8::%.*s\n", (int)n, RExC_parse);
4789 Perl_sv_catpvf(aTHX_ listsv,
4790 "!utf8::%.*s\n", (int)n, RExC_parse);
4793 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4794 namedclass = ANYOF_MAX; /* no official name, but it's named */
4796 case 'n': value = '\n'; break;
4797 case 'r': value = '\r'; break;
4798 case 't': value = '\t'; break;
4799 case 'f': value = '\f'; break;
4800 case 'b': value = '\b'; break;
4801 case 'e': value = ASCII_TO_NATIVE('\033');break;
4802 case 'a': value = ASCII_TO_NATIVE('\007');break;
4804 if (*RExC_parse == '{') {
4805 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4806 | PERL_SCAN_DISALLOW_PREFIX;
4807 e = strchr(RExC_parse++, '}');
4809 vFAIL("Missing right brace on \\x{}");
4811 numlen = e - RExC_parse;
4812 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4816 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4818 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4819 RExC_parse += numlen;
4823 value = UCHARAT(RExC_parse++);
4824 value = toCTRL(value);
4826 case '0': case '1': case '2': case '3': case '4':
4827 case '5': case '6': case '7': case '8': case '9':
4831 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
4832 RExC_parse += numlen;
4836 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
4838 "Unrecognized escape \\%c in character class passed through",
4842 } /* end of \blah */
4848 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4850 if (!SIZE_ONLY && !need_class)
4851 ANYOF_CLASS_ZERO(ret);
4855 /* a bad range like a-\d, a-[:digit:] ? */
4858 if (ckWARN(WARN_REGEXP))
4860 "False [] range \"%*.*s\"",
4861 RExC_parse - rangebegin,
4862 RExC_parse - rangebegin,
4864 if (prevvalue < 256) {
4865 ANYOF_BITMAP_SET(ret, prevvalue);
4866 ANYOF_BITMAP_SET(ret, '-');
4869 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4870 Perl_sv_catpvf(aTHX_ listsv,
4871 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
4875 range = 0; /* this was not a true range */
4879 const char *what = NULL;
4882 if (namedclass > OOB_NAMEDCLASS)
4883 optimize_invert = FALSE;
4884 /* Possible truncation here but in some 64-bit environments
4885 * the compiler gets heartburn about switch on 64-bit values.
4886 * A similar issue a little earlier when switching on value.
4888 switch ((I32)namedclass) {
4891 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
4893 for (value = 0; value < 256; value++)
4895 ANYOF_BITMAP_SET(ret, value);
4902 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
4904 for (value = 0; value < 256; value++)
4905 if (!isALNUM(value))
4906 ANYOF_BITMAP_SET(ret, value);
4913 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
4915 for (value = 0; value < 256; value++)
4916 if (isALNUMC(value))
4917 ANYOF_BITMAP_SET(ret, value);
4924 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
4926 for (value = 0; value < 256; value++)
4927 if (!isALNUMC(value))
4928 ANYOF_BITMAP_SET(ret, value);
4935 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
4937 for (value = 0; value < 256; value++)
4939 ANYOF_BITMAP_SET(ret, value);
4946 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
4948 for (value = 0; value < 256; value++)
4949 if (!isALPHA(value))
4950 ANYOF_BITMAP_SET(ret, value);
4957 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
4960 for (value = 0; value < 128; value++)
4961 ANYOF_BITMAP_SET(ret, value);
4963 for (value = 0; value < 256; value++) {
4965 ANYOF_BITMAP_SET(ret, value);
4974 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
4977 for (value = 128; value < 256; value++)
4978 ANYOF_BITMAP_SET(ret, value);
4980 for (value = 0; value < 256; value++) {
4981 if (!isASCII(value))
4982 ANYOF_BITMAP_SET(ret, value);
4991 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4993 for (value = 0; value < 256; value++)
4995 ANYOF_BITMAP_SET(ret, value);
5002 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5004 for (value = 0; value < 256; value++)
5005 if (!isBLANK(value))
5006 ANYOF_BITMAP_SET(ret, value);
5013 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
5015 for (value = 0; value < 256; value++)
5017 ANYOF_BITMAP_SET(ret, value);
5024 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5026 for (value = 0; value < 256; value++)
5027 if (!isCNTRL(value))
5028 ANYOF_BITMAP_SET(ret, value);
5035 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5037 /* consecutive digits assumed */
5038 for (value = '0'; value <= '9'; value++)
5039 ANYOF_BITMAP_SET(ret, value);
5046 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5048 /* consecutive digits assumed */
5049 for (value = 0; value < '0'; value++)
5050 ANYOF_BITMAP_SET(ret, value);
5051 for (value = '9' + 1; value < 256; value++)
5052 ANYOF_BITMAP_SET(ret, value);
5059 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5061 for (value = 0; value < 256; value++)
5063 ANYOF_BITMAP_SET(ret, value);
5070 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5072 for (value = 0; value < 256; value++)
5073 if (!isGRAPH(value))
5074 ANYOF_BITMAP_SET(ret, value);
5081 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5083 for (value = 0; value < 256; value++)
5085 ANYOF_BITMAP_SET(ret, value);
5092 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5094 for (value = 0; value < 256; value++)
5095 if (!isLOWER(value))
5096 ANYOF_BITMAP_SET(ret, value);
5103 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5105 for (value = 0; value < 256; value++)
5107 ANYOF_BITMAP_SET(ret, value);
5114 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5116 for (value = 0; value < 256; value++)
5117 if (!isPRINT(value))
5118 ANYOF_BITMAP_SET(ret, value);
5125 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5127 for (value = 0; value < 256; value++)
5128 if (isPSXSPC(value))
5129 ANYOF_BITMAP_SET(ret, value);
5136 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5138 for (value = 0; value < 256; value++)
5139 if (!isPSXSPC(value))
5140 ANYOF_BITMAP_SET(ret, value);
5147 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5149 for (value = 0; value < 256; value++)
5151 ANYOF_BITMAP_SET(ret, value);
5158 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5160 for (value = 0; value < 256; value++)
5161 if (!isPUNCT(value))
5162 ANYOF_BITMAP_SET(ret, value);
5169 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5171 for (value = 0; value < 256; value++)
5173 ANYOF_BITMAP_SET(ret, value);
5180 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5182 for (value = 0; value < 256; value++)
5183 if (!isSPACE(value))
5184 ANYOF_BITMAP_SET(ret, value);
5191 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5193 for (value = 0; value < 256; value++)
5195 ANYOF_BITMAP_SET(ret, value);
5202 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5204 for (value = 0; value < 256; value++)
5205 if (!isUPPER(value))
5206 ANYOF_BITMAP_SET(ret, value);
5213 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5215 for (value = 0; value < 256; value++)
5216 if (isXDIGIT(value))
5217 ANYOF_BITMAP_SET(ret, value);
5224 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5226 for (value = 0; value < 256; value++)
5227 if (!isXDIGIT(value))
5228 ANYOF_BITMAP_SET(ret, value);
5234 /* this is to handle \p and \P */
5237 vFAIL("Invalid [::] class");
5241 /* Strings such as "+utf8::isWord\n" */
5242 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5245 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5248 } /* end of namedclass \blah */
5251 if (prevvalue > (IV)value) /* b-a */ {
5252 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
5253 RExC_parse - rangebegin,
5254 RExC_parse - rangebegin,
5256 range = 0; /* not a valid range */
5260 prevvalue = value; /* save the beginning of the range */
5261 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5262 RExC_parse[1] != ']') {
5265 /* a bad range like \w-, [:word:]- ? */
5266 if (namedclass > OOB_NAMEDCLASS) {
5267 if (ckWARN(WARN_REGEXP))
5269 "False [] range \"%*.*s\"",
5270 RExC_parse - rangebegin,
5271 RExC_parse - rangebegin,
5274 ANYOF_BITMAP_SET(ret, '-');
5276 range = 1; /* yeah, it's a range! */
5277 continue; /* but do it the next time */
5281 /* now is the next time */
5285 if (prevvalue < 256) {
5286 const IV ceilvalue = value < 256 ? value : 255;
5289 /* In EBCDIC [\x89-\x91] should include
5290 * the \x8e but [i-j] should not. */
5291 if (literal_endpoint == 2 &&
5292 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5293 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5295 if (isLOWER(prevvalue)) {
5296 for (i = prevvalue; i <= ceilvalue; i++)
5298 ANYOF_BITMAP_SET(ret, i);
5300 for (i = prevvalue; i <= ceilvalue; i++)
5302 ANYOF_BITMAP_SET(ret, i);
5307 for (i = prevvalue; i <= ceilvalue; i++)
5308 ANYOF_BITMAP_SET(ret, i);
5310 if (value > 255 || UTF) {
5311 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5312 const UV natvalue = NATIVE_TO_UNI(value);
5314 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5315 if (prevnatvalue < natvalue) { /* what about > ? */
5316 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5317 prevnatvalue, natvalue);
5319 else if (prevnatvalue == natvalue) {
5320 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5322 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5324 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5326 /* If folding and foldable and a single
5327 * character, insert also the folded version
5328 * to the charclass. */
5330 if (foldlen == (STRLEN)UNISKIP(f))
5331 Perl_sv_catpvf(aTHX_ listsv,
5334 /* Any multicharacter foldings
5335 * require the following transform:
5336 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5337 * where E folds into "pq" and F folds
5338 * into "rst", all other characters
5339 * fold to single characters. We save
5340 * away these multicharacter foldings,
5341 * to be later saved as part of the
5342 * additional "s" data. */
5345 if (!unicode_alternate)
5346 unicode_alternate = newAV();
5347 sv = newSVpvn((char*)foldbuf, foldlen);
5349 av_push(unicode_alternate, sv);
5353 /* If folding and the value is one of the Greek
5354 * sigmas insert a few more sigmas to make the
5355 * folding rules of the sigmas to work right.
5356 * Note that not all the possible combinations
5357 * are handled here: some of them are handled
5358 * by the standard folding rules, and some of
5359 * them (literal or EXACTF cases) are handled
5360 * during runtime in regexec.c:S_find_byclass(). */
5361 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5362 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5363 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5364 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5365 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5367 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5368 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5369 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5374 literal_endpoint = 0;
5378 range = 0; /* this range (if it was one) is done now */
5382 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5384 RExC_size += ANYOF_CLASS_ADD_SKIP;
5386 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5389 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5391 /* If the only flag is folding (plus possibly inversion). */
5392 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5394 for (value = 0; value < 256; ++value) {
5395 if (ANYOF_BITMAP_TEST(ret, value)) {
5396 UV fold = PL_fold[value];
5399 ANYOF_BITMAP_SET(ret, fold);
5402 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5405 /* optimize inverted simple patterns (e.g. [^a-z]) */
5406 if (!SIZE_ONLY && optimize_invert &&
5407 /* If the only flag is inversion. */
5408 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5409 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5410 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5411 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5418 /* The 0th element stores the character class description
5419 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5420 * to initialize the appropriate swash (which gets stored in
5421 * the 1st element), and also useful for dumping the regnode.
5422 * The 2nd element stores the multicharacter foldings,
5423 * used later (regexec.c:S_reginclass()). */
5424 av_store(av, 0, listsv);
5425 av_store(av, 1, NULL);
5426 av_store(av, 2, (SV*)unicode_alternate);
5427 rv = newRV_noinc((SV*)av);
5428 n = add_data(pRExC_state, 1, "s");
5429 RExC_rx->data->data[n] = (void*)rv;
5437 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5439 char* retval = RExC_parse++;
5442 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5443 RExC_parse[2] == '#') {
5444 while (*RExC_parse != ')') {
5445 if (RExC_parse == RExC_end)
5446 FAIL("Sequence (?#... not terminated");
5452 if (RExC_flags & PMf_EXTENDED) {
5453 if (isSPACE(*RExC_parse)) {
5457 else if (*RExC_parse == '#') {
5458 while (RExC_parse < RExC_end)
5459 if (*RExC_parse++ == '\n') break;
5468 - reg_node - emit a node
5470 STATIC regnode * /* Location. */
5471 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
5473 register regnode *ptr;
5474 regnode * const ret = RExC_emit;
5477 SIZE_ALIGN(RExC_size);
5482 NODE_ALIGN_FILL(ret);
5484 FILL_ADVANCE_NODE(ptr, op);
5485 if (RExC_offsets) { /* MJD */
5486 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
5487 "reg_node", __LINE__,
5489 RExC_emit - RExC_emit_start > RExC_offsets[0]
5490 ? "Overwriting end of array!\n" : "OK",
5491 RExC_emit - RExC_emit_start,
5492 RExC_parse - RExC_start,
5494 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
5503 - reganode - emit a node with an argument
5505 STATIC regnode * /* Location. */
5506 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
5508 register regnode *ptr;
5509 regnode * const ret = RExC_emit;
5512 SIZE_ALIGN(RExC_size);
5517 NODE_ALIGN_FILL(ret);
5519 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
5520 if (RExC_offsets) { /* MJD */
5521 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5525 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5526 "Overwriting end of array!\n" : "OK",
5527 RExC_emit - RExC_emit_start,
5528 RExC_parse - RExC_start,
5530 Set_Cur_Node_Offset;
5539 - reguni - emit (if appropriate) a Unicode character
5542 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
5544 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
5548 - reginsert - insert an operator in front of already-emitted operand
5550 * Means relocating the operand.
5553 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
5555 register regnode *src;
5556 register regnode *dst;
5557 register regnode *place;
5558 const int offset = regarglen[(U8)op];
5560 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
5563 RExC_size += NODE_STEP_REGNODE + offset;
5568 RExC_emit += NODE_STEP_REGNODE + offset;
5570 while (src > opnd) {
5571 StructCopy(--src, --dst, regnode);
5572 if (RExC_offsets) { /* MJD 20010112 */
5573 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
5577 dst - RExC_emit_start > RExC_offsets[0]
5578 ? "Overwriting end of array!\n" : "OK",
5579 src - RExC_emit_start,
5580 dst - RExC_emit_start,
5582 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5583 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
5588 place = opnd; /* Op node, where operand used to be. */
5589 if (RExC_offsets) { /* MJD */
5590 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5594 place - RExC_emit_start > RExC_offsets[0]
5595 ? "Overwriting end of array!\n" : "OK",
5596 place - RExC_emit_start,
5597 RExC_parse - RExC_start,
5599 Set_Node_Offset(place, RExC_parse);
5600 Set_Node_Length(place, 1);
5602 src = NEXTOPER(place);
5603 FILL_ADVANCE_NODE(place, op);
5604 Zero(src, offset, regnode);
5608 - regtail - set the next-pointer at the end of a node chain of p to val.
5611 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5613 register regnode *scan;
5618 /* Find last node. */
5621 regnode * const temp = regnext(scan);
5627 if (reg_off_by_arg[OP(scan)]) {
5628 ARG_SET(scan, val - scan);
5631 NEXT_OFF(scan) = val - scan;
5636 - regoptail - regtail on operand of first argument; nop if operandless
5639 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5641 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
5642 if (p == NULL || SIZE_ONLY)
5644 if (PL_regkind[(U8)OP(p)] == BRANCH) {
5645 regtail(pRExC_state, NEXTOPER(p), val);
5647 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
5648 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
5655 - regcurly - a little FSA that accepts {\d+,?\d*}
5658 S_regcurly(pTHX_ register const char *s)
5679 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
5681 register U8 op = EXACT; /* Arbitrary non-END op. */
5682 register regnode *next;
5684 while (op != END && (!last || node < last)) {
5685 /* While that wasn't END last time... */
5691 next = regnext(node);
5693 if (OP(node) == OPTIMIZED)
5696 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
5697 (int)(2*l + 1), "", SvPVX(sv));
5698 if (next == NULL) /* Next ptr. */
5699 PerlIO_printf(Perl_debug_log, "(0)");
5701 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
5702 (void)PerlIO_putc(Perl_debug_log, '\n');
5704 if (PL_regkind[(U8)op] == BRANCHJ) {
5705 register regnode *nnode = (OP(next) == LONGJMP
5708 if (last && nnode > last)
5710 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
5712 else if (PL_regkind[(U8)op] == BRANCH) {
5713 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
5715 else if ( PL_regkind[(U8)op] == TRIE ) {
5716 const I32 n = ARG(node);
5717 const reg_trie_data *trie = (reg_trie_data*)PL_regdata->data[n];
5718 const I32 arry_len = av_len(trie->words)+1;
5720 PerlIO_printf(Perl_debug_log,
5721 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
5725 (int)trie->charcount,
5726 trie->uniquecharcount,
5727 (IV)trie->laststate-1,
5728 node->flags ? " EVAL mode" : "");
5730 for (word_idx=0; word_idx < arry_len; word_idx++) {
5731 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
5733 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
5736 SvPV_nolen(*elem_ptr),
5741 PerlIO_printf(Perl_debug_log, "(0)\n");
5743 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
5749 node = NEXTOPER(node);
5750 node += regarglen[(U8)op];
5753 else if ( op == CURLY) { /* "next" might be very big: optimizer */
5754 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5755 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
5757 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
5758 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5761 else if ( op == PLUS || op == STAR) {
5762 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
5764 else if (op == ANYOF) {
5765 /* arglen 1 + class block */
5766 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
5767 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
5768 node = NEXTOPER(node);
5770 else if (PL_regkind[(U8)op] == EXACT) {
5771 /* Literal string, where present. */
5772 node += NODE_SZ_STR(node) - 1;
5773 node = NEXTOPER(node);
5776 node = NEXTOPER(node);
5777 node += regarglen[(U8)op];
5779 if (op == CURLYX || op == OPEN)
5781 else if (op == WHILEM)
5787 #endif /* DEBUGGING */
5790 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
5793 Perl_regdump(pTHX_ regexp *r)
5796 SV *sv = sv_newmortal();
5798 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
5800 /* Header fields of interest. */
5801 if (r->anchored_substr)
5802 PerlIO_printf(Perl_debug_log,
5803 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
5805 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
5806 SvPVX(r->anchored_substr),
5808 SvTAIL(r->anchored_substr) ? "$" : "",
5809 (IV)r->anchored_offset);
5810 else if (r->anchored_utf8)
5811 PerlIO_printf(Perl_debug_log,
5812 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
5814 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5815 SvPVX(r->anchored_utf8),
5817 SvTAIL(r->anchored_utf8) ? "$" : "",
5818 (IV)r->anchored_offset);
5819 if (r->float_substr)
5820 PerlIO_printf(Perl_debug_log,
5821 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5823 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
5824 SvPVX(r->float_substr),
5826 SvTAIL(r->float_substr) ? "$" : "",
5827 (IV)r->float_min_offset, (UV)r->float_max_offset);
5828 else if (r->float_utf8)
5829 PerlIO_printf(Perl_debug_log,
5830 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5832 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
5833 SvPVX(r->float_utf8),
5835 SvTAIL(r->float_utf8) ? "$" : "",
5836 (IV)r->float_min_offset, (UV)r->float_max_offset);
5837 if (r->check_substr || r->check_utf8)
5838 PerlIO_printf(Perl_debug_log,
5839 r->check_substr == r->float_substr
5840 && r->check_utf8 == r->float_utf8
5841 ? "(checking floating" : "(checking anchored");
5842 if (r->reganch & ROPT_NOSCAN)
5843 PerlIO_printf(Perl_debug_log, " noscan");
5844 if (r->reganch & ROPT_CHECK_ALL)
5845 PerlIO_printf(Perl_debug_log, " isall");
5846 if (r->check_substr || r->check_utf8)
5847 PerlIO_printf(Perl_debug_log, ") ");
5849 if (r->regstclass) {
5850 regprop(sv, r->regstclass);
5851 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX(sv));
5853 if (r->reganch & ROPT_ANCH) {
5854 PerlIO_printf(Perl_debug_log, "anchored");
5855 if (r->reganch & ROPT_ANCH_BOL)
5856 PerlIO_printf(Perl_debug_log, "(BOL)");
5857 if (r->reganch & ROPT_ANCH_MBOL)
5858 PerlIO_printf(Perl_debug_log, "(MBOL)");
5859 if (r->reganch & ROPT_ANCH_SBOL)
5860 PerlIO_printf(Perl_debug_log, "(SBOL)");
5861 if (r->reganch & ROPT_ANCH_GPOS)
5862 PerlIO_printf(Perl_debug_log, "(GPOS)");
5863 PerlIO_putc(Perl_debug_log, ' ');
5865 if (r->reganch & ROPT_GPOS_SEEN)
5866 PerlIO_printf(Perl_debug_log, "GPOS ");
5867 if (r->reganch & ROPT_SKIP)
5868 PerlIO_printf(Perl_debug_log, "plus ");
5869 if (r->reganch & ROPT_IMPLICIT)
5870 PerlIO_printf(Perl_debug_log, "implicit ");
5871 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
5872 if (r->reganch & ROPT_EVAL_SEEN)
5873 PerlIO_printf(Perl_debug_log, "with eval ");
5874 PerlIO_printf(Perl_debug_log, "\n");
5876 const U32 len = r->offsets[0];
5877 GET_RE_DEBUG_FLAGS_DECL;
5880 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5881 for (i = 1; i <= len; i++)
5882 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5883 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5884 PerlIO_printf(Perl_debug_log, "\n");
5887 #endif /* DEBUGGING */
5893 S_put_byte(pTHX_ SV *sv, int c)
5895 if (isCNTRL(c) || c == 255 || !isPRINT(c))
5896 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5897 else if (c == '-' || c == ']' || c == '\\' || c == '^')
5898 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
5900 Perl_sv_catpvf(aTHX_ sv, "%c", c);
5903 #endif /* DEBUGGING */
5907 - regprop - printable representation of opcode
5910 Perl_regprop(pTHX_ SV *sv, const regnode *o)
5915 sv_setpvn(sv, "", 0);
5916 if (OP(o) >= reg_num) /* regnode.type is unsigned */
5917 /* It would be nice to FAIL() here, but this may be called from
5918 regexec.c, and it would be hard to supply pRExC_state. */
5919 Perl_croak(aTHX_ "Corrupted regexp opcode");
5920 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
5922 k = PL_regkind[(U8)OP(o)];
5925 SV *dsv = sv_2mortal(newSVpvn("", 0));
5926 /* Using is_utf8_string() is a crude hack but it may
5927 * be the best for now since we have no flag "this EXACTish
5928 * node was UTF-8" --jhi */
5929 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
5930 const char *s = do_utf8 ?
5931 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5932 UNI_DISPLAY_REGEX) :
5934 const int len = do_utf8 ?
5937 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5941 } else if (k == TRIE) {/*
5942 this isn't always safe, as Pl_regdata may not be for this regex yet
5943 (depending on where its called from) so its being moved to dumpuntil
5945 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5946 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5949 trie->uniquecharcount,
5952 } else if (k == CURLY) {
5953 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
5954 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5955 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
5957 else if (k == WHILEM && o->flags) /* Ordinal/of */
5958 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
5959 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
5960 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5961 else if (k == LOGICAL)
5962 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
5963 else if (k == ANYOF) {
5964 int i, rangestart = -1;
5965 U8 flags = ANYOF_FLAGS(o);
5966 const char * const anyofs[] = { /* Should be synchronized with
5967 * ANYOF_ #xdefines in regcomp.h */
6000 if (flags & ANYOF_LOCALE)
6001 sv_catpv(sv, "{loc}");
6002 if (flags & ANYOF_FOLD)
6003 sv_catpv(sv, "{i}");
6004 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
6005 if (flags & ANYOF_INVERT)
6007 for (i = 0; i <= 256; i++) {
6008 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6009 if (rangestart == -1)
6011 } else if (rangestart != -1) {
6012 if (i <= rangestart + 3)
6013 for (; rangestart < i; rangestart++)
6014 put_byte(sv, rangestart);
6016 put_byte(sv, rangestart);
6018 put_byte(sv, i - 1);
6024 if (o->flags & ANYOF_CLASS)
6025 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
6026 if (ANYOF_CLASS_TEST(o,i))
6027 sv_catpv(sv, anyofs[i]);
6029 if (flags & ANYOF_UNICODE)
6030 sv_catpv(sv, "{unicode}");
6031 else if (flags & ANYOF_UNICODE_ALL)
6032 sv_catpv(sv, "{unicode_all}");
6036 SV *sw = regclass_swash(o, FALSE, &lv, 0);
6040 U8 s[UTF8_MAXBYTES_CASE+1];
6042 for (i = 0; i <= 256; i++) { /* just the first 256 */
6043 uvchr_to_utf8(s, i);
6045 if (i < 256 && swash_fetch(sw, s, TRUE)) {
6046 if (rangestart == -1)
6048 } else if (rangestart != -1) {
6051 if (i <= rangestart + 3)
6052 for (; rangestart < i; rangestart++) {
6054 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
6059 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
6062 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
6069 sv_catpv(sv, "..."); /* et cetera */
6073 char *s = savesvpv(lv);
6076 while(*s && *s != '\n') s++;
6079 const char *t = ++s;
6097 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6099 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
6100 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
6101 #endif /* DEBUGGING */
6105 Perl_re_intuit_string(pTHX_ regexp *prog)
6106 { /* Assume that RE_INTUIT is set */
6107 GET_RE_DEBUG_FLAGS_DECL;
6110 const char *s = SvPV(prog->check_substr
6111 ? prog->check_substr : prog->check_utf8, n_a);
6113 if (!PL_colorset) reginitcolors();
6114 PerlIO_printf(Perl_debug_log,
6115 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
6117 prog->check_substr ? "" : "utf8 ",
6118 PL_colors[5],PL_colors[0],
6121 (strlen(s) > 60 ? "..." : ""));
6124 return prog->check_substr ? prog->check_substr : prog->check_utf8;
6128 Perl_pregfree(pTHX_ struct regexp *r)
6132 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
6133 SV *re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
6137 if (!r || (--r->refcnt > 0))
6139 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
6140 const char *s = (r->reganch & ROPT_UTF8)
6141 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
6142 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6143 const int len = SvCUR(dsv);
6146 PerlIO_printf(Perl_debug_log,
6147 "%sFreeing REx:%s %s%*.*s%s%s\n",
6148 PL_colors[4],PL_colors[5],PL_colors[0],
6151 len > 60 ? "..." : "");
6155 Safefree(r->precomp);
6156 if (r->offsets) /* 20010421 MJD */
6157 Safefree(r->offsets);
6158 RX_MATCH_COPY_FREE(r);
6159 #ifdef PERL_COPY_ON_WRITE
6161 SvREFCNT_dec(r->saved_copy);
6164 if (r->anchored_substr)
6165 SvREFCNT_dec(r->anchored_substr);
6166 if (r->anchored_utf8)
6167 SvREFCNT_dec(r->anchored_utf8);
6168 if (r->float_substr)
6169 SvREFCNT_dec(r->float_substr);
6171 SvREFCNT_dec(r->float_utf8);
6172 Safefree(r->substrs);
6175 int n = r->data->count;
6176 PAD* new_comppad = NULL;
6181 /* If you add a ->what type here, update the comment in regcomp.h */
6182 switch (r->data->what[n]) {
6184 SvREFCNT_dec((SV*)r->data->data[n]);
6187 Safefree(r->data->data[n]);
6190 new_comppad = (AV*)r->data->data[n];
6193 if (new_comppad == NULL)
6194 Perl_croak(aTHX_ "panic: pregfree comppad");
6195 PAD_SAVE_LOCAL(old_comppad,
6196 /* Watch out for global destruction's random ordering. */
6197 (SvTYPE(new_comppad) == SVt_PVAV) ?
6198 new_comppad : Null(PAD *)
6201 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6204 op_free((OP_4tree*)r->data->data[n]);
6206 PAD_RESTORE_LOCAL(old_comppad);
6207 SvREFCNT_dec((SV*)new_comppad);
6214 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6217 refcount = trie->refcount--;
6221 Safefree(trie->charmap);
6222 if (trie->widecharmap)
6223 SvREFCNT_dec((SV*)trie->widecharmap);
6225 Safefree(trie->states);
6227 Safefree(trie->trans);
6230 SvREFCNT_dec((SV*)trie->words);
6231 if (trie->revcharmap)
6232 SvREFCNT_dec((SV*)trie->revcharmap);
6234 Safefree(r->data->data[n]); /* do this last!!!! */
6239 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6242 Safefree(r->data->what);
6245 Safefree(r->startp);
6251 - regnext - dig the "next" pointer out of a node
6254 Perl_regnext(pTHX_ register regnode *p)
6256 register I32 offset;
6258 if (p == &PL_regdummy)
6261 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6269 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6272 STRLEN l1 = strlen(pat1);
6273 STRLEN l2 = strlen(pat2);
6276 const char *message;
6282 Copy(pat1, buf, l1 , char);
6283 Copy(pat2, buf + l1, l2 , char);
6284 buf[l1 + l2] = '\n';
6285 buf[l1 + l2 + 1] = '\0';
6287 /* ANSI variant takes additional second argument */
6288 va_start(args, pat2);
6292 msv = vmess(buf, &args);
6294 message = SvPV(msv,l1);
6297 Copy(message, buf, l1 , char);
6298 buf[l1-1] = '\0'; /* Overwrite \n */
6299 Perl_croak(aTHX_ "%s", buf);
6302 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6305 Perl_save_re_context(pTHX)
6307 SAVEI32(PL_reg_flags); /* from regexec.c */
6309 SAVEPPTR(PL_reginput); /* String-input pointer. */
6310 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6311 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
6312 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6313 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6314 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
6315 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
6316 SAVEPPTR(PL_regtill); /* How far we are required to go. */
6317 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
6318 PL_reg_start_tmp = 0;
6319 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6320 PL_reg_start_tmpl = 0;
6321 SAVEVPTR(PL_regdata);
6322 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6323 SAVEI32(PL_regnarrate); /* from regexec.c */
6324 SAVEVPTR(PL_regprogram); /* from regexec.c */
6325 SAVEINT(PL_regindent); /* from regexec.c */
6326 SAVEVPTR(PL_regcc); /* from regexec.c */
6327 SAVEVPTR(PL_curcop);
6328 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6329 SAVEVPTR(PL_reg_re); /* from regexec.c */
6330 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6331 SAVESPTR(PL_reg_sv); /* from regexec.c */
6332 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
6333 SAVEVPTR(PL_reg_magic); /* from regexec.c */
6334 SAVEI32(PL_reg_oldpos); /* from regexec.c */
6335 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6336 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
6337 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
6338 PL_reg_oldsaved = Nullch;
6339 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6340 PL_reg_oldsavedlen = 0;
6341 #ifdef PERL_COPY_ON_WRITE
6345 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6347 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6348 PL_reg_leftiter = 0;
6349 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
6350 PL_reg_poscache = Nullch;
6351 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6352 PL_reg_poscache_size = 0;
6353 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
6354 SAVEI32(PL_regnpar); /* () count. */
6355 SAVEI32(PL_regsize); /* from regexec.c */
6358 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6361 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
6363 for (i = 1; i <= rx->nparens; i++) {
6365 char digits[TYPE_CHARS(long)];
6366 sprintf(digits, "%lu", (long)i);
6367 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
6374 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
6379 clear_re(pTHX_ void *r)
6381 ReREFCNT_dec((regexp *)r);
6386 * c-indentation-style: bsd
6388 * indent-tabs-mode: t
6391 * ex: set ts=8 sts=4 sw=4 noet: