typedef struct regexp {
/* what engine created this regexp? */
- const struct regexp_engine* engine;
+ const struct regexp_engine* engine;
/* Information about the match that the perl core uses to manage things */
U32 extflags; /* Flags used both externally and internally */
is the compiling routine's responsibility to populate this field before
returning the regexp object.
-=item C<precomp> C<prelen>
+=item C<precomp> C<prelen>
Used for debugging purposes. C<precomp> holds a copy of the pattern
-that was compiled.
+that was compiled.
=item C<extflags>
=item C<minlen> C<minlenret>
-C<minlen> is the minimum string length required for the pattern to match.
-This is used to prune the search space by not bothering to match any
-closer to the end of a string than would allow a match. For instance
-there is no point in even starting the regex engine if the minlen is
-10 but the string is only 5 characters long. There is no way that the
+C<minlen> is the minimum string length required for the pattern to match.
+This is used to prune the search space by not bothering to match any
+closer to the end of a string than would allow a match. For instance
+there is no point in even starting the regex engine if the minlen is
+10 but the string is only 5 characters long. There is no way that the
pattern can match.
C<minlenret> is the minimum length of the string that would be found
-in $& after a match.
+in $& after a match.
The difference between C<minlen> and C<minlenret> can be seen in the
following pattern:
/ns(?=\d)/
-where the C<minlen> would be 3 but the minlen ret would only be 2 as
+where the C<minlen> would be 3 but the C<minlenret> would only be 2 as
the \d is required to match but is not actually included in the matched
content. This distinction is particularly important as the substitution
logic uses the C<minlenret> to tell whether it can do in-place substitution
Fast-Boyer-Moore searches on the string to find out if it's worth using
the regex engine at all, and if so where in the string to search.
-=item C<startp>, C<endp>,
+=item C<startp>, C<endp>
These fields store arrays that are used to hold the offsets of the beginning
and end of each capture group that has matched. -1 is used to indicate no match.
=item C<seen_evals>
-This stores the number of eval groups in the pattern. This is used
-for security purposes when embedding compiled regexes into larger
+This stores the number of eval groups in the pattern. This is used
+for security purposes when embedding compiled regexes into larger
patterns.
=back
pm and return a pointer to a prepared regexp structure that can perform
the match.
+The utf8'ness of the string can be found by testing
+
+ pm->op_pmdynflags & PMdf_CMP_UTF8
+
+Additional various flags reflecting the modifiers used are contained in
+
+ pm->op_pmflags
+
+Some of these have exact equivalents in re->extflags. See regcomp.h and op.h
+for details of these values.
+
=item exec
I32 exec(regexp* prog,
Find the start position where a regex match should be attempted,
or possibly whether the regex engine should not be run because the
pattern can't match. This is called as appropriate by the core
-depending on the values of the extflags member of the regexp
+depending on the values of the extflags member of the regexp
structure.
=item checkstr
regnode **open_parens; /* pointers to open parens */
regnode **close_parens; /* pointers to close parens */
regnode *opend; /* END node in program */
- I32 utf8;
+ I32 utf8; /* whether the pattern is utf8 or not */
+ I32 orig_utf8; /* whether the pattern was originally in utf8 */
+ /* XXX use this for future optimisation of case
+ * where pattern must be upgraded to utf8. */
HV *charnames; /* cache of named sequences */
HV *paren_names; /* Paren names */
#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
#define RExC_seen_evals (pRExC_state->seen_evals)
#define RExC_utf8 (pRExC_state->utf8)
+#define RExC_orig_utf8 (pRExC_state->orig_utf8)
#define RExC_charnames (pRExC_state->charnames)
#define RExC_open_parens (pRExC_state->open_parens)
#define RExC_close_parens (pRExC_state->close_parens)
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
const U8 *scan = (U8*)NULL;
U32 wordlen = 0; /* required init */
- STRLEN chars=0;
+ STRLEN chars = 0;
+ bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
if (OP(noper) == NOTHING) {
trie->minlen= 0;
continue;
}
- if (trie->bitmap) {
- TRIE_BITMAP_SET(trie,*uc);
- if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
- }
+ if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
+ TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
+ regardless of encoding */
+
for ( ; uc < e ; uc += len ) {
TRIE_CHARCOUNT(trie)++;
TRIE_READ_CHAR;
trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
TRIE_STORE_REVCHAR;
}
+ if ( set_bit ) {
+ /* store the codepoint in the bitmap, and if it's ascii
+ also store its folded equivalent. */
+ TRIE_BITMAP_SET(trie,uvc);
+ if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
+ set_bit = 0; /* We've done our bit :-) */
+ }
} else {
SV** svpp;
if ( !widecharmap )
if (exp == NULL)
FAIL("NULL regexp argument");
- RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
+ RExC_utf8 = RExC_orig_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
- RExC_precomp = exp;
DEBUG_COMPILE_r({
SV *dsv= sv_newmortal();
RE_PV_QUOTED_DECL(s, RExC_utf8,
- dsv, RExC_precomp, (xend - exp), 60);
+ dsv, exp, (xend - exp), 60);
PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
PL_colors[4],PL_colors[5],s);
});
+
+redo_first_pass:
+ RExC_precomp = exp;
RExC_flags = pm->op_pmflags;
RExC_sawback = 0;
RExC_precomp = NULL;
return(NULL);
}
+ if (RExC_utf8 && !RExC_orig_utf8) {
+ /* It's possible to write a regexp in ascii that represents unicode
+ codepoints outside of the byte range, such as via \x{100}. If we
+ detect such a sequence we have to convert the entire pattern to utf8
+ and then recompile, as our sizing calculation will have been based
+ on 1 byte == 1 character, but we will need to use utf8 to encode
+ at least some part of the pattern, and therefore must convert the whole
+ thing.
+ XXX: somehow figure out how to make this less expensive...
+ -- dmq */
+ STRLEN len = xend-exp;
+ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+ "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
+ exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
+ xend = exp + len;
+ RExC_orig_utf8 = RExC_utf8;
+ SAVEFREEPV(exp);
+ goto redo_first_pass;
+ }
DEBUG_PARSE_r({
PerlIO_printf(Perl_debug_log,
"Required size %"IVdf" nodes\n"
GET_RE_DEBUG_FLAGS_DECL;
DEBUG_PARSE("reg ");
-
*flagp = 0; /* Tentatively. */
I32 flags = 0, c = 0;
GET_RE_DEBUG_FLAGS_DECL;
DEBUG_PARSE("brnc");
+
if (first)
ret = NULL;
else {
SV * const prop = sv_newmortal();
regprop(prog, prop, c);
{
- RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
+ RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
s,strend-s,60);
PerlIO_printf(Perl_debug_log,
"Matching stclass %.*s against %s (%d chars)\n",
"Check that (?&..) to a buffer inside a (?|...) goes to the leftmost");
}
+{
+ use warnings;
+ local $Message = "ASCII pattern that really is utf8";
+ my @w;
+ local $SIG{__WARN__}=sub{push @w,"@_"};
+ my $c=qq(\x{DF});
+ ok($c=~/${c}|\x{100}/);
+ ok(@w==0);
+}
+
# Test counter is at bottom of file. Put new tests above here.
#-------------------------------------------------------------------
# Keep the following tests last -- they may crash perl
iseq(0+$::test,$::TestCount,"Got the right number of tests!");
# Don't forget to update this!
BEGIN {
- $::TestCount = 1650;
+ $::TestCount = 1652;
print "1..$::TestCount\n";
}