From: Yves Orton Date: Tue, 20 Mar 2007 01:40:34 +0000 (+0100) Subject: feel the the baß (encoding problems in the regex engine) X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=02daf0ab3f11bf85e3739683ba94241d1c4cf8b2;p=p5sagit%2Fp5-mst-13.2.git feel the the baß (encoding problems in the regex engine) Message-ID: <9b18b3110703191740m6bf21942p6521f3016ed8092f@mail.gmail.com> p4raw-id: //depot/perl@30647 --- diff --git a/pod/perlreguts.pod b/pod/perlreguts.pod index 3ba0da0..d119dfe 100644 --- a/pod/perlreguts.pod +++ b/pod/perlreguts.pod @@ -775,7 +775,7 @@ must be able to correctly build a regexp structure. typedef struct regexp { /* what engine created this regexp? */ - const struct regexp_engine* engine; + const struct regexp_engine* engine; /* Information about the match that the perl core uses to manage things */ U32 extflags; /* Flags used both externally and internally */ @@ -829,10 +829,10 @@ to the subroutines that are to be used for performing a match. It is the compiling routine's responsibility to populate this field before returning the regexp object. -=item C C +=item C C Used for debugging purposes. C holds a copy of the pattern -that was compiled. +that was compiled. =item C @@ -841,22 +841,22 @@ contains a \G or a ^ or $ symbol. =item C C -C is the minimum string length required for the pattern to match. -This is used to prune the search space by not bothering to match any -closer to the end of a string than would allow a match. For instance -there is no point in even starting the regex engine if the minlen is -10 but the string is only 5 characters long. There is no way that the +C is the minimum string length required for the pattern to match. +This is used to prune the search space by not bothering to match any +closer to the end of a string than would allow a match. For instance +there is no point in even starting the regex engine if the minlen is +10 but the string is only 5 characters long. There is no way that the pattern can match. C is the minimum length of the string that would be found -in $& after a match. +in $& after a match. The difference between C and C can be seen in the following pattern: /ns(?=\d)/ -where the C would be 3 but the minlen ret would only be 2 as +where the C would be 3 but the minlen ret would only be 2 as the \d is required to match but is not actually included in the matched content. This distinction is particularly important as the substitution logic uses the C to tell whether it can do in-place substition @@ -889,7 +889,7 @@ occur at a floating offset from the start of the pattern. Used to do Fast-Boyer-Moore searches on the string to find out if its worth using the regex engine at all, and if so where in the string to search. -=item C, C, +=item C, C These fields store arrays that are used to hold the offsets of the begining and end of each capture group that has matched. -1 is used to indicate no match. @@ -903,8 +903,8 @@ patterns. =item C -This stores the number of eval groups in the pattern. This is used -for security purposes when embedding compiled regexes into larger +This stores the number of eval groups in the pattern. This is used +for security purposes when embedding compiled regexes into larger patterns. =back @@ -1028,6 +1028,17 @@ Compile the pattern between exp and xend using the flags contained in pm and return a pointer to a prepared regexp structure that can perform the match. +The utf8'ness of the string can be found by testing + + pm->op_pmdynflags & PMdf_CMP_UTF8 + +Additional various flags reflecting the modifiers used are contained in + + pm->op_pmflags + +some of these have exact equivelents in re->extflags. See regcomp.h and op.h +for details of these values. + =item exec I32 exec(regexp* prog, @@ -1046,7 +1057,7 @@ Execute a regexp. Find the start position where a regex match should be attempted, or possibly whether the regex engine should not be run because the pattern can't match. This is called as appropriate by the core -depending on the values of the extflags member of the regexp +depending on the values of the extflags member of the regexp structure. =item checkstr diff --git a/regcomp.c b/regcomp.c index 429b493..7c08840 100644 --- a/regcomp.c +++ b/regcomp.c @@ -124,7 +124,10 @@ typedef struct RExC_state_t { regnode **open_parens; /* pointers to open parens */ regnode **close_parens; /* pointers to close parens */ regnode *opend; /* END node in program */ - I32 utf8; + I32 utf8; /* whether the pattern is utf8 or not */ + I32 orig_utf8; /* whether the pattern was originally in utf8 */ + /* XXX use this for future optimisation of case + * where pattern must be upgraded to utf8. */ HV *charnames; /* cache of named sequences */ HV *paren_names; /* Paren names */ @@ -168,6 +171,7 @@ typedef struct RExC_state_t { #define RExC_seen_zerolen (pRExC_state->seen_zerolen) #define RExC_seen_evals (pRExC_state->seen_evals) #define RExC_utf8 (pRExC_state->utf8) +#define RExC_orig_utf8 (pRExC_state->orig_utf8) #define RExC_charnames (pRExC_state->charnames) #define RExC_open_parens (pRExC_state->open_parens) #define RExC_close_parens (pRExC_state->close_parens) @@ -1375,16 +1379,17 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; const U8 *scan = (U8*)NULL; U32 wordlen = 0; /* required init */ - STRLEN chars=0; + STRLEN chars = 0; + bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/ if (OP(noper) == NOTHING) { trie->minlen= 0; continue; } - if (trie->bitmap) { - TRIE_BITMAP_SET(trie,*uc); - if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]); - } + if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */ + TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte + regardless of encoding */ + for ( ; uc < e ; uc += len ) { TRIE_CHARCOUNT(trie)++; TRIE_READ_CHAR; @@ -1396,6 +1401,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ]; TRIE_STORE_REVCHAR; } + if ( set_bit ) { + /* store the codepoint in the bitmap, and if its ascii + also store its folded equivelent. */ + TRIE_BITMAP_SET(trie,uvc); + if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]); + set_bit = 0; /* We've done our bit :-) */ + } } else { SV** svpp; if ( !widecharmap ) @@ -4052,16 +4064,18 @@ Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm) if (exp == NULL) FAIL("NULL regexp argument"); - RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8; + RExC_utf8 = RExC_orig_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8; - RExC_precomp = exp; DEBUG_COMPILE_r({ SV *dsv= sv_newmortal(); RE_PV_QUOTED_DECL(s, RExC_utf8, - dsv, RExC_precomp, (xend - exp), 60); + dsv, exp, (xend - exp), 60); PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n", PL_colors[4],PL_colors[5],s); }); + +redo_first_pass: + RExC_precomp = exp; RExC_flags = pm->op_pmflags; RExC_sawback = 0; @@ -4100,6 +4114,25 @@ Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm) RExC_precomp = NULL; return(NULL); } + if (RExC_utf8 && !RExC_orig_utf8) { + /* It's possible to write a regexp in ascii that represents unicode + codepoints outside of the byte range, such as via \x{100}. If we + detect such a sequence we have to convert the entire pattern to utf8 + and then recompile, as our sizing calculation will have been based + on 1 byte == 1 character, but we will need to use utf8 to encode + at least some part of the pattern, and therefore must convert the whole + thing. + XXX: somehow figure out how to make this less expensive... + -- dmq */ + STRLEN len = xend-exp; + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); + exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len); + xend = exp + len; + RExC_orig_utf8 = RExC_utf8; + SAVEFREEPV(exp); + goto redo_first_pass; + } DEBUG_PARSE_r({ PerlIO_printf(Perl_debug_log, "Required size %"IVdf" nodes\n" @@ -4956,7 +4989,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) GET_RE_DEBUG_FLAGS_DECL; DEBUG_PARSE("reg "); - *flagp = 0; /* Tentatively. */ @@ -5796,6 +5828,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) I32 flags = 0, c = 0; GET_RE_DEBUG_FLAGS_DECL; DEBUG_PARSE("brnc"); + if (first) ret = NULL; else { diff --git a/regexec.c b/regexec.c index dc0cd9b..c9efaae 100644 --- a/regexec.c +++ b/regexec.c @@ -1997,7 +1997,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * SV * const prop = sv_newmortal(); regprop(prog, prop, c); { - RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1), + RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1), s,strend-s,60); PerlIO_printf(Perl_debug_log, "Matching stclass %.*s against %s (%d chars)\n", diff --git a/t/op/pat.t b/t/op/pat.t index 5bc68d7..423822a 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -4316,6 +4316,16 @@ sub kt "Check that (?&..) to a buffer inside a (?|...) goes to the leftmost"); } +{ + use warnings; + local $Message = "ASCII pattern that really is utf8"; + my @w; + local $SIG{__WARN__}=sub{push @w,"@_"}; + my $c=qq(\x{DF}); + ok($c=~/${c}|\x{100}/); + ok(@w==0); +} + # Test counter is at bottom of file. Put new tests above here. #------------------------------------------------------------------- # Keep the following tests last -- they may crash perl @@ -4385,7 +4395,7 @@ ok($@=~/\QSequence \k... not terminated in regex;\E/); iseq(0+$::test,$::TestCount,"Got the right number of tests!"); # Don't forget to update this! BEGIN { - $::TestCount = 1650; + $::TestCount = 1652; print "1..$::TestCount\n"; }