From: Yves Orton Date: Tue, 24 Apr 2007 16:46:05 +0000 (+0200) Subject: Re: Analysis of problems with mixed encoding case insensitive matches in regex engine. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=32e6a07c84b153f78f946de50870bc0ee030624f;p=p5sagit%2Fp5-mst-13.2.git Re: Analysis of problems with mixed encoding case insensitive matches in regex engine. Message-ID: <9b18b3110704240746u461e4bdcl208ef7d7f9c5ef64@mail.gmail.com> p4raw-id: //depot/perl@31081 --- diff --git a/Porting/regcharclass.pl b/Porting/regcharclass.pl index c895440..8f5b3f1 100644 --- a/Porting/regcharclass.pl +++ b/Porting/regcharclass.pl @@ -247,7 +247,8 @@ sub combine { ? sprintf("$alu == $hex_fmt",$_->[0]) : sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_); return $txt unless @_; - return "( $txt || ( $alu > $_->[1] && \n".combine($alu,@_)." ) )"; + return sprintf "( %s ||( %s > 0x%02X &&\n%s ) )", + $txt,$alu,$_->[1],combine($alu,@_); } # recursively convert a trie to an optree represented by @@ -302,11 +303,15 @@ sub make_optree { $size=1 if $type eq 'c'; if ( !$type ) { my ( $u, $l ); - for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) { - $u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt ); + if ($self->{trie}{u}) { + for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) { + $u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt ); + } } - for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) { - $l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt ); + if ($self->{trie}{l}) { + for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) { + $l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt ); + } } if ( $u ) { $else= [ '(is_utf8)', $u, $l || 0 ]; @@ -314,9 +319,13 @@ sub make_optree { $else= [ '(!is_utf8)', $l, 0 ]; } $type= 'n'; - $size-- while !$self->{trie}{n}{$size}; } - return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt ); + if (!$self->{trie}{$type}) { + return $else; + } else { + $size-- while $size>0 && !$self->{trie}{$type}{$size}; + return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt ); + } } # construct the optree for a type with length checks to prevent buffer @@ -427,18 +436,23 @@ sub ternary { return "/*** GENERATED CODE ***/\n" . _macro "#define is_$self->{op}$ext($args)\n$code"; } - +$|++; my $path=shift @ARGV; + if (!$path) { $path= "regcharclass.h"; if (!-e $path) { $path="../$path" } if (!-e $path) { die "Can't find regcharclass.h to update!\n" }; } - -rename $path,"$path.bak"; -open my $out_fh,">",$path - or die "Can't write to '$path':$!"; -binmode $out_fh; # want unix line endings even when run on win32. +my $out_fh; +if ($path eq '-') { + $out_fh= \*STDOUT; +} else { + rename $path,"$path.bak"; + open $out_fh,">",$path + or die "Can't write to '$path':$!"; + binmode $out_fh; # want unix line endings even when run on win32. +} my ($zero) = $0=~/([^\\\/]+)$/; print $out_fh <<"HEADER"; /* -*- buffer-read-only: t -*- @@ -458,17 +472,22 @@ print $out_fh <<"HEADER"; HEADER -my ($op,$title,@strs,@txt); +my ($op,$title,@strs,@txt,$type); my $doit= sub { return unless $op; my $o= __PACKAGE__->new($title,$op,@strs); print $out_fh "/*\n\t$o->{op}: $o->{title}\n\n"; print $out_fh join "\n",@txt,"*/",""; - for ('', 'U', 'L') { - print $out_fh $o->ternary( $_ ); - print $out_fh $o->ternary( $_,'_safe' ); + $type||="U L c _safe"; + my @ext=(""); + my @types=("",map{ if (length $_>1) { push @ext,$_; () } else { $_ } } + split /\s+/,$type); + for my $type (@types) { + for my $ext (@ext) { + next if $type eq 'c' and $ext eq '_safe'; + print $out_fh $o->ternary( $type,$ext ); + } } - print $out_fh $o->ternary( 'c' ); }; while () { next unless /\S/; @@ -477,6 +496,9 @@ while () { $doit->(); ($op,$title)=split /\s*:\s*/,$_,2; @txt=@strs=(); + $type=""; + } elsif (/^=(.*)/) { + $type.=$1; } else { push @txt, "\t$_"; s/#.*$//; @@ -489,7 +511,6 @@ while () { } $doit->(); print $out_fh "/* ex: set ro: */\n"; -print "$path has been updated\n"; __DATA__ LNBREAK: Line Break: \R @@ -532,3 +553,7 @@ VERTWS: Vertical Whitespace: \v \V 0x2028 # LINE SEPARATOR 0x2029 # PARAGRAPH SEPARATOR +TRICKYFOLD: Problematic fold case letters. +0x00DF # LATIN SMALL LETTER SHARP S +0x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS +0x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS diff --git a/regcharclass.h b/regcharclass.h index 40d21bf..8425693 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -9,7 +9,7 @@ * * !!!!!!! DO NOT EDIT THIS FILE !!!!!!! * This file is built by Porting/regcharclass.pl. - * (Generated at: Mon Apr 23 15:30:51 2007 GMT) + * (Generated at: Tue Apr 24 12:19:13 2007 GMT) * Any changes made here will be lost! */ @@ -105,9 +105,9 @@ /*** GENERATED CODE ***/ #define is_LNBREAK_cp(cp) \ -( (0x0A <= cp && cp <= 0x0D) || ( cp > 13 && \ -( cp == 0x85 || ( cp > 133 && \ -( cp == 0x2028 || ( cp > 8232 && \ +( (0x0A <= cp && cp <= 0x0D) ||( cp > 0x0D && \ +( cp == 0x85 ||( cp > 0x85 && \ +( cp == 0x2028 ||( cp > 0x2028 && \ cp == 0x2029 ) ) ) ) ) ) /* @@ -227,14 +227,14 @@ cp == 0x2029 ) ) ) ) ) ) /*** GENERATED CODE ***/ #define is_HORIZWS_cp(cp) \ -( cp == 0x09 || ( cp > 9 && \ -( cp == 0x20 || ( cp > 32 && \ -( cp == 0xA0 || ( cp > 160 && \ -( cp == 0x1680 || ( cp > 5760 && \ -( cp == 0x180E || ( cp > 6158 && \ -( (0x2000 <= cp && cp <= 0x200A) || ( cp > 8202 && \ -( cp == 0x202F || ( cp > 8239 && \ -( cp == 0x205F || ( cp > 8287 && \ +( cp == 0x09 ||( cp > 0x09 && \ +( cp == 0x20 ||( cp > 0x20 && \ +( cp == 0xA0 ||( cp > 0xA0 && \ +( cp == 0x1680 ||( cp > 0x1680 && \ +( cp == 0x180E ||( cp > 0x180E && \ +( (0x2000 <= cp && cp <= 0x200A) ||( cp > 0x200A && \ +( cp == 0x202F ||( cp > 0x202F && \ +( cp == 0x205F ||( cp > 0x205F && \ cp == 0x3000 ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) /* @@ -310,9 +310,62 @@ cp == 0x3000 ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) /*** GENERATED CODE ***/ #define is_VERTWS_cp(cp) \ -( (0x0A <= cp && cp <= 0x0D) || ( cp > 13 && \ -( cp == 0x85 || ( cp > 133 && \ -( cp == 0x2028 || ( cp > 8232 && \ +( (0x0A <= cp && cp <= 0x0D) ||( cp > 0x0D && \ +( cp == 0x85 ||( cp > 0x85 && \ +( cp == 0x2028 ||( cp > 0x2028 && \ cp == 0x2029 ) ) ) ) ) ) +/* + TRICKYFOLD: Problematic fold case letters. + + 0x00DF # LATIN SMALL LETTER SHARP S + 0x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS + 0x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS +*/ +/*** GENERATED CODE ***/ +#define is_TRICKYFOLD(s,is_utf8) \ +( (is_utf8) ? \ + ( ( ((U8*)s)[0] == 0xC3 ) ? \ + ( ( ((U8*)s)[1] == 0x9F ) ? 2 : 0 ) : \ + ((( ((U8*)s)[0] == 0xCE ) && ( ((U8*)s)[1] == 0x90 || ((U8*)s)[1] == 0xB0 )) ? 2 : 0) ) :\ + ( ((U8*)s)[0] == 0xDF ) ) + +/*** GENERATED CODE ***/ +#define is_TRICKYFOLD_safe(s,e,is_utf8) \ +( ( (e) - (s) > 1 ) ? \ +( (is_utf8) ? \ + ( ( ((U8*)s)[0] == 0xC3 ) ? \ + ( ( ((U8*)s)[1] == 0x9F ) ? 2 : 0 ) : \ + ((( ((U8*)s)[0] == 0xCE ) && ( ((U8*)s)[1] == 0x90 || ((U8*)s)[1] == 0xB0 )) ? 2 : 0) ) :\ + ( ((U8*)s)[0] == 0xDF ) ) : \ +((( (e) - (s) > 0 ) && (!is_utf8)) ? ( ((U8*)s)[0] == 0xDF ) : 0) ) + +/*** GENERATED CODE ***/ +#define is_TRICKYFOLD_utf8(s) \ +( ( ((U8*)s)[0] == 0xC3 ) ? \ + ( ( ((U8*)s)[1] == 0x9F ) ? 2 : 0 ) : \ + ((( ((U8*)s)[0] == 0xCE ) && ( ((U8*)s)[1] == 0x90 || ((U8*)s)[1] == 0xB0 )) ? 2 : 0) ) + +/*** GENERATED CODE ***/ +#define is_TRICKYFOLD_utf8_safe(s,e) \ +( ( (e) - (s) > 1 ) ? \ + ( ( ((U8*)s)[0] == 0xC3 ) ? \ + ( ( ((U8*)s)[1] == 0x9F ) ? 2 : 0 ) : \ + ((( ((U8*)s)[0] == 0xCE ) && ( ((U8*)s)[1] == 0x90 || ((U8*)s)[1] == 0xB0 )) ? 2 : 0) ) : 0 ) + +/*** GENERATED CODE ***/ +#define is_TRICKYFOLD_latin1(s) \ +( ((U8*)s)[0] == 0xDF ) + +/*** GENERATED CODE ***/ +#define is_TRICKYFOLD_latin1_safe(s,e) \ +( ( (e) - (s) > 0 ) ? \ + ( ((U8*)s)[0] == 0xDF ) : 0 ) + +/*** GENERATED CODE ***/ +#define is_TRICKYFOLD_cp(cp) \ +( cp == 0xDF ||( cp > 0xDF && \ +( cp == 0x390 ||( cp > 0x390 && \ +cp == 0x3B0 ) ) ) ) + /* ex: set ro: */ diff --git a/regcomp.sym b/regcomp.sym index 070fe98..b0b9faf 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -192,6 +192,8 @@ NVERTWS NVERTWS, none not vertical whitespace (Perl 6) HORIZWS HORIZWS, none horizontal whitespace (Perl 6) NHORIZWS NHORIZWS, none not horizontal whitespace (Perl 6) +FOLDCHAR FOLDCHAR, codepoint 1 codepoint with tricky case folding properties. + # NEW STUFF ABOVE THIS LINE ################################################################################ diff --git a/regexec.c b/regexec.c index d84190b..374d480 100644 --- a/regexec.c +++ b/regexec.c @@ -5004,7 +5004,34 @@ NULL sayNO; /* NOTREACHED */ #undef ST + case FOLDCHAR: + n = ARG(scan); + if (nextchr==n) { + locinput += UTF8SKIP(locinput); + } else { + /* This malarky is to handle LATIN SMALL LETTER SHARP S + properly. Sigh */ + if (0xDF==n && (UTF||do_utf8) && + toLOWER(locinput[0])=='s' && toLOWER(locinput[1])=='s') + { + locinput += 2; + } else if (do_utf8) { + U8 tmpbuf1[UTF8_MAXBYTES_CASE+1]; + STRLEN tmplen1; + U8 tmpbuf2[UTF8_MAXBYTES_CASE+1]; + STRLEN tmplen2; + to_uni_fold(n, tmpbuf1, &tmplen1); + to_utf8_fold(locinput, tmpbuf2, &tmplen2); + if (tmplen1!=tmplen2 || !strnEQ(tmpbuf1,tmpbuf2,tmplen1)) + sayNO; + else + locinput += UTF8SKIP(locinput); + } else + sayNO; + } + nextchr = UCHARAT(locinput); + break; case LNBREAK: if ((n=is_LNBREAK(locinput,do_utf8))) { locinput += n; diff --git a/regnodes.h b/regnodes.h index 3c3a5d6..4e0f44d 100644 --- a/regnodes.h +++ b/regnodes.h @@ -6,8 +6,8 @@ /* Regops and State definitions */ -#define REGNODE_MAX 89 -#define REGMATCH_STATE_MAX 129 +#define REGNODE_MAX 90 +#define REGMATCH_STATE_MAX 130 #define END 0 /* 0000 End of program. */ #define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */ @@ -97,8 +97,9 @@ #define NVERTWS 85 /* 0x55 not vertical whitespace (Perl 6) */ #define HORIZWS 86 /* 0x56 horizontal whitespace (Perl 6) */ #define NHORIZWS 87 /* 0x57 not horizontal whitespace (Perl 6) */ -#define OPTIMIZED 88 /* 0x58 Placeholder for dump. */ -#define PSEUDO 89 /* 0x59 Pseudo opcode for internal use. */ +#define FOLDCHAR 88 /* 0x58 codepoint with tricky case folding properties. */ +#define OPTIMIZED 89 /* 0x59 Placeholder for dump. */ +#define PSEUDO 90 /* 0x5a Pseudo opcode for internal use. */ /* ------------ States ------------- */ #define TRIE_next (REGNODE_MAX + 1) /* state for TRIE */ #define TRIE_next_fail (REGNODE_MAX + 2) /* state for TRIE */ @@ -235,6 +236,7 @@ EXTCONST U8 PL_regkind[] = { NVERTWS, /* NVERTWS */ HORIZWS, /* HORIZWS */ NHORIZWS, /* NHORIZWS */ + FOLDCHAR, /* FOLDCHAR */ NOTHING, /* OPTIMIZED */ PSEUDO, /* PSEUDO */ /* ------------ States ------------- */ @@ -373,6 +375,7 @@ static const U8 regarglen[] = { 0, /* NVERTWS */ 0, /* HORIZWS */ 0, /* NHORIZWS */ + EXTRA_SIZE(struct regnode_1), /* FOLDCHAR */ 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; @@ -468,6 +471,7 @@ static const char reg_off_by_arg[] = { 0, /* NVERTWS */ 0, /* HORIZWS */ 0, /* NHORIZWS */ + 0, /* FOLDCHAR */ 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; @@ -568,8 +572,9 @@ EXTCONST char * const PL_reg_name[] = { "NVERTWS", /* 0x55 */ "HORIZWS", /* 0x56 */ "NHORIZWS", /* 0x57 */ - "OPTIMIZED", /* 0x58 */ - "PSEUDO", /* 0x59 */ + "FOLDCHAR", /* 0x58 */ + "OPTIMIZED", /* 0x59 */ + "PSEUDO", /* 0x5a */ /* ------------ States ------------- */ "TRIE_next", /* REGNODE_MAX +0x01 */ "TRIE_next_fail", /* REGNODE_MAX +0x02 */ diff --git a/t/op/pat.t b/t/op/pat.t index a5b98f6..056e26a 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -4346,7 +4346,38 @@ sub kt } } } - +{ + # test that \xDF matches properly. this is pretty hacky stuff, + # but its actually needed. the malarky with '-' is to prevent + # compilation caching from playing any role in the test. + my @df= (chr(0xDF),'-',chr(0xDF)); + utf8::upgrade($df[2]); + my @strs= ('ss','sS','Ss','SS',chr(0xDF)); + my @ss= map { ("$_", "$_") } @strs; + utf8::upgrade($ss[$_*2+1]) for 0..$#strs; + + for my $ssi (0..$#ss) { + for my $dfi (0..$#df) { + my $pat= $df[$dfi]; + my $str= $ss[$ssi]; + my $utf_df= ($dfi > 1) ? 'utf8' : ''; + my $utf_ss= ($ssi % 2) ? 'utf8' : ''; + (my $sstr=$str)=~s/\xDF/\\xDF/; + + if ($utf_df || $utf_ss || length($ss[$ssi])==1) { + my $ret= $str=~/$pat/i; + next if $pat eq '-'; + ok($ret, + "\"$sstr\"=~/\\xDF/i (str is @{[$utf_ss||'latin']}, pat is @{[$utf_df||'latin']})"); + } else { + my $ret= $str !~ /$pat/i; + next if $pat eq '-'; + ok($ret, + "\"$sstr\"!~/\\xDF/i (str is @{[$utf_ss||'latin']}, pat is @{[$utf_df||'latin']})"); + } + } + } +} # Test counter is at bottom of file. Put new tests above here. #------------------------------------------------------------------- # Keep the following tests last -- they may crash perl @@ -4428,7 +4459,7 @@ ok($@=~/\QSequence \k... not terminated in regex;\E/); iseq(0+$::test,$::TestCount,"Got the right number of tests!"); # Don't forget to update this! BEGIN { - $::TestCount = 1928; + $::TestCount = 1948; print "1..$::TestCount\n"; }