Re: Analysis of problems with mixed encoding case insensitive matches in regex engine.
[p5sagit/p5-mst-13.2.git] / Porting / regcharclass.pl
CommitLineData
12b72891 1package UTF8::Matcher;
2use strict;
3use warnings;
4use Text::Wrap qw(wrap);
5use Encode;
6use Data::Dumper;
7
0ccab2bc 8our $hex_fmt="0x%02X";
9
12b72891 10# Author: Yves Orton (demerphq) 2007.
11
12=pod
13
14Dynamically generates macros for detecting special charclasses
15in both latin-1, utf8, and codepoint forms.
16
17To regenerate regcharclass.h, run this script from perl-root. No arguments
18are necessary.
19
20Each charclass handler is constructed as follows:
21Each string the charclass must match is rendered as unicode (codepoints>255),
22and if possible as latin1 (codepoints>127), and if possible as "neutral"
23(all codepoints<128).
24
25The rendered strings are then inserted into digit-tries by type and length.
26With shorter strings being added to tries that are allowed to contain longer
27strings, but not vice versa. Thus the "longest" trie contains all strings
28for that charclass.
29
30The following types of trie are generated:
31
32 n - Neutral only. All strings in this type have codepoints<128
33 l - Latin1 only. All strings in this type have a codepoint>127 in them
34 u - UTF8 only. All strings in this type have a codepoint>255 in them
35 L - Latin1. All strings in 'n' and 'l'
36 U - UTF8. All string in 'n' and 'u'
37 c - Codepoint. All strings in U but in codepoint and not utf8 form.
38
39The ternary() routine is responsible for converting the trie data into a
40ternary conditional that matches the required set of strings. The generated
41macro normally takes at least the argument 's' which is expected to be a
42pointer of type C<char *> or C<U8 *>. The condition generated will be
43optimised to match the string as efficiently as possible, with range lookups
44being used where possible, and in some situations relying on "true" to be 1.
45
46ternary() takes two optional arguments, $type which is one of the above
47characters and $ext which is used to add an extra extension to the macro name.
48
49If $type is omitted or false then the generated macro will take an additional
50argument, 'is_utf8'.
51
52If $ext has the string 'safe' in it then the generated macro will take an extra
53argument 'e' for the end of the string, and all lookups will be length checked
54to prevent lookups past e. If 'safe' is not used then the lookup is assumed to
55be guaranteed safe, and no 'e' argument is provided and no length checks are
56made during execution.
57
58The 'c' type is different as compared to the rest. Instead of producing
59a condition that does octet comparisons of a string array, the 'c' type
60produces a macro that takes a single codepoint as an argument (instead of a
61char* or U8*) and does the lookup based on only that char, thus it cannot be
62used to match multi-codepoint sequences like "\r\n" in the LNBREAK charclass.
63This is primarily used for populating charclass bitmaps for codepoints 0..255
64but will also match codepoints in the unicode range if necessary.
65
66Using LNBREAK as an example the following macros will be produced:
67
68=over 4
69
70=item is_LNBREAK(s,is_utf8)
71
72=item is_LNBREAK_safe(s,e,is_utf8)
73
74Do a lookup as apporpriate based on the is_utf8 flag. When possible
75comparisons involving octect<128 are done before checking the is_utf8
76flag, hopefully saving time.
77
78=item is_LNBREAK_utf8(s)
79
80=item is_LNBREAK_utf8_safe(s,e)
81
82Do a lookup assuming the string is encoded in (normalized) UTF8.
83
84=item is_LNBREAK_latin1(s)
85
86=item is_LNBREAK_latin1_safe(s,e)
87
88Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
89
90=item is_LNBREAK_cp(cp)
91
92Check to see if the string matches a given codepoint (hypotethically a
93U32). The condition is constructed as as to "break out" as early as
94possible if the codepoint is out of range of the condition.
95
96IOW:
97
98 (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
99
100Thus if the character is X+1 only two comparisons will be done. Making
101matching lookups slower, but non-matching faster.
102
103=back
104
105=cut
106
107# store a list of numbers into a hash based trie.
108sub _trie_store {
109 my $root= shift;
110 foreach my $b ( @_ ) {
111 $root->{$b} ||= {};
112 $root= $root->{$b};
113 }
114 $root->{''}++;
115}
116
117# Convert a string into its neutral, latin1, utf8 forms, where
118# the form is undefined unless the string can be completely represented
119# in that form. The string is then decomposed into the octects representing
120# it. A list is returned for each. Additional a list of codepoints making
121# up the string.
122# returns (\@n,\@u,\@l,\@cp)
123#
124sub _uni_latin1 {
125 my $str= shift;
126 my $u= eval { Encode::encode( "utf8", "$str", Encode::FB_CROAK ) };
127 my $l= eval { Encode::encode( "iso-8859-1", "$str", Encode::FB_CROAK ) };
128 my $n= $l;
129 undef $n if defined( $n ) && $str =~ /[^\x00-\x7F]/;
130 return ((map { $_ ? [ unpack "U0C*", $_ ] : $_ } ( $n, $u, $l )),
131 [map { ord $_ } split //,$str]);
132}
133
134# store an array ref of char data into the appropriate
135# type bins, tracking sizes as we go.
136sub _store {
137 my ( $self, $r, @k )= @_;
138 for my $z ( @k ) {
139 $self->{size}{$z}{ 0 + @$r }++;
140 push @{ $self->{data}{$z} }, $r;
141 }
142}
143
144# construct a new charclass constructor object.
145# $title ends up in the code a as a comment.
146# $opcode is the name of the operation the charclass implements.
147# the rest of the arguments are strings that the charclass
148# can match.
149sub new {
150 my $class= shift;
151 my $title= shift;
152 my $opcode= shift;
153 my $self= bless { op => $opcode, title => $title }, $class;
154 my %seen;
155 # convert the strings to the numeric equivelents and store
156 # them for later insertion while tracking their sizes.
157 foreach my $seq ( @_ ) {
158 next if $seen{$seq}++;
159 push @{$self->{seq}},$seq;
160 my ( $n, $u, $l,$cp )= _uni_latin1( $seq );
161 if ( $n ) {
162 _store( $self, $n, qw(n U L) );
163 } else {
164 if ( $l ) {
165 _store( $self, $l, qw(l L) );
166 }
167 _store( $self, $u, qw(u U) );
168 }
169 _store($self,$cp,'c');
170 }
171 #
172 # now construct the tries. For each type of data we insert
173 # the data into all the tries of length $size and smaller.
174 #
175
176 my %allsize;
177 foreach my $k ( keys %{ $self->{data} } ) {
178 my @size= sort { $b <=> $a } keys %{ $self->{size}{$k} };
179 $self->{size}{$k}=\@size;
180 undef @allsize{@size};
181 foreach my $d ( @{ $self->{data}{$k} } ) {
182 foreach my $sz ( @size ) {
183 last if $sz < @$d;
184 $self->{trie}{$k}{$sz} ||= {};
185 _trie_store( $self->{trie}{$k}{$sz}, @$d );
186 }
187 }
188 #delete $self->{data}{$k};
189 }
190 my @size= sort { $b <=> $a } keys %allsize;
191 $self->{size}{''}= \@size;
192 return $self;
193}
194
195#
196# _cond([$v1,$v2,$v2...],$ofs)
197#
198# converts an array of codepoints into a conditional expression
199# consequtive codepoints are merged into a range test
200# returns a string containing the conditional expression in the form
201# '( li[x]==v || li[x]==y )' When possible we also use range lookups.
202
203sub _cond {
204 my ( $c, $ofs,$fmt )= @_;
205 $fmt||='((U8*)s)[%d]';
206 # cheapo rangification routine.
207 # Convert the first element into a singleton represented
208 # as [$x,$x] and then merge the rest in as we go.
209 my @v= sort { $a <=> $b } @$c;
210 my @r= ( [ ( shift @v ) x 2 ] );
211 for my $n ( @v ) {
212 if ( $n == $r[-1][1] + 1 ) {
213 $r[-1][1]++;
214 } else {
215 push @r, [ $n, $n ];
216 }
217 }
218 @r = map { $_->[0]==$_->[1]-1 ? ([$_->[0],$_->[0]],[$_->[1],$_->[1]]) : $_} @r;
219 # sort the ranges by size and order.
220 @r= sort { $a->[0] <=> $b->[0] } @r;
221 my $alu= sprintf $fmt,$ofs; # C array look up
222
223 if ($fmt=~/%d/) {
224 # map the ranges into conditions
225 @r= map {
226 # singleton
0ccab2bc 227 $_->[0] == $_->[1] ? sprintf("$alu == $hex_fmt",$_->[0]) :
12b72891 228 # range
0ccab2bc 229 sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_)
12b72891 230 } @r;
231 # return the joined results.
232 return '( ' . join( " || ", @r ) . ' )';
233 } else {
234 return combine($alu,@r);
235 }
236}
237
238#
239# Do the condition in such a way that we break out early if the value
240# we are looking at is in between two elements in the list.
241# Currently used only for codepoint macros (depth 1)
242#
243sub combine {
244 my $alu=shift;
245 local $_ = shift;
246 my $txt= $_->[0] == $_->[1]
0ccab2bc 247 ? sprintf("$alu == $hex_fmt",$_->[0])
248 : sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_);
12b72891 249 return $txt unless @_;
32e6a07c 250 return sprintf "( %s ||( %s > 0x%02X &&\n%s ) )",
251 $txt,$alu,$_->[1],combine($alu,@_);
12b72891 252}
253
254# recursively convert a trie to an optree represented by
255# [condition,yes,no] where yes and no can be a ref to another optree
256# or a scalar representing code.
257# called by make_optree
258
259sub _trie_to_optree {
260 my ( $node, $ofs, $else, $fmt )= @_;
261 return $else unless $node;
262 $ofs ||= 0;
263 if ( $node->{''} ) {
264 $else= $ofs;
265 } else {
266 $else ||= 0;
267 }
268 my @k= sort { $b->[1] cmp $a->[1] || $a->[0] <=> $b->[0] }
269 map { [ $_, Dumper( $node->{$_} ), $node->{$_} ] }
270 grep length, keys %$node;
271
272 return $ofs if !@k;
273
274 my ( $root, $expr );
275 while ( @k ) {
276 my @cond= ( $k[0][0] );
277 my $d= $k[0][1];
278 my $r= $k[0][2];
279 shift @k;
280 while ( @k && $k[0][1] eq $d ) {
281 push @cond, $k[0][0];
282 shift @k;
283 }
284 my $op=
285 [ _cond( \@cond, $ofs, $fmt ), _trie_to_optree( $r, $ofs + 1, $else, $fmt ) ];
286 if ( !$root ) {
287 $root= $expr= $op;
288 } else {
289 push @$expr, $op;
290 $expr= $op;
291 }
292 }
293 push @$expr, $else;
294 return $root;
295}
296
297# construct the optree for a type.
298# handles the special logic of type ''.
299sub make_optree {
300 my ( $self, $type, $size, $fmt )= @_;
301 my $else= 0;
302 $size||=$self->{size}{$type}[0];
303 $size=1 if $type eq 'c';
304 if ( !$type ) {
305 my ( $u, $l );
32e6a07c 306 if ($self->{trie}{u}) {
307 for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) {
308 $u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt );
309 }
12b72891 310 }
32e6a07c 311 if ($self->{trie}{l}) {
312 for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) {
313 $l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt );
314 }
12b72891 315 }
316 if ( $u ) {
317 $else= [ '(is_utf8)', $u, $l || 0 ];
318 } elsif ( $l ) {
319 $else= [ '(!is_utf8)', $l, 0 ];
320 }
321 $type= 'n';
12b72891 322 }
32e6a07c 323 if (!$self->{trie}{$type}) {
324 return $else;
325 } else {
326 $size-- while $size>0 && !$self->{trie}{$type}{$size};
327 return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt );
328 }
12b72891 329}
330
331# construct the optree for a type with length checks to prevent buffer
332# overruns. Only one length check is performed per lookup trading code
333# size for speed.
334sub length_optree {
335 my ( $self, $type,$fmt )= @_;
336 $type ||= '';
337 return $self->{len_op}{$type} if $self->{len_op}{$type};
338 my @size = @{$self->{size}{$type}};
339
340 my ( $root, $expr );
341 foreach my $size ( @size ) {
342 my $op= [
343 "( (e) - (s) > " . ( $size - 1 ) . " )",
344 $self->make_optree( $type, $size ),
345 ];
346 if ( !$root ) {
347 $root= $expr= $op;
348 } else {
349 push @$expr, $op;
350 $expr= $op;
351 }
352 }
353 push @$expr, 0;
354 return $self->{len_op}{$type}= $root ? $root : $expr->[0];
355}
356
357#
358# recursively walk an optree and covert it to a huge nested ternary expression.
359#
360sub _optree_to_ternary {
361 my ( $node )= @_;
362 return $node
363 if !ref $node;
364 my $depth = 0;
365 if ( $node->[0] =~ /\[(\d+)\]/ ) {
366 $depth= $1 + 1;
367 }
368 return sprintf "\n%s( %s ? %s : %s )", " " x $depth, $node->[0],
369 _optree_to_ternary( $node->[1] ), _optree_to_ternary( $node->[2] );
370}
371
372# add \\ to the end of strings in a reasonable neat way.
373sub _macro($) {
374 my $str= shift;
375 my @lines= split /[^\S\n]*\n/, $str;
0ccab2bc 376 my $macro = join( "\\\n", map { sprintf "%-76s", $_ } @lines );
377 $macro =~ s/ *$//;
378 return $macro . "\n\n";
12b72891 379}
380
381# default type extensions. 'uln' dont have one because normally
382# they are used only as part of type '' which doesnt get an extension
383my %ext= (
384 U => '_utf8',
385 L => '_latin1',
386 c => '_cp',
387
388);
389
390# produce the ternary, handling arguments and putting on the macro headers
391# and boiler plate
392sub ternary {
393 my ( $self, $type, $ext )= @_;
394 $type ||= '';
395 $ext = ($ext{$type} || '') . ($ext||"");
396 my ($root,$fmt,$arg);
397 if ($type eq 'c') {
398 $arg= $fmt= 'cp';
399 } else {
400 $arg= 's';
401 }
402 if ( $type eq 'c' || $ext !~ /safe/) {
403 $root= $self->make_optree( $type, 0, $fmt );
404 } else {
405 $root= $self->length_optree( $type, $fmt );
406 }
407
408 our $parens;
409 $parens= qr/ \( (?: (?> [^()]+? ) | (??{$parens}) )+? \) /x;
410 my $expr= qr/
411 \( \s*
412 ($parens)
413 \s* \? \s*
414 \( \s*
415 ($parens)
416 \s* \? \s*
417 (\d+|$parens)
418 \s* : \s*
419 (\d+|$parens)
420 \s* \)
421 \s* : \s*
422 \4
423 \s* \)
424 /x;
425 my $code= _optree_to_ternary( $root );
426 for ( $code ) {
427 s/^\s*//;
428 1 while s/\(\s*($parens)\s*\?\s*1\s*:\s*0\s*\)/$1/g
429 || s<$expr><(($1 && $2) ? $3 : $4)>g
430 || s<\(\s*($parens)\s*\)><$1>g;
431 }
432 my @args=($arg);
433 push @args,'e' if $ext=~/safe/;
434 push @args,'is_utf8' if !$type;
435 my $args=join ",",@args;
436 return "/*** GENERATED CODE ***/\n"
437 . _macro "#define is_$self->{op}$ext($args)\n$code";
438}
32e6a07c 439$|++;
12b72891 440my $path=shift @ARGV;
32e6a07c 441
12b72891 442if (!$path) {
443 $path= "regcharclass.h";
444 if (!-e $path) { $path="../$path" }
445 if (!-e $path) { die "Can't find regcharclass.h to update!\n" };
446}
32e6a07c 447my $out_fh;
448if ($path eq '-') {
449 $out_fh= \*STDOUT;
450} else {
451 rename $path,"$path.bak";
452 open $out_fh,">",$path
453 or die "Can't write to '$path':$!";
454 binmode $out_fh; # want unix line endings even when run on win32.
455}
58fbde93 456my ($zero) = $0=~/([^\\\/]+)$/;
12b72891 457print $out_fh <<"HEADER";
58fbde93 458/* -*- buffer-read-only: t -*-
459 *
460 * regcharclass.h
461 *
462 * Copyright (C) 2007, by Larry Wall and others
463 *
464 * You may distribute under the terms of either the GNU General Public
465 * License or the Artistic License, as specified in the README file.
466 *
467 * !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
468 * This file is built by Porting/$zero.
469 * (Generated at: @{[ scalar gmtime ]} GMT)
470 * Any changes made here will be lost!
471 */
12b72891 472
473HEADER
474
32e6a07c 475my ($op,$title,@strs,@txt,$type);
12b72891 476my $doit= sub {
477 return unless $op;
478 my $o= __PACKAGE__->new($title,$op,@strs);
479 print $out_fh "/*\n\t$o->{op}: $o->{title}\n\n";
480 print $out_fh join "\n",@txt,"*/","";
32e6a07c 481 $type||="U L c _safe";
482 my @ext=("");
483 my @types=("",map{ if (length $_>1) { push @ext,$_; () } else { $_ } }
484 split /\s+/,$type);
485 for my $type (@types) {
486 for my $ext (@ext) {
487 next if $type eq 'c' and $ext eq '_safe';
488 print $out_fh $o->ternary( $type,$ext );
489 }
12b72891 490 }
12b72891 491};
492while (<DATA>) {
493 next unless /\S/;
494 chomp;
495 if (/^([A-Z]+)/) {
496 $doit->();
497 ($op,$title)=split /\s*:\s*/,$_,2;
498 @txt=@strs=();
32e6a07c 499 $type="";
500 } elsif (/^=(.*)/) {
501 $type.=$1;
12b72891 502 } else {
503 push @txt, "\t$_";
504 s/#.*$//;
505 if (/^0x/) {
506 push @strs,map { chr $_ } eval $_;
507 } elsif (/^[""'']/) {
508 push @strs,eval $_;
509 }
510 }
511}
512$doit->();
58fbde93 513print $out_fh "/* ex: set ro: */\n";
12b72891 514
515__DATA__
516LNBREAK: Line Break: \R
517"\x0D\x0A" # CRLF - Network (Windows) line ending
5180x0A # LF | LINE FEED
5190x0B # VT | VERTICAL TAB
5200x0C # FF | FORM FEED
5210x0D # CR | CARRIAGE RETURN
5220x85 # NEL | NEXT LINE
5230x2028 # LINE SEPARATOR
5240x2029 # PARAGRAPH SEPARATOR
525
526HORIZWS: Horizontal Whitespace: \h \H
5270x09 # HT
5280x20 # SPACE
5290xa0 # NBSP
5300x1680 # OGHAM SPACE MARK
5310x180e # MONGOLIAN VOWEL SEPARATOR
5320x2000 # EN QUAD
5330x2001 # EM QUAD
5340x2002 # EN SPACE
5350x2003 # EM SPACE
5360x2004 # THREE-PER-EM SPACE
5370x2005 # FOUR-PER-EM SPACE
5380x2006 # SIX-PER-EM SPACE
5390x2007 # FIGURE SPACE
5400x2008 # PUNCTUATION SPACE
5410x2009 # THIN SPACE
5420x200A # HAIR SPACE
5430x202f # NARROW NO-BREAK SPACE
5440x205f # MEDIUM MATHEMATICAL SPACE
5450x3000 # IDEOGRAPHIC SPACE
546
547VERTWS: Vertical Whitespace: \v \V
5480x0A # LF
5490x0B # VT
5500x0C # FF
5510x0D # CR
5520x85 # NEL
5530x2028 # LINE SEPARATOR
5540x2029 # PARAGRAPH SEPARATOR
555
32e6a07c 556TRICKYFOLD: Problematic fold case letters.
5570x00DF # LATIN SMALL LETTER SHARP S
5580x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
5590x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS