Commit | Line | Data |
12b72891 |
1 | package UTF8::Matcher; |
2 | use strict; |
3 | use warnings; |
4 | use Text::Wrap qw(wrap); |
5 | use Encode; |
6 | use Data::Dumper; |
7 | |
0ccab2bc |
8 | our $hex_fmt="0x%02X"; |
9 | |
12b72891 |
10 | # Author: Yves Orton (demerphq) 2007. |
11 | |
12 | =pod |
13 | |
14 | Dynamically generates macros for detecting special charclasses |
15 | in both latin-1, utf8, and codepoint forms. |
16 | |
17 | To regenerate regcharclass.h, run this script from perl-root. No arguments |
18 | are necessary. |
19 | |
20 | Each charclass handler is constructed as follows: |
21 | Each string the charclass must match is rendered as unicode (codepoints>255), |
22 | and if possible as latin1 (codepoints>127), and if possible as "neutral" |
23 | (all codepoints<128). |
24 | |
25 | The rendered strings are then inserted into digit-tries by type and length. |
26 | With shorter strings being added to tries that are allowed to contain longer |
27 | strings, but not vice versa. Thus the "longest" trie contains all strings |
28 | for that charclass. |
29 | |
30 | The following types of trie are generated: |
31 | |
32 | n - Neutral only. All strings in this type have codepoints<128 |
33 | l - Latin1 only. All strings in this type have a codepoint>127 in them |
34 | u - UTF8 only. All strings in this type have a codepoint>255 in them |
35 | L - Latin1. All strings in 'n' and 'l' |
36 | U - UTF8. All string in 'n' and 'u' |
37 | c - Codepoint. All strings in U but in codepoint and not utf8 form. |
38 | |
39 | The ternary() routine is responsible for converting the trie data into a |
40 | ternary conditional that matches the required set of strings. The generated |
41 | macro normally takes at least the argument 's' which is expected to be a |
42 | pointer of type C<char *> or C<U8 *>. The condition generated will be |
43 | optimised to match the string as efficiently as possible, with range lookups |
44 | being used where possible, and in some situations relying on "true" to be 1. |
45 | |
46 | ternary() takes two optional arguments, $type which is one of the above |
47 | characters and $ext which is used to add an extra extension to the macro name. |
48 | |
49 | If $type is omitted or false then the generated macro will take an additional |
50 | argument, 'is_utf8'. |
51 | |
52 | If $ext has the string 'safe' in it then the generated macro will take an extra |
53 | argument 'e' for the end of the string, and all lookups will be length checked |
54 | to prevent lookups past e. If 'safe' is not used then the lookup is assumed to |
55 | be guaranteed safe, and no 'e' argument is provided and no length checks are |
56 | made during execution. |
57 | |
58 | The 'c' type is different as compared to the rest. Instead of producing |
59 | a condition that does octet comparisons of a string array, the 'c' type |
60 | produces a macro that takes a single codepoint as an argument (instead of a |
61 | char* or U8*) and does the lookup based on only that char, thus it cannot be |
62 | used to match multi-codepoint sequences like "\r\n" in the LNBREAK charclass. |
63 | This is primarily used for populating charclass bitmaps for codepoints 0..255 |
64 | but will also match codepoints in the unicode range if necessary. |
65 | |
66 | Using LNBREAK as an example the following macros will be produced: |
67 | |
68 | =over 4 |
69 | |
70 | =item is_LNBREAK(s,is_utf8) |
71 | |
72 | =item is_LNBREAK_safe(s,e,is_utf8) |
73 | |
74 | Do a lookup as apporpriate based on the is_utf8 flag. When possible |
75 | comparisons involving octect<128 are done before checking the is_utf8 |
76 | flag, hopefully saving time. |
77 | |
78 | =item is_LNBREAK_utf8(s) |
79 | |
80 | =item is_LNBREAK_utf8_safe(s,e) |
81 | |
82 | Do a lookup assuming the string is encoded in (normalized) UTF8. |
83 | |
84 | =item is_LNBREAK_latin1(s) |
85 | |
86 | =item is_LNBREAK_latin1_safe(s,e) |
87 | |
88 | Do a lookup assuming the string is encoded in latin-1 (aka plan octets). |
89 | |
90 | =item is_LNBREAK_cp(cp) |
91 | |
92 | Check to see if the string matches a given codepoint (hypotethically a |
93 | U32). The condition is constructed as as to "break out" as early as |
94 | possible if the codepoint is out of range of the condition. |
95 | |
96 | IOW: |
97 | |
98 | (cp==X || (cp>X && (cp==Y || (cp>Y && ...)))) |
99 | |
100 | Thus if the character is X+1 only two comparisons will be done. Making |
101 | matching lookups slower, but non-matching faster. |
102 | |
103 | =back |
104 | |
105 | =cut |
106 | |
107 | # store a list of numbers into a hash based trie. |
108 | sub _trie_store { |
109 | my $root= shift; |
110 | foreach my $b ( @_ ) { |
111 | $root->{$b} ||= {}; |
112 | $root= $root->{$b}; |
113 | } |
114 | $root->{''}++; |
115 | } |
116 | |
117 | # Convert a string into its neutral, latin1, utf8 forms, where |
118 | # the form is undefined unless the string can be completely represented |
119 | # in that form. The string is then decomposed into the octects representing |
120 | # it. A list is returned for each. Additional a list of codepoints making |
121 | # up the string. |
122 | # returns (\@n,\@u,\@l,\@cp) |
123 | # |
124 | sub _uni_latin1 { |
125 | my $str= shift; |
126 | my $u= eval { Encode::encode( "utf8", "$str", Encode::FB_CROAK ) }; |
127 | my $l= eval { Encode::encode( "iso-8859-1", "$str", Encode::FB_CROAK ) }; |
128 | my $n= $l; |
129 | undef $n if defined( $n ) && $str =~ /[^\x00-\x7F]/; |
130 | return ((map { $_ ? [ unpack "U0C*", $_ ] : $_ } ( $n, $u, $l )), |
131 | [map { ord $_ } split //,$str]); |
132 | } |
133 | |
134 | # store an array ref of char data into the appropriate |
135 | # type bins, tracking sizes as we go. |
136 | sub _store { |
137 | my ( $self, $r, @k )= @_; |
138 | for my $z ( @k ) { |
139 | $self->{size}{$z}{ 0 + @$r }++; |
140 | push @{ $self->{data}{$z} }, $r; |
141 | } |
142 | } |
143 | |
144 | # construct a new charclass constructor object. |
145 | # $title ends up in the code a as a comment. |
146 | # $opcode is the name of the operation the charclass implements. |
147 | # the rest of the arguments are strings that the charclass |
148 | # can match. |
149 | sub new { |
150 | my $class= shift; |
151 | my $title= shift; |
152 | my $opcode= shift; |
153 | my $self= bless { op => $opcode, title => $title }, $class; |
154 | my %seen; |
155 | # convert the strings to the numeric equivelents and store |
156 | # them for later insertion while tracking their sizes. |
157 | foreach my $seq ( @_ ) { |
158 | next if $seen{$seq}++; |
159 | push @{$self->{seq}},$seq; |
160 | my ( $n, $u, $l,$cp )= _uni_latin1( $seq ); |
161 | if ( $n ) { |
162 | _store( $self, $n, qw(n U L) ); |
163 | } else { |
164 | if ( $l ) { |
165 | _store( $self, $l, qw(l L) ); |
166 | } |
167 | _store( $self, $u, qw(u U) ); |
168 | } |
169 | _store($self,$cp,'c'); |
170 | } |
171 | # |
172 | # now construct the tries. For each type of data we insert |
173 | # the data into all the tries of length $size and smaller. |
174 | # |
175 | |
176 | my %allsize; |
177 | foreach my $k ( keys %{ $self->{data} } ) { |
178 | my @size= sort { $b <=> $a } keys %{ $self->{size}{$k} }; |
179 | $self->{size}{$k}=\@size; |
180 | undef @allsize{@size}; |
181 | foreach my $d ( @{ $self->{data}{$k} } ) { |
182 | foreach my $sz ( @size ) { |
183 | last if $sz < @$d; |
184 | $self->{trie}{$k}{$sz} ||= {}; |
185 | _trie_store( $self->{trie}{$k}{$sz}, @$d ); |
186 | } |
187 | } |
188 | #delete $self->{data}{$k}; |
189 | } |
190 | my @size= sort { $b <=> $a } keys %allsize; |
191 | $self->{size}{''}= \@size; |
192 | return $self; |
193 | } |
194 | |
195 | # |
196 | # _cond([$v1,$v2,$v2...],$ofs) |
197 | # |
198 | # converts an array of codepoints into a conditional expression |
199 | # consequtive codepoints are merged into a range test |
200 | # returns a string containing the conditional expression in the form |
201 | # '( li[x]==v || li[x]==y )' When possible we also use range lookups. |
202 | |
203 | sub _cond { |
204 | my ( $c, $ofs,$fmt )= @_; |
205 | $fmt||='((U8*)s)[%d]'; |
206 | # cheapo rangification routine. |
207 | # Convert the first element into a singleton represented |
208 | # as [$x,$x] and then merge the rest in as we go. |
209 | my @v= sort { $a <=> $b } @$c; |
210 | my @r= ( [ ( shift @v ) x 2 ] ); |
211 | for my $n ( @v ) { |
212 | if ( $n == $r[-1][1] + 1 ) { |
213 | $r[-1][1]++; |
214 | } else { |
215 | push @r, [ $n, $n ]; |
216 | } |
217 | } |
218 | @r = map { $_->[0]==$_->[1]-1 ? ([$_->[0],$_->[0]],[$_->[1],$_->[1]]) : $_} @r; |
219 | # sort the ranges by size and order. |
220 | @r= sort { $a->[0] <=> $b->[0] } @r; |
221 | my $alu= sprintf $fmt,$ofs; # C array look up |
222 | |
223 | if ($fmt=~/%d/) { |
224 | # map the ranges into conditions |
225 | @r= map { |
226 | # singleton |
0ccab2bc |
227 | $_->[0] == $_->[1] ? sprintf("$alu == $hex_fmt",$_->[0]) : |
12b72891 |
228 | # range |
0ccab2bc |
229 | sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_) |
12b72891 |
230 | } @r; |
231 | # return the joined results. |
232 | return '( ' . join( " || ", @r ) . ' )'; |
233 | } else { |
234 | return combine($alu,@r); |
235 | } |
236 | } |
237 | |
238 | # |
239 | # Do the condition in such a way that we break out early if the value |
240 | # we are looking at is in between two elements in the list. |
241 | # Currently used only for codepoint macros (depth 1) |
242 | # |
243 | sub combine { |
244 | my $alu=shift; |
245 | local $_ = shift; |
246 | my $txt= $_->[0] == $_->[1] |
0ccab2bc |
247 | ? sprintf("$alu == $hex_fmt",$_->[0]) |
248 | : sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_); |
12b72891 |
249 | return $txt unless @_; |
32e6a07c |
250 | return sprintf "( %s ||( %s > 0x%02X &&\n%s ) )", |
251 | $txt,$alu,$_->[1],combine($alu,@_); |
12b72891 |
252 | } |
253 | |
254 | # recursively convert a trie to an optree represented by |
255 | # [condition,yes,no] where yes and no can be a ref to another optree |
256 | # or a scalar representing code. |
257 | # called by make_optree |
258 | |
259 | sub _trie_to_optree { |
260 | my ( $node, $ofs, $else, $fmt )= @_; |
261 | return $else unless $node; |
262 | $ofs ||= 0; |
263 | if ( $node->{''} ) { |
264 | $else= $ofs; |
265 | } else { |
266 | $else ||= 0; |
267 | } |
268 | my @k= sort { $b->[1] cmp $a->[1] || $a->[0] <=> $b->[0] } |
269 | map { [ $_, Dumper( $node->{$_} ), $node->{$_} ] } |
270 | grep length, keys %$node; |
271 | |
272 | return $ofs if !@k; |
273 | |
274 | my ( $root, $expr ); |
275 | while ( @k ) { |
276 | my @cond= ( $k[0][0] ); |
277 | my $d= $k[0][1]; |
278 | my $r= $k[0][2]; |
279 | shift @k; |
280 | while ( @k && $k[0][1] eq $d ) { |
281 | push @cond, $k[0][0]; |
282 | shift @k; |
283 | } |
284 | my $op= |
285 | [ _cond( \@cond, $ofs, $fmt ), _trie_to_optree( $r, $ofs + 1, $else, $fmt ) ]; |
286 | if ( !$root ) { |
287 | $root= $expr= $op; |
288 | } else { |
289 | push @$expr, $op; |
290 | $expr= $op; |
291 | } |
292 | } |
293 | push @$expr, $else; |
294 | return $root; |
295 | } |
296 | |
297 | # construct the optree for a type. |
298 | # handles the special logic of type ''. |
299 | sub make_optree { |
300 | my ( $self, $type, $size, $fmt )= @_; |
301 | my $else= 0; |
302 | $size||=$self->{size}{$type}[0]; |
303 | $size=1 if $type eq 'c'; |
304 | if ( !$type ) { |
305 | my ( $u, $l ); |
32e6a07c |
306 | if ($self->{trie}{u}) { |
307 | for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) { |
308 | $u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt ); |
309 | } |
12b72891 |
310 | } |
32e6a07c |
311 | if ($self->{trie}{l}) { |
312 | for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) { |
313 | $l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt ); |
314 | } |
12b72891 |
315 | } |
316 | if ( $u ) { |
317 | $else= [ '(is_utf8)', $u, $l || 0 ]; |
318 | } elsif ( $l ) { |
319 | $else= [ '(!is_utf8)', $l, 0 ]; |
320 | } |
321 | $type= 'n'; |
12b72891 |
322 | } |
32e6a07c |
323 | if (!$self->{trie}{$type}) { |
324 | return $else; |
325 | } else { |
326 | $size-- while $size>0 && !$self->{trie}{$type}{$size}; |
327 | return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt ); |
328 | } |
12b72891 |
329 | } |
330 | |
331 | # construct the optree for a type with length checks to prevent buffer |
332 | # overruns. Only one length check is performed per lookup trading code |
333 | # size for speed. |
334 | sub length_optree { |
335 | my ( $self, $type,$fmt )= @_; |
336 | $type ||= ''; |
337 | return $self->{len_op}{$type} if $self->{len_op}{$type}; |
338 | my @size = @{$self->{size}{$type}}; |
339 | |
340 | my ( $root, $expr ); |
341 | foreach my $size ( @size ) { |
342 | my $op= [ |
343 | "( (e) - (s) > " . ( $size - 1 ) . " )", |
344 | $self->make_optree( $type, $size ), |
345 | ]; |
346 | if ( !$root ) { |
347 | $root= $expr= $op; |
348 | } else { |
349 | push @$expr, $op; |
350 | $expr= $op; |
351 | } |
352 | } |
353 | push @$expr, 0; |
354 | return $self->{len_op}{$type}= $root ? $root : $expr->[0]; |
355 | } |
356 | |
357 | # |
358 | # recursively walk an optree and covert it to a huge nested ternary expression. |
359 | # |
360 | sub _optree_to_ternary { |
361 | my ( $node )= @_; |
362 | return $node |
363 | if !ref $node; |
364 | my $depth = 0; |
365 | if ( $node->[0] =~ /\[(\d+)\]/ ) { |
366 | $depth= $1 + 1; |
367 | } |
368 | return sprintf "\n%s( %s ? %s : %s )", " " x $depth, $node->[0], |
369 | _optree_to_ternary( $node->[1] ), _optree_to_ternary( $node->[2] ); |
370 | } |
371 | |
372 | # add \\ to the end of strings in a reasonable neat way. |
373 | sub _macro($) { |
374 | my $str= shift; |
375 | my @lines= split /[^\S\n]*\n/, $str; |
0ccab2bc |
376 | my $macro = join( "\\\n", map { sprintf "%-76s", $_ } @lines ); |
377 | $macro =~ s/ *$//; |
378 | return $macro . "\n\n"; |
12b72891 |
379 | } |
380 | |
381 | # default type extensions. 'uln' dont have one because normally |
382 | # they are used only as part of type '' which doesnt get an extension |
383 | my %ext= ( |
384 | U => '_utf8', |
385 | L => '_latin1', |
386 | c => '_cp', |
387 | |
388 | ); |
389 | |
390 | # produce the ternary, handling arguments and putting on the macro headers |
391 | # and boiler plate |
392 | sub ternary { |
393 | my ( $self, $type, $ext )= @_; |
394 | $type ||= ''; |
395 | $ext = ($ext{$type} || '') . ($ext||""); |
396 | my ($root,$fmt,$arg); |
397 | if ($type eq 'c') { |
398 | $arg= $fmt= 'cp'; |
399 | } else { |
400 | $arg= 's'; |
401 | } |
402 | if ( $type eq 'c' || $ext !~ /safe/) { |
403 | $root= $self->make_optree( $type, 0, $fmt ); |
404 | } else { |
405 | $root= $self->length_optree( $type, $fmt ); |
406 | } |
407 | |
408 | our $parens; |
409 | $parens= qr/ \( (?: (?> [^()]+? ) | (??{$parens}) )+? \) /x; |
410 | my $expr= qr/ |
411 | \( \s* |
412 | ($parens) |
413 | \s* \? \s* |
414 | \( \s* |
415 | ($parens) |
416 | \s* \? \s* |
417 | (\d+|$parens) |
418 | \s* : \s* |
419 | (\d+|$parens) |
420 | \s* \) |
421 | \s* : \s* |
422 | \4 |
423 | \s* \) |
424 | /x; |
425 | my $code= _optree_to_ternary( $root ); |
426 | for ( $code ) { |
427 | s/^\s*//; |
428 | 1 while s/\(\s*($parens)\s*\?\s*1\s*:\s*0\s*\)/$1/g |
429 | || s<$expr><(($1 && $2) ? $3 : $4)>g |
430 | || s<\(\s*($parens)\s*\)><$1>g; |
431 | } |
432 | my @args=($arg); |
433 | push @args,'e' if $ext=~/safe/; |
434 | push @args,'is_utf8' if !$type; |
435 | my $args=join ",",@args; |
436 | return "/*** GENERATED CODE ***/\n" |
437 | . _macro "#define is_$self->{op}$ext($args)\n$code"; |
438 | } |
32e6a07c |
439 | $|++; |
12b72891 |
440 | my $path=shift @ARGV; |
32e6a07c |
441 | |
12b72891 |
442 | if (!$path) { |
443 | $path= "regcharclass.h"; |
444 | if (!-e $path) { $path="../$path" } |
445 | if (!-e $path) { die "Can't find regcharclass.h to update!\n" }; |
446 | } |
32e6a07c |
447 | my $out_fh; |
448 | if ($path eq '-') { |
449 | $out_fh= \*STDOUT; |
450 | } else { |
451 | rename $path,"$path.bak"; |
452 | open $out_fh,">",$path |
453 | or die "Can't write to '$path':$!"; |
454 | binmode $out_fh; # want unix line endings even when run on win32. |
455 | } |
58fbde93 |
456 | my ($zero) = $0=~/([^\\\/]+)$/; |
12b72891 |
457 | print $out_fh <<"HEADER"; |
58fbde93 |
458 | /* -*- buffer-read-only: t -*- |
459 | * |
460 | * regcharclass.h |
461 | * |
462 | * Copyright (C) 2007, by Larry Wall and others |
463 | * |
464 | * You may distribute under the terms of either the GNU General Public |
465 | * License or the Artistic License, as specified in the README file. |
466 | * |
467 | * !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
468 | * This file is built by Porting/$zero. |
469 | * (Generated at: @{[ scalar gmtime ]} GMT) |
470 | * Any changes made here will be lost! |
471 | */ |
12b72891 |
472 | |
473 | HEADER |
474 | |
32e6a07c |
475 | my ($op,$title,@strs,@txt,$type); |
12b72891 |
476 | my $doit= sub { |
477 | return unless $op; |
478 | my $o= __PACKAGE__->new($title,$op,@strs); |
479 | print $out_fh "/*\n\t$o->{op}: $o->{title}\n\n"; |
480 | print $out_fh join "\n",@txt,"*/",""; |
32e6a07c |
481 | $type||="U L c _safe"; |
482 | my @ext=(""); |
483 | my @types=("",map{ if (length $_>1) { push @ext,$_; () } else { $_ } } |
484 | split /\s+/,$type); |
485 | for my $type (@types) { |
486 | for my $ext (@ext) { |
487 | next if $type eq 'c' and $ext eq '_safe'; |
488 | print $out_fh $o->ternary( $type,$ext ); |
489 | } |
12b72891 |
490 | } |
12b72891 |
491 | }; |
492 | while (<DATA>) { |
493 | next unless /\S/; |
494 | chomp; |
495 | if (/^([A-Z]+)/) { |
496 | $doit->(); |
497 | ($op,$title)=split /\s*:\s*/,$_,2; |
498 | @txt=@strs=(); |
32e6a07c |
499 | $type=""; |
500 | } elsif (/^=(.*)/) { |
501 | $type.=$1; |
12b72891 |
502 | } else { |
503 | push @txt, "\t$_"; |
504 | s/#.*$//; |
505 | if (/^0x/) { |
506 | push @strs,map { chr $_ } eval $_; |
507 | } elsif (/^[""'']/) { |
508 | push @strs,eval $_; |
509 | } |
510 | } |
511 | } |
512 | $doit->(); |
58fbde93 |
513 | print $out_fh "/* ex: set ro: */\n"; |
12b72891 |
514 | |
515 | __DATA__ |
516 | LNBREAK: Line Break: \R |
517 | "\x0D\x0A" # CRLF - Network (Windows) line ending |
518 | 0x0A # LF | LINE FEED |
519 | 0x0B # VT | VERTICAL TAB |
520 | 0x0C # FF | FORM FEED |
521 | 0x0D # CR | CARRIAGE RETURN |
522 | 0x85 # NEL | NEXT LINE |
523 | 0x2028 # LINE SEPARATOR |
524 | 0x2029 # PARAGRAPH SEPARATOR |
525 | |
526 | HORIZWS: Horizontal Whitespace: \h \H |
527 | 0x09 # HT |
528 | 0x20 # SPACE |
529 | 0xa0 # NBSP |
530 | 0x1680 # OGHAM SPACE MARK |
531 | 0x180e # MONGOLIAN VOWEL SEPARATOR |
532 | 0x2000 # EN QUAD |
533 | 0x2001 # EM QUAD |
534 | 0x2002 # EN SPACE |
535 | 0x2003 # EM SPACE |
536 | 0x2004 # THREE-PER-EM SPACE |
537 | 0x2005 # FOUR-PER-EM SPACE |
538 | 0x2006 # SIX-PER-EM SPACE |
539 | 0x2007 # FIGURE SPACE |
540 | 0x2008 # PUNCTUATION SPACE |
541 | 0x2009 # THIN SPACE |
542 | 0x200A # HAIR SPACE |
543 | 0x202f # NARROW NO-BREAK SPACE |
544 | 0x205f # MEDIUM MATHEMATICAL SPACE |
545 | 0x3000 # IDEOGRAPHIC SPACE |
546 | |
547 | VERTWS: Vertical Whitespace: \v \V |
548 | 0x0A # LF |
549 | 0x0B # VT |
550 | 0x0C # FF |
551 | 0x0D # CR |
552 | 0x85 # NEL |
553 | 0x2028 # LINE SEPARATOR |
554 | 0x2029 # PARAGRAPH SEPARATOR |
555 | |
32e6a07c |
556 | TRICKYFOLD: Problematic fold case letters. |
557 | 0x00DF # LATIN SMALL LETTER SHARP S |
558 | 0x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS |
559 | 0x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS |