Commit | Line | Data |
12b72891 |
1 | package UTF8::Matcher; |
2 | use strict; |
3 | use warnings; |
4 | use Text::Wrap qw(wrap); |
5 | use Encode; |
6 | use Data::Dumper; |
7 | |
0ccab2bc |
8 | our $hex_fmt="0x%02X"; |
9 | |
12b72891 |
10 | # Author: Yves Orton (demerphq) 2007. |
11 | |
12 | =pod |
13 | |
14 | Dynamically generates macros for detecting special charclasses |
15 | in both latin-1, utf8, and codepoint forms. |
16 | |
17 | To regenerate regcharclass.h, run this script from perl-root. No arguments |
18 | are necessary. |
19 | |
20 | Each charclass handler is constructed as follows: |
21 | Each string the charclass must match is rendered as unicode (codepoints>255), |
22 | and if possible as latin1 (codepoints>127), and if possible as "neutral" |
23 | (all codepoints<128). |
24 | |
25 | The rendered strings are then inserted into digit-tries by type and length. |
26 | With shorter strings being added to tries that are allowed to contain longer |
27 | strings, but not vice versa. Thus the "longest" trie contains all strings |
28 | for that charclass. |
29 | |
30 | The following types of trie are generated: |
31 | |
32 | n - Neutral only. All strings in this type have codepoints<128 |
33 | l - Latin1 only. All strings in this type have a codepoint>127 in them |
34 | u - UTF8 only. All strings in this type have a codepoint>255 in them |
35 | L - Latin1. All strings in 'n' and 'l' |
36 | U - UTF8. All string in 'n' and 'u' |
37 | c - Codepoint. All strings in U but in codepoint and not utf8 form. |
38 | |
39 | The ternary() routine is responsible for converting the trie data into a |
40 | ternary conditional that matches the required set of strings. The generated |
41 | macro normally takes at least the argument 's' which is expected to be a |
42 | pointer of type C<char *> or C<U8 *>. The condition generated will be |
43 | optimised to match the string as efficiently as possible, with range lookups |
44 | being used where possible, and in some situations relying on "true" to be 1. |
45 | |
46 | ternary() takes two optional arguments, $type which is one of the above |
47 | characters and $ext which is used to add an extra extension to the macro name. |
48 | |
49 | If $type is omitted or false then the generated macro will take an additional |
50 | argument, 'is_utf8'. |
51 | |
52 | If $ext has the string 'safe' in it then the generated macro will take an extra |
53 | argument 'e' for the end of the string, and all lookups will be length checked |
54 | to prevent lookups past e. If 'safe' is not used then the lookup is assumed to |
55 | be guaranteed safe, and no 'e' argument is provided and no length checks are |
56 | made during execution. |
57 | |
58 | The 'c' type is different as compared to the rest. Instead of producing |
59 | a condition that does octet comparisons of a string array, the 'c' type |
60 | produces a macro that takes a single codepoint as an argument (instead of a |
61 | char* or U8*) and does the lookup based on only that char, thus it cannot be |
62 | used to match multi-codepoint sequences like "\r\n" in the LNBREAK charclass. |
63 | This is primarily used for populating charclass bitmaps for codepoints 0..255 |
64 | but will also match codepoints in the unicode range if necessary. |
65 | |
66 | Using LNBREAK as an example the following macros will be produced: |
67 | |
68 | =over 4 |
69 | |
70 | =item is_LNBREAK(s,is_utf8) |
71 | |
72 | =item is_LNBREAK_safe(s,e,is_utf8) |
73 | |
74 | Do a lookup as apporpriate based on the is_utf8 flag. When possible |
75 | comparisons involving octect<128 are done before checking the is_utf8 |
76 | flag, hopefully saving time. |
77 | |
78 | =item is_LNBREAK_utf8(s) |
79 | |
80 | =item is_LNBREAK_utf8_safe(s,e) |
81 | |
82 | Do a lookup assuming the string is encoded in (normalized) UTF8. |
83 | |
84 | =item is_LNBREAK_latin1(s) |
85 | |
86 | =item is_LNBREAK_latin1_safe(s,e) |
87 | |
88 | Do a lookup assuming the string is encoded in latin-1 (aka plan octets). |
89 | |
90 | =item is_LNBREAK_cp(cp) |
91 | |
92 | Check to see if the string matches a given codepoint (hypotethically a |
93 | U32). The condition is constructed as as to "break out" as early as |
94 | possible if the codepoint is out of range of the condition. |
95 | |
96 | IOW: |
97 | |
98 | (cp==X || (cp>X && (cp==Y || (cp>Y && ...)))) |
99 | |
100 | Thus if the character is X+1 only two comparisons will be done. Making |
101 | matching lookups slower, but non-matching faster. |
102 | |
103 | =back |
104 | |
105 | =cut |
106 | |
107 | # store a list of numbers into a hash based trie. |
108 | sub _trie_store { |
109 | my $root= shift; |
110 | foreach my $b ( @_ ) { |
111 | $root->{$b} ||= {}; |
112 | $root= $root->{$b}; |
113 | } |
114 | $root->{''}++; |
115 | } |
116 | |
117 | # Convert a string into its neutral, latin1, utf8 forms, where |
118 | # the form is undefined unless the string can be completely represented |
119 | # in that form. The string is then decomposed into the octects representing |
120 | # it. A list is returned for each. Additional a list of codepoints making |
121 | # up the string. |
122 | # returns (\@n,\@u,\@l,\@cp) |
123 | # |
124 | sub _uni_latin1 { |
125 | my $str= shift; |
126 | my $u= eval { Encode::encode( "utf8", "$str", Encode::FB_CROAK ) }; |
127 | my $l= eval { Encode::encode( "iso-8859-1", "$str", Encode::FB_CROAK ) }; |
128 | my $n= $l; |
129 | undef $n if defined( $n ) && $str =~ /[^\x00-\x7F]/; |
130 | return ((map { $_ ? [ unpack "U0C*", $_ ] : $_ } ( $n, $u, $l )), |
131 | [map { ord $_ } split //,$str]); |
132 | } |
133 | |
134 | # store an array ref of char data into the appropriate |
135 | # type bins, tracking sizes as we go. |
136 | sub _store { |
137 | my ( $self, $r, @k )= @_; |
138 | for my $z ( @k ) { |
139 | $self->{size}{$z}{ 0 + @$r }++; |
140 | push @{ $self->{data}{$z} }, $r; |
141 | } |
142 | } |
143 | |
144 | # construct a new charclass constructor object. |
145 | # $title ends up in the code a as a comment. |
146 | # $opcode is the name of the operation the charclass implements. |
147 | # the rest of the arguments are strings that the charclass |
148 | # can match. |
149 | sub new { |
150 | my $class= shift; |
151 | my $title= shift; |
152 | my $opcode= shift; |
153 | my $self= bless { op => $opcode, title => $title }, $class; |
154 | my %seen; |
155 | # convert the strings to the numeric equivelents and store |
156 | # them for later insertion while tracking their sizes. |
157 | foreach my $seq ( @_ ) { |
158 | next if $seen{$seq}++; |
159 | push @{$self->{seq}},$seq; |
160 | my ( $n, $u, $l,$cp )= _uni_latin1( $seq ); |
161 | if ( $n ) { |
162 | _store( $self, $n, qw(n U L) ); |
163 | } else { |
164 | if ( $l ) { |
165 | _store( $self, $l, qw(l L) ); |
166 | } |
167 | _store( $self, $u, qw(u U) ); |
168 | } |
169 | _store($self,$cp,'c'); |
170 | } |
171 | # |
172 | # now construct the tries. For each type of data we insert |
173 | # the data into all the tries of length $size and smaller. |
174 | # |
175 | |
176 | my %allsize; |
177 | foreach my $k ( keys %{ $self->{data} } ) { |
178 | my @size= sort { $b <=> $a } keys %{ $self->{size}{$k} }; |
179 | $self->{size}{$k}=\@size; |
180 | undef @allsize{@size}; |
181 | foreach my $d ( @{ $self->{data}{$k} } ) { |
182 | foreach my $sz ( @size ) { |
183 | last if $sz < @$d; |
184 | $self->{trie}{$k}{$sz} ||= {}; |
185 | _trie_store( $self->{trie}{$k}{$sz}, @$d ); |
186 | } |
187 | } |
188 | #delete $self->{data}{$k}; |
189 | } |
190 | my @size= sort { $b <=> $a } keys %allsize; |
191 | $self->{size}{''}= \@size; |
192 | return $self; |
193 | } |
194 | |
195 | # |
196 | # _cond([$v1,$v2,$v2...],$ofs) |
197 | # |
198 | # converts an array of codepoints into a conditional expression |
199 | # consequtive codepoints are merged into a range test |
200 | # returns a string containing the conditional expression in the form |
201 | # '( li[x]==v || li[x]==y )' When possible we also use range lookups. |
202 | |
203 | sub _cond { |
204 | my ( $c, $ofs,$fmt )= @_; |
205 | $fmt||='((U8*)s)[%d]'; |
206 | # cheapo rangification routine. |
207 | # Convert the first element into a singleton represented |
208 | # as [$x,$x] and then merge the rest in as we go. |
209 | my @v= sort { $a <=> $b } @$c; |
210 | my @r= ( [ ( shift @v ) x 2 ] ); |
211 | for my $n ( @v ) { |
212 | if ( $n == $r[-1][1] + 1 ) { |
213 | $r[-1][1]++; |
214 | } else { |
215 | push @r, [ $n, $n ]; |
216 | } |
217 | } |
218 | @r = map { $_->[0]==$_->[1]-1 ? ([$_->[0],$_->[0]],[$_->[1],$_->[1]]) : $_} @r; |
219 | # sort the ranges by size and order. |
220 | @r= sort { $a->[0] <=> $b->[0] } @r; |
221 | my $alu= sprintf $fmt,$ofs; # C array look up |
222 | |
223 | if ($fmt=~/%d/) { |
224 | # map the ranges into conditions |
225 | @r= map { |
226 | # singleton |
0ccab2bc |
227 | $_->[0] == $_->[1] ? sprintf("$alu == $hex_fmt",$_->[0]) : |
12b72891 |
228 | # range |
0ccab2bc |
229 | sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_) |
12b72891 |
230 | } @r; |
231 | # return the joined results. |
232 | return '( ' . join( " || ", @r ) . ' )'; |
233 | } else { |
234 | return combine($alu,@r); |
235 | } |
236 | } |
237 | |
238 | # |
239 | # Do the condition in such a way that we break out early if the value |
240 | # we are looking at is in between two elements in the list. |
241 | # Currently used only for codepoint macros (depth 1) |
242 | # |
243 | sub combine { |
244 | my $alu=shift; |
245 | local $_ = shift; |
246 | my $txt= $_->[0] == $_->[1] |
0ccab2bc |
247 | ? sprintf("$alu == $hex_fmt",$_->[0]) |
248 | : sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_); |
12b72891 |
249 | return $txt unless @_; |
250 | return "( $txt || ( $alu > $_->[1] && \n".combine($alu,@_)." ) )"; |
251 | } |
252 | |
253 | # recursively convert a trie to an optree represented by |
254 | # [condition,yes,no] where yes and no can be a ref to another optree |
255 | # or a scalar representing code. |
256 | # called by make_optree |
257 | |
258 | sub _trie_to_optree { |
259 | my ( $node, $ofs, $else, $fmt )= @_; |
260 | return $else unless $node; |
261 | $ofs ||= 0; |
262 | if ( $node->{''} ) { |
263 | $else= $ofs; |
264 | } else { |
265 | $else ||= 0; |
266 | } |
267 | my @k= sort { $b->[1] cmp $a->[1] || $a->[0] <=> $b->[0] } |
268 | map { [ $_, Dumper( $node->{$_} ), $node->{$_} ] } |
269 | grep length, keys %$node; |
270 | |
271 | return $ofs if !@k; |
272 | |
273 | my ( $root, $expr ); |
274 | while ( @k ) { |
275 | my @cond= ( $k[0][0] ); |
276 | my $d= $k[0][1]; |
277 | my $r= $k[0][2]; |
278 | shift @k; |
279 | while ( @k && $k[0][1] eq $d ) { |
280 | push @cond, $k[0][0]; |
281 | shift @k; |
282 | } |
283 | my $op= |
284 | [ _cond( \@cond, $ofs, $fmt ), _trie_to_optree( $r, $ofs + 1, $else, $fmt ) ]; |
285 | if ( !$root ) { |
286 | $root= $expr= $op; |
287 | } else { |
288 | push @$expr, $op; |
289 | $expr= $op; |
290 | } |
291 | } |
292 | push @$expr, $else; |
293 | return $root; |
294 | } |
295 | |
296 | # construct the optree for a type. |
297 | # handles the special logic of type ''. |
298 | sub make_optree { |
299 | my ( $self, $type, $size, $fmt )= @_; |
300 | my $else= 0; |
301 | $size||=$self->{size}{$type}[0]; |
302 | $size=1 if $type eq 'c'; |
303 | if ( !$type ) { |
304 | my ( $u, $l ); |
305 | for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) { |
306 | $u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt ); |
307 | } |
308 | for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) { |
309 | $l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt ); |
310 | } |
311 | if ( $u ) { |
312 | $else= [ '(is_utf8)', $u, $l || 0 ]; |
313 | } elsif ( $l ) { |
314 | $else= [ '(!is_utf8)', $l, 0 ]; |
315 | } |
316 | $type= 'n'; |
317 | $size-- while !$self->{trie}{n}{$size}; |
318 | } |
319 | return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt ); |
320 | } |
321 | |
322 | # construct the optree for a type with length checks to prevent buffer |
323 | # overruns. Only one length check is performed per lookup trading code |
324 | # size for speed. |
325 | sub length_optree { |
326 | my ( $self, $type,$fmt )= @_; |
327 | $type ||= ''; |
328 | return $self->{len_op}{$type} if $self->{len_op}{$type}; |
329 | my @size = @{$self->{size}{$type}}; |
330 | |
331 | my ( $root, $expr ); |
332 | foreach my $size ( @size ) { |
333 | my $op= [ |
334 | "( (e) - (s) > " . ( $size - 1 ) . " )", |
335 | $self->make_optree( $type, $size ), |
336 | ]; |
337 | if ( !$root ) { |
338 | $root= $expr= $op; |
339 | } else { |
340 | push @$expr, $op; |
341 | $expr= $op; |
342 | } |
343 | } |
344 | push @$expr, 0; |
345 | return $self->{len_op}{$type}= $root ? $root : $expr->[0]; |
346 | } |
347 | |
348 | # |
349 | # recursively walk an optree and covert it to a huge nested ternary expression. |
350 | # |
351 | sub _optree_to_ternary { |
352 | my ( $node )= @_; |
353 | return $node |
354 | if !ref $node; |
355 | my $depth = 0; |
356 | if ( $node->[0] =~ /\[(\d+)\]/ ) { |
357 | $depth= $1 + 1; |
358 | } |
359 | return sprintf "\n%s( %s ? %s : %s )", " " x $depth, $node->[0], |
360 | _optree_to_ternary( $node->[1] ), _optree_to_ternary( $node->[2] ); |
361 | } |
362 | |
363 | # add \\ to the end of strings in a reasonable neat way. |
364 | sub _macro($) { |
365 | my $str= shift; |
366 | my @lines= split /[^\S\n]*\n/, $str; |
0ccab2bc |
367 | my $macro = join( "\\\n", map { sprintf "%-76s", $_ } @lines ); |
368 | $macro =~ s/ *$//; |
369 | return $macro . "\n\n"; |
12b72891 |
370 | } |
371 | |
372 | # default type extensions. 'uln' dont have one because normally |
373 | # they are used only as part of type '' which doesnt get an extension |
374 | my %ext= ( |
375 | U => '_utf8', |
376 | L => '_latin1', |
377 | c => '_cp', |
378 | |
379 | ); |
380 | |
381 | # produce the ternary, handling arguments and putting on the macro headers |
382 | # and boiler plate |
383 | sub ternary { |
384 | my ( $self, $type, $ext )= @_; |
385 | $type ||= ''; |
386 | $ext = ($ext{$type} || '') . ($ext||""); |
387 | my ($root,$fmt,$arg); |
388 | if ($type eq 'c') { |
389 | $arg= $fmt= 'cp'; |
390 | } else { |
391 | $arg= 's'; |
392 | } |
393 | if ( $type eq 'c' || $ext !~ /safe/) { |
394 | $root= $self->make_optree( $type, 0, $fmt ); |
395 | } else { |
396 | $root= $self->length_optree( $type, $fmt ); |
397 | } |
398 | |
399 | our $parens; |
400 | $parens= qr/ \( (?: (?> [^()]+? ) | (??{$parens}) )+? \) /x; |
401 | my $expr= qr/ |
402 | \( \s* |
403 | ($parens) |
404 | \s* \? \s* |
405 | \( \s* |
406 | ($parens) |
407 | \s* \? \s* |
408 | (\d+|$parens) |
409 | \s* : \s* |
410 | (\d+|$parens) |
411 | \s* \) |
412 | \s* : \s* |
413 | \4 |
414 | \s* \) |
415 | /x; |
416 | my $code= _optree_to_ternary( $root ); |
417 | for ( $code ) { |
418 | s/^\s*//; |
419 | 1 while s/\(\s*($parens)\s*\?\s*1\s*:\s*0\s*\)/$1/g |
420 | || s<$expr><(($1 && $2) ? $3 : $4)>g |
421 | || s<\(\s*($parens)\s*\)><$1>g; |
422 | } |
423 | my @args=($arg); |
424 | push @args,'e' if $ext=~/safe/; |
425 | push @args,'is_utf8' if !$type; |
426 | my $args=join ",",@args; |
427 | return "/*** GENERATED CODE ***/\n" |
428 | . _macro "#define is_$self->{op}$ext($args)\n$code"; |
429 | } |
430 | |
431 | my $path=shift @ARGV; |
432 | if (!$path) { |
433 | $path= "regcharclass.h"; |
434 | if (!-e $path) { $path="../$path" } |
435 | if (!-e $path) { die "Can't find regcharclass.h to update!\n" }; |
436 | } |
437 | |
438 | rename $path,"$path.bak"; |
439 | open my $out_fh,">",$path |
440 | or die "Can't write to '$path':$!"; |
441 | binmode $out_fh; # want unix line endings even when run on win32. |
58fbde93 |
442 | my ($zero) = $0=~/([^\\\/]+)$/; |
12b72891 |
443 | print $out_fh <<"HEADER"; |
58fbde93 |
444 | /* -*- buffer-read-only: t -*- |
445 | * |
446 | * regcharclass.h |
447 | * |
448 | * Copyright (C) 2007, by Larry Wall and others |
449 | * |
450 | * You may distribute under the terms of either the GNU General Public |
451 | * License or the Artistic License, as specified in the README file. |
452 | * |
453 | * !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
454 | * This file is built by Porting/$zero. |
455 | * (Generated at: @{[ scalar gmtime ]} GMT) |
456 | * Any changes made here will be lost! |
457 | */ |
12b72891 |
458 | |
459 | HEADER |
460 | |
461 | my ($op,$title,@strs,@txt); |
462 | my $doit= sub { |
463 | return unless $op; |
464 | my $o= __PACKAGE__->new($title,$op,@strs); |
465 | print $out_fh "/*\n\t$o->{op}: $o->{title}\n\n"; |
466 | print $out_fh join "\n",@txt,"*/",""; |
467 | for ('', 'U', 'L') { |
468 | print $out_fh $o->ternary( $_ ); |
469 | print $out_fh $o->ternary( $_,'_safe' ); |
470 | } |
471 | print $out_fh $o->ternary( 'c' ); |
472 | }; |
473 | while (<DATA>) { |
474 | next unless /\S/; |
475 | chomp; |
476 | if (/^([A-Z]+)/) { |
477 | $doit->(); |
478 | ($op,$title)=split /\s*:\s*/,$_,2; |
479 | @txt=@strs=(); |
480 | } else { |
481 | push @txt, "\t$_"; |
482 | s/#.*$//; |
483 | if (/^0x/) { |
484 | push @strs,map { chr $_ } eval $_; |
485 | } elsif (/^[""'']/) { |
486 | push @strs,eval $_; |
487 | } |
488 | } |
489 | } |
490 | $doit->(); |
58fbde93 |
491 | print $out_fh "/* ex: set ro: */\n"; |
12b72891 |
492 | print "$path has been updated\n"; |
493 | |
494 | __DATA__ |
495 | LNBREAK: Line Break: \R |
496 | "\x0D\x0A" # CRLF - Network (Windows) line ending |
497 | 0x0A # LF | LINE FEED |
498 | 0x0B # VT | VERTICAL TAB |
499 | 0x0C # FF | FORM FEED |
500 | 0x0D # CR | CARRIAGE RETURN |
501 | 0x85 # NEL | NEXT LINE |
502 | 0x2028 # LINE SEPARATOR |
503 | 0x2029 # PARAGRAPH SEPARATOR |
504 | |
505 | HORIZWS: Horizontal Whitespace: \h \H |
506 | 0x09 # HT |
507 | 0x20 # SPACE |
508 | 0xa0 # NBSP |
509 | 0x1680 # OGHAM SPACE MARK |
510 | 0x180e # MONGOLIAN VOWEL SEPARATOR |
511 | 0x2000 # EN QUAD |
512 | 0x2001 # EM QUAD |
513 | 0x2002 # EN SPACE |
514 | 0x2003 # EM SPACE |
515 | 0x2004 # THREE-PER-EM SPACE |
516 | 0x2005 # FOUR-PER-EM SPACE |
517 | 0x2006 # SIX-PER-EM SPACE |
518 | 0x2007 # FIGURE SPACE |
519 | 0x2008 # PUNCTUATION SPACE |
520 | 0x2009 # THIN SPACE |
521 | 0x200A # HAIR SPACE |
522 | 0x202f # NARROW NO-BREAK SPACE |
523 | 0x205f # MEDIUM MATHEMATICAL SPACE |
524 | 0x3000 # IDEOGRAPHIC SPACE |
525 | |
526 | VERTWS: Vertical Whitespace: \v \V |
527 | 0x0A # LF |
528 | 0x0B # VT |
529 | 0x0C # FF |
530 | 0x0D # CR |
531 | 0x85 # NEL |
532 | 0x2028 # LINE SEPARATOR |
533 | 0x2029 # PARAGRAPH SEPARATOR |
534 | |