Commit | Line | Data |
12b72891 |
1 | package UTF8::Matcher; |
2 | use strict; |
3 | use warnings; |
4 | use Text::Wrap qw(wrap); |
5 | use Encode; |
6 | use Data::Dumper; |
7 | |
8 | # Author: Yves Orton (demerphq) 2007. |
9 | |
10 | =pod |
11 | |
12 | Dynamically generates macros for detecting special charclasses |
13 | in both latin-1, utf8, and codepoint forms. |
14 | |
15 | To regenerate regcharclass.h, run this script from perl-root. No arguments |
16 | are necessary. |
17 | |
18 | Each charclass handler is constructed as follows: |
19 | Each string the charclass must match is rendered as unicode (codepoints>255), |
20 | and if possible as latin1 (codepoints>127), and if possible as "neutral" |
21 | (all codepoints<128). |
22 | |
23 | The rendered strings are then inserted into digit-tries by type and length. |
24 | With shorter strings being added to tries that are allowed to contain longer |
25 | strings, but not vice versa. Thus the "longest" trie contains all strings |
26 | for that charclass. |
27 | |
28 | The following types of trie are generated: |
29 | |
30 | n - Neutral only. All strings in this type have codepoints<128 |
31 | l - Latin1 only. All strings in this type have a codepoint>127 in them |
32 | u - UTF8 only. All strings in this type have a codepoint>255 in them |
33 | L - Latin1. All strings in 'n' and 'l' |
34 | U - UTF8. All string in 'n' and 'u' |
35 | c - Codepoint. All strings in U but in codepoint and not utf8 form. |
36 | |
37 | The ternary() routine is responsible for converting the trie data into a |
38 | ternary conditional that matches the required set of strings. The generated |
39 | macro normally takes at least the argument 's' which is expected to be a |
40 | pointer of type C<char *> or C<U8 *>. The condition generated will be |
41 | optimised to match the string as efficiently as possible, with range lookups |
42 | being used where possible, and in some situations relying on "true" to be 1. |
43 | |
44 | ternary() takes two optional arguments, $type which is one of the above |
45 | characters and $ext which is used to add an extra extension to the macro name. |
46 | |
47 | If $type is omitted or false then the generated macro will take an additional |
48 | argument, 'is_utf8'. |
49 | |
50 | If $ext has the string 'safe' in it then the generated macro will take an extra |
51 | argument 'e' for the end of the string, and all lookups will be length checked |
52 | to prevent lookups past e. If 'safe' is not used then the lookup is assumed to |
53 | be guaranteed safe, and no 'e' argument is provided and no length checks are |
54 | made during execution. |
55 | |
56 | The 'c' type is different as compared to the rest. Instead of producing |
57 | a condition that does octet comparisons of a string array, the 'c' type |
58 | produces a macro that takes a single codepoint as an argument (instead of a |
59 | char* or U8*) and does the lookup based on only that char, thus it cannot be |
60 | used to match multi-codepoint sequences like "\r\n" in the LNBREAK charclass. |
61 | This is primarily used for populating charclass bitmaps for codepoints 0..255 |
62 | but will also match codepoints in the unicode range if necessary. |
63 | |
64 | Using LNBREAK as an example the following macros will be produced: |
65 | |
66 | =over 4 |
67 | |
68 | =item is_LNBREAK(s,is_utf8) |
69 | |
70 | =item is_LNBREAK_safe(s,e,is_utf8) |
71 | |
72 | Do a lookup as apporpriate based on the is_utf8 flag. When possible |
73 | comparisons involving octect<128 are done before checking the is_utf8 |
74 | flag, hopefully saving time. |
75 | |
76 | =item is_LNBREAK_utf8(s) |
77 | |
78 | =item is_LNBREAK_utf8_safe(s,e) |
79 | |
80 | Do a lookup assuming the string is encoded in (normalized) UTF8. |
81 | |
82 | =item is_LNBREAK_latin1(s) |
83 | |
84 | =item is_LNBREAK_latin1_safe(s,e) |
85 | |
86 | Do a lookup assuming the string is encoded in latin-1 (aka plan octets). |
87 | |
88 | =item is_LNBREAK_cp(cp) |
89 | |
90 | Check to see if the string matches a given codepoint (hypotethically a |
91 | U32). The condition is constructed as as to "break out" as early as |
92 | possible if the codepoint is out of range of the condition. |
93 | |
94 | IOW: |
95 | |
96 | (cp==X || (cp>X && (cp==Y || (cp>Y && ...)))) |
97 | |
98 | Thus if the character is X+1 only two comparisons will be done. Making |
99 | matching lookups slower, but non-matching faster. |
100 | |
101 | =back |
102 | |
103 | =cut |
104 | |
105 | # store a list of numbers into a hash based trie. |
106 | sub _trie_store { |
107 | my $root= shift; |
108 | foreach my $b ( @_ ) { |
109 | $root->{$b} ||= {}; |
110 | $root= $root->{$b}; |
111 | } |
112 | $root->{''}++; |
113 | } |
114 | |
115 | # Convert a string into its neutral, latin1, utf8 forms, where |
116 | # the form is undefined unless the string can be completely represented |
117 | # in that form. The string is then decomposed into the octects representing |
118 | # it. A list is returned for each. Additional a list of codepoints making |
119 | # up the string. |
120 | # returns (\@n,\@u,\@l,\@cp) |
121 | # |
122 | sub _uni_latin1 { |
123 | my $str= shift; |
124 | my $u= eval { Encode::encode( "utf8", "$str", Encode::FB_CROAK ) }; |
125 | my $l= eval { Encode::encode( "iso-8859-1", "$str", Encode::FB_CROAK ) }; |
126 | my $n= $l; |
127 | undef $n if defined( $n ) && $str =~ /[^\x00-\x7F]/; |
128 | return ((map { $_ ? [ unpack "U0C*", $_ ] : $_ } ( $n, $u, $l )), |
129 | [map { ord $_ } split //,$str]); |
130 | } |
131 | |
132 | # store an array ref of char data into the appropriate |
133 | # type bins, tracking sizes as we go. |
134 | sub _store { |
135 | my ( $self, $r, @k )= @_; |
136 | for my $z ( @k ) { |
137 | $self->{size}{$z}{ 0 + @$r }++; |
138 | push @{ $self->{data}{$z} }, $r; |
139 | } |
140 | } |
141 | |
142 | # construct a new charclass constructor object. |
143 | # $title ends up in the code a as a comment. |
144 | # $opcode is the name of the operation the charclass implements. |
145 | # the rest of the arguments are strings that the charclass |
146 | # can match. |
147 | sub new { |
148 | my $class= shift; |
149 | my $title= shift; |
150 | my $opcode= shift; |
151 | my $self= bless { op => $opcode, title => $title }, $class; |
152 | my %seen; |
153 | # convert the strings to the numeric equivelents and store |
154 | # them for later insertion while tracking their sizes. |
155 | foreach my $seq ( @_ ) { |
156 | next if $seen{$seq}++; |
157 | push @{$self->{seq}},$seq; |
158 | my ( $n, $u, $l,$cp )= _uni_latin1( $seq ); |
159 | if ( $n ) { |
160 | _store( $self, $n, qw(n U L) ); |
161 | } else { |
162 | if ( $l ) { |
163 | _store( $self, $l, qw(l L) ); |
164 | } |
165 | _store( $self, $u, qw(u U) ); |
166 | } |
167 | _store($self,$cp,'c'); |
168 | } |
169 | # |
170 | # now construct the tries. For each type of data we insert |
171 | # the data into all the tries of length $size and smaller. |
172 | # |
173 | |
174 | my %allsize; |
175 | foreach my $k ( keys %{ $self->{data} } ) { |
176 | my @size= sort { $b <=> $a } keys %{ $self->{size}{$k} }; |
177 | $self->{size}{$k}=\@size; |
178 | undef @allsize{@size}; |
179 | foreach my $d ( @{ $self->{data}{$k} } ) { |
180 | foreach my $sz ( @size ) { |
181 | last if $sz < @$d; |
182 | $self->{trie}{$k}{$sz} ||= {}; |
183 | _trie_store( $self->{trie}{$k}{$sz}, @$d ); |
184 | } |
185 | } |
186 | #delete $self->{data}{$k}; |
187 | } |
188 | my @size= sort { $b <=> $a } keys %allsize; |
189 | $self->{size}{''}= \@size; |
190 | return $self; |
191 | } |
192 | |
193 | # |
194 | # _cond([$v1,$v2,$v2...],$ofs) |
195 | # |
196 | # converts an array of codepoints into a conditional expression |
197 | # consequtive codepoints are merged into a range test |
198 | # returns a string containing the conditional expression in the form |
199 | # '( li[x]==v || li[x]==y )' When possible we also use range lookups. |
200 | |
201 | sub _cond { |
202 | my ( $c, $ofs,$fmt )= @_; |
203 | $fmt||='((U8*)s)[%d]'; |
204 | # cheapo rangification routine. |
205 | # Convert the first element into a singleton represented |
206 | # as [$x,$x] and then merge the rest in as we go. |
207 | my @v= sort { $a <=> $b } @$c; |
208 | my @r= ( [ ( shift @v ) x 2 ] ); |
209 | for my $n ( @v ) { |
210 | if ( $n == $r[-1][1] + 1 ) { |
211 | $r[-1][1]++; |
212 | } else { |
213 | push @r, [ $n, $n ]; |
214 | } |
215 | } |
216 | @r = map { $_->[0]==$_->[1]-1 ? ([$_->[0],$_->[0]],[$_->[1],$_->[1]]) : $_} @r; |
217 | # sort the ranges by size and order. |
218 | @r= sort { $a->[0] <=> $b->[0] } @r; |
219 | my $alu= sprintf $fmt,$ofs; # C array look up |
220 | |
221 | if ($fmt=~/%d/) { |
222 | # map the ranges into conditions |
223 | @r= map { |
224 | # singleton |
225 | $_->[0] == $_->[1] ? "$alu==$_->[0]" : |
226 | # range |
227 | "($_->[0]<=$alu && $alu<=$_->[1])" |
228 | } @r; |
229 | # return the joined results. |
230 | return '( ' . join( " || ", @r ) . ' )'; |
231 | } else { |
232 | return combine($alu,@r); |
233 | } |
234 | } |
235 | |
236 | # |
237 | # Do the condition in such a way that we break out early if the value |
238 | # we are looking at is in between two elements in the list. |
239 | # Currently used only for codepoint macros (depth 1) |
240 | # |
241 | sub combine { |
242 | my $alu=shift; |
243 | local $_ = shift; |
244 | my $txt= $_->[0] == $_->[1] |
245 | ? "$alu==$_->[0]" |
246 | : "($_->[0]<=$alu && $alu<=$_->[1])"; |
247 | return $txt unless @_; |
248 | return "( $txt || ( $alu > $_->[1] && \n".combine($alu,@_)." ) )"; |
249 | } |
250 | |
251 | # recursively convert a trie to an optree represented by |
252 | # [condition,yes,no] where yes and no can be a ref to another optree |
253 | # or a scalar representing code. |
254 | # called by make_optree |
255 | |
256 | sub _trie_to_optree { |
257 | my ( $node, $ofs, $else, $fmt )= @_; |
258 | return $else unless $node; |
259 | $ofs ||= 0; |
260 | if ( $node->{''} ) { |
261 | $else= $ofs; |
262 | } else { |
263 | $else ||= 0; |
264 | } |
265 | my @k= sort { $b->[1] cmp $a->[1] || $a->[0] <=> $b->[0] } |
266 | map { [ $_, Dumper( $node->{$_} ), $node->{$_} ] } |
267 | grep length, keys %$node; |
268 | |
269 | return $ofs if !@k; |
270 | |
271 | my ( $root, $expr ); |
272 | while ( @k ) { |
273 | my @cond= ( $k[0][0] ); |
274 | my $d= $k[0][1]; |
275 | my $r= $k[0][2]; |
276 | shift @k; |
277 | while ( @k && $k[0][1] eq $d ) { |
278 | push @cond, $k[0][0]; |
279 | shift @k; |
280 | } |
281 | my $op= |
282 | [ _cond( \@cond, $ofs, $fmt ), _trie_to_optree( $r, $ofs + 1, $else, $fmt ) ]; |
283 | if ( !$root ) { |
284 | $root= $expr= $op; |
285 | } else { |
286 | push @$expr, $op; |
287 | $expr= $op; |
288 | } |
289 | } |
290 | push @$expr, $else; |
291 | return $root; |
292 | } |
293 | |
294 | # construct the optree for a type. |
295 | # handles the special logic of type ''. |
296 | sub make_optree { |
297 | my ( $self, $type, $size, $fmt )= @_; |
298 | my $else= 0; |
299 | $size||=$self->{size}{$type}[0]; |
300 | $size=1 if $type eq 'c'; |
301 | if ( !$type ) { |
302 | my ( $u, $l ); |
303 | for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) { |
304 | $u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt ); |
305 | } |
306 | for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) { |
307 | $l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt ); |
308 | } |
309 | if ( $u ) { |
310 | $else= [ '(is_utf8)', $u, $l || 0 ]; |
311 | } elsif ( $l ) { |
312 | $else= [ '(!is_utf8)', $l, 0 ]; |
313 | } |
314 | $type= 'n'; |
315 | $size-- while !$self->{trie}{n}{$size}; |
316 | } |
317 | return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt ); |
318 | } |
319 | |
320 | # construct the optree for a type with length checks to prevent buffer |
321 | # overruns. Only one length check is performed per lookup trading code |
322 | # size for speed. |
323 | sub length_optree { |
324 | my ( $self, $type,$fmt )= @_; |
325 | $type ||= ''; |
326 | return $self->{len_op}{$type} if $self->{len_op}{$type}; |
327 | my @size = @{$self->{size}{$type}}; |
328 | |
329 | my ( $root, $expr ); |
330 | foreach my $size ( @size ) { |
331 | my $op= [ |
332 | "( (e) - (s) > " . ( $size - 1 ) . " )", |
333 | $self->make_optree( $type, $size ), |
334 | ]; |
335 | if ( !$root ) { |
336 | $root= $expr= $op; |
337 | } else { |
338 | push @$expr, $op; |
339 | $expr= $op; |
340 | } |
341 | } |
342 | push @$expr, 0; |
343 | return $self->{len_op}{$type}= $root ? $root : $expr->[0]; |
344 | } |
345 | |
346 | # |
347 | # recursively walk an optree and covert it to a huge nested ternary expression. |
348 | # |
349 | sub _optree_to_ternary { |
350 | my ( $node )= @_; |
351 | return $node |
352 | if !ref $node; |
353 | my $depth = 0; |
354 | if ( $node->[0] =~ /\[(\d+)\]/ ) { |
355 | $depth= $1 + 1; |
356 | } |
357 | return sprintf "\n%s( %s ? %s : %s )", " " x $depth, $node->[0], |
358 | _optree_to_ternary( $node->[1] ), _optree_to_ternary( $node->[2] ); |
359 | } |
360 | |
361 | # add \\ to the end of strings in a reasonable neat way. |
362 | sub _macro($) { |
363 | my $str= shift; |
364 | my @lines= split /[^\S\n]*\n/, $str; |
365 | return join( "\\\n", map { sprintf "%-76s", $_ } @lines ) . "\n\n"; |
366 | } |
367 | |
368 | # default type extensions. 'uln' dont have one because normally |
369 | # they are used only as part of type '' which doesnt get an extension |
370 | my %ext= ( |
371 | U => '_utf8', |
372 | L => '_latin1', |
373 | c => '_cp', |
374 | |
375 | ); |
376 | |
377 | # produce the ternary, handling arguments and putting on the macro headers |
378 | # and boiler plate |
379 | sub ternary { |
380 | my ( $self, $type, $ext )= @_; |
381 | $type ||= ''; |
382 | $ext = ($ext{$type} || '') . ($ext||""); |
383 | my ($root,$fmt,$arg); |
384 | if ($type eq 'c') { |
385 | $arg= $fmt= 'cp'; |
386 | } else { |
387 | $arg= 's'; |
388 | } |
389 | if ( $type eq 'c' || $ext !~ /safe/) { |
390 | $root= $self->make_optree( $type, 0, $fmt ); |
391 | } else { |
392 | $root= $self->length_optree( $type, $fmt ); |
393 | } |
394 | |
395 | our $parens; |
396 | $parens= qr/ \( (?: (?> [^()]+? ) | (??{$parens}) )+? \) /x; |
397 | my $expr= qr/ |
398 | \( \s* |
399 | ($parens) |
400 | \s* \? \s* |
401 | \( \s* |
402 | ($parens) |
403 | \s* \? \s* |
404 | (\d+|$parens) |
405 | \s* : \s* |
406 | (\d+|$parens) |
407 | \s* \) |
408 | \s* : \s* |
409 | \4 |
410 | \s* \) |
411 | /x; |
412 | my $code= _optree_to_ternary( $root ); |
413 | for ( $code ) { |
414 | s/^\s*//; |
415 | 1 while s/\(\s*($parens)\s*\?\s*1\s*:\s*0\s*\)/$1/g |
416 | || s<$expr><(($1 && $2) ? $3 : $4)>g |
417 | || s<\(\s*($parens)\s*\)><$1>g; |
418 | } |
419 | my @args=($arg); |
420 | push @args,'e' if $ext=~/safe/; |
421 | push @args,'is_utf8' if !$type; |
422 | my $args=join ",",@args; |
423 | return "/*** GENERATED CODE ***/\n" |
424 | . _macro "#define is_$self->{op}$ext($args)\n$code"; |
425 | } |
426 | |
427 | my $path=shift @ARGV; |
428 | if (!$path) { |
429 | $path= "regcharclass.h"; |
430 | if (!-e $path) { $path="../$path" } |
431 | if (!-e $path) { die "Can't find regcharclass.h to update!\n" }; |
432 | } |
433 | |
434 | rename $path,"$path.bak"; |
435 | open my $out_fh,">",$path |
436 | or die "Can't write to '$path':$!"; |
437 | binmode $out_fh; # want unix line endings even when run on win32. |
438 | my ($zero)=$0=~/([^\\\/]+)$/; |
439 | print $out_fh <<"HEADER"; |
440 | /*********************** WARNING WARNING WARNING ************************ |
441 | |
442 | Do not modify this code directly: This file was autogenerated by |
443 | |
444 | Porting/$zero |
445 | |
446 | from data contained within the script. Change the script instead. |
447 | |
448 | Generated at: @{[ scalar gmtime ]} GMT |
449 | |
450 | ************************ WARNING WARNING WARNING ************************/ |
451 | |
452 | HEADER |
453 | |
454 | my ($op,$title,@strs,@txt); |
455 | my $doit= sub { |
456 | return unless $op; |
457 | my $o= __PACKAGE__->new($title,$op,@strs); |
458 | print $out_fh "/*\n\t$o->{op}: $o->{title}\n\n"; |
459 | print $out_fh join "\n",@txt,"*/",""; |
460 | for ('', 'U', 'L') { |
461 | print $out_fh $o->ternary( $_ ); |
462 | print $out_fh $o->ternary( $_,'_safe' ); |
463 | } |
464 | print $out_fh $o->ternary( 'c' ); |
465 | }; |
466 | while (<DATA>) { |
467 | next unless /\S/; |
468 | chomp; |
469 | if (/^([A-Z]+)/) { |
470 | $doit->(); |
471 | ($op,$title)=split /\s*:\s*/,$_,2; |
472 | @txt=@strs=(); |
473 | } else { |
474 | push @txt, "\t$_"; |
475 | s/#.*$//; |
476 | if (/^0x/) { |
477 | push @strs,map { chr $_ } eval $_; |
478 | } elsif (/^[""'']/) { |
479 | push @strs,eval $_; |
480 | } |
481 | } |
482 | } |
483 | $doit->(); |
484 | print "$path has been updated\n"; |
485 | |
486 | __DATA__ |
487 | LNBREAK: Line Break: \R |
488 | "\x0D\x0A" # CRLF - Network (Windows) line ending |
489 | 0x0A # LF | LINE FEED |
490 | 0x0B # VT | VERTICAL TAB |
491 | 0x0C # FF | FORM FEED |
492 | 0x0D # CR | CARRIAGE RETURN |
493 | 0x85 # NEL | NEXT LINE |
494 | 0x2028 # LINE SEPARATOR |
495 | 0x2029 # PARAGRAPH SEPARATOR |
496 | |
497 | HORIZWS: Horizontal Whitespace: \h \H |
498 | 0x09 # HT |
499 | 0x20 # SPACE |
500 | 0xa0 # NBSP |
501 | 0x1680 # OGHAM SPACE MARK |
502 | 0x180e # MONGOLIAN VOWEL SEPARATOR |
503 | 0x2000 # EN QUAD |
504 | 0x2001 # EM QUAD |
505 | 0x2002 # EN SPACE |
506 | 0x2003 # EM SPACE |
507 | 0x2004 # THREE-PER-EM SPACE |
508 | 0x2005 # FOUR-PER-EM SPACE |
509 | 0x2006 # SIX-PER-EM SPACE |
510 | 0x2007 # FIGURE SPACE |
511 | 0x2008 # PUNCTUATION SPACE |
512 | 0x2009 # THIN SPACE |
513 | 0x200A # HAIR SPACE |
514 | 0x202f # NARROW NO-BREAK SPACE |
515 | 0x205f # MEDIUM MATHEMATICAL SPACE |
516 | 0x3000 # IDEOGRAPHIC SPACE |
517 | |
518 | VERTWS: Vertical Whitespace: \v \V |
519 | 0x0A # LF |
520 | 0x0B # VT |
521 | 0x0C # FF |
522 | 0x0D # CR |
523 | 0x85 # NEL |
524 | 0x2028 # LINE SEPARATOR |
525 | 0x2029 # PARAGRAPH SEPARATOR |
526 | |