4 use Text::Wrap qw(wrap);
8 # Author: Yves Orton (demerphq) 2007.
12 Dynamically generates macros for detecting special charclasses
13 in both latin-1, utf8, and codepoint forms.
15 To regenerate regcharclass.h, run this script from perl-root. No arguments
18 Each charclass handler is constructed as follows:
19 Each string the charclass must match is rendered as unicode (codepoints>255),
20 and if possible as latin1 (codepoints>127), and if possible as "neutral"
23 The rendered strings are then inserted into digit-tries by type and length.
24 With shorter strings being added to tries that are allowed to contain longer
25 strings, but not vice versa. Thus the "longest" trie contains all strings
28 The following types of trie are generated:
30 n - Neutral only. All strings in this type have codepoints<128
31 l - Latin1 only. All strings in this type have a codepoint>127 in them
32 u - UTF8 only. All strings in this type have a codepoint>255 in them
33 L - Latin1. All strings in 'n' and 'l'
34 U - UTF8. All string in 'n' and 'u'
35 c - Codepoint. All strings in U but in codepoint and not utf8 form.
37 The ternary() routine is responsible for converting the trie data into a
38 ternary conditional that matches the required set of strings. The generated
39 macro normally takes at least the argument 's' which is expected to be a
40 pointer of type C<char *> or C<U8 *>. The condition generated will be
41 optimised to match the string as efficiently as possible, with range lookups
42 being used where possible, and in some situations relying on "true" to be 1.
44 ternary() takes two optional arguments, $type which is one of the above
45 characters and $ext which is used to add an extra extension to the macro name.
47 If $type is omitted or false then the generated macro will take an additional
50 If $ext has the string 'safe' in it then the generated macro will take an extra
51 argument 'e' for the end of the string, and all lookups will be length checked
52 to prevent lookups past e. If 'safe' is not used then the lookup is assumed to
53 be guaranteed safe, and no 'e' argument is provided and no length checks are
54 made during execution.
56 The 'c' type is different as compared to the rest. Instead of producing
57 a condition that does octet comparisons of a string array, the 'c' type
58 produces a macro that takes a single codepoint as an argument (instead of a
59 char* or U8*) and does the lookup based on only that char, thus it cannot be
60 used to match multi-codepoint sequences like "\r\n" in the LNBREAK charclass.
61 This is primarily used for populating charclass bitmaps for codepoints 0..255
62 but will also match codepoints in the unicode range if necessary.
64 Using LNBREAK as an example the following macros will be produced:
68 =item is_LNBREAK(s,is_utf8)
70 =item is_LNBREAK_safe(s,e,is_utf8)
72 Do a lookup as apporpriate based on the is_utf8 flag. When possible
73 comparisons involving octect<128 are done before checking the is_utf8
74 flag, hopefully saving time.
76 =item is_LNBREAK_utf8(s)
78 =item is_LNBREAK_utf8_safe(s,e)
80 Do a lookup assuming the string is encoded in (normalized) UTF8.
82 =item is_LNBREAK_latin1(s)
84 =item is_LNBREAK_latin1_safe(s,e)
86 Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
88 =item is_LNBREAK_cp(cp)
90 Check to see if the string matches a given codepoint (hypotethically a
91 U32). The condition is constructed as as to "break out" as early as
92 possible if the codepoint is out of range of the condition.
96 (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
98 Thus if the character is X+1 only two comparisons will be done. Making
99 matching lookups slower, but non-matching faster.
105 # store a list of numbers into a hash based trie.
108 foreach my $b ( @_ ) {
115 # Convert a string into its neutral, latin1, utf8 forms, where
116 # the form is undefined unless the string can be completely represented
117 # in that form. The string is then decomposed into the octects representing
118 # it. A list is returned for each. Additional a list of codepoints making
120 # returns (\@n,\@u,\@l,\@cp)
124 my $u= eval { Encode::encode( "utf8", "$str", Encode::FB_CROAK ) };
125 my $l= eval { Encode::encode( "iso-8859-1", "$str", Encode::FB_CROAK ) };
127 undef $n if defined( $n ) && $str =~ /[^\x00-\x7F]/;
128 return ((map { $_ ? [ unpack "U0C*", $_ ] : $_ } ( $n, $u, $l )),
129 [map { ord $_ } split //,$str]);
132 # store an array ref of char data into the appropriate
133 # type bins, tracking sizes as we go.
135 my ( $self, $r, @k )= @_;
137 $self->{size}{$z}{ 0 + @$r }++;
138 push @{ $self->{data}{$z} }, $r;
142 # construct a new charclass constructor object.
143 # $title ends up in the code a as a comment.
144 # $opcode is the name of the operation the charclass implements.
145 # the rest of the arguments are strings that the charclass
151 my $self= bless { op => $opcode, title => $title }, $class;
153 # convert the strings to the numeric equivelents and store
154 # them for later insertion while tracking their sizes.
155 foreach my $seq ( @_ ) {
156 next if $seen{$seq}++;
157 push @{$self->{seq}},$seq;
158 my ( $n, $u, $l,$cp )= _uni_latin1( $seq );
160 _store( $self, $n, qw(n U L) );
163 _store( $self, $l, qw(l L) );
165 _store( $self, $u, qw(u U) );
167 _store($self,$cp,'c');
170 # now construct the tries. For each type of data we insert
171 # the data into all the tries of length $size and smaller.
175 foreach my $k ( keys %{ $self->{data} } ) {
176 my @size= sort { $b <=> $a } keys %{ $self->{size}{$k} };
177 $self->{size}{$k}=\@size;
178 undef @allsize{@size};
179 foreach my $d ( @{ $self->{data}{$k} } ) {
180 foreach my $sz ( @size ) {
182 $self->{trie}{$k}{$sz} ||= {};
183 _trie_store( $self->{trie}{$k}{$sz}, @$d );
186 #delete $self->{data}{$k};
188 my @size= sort { $b <=> $a } keys %allsize;
189 $self->{size}{''}= \@size;
194 # _cond([$v1,$v2,$v2...],$ofs)
196 # converts an array of codepoints into a conditional expression
197 # consequtive codepoints are merged into a range test
198 # returns a string containing the conditional expression in the form
199 # '( li[x]==v || li[x]==y )' When possible we also use range lookups.
202 my ( $c, $ofs,$fmt )= @_;
203 $fmt||='((U8*)s)[%d]';
204 # cheapo rangification routine.
205 # Convert the first element into a singleton represented
206 # as [$x,$x] and then merge the rest in as we go.
207 my @v= sort { $a <=> $b } @$c;
208 my @r= ( [ ( shift @v ) x 2 ] );
210 if ( $n == $r[-1][1] + 1 ) {
216 @r = map { $_->[0]==$_->[1]-1 ? ([$_->[0],$_->[0]],[$_->[1],$_->[1]]) : $_} @r;
217 # sort the ranges by size and order.
218 @r= sort { $a->[0] <=> $b->[0] } @r;
219 my $alu= sprintf $fmt,$ofs; # C array look up
222 # map the ranges into conditions
225 $_->[0] == $_->[1] ? "$alu==$_->[0]" :
227 "($_->[0]<=$alu && $alu<=$_->[1])"
229 # return the joined results.
230 return '( ' . join( " || ", @r ) . ' )';
232 return combine($alu,@r);
237 # Do the condition in such a way that we break out early if the value
238 # we are looking at is in between two elements in the list.
239 # Currently used only for codepoint macros (depth 1)
244 my $txt= $_->[0] == $_->[1]
246 : "($_->[0]<=$alu && $alu<=$_->[1])";
247 return $txt unless @_;
248 return "( $txt || ( $alu > $_->[1] && \n".combine($alu,@_)." ) )";
251 # recursively convert a trie to an optree represented by
252 # [condition,yes,no] where yes and no can be a ref to another optree
253 # or a scalar representing code.
254 # called by make_optree
256 sub _trie_to_optree {
257 my ( $node, $ofs, $else, $fmt )= @_;
258 return $else unless $node;
265 my @k= sort { $b->[1] cmp $a->[1] || $a->[0] <=> $b->[0] }
266 map { [ $_, Dumper( $node->{$_} ), $node->{$_} ] }
267 grep length, keys %$node;
273 my @cond= ( $k[0][0] );
277 while ( @k && $k[0][1] eq $d ) {
278 push @cond, $k[0][0];
282 [ _cond( \@cond, $ofs, $fmt ), _trie_to_optree( $r, $ofs + 1, $else, $fmt ) ];
294 # construct the optree for a type.
295 # handles the special logic of type ''.
297 my ( $self, $type, $size, $fmt )= @_;
299 $size||=$self->{size}{$type}[0];
300 $size=1 if $type eq 'c';
303 for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) {
304 $u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt );
306 for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) {
307 $l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt );
310 $else= [ '(is_utf8)', $u, $l || 0 ];
312 $else= [ '(!is_utf8)', $l, 0 ];
315 $size-- while !$self->{trie}{n}{$size};
317 return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt );
320 # construct the optree for a type with length checks to prevent buffer
321 # overruns. Only one length check is performed per lookup trading code
324 my ( $self, $type,$fmt )= @_;
326 return $self->{len_op}{$type} if $self->{len_op}{$type};
327 my @size = @{$self->{size}{$type}};
330 foreach my $size ( @size ) {
332 "( (e) - (s) > " . ( $size - 1 ) . " )",
333 $self->make_optree( $type, $size ),
343 return $self->{len_op}{$type}= $root ? $root : $expr->[0];
347 # recursively walk an optree and covert it to a huge nested ternary expression.
349 sub _optree_to_ternary {
354 if ( $node->[0] =~ /\[(\d+)\]/ ) {
357 return sprintf "\n%s( %s ? %s : %s )", " " x $depth, $node->[0],
358 _optree_to_ternary( $node->[1] ), _optree_to_ternary( $node->[2] );
361 # add \\ to the end of strings in a reasonable neat way.
364 my @lines= split /[^\S\n]*\n/, $str;
365 return join( "\\\n", map { sprintf "%-76s", $_ } @lines ) . "\n\n";
368 # default type extensions. 'uln' dont have one because normally
369 # they are used only as part of type '' which doesnt get an extension
377 # produce the ternary, handling arguments and putting on the macro headers
380 my ( $self, $type, $ext )= @_;
382 $ext = ($ext{$type} || '') . ($ext||"");
383 my ($root,$fmt,$arg);
389 if ( $type eq 'c' || $ext !~ /safe/) {
390 $root= $self->make_optree( $type, 0, $fmt );
392 $root= $self->length_optree( $type, $fmt );
396 $parens= qr/ \( (?: (?> [^()]+? ) | (??{$parens}) )+? \) /x;
412 my $code= _optree_to_ternary( $root );
415 1 while s/\(\s*($parens)\s*\?\s*1\s*:\s*0\s*\)/$1/g
416 || s<$expr><(($1 && $2) ? $3 : $4)>g
417 || s<\(\s*($parens)\s*\)><$1>g;
420 push @args,'e' if $ext=~/safe/;
421 push @args,'is_utf8' if !$type;
422 my $args=join ",",@args;
423 return "/*** GENERATED CODE ***/\n"
424 . _macro "#define is_$self->{op}$ext($args)\n$code";
427 my $path=shift @ARGV;
429 $path= "regcharclass.h";
430 if (!-e $path) { $path="../$path" }
431 if (!-e $path) { die "Can't find regcharclass.h to update!\n" };
434 rename $path,"$path.bak";
435 open my $out_fh,">",$path
436 or die "Can't write to '$path':$!";
437 binmode $out_fh; # want unix line endings even when run on win32.
438 my ($zero)=$0=~/([^\\\/]+)$/;
439 print $out_fh <<"HEADER";
440 /*********************** WARNING WARNING WARNING ************************
442 Do not modify this code directly: This file was autogenerated by
446 from data contained within the script. Change the script instead.
448 Generated at: @{[ scalar gmtime ]} GMT
450 ************************ WARNING WARNING WARNING ************************/
454 my ($op,$title,@strs,@txt);
457 my $o= __PACKAGE__->new($title,$op,@strs);
458 print $out_fh "/*\n\t$o->{op}: $o->{title}\n\n";
459 print $out_fh join "\n",@txt,"*/","";
461 print $out_fh $o->ternary( $_ );
462 print $out_fh $o->ternary( $_,'_safe' );
464 print $out_fh $o->ternary( 'c' );
471 ($op,$title)=split /\s*:\s*/,$_,2;
477 push @strs,map { chr $_ } eval $_;
478 } elsif (/^[""'']/) {
484 print "$path has been updated\n";
487 LNBREAK: Line Break: \R
488 "\x0D\x0A" # CRLF - Network (Windows) line ending
489 0x0A # LF | LINE FEED
490 0x0B # VT | VERTICAL TAB
491 0x0C # FF | FORM FEED
492 0x0D # CR | CARRIAGE RETURN
493 0x85 # NEL | NEXT LINE
494 0x2028 # LINE SEPARATOR
495 0x2029 # PARAGRAPH SEPARATOR
497 HORIZWS: Horizontal Whitespace: \h \H
501 0x1680 # OGHAM SPACE MARK
502 0x180e # MONGOLIAN VOWEL SEPARATOR
507 0x2004 # THREE-PER-EM SPACE
508 0x2005 # FOUR-PER-EM SPACE
509 0x2006 # SIX-PER-EM SPACE
510 0x2007 # FIGURE SPACE
511 0x2008 # PUNCTUATION SPACE
514 0x202f # NARROW NO-BREAK SPACE
515 0x205f # MEDIUM MATHEMATICAL SPACE
516 0x3000 # IDEOGRAPHIC SPACE
518 VERTWS: Vertical Whitespace: \v \V
524 0x2028 # LINE SEPARATOR
525 0x2029 # PARAGRAPH SEPARATOR