4 use Text::Wrap qw(wrap);
10 # Author: Yves Orton (demerphq) 2007.
14 Dynamically generates macros for detecting special charclasses
15 in both latin-1, utf8, and codepoint forms.
17 To regenerate regcharclass.h, run this script from perl-root. No arguments
20 Each charclass handler is constructed as follows:
21 Each string the charclass must match is rendered as unicode (codepoints>255),
22 and if possible as latin1 (codepoints>127), and if possible as "neutral"
25 The rendered strings are then inserted into digit-tries by type and length.
26 With shorter strings being added to tries that are allowed to contain longer
27 strings, but not vice versa. Thus the "longest" trie contains all strings
30 The following types of trie are generated:
32 n - Neutral only. All strings in this type have codepoints<128
33 l - Latin1 only. All strings in this type have a codepoint>127 in them
34 u - UTF8 only. All strings in this type have a codepoint>255 in them
35 L - Latin1. All strings in 'n' and 'l'
36 U - UTF8. All string in 'n' and 'u'
37 c - Codepoint. All strings in U but in codepoint and not utf8 form.
39 The ternary() routine is responsible for converting the trie data into a
40 ternary conditional that matches the required set of strings. The generated
41 macro normally takes at least the argument 's' which is expected to be a
42 pointer of type C<char *> or C<U8 *>. The condition generated will be
43 optimised to match the string as efficiently as possible, with range lookups
44 being used where possible, and in some situations relying on "true" to be 1.
46 ternary() takes two optional arguments, $type which is one of the above
47 characters and $ext which is used to add an extra extension to the macro name.
49 If $type is omitted or false then the generated macro will take an additional
52 If $ext has the string 'safe' in it then the generated macro will take an extra
53 argument 'e' for the end of the string, and all lookups will be length checked
54 to prevent lookups past e. If 'safe' is not used then the lookup is assumed to
55 be guaranteed safe, and no 'e' argument is provided and no length checks are
56 made during execution.
58 The 'c' type is different as compared to the rest. Instead of producing
59 a condition that does octet comparisons of a string array, the 'c' type
60 produces a macro that takes a single codepoint as an argument (instead of a
61 char* or U8*) and does the lookup based on only that char, thus it cannot be
62 used to match multi-codepoint sequences like "\r\n" in the LNBREAK charclass.
63 This is primarily used for populating charclass bitmaps for codepoints 0..255
64 but will also match codepoints in the unicode range if necessary.
66 Using LNBREAK as an example the following macros will be produced:
70 =item is_LNBREAK(s,is_utf8)
72 =item is_LNBREAK_safe(s,e,is_utf8)
74 Do a lookup as apporpriate based on the is_utf8 flag. When possible
75 comparisons involving octect<128 are done before checking the is_utf8
76 flag, hopefully saving time.
78 =item is_LNBREAK_utf8(s)
80 =item is_LNBREAK_utf8_safe(s,e)
82 Do a lookup assuming the string is encoded in (normalized) UTF8.
84 =item is_LNBREAK_latin1(s)
86 =item is_LNBREAK_latin1_safe(s,e)
88 Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
90 =item is_LNBREAK_cp(cp)
92 Check to see if the string matches a given codepoint (hypotethically a
93 U32). The condition is constructed as as to "break out" as early as
94 possible if the codepoint is out of range of the condition.
98 (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
100 Thus if the character is X+1 only two comparisons will be done. Making
101 matching lookups slower, but non-matching faster.
107 # store a list of numbers into a hash based trie.
110 foreach my $b ( @_ ) {
117 # Convert a string into its neutral, latin1, utf8 forms, where
118 # the form is undefined unless the string can be completely represented
119 # in that form. The string is then decomposed into the octects representing
120 # it. A list is returned for each. Additional a list of codepoints making
122 # returns (\@n,\@u,\@l,\@cp)
126 my $u= eval { Encode::encode( "utf8", "$str", Encode::FB_CROAK ) };
127 my $l= eval { Encode::encode( "iso-8859-1", "$str", Encode::FB_CROAK ) };
129 undef $n if defined( $n ) && $str =~ /[^\x00-\x7F]/;
130 return ((map { $_ ? [ unpack "U0C*", $_ ] : $_ } ( $n, $u, $l )),
131 [map { ord $_ } split //,$str]);
134 # store an array ref of char data into the appropriate
135 # type bins, tracking sizes as we go.
137 my ( $self, $r, @k )= @_;
139 $self->{size}{$z}{ 0 + @$r }++;
140 push @{ $self->{data}{$z} }, $r;
144 # construct a new charclass constructor object.
145 # $title ends up in the code a as a comment.
146 # $opcode is the name of the operation the charclass implements.
147 # the rest of the arguments are strings that the charclass
153 my $self= bless { op => $opcode, title => $title }, $class;
155 # convert the strings to the numeric equivelents and store
156 # them for later insertion while tracking their sizes.
157 foreach my $seq ( @_ ) {
158 next if $seen{$seq}++;
159 push @{$self->{seq}},$seq;
160 my ( $n, $u, $l,$cp )= _uni_latin1( $seq );
162 _store( $self, $n, qw(n U L) );
165 _store( $self, $l, qw(l L) );
167 _store( $self, $u, qw(u U) );
169 _store($self,$cp,'c');
172 # now construct the tries. For each type of data we insert
173 # the data into all the tries of length $size and smaller.
177 foreach my $k ( keys %{ $self->{data} } ) {
178 my @size= sort { $b <=> $a } keys %{ $self->{size}{$k} };
179 $self->{size}{$k}=\@size;
180 undef @allsize{@size};
181 foreach my $d ( @{ $self->{data}{$k} } ) {
182 foreach my $sz ( @size ) {
184 $self->{trie}{$k}{$sz} ||= {};
185 _trie_store( $self->{trie}{$k}{$sz}, @$d );
188 #delete $self->{data}{$k};
190 my @size= sort { $b <=> $a } keys %allsize;
191 $self->{size}{''}= \@size;
196 # _cond([$v1,$v2,$v2...],$ofs)
198 # converts an array of codepoints into a conditional expression
199 # consequtive codepoints are merged into a range test
200 # returns a string containing the conditional expression in the form
201 # '( li[x]==v || li[x]==y )' When possible we also use range lookups.
204 my ( $c, $ofs,$fmt )= @_;
205 $fmt||='((U8*)s)[%d]';
206 # cheapo rangification routine.
207 # Convert the first element into a singleton represented
208 # as [$x,$x] and then merge the rest in as we go.
209 my @v= sort { $a <=> $b } @$c;
210 my @r= ( [ ( shift @v ) x 2 ] );
212 if ( $n == $r[-1][1] + 1 ) {
218 @r = map { $_->[0]==$_->[1]-1 ? ([$_->[0],$_->[0]],[$_->[1],$_->[1]]) : $_} @r;
219 # sort the ranges by size and order.
220 @r= sort { $a->[0] <=> $b->[0] } @r;
221 my $alu= sprintf $fmt,$ofs; # C array look up
224 # map the ranges into conditions
227 $_->[0] == $_->[1] ? sprintf("$alu == $hex_fmt",$_->[0]) :
229 sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_)
231 # return the joined results.
232 return '( ' . join( " || ", @r ) . ' )';
234 return combine($alu,@r);
239 # Do the condition in such a way that we break out early if the value
240 # we are looking at is in between two elements in the list.
241 # Currently used only for codepoint macros (depth 1)
246 my $txt= $_->[0] == $_->[1]
247 ? sprintf("$alu == $hex_fmt",$_->[0])
248 : sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_);
249 return $txt unless @_;
250 return sprintf "( %s ||( %s > 0x%02X &&\n%s ) )",
251 $txt,$alu,$_->[1],combine($alu,@_);
254 # recursively convert a trie to an optree represented by
255 # [condition,yes,no] where yes and no can be a ref to another optree
256 # or a scalar representing code.
257 # called by make_optree
259 sub _trie_to_optree {
260 my ( $node, $ofs, $else, $fmt )= @_;
261 return $else unless $node;
268 my @k= sort { $b->[1] cmp $a->[1] || $a->[0] <=> $b->[0] }
269 map { [ $_, Dumper( $node->{$_} ), $node->{$_} ] }
270 grep length, keys %$node;
276 my @cond= ( $k[0][0] );
280 while ( @k && $k[0][1] eq $d ) {
281 push @cond, $k[0][0];
285 [ _cond( \@cond, $ofs, $fmt ), _trie_to_optree( $r, $ofs + 1, $else, $fmt ) ];
297 # construct the optree for a type.
298 # handles the special logic of type ''.
300 my ( $self, $type, $size, $fmt )= @_;
302 $size||=$self->{size}{$type}[0];
303 $size=1 if $type eq 'c';
306 if ($self->{trie}{u}) {
307 for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) {
308 $u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt );
311 if ($self->{trie}{l}) {
312 for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) {
313 $l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt );
317 $else= [ '(is_utf8)', $u, $l || 0 ];
319 $else= [ '(!is_utf8)', $l, 0 ];
323 if (!$self->{trie}{$type}) {
326 $size-- while $size>0 && !$self->{trie}{$type}{$size};
327 return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt );
331 # construct the optree for a type with length checks to prevent buffer
332 # overruns. Only one length check is performed per lookup trading code
335 my ( $self, $type,$fmt )= @_;
337 return $self->{len_op}{$type} if $self->{len_op}{$type};
338 my @size = @{$self->{size}{$type}};
341 foreach my $size ( @size ) {
343 "( (e) - (s) > " . ( $size - 1 ) . " )",
344 $self->make_optree( $type, $size ),
354 return $self->{len_op}{$type}= $root ? $root : $expr->[0];
358 # recursively walk an optree and covert it to a huge nested ternary expression.
360 sub _optree_to_ternary {
365 if ( $node->[0] =~ /\[(\d+)\]/ ) {
368 return sprintf "\n%s( %s ? %s : %s )", " " x $depth, $node->[0],
369 _optree_to_ternary( $node->[1] ), _optree_to_ternary( $node->[2] );
372 # add \\ to the end of strings in a reasonable neat way.
375 my @lines= split /[^\S\n]*\n/, $str;
376 my $macro = join( "\\\n", map { sprintf "%-76s", $_ } @lines );
378 return $macro . "\n\n";
381 # default type extensions. 'uln' dont have one because normally
382 # they are used only as part of type '' which doesnt get an extension
390 # produce the ternary, handling arguments and putting on the macro headers
393 my ( $self, $type, $ext )= @_;
395 $ext = ($ext{$type} || '') . ($ext||"");
396 my ($root,$fmt,$arg);
402 if ( $type eq 'c' || $ext !~ /safe/) {
403 $root= $self->make_optree( $type, 0, $fmt );
405 $root= $self->length_optree( $type, $fmt );
409 $parens= qr/ \( (?: (?> [^()]+? ) | (??{$parens}) )+? \) /x;
425 my $code= _optree_to_ternary( $root );
428 1 while s/\(\s*($parens)\s*\?\s*1\s*:\s*0\s*\)/$1/g
429 || s<$expr><(($1 && $2) ? $3 : $4)>g
430 || s<\(\s*($parens)\s*\)><$1>g;
433 push @args,'e' if $ext=~/safe/;
434 push @args,'is_utf8' if !$type;
435 my $args=join ",",@args;
436 return "/*** GENERATED CODE ***/\n"
437 . _macro "#define is_$self->{op}$ext($args)\n$code";
440 my $path=shift @ARGV;
443 $path= "regcharclass.h";
444 if (!-e $path) { $path="../$path" }
445 if (!-e $path) { die "Can't find regcharclass.h to update!\n" };
451 rename $path,"$path.bak";
452 open $out_fh,">",$path
453 or die "Can't write to '$path':$!";
454 binmode $out_fh; # want unix line endings even when run on win32.
456 my ($zero) = $0=~/([^\\\/]+)$/;
457 print $out_fh <<"HEADER";
458 /* -*- buffer-read-only: t -*-
462 * Copyright (C) 2007, by Larry Wall and others
464 * You may distribute under the terms of either the GNU General Public
465 * License or the Artistic License, as specified in the README file.
467 * !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
468 * This file is built by Porting/$zero.
469 * (Generated at: @{[ scalar gmtime ]} GMT)
470 * Any changes made here will be lost!
475 my ($op,$title,@strs,@txt,$type);
478 my $o= __PACKAGE__->new($title,$op,@strs);
479 print $out_fh "/*\n\t$o->{op}: $o->{title}\n\n";
480 print $out_fh join "\n",@txt,"*/","";
481 $type||="U L c _safe";
483 my @types=("",map{ if (length $_>1) { push @ext,$_; () } else { $_ } }
485 for my $type (@types) {
487 next if $type eq 'c' and $ext eq '_safe';
488 print $out_fh $o->ternary( $type,$ext );
497 ($op,$title)=split /\s*:\s*/,$_,2;
506 push @strs,map { chr $_ } eval $_;
507 } elsif (/^[""'']/) {
513 print $out_fh "/* ex: set ro: */\n";
516 LNBREAK: Line Break: \R
517 "\x0D\x0A" # CRLF - Network (Windows) line ending
518 0x0A # LF | LINE FEED
519 0x0B # VT | VERTICAL TAB
520 0x0C # FF | FORM FEED
521 0x0D # CR | CARRIAGE RETURN
522 0x85 # NEL | NEXT LINE
523 0x2028 # LINE SEPARATOR
524 0x2029 # PARAGRAPH SEPARATOR
526 HORIZWS: Horizontal Whitespace: \h \H
530 0x1680 # OGHAM SPACE MARK
531 0x180e # MONGOLIAN VOWEL SEPARATOR
536 0x2004 # THREE-PER-EM SPACE
537 0x2005 # FOUR-PER-EM SPACE
538 0x2006 # SIX-PER-EM SPACE
539 0x2007 # FIGURE SPACE
540 0x2008 # PUNCTUATION SPACE
543 0x202f # NARROW NO-BREAK SPACE
544 0x205f # MEDIUM MATHEMATICAL SPACE
545 0x3000 # IDEOGRAPHIC SPACE
547 VERTWS: Vertical Whitespace: \v \V
553 0x2028 # LINE SEPARATOR
554 0x2029 # PARAGRAPH SEPARATOR
556 TRICKYFOLD: Problematic fold case letters.
557 0x00DF # LATIN SMALL LETTER SHARP S
558 0x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
559 0x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS