Porting/regcharclass.pl

   1 package UTF8::Matcher;
   2 use strict;
   3 use warnings;
   4 use Text::Wrap qw(wrap);
   5 use Encode;
   6 use Data::Dumper;
   7
   8 # Author: Yves Orton (demerphq) 2007.
   9
  10 =pod
  11
  12 Dynamically generates macros for detecting special charclasses
  13 in both latin-1, utf8, and codepoint forms.
  14
  15 To regenerate regcharclass.h, run this script from perl-root. No arguments
  16 are necessary.
  17
  18 Each charclass handler is constructed as follows:
  19 Each string the charclass must match  is rendered as unicode (codepoints>255),
  20 and if possible  as latin1 (codepoints>127), and if possible as "neutral"
  21 (all codepoints<128).
  22
  23 The rendered strings are then inserted into digit-tries by type and length.
  24 With shorter strings being added to tries that are allowed to contain longer
  25 strings, but not vice versa.  Thus the "longest" trie contains all strings
  26 for that charclass.
  27
  28 The following types of trie are generated:
  29
  30   n - Neutral only. All strings in this type have codepoints<128
  31   l - Latin1 only. All strings in this type have a codepoint>127 in them
  32   u - UTF8 only.   All strings in this type have a codepoint>255 in them
  33   L - Latin1. All strings in 'n' and 'l'
  34   U - UTF8.   All string in 'n' and 'u'
  35   c - Codepoint. All strings in U but in codepoint and not utf8 form.
  36
  37 The ternary() routine is responsible for converting the trie data into a
  38 ternary conditional that matches the required set of strings. The generated
  39 macro normally takes at least the argument 's' which is expected to be a
  40 pointer of type C<char *> or C<U8 *>. The condition generated will be
  41 optimised to match the string as efficiently as possible, with range lookups
  42 being used where possible, and in some situations relying on "true" to be 1.
  43
  44 ternary() takes two optional arguments, $type which is one of the above
  45 characters and $ext which is used to add an extra extension to the macro name.
  46
  47 If $type is omitted or false then the generated macro will take an additional
  48 argument, 'is_utf8'.
  49
  50 If $ext has the string 'safe' in it then the generated macro will take an extra
  51 argument 'e' for the end of the string, and all lookups will be length checked
  52 to prevent lookups past e. If 'safe' is not used then the lookup is assumed to
  53 be guaranteed safe, and no 'e' argument is provided  and no length checks are
  54 made during execution.
  55
  56 The 'c' type is different as compared to the rest. Instead of producing
  57 a condition that does octet comparisons of a string array, the 'c' type
  58 produces a macro that takes a single codepoint as an argument (instead of a
  59 char* or U8*) and does the lookup based on only that char, thus it cannot be
  60 used to match multi-codepoint sequences like "\r\n" in the LNBREAK charclass.
  61 This is primarily used for populating charclass bitmaps for codepoints 0..255
  62 but will also match codepoints in the unicode range if necessary.
  63
  64 Using LNBREAK as an example the following macros will be produced:
  65
  66 =over 4
  67
  68 =item is_LNBREAK(s,is_utf8)
  69
  70 =item is_LNBREAK_safe(s,e,is_utf8)
  71
  72 Do a lookup as apporpriate based on the is_utf8 flag. When possible
  73 comparisons involving octect<128 are done before checking the is_utf8
  74 flag, hopefully saving time.
  75
  76 =item is_LNBREAK_utf8(s)
  77
  78 =item is_LNBREAK_utf8_safe(s,e)
  79
  80 Do a lookup assuming the string is encoded in (normalized) UTF8.
  81
  82 =item is_LNBREAK_latin1(s)
  83
  84 =item is_LNBREAK_latin1_safe(s,e)
  85
  86 Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
  87
  88 =item is_LNBREAK_cp(cp)
  89
  90 Check to see if the string matches a given codepoint (hypotethically a
  91 U32). The condition is constructed as as to "break out" as early as
  92 possible if the codepoint is out of range of the condition.
  93
  94 IOW:
  95
  96   (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
  97
  98 Thus if the character is X+1 only two comparisons will be done. Making
  99 matching lookups slower, but non-matching faster.
 100
 101 =back
 102
 103 =cut
 104
 105 # store a list of numbers into a hash based trie.
 106 sub _trie_store {
 107     my $root= shift;
 108     foreach my $b ( @_ ) {
 109         $root->{$b} ||= {};
 110         $root= $root->{$b};
 111     }
 112     $root->{''}++;
 113 }
 114
 115 # Convert a string into its neutral, latin1, utf8 forms, where
 116 # the form is undefined unless the string can be completely represented
 117 # in that form. The string is then decomposed into the octects representing
 118 # it. A list is returned for each. Additional a list of codepoints making
 119 # up the string.
 120 # returns (\@n,\@u,\@l,\@cp)
 121 #
 122 sub _uni_latin1 {
 123     my $str= shift;
 124     my $u= eval { Encode::encode( "utf8",       "$str", Encode::FB_CROAK ) };
 125     my $l= eval { Encode::encode( "iso-8859-1", "$str", Encode::FB_CROAK ) };
 126     my $n= $l;
 127     undef $n if defined( $n ) && $str =~ /[^\x00-\x7F]/;
 128     return ((map { $_ ? [ unpack "U0C*", $_ ] : $_ } ( $n, $u, $l )),
 129             [map { ord $_ } split //,$str]);
 130 }
 131
 132 # store an array ref of char data into the appropriate
 133 # type bins, tracking sizes as we go.
 134 sub _store {
 135     my ( $self, $r, @k )= @_;
 136     for my $z ( @k ) {
 137         $self->{size}{$z}{ 0 + @$r }++;
 138         push @{ $self->{data}{$z} }, $r;
 139     }
 140 }
 141
 142 # construct a new charclass constructor object.
 143 # $title ends up in the code a as a comment.
 144 # $opcode is the name of the operation the charclass implements.
 145 # the rest of the arguments are strings that the charclass
 146 # can match.
 147 sub new {
 148     my $class= shift;
 149     my $title= shift;
 150     my $opcode= shift;
 151     my $self= bless { op => $opcode, title => $title }, $class;
 152     my %seen;
 153     # convert the strings to the numeric equivelents and store
 154     # them for later insertion while tracking their sizes.
 155     foreach my $seq ( @_ ) {
 156         next if $seen{$seq}++;
 157         push @{$self->{seq}},$seq;
 158         my ( $n, $u, $l,$cp )= _uni_latin1( $seq );
 159         if ( $n ) {
 160             _store( $self, $n, qw(n U L) );
 161         } else {
 162             if ( $l ) {
 163                 _store( $self, $l, qw(l L) );
 164             }
 165             _store( $self, $u, qw(u U) );
 166         }
 167         _store($self,$cp,'c');
 168     }
 169     #
 170     # now construct the tries. For each type of data we insert
 171     # the data into all the tries of length $size and smaller.
 172     #
 173
 174     my %allsize;
 175     foreach my $k ( keys %{ $self->{data} } ) {
 176         my @size= sort { $b <=> $a } keys %{ $self->{size}{$k} };
 177         $self->{size}{$k}=\@size;
 178         undef @allsize{@size};
 179         foreach my $d ( @{ $self->{data}{$k} } ) {
 180             foreach my $sz ( @size ) {
 181                 last if $sz < @$d;
 182                 $self->{trie}{$k}{$sz} ||= {};
 183                 _trie_store( $self->{trie}{$k}{$sz}, @$d );
 184             }
 185         }
 186         #delete $self->{data}{$k};
 187     }
 188     my @size= sort { $b <=> $a } keys %allsize;
 189     $self->{size}{''}= \@size;
 190     return $self;
 191 }
 192
 193 #
 194 # _cond([$v1,$v2,$v2...],$ofs)
 195 #
 196 # converts an array of codepoints into a conditional expression
 197 # consequtive codepoints are merged into a range test
 198 # returns a string containing the conditional expression in the form
 199 # '( li[x]==v || li[x]==y )' When possible we also use range lookups.
 200
 201 sub _cond {
 202     my ( $c, $ofs,$fmt )= @_;
 203     $fmt||='((U8*)s)[%d]';
 204     # cheapo rangification routine.
 205     # Convert the first element into a singleton represented
 206     # as [$x,$x] and then merge the rest in as we go.
 207     my @v= sort { $a <=> $b } @$c;
 208     my @r= ( [ ( shift @v ) x 2 ] );
 209     for my $n ( @v ) {
 210         if ( $n == $r[-1][1] + 1 ) {
 211             $r[-1][1]++;
 212         } else {
 213             push @r, [ $n, $n ];
 214         }
 215     }
 216     @r = map { $_->[0]==$_->[1]-1 ? ([$_->[0],$_->[0]],[$_->[1],$_->[1]]) : $_} @r;
 217     # sort the ranges by size and order.
 218     @r= sort { $a->[0] <=> $b->[0] }  @r;
 219     my $alu= sprintf $fmt,$ofs;    # C array look up
 220
 221     if ($fmt=~/%d/) {
 222         # map the ranges into conditions
 223         @r= map {
 224             # singleton
 225             $_->[0] == $_->[1] ? "$alu==$_->[0]" :
 226             # range
 227             "($_->[0]<=$alu && $alu<=$_->[1])"
 228         } @r;
 229         # return the joined results.
 230         return '( ' . join( " || ", @r ) . ' )';
 231     } else {
 232         return combine($alu,@r);
 233     }
 234 }
 235
 236 #
 237 # Do the condition in such a way that we break out early if the value
 238 # we are looking at is in between two elements in the list.
 239 # Currently used only for codepoint macros (depth 1)
 240 #
 241 sub combine {
 242     my $alu=shift;
 243     local $_ = shift;
 244     my $txt= $_->[0] == $_->[1]
 245            ? "$alu==$_->[0]"
 246            : "($_->[0]<=$alu && $alu<=$_->[1])";
 247     return $txt unless @_;
 248     return "( $txt || ( $alu > $_->[1] && \n".combine($alu,@_)." ) )";
 249 }
 250
 251 # recursively convert a trie to an optree represented by
 252 # [condition,yes,no] where  yes and no can be a ref to another optree
 253 # or a scalar representing code.
 254 # called by make_optree
 255
 256 sub _trie_to_optree {
 257     my ( $node, $ofs, $else, $fmt )= @_;
 258     return $else unless $node;
 259     $ofs ||= 0;
 260     if ( $node->{''} ) {
 261         $else= $ofs;
 262     } else {
 263         $else ||= 0;
 264     }
 265     my @k= sort { $b->[1] cmp $a->[1] || $a->[0] <=> $b->[0] }
 266       map { [ $_, Dumper( $node->{$_} ), $node->{$_} ] }
 267       grep length, keys %$node;
 268
 269     return $ofs if !@k;
 270
 271     my ( $root, $expr );
 272     while ( @k ) {
 273         my @cond= ( $k[0][0] );
 274         my $d= $k[0][1];
 275         my $r= $k[0][2];
 276         shift @k;
 277         while ( @k && $k[0][1] eq $d ) {
 278             push @cond, $k[0][0];
 279             shift @k;
 280         }
 281         my $op=
 282           [ _cond( \@cond, $ofs, $fmt ), _trie_to_optree( $r, $ofs + 1, $else, $fmt ) ];
 283         if ( !$root ) {
 284             $root= $expr= $op;
 285         } else {
 286             push @$expr, $op;
 287             $expr= $op;
 288         }
 289     }
 290     push @$expr, $else;
 291     return $root;
 292 }
 293
 294 # construct the optree for a type.
 295 # handles the special logic of type ''.
 296 sub make_optree {
 297     my ( $self, $type, $size, $fmt )= @_;
 298     my $else= 0;
 299     $size||=$self->{size}{$type}[0];
 300     $size=1 if $type eq 'c';
 301     if ( !$type ) {
 302         my ( $u, $l );
 303         for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) {
 304             $u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt );
 305         }
 306         for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) {
 307             $l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt );
 308         }
 309         if ( $u ) {
 310             $else= [ '(is_utf8)', $u, $l || 0 ];
 311         } elsif ( $l ) {
 312             $else= [ '(!is_utf8)', $l, 0 ];
 313         }
 314         $type= 'n';
 315         $size-- while !$self->{trie}{n}{$size};
 316     }
 317     return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt );
 318 }
 319
 320 # construct the optree for a type with length checks to prevent buffer
 321 # overruns. Only one length check is performed per lookup trading code
 322 # size for speed.
 323 sub length_optree {
 324     my ( $self, $type,$fmt )= @_;
 325     $type ||= '';
 326     return $self->{len_op}{$type} if $self->{len_op}{$type};
 327     my @size = @{$self->{size}{$type}};
 328
 329     my ( $root, $expr );
 330     foreach my $size ( @size ) {
 331         my $op= [
 332             "( (e) - (s) > " . ( $size - 1 ) . " )",
 333             $self->make_optree( $type, $size ),
 334         ];
 335         if ( !$root ) {
 336             $root= $expr= $op;
 337         } else {
 338             push @$expr, $op;
 339             $expr= $op;
 340         }
 341     }
 342     push @$expr, 0;
 343     return $self->{len_op}{$type}= $root ? $root : $expr->[0];
 344 }
 345
 346 #
 347 # recursively walk an optree and covert it to a huge nested ternary expression.
 348 #
 349 sub _optree_to_ternary {
 350     my ( $node )= @_;
 351     return $node
 352       if !ref $node;
 353     my $depth = 0;
 354     if ( $node->[0] =~ /\[(\d+)\]/ ) {
 355         $depth= $1 + 1;
 356     }
 357     return sprintf "\n%s( %s ? %s : %s )", "  " x $depth, $node->[0],
 358       _optree_to_ternary( $node->[1] ), _optree_to_ternary( $node->[2] );
 359 }
 360
 361 # add \\ to the end of strings in a reasonable neat way.
 362 sub _macro($) {
 363     my $str= shift;
 364     my @lines= split /[^\S\n]*\n/, $str;
 365     return join( "\\\n", map { sprintf "%-76s", $_ } @lines ) . "\n\n";
 366 }
 367
 368 # default type extensions. 'uln' dont have one because normally
 369 # they are used only as part of type '' which doesnt get an extension
 370 my %ext= (
 371     U => '_utf8',
 372     L => '_latin1',
 373     c => '_cp',
 374
 375 );
 376
 377 # produce the ternary, handling arguments and putting on the macro headers
 378 # and boiler plate
 379 sub ternary {
 380     my ( $self, $type, $ext )= @_;
 381     $type ||= '';
 382     $ext = ($ext{$type} || '') . ($ext||"");
 383     my ($root,$fmt,$arg);
 384     if ($type eq 'c') {
 385         $arg= $fmt= 'cp';
 386     } else {
 387         $arg= 's';
 388     }
 389     if ( $type eq 'c' || $ext !~ /safe/) {
 390         $root= $self->make_optree( $type, 0, $fmt );
 391     } else {
 392         $root= $self->length_optree( $type, $fmt );
 393     }
 394
 395     our $parens;
 396     $parens= qr/ \( (?: (?> [^()]+? ) | (??{$parens}) )+? \) /x;
 397     my $expr= qr/
 398         \( \s*
 399         ($parens)
 400         \s* \? \s*
 401         \( \s*
 402         ($parens)
 403         \s* \? \s*
 404         (\d+|$parens)
 405         \s* : \s*
 406         (\d+|$parens)
 407         \s* \)
 408         \s* : \s*
 409         \4
 410         \s* \)
 411     /x;
 412     my $code= _optree_to_ternary( $root );
 413     for ( $code ) {
 414         s/^\s*//;
 415         1 while s/\(\s*($parens)\s*\?\s*1\s*:\s*0\s*\)/$1/g
 416           || s<$expr><(($1 && $2) ? $3 : $4)>g
 417           || s<\(\s*($parens)\s*\)><$1>g;
 418     }
 419     my @args=($arg);
 420     push @args,'e' if $ext=~/safe/;
 421     push @args,'is_utf8' if !$type;
 422     my $args=join ",",@args;
 423     return "/*** GENERATED CODE ***/\n"
 424           . _macro "#define is_$self->{op}$ext($args)\n$code";
 425 }
 426
 427 my $path=shift @ARGV;
 428 if (!$path) {
 429     $path= "regcharclass.h";
 430     if (!-e $path) { $path="../$path" }
 431     if (!-e $path) { die "Can't find regcharclass.h to update!\n" };
 432 }
 433
 434 rename $path,"$path.bak";
 435 open my $out_fh,">",$path
 436     or die "Can't write to '$path':$!";
 437 binmode $out_fh; # want unix line endings even when run on win32.
 438 my ($zero) = $0=~/([^\\\/]+)$/;
 439 print $out_fh <<"HEADER";
 440 /*  -*- buffer-read-only: t -*-
 441  *
 442  *    regcharclass.h
 443  *
 444  *    Copyright (C) 2007, by Larry Wall and others
 445  *
 446  *    You may distribute under the terms of either the GNU General Public
 447  *    License or the Artistic License, as specified in the README file.
 448  *
 449  * !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 450  * This file is built by Porting/$zero.
 451  * (Generated at: @{[ scalar gmtime ]} GMT)
 452  * Any changes made here will be lost!
 453  */
 454
 455 HEADER
 456
 457 my ($op,$title,@strs,@txt);
 458 my $doit= sub {
 459     return unless $op;
 460     my $o= __PACKAGE__->new($title,$op,@strs);
 461     print $out_fh "/*\n\t$o->{op}: $o->{title}\n\n";
 462     print $out_fh join "\n",@txt,"*/","";
 463     for ('', 'U', 'L') {
 464         print $out_fh $o->ternary( $_ );
 465         print $out_fh $o->ternary( $_,'_safe' );
 466     }
 467     print $out_fh $o->ternary( 'c' );
 468 };
 469 while (<DATA>) {
 470     next unless /\S/;
 471     chomp;
 472     if (/^([A-Z]+)/) {
 473         $doit->();
 474         ($op,$title)=split /\s*:\s*/,$_,2;
 475         @txt=@strs=();
 476     } else {
 477         push @txt, "\t$_";
 478         s/#.*$//;
 479         if (/^0x/) {
 480             push @strs,map { chr $_ } eval $_;
 481         } elsif (/^[""'']/) {
 482             push @strs,eval $_;
 483         }
 484     }
 485 }
 486 $doit->();
 487 print $out_fh "/* ex: set ro: */\n";
 488 print "$path has been updated\n";
 489
 490 __DATA__
 491 LNBREAK: Line Break: \R
 492 "\x0D\x0A"      # CRLF - Network (Windows) line ending
 493 0x0A            # LF  | LINE FEED
 494 0x0B            # VT  | VERTICAL TAB
 495 0x0C            # FF  | FORM FEED
 496 0x0D            # CR  | CARRIAGE RETURN
 497 0x85            # NEL | NEXT LINE
 498 0x2028          # LINE SEPARATOR
 499 0x2029          # PARAGRAPH SEPARATOR
 500
 501 HORIZWS: Horizontal Whitespace: \h \H
 502 0x09            # HT
 503 0x20            # SPACE
 504 0xa0            # NBSP
 505 0x1680          # OGHAM SPACE MARK
 506 0x180e          # MONGOLIAN VOWEL SEPARATOR
 507 0x2000          # EN QUAD
 508 0x2001          # EM QUAD
 509 0x2002          # EN SPACE
 510 0x2003          # EM SPACE
 511 0x2004          # THREE-PER-EM SPACE
 512 0x2005          # FOUR-PER-EM SPACE
 513 0x2006          # SIX-PER-EM SPACE
 514 0x2007          # FIGURE SPACE
 515 0x2008          # PUNCTUATION SPACE
 516 0x2009          # THIN SPACE
 517 0x200A          # HAIR SPACE
 518 0x202f          # NARROW NO-BREAK SPACE
 519 0x205f          # MEDIUM MATHEMATICAL SPACE
 520 0x3000          # IDEOGRAPHIC SPACE
 521
 522 VERTWS: Vertical Whitespace: \v \V
 523 0x0A            # LF
 524 0x0B            # VT
 525 0x0C            # FF
 526 0x0D            # CR
 527 0x85            # NEL
 528 0x2028          # LINE SEPARATOR
 529 0x2029          # PARAGRAPH SEPARATOR
 530