Porting/regcharclass.pl

   1 package UTF8::Matcher;
   2 use strict;
   3 use warnings;
   4 use Text::Wrap qw(wrap);
   5 use Encode;
   6 use Data::Dumper;
   7
   8 our $hex_fmt="0x%02X";
   9
  10 # Author: Yves Orton (demerphq) 2007.
  11
  12 =pod
  13
  14 Dynamically generates macros for detecting special charclasses
  15 in both latin-1, utf8, and codepoint forms.
  16
  17 To regenerate regcharclass.h, run this script from perl-root. No arguments
  18 are necessary.
  19
  20 Each charclass handler is constructed as follows:
  21 Each string the charclass must match  is rendered as unicode (codepoints>255),
  22 and if possible  as latin1 (codepoints>127), and if possible as "neutral"
  23 (all codepoints<128).
  24
  25 The rendered strings are then inserted into digit-tries by type and length.
  26 With shorter strings being added to tries that are allowed to contain longer
  27 strings, but not vice versa.  Thus the "longest" trie contains all strings
  28 for that charclass.
  29
  30 The following types of trie are generated:
  31
  32   n - Neutral only. All strings in this type have codepoints<128
  33   l - Latin1 only. All strings in this type have a codepoint>127 in them
  34   u - UTF8 only.   All strings in this type have a codepoint>255 in them
  35   L - Latin1. All strings in 'n' and 'l'
  36   U - UTF8.   All string in 'n' and 'u'
  37   c - Codepoint. All strings in U but in codepoint and not utf8 form.
  38
  39 The ternary() routine is responsible for converting the trie data into a
  40 ternary conditional that matches the required set of strings. The generated
  41 macro normally takes at least the argument 's' which is expected to be a
  42 pointer of type C<char *> or C<U8 *>. The condition generated will be
  43 optimised to match the string as efficiently as possible, with range lookups
  44 being used where possible, and in some situations relying on "true" to be 1.
  45
  46 ternary() takes two optional arguments, $type which is one of the above
  47 characters and $ext which is used to add an extra extension to the macro name.
  48
  49 If $type is omitted or false then the generated macro will take an additional
  50 argument, 'is_utf8'.
  51
  52 If $ext has the string 'safe' in it then the generated macro will take an extra
  53 argument 'e' for the end of the string, and all lookups will be length checked
  54 to prevent lookups past e. If 'safe' is not used then the lookup is assumed to
  55 be guaranteed safe, and no 'e' argument is provided  and no length checks are
  56 made during execution.
  57
  58 The 'c' type is different as compared to the rest. Instead of producing
  59 a condition that does octet comparisons of a string array, the 'c' type
  60 produces a macro that takes a single codepoint as an argument (instead of a
  61 char* or U8*) and does the lookup based on only that char, thus it cannot be
  62 used to match multi-codepoint sequences like "\r\n" in the LNBREAK charclass.
  63 This is primarily used for populating charclass bitmaps for codepoints 0..255
  64 but will also match codepoints in the unicode range if necessary.
  65
  66 Using LNBREAK as an example the following macros will be produced:
  67
  68 =over 4
  69
  70 =item is_LNBREAK(s,is_utf8)
  71
  72 =item is_LNBREAK_safe(s,e,is_utf8)
  73
  74 Do a lookup as apporpriate based on the is_utf8 flag. When possible
  75 comparisons involving octect<128 are done before checking the is_utf8
  76 flag, hopefully saving time.
  77
  78 =item is_LNBREAK_utf8(s)
  79
  80 =item is_LNBREAK_utf8_safe(s,e)
  81
  82 Do a lookup assuming the string is encoded in (normalized) UTF8.
  83
  84 =item is_LNBREAK_latin1(s)
  85
  86 =item is_LNBREAK_latin1_safe(s,e)
  87
  88 Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
  89
  90 =item is_LNBREAK_cp(cp)
  91
  92 Check to see if the string matches a given codepoint (hypotethically a
  93 U32). The condition is constructed as as to "break out" as early as
  94 possible if the codepoint is out of range of the condition.
  95
  96 IOW:
  97
  98   (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
  99
 100 Thus if the character is X+1 only two comparisons will be done. Making
 101 matching lookups slower, but non-matching faster.
 102
 103 =back
 104
 105 =cut
 106
 107 # store a list of numbers into a hash based trie.
 108 sub _trie_store {
 109     my $root= shift;
 110     foreach my $b ( @_ ) {
 111         $root->{$b} ||= {};
 112         $root= $root->{$b};
 113     }
 114     $root->{''}++;
 115 }
 116
 117 # Convert a string into its neutral, latin1, utf8 forms, where
 118 # the form is undefined unless the string can be completely represented
 119 # in that form. The string is then decomposed into the octects representing
 120 # it. A list is returned for each. Additional a list of codepoints making
 121 # up the string.
 122 # returns (\@n,\@u,\@l,\@cp)
 123 #
 124 sub _uni_latin1 {
 125     my $str= shift;
 126     my $u= eval { Encode::encode( "utf8",       "$str", Encode::FB_CROAK ) };
 127     my $l= eval { Encode::encode( "iso-8859-1", "$str", Encode::FB_CROAK ) };
 128     my $n= $l;
 129     undef $n if defined( $n ) && $str =~ /[^\x00-\x7F]/;
 130     return ((map { $_ ? [ unpack "U0C*", $_ ] : $_ } ( $n, $u, $l )),
 131             [map { ord $_ } split //,$str]);
 132 }
 133
 134 # store an array ref of char data into the appropriate
 135 # type bins, tracking sizes as we go.
 136 sub _store {
 137     my ( $self, $r, @k )= @_;
 138     for my $z ( @k ) {
 139         $self->{size}{$z}{ 0 + @$r }++;
 140         push @{ $self->{data}{$z} }, $r;
 141     }
 142 }
 143
 144 # construct a new charclass constructor object.
 145 # $title ends up in the code a as a comment.
 146 # $opcode is the name of the operation the charclass implements.
 147 # the rest of the arguments are strings that the charclass
 148 # can match.
 149 sub new {
 150     my $class= shift;
 151     my $title= shift;
 152     my $opcode= shift;
 153     my $self= bless { op => $opcode, title => $title }, $class;
 154     my %seen;
 155     # convert the strings to the numeric equivelents and store
 156     # them for later insertion while tracking their sizes.
 157     foreach my $seq ( @_ ) {
 158         next if $seen{$seq}++;
 159         push @{$self->{seq}},$seq;
 160         my ( $n, $u, $l,$cp )= _uni_latin1( $seq );
 161         if ( $n ) {
 162             _store( $self, $n, qw(n U L) );
 163         } else {
 164             if ( $l ) {
 165                 _store( $self, $l, qw(l L) );
 166             }
 167             _store( $self, $u, qw(u U) );
 168         }
 169         _store($self,$cp,'c');
 170     }
 171     #
 172     # now construct the tries. For each type of data we insert
 173     # the data into all the tries of length $size and smaller.
 174     #
 175
 176     my %allsize;
 177     foreach my $k ( keys %{ $self->{data} } ) {
 178         my @size= sort { $b <=> $a } keys %{ $self->{size}{$k} };
 179         $self->{size}{$k}=\@size;
 180         undef @allsize{@size};
 181         foreach my $d ( @{ $self->{data}{$k} } ) {
 182             foreach my $sz ( @size ) {
 183                 last if $sz < @$d;
 184                 $self->{trie}{$k}{$sz} ||= {};
 185                 _trie_store( $self->{trie}{$k}{$sz}, @$d );
 186             }
 187         }
 188         #delete $self->{data}{$k};
 189     }
 190     my @size= sort { $b <=> $a } keys %allsize;
 191     $self->{size}{''}= \@size;
 192     return $self;
 193 }
 194
 195 #
 196 # _cond([$v1,$v2,$v2...],$ofs)
 197 #
 198 # converts an array of codepoints into a conditional expression
 199 # consequtive codepoints are merged into a range test
 200 # returns a string containing the conditional expression in the form
 201 # '( li[x]==v || li[x]==y )' When possible we also use range lookups.
 202
 203 sub _cond {
 204     my ( $c, $ofs,$fmt )= @_;
 205     $fmt||='((U8*)s)[%d]';
 206     # cheapo rangification routine.
 207     # Convert the first element into a singleton represented
 208     # as [$x,$x] and then merge the rest in as we go.
 209     my @v= sort { $a <=> $b } @$c;
 210     my @r= ( [ ( shift @v ) x 2 ] );
 211     for my $n ( @v ) {
 212         if ( $n == $r[-1][1] + 1 ) {
 213             $r[-1][1]++;
 214         } else {
 215             push @r, [ $n, $n ];
 216         }
 217     }
 218     @r = map { $_->[0]==$_->[1]-1 ? ([$_->[0],$_->[0]],[$_->[1],$_->[1]]) : $_} @r;
 219     # sort the ranges by size and order.
 220     @r= sort { $a->[0] <=> $b->[0] }  @r;
 221     my $alu= sprintf $fmt,$ofs;    # C array look up
 222
 223     if ($fmt=~/%d/) {
 224         # map the ranges into conditions
 225         @r= map {
 226             # singleton
 227             $_->[0] == $_->[1] ? sprintf("$alu == $hex_fmt",$_->[0]) :
 228             # range
 229             sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_)
 230         } @r;
 231         # return the joined results.
 232         return '( ' . join( " || ", @r ) . ' )';
 233     } else {
 234         return combine($alu,@r);
 235     }
 236 }
 237
 238 #
 239 # Do the condition in such a way that we break out early if the value
 240 # we are looking at is in between two elements in the list.
 241 # Currently used only for codepoint macros (depth 1)
 242 #
 243 sub combine {
 244     my $alu=shift;
 245     local $_ = shift;
 246     my $txt= $_->[0] == $_->[1]
 247            ? sprintf("$alu == $hex_fmt",$_->[0])
 248            : sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_);
 249     return $txt unless @_;
 250     return sprintf "( %s ||( %s > 0x%02X &&\n%s ) )",
 251         $txt,$alu,$_->[1],combine($alu,@_);
 252 }
 253
 254 # recursively convert a trie to an optree represented by
 255 # [condition,yes,no] where  yes and no can be a ref to another optree
 256 # or a scalar representing code.
 257 # called by make_optree
 258
 259 sub _trie_to_optree {
 260     my ( $node, $ofs, $else, $fmt )= @_;
 261     return $else unless $node;
 262     $ofs ||= 0;
 263     if ( $node->{''} ) {
 264         $else= $ofs;
 265     } else {
 266         $else ||= 0;
 267     }
 268     my @k= sort { $b->[1] cmp $a->[1] || $a->[0] <=> $b->[0] }
 269       map { [ $_, Dumper( $node->{$_} ), $node->{$_} ] }
 270       grep length, keys %$node;
 271
 272     return $ofs if !@k;
 273
 274     my ( $root, $expr );
 275     while ( @k ) {
 276         my @cond= ( $k[0][0] );
 277         my $d= $k[0][1];
 278         my $r= $k[0][2];
 279         shift @k;
 280         while ( @k && $k[0][1] eq $d ) {
 281             push @cond, $k[0][0];
 282             shift @k;
 283         }
 284         my $op=
 285           [ _cond( \@cond, $ofs, $fmt ), _trie_to_optree( $r, $ofs + 1, $else, $fmt ) ];
 286         if ( !$root ) {
 287             $root= $expr= $op;
 288         } else {
 289             push @$expr, $op;
 290             $expr= $op;
 291         }
 292     }
 293     push @$expr, $else;
 294     return $root;
 295 }
 296
 297 # construct the optree for a type.
 298 # handles the special logic of type ''.
 299 sub make_optree {
 300     my ( $self, $type, $size, $fmt )= @_;
 301     my $else= 0;
 302     $size||=$self->{size}{$type}[0];
 303     $size=1 if $type eq 'c';
 304     if ( !$type ) {
 305         my ( $u, $l );
 306         if ($self->{trie}{u}) {
 307             for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) {
 308                 $u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt );
 309             }
 310         }
 311         if ($self->{trie}{l}) {
 312             for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) {
 313                 $l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt );
 314             }
 315         }
 316         if ( $u ) {
 317             $else= [ '(is_utf8)', $u, $l || 0 ];
 318         } elsif ( $l ) {
 319             $else= [ '(!is_utf8)', $l, 0 ];
 320         }
 321         $type= 'n';
 322     }
 323     if (!$self->{trie}{$type}) {
 324         return $else;
 325     } else {
 326         $size-- while $size>0 && !$self->{trie}{$type}{$size};
 327         return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt );
 328     }
 329 }
 330
 331 # construct the optree for a type with length checks to prevent buffer
 332 # overruns. Only one length check is performed per lookup trading code
 333 # size for speed.
 334 sub length_optree {
 335     my ( $self, $type,$fmt )= @_;
 336     $type ||= '';
 337     return $self->{len_op}{$type} if $self->{len_op}{$type};
 338     my @size = @{$self->{size}{$type}};
 339
 340     my ( $root, $expr );
 341     foreach my $size ( @size ) {
 342         my $op= [
 343             "( (e) - (s) > " . ( $size - 1 ) . " )",
 344             $self->make_optree( $type, $size ),
 345         ];
 346         if ( !$root ) {
 347             $root= $expr= $op;
 348         } else {
 349             push @$expr, $op;
 350             $expr= $op;
 351         }
 352     }
 353     push @$expr, 0;
 354     return $self->{len_op}{$type}= $root ? $root : $expr->[0];
 355 }
 356
 357 #
 358 # recursively walk an optree and covert it to a huge nested ternary expression.
 359 #
 360 sub _optree_to_ternary {
 361     my ( $node )= @_;
 362     return $node
 363       if !ref $node;
 364     my $depth = 0;
 365     if ( $node->[0] =~ /\[(\d+)\]/ ) {
 366         $depth= $1 + 1;
 367     }
 368     return sprintf "\n%s( %s ? %s : %s )", "  " x $depth, $node->[0],
 369       _optree_to_ternary( $node->[1] ), _optree_to_ternary( $node->[2] );
 370 }
 371
 372 # add \\ to the end of strings in a reasonable neat way.
 373 sub _macro($) {
 374     my $str= shift;
 375     my @lines= split /[^\S\n]*\n/, $str;
 376     my $macro = join( "\\\n", map { sprintf "%-76s", $_ } @lines );
 377     $macro =~ s/  *$//;
 378     return $macro . "\n\n";
 379 }
 380
 381 # default type extensions. 'uln' dont have one because normally
 382 # they are used only as part of type '' which doesnt get an extension
 383 my %ext= (
 384     U => '_utf8',
 385     L => '_latin1',
 386     c => '_cp',
 387
 388 );
 389
 390 # produce the ternary, handling arguments and putting on the macro headers
 391 # and boiler plate
 392 sub ternary {
 393     my ( $self, $type, $ext )= @_;
 394     $type ||= '';
 395     $ext = ($ext{$type} || '') . ($ext||"");
 396     my ($root,$fmt,$arg);
 397     if ($type eq 'c') {
 398         $arg= $fmt= 'cp';
 399     } else {
 400         $arg= 's';
 401     }
 402     if ( $type eq 'c' || $ext !~ /safe/) {
 403         $root= $self->make_optree( $type, 0, $fmt );
 404     } else {
 405         $root= $self->length_optree( $type, $fmt );
 406     }
 407
 408     our $parens;
 409     $parens= qr/ \( (?: (?> [^()]+? ) | (??{$parens}) )+? \) /x;
 410     my $expr= qr/
 411         \( \s*
 412         ($parens)
 413         \s* \? \s*
 414         \( \s*
 415         ($parens)
 416         \s* \? \s*
 417         (\d+|$parens)
 418         \s* : \s*
 419         (\d+|$parens)
 420         \s* \)
 421         \s* : \s*
 422         \4
 423         \s* \)
 424     /x;
 425     my $code= _optree_to_ternary( $root );
 426     for ( $code ) {
 427         s/^\s*//;
 428         1 while s/\(\s*($parens)\s*\?\s*1\s*:\s*0\s*\)/$1/g
 429           || s<$expr><(($1 && $2) ? $3 : $4)>g
 430           || s<\(\s*($parens)\s*\)><$1>g;
 431     }
 432     my @args=($arg);
 433     push @args,'e' if $ext=~/safe/;
 434     push @args,'is_utf8' if !$type;
 435     my $args=join ",",@args;
 436     return "/*** GENERATED CODE ***/\n"
 437           . _macro "#define is_$self->{op}$ext($args)\n$code";
 438 }
 439 $|++;
 440 my $path=shift @ARGV;
 441
 442 if (!$path) {
 443     $path= "regcharclass.h";
 444     if (!-e $path) { $path="../$path" }
 445     if (!-e $path) { die "Can't find regcharclass.h to update!\n" };
 446 }
 447 my $out_fh;
 448 if ($path eq '-') {
 449     $out_fh= \*STDOUT;
 450 } else {
 451     rename $path,"$path.bak";
 452     open $out_fh,">",$path
 453         or die "Can't write to '$path':$!";
 454     binmode $out_fh; # want unix line endings even when run on win32.
 455 }
 456 my ($zero) = $0=~/([^\\\/]+)$/;
 457 print $out_fh <<"HEADER";
 458 /*  -*- buffer-read-only: t -*-
 459  *
 460  *    regcharclass.h
 461  *
 462  *    Copyright (C) 2007, by Larry Wall and others
 463  *
 464  *    You may distribute under the terms of either the GNU General Public
 465  *    License or the Artistic License, as specified in the README file.
 466  *
 467  * !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 468  * This file is built by Porting/$zero.
 469  * (Generated at: @{[ scalar gmtime ]} GMT)
 470  * Any changes made here will be lost!
 471  */
 472
 473 HEADER
 474
 475 my ($op,$title,@strs,@txt,$type);
 476 my $doit= sub {
 477     return unless $op;
 478     my $o= __PACKAGE__->new($title,$op,@strs);
 479     print $out_fh "/*\n\t$o->{op}: $o->{title}\n\n";
 480     print $out_fh join "\n",@txt,"*/","";
 481     $type||="U L c _safe";
 482     my @ext=("");
 483     my @types=("",map{ if (length $_>1) { push @ext,$_; () } else { $_ } }
 484               split /\s+/,$type);
 485     for my $type (@types) {
 486         for my $ext (@ext) {
 487             next if $type eq 'c' and $ext eq '_safe';
 488             print $out_fh $o->ternary( $type,$ext );
 489         }
 490     }
 491 };
 492 while (<DATA>) {
 493     next unless /\S/;
 494     chomp;
 495     if (/^([A-Z]+)/) {
 496         $doit->();
 497         ($op,$title)=split /\s*:\s*/,$_,2;
 498         @txt=@strs=();
 499         $type="";
 500     } elsif (/^=(.*)/) {
 501         $type.=$1;
 502     } else {
 503         push @txt, "\t$_";
 504         s/#.*$//;
 505         if (/^0x/) {
 506             push @strs,map { chr $_ } eval $_;
 507         } elsif (/^[""'']/) {
 508             push @strs,eval $_;
 509         }
 510     }
 511 }
 512 $doit->();
 513 print $out_fh "/* ex: set ro: */\n";
 514
 515 __DATA__
 516 LNBREAK: Line Break: \R
 517 "\x0D\x0A"      # CRLF - Network (Windows) line ending
 518 0x0A            # LF  | LINE FEED
 519 0x0B            # VT  | VERTICAL TAB
 520 0x0C            # FF  | FORM FEED
 521 0x0D            # CR  | CARRIAGE RETURN
 522 0x85            # NEL | NEXT LINE
 523 0x2028          # LINE SEPARATOR
 524 0x2029          # PARAGRAPH SEPARATOR
 525
 526 HORIZWS: Horizontal Whitespace: \h \H
 527 0x09            # HT
 528 0x20            # SPACE
 529 0xa0            # NBSP
 530 0x1680          # OGHAM SPACE MARK
 531 0x180e          # MONGOLIAN VOWEL SEPARATOR
 532 0x2000          # EN QUAD
 533 0x2001          # EM QUAD
 534 0x2002          # EN SPACE
 535 0x2003          # EM SPACE
 536 0x2004          # THREE-PER-EM SPACE
 537 0x2005          # FOUR-PER-EM SPACE
 538 0x2006          # SIX-PER-EM SPACE
 539 0x2007          # FIGURE SPACE
 540 0x2008          # PUNCTUATION SPACE
 541 0x2009          # THIN SPACE
 542 0x200A          # HAIR SPACE
 543 0x202f          # NARROW NO-BREAK SPACE
 544 0x205f          # MEDIUM MATHEMATICAL SPACE
 545 0x3000          # IDEOGRAPHIC SPACE
 546
 547 VERTWS: Vertical Whitespace: \v \V
 548 0x0A            # LF
 549 0x0B            # VT
 550 0x0C            # FF
 551 0x0D            # CR
 552 0x85            # NEL
 553 0x2028          # LINE SEPARATOR
 554 0x2029          # PARAGRAPH SEPARATOR
 555
 556 TRICKYFOLD: Problematic fold case letters.
 557 0x00DF  # LATIN SMALL LETTER SHARP S
 558 0x0390  # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
 559 0x03B0  # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS