Porting/regcharclass.pl

   1 package CharClass::Matcher;
   2 use strict;
   3 use warnings;
   4 use warnings FATAL => 'all';
   5 use Text::Wrap qw(wrap);
   6 use Encode;
   7 use Data::Dumper;
   8 $Data::Dumper::Useqq= 1;
   9 our $hex_fmt= "0x%02X";
  10
  11 =head1 NAME
  12
  13 CharClass::Matcher -- Generate C macros that match character classes efficiently
  14
  15 =head1 SYNOPSIS
  16
  17     perl Porting/regcharclass.pl
  18
  19 =head1 DESCRIPTION
  20
  21 Dynamically generates macros for detecting special charclasses
  22 in latin-1, utf8, and codepoint forms. Macros can be set to return
  23 the length (in bytes) of the matched codepoint, or the codepoint itself.
  24
  25 To regenerate regcharclass.h, run this script from perl-root. No arguments
  26 are necessary.
  27
  28 Using WHATEVER as an example the following macros will be produced:
  29
  30 =over 4
  31
  32 =item is_WHATEVER(s,is_utf8)
  33
  34 =item is_WHATEVER_safe(s,e,is_utf8)
  35
  36 Do a lookup as appropriate based on the is_utf8 flag. When possible
  37 comparisons involving octect<128 are done before checking the is_utf8
  38 flag, hopefully saving time.
  39
  40 =item is_WHATEVER_utf8(s)
  41
  42 =item is_WHATEVER_utf8_safe(s,e)
  43
  44 Do a lookup assuming the string is encoded in (normalized) UTF8.
  45
  46 =item is_WHATEVER_latin1(s)
  47
  48 =item is_WHATEVER_latin1_safe(s,e)
  49
  50 Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
  51
  52 =item is_WHATEVER_cp(cp)
  53
  54 Check to see if the string matches a given codepoint (hypotethically a
  55 U32). The condition is constructed as as to "break out" as early as
  56 possible if the codepoint is out of range of the condition.
  57
  58 IOW:
  59
  60   (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
  61
  62 Thus if the character is X+1 only two comparisons will be done. Making
  63 matching lookups slower, but non-matching faster.
  64
  65 =back
  66
  67 Additionally it is possible to generate C<what_> variants that return
  68 the codepoint read instead of the number of octets read, this can be
  69 done by suffixing '-cp' to the type description.
  70
  71 =head2 CODE FORMAT
  72
  73 perltidy  -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f"
  74
  75
  76 =head1 AUTHOR
  77
  78 Author: Yves Orton (demerphq) 2007
  79
  80 =head1 BUGS
  81
  82 No tests directly here (although the regex engine will fail tests
  83 if this code is broken). Insufficient documentation and no Getopts
  84 handler for using the module as a script.
  85
  86 =head1 LICENSE
  87
  88 You may distribute under the terms of either the GNU General Public
  89 License or the Artistic License, as specified in the README file.
  90
  91 =cut
  92
  93 # Sub naming convention:
  94 # __func : private subroutine, can not be called as a method
  95 # _func  : private method, not meant for external use
  96 # func   : public method.
  97
  98 # private subs
  99 #-------------------------------------------------------------------------------
 100 #
 101 # ($cp,$n,$l,$u)=__uni_latin($str);
 102 #
 103 # Return a list of arrays, each of which when interepreted correctly
 104 # represent the string in some given encoding with specific conditions.
 105 #
 106 # $cp - list of codepoints that make up the string.
 107 # $n  - list of octets that make up the string if all codepoints < 128
 108 # $l  - list of octets that make up the string in latin1 encoding if all
 109 #       codepoints < 256, and at least one codepoint is >127.
 110 # $u  - list of octets that make up the string in utf8 if any codepoint >127
 111 #
 112 #   High CP | Defined
 113 #-----------+----------
 114 #   0 - 127 : $n
 115 # 128 - 255 : $l, $u
 116 # 256 - ... : $u
 117 #
 118
 119 sub __uni_latin1 {
 120     my $str= shift;
 121     my $max= 0;
 122     my @cp;
 123     for my $ch ( split //, $str ) {
 124         my $cp= ord $ch;
 125         push @cp, $cp;
 126         $max= $cp if $max < $cp;
 127     }
 128     my ( $n, $l, $u );
 129     if ( $max < 128 ) {
 130         $n= [@cp];
 131     } else {
 132         $l= [@cp] if $max && $max < 256;
 133
 134         my $copy= $str;    # must copy string, FB_CROAK makes encode destructive
 135         $u= eval { Encode::encode( "utf8", $copy, Encode::FB_CROAK ) };
 136         # $u is utf8 but with the utf8 flag OFF
 137         # therefore "C*" gets us the values of the bytes involved.
 138         $u= [ unpack "C*", $u ] if defined $u;
 139     }
 140     return ( \@cp, $n, $l, $u );
 141 }
 142
 143 #
 144 # $clean= __clean($expr);
 145 #
 146 # Cleanup a ternary expression, removing unnecessary parens and apply some
 147 # simplifications using regexes.
 148 #
 149
 150 sub __clean {
 151     my ( $expr )= @_;
 152     our $parens;
 153     $parens= qr/ (?> \( (?> (?: (?> [^()]+ ) | (??{ $parens }) )* ) \) ) /x;
 154
 155     #print "$parens\n$expr\n";
 156     1 while $expr =~ s/ \( \s* ( $parens ) \s* \) /$1/gx;
 157     1 while $expr =~ s/ \( \s* ($parens) \s* \? \s*
 158         \( \s* ($parens) \s* \? \s* ($parens|[^:]+?) \s* : \s* ($parens|[^)]+?) \s* \)
 159         \s* : \s* \4 \s* \)/( ( $1 && $2 ) ? $3 : 0 )/gx;
 160     return $expr;
 161 }
 162
 163 #
 164 # $text= __macro(@args);
 165 # Join args together by newlines, and then neatly add backslashes to the end
 166 # of every  line as expected by the C pre-processor for #define's.
 167 #
 168
 169 sub __macro {
 170     my $str= join "\n", @_;
 171     $str =~ s/\s*$//;
 172     my @lines= map { s/\s+$//; s/\t/        /g; $_ } split /\n/, $str;
 173     my $last= pop @lines;
 174     $str= join "\n", ( map { sprintf "%-76s\\", $_ } @lines ), $last;
 175     1 while $str =~ s/^(\t*) {8}/$1\t/gm;
 176     return $str . "\n";
 177 }
 178
 179 #
 180 # my $op=__incrdepth($op);
 181 #
 182 # take an 'op' hashref and add one to it and all its childrens depths.
 183 #
 184
 185 sub __incrdepth {
 186     my $op= shift;
 187     return unless ref $op;
 188     $op->{depth} += 1;
 189     __incrdepth( $op->{yes} );
 190     __incrdepth( $op->{no} );
 191     return $op;
 192 }
 193
 194 # join two branches of an opcode together with a condition, incrementing
 195 # the depth on the yes branch when we do so.
 196 # returns the new root opcode of the tree.
 197 sub __cond_join {
 198     my ( $cond, $yes, $no )= @_;
 199     return {
 200         test  => $cond,
 201         yes   => __incrdepth( $yes ),
 202         no    => $no,
 203         depth => 0,
 204     };
 205 }
 206
 207 # Methods
 208
 209 # constructor
 210 #
 211 # my $obj=CLASS->new(op=>'SOMENAME',title=>'blah',txt=>[..]);
 212 #
 213 # Create a new CharClass::Matcher object by parsing the text in
 214 # the txt array. Currently applies the following rules:
 215 #
 216 # Element starts with C<0x>, line is evaled the result treated as
 217 # a number which is passed to chr().
 218 #
 219 # Element starts with C<">, line is evaled and the result treated
 220 # as a string.
 221 #
 222 # Each string is then stored in the 'strs' subhash as a hash record
 223 # made up of the results of __uni_latin1, using the keynames
 224 # 'low','latin1','utf8', as well as the synthesized 'LATIN1' and
 225 # 'UTF8' which hold a merge of 'low' and their lowercase equivelents.
 226 #
 227 # Size data is tracked per type in the 'size' subhash.
 228 #
 229 # Return an object
 230 #
 231 sub new {
 232     my $class= shift;
 233     my %opt= @_;
 234     for ( qw(op txt) ) {
 235         die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field"
 236           if !exists $opt{$_};
 237     }
 238
 239     my $self= bless {
 240         op    => $opt{op},
 241         title => $opt{title} || '',
 242     }, $class;
 243     foreach my $txt ( @{ $opt{txt} } ) {
 244         my $str= $txt;
 245         if ( $str =~ /^[""]/ ) {
 246             $str= eval $str;
 247         } elsif ( $str =~ /^0x/ ) {
 248             $str= chr eval $str;
 249         } elsif ( /\S/ ) {
 250             die "Unparseable line: $txt\n";
 251         } else {
 252             next;
 253         }
 254         my ( $cp, $low, $latin1, $utf8 )= __uni_latin1( $str );
 255         my $UTF8= $low   || $utf8;
 256         my $LATIN1= $low || $latin1;
 257         #die Dumper($txt,$cp,$low,$latin1,$utf8)
 258         #    if $txt=~/NEL/ or $utf8 and @$utf8>3;
 259
 260         @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 cp UTF8 LATIN1 )}=
 261           ( $str, $txt, $low, $utf8, $latin1, $cp, $UTF8, $LATIN1 );
 262         my $rec= $self->{strs}{$str};
 263         foreach my $key ( qw(low utf8 latin1 cp UTF8 LATIN1) ) {
 264             $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
 265               if $self->{strs}{$str}{$key};
 266         }
 267         $self->{has_multi} ||= @$cp > 1;
 268         $self->{has_ascii} ||= $latin1 && @$latin1;
 269         $self->{has_low}   ||= $low && @$low;
 270         $self->{has_high}  ||= !$low && !$latin1;
 271     }
 272     $self->{val_fmt}= $hex_fmt;
 273     $self->{count}= 0 + keys %{ $self->{strs} };
 274     return $self;
 275 }
 276
 277 # my $trie = make_trie($type,$maxlen);
 278 #
 279 # using the data stored in the object build a trie of a specifc type,
 280 # and with specific maximum depth. The trie is made up the elements of
 281 # the given types array for each string in the object (assuming it is
 282 # not too long.)
 283 #
 284 # returns the trie, or undef if there was no relevent data in the object.
 285 #
 286
 287 sub make_trie {
 288     my ( $self, $type, $maxlen )= @_;
 289
 290     my $strs= $self->{strs};
 291     my %trie;
 292     foreach my $rec ( values %$strs ) {
 293         die "panic: unknown type '$type'"
 294           if !exists $rec->{$type};
 295         my $dat= $rec->{$type};
 296         next unless $dat;
 297         next if $maxlen && @$dat > $maxlen;
 298         my $node= \%trie;
 299         foreach my $elem ( @$dat ) {
 300             $node->{$elem} ||= {};
 301             $node= $node->{$elem};
 302         }
 303         $node->{''}= $rec->{str};
 304     }
 305     return 0 + keys( %trie ) ? \%trie : undef;
 306 }
 307
 308 # my $optree= _optree()
 309 #
 310 # recursively convert a trie to an optree where every node represents
 311 # an if else branch.
 312 #
 313 #
 314
 315 sub _optree {
 316     my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_;
 317     return unless defined $trie;
 318     if ( $self->{has_multi} and $ret_type =~ /cp|both/ ) {
 319         die "Can't do 'cp' optree from multi-codepoint strings";
 320     }
 321     $ret_type ||= 'len';
 322     $else= 0  unless defined $else;
 323     $depth= 0 unless defined $depth;
 324
 325     my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie;
 326     if ( $trie->{''} ) {
 327         if ( $ret_type eq 'cp' ) {
 328             $else= $self->{strs}{ $trie->{''} }{cp}[0];
 329             $else= sprintf "$self->{val_fmt}", $else if $else > 9;
 330         } elsif ( $ret_type eq 'len' ) {
 331             $else= $depth;
 332         } elsif ( $ret_type eq 'both') {
 333             $else= $self->{strs}{ $trie->{''} }{cp}[0];
 334             $else= sprintf "$self->{val_fmt}", $else if $else > 9;
 335             $else= "len=$depth, $else";
 336         }
 337     }
 338     return $else if !@conds;
 339     my $node= {};
 340     my $root= $node;
 341     my ( $yes_res, $as_code, @cond );
 342     my $test= $test_type eq 'cp' ? "cp" : "((U8*)s)[$depth]";
 343     my $Update= sub {
 344         $node->{vals}= [@cond];
 345         $node->{test}= $test;
 346         $node->{yes}= $yes_res;
 347         $node->{depth}= $depth;
 348         $node->{no}= shift;
 349     };
 350     while ( @conds ) {
 351         my $cond= shift @conds;
 352         my $res=
 353           $self->_optree( $trie->{$cond}, $test_type, $ret_type, $else,
 354             $depth + 1 );
 355         my $res_code= Dumper( $res );
 356         if ( !$yes_res || $res_code ne $as_code ) {
 357             if ( $yes_res ) {
 358                 $Update->( {} );
 359                 $node= $node->{no};
 360             }
 361             ( $yes_res, $as_code )= ( $res, $res_code );
 362             @cond= ( $cond );
 363         } else {
 364             push @cond, $cond;
 365         }
 366     }
 367     $Update->( $else );
 368     return $root;
 369 }
 370
 371 # my $optree= optree(%opts);
 372 #
 373 # Convert a trie to an optree, wrapper for _optree
 374
 375 sub optree {
 376     my $self= shift;
 377     my %opt= @_;
 378     my $trie= $self->make_trie( $opt{type}, $opt{max_depth} );
 379     $opt{ret_type} ||= 'len';
 380     my $test_type= $opt{type} eq 'cp' ? 'cp' : 'depth';
 381     return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 );
 382 }
 383
 384 # my $optree= generic_optree(%opts);
 385 #
 386 # build a "generic" optree out of the three 'low', 'latin1', 'utf8'
 387 # sets of strings, including a branch for handling the string type check.
 388 #
 389
 390 sub generic_optree {
 391     my $self= shift;
 392     my %opt= @_;
 393
 394     $opt{ret_type} ||= 'len';
 395     my $test_type= 'depth';
 396     my $else= $opt{else} || 0;
 397
 398     my $latin1= $self->make_trie( 'latin1', $opt{max_depth} );
 399     my $utf8= $self->make_trie( 'utf8',     $opt{max_depth} );
 400
 401     $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 )
 402       for $latin1, $utf8;
 403
 404     if ( $utf8 ) {
 405         $else= __cond_join( "( is_utf8 )", $utf8, $latin1 || $else );
 406     } elsif ( $latin1 ) {
 407         $else= __cond_join( "!( is_utf8 )", $latin1, $else );
 408     }
 409     my $low= $self->make_trie( 'low', $opt{max_depth} );
 410     if ( $low ) {
 411         $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 );
 412     }
 413
 414     return $else;
 415 }
 416
 417 # length_optree()
 418 #
 419 # create a string length guarded optree.
 420 #
 421
 422 sub length_optree {
 423     my $self= shift;
 424     my %opt= @_;
 425     my $type= $opt{type};
 426
 427     die "Can't do a length_optree on type 'cp', makes no sense."
 428       if $type eq 'cp';
 429
 430     my ( @size, $method );
 431
 432     if ( $type eq 'generic' ) {
 433         $method= 'generic_optree';
 434         my %sizes= (
 435             %{ $self->{size}{low}    || {} },
 436             %{ $self->{size}{latin1} || {} },
 437             %{ $self->{size}{utf8}   || {} }
 438         );
 439         @size= sort { $a <=> $b } keys %sizes;
 440     } else {
 441         $method= 'optree';
 442         @size= sort { $a <=> $b } keys %{ $self->{size}{$type} };
 443     }
 444
 445     my $else= ( $opt{else} ||= 0 );
 446     for my $size ( @size ) {
 447         my $optree= $self->$method( %opt, type => $type, max_depth => $size );
 448         my $cond= "((e)-(s) > " . ( $size - 1 ).")";
 449         $else= __cond_join( $cond, $optree, $else );
 450     }
 451     return $else;
 452 }
 453
 454 # _cond_as_str
 455 # turn a list of conditions into a text expression
 456 # - merges ranges of conditions, and joins the result with ||
 457 sub _cond_as_str {
 458     my ( $self, $op, $combine )= @_;
 459     my $cond= $op->{vals};
 460     my $test= $op->{test};
 461     return "( $test )" if !defined $cond;
 462
 463     # rangify the list
 464     my @ranges;
 465     my $Update= sub {
 466         if ( @ranges ) {
 467             if ( $ranges[-1][0] == $ranges[-1][1] ) {
 468                 $ranges[-1]= $ranges[-1][0];
 469             } elsif ( $ranges[-1][0] + 1 == $ranges[-1][1] ) {
 470                 $ranges[-1]= $ranges[-1][0];
 471                 push @ranges, $ranges[-1] + 1;
 472             }
 473         }
 474     };
 475     for my $cond ( @$cond ) {
 476         if ( !@ranges || $cond != $ranges[-1][1] + 1 ) {
 477             $Update->();
 478             push @ranges, [ $cond, $cond ];
 479         } else {
 480             $ranges[-1][1]++;
 481         }
 482     }
 483     $Update->();
 484     return $self->_combine( $test, @ranges )
 485       if $combine;
 486     @ranges= map {
 487         ref $_
 488           ? sprintf(
 489             "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
 490             @$_ )
 491           : sprintf( "$self->{val_fmt} == $test", $_ );
 492     } @ranges;
 493     return "( " . join( " || ", @ranges ) . " )";
 494 }
 495
 496 # _combine
 497 # recursively turn a list of conditions into a fast break-out condition
 498 # used by _cond_as_str() for 'cp' type macros.
 499 sub _combine {
 500     my ( $self, $test, @cond )= @_;
 501     return if !@cond;
 502     my $item= shift @cond;
 503     my ( $cstr, $gtv );
 504     if ( ref $item ) {
 505         $cstr=
 506           sprintf( "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
 507             @$item );
 508         $gtv= sprintf "$self->{val_fmt}", $item->[1];
 509     } else {
 510         $cstr= sprintf( "$self->{val_fmt} == $test", $item );
 511         $gtv= sprintf "$self->{val_fmt}", $item;
 512     }
 513     if ( @cond ) {
 514         return "( $cstr || ( $gtv < $test &&\n"
 515           . $self->_combine( $test, @cond ) . " ) )";
 516     } else {
 517         return $cstr;
 518     }
 519 }
 520
 521 # _render()
 522 # recursively convert an optree to text with reasonably neat formatting
 523 sub _render {
 524     my ( $self, $op, $combine, $brace )= @_;
 525     if ( !ref $op ) {
 526         return $op;
 527     }
 528     my $cond= $self->_cond_as_str( $op, $combine );
 529     my $yes= $self->_render( $op->{yes}, $combine, 1 );
 530     my $no= $self->_render( $op->{no},   $combine, 0 );
 531     return "( $cond )" if $yes eq '1' and $no eq '0';
 532     my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" );
 533     return "$lb$cond ? $yes : $no$rb"
 534       if !ref( $op->{yes} ) && !ref( $op->{no} );
 535     my $ind1= " " x 4;
 536     my $ind= "\n" . ( $ind1 x $op->{depth} );
 537
 538     if ( ref $op->{yes} ) {
 539         $yes= $ind . $ind1 . $yes;
 540     } else {
 541         $yes= " " . $yes;
 542     }
 543
 544     return "$lb$cond ?$yes$ind: $no$rb";
 545 }
 546
 547 # $expr=render($op,$combine)
 548 #
 549 # convert an optree to text with reasonably neat formatting. If $combine
 550 # is true then the condition is created using "fast breakouts" which
 551 # produce uglier expressions that are more efficient for common case,
 552 # longer lists such as that resulting from type 'cp' output.
 553 # Currently only used for type 'cp' macros.
 554 sub render {
 555     my ( $self, $op, $combine )= @_;
 556     my $str= "( " . $self->_render( $op, $combine ) . " )";
 557     return __clean( $str );
 558 }
 559
 560 # make_macro
 561 # make a macro of a given type.
 562 # calls into make_trie and (generic_|length_)optree as needed
 563 # Opts are:
 564 # type     : 'cp','generic','low','latin1','utf8','LATIN1','UTF8'
 565 # ret_type : 'cp' or 'len'
 566 # safe     : add length guards to macro
 567 #
 568 # type defaults to 'generic', and ret_type to 'len' unless type is 'cp'
 569 # in which case it defaults to 'cp' as well.
 570 #
 571 # it is illegal to do a type 'cp' macro on a pattern with multi-codepoint
 572 # sequences in it, as the generated macro will accept only a single codepoint
 573 # as an argument.
 574 #
 575 # returns the macro.
 576
 577
 578 sub make_macro {
 579     my $self= shift;
 580     my %opts= @_;
 581     my $type= $opts{type} || 'generic';
 582     die "Can't do a 'cp' on multi-codepoint character class '$self->{op}'"
 583       if $type eq 'cp'
 584       and $self->{has_multi};
 585     my $ret_type= $opts{ret_type} || ( $opts{type} eq 'cp' ? 'cp' : 'len' );
 586     my $method;
 587     if ( $opts{safe} ) {
 588         $method= 'length_optree';
 589     } elsif ( $type eq 'generic' ) {
 590         $method= 'generic_optree';
 591     } else {
 592         $method= 'optree';
 593     }
 594     my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
 595     my $text= $self->render( $optree, $type eq 'cp' );
 596     my @args= $type eq 'cp' ? 'cp' : 's';
 597     push @args, "e" if $opts{safe};
 598     push @args, "is_utf8" if $type eq 'generic';
 599     push @args, "len" if $ret_type eq 'both';
 600     my $pfx= $ret_type eq 'both'    ? 'what_len_' :
 601              $ret_type eq 'cp'      ? 'what_'     : 'is_';
 602     my $ext= $type     eq 'generic' ? ''          : '_' . lc( $type );
 603     $ext .= "_safe" if $opts{safe};
 604     my $argstr= join ",", @args;
 605     return "/*** GENERATED CODE ***/\n"
 606       . __macro( "#define $pfx$self->{op}$ext($argstr)\n$text" );
 607 }
 608
 609 # if we arent being used as a module (highly likely) then process
 610 # the __DATA__ below and produce macros in regcharclass.h
 611 # if an argument is provided to the script then it is assumed to
 612 # be the path of the file to output to, if the arg is '-' outputs
 613 # to STDOUT.
 614 if ( !caller ) {
 615
 616
 617
 618     $|++;
 619     my $path= shift @ARGV;
 620
 621     if ( !$path ) {
 622         $path= "regcharclass.h";
 623         if ( !-e $path ) { $path= "../$path" }
 624         if ( !-e $path ) { die "Can't find '$path' to update!\n" }
 625     }
 626     my $out_fh;
 627     if ( $path eq '-' ) {
 628         $out_fh= \*STDOUT;
 629     } else {
 630         rename $path, "$path.bak";
 631         open $out_fh, ">", $path
 632           or die "Can't write to '$path':$!";
 633         binmode $out_fh;    # want unix line endings even when run on win32.
 634     }
 635     my ( $zero )= $0 =~ /([^\\\/]+)$/;
 636     print $out_fh <<"HEADER";
 637 /*  -*- buffer-read-only: t -*-
 638  *
 639  *    regcharclass.h
 640  *
 641  *    Copyright (C) 2007, by Larry Wall and others
 642  *
 643  *    You may distribute under the terms of either the GNU General Public
 644  *    License or the Artistic License, as specified in the README file.
 645  *
 646  * !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 647  * This file is built by Porting/$zero.
 648  *
 649  * Any changes made here will be lost!
 650  *
 651  */
 652
 653 HEADER
 654
 655     my ( $op, $title, @txt, @types, @mods );
 656     my $doit= sub {
 657         return unless $op;
 658         print $out_fh "/*\n\t$op: $title\n\n";
 659         print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
 660         my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt );
 661
 662         #die Dumper(\@types,\@mods);
 663
 664         foreach my $type_spec ( @types ) {
 665             my ( $type, $ret )= split /-/, $type_spec;
 666             $ret ||= 'len';
 667             foreach my $mod ( @mods ) {
 668                 next if $mod eq 'safe' and $type eq 'cp';
 669                 my $macro= $obj->make_macro(
 670                     type     => $type,
 671                     ret_type => $ret,
 672                     safe     => $mod eq 'safe'
 673                 );
 674                 print $out_fh $macro, "\n";
 675             }
 676         }
 677     };
 678
 679     while ( <DATA> ) {
 680         s/^\s*#//;
 681         next unless /\S/;
 682         chomp;
 683         if ( /^([A-Z]+)/ ) {
 684             $doit->();
 685             ( $op, $title )= split /\s*:\s*/, $_, 2;
 686             @txt= ();
 687         } elsif ( s/^=>// ) {
 688             my ( $type, $modifier )= split /:/, $_;
 689             @types= split ' ', $type;
 690             @mods= split ' ',  $modifier;
 691         } else {
 692             push @txt, "$_";
 693         }
 694     }
 695     $doit->();
 696     print $out_fh "/* ex: set ro: */\n";
 697     print "updated $path\n" if $path ne '-';
 698 }
 699
 700 #
 701 # Valid types: generic, LATIN1, UTF8, low, latin1, utf8
 702 # default return value is octects read.
 703 # append -cp to make it codepoint matched.
 704 # modifiers come after the colon, valid possibilities
 705 # being 'fast' and 'safe'.
 706 #
 707 1; # in the unlikely case we are being used as a module
 708
 709 __DATA__
 710 LNBREAK: Line Break: \R
 711 => generic UTF8 LATIN1 :fast safe
 712 "\x0D\x0A"      # CRLF - Network (Windows) line ending
 713 0x0A            # LF  | LINE FEED
 714 0x0B            # VT  | VERTICAL TAB
 715 0x0C            # FF  | FORM FEED
 716 0x0D            # CR  | CARRIAGE RETURN
 717 0x85            # NEL | NEXT LINE
 718 0x2028          # LINE SEPARATOR
 719 0x2029          # PARAGRAPH SEPARATOR
 720
 721 HORIZWS: Horizontal Whitespace: \h \H
 722 => generic UTF8 LATIN1 cp :fast safe
 723 0x09            # HT
 724 0x20            # SPACE
 725 0xa0            # NBSP
 726 0x1680          # OGHAM SPACE MARK
 727 0x180e          # MONGOLIAN VOWEL SEPARATOR
 728 0x2000          # EN QUAD
 729 0x2001          # EM QUAD
 730 0x2002          # EN SPACE
 731 0x2003          # EM SPACE
 732 0x2004          # THREE-PER-EM SPACE
 733 0x2005          # FOUR-PER-EM SPACE
 734 0x2006          # SIX-PER-EM SPACE
 735 0x2007          # FIGURE SPACE
 736 0x2008          # PUNCTUATION SPACE
 737 0x2009          # THIN SPACE
 738 0x200A          # HAIR SPACE
 739 0x202f          # NARROW NO-BREAK SPACE
 740 0x205f          # MEDIUM MATHEMATICAL SPACE
 741 0x3000          # IDEOGRAPHIC SPACE
 742
 743 VERTWS: Vertical Whitespace: \v \V
 744 => generic UTF8 LATIN1 cp :fast safe
 745 0x0A            # LF
 746 0x0B            # VT
 747 0x0C            # FF
 748 0x0D            # CR
 749 0x85            # NEL
 750 0x2028          # LINE SEPARATOR
 751 0x2029          # PARAGRAPH SEPARATOR
 752
 753
 754 TRICKYFOLD: Problematic fold case letters.
 755 => generic cp generic-cp generic-both :fast safe
 756 0x00DF  # LATIN1 SMALL LETTER SHARP S
 757 0x0390  # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
 758 0x03B0  # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
 759
 760