Porting/regcharclass.pl

   1 package CharClass::Matcher;
   2 use strict;
   3 use warnings;
   4 use warnings FATAL => 'all';
   5 use Text::Wrap qw(wrap);
   6 use Encode;
   7 use Data::Dumper;
   8 $Data::Dumper::Useqq= 1;
   9 our $hex_fmt= "0x%02X";
  10
  11 =head1 NAME
  12
  13 CharClass::Matcher -- Generate C macros that match character classes efficiently
  14
  15 =head1 SYNOPSIS
  16
  17     perl Porting/regcharclass.pl
  18
  19 =head1 DESCRIPTION
  20
  21 Dynamically generates macros for detecting special charclasses
  22 in latin-1, utf8, and codepoint forms. Macros can be set to return
  23 the length (in bytes) of the matched codepoint, or the codepoint itself.
  24
  25 To regenerate regcharclass.h, run this script from perl-root. No arguments
  26 are necessary.
  27
  28 Using WHATEVER as an example the following macros will be produced:
  29
  30 =over 4
  31
  32 =item is_WHATEVER(s,is_utf8)
  33
  34 =item is_WHATEVER_safe(s,e,is_utf8)
  35
  36 Do a lookup as appropriate based on the is_utf8 flag. When possible
  37 comparisons involving octect<128 are done before checking the is_utf8
  38 flag, hopefully saving time.
  39
  40 =item is_WHATEVER_utf8(s)
  41
  42 =item is_WHATEVER_utf8_safe(s,e)
  43
  44 Do a lookup assuming the string is encoded in (normalized) UTF8.
  45
  46 =item is_WHATEVER_latin1(s)
  47
  48 =item is_WHATEVER_latin1_safe(s,e)
  49
  50 Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
  51
  52 =item is_WHATEVER_cp(cp)
  53
  54 Check to see if the string matches a given codepoint (hypotethically a
  55 U32). The condition is constructed as as to "break out" as early as
  56 possible if the codepoint is out of range of the condition.
  57
  58 IOW:
  59
  60   (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
  61
  62 Thus if the character is X+1 only two comparisons will be done. Making
  63 matching lookups slower, but non-matching faster.
  64
  65 =back
  66
  67 Additionally it is possible to generate C<what_> variants that return
  68 the codepoint read instead of the number of octets read, this can be
  69 done by suffixing '-cp' to the type description.
  70
  71 =head2 CODE FORMAT
  72
  73 perltidy  -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f"
  74
  75
  76 =head1 AUTHOR
  77
  78 Author: Yves Orton (demerphq) 2007
  79
  80 =head1 BUGS
  81
  82 No tests directly here (although the regex engine will fail tests
  83 if this code is broken). Insufficient documentation and no Getopts
  84 handler for using the module as a script.
  85
  86 =head1 LICENSE
  87
  88 You may distribute under the terms of either the GNU General Public
  89 License or the Artistic License, as specified in the README file.
  90
  91 =cut
  92
  93 # Sub naming convention:
  94 # __func : private subroutine, can not be called as a method
  95 # _func  : private method, not meant for external use
  96 # func   : public method.
  97
  98 # private subs
  99 #-------------------------------------------------------------------------------
 100 #
 101 # ($cp,$n,$l,$u)=__uni_latin($str);
 102 #
 103 # Return a list of arrays, each of which when interepreted correctly
 104 # represent the string in some given encoding with specific conditions.
 105 #
 106 # $cp - list of codepoints that make up the string.
 107 # $n  - list of octets that make up the string if all codepoints < 128
 108 # $l  - list of octets that make up the string in latin1 encoding if all
 109 #       codepoints < 256, and at least one codepoint is >127.
 110 # $u  - list of octets that make up the string in utf8 if any codepoint >127
 111 #
 112 #   High CP | Defined
 113 #-----------+----------
 114 #   0 - 127 : $n
 115 # 128 - 255 : $l, $u
 116 # 256 - ... : $u
 117 #
 118
 119 sub __uni_latin1 {
 120     my $str= shift;
 121     my $max= 0;
 122     my @cp;
 123     for my $ch ( split //, $str ) {
 124         my $cp= ord $ch;
 125         push @cp, $cp;
 126         $max= $cp if $max < $cp;
 127     }
 128     my ( $n, $l, $u );
 129     if ( $max < 128 ) {
 130         $n= [@cp];
 131     } else {
 132         $l= [@cp] if $max && $max < 256;
 133
 134         my $copy= $str;    # must copy string, FB_CROAK makes encode destructive
 135         $u= eval { Encode::encode( "utf8", $copy, Encode::FB_CROAK ) };
 136         $u= [ unpack "U0C*", $u ] if $u;
 137     }
 138     return ( \@cp, $n, $l, $u );
 139 }
 140
 141 #
 142 # $clean= __clean($expr);
 143 #
 144 # Cleanup a ternary expression, removing unnecessary parens and apply some
 145 # simplifications using regexes.
 146 #
 147
 148 sub __clean {
 149     my ( $expr )= @_;
 150     our $parens;
 151     $parens= qr/ (?> \( (?> (?: (?> [^()]+ ) | (??{ $parens }) )* ) \) ) /x;
 152
 153     #print "$parens\n$expr\n";
 154     1 while $expr =~ s/ \( \s* ( $parens ) \s* \) /$1/gx;
 155     1 while $expr =~ s/ \( \s* ($parens) \s* \? \s*
 156         \( \s* ($parens) \s* \? \s* ($parens|[^:]+?) \s* : \s* ($parens|[^)]+?) \s* \)
 157         \s* : \s* \4 \s* \)/( ( $1 && $2 ) ? $3 : 0 )/gx;
 158     return $expr;
 159 }
 160
 161 #
 162 # $text= __macro(@args);
 163 # Join args together by newlines, and then neatly add backslashes to the end
 164 # of every  line as expected by the C pre-processor for #define's.
 165 #
 166
 167 sub __macro {
 168     my $str= join "\n", @_;
 169     $str =~ s/\s*$//;
 170     my @lines= map { s/\s+$//; s/\t/        /g; $_ } split /\n/, $str;
 171     my $last= pop @lines;
 172     $str= join "\n", ( map { sprintf "%-76s\\", $_ } @lines ), $last;
 173     1 while $str =~ s/^(\t*) {8}/$1\t/gm;
 174     return $str . "\n";
 175 }
 176
 177 #
 178 # my $op=__incrdepth($op);
 179 #
 180 # take an 'op' hashref and add one to it and all its childrens depths.
 181 #
 182
 183 sub __incrdepth {
 184     my $op= shift;
 185     return unless ref $op;
 186     $op->{depth} += 1;
 187     __incrdepth( $op->{yes} );
 188     __incrdepth( $op->{no} );
 189     return $op;
 190 }
 191
 192 # join two branches of an opcode together with a condition, incrementing
 193 # the depth on the yes branch when we do so.
 194 # returns the new root opcode of the tree.
 195 sub __cond_join {
 196     my ( $cond, $yes, $no )= @_;
 197     return {
 198         test  => $cond,
 199         yes   => __incrdepth( $yes ),
 200         no    => $no,
 201         depth => 0,
 202     };
 203 }
 204
 205 # Methods
 206
 207 # constructor
 208 #
 209 # my $obj=CLASS->new(op=>'SOMENAME',title=>'blah',txt=>[..]);
 210 #
 211 # Create a new CharClass::Matcher object by parsing the text in
 212 # the txt array. Currently applies the following rules:
 213 #
 214 # Element starts with C<0x>, line is evaled the result treated as
 215 # a number which is passed to chr().
 216 #
 217 # Element starts with C<">, line is evaled and the result treated
 218 # as a string.
 219 #
 220 # Each string is then stored in the 'strs' subhash as a hash record
 221 # made up of the results of __uni_latin1, using the keynames
 222 # 'low','latin1','utf8', as well as the synthesized 'LATIN1' and
 223 # 'UTF8' which hold a merge of 'low' and their lowercase equivelents.
 224 #
 225 # Size data is tracked per type in the 'size' subhash.
 226 #
 227 # Return an object
 228 #
 229 sub new {
 230     my $class= shift;
 231     my %opt= @_;
 232     for ( qw(op txt) ) {
 233         die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field"
 234           if !exists $opt{$_};
 235     }
 236
 237     my $self= bless {
 238         op    => $opt{op},
 239         title => $opt{title} || '',
 240     }, $class;
 241     foreach my $txt ( @{ $opt{txt} } ) {
 242         my $str= $txt;
 243         if ( $str =~ /^[""]/ ) {
 244             $str= eval $str;
 245         } elsif ( $str =~ /^0x/ ) {
 246             $str= chr eval $str;
 247         } elsif ( /\S/ ) {
 248             die "Unparseable line: $txt\n";
 249         } else {
 250             next;
 251         }
 252         my ( $cp, $low, $latin1, $utf8 )= __uni_latin1( $str );
 253         my $UTF8= $low   || $utf8;
 254         my $LATIN1= $low || $latin1;
 255
 256         @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 cp UTF8 LATIN1 )}=
 257           ( $str, $txt, $low, $utf8, $latin1, $cp, $UTF8, $LATIN1 );
 258         my $rec= $self->{strs}{$str};
 259         foreach my $key ( qw(low utf8 latin1 cp UTF8 LATIN1) ) {
 260             $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
 261               if $self->{strs}{$str}{$key};
 262         }
 263         $self->{has_multi} ||= @$cp > 1;
 264         $self->{has_ascii} ||= $latin1 && @$latin1;
 265         $self->{has_low}   ||= $low && @$low;
 266         $self->{has_high}  ||= !$low && !$latin1;
 267     }
 268     $self->{val_fmt}= $hex_fmt;
 269     $self->{count}= 0 + keys %{ $self->{strs} };
 270     return $self;
 271 }
 272
 273 # my $trie = make_trie($type,$maxlen);
 274 #
 275 # using the data stored in the object build a trie of a specifc type,
 276 # and with specific maximum depth. The trie is made up the elements of
 277 # the given types array for each string in the object (assuming it is
 278 # not too long.)
 279 #
 280 # returns the trie, or undef if there was no relevent data in the object.
 281 #
 282
 283 sub make_trie {
 284     my ( $self, $type, $maxlen )= @_;
 285
 286     my $strs= $self->{strs};
 287     my %trie;
 288     foreach my $rec ( values %$strs ) {
 289         die "panic: unknown type '$type'"
 290           if !exists $rec->{$type};
 291         my $dat= $rec->{$type};
 292         next unless $dat;
 293         next if $maxlen && @$dat > $maxlen;
 294         my $node= \%trie;
 295         foreach my $elem ( @$dat ) {
 296             $node->{$elem} ||= {};
 297             $node= $node->{$elem};
 298         }
 299         $node->{''}= $rec->{str};
 300     }
 301     return 0 + keys( %trie ) ? \%trie : undef;
 302 }
 303
 304 # my $optree= _optree()
 305 #
 306 # recursively convert a trie to an optree where every node represents
 307 # an if else branch.
 308 #
 309 #
 310
 311 sub _optree {
 312     my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_;
 313     return unless defined $trie;
 314     if ( $self->{has_multi} and $ret_type =~ /cp|both/ ) {
 315         die "Can't do 'cp' optree from multi-codepoint strings";
 316     }
 317     $ret_type ||= 'len';
 318     $else= 0  unless defined $else;
 319     $depth= 0 unless defined $depth;
 320
 321     my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie;
 322     if ( $trie->{''} ) {
 323         if ( $ret_type eq 'cp' ) {
 324             $else= $self->{strs}{ $trie->{''} }{cp}[0];
 325             $else= sprintf "$self->{val_fmt}", $else if $else > 9;
 326         } elsif ( $ret_type eq 'len' ) {
 327             $else= $depth;
 328         } elsif ( $ret_type eq 'both') {
 329             $else= $self->{strs}{ $trie->{''} }{cp}[0];
 330             $else= sprintf "$self->{val_fmt}", $else if $else > 9;
 331             $else= "len=$depth, $else";
 332         }
 333     }
 334     return $else if !@conds;
 335     my $node= {};
 336     my $root= $node;
 337     my ( $yes_res, $as_code, @cond );
 338     my $test= $test_type eq 'cp' ? "cp" : "((U8*)s)[$depth]";
 339     my $Update= sub {
 340         $node->{vals}= [@cond];
 341         $node->{test}= $test;
 342         $node->{yes}= $yes_res;
 343         $node->{depth}= $depth;
 344         $node->{no}= shift;
 345     };
 346     while ( @conds ) {
 347         my $cond= shift @conds;
 348         my $res=
 349           $self->_optree( $trie->{$cond}, $test_type, $ret_type, $else,
 350             $depth + 1 );
 351         my $res_code= Dumper( $res );
 352         if ( !$yes_res || $res_code ne $as_code ) {
 353             if ( $yes_res ) {
 354                 $Update->( {} );
 355                 $node= $node->{no};
 356             }
 357             ( $yes_res, $as_code )= ( $res, $res_code );
 358             @cond= ( $cond );
 359         } else {
 360             push @cond, $cond;
 361         }
 362     }
 363     $Update->( $else );
 364     return $root;
 365 }
 366
 367 # my $optree= optree(%opts);
 368 #
 369 # Convert a trie to an optree, wrapper for _optree
 370
 371 sub optree {
 372     my $self= shift;
 373     my %opt= @_;
 374     my $trie= $self->make_trie( $opt{type}, $opt{max_depth} );
 375     $opt{ret_type} ||= 'len';
 376     my $test_type= $opt{type} eq 'cp' ? 'cp' : 'depth';
 377     return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 );
 378 }
 379
 380 # my $optree= generic_optree(%opts);
 381 #
 382 # build a "generic" optree out of the three 'low', 'latin1', 'utf8'
 383 # sets of strings, including a branch for handling the string type check.
 384 #
 385
 386 sub generic_optree {
 387     my $self= shift;
 388     my %opt= @_;
 389
 390     $opt{ret_type} ||= 'len';
 391     my $test_type= 'depth';
 392     my $else= $opt{else} || 0;
 393
 394     my $latin1= $self->make_trie( 'latin1', $opt{max_depth} );
 395     my $utf8= $self->make_trie( 'utf8',     $opt{max_depth} );
 396
 397     $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 )
 398       for $latin1, $utf8;
 399
 400     if ( $utf8 ) {
 401         $else= __cond_join( "( is_utf8 )", $utf8, $latin1 || $else );
 402     } elsif ( $latin1 ) {
 403         $else= __cond_join( "!( is_utf8 )", $latin1, $else );
 404     }
 405     my $low= $self->make_trie( 'low', $opt{max_depth} );
 406     if ( $low ) {
 407         $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 );
 408     }
 409
 410     return $else;
 411 }
 412
 413 # length_optree()
 414 #
 415 # create a string length guarded optree.
 416 #
 417
 418 sub length_optree {
 419     my $self= shift;
 420     my %opt= @_;
 421     my $type= $opt{type};
 422
 423     die "Can't do a length_optree on type 'cp', makes no sense."
 424       if $type eq 'cp';
 425
 426     my ( @size, $method );
 427
 428     if ( $type eq 'generic' ) {
 429         $method= 'generic_optree';
 430         my %sizes= (
 431             %{ $self->{size}{low}    || {} },
 432             %{ $self->{size}{latin1} || {} },
 433             %{ $self->{size}{utf8}   || {} }
 434         );
 435         @size= sort { $a <=> $b } keys %sizes;
 436     } else {
 437         $method= 'optree';
 438         @size= sort { $a <=> $b } keys %{ $self->{size}{$type} };
 439     }
 440
 441     my $else= ( $opt{else} ||= 0 );
 442     for my $size ( @size ) {
 443         my $optree= $self->$method( %opt, type => $type, max_depth => $size );
 444         my $cond= "((e)-(s) > " . ( $size - 1 ).")";
 445         $else= __cond_join( $cond, $optree, $else );
 446     }
 447     return $else;
 448 }
 449
 450 # _cond_as_str
 451 # turn a list of conditions into a text expression
 452 # - merges ranges of conditions, and joins the result with ||
 453 sub _cond_as_str {
 454     my ( $self, $op, $combine )= @_;
 455     my $cond= $op->{vals};
 456     my $test= $op->{test};
 457     return "( $test )" if !defined $cond;
 458
 459     # rangify the list
 460     my @ranges;
 461     my $Update= sub {
 462         if ( @ranges ) {
 463             if ( $ranges[-1][0] == $ranges[-1][1] ) {
 464                 $ranges[-1]= $ranges[-1][0];
 465             } elsif ( $ranges[-1][0] + 1 == $ranges[-1][1] ) {
 466                 $ranges[-1]= $ranges[-1][0];
 467                 push @ranges, $ranges[-1] + 1;
 468             }
 469         }
 470     };
 471     for my $cond ( @$cond ) {
 472         if ( !@ranges || $cond != $ranges[-1][1] + 1 ) {
 473             $Update->();
 474             push @ranges, [ $cond, $cond ];
 475         } else {
 476             $ranges[-1][1]++;
 477         }
 478     }
 479     $Update->();
 480     return $self->_combine( $test, @ranges )
 481       if $combine;
 482     @ranges= map {
 483         ref $_
 484           ? sprintf(
 485             "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
 486             @$_ )
 487           : sprintf( "$self->{val_fmt} == $test", $_ );
 488     } @ranges;
 489     return "( " . join( " || ", @ranges ) . " )";
 490 }
 491
 492 # _combine
 493 # recursively turn a list of conditions into a fast break-out condition
 494 # used by _cond_as_str() for 'cp' type macros.
 495 sub _combine {
 496     my ( $self, $test, @cond )= @_;
 497     return if !@cond;
 498     my $item= shift @cond;
 499     my ( $cstr, $gtv );
 500     if ( ref $item ) {
 501         $cstr=
 502           sprintf( "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
 503             @$item );
 504         $gtv= sprintf "$self->{val_fmt}", $item->[1];
 505     } else {
 506         $cstr= sprintf( "$self->{val_fmt} == $test", $item );
 507         $gtv= sprintf "$self->{val_fmt}", $item;
 508     }
 509     if ( @cond ) {
 510         return "( $cstr || ( $gtv < $test &&\n"
 511           . $self->_combine( $test, @cond ) . " ) )";
 512     } else {
 513         return $cstr;
 514     }
 515 }
 516
 517 # _render()
 518 # recursively convert an optree to text with reasonably neat formatting
 519 sub _render {
 520     my ( $self, $op, $combine, $brace )= @_;
 521     if ( !ref $op ) {
 522         return $op;
 523     }
 524     my $cond= $self->_cond_as_str( $op, $combine );
 525     my $yes= $self->_render( $op->{yes}, $combine, 1 );
 526     my $no= $self->_render( $op->{no},   $combine, 0 );
 527     return "( $cond )" if $yes eq '1' and $no eq '0';
 528     my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" );
 529     return "$lb$cond ? $yes : $no$rb"
 530       if !ref( $op->{yes} ) && !ref( $op->{no} );
 531     my $ind1= " " x 4;
 532     my $ind= "\n" . ( $ind1 x $op->{depth} );
 533
 534     if ( ref $op->{yes} ) {
 535         $yes= $ind . $ind1 . $yes;
 536     } else {
 537         $yes= " " . $yes;
 538     }
 539
 540     return "$lb$cond ?$yes$ind: $no$rb";
 541 }
 542
 543 # $expr=render($op,$combine)
 544 #
 545 # convert an optree to text with reasonably neat formatting. If $combine
 546 # is true then the condition is created using "fast breakouts" which
 547 # produce uglier expressions that are more efficient for common case,
 548 # longer lists such as that resulting from type 'cp' output.
 549 # Currently only used for type 'cp' macros.
 550 sub render {
 551     my ( $self, $op, $combine )= @_;
 552     my $str= "( " . $self->_render( $op, $combine ) . " )";
 553     return __clean( $str );
 554 }
 555
 556 # make_macro
 557 # make a macro of a given type.
 558 # calls into make_trie and (generic_|length_)optree as needed
 559 # Opts are:
 560 # type     : 'cp','generic','low','latin1','utf8','LATIN1','UTF8'
 561 # ret_type : 'cp' or 'len'
 562 # safe     : add length guards to macro
 563 #
 564 # type defaults to 'generic', and ret_type to 'len' unless type is 'cp'
 565 # in which case it defaults to 'cp' as well.
 566 #
 567 # it is illegal to do a type 'cp' macro on a pattern with multi-codepoint
 568 # sequences in it, as the generated macro will accept only a single codepoint
 569 # as an argument.
 570 #
 571 # returns the macro.
 572
 573
 574 sub make_macro {
 575     my $self= shift;
 576     my %opts= @_;
 577     my $type= $opts{type} || 'generic';
 578     die "Can't do a 'cp' on multi-codepoint character class '$self->{op}'"
 579       if $type eq 'cp'
 580       and $self->{has_multi};
 581     my $ret_type= $opts{ret_type} || ( $opts{type} eq 'cp' ? 'cp' : 'len' );
 582     my $method;
 583     if ( $opts{safe} ) {
 584         $method= 'length_optree';
 585     } elsif ( $type eq 'generic' ) {
 586         $method= 'generic_optree';
 587     } else {
 588         $method= 'optree';
 589     }
 590     my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
 591     my $text= $self->render( $optree, $type eq 'cp' );
 592     my @args= $type eq 'cp' ? 'cp' : 's';
 593     push @args, "e" if $opts{safe};
 594     push @args, "is_utf8" if $type eq 'generic';
 595     push @args, "len" if $ret_type eq 'both';
 596     my $pfx= $ret_type eq 'both'    ? 'what_len_' :
 597              $ret_type eq 'cp'      ? 'what_'     : 'is_';
 598     my $ext= $type     eq 'generic' ? ''          : '_' . lc( $type );
 599     $ext .= "_safe" if $opts{safe};
 600     my $argstr= join ",", @args;
 601     return "/*** GENERATED CODE ***/\n"
 602       . __macro( "#define $pfx$self->{op}$ext($argstr)\n$text" );
 603 }
 604
 605 # if we arent being used as a module (highly likely) then process
 606 # the __DATA__ below and produce macros in regcharclass.h
 607 # if an argument is provided to the script then it is assumed to
 608 # be the path of the file to output to, if the arg is '-' outputs
 609 # to STDOUT.
 610 if ( !caller ) {
 611
 612
 613
 614     $|++;
 615     my $path= shift @ARGV;
 616
 617     if ( !$path ) {
 618         $path= "regcharclass.h";
 619         if ( !-e $path ) { $path= "../$path" }
 620         if ( !-e $path ) { die "Can't find '$path' to update!\n" }
 621     }
 622     my $out_fh;
 623     if ( $path eq '-' ) {
 624         $out_fh= \*STDOUT;
 625     } else {
 626         rename $path, "$path.bak";
 627         open $out_fh, ">", $path
 628           or die "Can't write to '$path':$!";
 629         binmode $out_fh;    # want unix line endings even when run on win32.
 630     }
 631     my ( $zero )= $0 =~ /([^\\\/]+)$/;
 632     print $out_fh <<"HEADER";
 633 /*  -*- buffer-read-only: t -*-
 634  *
 635  *    regcharclass.h
 636  *
 637  *    Copyright (C) 2007, by Larry Wall and others
 638  *
 639  *    You may distribute under the terms of either the GNU General Public
 640  *    License or the Artistic License, as specified in the README file.
 641  *
 642  * !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 643  * This file is built by Porting/$zero.
 644  * (Generated at: @{[ scalar gmtime ]} GMT)
 645  * Any changes made here will be lost!
 646  */
 647
 648 HEADER
 649
 650     my ( $op, $title, @txt, @types, @mods );
 651     my $doit= sub {
 652         return unless $op;
 653         print $out_fh "/*\n\t$op: $title\n\n";
 654         print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
 655         my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt );
 656
 657         #die Dumper(\@types,\@mods);
 658
 659         foreach my $type_spec ( @types ) {
 660             my ( $type, $ret )= split /-/, $type_spec;
 661             $ret ||= 'len';
 662             foreach my $mod ( @mods ) {
 663                 next if $mod eq 'safe' and $type eq 'cp';
 664                 my $macro= $obj->make_macro(
 665                     type     => $type,
 666                     ret_type => $ret,
 667                     safe     => $mod eq 'safe'
 668                 );
 669                 print $out_fh $macro, "\n";
 670             }
 671         }
 672     };
 673
 674     while ( <DATA> ) {
 675         s/^\s*#//;
 676         next unless /\S/;
 677         chomp;
 678         if ( /^([A-Z]+)/ ) {
 679             $doit->();
 680             ( $op, $title )= split /\s*:\s*/, $_, 2;
 681             @txt= ();
 682         } elsif ( s/^=>// ) {
 683             my ( $type, $modifier )= split /:/, $_;
 684             @types= split ' ', $type;
 685             @mods= split ' ',  $modifier;
 686         } else {
 687             push @txt, "$_";
 688         }
 689     }
 690     $doit->();
 691     print $out_fh "/* ex: set ro: */\n";
 692     print "updated $path\n" if $path ne '-';
 693 }
 694
 695 #
 696 # Valid types: generic, LATIN1, UTF8, low, latin1, utf8
 697 # default return value is octects read.
 698 # append -cp to make it codepoint matched.
 699 # modifiers come after the colon, valid possibilities
 700 # being 'fast' and 'safe'.
 701 #
 702 1; # in the unlikely case we are being used as a module
 703
 704 __DATA__
 705 LNBREAK: Line Break: \R
 706 => generic UTF8 LATIN1 :fast safe
 707 "\x0D\x0A"      # CRLF - Network (Windows) line ending
 708 0x0A            # LF  | LINE FEED
 709 0x0B            # VT  | VERTICAL TAB
 710 0x0C            # FF  | FORM FEED
 711 0x0D            # CR  | CARRIAGE RETURN
 712 0x85            # NEL | NEXT LINE
 713 0x2028          # LINE SEPARATOR
 714 0x2029          # PARAGRAPH SEPARATOR
 715
 716 HORIZWS: Horizontal Whitespace: \h \H
 717 => generic UTF8 LATIN1 cp :fast safe
 718 0x09            # HT
 719 0x20            # SPACE
 720 0xa0            # NBSP
 721 0x1680          # OGHAM SPACE MARK
 722 0x180e          # MONGOLIAN VOWEL SEPARATOR
 723 0x2000          # EN QUAD
 724 0x2001          # EM QUAD
 725 0x2002          # EN SPACE
 726 0x2003          # EM SPACE
 727 0x2004          # THREE-PER-EM SPACE
 728 0x2005          # FOUR-PER-EM SPACE
 729 0x2006          # SIX-PER-EM SPACE
 730 0x2007          # FIGURE SPACE
 731 0x2008          # PUNCTUATION SPACE
 732 0x2009          # THIN SPACE
 733 0x200A          # HAIR SPACE
 734 0x202f          # NARROW NO-BREAK SPACE
 735 0x205f          # MEDIUM MATHEMATICAL SPACE
 736 0x3000          # IDEOGRAPHIC SPACE
 737
 738 VERTWS: Vertical Whitespace: \v \V
 739 => generic UTF8 LATIN1 cp :fast safe
 740 0x0A            # LF
 741 0x0B            # VT
 742 0x0C            # FF
 743 0x0D            # CR
 744 0x85            # NEL
 745 0x2028          # LINE SEPARATOR
 746 0x2029          # PARAGRAPH SEPARATOR
 747
 748
 749 TRICKYFOLD: Problematic fold case letters.
 750 => generic cp generic-cp generic-both :fast safe
 751 0x00DF  # LATIN1 SMALL LETTER SHARP S
 752 0x0390  # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
 753 0x03B0  # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
 754
 755