Some escapes were mentioned twice, although they're not qr//-specific
[p5sagit/p5-mst-13.2.git] / Porting / regcharclass.pl
CommitLineData
e64b1bd1 1package CharClass::Matcher;
12b72891 2use strict;
3use warnings;
e64b1bd1 4use warnings FATAL => 'all';
12b72891 5use Text::Wrap qw(wrap);
6use Encode;
7use Data::Dumper;
e64b1bd1 8$Data::Dumper::Useqq= 1;
9our $hex_fmt= "0x%02X";
12b72891 10
ab84f958 11=head1 NAME
0ccab2bc 12
e64b1bd1 13CharClass::Matcher -- Generate C macros that match character classes efficiently
12b72891 14
e64b1bd1 15=head1 SYNOPSIS
16
ab84f958 17 perl Porting/regcharclass.pl
e64b1bd1 18
19=head1 DESCRIPTION
12b72891 20
21Dynamically generates macros for detecting special charclasses
e64b1bd1 22in latin-1, utf8, and codepoint forms. Macros can be set to return
23the length (in bytes) of the matched codepoint, or the codepoint itself.
12b72891 24
25To regenerate regcharclass.h, run this script from perl-root. No arguments
26are necessary.
27
e64b1bd1 28Using WHATEVER as an example the following macros will be produced:
12b72891 29
30=over 4
31
e64b1bd1 32=item is_WHATEVER(s,is_utf8)
12b72891 33
e64b1bd1 34=item is_WHATEVER_safe(s,e,is_utf8)
12b72891 35
e64b1bd1 36Do a lookup as appropriate based on the is_utf8 flag. When possible
12b72891 37comparisons involving octect<128 are done before checking the is_utf8
38flag, hopefully saving time.
39
e64b1bd1 40=item is_WHATEVER_utf8(s)
12b72891 41
e64b1bd1 42=item is_WHATEVER_utf8_safe(s,e)
12b72891 43
44Do a lookup assuming the string is encoded in (normalized) UTF8.
45
e64b1bd1 46=item is_WHATEVER_latin1(s)
12b72891 47
e64b1bd1 48=item is_WHATEVER_latin1_safe(s,e)
12b72891 49
50Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
51
e64b1bd1 52=item is_WHATEVER_cp(cp)
12b72891 53
54Check to see if the string matches a given codepoint (hypotethically a
55U32). The condition is constructed as as to "break out" as early as
56possible if the codepoint is out of range of the condition.
57
58IOW:
59
60 (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
61
62Thus if the character is X+1 only two comparisons will be done. Making
63matching lookups slower, but non-matching faster.
64
65=back
66
e64b1bd1 67Additionally it is possible to generate C<what_> variants that return
68the codepoint read instead of the number of octets read, this can be
69done by suffixing '-cp' to the type description.
70
71=head2 CODE FORMAT
72
73perltidy -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f"
74
75
76=head1 AUTHOR
77
78Author: Yves Orton (demerphq) 2007
79
80=head1 BUGS
81
82No tests directly here (although the regex engine will fail tests
83if this code is broken). Insufficient documentation and no Getopts
84handler for using the module as a script.
85
86=head1 LICENSE
87
88You may distribute under the terms of either the GNU General Public
89License or the Artistic License, as specified in the README file.
90
12b72891 91=cut
92
e64b1bd1 93# Sub naming convention:
94# __func : private subroutine, can not be called as a method
95# _func : private method, not meant for external use
96# func : public method.
97
98# private subs
99#-------------------------------------------------------------------------------
100#
101# ($cp,$n,$l,$u)=__uni_latin($str);
102#
103# Return a list of arrays, each of which when interepreted correctly
104# represent the string in some given encoding with specific conditions.
105#
106# $cp - list of codepoints that make up the string.
107# $n - list of octets that make up the string if all codepoints < 128
108# $l - list of octets that make up the string in latin1 encoding if all
109# codepoints < 256, and at least one codepoint is >127.
110# $u - list of octets that make up the string in utf8 if any codepoint >127
111#
112# High CP | Defined
113#-----------+----------
114# 0 - 127 : $n
115# 128 - 255 : $l, $u
116# 256 - ... : $u
117#
118
119sub __uni_latin1 {
120 my $str= shift;
121 my $max= 0;
122 my @cp;
123 for my $ch ( split //, $str ) {
124 my $cp= ord $ch;
125 push @cp, $cp;
126 $max= $cp if $max < $cp;
127 }
128 my ( $n, $l, $u );
129 if ( $max < 128 ) {
130 $n= [@cp];
131 } else {
132 $l= [@cp] if $max && $max < 256;
133
134 my $copy= $str; # must copy string, FB_CROAK makes encode destructive
135 $u= eval { Encode::encode( "utf8", $copy, Encode::FB_CROAK ) };
dda856b2 136 # $u is utf8 but with the utf8 flag OFF
137 # therefore "C*" gets us the values of the bytes involved.
138 $u= [ unpack "C*", $u ] if defined $u;
12b72891 139 }
e64b1bd1 140 return ( \@cp, $n, $l, $u );
12b72891 141}
142
12b72891 143#
e64b1bd1 144# $clean= __clean($expr);
145#
146# Cleanup a ternary expression, removing unnecessary parens and apply some
147# simplifications using regexes.
148#
149
150sub __clean {
151 my ( $expr )= @_;
152 our $parens;
153 $parens= qr/ (?> \( (?> (?: (?> [^()]+ ) | (??{ $parens }) )* ) \) ) /x;
154
155 #print "$parens\n$expr\n";
156 1 while $expr =~ s/ \( \s* ( $parens ) \s* \) /$1/gx;
157 1 while $expr =~ s/ \( \s* ($parens) \s* \? \s*
158 \( \s* ($parens) \s* \? \s* ($parens|[^:]+?) \s* : \s* ($parens|[^)]+?) \s* \)
159 \s* : \s* \4 \s* \)/( ( $1 && $2 ) ? $3 : 0 )/gx;
160 return $expr;
12b72891 161}
162
e64b1bd1 163#
164# $text= __macro(@args);
165# Join args together by newlines, and then neatly add backslashes to the end
166# of every line as expected by the C pre-processor for #define's.
167#
168
169sub __macro {
170 my $str= join "\n", @_;
171 $str =~ s/\s*$//;
172 my @lines= map { s/\s+$//; s/\t/ /g; $_ } split /\n/, $str;
173 my $last= pop @lines;
174 $str= join "\n", ( map { sprintf "%-76s\\", $_ } @lines ), $last;
175 1 while $str =~ s/^(\t*) {8}/$1\t/gm;
176 return $str . "\n";
12b72891 177}
178
e64b1bd1 179#
180# my $op=__incrdepth($op);
181#
182# take an 'op' hashref and add one to it and all its childrens depths.
183#
184
185sub __incrdepth {
186 my $op= shift;
187 return unless ref $op;
188 $op->{depth} += 1;
189 __incrdepth( $op->{yes} );
190 __incrdepth( $op->{no} );
191 return $op;
192}
193
194# join two branches of an opcode together with a condition, incrementing
195# the depth on the yes branch when we do so.
196# returns the new root opcode of the tree.
197sub __cond_join {
198 my ( $cond, $yes, $no )= @_;
199 return {
200 test => $cond,
201 yes => __incrdepth( $yes ),
202 no => $no,
203 depth => 0,
204 };
205}
206
207# Methods
208
209# constructor
210#
211# my $obj=CLASS->new(op=>'SOMENAME',title=>'blah',txt=>[..]);
212#
213# Create a new CharClass::Matcher object by parsing the text in
214# the txt array. Currently applies the following rules:
215#
216# Element starts with C<0x>, line is evaled the result treated as
217# a number which is passed to chr().
218#
219# Element starts with C<">, line is evaled and the result treated
220# as a string.
221#
222# Each string is then stored in the 'strs' subhash as a hash record
223# made up of the results of __uni_latin1, using the keynames
224# 'low','latin1','utf8', as well as the synthesized 'LATIN1' and
225# 'UTF8' which hold a merge of 'low' and their lowercase equivelents.
226#
227# Size data is tracked per type in the 'size' subhash.
228#
229# Return an object
230#
12b72891 231sub new {
232 my $class= shift;
e64b1bd1 233 my %opt= @_;
234 for ( qw(op txt) ) {
235 die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field"
236 if !exists $opt{$_};
237 }
238
239 my $self= bless {
240 op => $opt{op},
241 title => $opt{title} || '',
242 }, $class;
243 foreach my $txt ( @{ $opt{txt} } ) {
244 my $str= $txt;
245 if ( $str =~ /^[""]/ ) {
246 $str= eval $str;
247 } elsif ( $str =~ /^0x/ ) {
248 $str= chr eval $str;
249 } elsif ( /\S/ ) {
250 die "Unparseable line: $txt\n";
12b72891 251 } else {
e64b1bd1 252 next;
12b72891 253 }
e64b1bd1 254 my ( $cp, $low, $latin1, $utf8 )= __uni_latin1( $str );
255 my $UTF8= $low || $utf8;
256 my $LATIN1= $low || $latin1;
dda856b2 257 #die Dumper($txt,$cp,$low,$latin1,$utf8)
258 # if $txt=~/NEL/ or $utf8 and @$utf8>3;
e64b1bd1 259
260 @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 cp UTF8 LATIN1 )}=
261 ( $str, $txt, $low, $utf8, $latin1, $cp, $UTF8, $LATIN1 );
262 my $rec= $self->{strs}{$str};
263 foreach my $key ( qw(low utf8 latin1 cp UTF8 LATIN1) ) {
264 $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
265 if $self->{strs}{$str}{$key};
12b72891 266 }
e64b1bd1 267 $self->{has_multi} ||= @$cp > 1;
268 $self->{has_ascii} ||= $latin1 && @$latin1;
269 $self->{has_low} ||= $low && @$low;
270 $self->{has_high} ||= !$low && !$latin1;
12b72891 271 }
e64b1bd1 272 $self->{val_fmt}= $hex_fmt;
273 $self->{count}= 0 + keys %{ $self->{strs} };
12b72891 274 return $self;
275}
276
e64b1bd1 277# my $trie = make_trie($type,$maxlen);
12b72891 278#
e64b1bd1 279# using the data stored in the object build a trie of a specifc type,
280# and with specific maximum depth. The trie is made up the elements of
281# the given types array for each string in the object (assuming it is
282# not too long.)
283#
284# returns the trie, or undef if there was no relevent data in the object.
285#
286
287sub make_trie {
288 my ( $self, $type, $maxlen )= @_;
289
290 my $strs= $self->{strs};
291 my %trie;
292 foreach my $rec ( values %$strs ) {
293 die "panic: unknown type '$type'"
294 if !exists $rec->{$type};
295 my $dat= $rec->{$type};
296 next unless $dat;
297 next if $maxlen && @$dat > $maxlen;
298 my $node= \%trie;
299 foreach my $elem ( @$dat ) {
300 $node->{$elem} ||= {};
301 $node= $node->{$elem};
12b72891 302 }
e64b1bd1 303 $node->{''}= $rec->{str};
12b72891 304 }
e64b1bd1 305 return 0 + keys( %trie ) ? \%trie : undef;
12b72891 306}
307
e64b1bd1 308# my $optree= _optree()
309#
310# recursively convert a trie to an optree where every node represents
311# an if else branch.
12b72891 312#
12b72891 313#
12b72891 314
e64b1bd1 315sub _optree {
316 my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_;
317 return unless defined $trie;
318 if ( $self->{has_multi} and $ret_type =~ /cp|both/ ) {
319 die "Can't do 'cp' optree from multi-codepoint strings";
12b72891 320 }
e64b1bd1 321 $ret_type ||= 'len';
322 $else= 0 unless defined $else;
323 $depth= 0 unless defined $depth;
324
325 my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie;
326 if ( $trie->{''} ) {
327 if ( $ret_type eq 'cp' ) {
328 $else= $self->{strs}{ $trie->{''} }{cp}[0];
329 $else= sprintf "$self->{val_fmt}", $else if $else > 9;
330 } elsif ( $ret_type eq 'len' ) {
331 $else= $depth;
332 } elsif ( $ret_type eq 'both') {
333 $else= $self->{strs}{ $trie->{''} }{cp}[0];
334 $else= sprintf "$self->{val_fmt}", $else if $else > 9;
335 $else= "len=$depth, $else";
12b72891 336 }
e64b1bd1 337 }
338 return $else if !@conds;
339 my $node= {};
340 my $root= $node;
341 my ( $yes_res, $as_code, @cond );
342 my $test= $test_type eq 'cp' ? "cp" : "((U8*)s)[$depth]";
343 my $Update= sub {
344 $node->{vals}= [@cond];
345 $node->{test}= $test;
346 $node->{yes}= $yes_res;
347 $node->{depth}= $depth;
348 $node->{no}= shift;
349 };
350 while ( @conds ) {
351 my $cond= shift @conds;
352 my $res=
353 $self->_optree( $trie->{$cond}, $test_type, $ret_type, $else,
354 $depth + 1 );
355 my $res_code= Dumper( $res );
356 if ( !$yes_res || $res_code ne $as_code ) {
357 if ( $yes_res ) {
358 $Update->( {} );
359 $node= $node->{no};
360 }
361 ( $yes_res, $as_code )= ( $res, $res_code );
362 @cond= ( $cond );
12b72891 363 } else {
e64b1bd1 364 push @cond, $cond;
12b72891 365 }
366 }
e64b1bd1 367 $Update->( $else );
12b72891 368 return $root;
369}
370
e64b1bd1 371# my $optree= optree(%opts);
372#
373# Convert a trie to an optree, wrapper for _optree
374
375sub optree {
376 my $self= shift;
377 my %opt= @_;
378 my $trie= $self->make_trie( $opt{type}, $opt{max_depth} );
379 $opt{ret_type} ||= 'len';
380 my $test_type= $opt{type} eq 'cp' ? 'cp' : 'depth';
381 return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 );
12b72891 382}
383
e64b1bd1 384# my $optree= generic_optree(%opts);
385#
386# build a "generic" optree out of the three 'low', 'latin1', 'utf8'
387# sets of strings, including a branch for handling the string type check.
388#
389
390sub generic_optree {
391 my $self= shift;
392 my %opt= @_;
393
394 $opt{ret_type} ||= 'len';
395 my $test_type= 'depth';
396 my $else= $opt{else} || 0;
397
398 my $latin1= $self->make_trie( 'latin1', $opt{max_depth} );
399 my $utf8= $self->make_trie( 'utf8', $opt{max_depth} );
400
401 $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 )
402 for $latin1, $utf8;
403
404 if ( $utf8 ) {
405 $else= __cond_join( "( is_utf8 )", $utf8, $latin1 || $else );
406 } elsif ( $latin1 ) {
407 $else= __cond_join( "!( is_utf8 )", $latin1, $else );
408 }
409 my $low= $self->make_trie( 'low', $opt{max_depth} );
410 if ( $low ) {
411 $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 );
12b72891 412 }
e64b1bd1 413
414 return $else;
12b72891 415}
416
e64b1bd1 417# length_optree()
12b72891 418#
e64b1bd1 419# create a string length guarded optree.
12b72891 420#
e64b1bd1 421
422sub length_optree {
423 my $self= shift;
424 my %opt= @_;
425 my $type= $opt{type};
426
427 die "Can't do a length_optree on type 'cp', makes no sense."
428 if $type eq 'cp';
429
430 my ( @size, $method );
431
432 if ( $type eq 'generic' ) {
433 $method= 'generic_optree';
434 my %sizes= (
435 %{ $self->{size}{low} || {} },
436 %{ $self->{size}{latin1} || {} },
437 %{ $self->{size}{utf8} || {} }
438 );
439 @size= sort { $a <=> $b } keys %sizes;
440 } else {
441 $method= 'optree';
442 @size= sort { $a <=> $b } keys %{ $self->{size}{$type} };
12b72891 443 }
e64b1bd1 444
445 my $else= ( $opt{else} ||= 0 );
446 for my $size ( @size ) {
447 my $optree= $self->$method( %opt, type => $type, max_depth => $size );
448 my $cond= "((e)-(s) > " . ( $size - 1 ).")";
449 $else= __cond_join( $cond, $optree, $else );
450 }
451 return $else;
12b72891 452}
453
e64b1bd1 454# _cond_as_str
455# turn a list of conditions into a text expression
456# - merges ranges of conditions, and joins the result with ||
457sub _cond_as_str {
458 my ( $self, $op, $combine )= @_;
459 my $cond= $op->{vals};
460 my $test= $op->{test};
461 return "( $test )" if !defined $cond;
462
463 # rangify the list
464 my @ranges;
465 my $Update= sub {
466 if ( @ranges ) {
467 if ( $ranges[-1][0] == $ranges[-1][1] ) {
468 $ranges[-1]= $ranges[-1][0];
469 } elsif ( $ranges[-1][0] + 1 == $ranges[-1][1] ) {
470 $ranges[-1]= $ranges[-1][0];
471 push @ranges, $ranges[-1] + 1;
472 }
473 }
474 };
475 for my $cond ( @$cond ) {
476 if ( !@ranges || $cond != $ranges[-1][1] + 1 ) {
477 $Update->();
478 push @ranges, [ $cond, $cond ];
479 } else {
480 $ranges[-1][1]++;
481 }
482 }
483 $Update->();
484 return $self->_combine( $test, @ranges )
485 if $combine;
486 @ranges= map {
487 ref $_
488 ? sprintf(
489 "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
490 @$_ )
491 : sprintf( "$self->{val_fmt} == $test", $_ );
492 } @ranges;
493 return "( " . join( " || ", @ranges ) . " )";
12b72891 494}
495
e64b1bd1 496# _combine
497# recursively turn a list of conditions into a fast break-out condition
498# used by _cond_as_str() for 'cp' type macros.
499sub _combine {
500 my ( $self, $test, @cond )= @_;
501 return if !@cond;
502 my $item= shift @cond;
503 my ( $cstr, $gtv );
504 if ( ref $item ) {
505 $cstr=
506 sprintf( "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
507 @$item );
508 $gtv= sprintf "$self->{val_fmt}", $item->[1];
12b72891 509 } else {
e64b1bd1 510 $cstr= sprintf( "$self->{val_fmt} == $test", $item );
511 $gtv= sprintf "$self->{val_fmt}", $item;
12b72891 512 }
e64b1bd1 513 if ( @cond ) {
514 return "( $cstr || ( $gtv < $test &&\n"
515 . $self->_combine( $test, @cond ) . " ) )";
12b72891 516 } else {
e64b1bd1 517 return $cstr;
12b72891 518 }
e64b1bd1 519}
12b72891 520
e64b1bd1 521# _render()
522# recursively convert an optree to text with reasonably neat formatting
523sub _render {
524 my ( $self, $op, $combine, $brace )= @_;
525 if ( !ref $op ) {
526 return $op;
12b72891 527 }
e64b1bd1 528 my $cond= $self->_cond_as_str( $op, $combine );
529 my $yes= $self->_render( $op->{yes}, $combine, 1 );
530 my $no= $self->_render( $op->{no}, $combine, 0 );
531 return "( $cond )" if $yes eq '1' and $no eq '0';
532 my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" );
533 return "$lb$cond ? $yes : $no$rb"
534 if !ref( $op->{yes} ) && !ref( $op->{no} );
535 my $ind1= " " x 4;
536 my $ind= "\n" . ( $ind1 x $op->{depth} );
537
538 if ( ref $op->{yes} ) {
539 $yes= $ind . $ind1 . $yes;
540 } else {
541 $yes= " " . $yes;
542 }
543
544 return "$lb$cond ?$yes$ind: $no$rb";
12b72891 545}
32e6a07c 546
e64b1bd1 547# $expr=render($op,$combine)
548#
549# convert an optree to text with reasonably neat formatting. If $combine
550# is true then the condition is created using "fast breakouts" which
551# produce uglier expressions that are more efficient for common case,
552# longer lists such as that resulting from type 'cp' output.
553# Currently only used for type 'cp' macros.
554sub render {
555 my ( $self, $op, $combine )= @_;
556 my $str= "( " . $self->_render( $op, $combine ) . " )";
557 return __clean( $str );
12b72891 558}
e64b1bd1 559
560# make_macro
561# make a macro of a given type.
562# calls into make_trie and (generic_|length_)optree as needed
563# Opts are:
564# type : 'cp','generic','low','latin1','utf8','LATIN1','UTF8'
565# ret_type : 'cp' or 'len'
566# safe : add length guards to macro
567#
568# type defaults to 'generic', and ret_type to 'len' unless type is 'cp'
569# in which case it defaults to 'cp' as well.
570#
571# it is illegal to do a type 'cp' macro on a pattern with multi-codepoint
572# sequences in it, as the generated macro will accept only a single codepoint
573# as an argument.
574#
575# returns the macro.
576
577
578sub make_macro {
579 my $self= shift;
580 my %opts= @_;
581 my $type= $opts{type} || 'generic';
582 die "Can't do a 'cp' on multi-codepoint character class '$self->{op}'"
583 if $type eq 'cp'
584 and $self->{has_multi};
585 my $ret_type= $opts{ret_type} || ( $opts{type} eq 'cp' ? 'cp' : 'len' );
586 my $method;
587 if ( $opts{safe} ) {
588 $method= 'length_optree';
589 } elsif ( $type eq 'generic' ) {
590 $method= 'generic_optree';
591 } else {
592 $method= 'optree';
593 }
594 my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
595 my $text= $self->render( $optree, $type eq 'cp' );
596 my @args= $type eq 'cp' ? 'cp' : 's';
597 push @args, "e" if $opts{safe};
598 push @args, "is_utf8" if $type eq 'generic';
599 push @args, "len" if $ret_type eq 'both';
600 my $pfx= $ret_type eq 'both' ? 'what_len_' :
601 $ret_type eq 'cp' ? 'what_' : 'is_';
602 my $ext= $type eq 'generic' ? '' : '_' . lc( $type );
603 $ext .= "_safe" if $opts{safe};
604 my $argstr= join ",", @args;
605 return "/*** GENERATED CODE ***/\n"
606 . __macro( "#define $pfx$self->{op}$ext($argstr)\n$text" );
32e6a07c 607}
e64b1bd1 608
609# if we arent being used as a module (highly likely) then process
610# the __DATA__ below and produce macros in regcharclass.h
611# if an argument is provided to the script then it is assumed to
612# be the path of the file to output to, if the arg is '-' outputs
613# to STDOUT.
614if ( !caller ) {
615
616
617
618 $|++;
619 my $path= shift @ARGV;
620
621 if ( !$path ) {
622 $path= "regcharclass.h";
623 if ( !-e $path ) { $path= "../$path" }
624 if ( !-e $path ) { die "Can't find '$path' to update!\n" }
625 }
626 my $out_fh;
627 if ( $path eq '-' ) {
628 $out_fh= \*STDOUT;
629 } else {
630 rename $path, "$path.bak";
631 open $out_fh, ">", $path
632 or die "Can't write to '$path':$!";
633 binmode $out_fh; # want unix line endings even when run on win32.
634 }
635 my ( $zero )= $0 =~ /([^\\\/]+)$/;
636 print $out_fh <<"HEADER";
58fbde93 637/* -*- buffer-read-only: t -*-
638 *
639 * regcharclass.h
640 *
641 * Copyright (C) 2007, by Larry Wall and others
642 *
643 * You may distribute under the terms of either the GNU General Public
644 * License or the Artistic License, as specified in the README file.
645 *
646 * !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
647 * This file is built by Porting/$zero.
dda856b2 648 *
58fbde93 649 * Any changes made here will be lost!
dda856b2 650 *
58fbde93 651 */
12b72891 652
653HEADER
654
e64b1bd1 655 my ( $op, $title, @txt, @types, @mods );
656 my $doit= sub {
657 return unless $op;
658 print $out_fh "/*\n\t$op: $title\n\n";
659 print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
660 my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt );
661
662 #die Dumper(\@types,\@mods);
663
664 foreach my $type_spec ( @types ) {
665 my ( $type, $ret )= split /-/, $type_spec;
666 $ret ||= 'len';
667 foreach my $mod ( @mods ) {
668 next if $mod eq 'safe' and $type eq 'cp';
669 my $macro= $obj->make_macro(
670 type => $type,
671 ret_type => $ret,
672 safe => $mod eq 'safe'
673 );
674 print $out_fh $macro, "\n";
675 }
32e6a07c 676 }
e64b1bd1 677 };
678
679 while ( <DATA> ) {
680 s/^\s*#//;
681 next unless /\S/;
682 chomp;
683 if ( /^([A-Z]+)/ ) {
684 $doit->();
685 ( $op, $title )= split /\s*:\s*/, $_, 2;
686 @txt= ();
687 } elsif ( s/^=>// ) {
688 my ( $type, $modifier )= split /:/, $_;
689 @types= split ' ', $type;
690 @mods= split ' ', $modifier;
691 } else {
692 push @txt, "$_";
12b72891 693 }
694 }
e64b1bd1 695 $doit->();
696 print $out_fh "/* ex: set ro: */\n";
697 print "updated $path\n" if $path ne '-';
12b72891 698}
e64b1bd1 699
700#
701# Valid types: generic, LATIN1, UTF8, low, latin1, utf8
702# default return value is octects read.
703# append -cp to make it codepoint matched.
704# modifiers come after the colon, valid possibilities
705# being 'fast' and 'safe'.
706#
7071; # in the unlikely case we are being used as a module
12b72891 708
709__DATA__
710LNBREAK: Line Break: \R
e64b1bd1 711=> generic UTF8 LATIN1 :fast safe
12b72891 712"\x0D\x0A" # CRLF - Network (Windows) line ending
7130x0A # LF | LINE FEED
7140x0B # VT | VERTICAL TAB
7150x0C # FF | FORM FEED
7160x0D # CR | CARRIAGE RETURN
7170x85 # NEL | NEXT LINE
7180x2028 # LINE SEPARATOR
7190x2029 # PARAGRAPH SEPARATOR
720
721HORIZWS: Horizontal Whitespace: \h \H
e64b1bd1 722=> generic UTF8 LATIN1 cp :fast safe
12b72891 7230x09 # HT
7240x20 # SPACE
7250xa0 # NBSP
7260x1680 # OGHAM SPACE MARK
7270x180e # MONGOLIAN VOWEL SEPARATOR
7280x2000 # EN QUAD
7290x2001 # EM QUAD
7300x2002 # EN SPACE
7310x2003 # EM SPACE
7320x2004 # THREE-PER-EM SPACE
7330x2005 # FOUR-PER-EM SPACE
7340x2006 # SIX-PER-EM SPACE
7350x2007 # FIGURE SPACE
7360x2008 # PUNCTUATION SPACE
7370x2009 # THIN SPACE
7380x200A # HAIR SPACE
7390x202f # NARROW NO-BREAK SPACE
7400x205f # MEDIUM MATHEMATICAL SPACE
7410x3000 # IDEOGRAPHIC SPACE
742
743VERTWS: Vertical Whitespace: \v \V
e64b1bd1 744=> generic UTF8 LATIN1 cp :fast safe
12b72891 7450x0A # LF
7460x0B # VT
7470x0C # FF
7480x0D # CR
7490x85 # NEL
7500x2028 # LINE SEPARATOR
7510x2029 # PARAGRAPH SEPARATOR
752
e64b1bd1 753
32e6a07c 754TRICKYFOLD: Problematic fold case letters.
e64b1bd1 755=> generic cp generic-cp generic-both :fast safe
7560x00DF # LATIN1 SMALL LETTER SHARP S
32e6a07c 7570x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
7580x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
e64b1bd1 759
760