That's NAME, not TITLE.
[p5sagit/p5-mst-13.2.git] / Porting / regcharclass.pl
CommitLineData
e64b1bd1 1package CharClass::Matcher;
12b72891 2use strict;
3use warnings;
e64b1bd1 4use warnings FATAL => 'all';
12b72891 5use Text::Wrap qw(wrap);
6use Encode;
7use Data::Dumper;
e64b1bd1 8$Data::Dumper::Useqq= 1;
9our $hex_fmt= "0x%02X";
12b72891 10
ab84f958 11=head1 NAME
0ccab2bc 12
e64b1bd1 13CharClass::Matcher -- Generate C macros that match character classes efficiently
12b72891 14
e64b1bd1 15=head1 SYNOPSIS
16
ab84f958 17 perl Porting/regcharclass.pl
e64b1bd1 18
19=head1 DESCRIPTION
12b72891 20
21Dynamically generates macros for detecting special charclasses
e64b1bd1 22in latin-1, utf8, and codepoint forms. Macros can be set to return
23the length (in bytes) of the matched codepoint, or the codepoint itself.
12b72891 24
25To regenerate regcharclass.h, run this script from perl-root. No arguments
26are necessary.
27
e64b1bd1 28Using WHATEVER as an example the following macros will be produced:
12b72891 29
30=over 4
31
e64b1bd1 32=item is_WHATEVER(s,is_utf8)
12b72891 33
e64b1bd1 34=item is_WHATEVER_safe(s,e,is_utf8)
12b72891 35
e64b1bd1 36Do a lookup as appropriate based on the is_utf8 flag. When possible
12b72891 37comparisons involving octect<128 are done before checking the is_utf8
38flag, hopefully saving time.
39
e64b1bd1 40=item is_WHATEVER_utf8(s)
12b72891 41
e64b1bd1 42=item is_WHATEVER_utf8_safe(s,e)
12b72891 43
44Do a lookup assuming the string is encoded in (normalized) UTF8.
45
e64b1bd1 46=item is_WHATEVER_latin1(s)
12b72891 47
e64b1bd1 48=item is_WHATEVER_latin1_safe(s,e)
12b72891 49
50Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
51
e64b1bd1 52=item is_WHATEVER_cp(cp)
12b72891 53
54Check to see if the string matches a given codepoint (hypotethically a
55U32). The condition is constructed as as to "break out" as early as
56possible if the codepoint is out of range of the condition.
57
58IOW:
59
60 (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
61
62Thus if the character is X+1 only two comparisons will be done. Making
63matching lookups slower, but non-matching faster.
64
65=back
66
e64b1bd1 67Additionally it is possible to generate C<what_> variants that return
68the codepoint read instead of the number of octets read, this can be
69done by suffixing '-cp' to the type description.
70
71=head2 CODE FORMAT
72
73perltidy -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f"
74
75
76=head1 AUTHOR
77
78Author: Yves Orton (demerphq) 2007
79
80=head1 BUGS
81
82No tests directly here (although the regex engine will fail tests
83if this code is broken). Insufficient documentation and no Getopts
84handler for using the module as a script.
85
86=head1 LICENSE
87
88You may distribute under the terms of either the GNU General Public
89License or the Artistic License, as specified in the README file.
90
12b72891 91=cut
92
e64b1bd1 93# Sub naming convention:
94# __func : private subroutine, can not be called as a method
95# _func : private method, not meant for external use
96# func : public method.
97
98# private subs
99#-------------------------------------------------------------------------------
100#
101# ($cp,$n,$l,$u)=__uni_latin($str);
102#
103# Return a list of arrays, each of which when interepreted correctly
104# represent the string in some given encoding with specific conditions.
105#
106# $cp - list of codepoints that make up the string.
107# $n - list of octets that make up the string if all codepoints < 128
108# $l - list of octets that make up the string in latin1 encoding if all
109# codepoints < 256, and at least one codepoint is >127.
110# $u - list of octets that make up the string in utf8 if any codepoint >127
111#
112# High CP | Defined
113#-----------+----------
114# 0 - 127 : $n
115# 128 - 255 : $l, $u
116# 256 - ... : $u
117#
118
119sub __uni_latin1 {
120 my $str= shift;
121 my $max= 0;
122 my @cp;
123 for my $ch ( split //, $str ) {
124 my $cp= ord $ch;
125 push @cp, $cp;
126 $max= $cp if $max < $cp;
127 }
128 my ( $n, $l, $u );
129 if ( $max < 128 ) {
130 $n= [@cp];
131 } else {
132 $l= [@cp] if $max && $max < 256;
133
134 my $copy= $str; # must copy string, FB_CROAK makes encode destructive
135 $u= eval { Encode::encode( "utf8", $copy, Encode::FB_CROAK ) };
136 $u= [ unpack "U0C*", $u ] if $u;
12b72891 137 }
e64b1bd1 138 return ( \@cp, $n, $l, $u );
12b72891 139}
140
12b72891 141#
e64b1bd1 142# $clean= __clean($expr);
143#
144# Cleanup a ternary expression, removing unnecessary parens and apply some
145# simplifications using regexes.
146#
147
148sub __clean {
149 my ( $expr )= @_;
150 our $parens;
151 $parens= qr/ (?> \( (?> (?: (?> [^()]+ ) | (??{ $parens }) )* ) \) ) /x;
152
153 #print "$parens\n$expr\n";
154 1 while $expr =~ s/ \( \s* ( $parens ) \s* \) /$1/gx;
155 1 while $expr =~ s/ \( \s* ($parens) \s* \? \s*
156 \( \s* ($parens) \s* \? \s* ($parens|[^:]+?) \s* : \s* ($parens|[^)]+?) \s* \)
157 \s* : \s* \4 \s* \)/( ( $1 && $2 ) ? $3 : 0 )/gx;
158 return $expr;
12b72891 159}
160
e64b1bd1 161#
162# $text= __macro(@args);
163# Join args together by newlines, and then neatly add backslashes to the end
164# of every line as expected by the C pre-processor for #define's.
165#
166
167sub __macro {
168 my $str= join "\n", @_;
169 $str =~ s/\s*$//;
170 my @lines= map { s/\s+$//; s/\t/ /g; $_ } split /\n/, $str;
171 my $last= pop @lines;
172 $str= join "\n", ( map { sprintf "%-76s\\", $_ } @lines ), $last;
173 1 while $str =~ s/^(\t*) {8}/$1\t/gm;
174 return $str . "\n";
12b72891 175}
176
e64b1bd1 177#
178# my $op=__incrdepth($op);
179#
180# take an 'op' hashref and add one to it and all its childrens depths.
181#
182
183sub __incrdepth {
184 my $op= shift;
185 return unless ref $op;
186 $op->{depth} += 1;
187 __incrdepth( $op->{yes} );
188 __incrdepth( $op->{no} );
189 return $op;
190}
191
192# join two branches of an opcode together with a condition, incrementing
193# the depth on the yes branch when we do so.
194# returns the new root opcode of the tree.
195sub __cond_join {
196 my ( $cond, $yes, $no )= @_;
197 return {
198 test => $cond,
199 yes => __incrdepth( $yes ),
200 no => $no,
201 depth => 0,
202 };
203}
204
205# Methods
206
207# constructor
208#
209# my $obj=CLASS->new(op=>'SOMENAME',title=>'blah',txt=>[..]);
210#
211# Create a new CharClass::Matcher object by parsing the text in
212# the txt array. Currently applies the following rules:
213#
214# Element starts with C<0x>, line is evaled the result treated as
215# a number which is passed to chr().
216#
217# Element starts with C<">, line is evaled and the result treated
218# as a string.
219#
220# Each string is then stored in the 'strs' subhash as a hash record
221# made up of the results of __uni_latin1, using the keynames
222# 'low','latin1','utf8', as well as the synthesized 'LATIN1' and
223# 'UTF8' which hold a merge of 'low' and their lowercase equivelents.
224#
225# Size data is tracked per type in the 'size' subhash.
226#
227# Return an object
228#
12b72891 229sub new {
230 my $class= shift;
e64b1bd1 231 my %opt= @_;
232 for ( qw(op txt) ) {
233 die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field"
234 if !exists $opt{$_};
235 }
236
237 my $self= bless {
238 op => $opt{op},
239 title => $opt{title} || '',
240 }, $class;
241 foreach my $txt ( @{ $opt{txt} } ) {
242 my $str= $txt;
243 if ( $str =~ /^[""]/ ) {
244 $str= eval $str;
245 } elsif ( $str =~ /^0x/ ) {
246 $str= chr eval $str;
247 } elsif ( /\S/ ) {
248 die "Unparseable line: $txt\n";
12b72891 249 } else {
e64b1bd1 250 next;
12b72891 251 }
e64b1bd1 252 my ( $cp, $low, $latin1, $utf8 )= __uni_latin1( $str );
253 my $UTF8= $low || $utf8;
254 my $LATIN1= $low || $latin1;
255
256 @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 cp UTF8 LATIN1 )}=
257 ( $str, $txt, $low, $utf8, $latin1, $cp, $UTF8, $LATIN1 );
258 my $rec= $self->{strs}{$str};
259 foreach my $key ( qw(low utf8 latin1 cp UTF8 LATIN1) ) {
260 $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
261 if $self->{strs}{$str}{$key};
12b72891 262 }
e64b1bd1 263 $self->{has_multi} ||= @$cp > 1;
264 $self->{has_ascii} ||= $latin1 && @$latin1;
265 $self->{has_low} ||= $low && @$low;
266 $self->{has_high} ||= !$low && !$latin1;
12b72891 267 }
e64b1bd1 268 $self->{val_fmt}= $hex_fmt;
269 $self->{count}= 0 + keys %{ $self->{strs} };
12b72891 270 return $self;
271}
272
e64b1bd1 273# my $trie = make_trie($type,$maxlen);
12b72891 274#
e64b1bd1 275# using the data stored in the object build a trie of a specifc type,
276# and with specific maximum depth. The trie is made up the elements of
277# the given types array for each string in the object (assuming it is
278# not too long.)
279#
280# returns the trie, or undef if there was no relevent data in the object.
281#
282
283sub make_trie {
284 my ( $self, $type, $maxlen )= @_;
285
286 my $strs= $self->{strs};
287 my %trie;
288 foreach my $rec ( values %$strs ) {
289 die "panic: unknown type '$type'"
290 if !exists $rec->{$type};
291 my $dat= $rec->{$type};
292 next unless $dat;
293 next if $maxlen && @$dat > $maxlen;
294 my $node= \%trie;
295 foreach my $elem ( @$dat ) {
296 $node->{$elem} ||= {};
297 $node= $node->{$elem};
12b72891 298 }
e64b1bd1 299 $node->{''}= $rec->{str};
12b72891 300 }
e64b1bd1 301 return 0 + keys( %trie ) ? \%trie : undef;
12b72891 302}
303
e64b1bd1 304# my $optree= _optree()
305#
306# recursively convert a trie to an optree where every node represents
307# an if else branch.
12b72891 308#
12b72891 309#
12b72891 310
e64b1bd1 311sub _optree {
312 my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_;
313 return unless defined $trie;
314 if ( $self->{has_multi} and $ret_type =~ /cp|both/ ) {
315 die "Can't do 'cp' optree from multi-codepoint strings";
12b72891 316 }
e64b1bd1 317 $ret_type ||= 'len';
318 $else= 0 unless defined $else;
319 $depth= 0 unless defined $depth;
320
321 my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie;
322 if ( $trie->{''} ) {
323 if ( $ret_type eq 'cp' ) {
324 $else= $self->{strs}{ $trie->{''} }{cp}[0];
325 $else= sprintf "$self->{val_fmt}", $else if $else > 9;
326 } elsif ( $ret_type eq 'len' ) {
327 $else= $depth;
328 } elsif ( $ret_type eq 'both') {
329 $else= $self->{strs}{ $trie->{''} }{cp}[0];
330 $else= sprintf "$self->{val_fmt}", $else if $else > 9;
331 $else= "len=$depth, $else";
12b72891 332 }
e64b1bd1 333 }
334 return $else if !@conds;
335 my $node= {};
336 my $root= $node;
337 my ( $yes_res, $as_code, @cond );
338 my $test= $test_type eq 'cp' ? "cp" : "((U8*)s)[$depth]";
339 my $Update= sub {
340 $node->{vals}= [@cond];
341 $node->{test}= $test;
342 $node->{yes}= $yes_res;
343 $node->{depth}= $depth;
344 $node->{no}= shift;
345 };
346 while ( @conds ) {
347 my $cond= shift @conds;
348 my $res=
349 $self->_optree( $trie->{$cond}, $test_type, $ret_type, $else,
350 $depth + 1 );
351 my $res_code= Dumper( $res );
352 if ( !$yes_res || $res_code ne $as_code ) {
353 if ( $yes_res ) {
354 $Update->( {} );
355 $node= $node->{no};
356 }
357 ( $yes_res, $as_code )= ( $res, $res_code );
358 @cond= ( $cond );
12b72891 359 } else {
e64b1bd1 360 push @cond, $cond;
12b72891 361 }
362 }
e64b1bd1 363 $Update->( $else );
12b72891 364 return $root;
365}
366
e64b1bd1 367# my $optree= optree(%opts);
368#
369# Convert a trie to an optree, wrapper for _optree
370
371sub optree {
372 my $self= shift;
373 my %opt= @_;
374 my $trie= $self->make_trie( $opt{type}, $opt{max_depth} );
375 $opt{ret_type} ||= 'len';
376 my $test_type= $opt{type} eq 'cp' ? 'cp' : 'depth';
377 return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 );
12b72891 378}
379
e64b1bd1 380# my $optree= generic_optree(%opts);
381#
382# build a "generic" optree out of the three 'low', 'latin1', 'utf8'
383# sets of strings, including a branch for handling the string type check.
384#
385
386sub generic_optree {
387 my $self= shift;
388 my %opt= @_;
389
390 $opt{ret_type} ||= 'len';
391 my $test_type= 'depth';
392 my $else= $opt{else} || 0;
393
394 my $latin1= $self->make_trie( 'latin1', $opt{max_depth} );
395 my $utf8= $self->make_trie( 'utf8', $opt{max_depth} );
396
397 $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 )
398 for $latin1, $utf8;
399
400 if ( $utf8 ) {
401 $else= __cond_join( "( is_utf8 )", $utf8, $latin1 || $else );
402 } elsif ( $latin1 ) {
403 $else= __cond_join( "!( is_utf8 )", $latin1, $else );
404 }
405 my $low= $self->make_trie( 'low', $opt{max_depth} );
406 if ( $low ) {
407 $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 );
12b72891 408 }
e64b1bd1 409
410 return $else;
12b72891 411}
412
e64b1bd1 413# length_optree()
12b72891 414#
e64b1bd1 415# create a string length guarded optree.
12b72891 416#
e64b1bd1 417
418sub length_optree {
419 my $self= shift;
420 my %opt= @_;
421 my $type= $opt{type};
422
423 die "Can't do a length_optree on type 'cp', makes no sense."
424 if $type eq 'cp';
425
426 my ( @size, $method );
427
428 if ( $type eq 'generic' ) {
429 $method= 'generic_optree';
430 my %sizes= (
431 %{ $self->{size}{low} || {} },
432 %{ $self->{size}{latin1} || {} },
433 %{ $self->{size}{utf8} || {} }
434 );
435 @size= sort { $a <=> $b } keys %sizes;
436 } else {
437 $method= 'optree';
438 @size= sort { $a <=> $b } keys %{ $self->{size}{$type} };
12b72891 439 }
e64b1bd1 440
441 my $else= ( $opt{else} ||= 0 );
442 for my $size ( @size ) {
443 my $optree= $self->$method( %opt, type => $type, max_depth => $size );
444 my $cond= "((e)-(s) > " . ( $size - 1 ).")";
445 $else= __cond_join( $cond, $optree, $else );
446 }
447 return $else;
12b72891 448}
449
e64b1bd1 450# _cond_as_str
451# turn a list of conditions into a text expression
452# - merges ranges of conditions, and joins the result with ||
453sub _cond_as_str {
454 my ( $self, $op, $combine )= @_;
455 my $cond= $op->{vals};
456 my $test= $op->{test};
457 return "( $test )" if !defined $cond;
458
459 # rangify the list
460 my @ranges;
461 my $Update= sub {
462 if ( @ranges ) {
463 if ( $ranges[-1][0] == $ranges[-1][1] ) {
464 $ranges[-1]= $ranges[-1][0];
465 } elsif ( $ranges[-1][0] + 1 == $ranges[-1][1] ) {
466 $ranges[-1]= $ranges[-1][0];
467 push @ranges, $ranges[-1] + 1;
468 }
469 }
470 };
471 for my $cond ( @$cond ) {
472 if ( !@ranges || $cond != $ranges[-1][1] + 1 ) {
473 $Update->();
474 push @ranges, [ $cond, $cond ];
475 } else {
476 $ranges[-1][1]++;
477 }
478 }
479 $Update->();
480 return $self->_combine( $test, @ranges )
481 if $combine;
482 @ranges= map {
483 ref $_
484 ? sprintf(
485 "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
486 @$_ )
487 : sprintf( "$self->{val_fmt} == $test", $_ );
488 } @ranges;
489 return "( " . join( " || ", @ranges ) . " )";
12b72891 490}
491
e64b1bd1 492# _combine
493# recursively turn a list of conditions into a fast break-out condition
494# used by _cond_as_str() for 'cp' type macros.
495sub _combine {
496 my ( $self, $test, @cond )= @_;
497 return if !@cond;
498 my $item= shift @cond;
499 my ( $cstr, $gtv );
500 if ( ref $item ) {
501 $cstr=
502 sprintf( "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
503 @$item );
504 $gtv= sprintf "$self->{val_fmt}", $item->[1];
12b72891 505 } else {
e64b1bd1 506 $cstr= sprintf( "$self->{val_fmt} == $test", $item );
507 $gtv= sprintf "$self->{val_fmt}", $item;
12b72891 508 }
e64b1bd1 509 if ( @cond ) {
510 return "( $cstr || ( $gtv < $test &&\n"
511 . $self->_combine( $test, @cond ) . " ) )";
12b72891 512 } else {
e64b1bd1 513 return $cstr;
12b72891 514 }
e64b1bd1 515}
12b72891 516
e64b1bd1 517# _render()
518# recursively convert an optree to text with reasonably neat formatting
519sub _render {
520 my ( $self, $op, $combine, $brace )= @_;
521 if ( !ref $op ) {
522 return $op;
12b72891 523 }
e64b1bd1 524 my $cond= $self->_cond_as_str( $op, $combine );
525 my $yes= $self->_render( $op->{yes}, $combine, 1 );
526 my $no= $self->_render( $op->{no}, $combine, 0 );
527 return "( $cond )" if $yes eq '1' and $no eq '0';
528 my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" );
529 return "$lb$cond ? $yes : $no$rb"
530 if !ref( $op->{yes} ) && !ref( $op->{no} );
531 my $ind1= " " x 4;
532 my $ind= "\n" . ( $ind1 x $op->{depth} );
533
534 if ( ref $op->{yes} ) {
535 $yes= $ind . $ind1 . $yes;
536 } else {
537 $yes= " " . $yes;
538 }
539
540 return "$lb$cond ?$yes$ind: $no$rb";
12b72891 541}
32e6a07c 542
e64b1bd1 543# $expr=render($op,$combine)
544#
545# convert an optree to text with reasonably neat formatting. If $combine
546# is true then the condition is created using "fast breakouts" which
547# produce uglier expressions that are more efficient for common case,
548# longer lists such as that resulting from type 'cp' output.
549# Currently only used for type 'cp' macros.
550sub render {
551 my ( $self, $op, $combine )= @_;
552 my $str= "( " . $self->_render( $op, $combine ) . " )";
553 return __clean( $str );
12b72891 554}
e64b1bd1 555
556# make_macro
557# make a macro of a given type.
558# calls into make_trie and (generic_|length_)optree as needed
559# Opts are:
560# type : 'cp','generic','low','latin1','utf8','LATIN1','UTF8'
561# ret_type : 'cp' or 'len'
562# safe : add length guards to macro
563#
564# type defaults to 'generic', and ret_type to 'len' unless type is 'cp'
565# in which case it defaults to 'cp' as well.
566#
567# it is illegal to do a type 'cp' macro on a pattern with multi-codepoint
568# sequences in it, as the generated macro will accept only a single codepoint
569# as an argument.
570#
571# returns the macro.
572
573
574sub make_macro {
575 my $self= shift;
576 my %opts= @_;
577 my $type= $opts{type} || 'generic';
578 die "Can't do a 'cp' on multi-codepoint character class '$self->{op}'"
579 if $type eq 'cp'
580 and $self->{has_multi};
581 my $ret_type= $opts{ret_type} || ( $opts{type} eq 'cp' ? 'cp' : 'len' );
582 my $method;
583 if ( $opts{safe} ) {
584 $method= 'length_optree';
585 } elsif ( $type eq 'generic' ) {
586 $method= 'generic_optree';
587 } else {
588 $method= 'optree';
589 }
590 my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
591 my $text= $self->render( $optree, $type eq 'cp' );
592 my @args= $type eq 'cp' ? 'cp' : 's';
593 push @args, "e" if $opts{safe};
594 push @args, "is_utf8" if $type eq 'generic';
595 push @args, "len" if $ret_type eq 'both';
596 my $pfx= $ret_type eq 'both' ? 'what_len_' :
597 $ret_type eq 'cp' ? 'what_' : 'is_';
598 my $ext= $type eq 'generic' ? '' : '_' . lc( $type );
599 $ext .= "_safe" if $opts{safe};
600 my $argstr= join ",", @args;
601 return "/*** GENERATED CODE ***/\n"
602 . __macro( "#define $pfx$self->{op}$ext($argstr)\n$text" );
32e6a07c 603}
e64b1bd1 604
605# if we arent being used as a module (highly likely) then process
606# the __DATA__ below and produce macros in regcharclass.h
607# if an argument is provided to the script then it is assumed to
608# be the path of the file to output to, if the arg is '-' outputs
609# to STDOUT.
610if ( !caller ) {
611
612
613
614 $|++;
615 my $path= shift @ARGV;
616
617 if ( !$path ) {
618 $path= "regcharclass.h";
619 if ( !-e $path ) { $path= "../$path" }
620 if ( !-e $path ) { die "Can't find '$path' to update!\n" }
621 }
622 my $out_fh;
623 if ( $path eq '-' ) {
624 $out_fh= \*STDOUT;
625 } else {
626 rename $path, "$path.bak";
627 open $out_fh, ">", $path
628 or die "Can't write to '$path':$!";
629 binmode $out_fh; # want unix line endings even when run on win32.
630 }
631 my ( $zero )= $0 =~ /([^\\\/]+)$/;
632 print $out_fh <<"HEADER";
58fbde93 633/* -*- buffer-read-only: t -*-
634 *
635 * regcharclass.h
636 *
637 * Copyright (C) 2007, by Larry Wall and others
638 *
639 * You may distribute under the terms of either the GNU General Public
640 * License or the Artistic License, as specified in the README file.
641 *
642 * !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
643 * This file is built by Porting/$zero.
644 * (Generated at: @{[ scalar gmtime ]} GMT)
645 * Any changes made here will be lost!
646 */
12b72891 647
648HEADER
649
e64b1bd1 650 my ( $op, $title, @txt, @types, @mods );
651 my $doit= sub {
652 return unless $op;
653 print $out_fh "/*\n\t$op: $title\n\n";
654 print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
655 my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt );
656
657 #die Dumper(\@types,\@mods);
658
659 foreach my $type_spec ( @types ) {
660 my ( $type, $ret )= split /-/, $type_spec;
661 $ret ||= 'len';
662 foreach my $mod ( @mods ) {
663 next if $mod eq 'safe' and $type eq 'cp';
664 my $macro= $obj->make_macro(
665 type => $type,
666 ret_type => $ret,
667 safe => $mod eq 'safe'
668 );
669 print $out_fh $macro, "\n";
670 }
32e6a07c 671 }
e64b1bd1 672 };
673
674 while ( <DATA> ) {
675 s/^\s*#//;
676 next unless /\S/;
677 chomp;
678 if ( /^([A-Z]+)/ ) {
679 $doit->();
680 ( $op, $title )= split /\s*:\s*/, $_, 2;
681 @txt= ();
682 } elsif ( s/^=>// ) {
683 my ( $type, $modifier )= split /:/, $_;
684 @types= split ' ', $type;
685 @mods= split ' ', $modifier;
686 } else {
687 push @txt, "$_";
12b72891 688 }
689 }
e64b1bd1 690 $doit->();
691 print $out_fh "/* ex: set ro: */\n";
692 print "updated $path\n" if $path ne '-';
12b72891 693}
e64b1bd1 694
695#
696# Valid types: generic, LATIN1, UTF8, low, latin1, utf8
697# default return value is octects read.
698# append -cp to make it codepoint matched.
699# modifiers come after the colon, valid possibilities
700# being 'fast' and 'safe'.
701#
7021; # in the unlikely case we are being used as a module
12b72891 703
704__DATA__
705LNBREAK: Line Break: \R
e64b1bd1 706=> generic UTF8 LATIN1 :fast safe
12b72891 707"\x0D\x0A" # CRLF - Network (Windows) line ending
7080x0A # LF | LINE FEED
7090x0B # VT | VERTICAL TAB
7100x0C # FF | FORM FEED
7110x0D # CR | CARRIAGE RETURN
7120x85 # NEL | NEXT LINE
7130x2028 # LINE SEPARATOR
7140x2029 # PARAGRAPH SEPARATOR
715
716HORIZWS: Horizontal Whitespace: \h \H
e64b1bd1 717=> generic UTF8 LATIN1 cp :fast safe
12b72891 7180x09 # HT
7190x20 # SPACE
7200xa0 # NBSP
7210x1680 # OGHAM SPACE MARK
7220x180e # MONGOLIAN VOWEL SEPARATOR
7230x2000 # EN QUAD
7240x2001 # EM QUAD
7250x2002 # EN SPACE
7260x2003 # EM SPACE
7270x2004 # THREE-PER-EM SPACE
7280x2005 # FOUR-PER-EM SPACE
7290x2006 # SIX-PER-EM SPACE
7300x2007 # FIGURE SPACE
7310x2008 # PUNCTUATION SPACE
7320x2009 # THIN SPACE
7330x200A # HAIR SPACE
7340x202f # NARROW NO-BREAK SPACE
7350x205f # MEDIUM MATHEMATICAL SPACE
7360x3000 # IDEOGRAPHIC SPACE
737
738VERTWS: Vertical Whitespace: \v \V
e64b1bd1 739=> generic UTF8 LATIN1 cp :fast safe
12b72891 7400x0A # LF
7410x0B # VT
7420x0C # FF
7430x0D # CR
7440x85 # NEL
7450x2028 # LINE SEPARATOR
7460x2029 # PARAGRAPH SEPARATOR
747
e64b1bd1 748
32e6a07c 749TRICKYFOLD: Problematic fold case letters.
e64b1bd1 750=> generic cp generic-cp generic-both :fast safe
7510x00DF # LATIN1 SMALL LETTER SHARP S
32e6a07c 7520x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
7530x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
e64b1bd1 754
755