Silence 5 "possible loss of data" warnings from VC6
[p5sagit/p5-mst-13.2.git] / Porting / regcharclass.pl
CommitLineData
12b72891 1package UTF8::Matcher;
2use strict;
3use warnings;
4use Text::Wrap qw(wrap);
5use Encode;
6use Data::Dumper;
7
0ccab2bc 8our $hex_fmt="0x%02X";
9
12b72891 10# Author: Yves Orton (demerphq) 2007.
11
12=pod
13
14Dynamically generates macros for detecting special charclasses
15in both latin-1, utf8, and codepoint forms.
16
17To regenerate regcharclass.h, run this script from perl-root. No arguments
18are necessary.
19
20Each charclass handler is constructed as follows:
21Each string the charclass must match is rendered as unicode (codepoints>255),
22and if possible as latin1 (codepoints>127), and if possible as "neutral"
23(all codepoints<128).
24
25The rendered strings are then inserted into digit-tries by type and length.
26With shorter strings being added to tries that are allowed to contain longer
27strings, but not vice versa. Thus the "longest" trie contains all strings
28for that charclass.
29
30The following types of trie are generated:
31
32 n - Neutral only. All strings in this type have codepoints<128
33 l - Latin1 only. All strings in this type have a codepoint>127 in them
34 u - UTF8 only. All strings in this type have a codepoint>255 in them
35 L - Latin1. All strings in 'n' and 'l'
36 U - UTF8. All string in 'n' and 'u'
37 c - Codepoint. All strings in U but in codepoint and not utf8 form.
38
39The ternary() routine is responsible for converting the trie data into a
40ternary conditional that matches the required set of strings. The generated
41macro normally takes at least the argument 's' which is expected to be a
42pointer of type C<char *> or C<U8 *>. The condition generated will be
43optimised to match the string as efficiently as possible, with range lookups
44being used where possible, and in some situations relying on "true" to be 1.
45
46ternary() takes two optional arguments, $type which is one of the above
47characters and $ext which is used to add an extra extension to the macro name.
48
49If $type is omitted or false then the generated macro will take an additional
50argument, 'is_utf8'.
51
52If $ext has the string 'safe' in it then the generated macro will take an extra
53argument 'e' for the end of the string, and all lookups will be length checked
54to prevent lookups past e. If 'safe' is not used then the lookup is assumed to
55be guaranteed safe, and no 'e' argument is provided and no length checks are
56made during execution.
57
58The 'c' type is different as compared to the rest. Instead of producing
59a condition that does octet comparisons of a string array, the 'c' type
60produces a macro that takes a single codepoint as an argument (instead of a
61char* or U8*) and does the lookup based on only that char, thus it cannot be
62used to match multi-codepoint sequences like "\r\n" in the LNBREAK charclass.
63This is primarily used for populating charclass bitmaps for codepoints 0..255
64but will also match codepoints in the unicode range if necessary.
65
66Using LNBREAK as an example the following macros will be produced:
67
68=over 4
69
70=item is_LNBREAK(s,is_utf8)
71
72=item is_LNBREAK_safe(s,e,is_utf8)
73
74Do a lookup as apporpriate based on the is_utf8 flag. When possible
75comparisons involving octect<128 are done before checking the is_utf8
76flag, hopefully saving time.
77
78=item is_LNBREAK_utf8(s)
79
80=item is_LNBREAK_utf8_safe(s,e)
81
82Do a lookup assuming the string is encoded in (normalized) UTF8.
83
84=item is_LNBREAK_latin1(s)
85
86=item is_LNBREAK_latin1_safe(s,e)
87
88Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
89
90=item is_LNBREAK_cp(cp)
91
92Check to see if the string matches a given codepoint (hypotethically a
93U32). The condition is constructed as as to "break out" as early as
94possible if the codepoint is out of range of the condition.
95
96IOW:
97
98 (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
99
100Thus if the character is X+1 only two comparisons will be done. Making
101matching lookups slower, but non-matching faster.
102
103=back
104
105=cut
106
107# store a list of numbers into a hash based trie.
108sub _trie_store {
109 my $root= shift;
110 foreach my $b ( @_ ) {
111 $root->{$b} ||= {};
112 $root= $root->{$b};
113 }
114 $root->{''}++;
115}
116
117# Convert a string into its neutral, latin1, utf8 forms, where
118# the form is undefined unless the string can be completely represented
119# in that form. The string is then decomposed into the octects representing
120# it. A list is returned for each. Additional a list of codepoints making
121# up the string.
122# returns (\@n,\@u,\@l,\@cp)
123#
124sub _uni_latin1 {
125 my $str= shift;
126 my $u= eval { Encode::encode( "utf8", "$str", Encode::FB_CROAK ) };
127 my $l= eval { Encode::encode( "iso-8859-1", "$str", Encode::FB_CROAK ) };
128 my $n= $l;
129 undef $n if defined( $n ) && $str =~ /[^\x00-\x7F]/;
130 return ((map { $_ ? [ unpack "U0C*", $_ ] : $_ } ( $n, $u, $l )),
131 [map { ord $_ } split //,$str]);
132}
133
134# store an array ref of char data into the appropriate
135# type bins, tracking sizes as we go.
136sub _store {
137 my ( $self, $r, @k )= @_;
138 for my $z ( @k ) {
139 $self->{size}{$z}{ 0 + @$r }++;
140 push @{ $self->{data}{$z} }, $r;
141 }
142}
143
144# construct a new charclass constructor object.
145# $title ends up in the code a as a comment.
146# $opcode is the name of the operation the charclass implements.
147# the rest of the arguments are strings that the charclass
148# can match.
149sub new {
150 my $class= shift;
151 my $title= shift;
152 my $opcode= shift;
153 my $self= bless { op => $opcode, title => $title }, $class;
154 my %seen;
155 # convert the strings to the numeric equivelents and store
156 # them for later insertion while tracking their sizes.
157 foreach my $seq ( @_ ) {
158 next if $seen{$seq}++;
159 push @{$self->{seq}},$seq;
160 my ( $n, $u, $l,$cp )= _uni_latin1( $seq );
161 if ( $n ) {
162 _store( $self, $n, qw(n U L) );
163 } else {
164 if ( $l ) {
165 _store( $self, $l, qw(l L) );
166 }
167 _store( $self, $u, qw(u U) );
168 }
169 _store($self,$cp,'c');
170 }
171 #
172 # now construct the tries. For each type of data we insert
173 # the data into all the tries of length $size and smaller.
174 #
175
176 my %allsize;
177 foreach my $k ( keys %{ $self->{data} } ) {
178 my @size= sort { $b <=> $a } keys %{ $self->{size}{$k} };
179 $self->{size}{$k}=\@size;
180 undef @allsize{@size};
181 foreach my $d ( @{ $self->{data}{$k} } ) {
182 foreach my $sz ( @size ) {
183 last if $sz < @$d;
184 $self->{trie}{$k}{$sz} ||= {};
185 _trie_store( $self->{trie}{$k}{$sz}, @$d );
186 }
187 }
188 #delete $self->{data}{$k};
189 }
190 my @size= sort { $b <=> $a } keys %allsize;
191 $self->{size}{''}= \@size;
192 return $self;
193}
194
195#
196# _cond([$v1,$v2,$v2...],$ofs)
197#
198# converts an array of codepoints into a conditional expression
199# consequtive codepoints are merged into a range test
200# returns a string containing the conditional expression in the form
201# '( li[x]==v || li[x]==y )' When possible we also use range lookups.
202
203sub _cond {
204 my ( $c, $ofs,$fmt )= @_;
205 $fmt||='((U8*)s)[%d]';
206 # cheapo rangification routine.
207 # Convert the first element into a singleton represented
208 # as [$x,$x] and then merge the rest in as we go.
209 my @v= sort { $a <=> $b } @$c;
210 my @r= ( [ ( shift @v ) x 2 ] );
211 for my $n ( @v ) {
212 if ( $n == $r[-1][1] + 1 ) {
213 $r[-1][1]++;
214 } else {
215 push @r, [ $n, $n ];
216 }
217 }
218 @r = map { $_->[0]==$_->[1]-1 ? ([$_->[0],$_->[0]],[$_->[1],$_->[1]]) : $_} @r;
219 # sort the ranges by size and order.
220 @r= sort { $a->[0] <=> $b->[0] } @r;
221 my $alu= sprintf $fmt,$ofs; # C array look up
222
223 if ($fmt=~/%d/) {
224 # map the ranges into conditions
225 @r= map {
226 # singleton
0ccab2bc 227 $_->[0] == $_->[1] ? sprintf("$alu == $hex_fmt",$_->[0]) :
12b72891 228 # range
0ccab2bc 229 sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_)
12b72891 230 } @r;
231 # return the joined results.
232 return '( ' . join( " || ", @r ) . ' )';
233 } else {
234 return combine($alu,@r);
235 }
236}
237
238#
239# Do the condition in such a way that we break out early if the value
240# we are looking at is in between two elements in the list.
241# Currently used only for codepoint macros (depth 1)
242#
243sub combine {
244 my $alu=shift;
245 local $_ = shift;
246 my $txt= $_->[0] == $_->[1]
0ccab2bc 247 ? sprintf("$alu == $hex_fmt",$_->[0])
248 : sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_);
12b72891 249 return $txt unless @_;
250 return "( $txt || ( $alu > $_->[1] && \n".combine($alu,@_)." ) )";
251}
252
253# recursively convert a trie to an optree represented by
254# [condition,yes,no] where yes and no can be a ref to another optree
255# or a scalar representing code.
256# called by make_optree
257
258sub _trie_to_optree {
259 my ( $node, $ofs, $else, $fmt )= @_;
260 return $else unless $node;
261 $ofs ||= 0;
262 if ( $node->{''} ) {
263 $else= $ofs;
264 } else {
265 $else ||= 0;
266 }
267 my @k= sort { $b->[1] cmp $a->[1] || $a->[0] <=> $b->[0] }
268 map { [ $_, Dumper( $node->{$_} ), $node->{$_} ] }
269 grep length, keys %$node;
270
271 return $ofs if !@k;
272
273 my ( $root, $expr );
274 while ( @k ) {
275 my @cond= ( $k[0][0] );
276 my $d= $k[0][1];
277 my $r= $k[0][2];
278 shift @k;
279 while ( @k && $k[0][1] eq $d ) {
280 push @cond, $k[0][0];
281 shift @k;
282 }
283 my $op=
284 [ _cond( \@cond, $ofs, $fmt ), _trie_to_optree( $r, $ofs + 1, $else, $fmt ) ];
285 if ( !$root ) {
286 $root= $expr= $op;
287 } else {
288 push @$expr, $op;
289 $expr= $op;
290 }
291 }
292 push @$expr, $else;
293 return $root;
294}
295
296# construct the optree for a type.
297# handles the special logic of type ''.
298sub make_optree {
299 my ( $self, $type, $size, $fmt )= @_;
300 my $else= 0;
301 $size||=$self->{size}{$type}[0];
302 $size=1 if $type eq 'c';
303 if ( !$type ) {
304 my ( $u, $l );
305 for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) {
306 $u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt );
307 }
308 for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) {
309 $l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt );
310 }
311 if ( $u ) {
312 $else= [ '(is_utf8)', $u, $l || 0 ];
313 } elsif ( $l ) {
314 $else= [ '(!is_utf8)', $l, 0 ];
315 }
316 $type= 'n';
317 $size-- while !$self->{trie}{n}{$size};
318 }
319 return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt );
320}
321
322# construct the optree for a type with length checks to prevent buffer
323# overruns. Only one length check is performed per lookup trading code
324# size for speed.
325sub length_optree {
326 my ( $self, $type,$fmt )= @_;
327 $type ||= '';
328 return $self->{len_op}{$type} if $self->{len_op}{$type};
329 my @size = @{$self->{size}{$type}};
330
331 my ( $root, $expr );
332 foreach my $size ( @size ) {
333 my $op= [
334 "( (e) - (s) > " . ( $size - 1 ) . " )",
335 $self->make_optree( $type, $size ),
336 ];
337 if ( !$root ) {
338 $root= $expr= $op;
339 } else {
340 push @$expr, $op;
341 $expr= $op;
342 }
343 }
344 push @$expr, 0;
345 return $self->{len_op}{$type}= $root ? $root : $expr->[0];
346}
347
348#
349# recursively walk an optree and covert it to a huge nested ternary expression.
350#
351sub _optree_to_ternary {
352 my ( $node )= @_;
353 return $node
354 if !ref $node;
355 my $depth = 0;
356 if ( $node->[0] =~ /\[(\d+)\]/ ) {
357 $depth= $1 + 1;
358 }
359 return sprintf "\n%s( %s ? %s : %s )", " " x $depth, $node->[0],
360 _optree_to_ternary( $node->[1] ), _optree_to_ternary( $node->[2] );
361}
362
363# add \\ to the end of strings in a reasonable neat way.
364sub _macro($) {
365 my $str= shift;
366 my @lines= split /[^\S\n]*\n/, $str;
0ccab2bc 367 my $macro = join( "\\\n", map { sprintf "%-76s", $_ } @lines );
368 $macro =~ s/ *$//;
369 return $macro . "\n\n";
12b72891 370}
371
372# default type extensions. 'uln' dont have one because normally
373# they are used only as part of type '' which doesnt get an extension
374my %ext= (
375 U => '_utf8',
376 L => '_latin1',
377 c => '_cp',
378
379);
380
381# produce the ternary, handling arguments and putting on the macro headers
382# and boiler plate
383sub ternary {
384 my ( $self, $type, $ext )= @_;
385 $type ||= '';
386 $ext = ($ext{$type} || '') . ($ext||"");
387 my ($root,$fmt,$arg);
388 if ($type eq 'c') {
389 $arg= $fmt= 'cp';
390 } else {
391 $arg= 's';
392 }
393 if ( $type eq 'c' || $ext !~ /safe/) {
394 $root= $self->make_optree( $type, 0, $fmt );
395 } else {
396 $root= $self->length_optree( $type, $fmt );
397 }
398
399 our $parens;
400 $parens= qr/ \( (?: (?> [^()]+? ) | (??{$parens}) )+? \) /x;
401 my $expr= qr/
402 \( \s*
403 ($parens)
404 \s* \? \s*
405 \( \s*
406 ($parens)
407 \s* \? \s*
408 (\d+|$parens)
409 \s* : \s*
410 (\d+|$parens)
411 \s* \)
412 \s* : \s*
413 \4
414 \s* \)
415 /x;
416 my $code= _optree_to_ternary( $root );
417 for ( $code ) {
418 s/^\s*//;
419 1 while s/\(\s*($parens)\s*\?\s*1\s*:\s*0\s*\)/$1/g
420 || s<$expr><(($1 && $2) ? $3 : $4)>g
421 || s<\(\s*($parens)\s*\)><$1>g;
422 }
423 my @args=($arg);
424 push @args,'e' if $ext=~/safe/;
425 push @args,'is_utf8' if !$type;
426 my $args=join ",",@args;
427 return "/*** GENERATED CODE ***/\n"
428 . _macro "#define is_$self->{op}$ext($args)\n$code";
429}
430
431my $path=shift @ARGV;
432if (!$path) {
433 $path= "regcharclass.h";
434 if (!-e $path) { $path="../$path" }
435 if (!-e $path) { die "Can't find regcharclass.h to update!\n" };
436}
437
438rename $path,"$path.bak";
439open my $out_fh,">",$path
440 or die "Can't write to '$path':$!";
441binmode $out_fh; # want unix line endings even when run on win32.
58fbde93 442my ($zero) = $0=~/([^\\\/]+)$/;
12b72891 443print $out_fh <<"HEADER";
58fbde93 444/* -*- buffer-read-only: t -*-
445 *
446 * regcharclass.h
447 *
448 * Copyright (C) 2007, by Larry Wall and others
449 *
450 * You may distribute under the terms of either the GNU General Public
451 * License or the Artistic License, as specified in the README file.
452 *
453 * !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
454 * This file is built by Porting/$zero.
455 * (Generated at: @{[ scalar gmtime ]} GMT)
456 * Any changes made here will be lost!
457 */
12b72891 458
459HEADER
460
461my ($op,$title,@strs,@txt);
462my $doit= sub {
463 return unless $op;
464 my $o= __PACKAGE__->new($title,$op,@strs);
465 print $out_fh "/*\n\t$o->{op}: $o->{title}\n\n";
466 print $out_fh join "\n",@txt,"*/","";
467 for ('', 'U', 'L') {
468 print $out_fh $o->ternary( $_ );
469 print $out_fh $o->ternary( $_,'_safe' );
470 }
471 print $out_fh $o->ternary( 'c' );
472};
473while (<DATA>) {
474 next unless /\S/;
475 chomp;
476 if (/^([A-Z]+)/) {
477 $doit->();
478 ($op,$title)=split /\s*:\s*/,$_,2;
479 @txt=@strs=();
480 } else {
481 push @txt, "\t$_";
482 s/#.*$//;
483 if (/^0x/) {
484 push @strs,map { chr $_ } eval $_;
485 } elsif (/^[""'']/) {
486 push @strs,eval $_;
487 }
488 }
489}
490$doit->();
58fbde93 491print $out_fh "/* ex: set ro: */\n";
12b72891 492print "$path has been updated\n";
493
494__DATA__
495LNBREAK: Line Break: \R
496"\x0D\x0A" # CRLF - Network (Windows) line ending
4970x0A # LF | LINE FEED
4980x0B # VT | VERTICAL TAB
4990x0C # FF | FORM FEED
5000x0D # CR | CARRIAGE RETURN
5010x85 # NEL | NEXT LINE
5020x2028 # LINE SEPARATOR
5030x2029 # PARAGRAPH SEPARATOR
504
505HORIZWS: Horizontal Whitespace: \h \H
5060x09 # HT
5070x20 # SPACE
5080xa0 # NBSP
5090x1680 # OGHAM SPACE MARK
5100x180e # MONGOLIAN VOWEL SEPARATOR
5110x2000 # EN QUAD
5120x2001 # EM QUAD
5130x2002 # EN SPACE
5140x2003 # EM SPACE
5150x2004 # THREE-PER-EM SPACE
5160x2005 # FOUR-PER-EM SPACE
5170x2006 # SIX-PER-EM SPACE
5180x2007 # FIGURE SPACE
5190x2008 # PUNCTUATION SPACE
5200x2009 # THIN SPACE
5210x200A # HAIR SPACE
5220x202f # NARROW NO-BREAK SPACE
5230x205f # MEDIUM MATHEMATICAL SPACE
5240x3000 # IDEOGRAPHIC SPACE
525
526VERTWS: Vertical Whitespace: \v \V
5270x0A # LF
5280x0B # VT
5290x0C # FF
5300x0D # CR
5310x85 # NEL
5320x2028 # LINE SEPARATOR
5330x2029 # PARAGRAPH SEPARATOR
534