[p5sagit/p5-mst-13.2.git] / Porting / regcharclass.pl

package UTF8::Matcher;
use strict;
use warnings;
use Text::Wrap qw(wrap);
use Encode;
use Data::Dumper;

# Author: Yves Orton (demerphq) 2007.

=pod

Dynamically generates macros for detecting special charclasses
in both latin-1, utf8, and codepoint forms.

To regenerate regcharclass.h, run this script from perl-root. No arguments
are necessary.

Each charclass handler is constructed as follows:
Each string the charclass must match  is rendered as unicode (codepoints>255),
and if possible  as latin1 (codepoints>127), and if possible as "neutral"
(all codepoints<128).

The rendered strings are then inserted into digit-tries by type and length.
With shorter strings being added to tries that are allowed to contain longer
strings, but not vice versa.  Thus the "longest" trie contains all strings
for that charclass.

The following types of trie are generated:

  n - Neutral only. All strings in this type have codepoints<128
  l - Latin1 only. All strings in this type have a codepoint>127 in them
  u - UTF8 only.   All strings in this type have a codepoint>255 in them
  L - Latin1. All strings in 'n' and 'l'
  U - UTF8.   All string in 'n' and 'u'
  c - Codepoint. All strings in U but in codepoint and not utf8 form.

The ternary() routine is responsible for converting the trie data into a
ternary conditional that matches the required set of strings. The generated
macro normally takes at least the argument 's' which is expected to be a
pointer of type C<char *> or C<U8 *>. The condition generated will be
optimised to match the string as efficiently as possible, with range lookups
being used where possible, and in some situations relying on "true" to be 1.

ternary() takes two optional arguments, $type which is one of the above
characters and $ext which is used to add an extra extension to the macro name.

If $type is omitted or false then the generated macro will take an additional
argument, 'is_utf8'.

If $ext has the string 'safe' in it then the generated macro will take an extra
argument 'e' for the end of the string, and all lookups will be length checked
to prevent lookups past e. If 'safe' is not used then the lookup is assumed to
be guaranteed safe, and no 'e' argument is provided  and no length checks are
made during execution.

The 'c' type is different as compared to the rest. Instead of producing
a condition that does octet comparisons of a string array, the 'c' type
produces a macro that takes a single codepoint as an argument (instead of a
char* or U8*) and does the lookup based on only that char, thus it cannot be
used to match multi-codepoint sequences like "\r\n" in the LNBREAK charclass.
This is primarily used for populating charclass bitmaps for codepoints 0..255
but will also match codepoints in the unicode range if necessary.

Using LNBREAK as an example the following macros will be produced:

=over 4

=item is_LNBREAK(s,is_utf8)

=item is_LNBREAK_safe(s,e,is_utf8)

Do a lookup as apporpriate based on the is_utf8 flag. When possible
comparisons involving octect<128 are done before checking the is_utf8
flag, hopefully saving time.

=item is_LNBREAK_utf8(s)

=item is_LNBREAK_utf8_safe(s,e)

Do a lookup assuming the string is encoded in (normalized) UTF8.

=item is_LNBREAK_latin1(s)

=item is_LNBREAK_latin1_safe(s,e)

Do a lookup assuming the string is encoded in latin-1 (aka plan octets).

=item is_LNBREAK_cp(cp)

Check to see if the string matches a given codepoint (hypotethically a
U32). The condition is constructed as as to "break out" as early as
possible if the codepoint is out of range of the condition.

IOW:

  (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))

Thus if the character is X+1 only two comparisons will be done. Making
matching lookups slower, but non-matching faster.

=back

=cut

# store a list of numbers into a hash based trie.
sub _trie_store {
    my $root= shift;
    foreach my $b ( @_ ) {
        $root->{$b} ||= {};
        $root= $root->{$b};
    }
    $root->{''}++;
}

# Convert a string into its neutral, latin1, utf8 forms, where
# the form is undefined unless the string can be completely represented
# in that form. The string is then decomposed into the octects representing
# it. A list is returned for each. Additional a list of codepoints making
# up the string.
# returns (\@n,\@u,\@l,\@cp)
#
sub _uni_latin1 {
    my $str= shift;
    my $u= eval { Encode::encode( "utf8",       "$str", Encode::FB_CROAK ) };
    my $l= eval { Encode::encode( "iso-8859-1", "$str", Encode::FB_CROAK ) };
    my $n= $l;
    undef $n if defined( $n ) && $str =~ /[^\x00-\x7F]/;
    return ((map { $_ ? [ unpack "U0C*", $_ ] : $_ } ( $n, $u, $l )),
            [map { ord $_ } split //,$str]);
}

# store an array ref of char data into the appropriate
# type bins, tracking sizes as we go.
sub _store {
    my ( $self, $r, @k )= @_;
    for my $z ( @k ) {
        $self->{size}{$z}{ 0 + @$r }++;
        push @{ $self->{data}{$z} }, $r;
    }
}

# construct a new charclass constructor object.
# $title ends up in the code a as a comment.
# $opcode is the name of the operation the charclass implements.
# the rest of the arguments are strings that the charclass
# can match.
sub new {
    my $class= shift;
    my $title= shift;
    my $opcode= shift;
    my $self= bless { op => $opcode, title => $title }, $class;
    my %seen;
    # convert the strings to the numeric equivelents and store
    # them for later insertion while tracking their sizes.
    foreach my $seq ( @_ ) {
        next if $seen{$seq}++;
        push @{$self->{seq}},$seq;
        my ( $n, $u, $l,$cp )= _uni_latin1( $seq );
        if ( $n ) {
            _store( $self, $n, qw(n U L) );
        } else {
            if ( $l ) {
                _store( $self, $l, qw(l L) );
            }
            _store( $self, $u, qw(u U) );
        }
        _store($self,$cp,'c');
    }
    #
    # now construct the tries. For each type of data we insert
    # the data into all the tries of length $size and smaller.
    #

    my %allsize;
    foreach my $k ( keys %{ $self->{data} } ) {
        my @size= sort { $b <=> $a } keys %{ $self->{size}{$k} };
        $self->{size}{$k}=\@size;
        undef @allsize{@size};
        foreach my $d ( @{ $self->{data}{$k} } ) {
            foreach my $sz ( @size ) {
                last if $sz < @$d;
                $self->{trie}{$k}{$sz} ||= {};
                _trie_store( $self->{trie}{$k}{$sz}, @$d );
            }
        }
        #delete $self->{data}{$k};
    }
    my @size= sort { $b <=> $a } keys %allsize;
    $self->{size}{''}= \@size;
    return $self;
}

#
# _cond([$v1,$v2,$v2...],$ofs)
#
# converts an array of codepoints into a conditional expression
# consequtive codepoints are merged into a range test
# returns a string containing the conditional expression in the form
# '( li[x]==v || li[x]==y )' When possible we also use range lookups.

sub _cond {
    my ( $c, $ofs,$fmt )= @_;
    $fmt||='((U8*)s)[%d]';
    # cheapo rangification routine.
    # Convert the first element into a singleton represented
    # as [$x,$x] and then merge the rest in as we go.
    my @v= sort { $a <=> $b } @$c;
    my @r= ( [ ( shift @v ) x 2 ] );
    for my $n ( @v ) {
        if ( $n == $r[-1][1] + 1 ) {
            $r[-1][1]++;
        } else {
            push @r, [ $n, $n ];
        }
    }
    @r = map { $_->[0]==$_->[1]-1 ? ([$_->[0],$_->[0]],[$_->[1],$_->[1]]) : $_} @r;
    # sort the ranges by size and order.
    @r= sort { $a->[0] <=> $b->[0] }  @r;
    my $alu= sprintf $fmt,$ofs;    # C array look up

    if ($fmt=~/%d/) {
        # map the ranges into conditions
        @r= map {
            # singleton
            $_->[0] == $_->[1] ? "$alu==$_->[0]" :
            # range
            "($_->[0]<=$alu && $alu<=$_->[1])"
        } @r;
        # return the joined results.
        return '( ' . join( " || ", @r ) . ' )';
    } else {
        return combine($alu,@r);
    }
}

#
# Do the condition in such a way that we break out early if the value
# we are looking at is in between two elements in the list.
# Currently used only for codepoint macros (depth 1)
#
sub combine {
    my $alu=shift;
    local $_ = shift;
    my $txt= $_->[0] == $_->[1]
           ? "$alu==$_->[0]"
           : "($_->[0]<=$alu && $alu<=$_->[1])";
    return $txt unless @_;
    return "( $txt || ( $alu > $_->[1] && \n".combine($alu,@_)." ) )";
}

# recursively convert a trie to an optree represented by
# [condition,yes,no] where  yes and no can be a ref to another optree
# or a scalar representing code.
# called by make_optree

sub _trie_to_optree {
    my ( $node, $ofs, $else, $fmt )= @_;
    return $else unless $node;
    $ofs ||= 0;
    if ( $node->{''} ) {
        $else= $ofs;
    } else {
        $else ||= 0;
    }
    my @k= sort { $b->[1] cmp $a->[1] || $a->[0] <=> $b->[0] }
      map { [ $_, Dumper( $node->{$_} ), $node->{$_} ] }
      grep length, keys %$node;

    return $ofs if !@k;

    my ( $root, $expr );
    while ( @k ) {
        my @cond= ( $k[0][0] );
        my $d= $k[0][1];
        my $r= $k[0][2];
        shift @k;
        while ( @k && $k[0][1] eq $d ) {
            push @cond, $k[0][0];
            shift @k;
        }
        my $op=
          [ _cond( \@cond, $ofs, $fmt ), _trie_to_optree( $r, $ofs + 1, $else, $fmt ) ];
        if ( !$root ) {
            $root= $expr= $op;
        } else {
            push @$expr, $op;
            $expr= $op;
        }
    }
    push @$expr, $else;
    return $root;
}

# construct the optree for a type.
# handles the special logic of type ''.
sub make_optree {
    my ( $self, $type, $size, $fmt )= @_;
    my $else= 0;
    $size||=$self->{size}{$type}[0];
    $size=1 if $type eq 'c';
    if ( !$type ) {
        my ( $u, $l );
        for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) {
            $u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt );
        }
        for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) {
            $l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt );
        }
        if ( $u ) {
            $else= [ '(is_utf8)', $u, $l || 0 ];
        } elsif ( $l ) {
            $else= [ '(!is_utf8)', $l, 0 ];
        }
        $type= 'n';
        $size-- while !$self->{trie}{n}{$size};
    }
    return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt );
}

# construct the optree for a type with length checks to prevent buffer
# overruns. Only one length check is performed per lookup trading code
# size for speed.
sub length_optree {
    my ( $self, $type,$fmt )= @_;
    $type ||= '';
    return $self->{len_op}{$type} if $self->{len_op}{$type};
    my @size = @{$self->{size}{$type}};

    my ( $root, $expr );
    foreach my $size ( @size ) {
        my $op= [
            "( (e) - (s) > " . ( $size - 1 ) . " )",
            $self->make_optree( $type, $size ),
        ];
        if ( !$root ) {
            $root= $expr= $op;
        } else {
            push @$expr, $op;
            $expr= $op;
        }
    }
    push @$expr, 0;
    return $self->{len_op}{$type}= $root ? $root : $expr->[0];
}

#
# recursively walk an optree and covert it to a huge nested ternary expression.
#
sub _optree_to_ternary {
    my ( $node )= @_;
    return $node
      if !ref $node;
    my $depth = 0;
    if ( $node->[0] =~ /\[(\d+)\]/ ) {
        $depth= $1 + 1;
    }
    return sprintf "\n%s( %s ? %s : %s )", "  " x $depth, $node->[0],
      _optree_to_ternary( $node->[1] ), _optree_to_ternary( $node->[2] );
}

# add \\ to the end of strings in a reasonable neat way.
sub _macro($) {
    my $str= shift;
    my @lines= split /[^\S\n]*\n/, $str;
    return join( "\\\n", map { sprintf "%-76s", $_ } @lines ) . "\n\n";
}

# default type extensions. 'uln' dont have one because normally
# they are used only as part of type '' which doesnt get an extension
my %ext= (
    U => '_utf8',
    L => '_latin1',
    c => '_cp',

);

# produce the ternary, handling arguments and putting on the macro headers
# and boiler plate
sub ternary {
    my ( $self, $type, $ext )= @_;
    $type ||= '';
    $ext = ($ext{$type} || '') . ($ext||"");
    my ($root,$fmt,$arg);
    if ($type eq 'c') {
        $arg= $fmt= 'cp';
    } else {
        $arg= 's';
    }
    if ( $type eq 'c' || $ext !~ /safe/) {
        $root= $self->make_optree( $type, 0, $fmt );
    } else {
        $root= $self->length_optree( $type, $fmt );
    }

    our $parens;
    $parens= qr/ \( (?: (?> [^()]+? ) | (??{$parens}) )+? \) /x;
    my $expr= qr/
        \( \s*
        ($parens)
        \s* \? \s*
        \( \s*
        ($parens)
        \s* \? \s*
        (\d+|$parens)
        \s* : \s*
        (\d+|$parens)
        \s* \)
        \s* : \s*
        \4
        \s* \)
    /x;
    my $code= _optree_to_ternary( $root );
    for ( $code ) {
        s/^\s*//;
        1 while s/\(\s*($parens)\s*\?\s*1\s*:\s*0\s*\)/$1/g
          || s<$expr><(($1 && $2) ? $3 : $4)>g
          || s<\(\s*($parens)\s*\)><$1>g;
    }
    my @args=($arg);
    push @args,'e' if $ext=~/safe/;
    push @args,'is_utf8' if !$type;
    my $args=join ",",@args;
    return "/*** GENERATED CODE ***/\n"
          . _macro "#define is_$self->{op}$ext($args)\n$code";
}

my $path=shift @ARGV;
if (!$path) {
    $path= "regcharclass.h";
    if (!-e $path) { $path="../$path" }
    if (!-e $path) { die "Can't find regcharclass.h to update!\n" };
}

rename $path,"$path.bak";
open my $out_fh,">",$path
    or die "Can't write to '$path':$!";
binmode $out_fh; # want unix line endings even when run on win32.
my ($zero)=$0=~/([^\\\/]+)$/;
print $out_fh <<"HEADER";
/*********************** WARNING WARNING WARNING ************************

Do not modify this code directly: This file was autogenerated by

        Porting/$zero

from data contained within the script.  Change the script instead.

Generated at: @{[ scalar gmtime ]} GMT

************************ WARNING WARNING WARNING ************************/

HEADER

my ($op,$title,@strs,@txt);
my $doit= sub {
    return unless $op;
    my $o= __PACKAGE__->new($title,$op,@strs);
    print $out_fh "/*\n\t$o->{op}: $o->{title}\n\n";
    print $out_fh join "\n",@txt,"*/","";
    for ('', 'U', 'L') {
        print $out_fh $o->ternary( $_ );
        print $out_fh $o->ternary( $_,'_safe' );
    }
    print $out_fh $o->ternary( 'c' );
};
while (<DATA>) {
    next unless /\S/;
    chomp;
    if (/^([A-Z]+)/) {
        $doit->();
        ($op,$title)=split /\s*:\s*/,$_,2;
        @txt=@strs=();
    } else {
        push @txt, "\t$_";
        s/#.*$//;
        if (/^0x/) {
            push @strs,map { chr $_ } eval $_;
        } elsif (/^[""'']/) {
            push @strs,eval $_;
        }
    }
}
$doit->();
print "$path has been updated\n";

__DATA__
LNBREAK: Line Break: \R
"\x0D\x0A"      # CRLF - Network (Windows) line ending
0x0A            # LF  | LINE FEED
0x0B            # VT  | VERTICAL TAB
0x0C            # FF  | FORM FEED
0x0D            # CR  | CARRIAGE RETURN
0x85            # NEL | NEXT LINE
0x2028          # LINE SEPARATOR
0x2029          # PARAGRAPH SEPARATOR

HORIZWS: Horizontal Whitespace: \h \H
0x09            # HT
0x20            # SPACE
0xa0            # NBSP
0x1680          # OGHAM SPACE MARK
0x180e          # MONGOLIAN VOWEL SEPARATOR
0x2000          # EN QUAD
0x2001          # EM QUAD
0x2002          # EN SPACE
0x2003          # EM SPACE
0x2004          # THREE-PER-EM SPACE
0x2005          # FOUR-PER-EM SPACE
0x2006          # SIX-PER-EM SPACE
0x2007          # FIGURE SPACE
0x2008          # PUNCTUATION SPACE
0x2009          # THIN SPACE
0x200A          # HAIR SPACE
0x202f          # NARROW NO-BREAK SPACE
0x205f          # MEDIUM MATHEMATICAL SPACE
0x3000          # IDEOGRAPHIC SPACE

VERTWS: Vertical Whitespace: \v \V
0x0A            # LF
0x0B            # VT
0x0C            # FF
0x0D            # CR
0x85            # NEL
0x2028          # LINE SEPARATOR
0x2029          # PARAGRAPH SEPARATOR
Commit	Line	Data
12b72891	1	package UTF8::Matcher;
	2	use strict;
	3	use warnings;
	4	use Text::Wrap qw(wrap);
	5	use Encode;
	6	use Data::Dumper;
	7
	8	# Author: Yves Orton (demerphq) 2007.
	9
	10	=pod
	11
	12	Dynamically generates macros for detecting special charclasses
	13	in both latin-1, utf8, and codepoint forms.
	14
	15	To regenerate regcharclass.h, run this script from perl-root. No arguments
	16	are necessary.
	17
	18	Each charclass handler is constructed as follows:
	19	Each string the charclass must match is rendered as unicode (codepoints>255),
	20	and if possible as latin1 (codepoints>127), and if possible as "neutral"
	21	(all codepoints<128).
	22
	23	The rendered strings are then inserted into digit-tries by type and length.
	24	With shorter strings being added to tries that are allowed to contain longer
	25	strings, but not vice versa. Thus the "longest" trie contains all strings
	26	for that charclass.
	27
	28	The following types of trie are generated:
	29
	30	n - Neutral only. All strings in this type have codepoints<128
	31	l - Latin1 only. All strings in this type have a codepoint>127 in them
	32	u - UTF8 only. All strings in this type have a codepoint>255 in them
	33	L - Latin1. All strings in 'n' and 'l'
	34	U - UTF8. All string in 'n' and 'u'
	35	c - Codepoint. All strings in U but in codepoint and not utf8 form.
	36
	37	The ternary() routine is responsible for converting the trie data into a
	38	ternary conditional that matches the required set of strings. The generated
	39	macro normally takes at least the argument 's' which is expected to be a
	40	pointer of type C<char > or C<U8 >. The condition generated will be
	41	optimised to match the string as efficiently as possible, with range lookups
	42	being used where possible, and in some situations relying on "true" to be 1.
	43
	44	ternary() takes two optional arguments, $type which is one of the above
	45	characters and $ext which is used to add an extra extension to the macro name.
	46
	47	If $type is omitted or false then the generated macro will take an additional
	48	argument, 'is_utf8'.
	49
	50	If $ext has the string 'safe' in it then the generated macro will take an extra
	51	argument 'e' for the end of the string, and all lookups will be length checked
	52	to prevent lookups past e. If 'safe' is not used then the lookup is assumed to
	53	be guaranteed safe, and no 'e' argument is provided and no length checks are
	54	made during execution.
	55
	56	The 'c' type is different as compared to the rest. Instead of producing
	57	a condition that does octet comparisons of a string array, the 'c' type
	58	produces a macro that takes a single codepoint as an argument (instead of a
	59	char* or U8*) and does the lookup based on only that char, thus it cannot be
	60	used to match multi-codepoint sequences like "\r\n" in the LNBREAK charclass.
	61	This is primarily used for populating charclass bitmaps for codepoints 0..255
	62	but will also match codepoints in the unicode range if necessary.
	63
	64	Using LNBREAK as an example the following macros will be produced:
65
66	=over 4
67
68	=item is_LNBREAK(s,is_utf8)
69
70	=item is_LNBREAK_safe(s,e,is_utf8)
71
72	Do a lookup as apporpriate based on the is_utf8 flag. When possible
73	comparisons involving octect<128 are done before checking the is_utf8
74	flag, hopefully saving time.
75
76	=item is_LNBREAK_utf8(s)
77
78	=item is_LNBREAK_utf8_safe(s,e)
79
80	Do a lookup assuming the string is encoded in (normalized) UTF8.
81
82	=item is_LNBREAK_latin1(s)
83
84	=item is_LNBREAK_latin1_safe(s,e)
85
86	Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
87
88	=item is_LNBREAK_cp(cp)
89
90	Check to see if the string matches a given codepoint (hypotethically a
91	U32). The condition is constructed as as to "break out" as early as
92	possible if the codepoint is out of range of the condition.
93
94	IOW:
95
96	(cp==X \|\| (cp>X && (cp==Y \|\| (cp>Y && ...))))
97
98	Thus if the character is X+1 only two comparisons will be done. Making
99	matching lookups slower, but non-matching faster.
100
101	=back
102
103	=cut
104
105	# store a list of numbers into a hash based trie.
106	sub _trie_store {
107	my $root= shift;
108	foreach my $b ( @_ ) {
109	$root->{$b} \|\|= {};
110	$root= $root->{$b};
111	}
112	$root->{''}++;
113	}
114
115	# Convert a string into its neutral, latin1, utf8 forms, where
116	# the form is undefined unless the string can be completely represented
117	# in that form. The string is then decomposed into the octects representing
118	# it. A list is returned for each. Additional a list of codepoints making
119	# up the string.
120	# returns (\@n,\@u,\@l,\@cp)
121	#
122	sub _uni_latin1 {
123	my $str= shift;
124	my $u= eval { Encode::encode( "utf8", "$str", Encode::FB_CROAK ) };
125	my $l= eval { Encode::encode( "iso-8859-1", "$str", Encode::FB_CROAK ) };
126	my $n= $l;
127	undef $n if defined( $n ) && $str =~ /[^\x00-\x7F]/;
128	return ((map { $_ ? [ unpack "U0C*", $_ ] : $_ } ( $n, $u, $l )),
129	[map { ord $_ } split //,$str]);
130	}
131
132	# store an array ref of char data into the appropriate
133	# type bins, tracking sizes as we go.
134	sub _store {
135	my ( $self, $r, @k )= @_;
136	for my $z ( @k ) {
137	$self->{size}{$z}{ 0 + @$r }++;
138	push @{ $self->{data}{$z} }, $r;
139	}
140	}
141
142	# construct a new charclass constructor object.
143	# $title ends up in the code a as a comment.
144	# $opcode is the name of the operation the charclass implements.
145	# the rest of the arguments are strings that the charclass
146	# can match.
147	sub new {
148	my $class= shift;
149	my $title= shift;
150	my $opcode= shift;
151	my $self= bless { op => $opcode, title => $title }, $class;
152	my %seen;
153	# convert the strings to the numeric equivelents and store
154	# them for later insertion while tracking their sizes.
155	foreach my $seq ( @_ ) {
156	next if $seen{$seq}++;
157	push @{$self->{seq}},$seq;
158	my ( $n, $u, $l,$cp )= _uni_latin1( $seq );
159	if ( $n ) {
160	_store( $self, $n, qw(n U L) );
161	} else {
162	if ( $l ) {
163	_store( $self, $l, qw(l L) );
164	}
165	_store( $self, $u, qw(u U) );
166	}
167	_store($self,$cp,'c');
168	}
169	#
170	# now construct the tries. For each type of data we insert
171	# the data into all the tries of length $size and smaller.
172	#
173
174	my %allsize;
175	foreach my $k ( keys %{ $self->{data} } ) {
176	my @size= sort { $b <=> $a } keys %{ $self->{size}{$k} };
177	$self->{size}{$k}=\@size;
178	undef @allsize{@size};
179	foreach my $d ( @{ $self->{data}{$k} } ) {
180	foreach my $sz ( @size ) {
181	last if $sz < @$d;
182	$self->{trie}{$k}{$sz} \|\|= {};
183	_trie_store( $self->{trie}{$k}{$sz}, @$d );
184	}
185	}
186	#delete $self->{data}{$k};
187	}
188	my @size= sort { $b <=> $a } keys %allsize;
189	$self->{size}{''}= \@size;
190	return $self;
191	}
192
193	#
194	# _cond([$v1,$v2,$v2...],$ofs)
195	#
196	# converts an array of codepoints into a conditional expression
197	# consequtive codepoints are merged into a range test
198	# returns a string containing the conditional expression in the form
199	# '( li[x]==v \|\| li[x]==y )' When possible we also use range lookups.
200
201	sub _cond {
202	my ( $c, $ofs,$fmt )= @_;
203	$fmt\|\|='((U8*)s)[%d]';
204	# cheapo rangification routine.
205	# Convert the first element into a singleton represented
206	# as [$x,$x] and then merge the rest in as we go.
207	my @v= sort { $a <=> $b } @$c;
208	my @r= ( [ ( shift @v ) x 2 ] );
209	for my $n ( @v ) {
210	if ( $n == $r[-1][1] + 1 ) {
211	$r[-1][1]++;
212	} else {
213	push @r, [ $n, $n ];
214	}
215	}
216	@r = map { $_->[0]==$_->[1]-1 ? ([$_->[0],$_->[0]],[$_->[1],$_->[1]]) : $_} @r;
217	# sort the ranges by size and order.
218	@r= sort { $a->[0] <=> $b->[0] } @r;
219	my $alu= sprintf $fmt,$ofs; # C array look up
220
221	if ($fmt=~/%d/) {
222	# map the ranges into conditions
223	@r= map {
224	# singleton
225	$_->[0] == $_->[1] ? "$alu==$_->[0]" :
226	# range
227	"($_->[0]<=$alu && $alu<=$_->[1])"
228	} @r;
229	# return the joined results.
230	return '( ' . join( " \|\| ", @r ) . ' )';
231	} else {
232	return combine($alu,@r);
233	}
234	}
235
236	#
237	# Do the condition in such a way that we break out early if the value
238	# we are looking at is in between two elements in the list.
239	# Currently used only for codepoint macros (depth 1)
240	#
241	sub combine {
242	my $alu=shift;
243	local $_ = shift;
244	my $txt= $_->[0] == $_->[1]
245	? "$alu==$_->[0]"
246	: "($_->[0]<=$alu && $alu<=$_->[1])";
247	return $txt unless @_;
248	return "( $txt \|\| ( $alu > $_->[1] && \n".combine($alu,@_)." ) )";
249	}
250
251	# recursively convert a trie to an optree represented by
252	# [condition,yes,no] where yes and no can be a ref to another optree
253	# or a scalar representing code.
254	# called by make_optree
255
256	sub _trie_to_optree {
257	my ( $node, $ofs, $else, $fmt )= @_;
258	return $else unless $node;
259	$ofs \|\|= 0;
260	if ( $node->{''} ) {
261	$else= $ofs;
262	} else {
263	$else \|\|= 0;
264	}
265	my @k= sort { $b->[1] cmp $a->[1] \|\| $a->[0] <=> $b->[0] }
266	map { [ $_, Dumper( $node->{$_} ), $node->{$_} ] }
267	grep length, keys %$node;
268
269	return $ofs if !@k;
270
271	my ( $root, $expr );
272	while ( @k ) {
273	my @cond= ( $k[0][0] );
274	my $d= $k[0][1];
275	my $r= $k[0][2];
276	shift @k;
277	while ( @k && $k[0][1] eq $d ) {
278	push @cond, $k[0][0];
279	shift @k;
280	}
281	my $op=
282	[ _cond( \@cond, $ofs, $fmt ), _trie_to_optree( $r, $ofs + 1, $else, $fmt ) ];
283	if ( !$root ) {
284	$root= $expr= $op;
285	} else {
286	push @$expr, $op;
287	$expr= $op;
288	}
289	}
290	push @$expr, $else;
291	return $root;
292	}
293
294	# construct the optree for a type.
295	# handles the special logic of type ''.
296	sub make_optree {
297	my ( $self, $type, $size, $fmt )= @_;
298	my $else= 0;
299	$size\|\|=$self->{size}{$type}[0];
300	$size=1 if $type eq 'c';
301	if ( !$type ) {
302	my ( $u, $l );
303	for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) {
304	$u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt );
305	}
306	for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) {
307	$l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt );
308	}
309	if ( $u ) {
310	$else= [ '(is_utf8)', $u, $l \|\| 0 ];
311	} elsif ( $l ) {
312	$else= [ '(!is_utf8)', $l, 0 ];
313	}
314	$type= 'n';
315	$size-- while !$self->{trie}{n}{$size};
316	}
317	return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt );
318	}
319
320	# construct the optree for a type with length checks to prevent buffer
321	# overruns. Only one length check is performed per lookup trading code
322	# size for speed.
323	sub length_optree {
324	my ( $self, $type,$fmt )= @_;
325	$type \|\|= '';
326	return $self->{len_op}{$type} if $self->{len_op}{$type};
327	my @size = @{$self->{size}{$type}};
328
329	my ( $root, $expr );
330	foreach my $size ( @size ) {
331	my $op= [
332	"( (e) - (s) > " . ( $size - 1 ) . " )",
333	$self->make_optree( $type, $size ),
334	];
335	if ( !$root ) {
336	$root= $expr= $op;
337	} else {
338	push @$expr, $op;
339	$expr= $op;
340	}
341	}
342	push @$expr, 0;
343	return $self->{len_op}{$type}= $root ? $root : $expr->[0];
344	}
345
346	#
347	# recursively walk an optree and covert it to a huge nested ternary expression.
348	#
349	sub _optree_to_ternary {
350	my ( $node )= @_;
351	return $node
352	if !ref $node;
353	my $depth = 0;
354	if ( $node->[0] =~ /\[(\d+)\]/ ) {
355	$depth= $1 + 1;
356	}
357	return sprintf "\n%s( %s ? %s : %s )", " " x $depth, $node->[0],
358	_optree_to_ternary( $node->[1] ), _optree_to_ternary( $node->[2] );
359	}
360
361	# add \\ to the end of strings in a reasonable neat way.
362	sub _macro($) {
363	my $str= shift;
364	my @lines= split /[^\S\n]*\n/, $str;
365	return join( "\\\n", map { sprintf "%-76s", $_ } @lines ) . "\n\n";
366	}
367
368	# default type extensions. 'uln' dont have one because normally
369	# they are used only as part of type '' which doesnt get an extension
370	my %ext= (
371	U => '_utf8',
372	L => '_latin1',
373	c => '_cp',
374
375	);
376
377	# produce the ternary, handling arguments and putting on the macro headers
378	# and boiler plate
379	sub ternary {
380	my ( $self, $type, $ext )= @_;
381	$type \|\|= '';
382	$ext = ($ext{$type} \|\| '') . ($ext\|\|"");
383	my ($root,$fmt,$arg);
384	if ($type eq 'c') {
385	$arg= $fmt= 'cp';
386	} else {
387	$arg= 's';
388	}
389	if ( $type eq 'c' \|\| $ext !~ /safe/) {
390	$root= $self->make_optree( $type, 0, $fmt );
391	} else {
392	$root= $self->length_optree( $type, $fmt );
393	}
394
395	our $parens;
396	$parens= qr/ \( (?: (?> [^()]+? ) \| (??{$parens}) )+? \) /x;
397	my $expr= qr/
398	\( \s*
399	($parens)
400	\s* \? \s*
401	\( \s*
402	($parens)
403	\s* \? \s*
404	(\d+\|$parens)
405	\s* : \s*
406	(\d+\|$parens)
407	\s* \)
408	\s* : \s*
409	\4
410	\s* \)
411	/x;
412	my $code= _optree_to_ternary( $root );
413	for ( $code ) {
414	s/^\s*//;
415	1 while s/\(\s($parens)\s\?\s1\s:\s0\s\)/$1/g
416	\|\| s<$expr><(($1 && $2) ? $3 : $4)>g
417	\|\| s<\(\s($parens)\s\)><$1>g;
418	}
419	my @args=($arg);
420	push @args,'e' if $ext=~/safe/;
421	push @args,'is_utf8' if !$type;
422	my $args=join ",",@args;
423	return "/* GENERATED CODE */\n"
424	. _macro "#define is_$self->{op}$ext($args)\n$code";
425	}
426
427	my $path=shift @ARGV;
428	if (!$path) {
429	$path= "regcharclass.h";
430	if (!-e $path) { $path="../$path" }
431	if (!-e $path) { die "Can't find regcharclass.h to update!\n" };
432	}
433
434	rename $path,"$path.bak";
435	open my $out_fh,">",$path
436	or die "Can't write to '$path':$!";
437	binmode $out_fh; # want unix line endings even when run on win32.
438	my ($zero)=$0=~/([^\\\/]+)$/;
439	print $out_fh <<"HEADER";
440	/********************* WARNING WARNING WARNING **********************
441
442	Do not modify this code directly: This file was autogenerated by
443
444	Porting/$zero
445
446	from data contained within the script. Change the script instead.
447
448	Generated at: @{[ scalar gmtime ]} GMT
449
450	********************** WARNING WARNING WARNING **********************/
451
452	HEADER
453
454	my ($op,$title,@strs,@txt);
455	my $doit= sub {
456	return unless $op;
457	my $o= __PACKAGE__->new($title,$op,@strs);
458	print $out_fh "/*\n\t$o->{op}: $o->{title}\n\n";
459	print $out_fh join "\n",@txt,"*/","";
460	for ('', 'U', 'L') {
461	print $out_fh $o->ternary( $_ );
462	print $out_fh $o->ternary( $_,'_safe' );
463	}
464	print $out_fh $o->ternary( 'c' );
465	};
466	while (<DATA>) {
467	next unless /\S/;
468	chomp;
469	if (/^([A-Z]+)/) {
470	$doit->();
471	($op,$title)=split /\s:\s/,$_,2;
472	@txt=@strs=();
473	} else {
474	push @txt, "\t$_";
475	s/#.*$//;
476	if (/^0x/) {
477	push @strs,map { chr $_ } eval $_;
478	} elsif (/^[""'']/) {
479	push @strs,eval $_;
480	}
481	}
482	}
483	$doit->();
484	print "$path has been updated\n";
485
486	__DATA__
487	LNBREAK: Line Break: \R
488	"\x0D\x0A" # CRLF - Network (Windows) line ending
489	0x0A # LF \| LINE FEED
490	0x0B # VT \| VERTICAL TAB
491	0x0C # FF \| FORM FEED
492	0x0D # CR \| CARRIAGE RETURN
493	0x85 # NEL \| NEXT LINE
494	0x2028 # LINE SEPARATOR
495	0x2029 # PARAGRAPH SEPARATOR
496
497	HORIZWS: Horizontal Whitespace: \h \H
498	0x09 # HT
499	0x20 # SPACE
500	0xa0 # NBSP
501	0x1680 # OGHAM SPACE MARK
502	0x180e # MONGOLIAN VOWEL SEPARATOR
503	0x2000 # EN QUAD
504	0x2001 # EM QUAD
505	0x2002 # EN SPACE
506	0x2003 # EM SPACE
507	0x2004 # THREE-PER-EM SPACE
508	0x2005 # FOUR-PER-EM SPACE
509	0x2006 # SIX-PER-EM SPACE
510	0x2007 # FIGURE SPACE
511	0x2008 # PUNCTUATION SPACE
512	0x2009 # THIN SPACE
513	0x200A # HAIR SPACE
514	0x202f # NARROW NO-BREAK SPACE
515	0x205f # MEDIUM MATHEMATICAL SPACE
516	0x3000 # IDEOGRAPHIC SPACE
517
518	VERTWS: Vertical Whitespace: \v \V
519	0x0A # LF
520	0x0B # VT
521	0x0C # FF
522	0x0D # CR
523	0x85 # NEL
524	0x2028 # LINE SEPARATOR
525	0x2029 # PARAGRAPH SEPARATOR
526