[p5sagit/p5-mst-13.2.git] / Porting / regcharclass.pl

package UTF8::Matcher;
use strict;
use warnings;
use Text::Wrap qw(wrap);
use Encode;
use Data::Dumper;

our $hex_fmt="0x%02X";

# Author: Yves Orton (demerphq) 2007.

=pod

Dynamically generates macros for detecting special charclasses
in both latin-1, utf8, and codepoint forms.

To regenerate regcharclass.h, run this script from perl-root. No arguments
are necessary.

Each charclass handler is constructed as follows:
Each string the charclass must match  is rendered as unicode (codepoints>255),
and if possible  as latin1 (codepoints>127), and if possible as "neutral"
(all codepoints<128).

The rendered strings are then inserted into digit-tries by type and length.
With shorter strings being added to tries that are allowed to contain longer
strings, but not vice versa.  Thus the "longest" trie contains all strings
for that charclass.

The following types of trie are generated:

  n - Neutral only. All strings in this type have codepoints<128
  l - Latin1 only. All strings in this type have a codepoint>127 in them
  u - UTF8 only.   All strings in this type have a codepoint>255 in them
  L - Latin1. All strings in 'n' and 'l'
  U - UTF8.   All string in 'n' and 'u'
  c - Codepoint. All strings in U but in codepoint and not utf8 form.

The ternary() routine is responsible for converting the trie data into a
ternary conditional that matches the required set of strings. The generated
macro normally takes at least the argument 's' which is expected to be a
pointer of type C<char *> or C<U8 *>. The condition generated will be
optimised to match the string as efficiently as possible, with range lookups
being used where possible, and in some situations relying on "true" to be 1.

ternary() takes two optional arguments, $type which is one of the above
characters and $ext which is used to add an extra extension to the macro name.

If $type is omitted or false then the generated macro will take an additional
argument, 'is_utf8'.

If $ext has the string 'safe' in it then the generated macro will take an extra
argument 'e' for the end of the string, and all lookups will be length checked
to prevent lookups past e. If 'safe' is not used then the lookup is assumed to
be guaranteed safe, and no 'e' argument is provided  and no length checks are
made during execution.

The 'c' type is different as compared to the rest. Instead of producing
a condition that does octet comparisons of a string array, the 'c' type
produces a macro that takes a single codepoint as an argument (instead of a
char* or U8*) and does the lookup based on only that char, thus it cannot be
used to match multi-codepoint sequences like "\r\n" in the LNBREAK charclass.
This is primarily used for populating charclass bitmaps for codepoints 0..255
but will also match codepoints in the unicode range if necessary.

Using LNBREAK as an example the following macros will be produced:

=over 4

=item is_LNBREAK(s,is_utf8)

=item is_LNBREAK_safe(s,e,is_utf8)

Do a lookup as apporpriate based on the is_utf8 flag. When possible
comparisons involving octect<128 are done before checking the is_utf8
flag, hopefully saving time.

=item is_LNBREAK_utf8(s)

=item is_LNBREAK_utf8_safe(s,e)

Do a lookup assuming the string is encoded in (normalized) UTF8.

=item is_LNBREAK_latin1(s)

=item is_LNBREAK_latin1_safe(s,e)

Do a lookup assuming the string is encoded in latin-1 (aka plan octets).

=item is_LNBREAK_cp(cp)

Check to see if the string matches a given codepoint (hypotethically a
U32). The condition is constructed as as to "break out" as early as
possible if the codepoint is out of range of the condition.

IOW:

  (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))

Thus if the character is X+1 only two comparisons will be done. Making
matching lookups slower, but non-matching faster.

=back

=cut

# store a list of numbers into a hash based trie.
sub _trie_store {
    my $root= shift;
    foreach my $b ( @_ ) {
        $root->{$b} ||= {};
        $root= $root->{$b};
    }
    $root->{''}++;
}

# Convert a string into its neutral, latin1, utf8 forms, where
# the form is undefined unless the string can be completely represented
# in that form. The string is then decomposed into the octects representing
# it. A list is returned for each. Additional a list of codepoints making
# up the string.
# returns (\@n,\@u,\@l,\@cp)
#
sub _uni_latin1 {
    my $str= shift;
    my $u= eval { Encode::encode( "utf8",       "$str", Encode::FB_CROAK ) };
    my $l= eval { Encode::encode( "iso-8859-1", "$str", Encode::FB_CROAK ) };
    my $n= $l;
    undef $n if defined( $n ) && $str =~ /[^\x00-\x7F]/;
    return ((map { $_ ? [ unpack "U0C*", $_ ] : $_ } ( $n, $u, $l )),
            [map { ord $_ } split //,$str]);
}

# store an array ref of char data into the appropriate
# type bins, tracking sizes as we go.
sub _store {
    my ( $self, $r, @k )= @_;
    for my $z ( @k ) {
        $self->{size}{$z}{ 0 + @$r }++;
        push @{ $self->{data}{$z} }, $r;
    }
}

# construct a new charclass constructor object.
# $title ends up in the code a as a comment.
# $opcode is the name of the operation the charclass implements.
# the rest of the arguments are strings that the charclass
# can match.
sub new {
    my $class= shift;
    my $title= shift;
    my $opcode= shift;
    my $self= bless { op => $opcode, title => $title }, $class;
    my %seen;
    # convert the strings to the numeric equivelents and store
    # them for later insertion while tracking their sizes.
    foreach my $seq ( @_ ) {
        next if $seen{$seq}++;
        push @{$self->{seq}},$seq;
        my ( $n, $u, $l,$cp )= _uni_latin1( $seq );
        if ( $n ) {
            _store( $self, $n, qw(n U L) );
        } else {
            if ( $l ) {
                _store( $self, $l, qw(l L) );
            }
            _store( $self, $u, qw(u U) );
        }
        _store($self,$cp,'c');
    }
    #
    # now construct the tries. For each type of data we insert
    # the data into all the tries of length $size and smaller.
    #

    my %allsize;
    foreach my $k ( keys %{ $self->{data} } ) {
        my @size= sort { $b <=> $a } keys %{ $self->{size}{$k} };
        $self->{size}{$k}=\@size;
        undef @allsize{@size};
        foreach my $d ( @{ $self->{data}{$k} } ) {
            foreach my $sz ( @size ) {
                last if $sz < @$d;
                $self->{trie}{$k}{$sz} ||= {};
                _trie_store( $self->{trie}{$k}{$sz}, @$d );
            }
        }
        #delete $self->{data}{$k};
    }
    my @size= sort { $b <=> $a } keys %allsize;
    $self->{size}{''}= \@size;
    return $self;
}

#
# _cond([$v1,$v2,$v2...],$ofs)
#
# converts an array of codepoints into a conditional expression
# consequtive codepoints are merged into a range test
# returns a string containing the conditional expression in the form
# '( li[x]==v || li[x]==y )' When possible we also use range lookups.

sub _cond {
    my ( $c, $ofs,$fmt )= @_;
    $fmt||='((U8*)s)[%d]';
    # cheapo rangification routine.
    # Convert the first element into a singleton represented
    # as [$x,$x] and then merge the rest in as we go.
    my @v= sort { $a <=> $b } @$c;
    my @r= ( [ ( shift @v ) x 2 ] );
    for my $n ( @v ) {
        if ( $n == $r[-1][1] + 1 ) {
            $r[-1][1]++;
        } else {
            push @r, [ $n, $n ];
        }
    }
    @r = map { $_->[0]==$_->[1]-1 ? ([$_->[0],$_->[0]],[$_->[1],$_->[1]]) : $_} @r;
    # sort the ranges by size and order.
    @r= sort { $a->[0] <=> $b->[0] }  @r;
    my $alu= sprintf $fmt,$ofs;    # C array look up

    if ($fmt=~/%d/) {
        # map the ranges into conditions
        @r= map {
            # singleton
            $_->[0] == $_->[1] ? sprintf("$alu == $hex_fmt",$_->[0]) :
            # range
            sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_)
        } @r;
        # return the joined results.
        return '( ' . join( " || ", @r ) . ' )';
    } else {
        return combine($alu,@r);
    }
}

#
# Do the condition in such a way that we break out early if the value
# we are looking at is in between two elements in the list.
# Currently used only for codepoint macros (depth 1)
#
sub combine {
    my $alu=shift;
    local $_ = shift;
    my $txt= $_->[0] == $_->[1]
           ? sprintf("$alu == $hex_fmt",$_->[0])
           : sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_);
    return $txt unless @_;
    return "( $txt || ( $alu > $_->[1] && \n".combine($alu,@_)." ) )";
}

# recursively convert a trie to an optree represented by
# [condition,yes,no] where  yes and no can be a ref to another optree
# or a scalar representing code.
# called by make_optree

sub _trie_to_optree {
    my ( $node, $ofs, $else, $fmt )= @_;
    return $else unless $node;
    $ofs ||= 0;
    if ( $node->{''} ) {
        $else= $ofs;
    } else {
        $else ||= 0;
    }
    my @k= sort { $b->[1] cmp $a->[1] || $a->[0] <=> $b->[0] }
      map { [ $_, Dumper( $node->{$_} ), $node->{$_} ] }
      grep length, keys %$node;

    return $ofs if !@k;

    my ( $root, $expr );
    while ( @k ) {
        my @cond= ( $k[0][0] );
        my $d= $k[0][1];
        my $r= $k[0][2];
        shift @k;
        while ( @k && $k[0][1] eq $d ) {
            push @cond, $k[0][0];
            shift @k;
        }
        my $op=
          [ _cond( \@cond, $ofs, $fmt ), _trie_to_optree( $r, $ofs + 1, $else, $fmt ) ];
        if ( !$root ) {
            $root= $expr= $op;
        } else {
            push @$expr, $op;
            $expr= $op;
        }
    }
    push @$expr, $else;
    return $root;
}

# construct the optree for a type.
# handles the special logic of type ''.
sub make_optree {
    my ( $self, $type, $size, $fmt )= @_;
    my $else= 0;
    $size||=$self->{size}{$type}[0];
    $size=1 if $type eq 'c';
    if ( !$type ) {
        my ( $u, $l );
        for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) {
            $u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt );
        }
        for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) {
            $l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt );
        }
        if ( $u ) {
            $else= [ '(is_utf8)', $u, $l || 0 ];
        } elsif ( $l ) {
            $else= [ '(!is_utf8)', $l, 0 ];
        }
        $type= 'n';
        $size-- while !$self->{trie}{n}{$size};
    }
    return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt );
}

# construct the optree for a type with length checks to prevent buffer
# overruns. Only one length check is performed per lookup trading code
# size for speed.
sub length_optree {
    my ( $self, $type,$fmt )= @_;
    $type ||= '';
    return $self->{len_op}{$type} if $self->{len_op}{$type};
    my @size = @{$self->{size}{$type}};

    my ( $root, $expr );
    foreach my $size ( @size ) {
        my $op= [
            "( (e) - (s) > " . ( $size - 1 ) . " )",
            $self->make_optree( $type, $size ),
        ];
        if ( !$root ) {
            $root= $expr= $op;
        } else {
            push @$expr, $op;
            $expr= $op;
        }
    }
    push @$expr, 0;
    return $self->{len_op}{$type}= $root ? $root : $expr->[0];
}

#
# recursively walk an optree and covert it to a huge nested ternary expression.
#
sub _optree_to_ternary {
    my ( $node )= @_;
    return $node
      if !ref $node;
    my $depth = 0;
    if ( $node->[0] =~ /\[(\d+)\]/ ) {
        $depth= $1 + 1;
    }
    return sprintf "\n%s( %s ? %s : %s )", "  " x $depth, $node->[0],
      _optree_to_ternary( $node->[1] ), _optree_to_ternary( $node->[2] );
}

# add \\ to the end of strings in a reasonable neat way.
sub _macro($) {
    my $str= shift;
    my @lines= split /[^\S\n]*\n/, $str;
    my $macro = join( "\\\n", map { sprintf "%-76s", $_ } @lines );
    $macro =~ s/  *$//;
    return $macro . "\n\n";
}

# default type extensions. 'uln' dont have one because normally
# they are used only as part of type '' which doesnt get an extension
my %ext= (
    U => '_utf8',
    L => '_latin1',
    c => '_cp',

);

# produce the ternary, handling arguments and putting on the macro headers
# and boiler plate
sub ternary {
    my ( $self, $type, $ext )= @_;
    $type ||= '';
    $ext = ($ext{$type} || '') . ($ext||"");
    my ($root,$fmt,$arg);
    if ($type eq 'c') {
        $arg= $fmt= 'cp';
    } else {
        $arg= 's';
    }
    if ( $type eq 'c' || $ext !~ /safe/) {
        $root= $self->make_optree( $type, 0, $fmt );
    } else {
        $root= $self->length_optree( $type, $fmt );
    }

    our $parens;
    $parens= qr/ \( (?: (?> [^()]+? ) | (??{$parens}) )+? \) /x;
    my $expr= qr/
        \( \s*
        ($parens)
        \s* \? \s*
        \( \s*
        ($parens)
        \s* \? \s*
        (\d+|$parens)
        \s* : \s*
        (\d+|$parens)
        \s* \)
        \s* : \s*
        \4
        \s* \)
    /x;
    my $code= _optree_to_ternary( $root );
    for ( $code ) {
        s/^\s*//;
        1 while s/\(\s*($parens)\s*\?\s*1\s*:\s*0\s*\)/$1/g
          || s<$expr><(($1 && $2) ? $3 : $4)>g
          || s<\(\s*($parens)\s*\)><$1>g;
    }
    my @args=($arg);
    push @args,'e' if $ext=~/safe/;
    push @args,'is_utf8' if !$type;
    my $args=join ",",@args;
    return "/*** GENERATED CODE ***/\n"
          . _macro "#define is_$self->{op}$ext($args)\n$code";
}

my $path=shift @ARGV;
if (!$path) {
    $path= "regcharclass.h";
    if (!-e $path) { $path="../$path" }
    if (!-e $path) { die "Can't find regcharclass.h to update!\n" };
}

rename $path,"$path.bak";
open my $out_fh,">",$path
    or die "Can't write to '$path':$!";
binmode $out_fh; # want unix line endings even when run on win32.
my ($zero) = $0=~/([^\\\/]+)$/;
print $out_fh <<"HEADER";
/*  -*- buffer-read-only: t -*-
 *
 *    regcharclass.h
 *
 *    Copyright (C) 2007, by Larry Wall and others
 *
 *    You may distribute under the terms of either the GNU General Public
 *    License or the Artistic License, as specified in the README file.
 *
 * !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 * This file is built by Porting/$zero.
 * (Generated at: @{[ scalar gmtime ]} GMT)
 * Any changes made here will be lost!
 */

HEADER

my ($op,$title,@strs,@txt);
my $doit= sub {
    return unless $op;
    my $o= __PACKAGE__->new($title,$op,@strs);
    print $out_fh "/*\n\t$o->{op}: $o->{title}\n\n";
    print $out_fh join "\n",@txt,"*/","";
    for ('', 'U', 'L') {
        print $out_fh $o->ternary( $_ );
        print $out_fh $o->ternary( $_,'_safe' );
    }
    print $out_fh $o->ternary( 'c' );
};
while (<DATA>) {
    next unless /\S/;
    chomp;
    if (/^([A-Z]+)/) {
        $doit->();
        ($op,$title)=split /\s*:\s*/,$_,2;
        @txt=@strs=();
    } else {
        push @txt, "\t$_";
        s/#.*$//;
        if (/^0x/) {
            push @strs,map { chr $_ } eval $_;
        } elsif (/^[""'']/) {
            push @strs,eval $_;
        }
    }
}
$doit->();
print $out_fh "/* ex: set ro: */\n";
print "$path has been updated\n";

__DATA__
LNBREAK: Line Break: \R
"\x0D\x0A"      # CRLF - Network (Windows) line ending
0x0A            # LF  | LINE FEED
0x0B            # VT  | VERTICAL TAB
0x0C            # FF  | FORM FEED
0x0D            # CR  | CARRIAGE RETURN
0x85            # NEL | NEXT LINE
0x2028          # LINE SEPARATOR
0x2029          # PARAGRAPH SEPARATOR

HORIZWS: Horizontal Whitespace: \h \H
0x09            # HT
0x20            # SPACE
0xa0            # NBSP
0x1680          # OGHAM SPACE MARK
0x180e          # MONGOLIAN VOWEL SEPARATOR
0x2000          # EN QUAD
0x2001          # EM QUAD
0x2002          # EN SPACE
0x2003          # EM SPACE
0x2004          # THREE-PER-EM SPACE
0x2005          # FOUR-PER-EM SPACE
0x2006          # SIX-PER-EM SPACE
0x2007          # FIGURE SPACE
0x2008          # PUNCTUATION SPACE
0x2009          # THIN SPACE
0x200A          # HAIR SPACE
0x202f          # NARROW NO-BREAK SPACE
0x205f          # MEDIUM MATHEMATICAL SPACE
0x3000          # IDEOGRAPHIC SPACE

VERTWS: Vertical Whitespace: \v \V
0x0A            # LF
0x0B            # VT
0x0C            # FF
0x0D            # CR
0x85            # NEL
0x2028          # LINE SEPARATOR
0x2029          # PARAGRAPH SEPARATOR
Commit	Line	Data
12b72891	1	package UTF8::Matcher;
	2	use strict;
	3	use warnings;
	4	use Text::Wrap qw(wrap);
	5	use Encode;
	6	use Data::Dumper;
	7
0ccab2bc	8	our $hex_fmt="0x%02X";
0ccab2bc	9
12b72891	10	# Author: Yves Orton (demerphq) 2007.
	11
	12	=pod
	13
	14	Dynamically generates macros for detecting special charclasses
	15	in both latin-1, utf8, and codepoint forms.
	16
	17	To regenerate regcharclass.h, run this script from perl-root. No arguments
	18	are necessary.
	19
	20	Each charclass handler is constructed as follows:
	21	Each string the charclass must match is rendered as unicode (codepoints>255),
	22	and if possible as latin1 (codepoints>127), and if possible as "neutral"
	23	(all codepoints<128).
	24
	25	The rendered strings are then inserted into digit-tries by type and length.
	26	With shorter strings being added to tries that are allowed to contain longer
	27	strings, but not vice versa. Thus the "longest" trie contains all strings
	28	for that charclass.
	29
	30	The following types of trie are generated:
	31
	32	n - Neutral only. All strings in this type have codepoints<128
	33	l - Latin1 only. All strings in this type have a codepoint>127 in them
	34	u - UTF8 only. All strings in this type have a codepoint>255 in them
	35	L - Latin1. All strings in 'n' and 'l'
	36	U - UTF8. All string in 'n' and 'u'
	37	c - Codepoint. All strings in U but in codepoint and not utf8 form.
	38
	39	The ternary() routine is responsible for converting the trie data into a
	40	ternary conditional that matches the required set of strings. The generated
	41	macro normally takes at least the argument 's' which is expected to be a
	42	pointer of type C<char > or C<U8 >. The condition generated will be
	43	optimised to match the string as efficiently as possible, with range lookups
	44	being used where possible, and in some situations relying on "true" to be 1.
	45
	46	ternary() takes two optional arguments, $type which is one of the above
	47	characters and $ext which is used to add an extra extension to the macro name.
	48
	49	If $type is omitted or false then the generated macro will take an additional
	50	argument, 'is_utf8'.
	51
	52	If $ext has the string 'safe' in it then the generated macro will take an extra
	53	argument 'e' for the end of the string, and all lookups will be length checked
	54	to prevent lookups past e. If 'safe' is not used then the lookup is assumed to
	55	be guaranteed safe, and no 'e' argument is provided and no length checks are
	56	made during execution.
	57
	58	The 'c' type is different as compared to the rest. Instead of producing
	59	a condition that does octet comparisons of a string array, the 'c' type
	60	produces a macro that takes a single codepoint as an argument (instead of a
	61	char* or U8*) and does the lookup based on only that char, thus it cannot be
	62	used to match multi-codepoint sequences like "\r\n" in the LNBREAK charclass.
	63	This is primarily used for populating charclass bitmaps for codepoints 0..255
	64	but will also match codepoints in the unicode range if necessary.
	65
	66	Using LNBREAK as an example the following macros will be produced:
	67
	68	=over 4
	69
	70	=item is_LNBREAK(s,is_utf8)
	71
	72	=item is_LNBREAK_safe(s,e,is_utf8)
	73
74	Do a lookup as apporpriate based on the is_utf8 flag. When possible
75	comparisons involving octect<128 are done before checking the is_utf8
76	flag, hopefully saving time.
77
78	=item is_LNBREAK_utf8(s)
79
80	=item is_LNBREAK_utf8_safe(s,e)
81
82	Do a lookup assuming the string is encoded in (normalized) UTF8.
83
84	=item is_LNBREAK_latin1(s)
85
86	=item is_LNBREAK_latin1_safe(s,e)
87
88	Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
89
90	=item is_LNBREAK_cp(cp)
91
92	Check to see if the string matches a given codepoint (hypotethically a
93	U32). The condition is constructed as as to "break out" as early as
94	possible if the codepoint is out of range of the condition.
95
96	IOW:
97
98	(cp==X \|\| (cp>X && (cp==Y \|\| (cp>Y && ...))))
99
100	Thus if the character is X+1 only two comparisons will be done. Making
101	matching lookups slower, but non-matching faster.
102
103	=back
104
105	=cut
106
107	# store a list of numbers into a hash based trie.
108	sub _trie_store {
109	my $root= shift;
110	foreach my $b ( @_ ) {
111	$root->{$b} \|\|= {};
112	$root= $root->{$b};
113	}
114	$root->{''}++;
115	}
116
117	# Convert a string into its neutral, latin1, utf8 forms, where
118	# the form is undefined unless the string can be completely represented
119	# in that form. The string is then decomposed into the octects representing
120	# it. A list is returned for each. Additional a list of codepoints making
121	# up the string.
122	# returns (\@n,\@u,\@l,\@cp)
123	#
124	sub _uni_latin1 {
125	my $str= shift;
126	my $u= eval { Encode::encode( "utf8", "$str", Encode::FB_CROAK ) };
127	my $l= eval { Encode::encode( "iso-8859-1", "$str", Encode::FB_CROAK ) };
128	my $n= $l;
129	undef $n if defined( $n ) && $str =~ /[^\x00-\x7F]/;
130	return ((map { $_ ? [ unpack "U0C*", $_ ] : $_ } ( $n, $u, $l )),
131	[map { ord $_ } split //,$str]);
132	}
133
134	# store an array ref of char data into the appropriate
135	# type bins, tracking sizes as we go.
136	sub _store {
137	my ( $self, $r, @k )= @_;
138	for my $z ( @k ) {
139	$self->{size}{$z}{ 0 + @$r }++;
140	push @{ $self->{data}{$z} }, $r;
141	}
142	}
143
144	# construct a new charclass constructor object.
145	# $title ends up in the code a as a comment.
146	# $opcode is the name of the operation the charclass implements.
147	# the rest of the arguments are strings that the charclass
148	# can match.
149	sub new {
150	my $class= shift;
151	my $title= shift;
152	my $opcode= shift;
153	my $self= bless { op => $opcode, title => $title }, $class;
154	my %seen;
155	# convert the strings to the numeric equivelents and store
156	# them for later insertion while tracking their sizes.
157	foreach my $seq ( @_ ) {
158	next if $seen{$seq}++;
159	push @{$self->{seq}},$seq;
160	my ( $n, $u, $l,$cp )= _uni_latin1( $seq );
161	if ( $n ) {
162	_store( $self, $n, qw(n U L) );
163	} else {
164	if ( $l ) {
165	_store( $self, $l, qw(l L) );
166	}
167	_store( $self, $u, qw(u U) );
168	}
169	_store($self,$cp,'c');
170	}
171	#
172	# now construct the tries. For each type of data we insert
173	# the data into all the tries of length $size and smaller.
174	#
175
176	my %allsize;
177	foreach my $k ( keys %{ $self->{data} } ) {
178	my @size= sort { $b <=> $a } keys %{ $self->{size}{$k} };
179	$self->{size}{$k}=\@size;
180	undef @allsize{@size};
181	foreach my $d ( @{ $self->{data}{$k} } ) {
182	foreach my $sz ( @size ) {
183	last if $sz < @$d;
184	$self->{trie}{$k}{$sz} \|\|= {};
185	_trie_store( $self->{trie}{$k}{$sz}, @$d );
186	}
187	}
188	#delete $self->{data}{$k};
189	}
190	my @size= sort { $b <=> $a } keys %allsize;
191	$self->{size}{''}= \@size;
192	return $self;
193	}
194
195	#
196	# _cond([$v1,$v2,$v2...],$ofs)
197	#
198	# converts an array of codepoints into a conditional expression
199	# consequtive codepoints are merged into a range test
200	# returns a string containing the conditional expression in the form
201	# '( li[x]==v \|\| li[x]==y )' When possible we also use range lookups.
202
203	sub _cond {
204	my ( $c, $ofs,$fmt )= @_;
205	$fmt\|\|='((U8*)s)[%d]';
206	# cheapo rangification routine.
207	# Convert the first element into a singleton represented
208	# as [$x,$x] and then merge the rest in as we go.
209	my @v= sort { $a <=> $b } @$c;
210	my @r= ( [ ( shift @v ) x 2 ] );
211	for my $n ( @v ) {
212	if ( $n == $r[-1][1] + 1 ) {
213	$r[-1][1]++;
214	} else {
215	push @r, [ $n, $n ];
216	}
217	}
218	@r = map { $_->[0]==$_->[1]-1 ? ([$_->[0],$_->[0]],[$_->[1],$_->[1]]) : $_} @r;
219	# sort the ranges by size and order.
220	@r= sort { $a->[0] <=> $b->[0] } @r;
221	my $alu= sprintf $fmt,$ofs; # C array look up
222
223	if ($fmt=~/%d/) {
224	# map the ranges into conditions
225	@r= map {
226	# singleton
0ccab2bc	227	$_->[0] == $_->[1] ? sprintf("$alu == $hex_fmt",$_->[0]) :
12b72891	228	# range
0ccab2bc	229	sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_)
12b72891	230	} @r;
	231	# return the joined results.
	232	return '( ' . join( " \|\| ", @r ) . ' )';
	233	} else {
	234	return combine($alu,@r);
	235	}
	236	}
	237
	238	#
	239	# Do the condition in such a way that we break out early if the value
	240	# we are looking at is in between two elements in the list.
	241	# Currently used only for codepoint macros (depth 1)
	242	#
	243	sub combine {
	244	my $alu=shift;
	245	local $_ = shift;
	246	my $txt= $_->[0] == $_->[1]
0ccab2bc	247	? sprintf("$alu == $hex_fmt",$_->[0])
0ccab2bc	248	: sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_);
12b72891	249	return $txt unless @_;
	250	return "( $txt \|\| ( $alu > $_->[1] && \n".combine($alu,@_)." ) )";
	251	}
	252
	253	# recursively convert a trie to an optree represented by
	254	# [condition,yes,no] where yes and no can be a ref to another optree
	255	# or a scalar representing code.
	256	# called by make_optree
	257
	258	sub _trie_to_optree {
	259	my ( $node, $ofs, $else, $fmt )= @_;
	260	return $else unless $node;
	261	$ofs \|\|= 0;
	262	if ( $node->{''} ) {
	263	$else= $ofs;
	264	} else {
	265	$else \|\|= 0;
	266	}
	267	my @k= sort { $b->[1] cmp $a->[1] \|\| $a->[0] <=> $b->[0] }
	268	map { [ $_, Dumper( $node->{$_} ), $node->{$_} ] }
	269	grep length, keys %$node;
	270
	271	return $ofs if !@k;
	272
	273	my ( $root, $expr );
	274	while ( @k ) {
	275	my @cond= ( $k[0][0] );
	276	my $d= $k[0][1];
	277	my $r= $k[0][2];
	278	shift @k;
	279	while ( @k && $k[0][1] eq $d ) {
	280	push @cond, $k[0][0];
	281	shift @k;
	282	}
	283	my $op=
	284	[ _cond( \@cond, $ofs, $fmt ), _trie_to_optree( $r, $ofs + 1, $else, $fmt ) ];
	285	if ( !$root ) {
	286	$root= $expr= $op;
	287	} else {
	288	push @$expr, $op;
	289	$expr= $op;
	290	}
	291	}
	292	push @$expr, $else;
	293	return $root;
	294	}
	295
	296	# construct the optree for a type.
	297	# handles the special logic of type ''.
	298	sub make_optree {
	299	my ( $self, $type, $size, $fmt )= @_;
	300	my $else= 0;
	301	$size\|\|=$self->{size}{$type}[0];
	302	$size=1 if $type eq 'c';
	303	if ( !$type ) {
	304	my ( $u, $l );
	305	for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) {
	306	$u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt );
	307	}
	308	for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) {
	309	$l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt );
	310	}
	311	if ( $u ) {
	312	$else= [ '(is_utf8)', $u, $l \|\| 0 ];
313	} elsif ( $l ) {
314	$else= [ '(!is_utf8)', $l, 0 ];
315	}
316	$type= 'n';
317	$size-- while !$self->{trie}{n}{$size};
318	}
319	return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt );
320	}
321
322	# construct the optree for a type with length checks to prevent buffer
323	# overruns. Only one length check is performed per lookup trading code
324	# size for speed.
325	sub length_optree {
326	my ( $self, $type,$fmt )= @_;
327	$type \|\|= '';
328	return $self->{len_op}{$type} if $self->{len_op}{$type};
329	my @size = @{$self->{size}{$type}};
330
331	my ( $root, $expr );
332	foreach my $size ( @size ) {
333	my $op= [
334	"( (e) - (s) > " . ( $size - 1 ) . " )",
335	$self->make_optree( $type, $size ),
336	];
337	if ( !$root ) {
338	$root= $expr= $op;
339	} else {
340	push @$expr, $op;
341	$expr= $op;
342	}
343	}
344	push @$expr, 0;
345	return $self->{len_op}{$type}= $root ? $root : $expr->[0];
346	}
347
348	#
349	# recursively walk an optree and covert it to a huge nested ternary expression.
350	#
351	sub _optree_to_ternary {
352	my ( $node )= @_;
353	return $node
354	if !ref $node;
355	my $depth = 0;
356	if ( $node->[0] =~ /\[(\d+)\]/ ) {
357	$depth= $1 + 1;
358	}
359	return sprintf "\n%s( %s ? %s : %s )", " " x $depth, $node->[0],
360	_optree_to_ternary( $node->[1] ), _optree_to_ternary( $node->[2] );
361	}
362
363	# add \\ to the end of strings in a reasonable neat way.
364	sub _macro($) {
365	my $str= shift;
366	my @lines= split /[^\S\n]*\n/, $str;
0ccab2bc	367	my $macro = join( "\\\n", map { sprintf "%-76s", $_ } @lines );
	368	$macro =~ s/ *$//;
	369	return $macro . "\n\n";
12b72891	370	}
	371
	372	# default type extensions. 'uln' dont have one because normally
	373	# they are used only as part of type '' which doesnt get an extension
	374	my %ext= (
	375	U => '_utf8',
	376	L => '_latin1',
	377	c => '_cp',
	378
	379	);
	380
	381	# produce the ternary, handling arguments and putting on the macro headers
	382	# and boiler plate
	383	sub ternary {
	384	my ( $self, $type, $ext )= @_;
	385	$type \|\|= '';
	386	$ext = ($ext{$type} \|\| '') . ($ext\|\|"");
	387	my ($root,$fmt,$arg);
	388	if ($type eq 'c') {
	389	$arg= $fmt= 'cp';
	390	} else {
	391	$arg= 's';
	392	}
	393	if ( $type eq 'c' \|\| $ext !~ /safe/) {
	394	$root= $self->make_optree( $type, 0, $fmt );
	395	} else {
	396	$root= $self->length_optree( $type, $fmt );
	397	}
	398
	399	our $parens;
	400	$parens= qr/ \( (?: (?> [^()]+? ) \| (??{$parens}) )+? \) /x;
	401	my $expr= qr/
	402	\( \s*
	403	($parens)
	404	\s* \? \s*
	405	\( \s*
	406	($parens)
	407	\s* \? \s*
	408	(\d+\|$parens)
	409	\s* : \s*
	410	(\d+\|$parens)
	411	\s* \)
	412	\s* : \s*
	413	\4
	414	\s* \)
	415	/x;
	416	my $code= _optree_to_ternary( $root );
	417	for ( $code ) {
	418	s/^\s*//;
	419	1 while s/\(\s($parens)\s\?\s1\s:\s0\s\)/$1/g
	420	\|\| s<$expr><(($1 && $2) ? $3 : $4)>g
	421	\|\| s<\(\s($parens)\s\)><$1>g;
	422	}
	423	my @args=($arg);
	424	push @args,'e' if $ext=~/safe/;
	425	push @args,'is_utf8' if !$type;
	426	my $args=join ",",@args;
	427	return "/* GENERATED CODE */\n"
	428	. _macro "#define is_$self->{op}$ext($args)\n$code";
	429	}
	430
	431	my $path=shift @ARGV;
	432	if (!$path) {
	433	$path= "regcharclass.h";
434	if (!-e $path) { $path="../$path" }
435	if (!-e $path) { die "Can't find regcharclass.h to update!\n" };
436	}
437
438	rename $path,"$path.bak";
439	open my $out_fh,">",$path
440	or die "Can't write to '$path':$!";
441	binmode $out_fh; # want unix line endings even when run on win32.
58fbde93	442	my ($zero) = $0=~/([^\\\/]+)$/;
12b72891	443	print $out_fh <<"HEADER";
58fbde93	444	/* -- buffer-read-only: t --
	445	*
	446	* regcharclass.h
	447	*
	448	* Copyright (C) 2007, by Larry Wall and others
	449	*
	450	* You may distribute under the terms of either the GNU General Public
	451	* License or the Artistic License, as specified in the README file.
	452	*
	453	* !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
	454	* This file is built by Porting/$zero.
	455	* (Generated at: @{[ scalar gmtime ]} GMT)
	456	* Any changes made here will be lost!
	457	*/
12b72891	458
	459	HEADER
	460
	461	my ($op,$title,@strs,@txt);
	462	my $doit= sub {
	463	return unless $op;
	464	my $o= __PACKAGE__->new($title,$op,@strs);
	465	print $out_fh "/*\n\t$o->{op}: $o->{title}\n\n";
	466	print $out_fh join "\n",@txt,"*/","";
	467	for ('', 'U', 'L') {
	468	print $out_fh $o->ternary( $_ );
	469	print $out_fh $o->ternary( $_,'_safe' );
	470	}
	471	print $out_fh $o->ternary( 'c' );
	472	};
	473	while (<DATA>) {
	474	next unless /\S/;
	475	chomp;
	476	if (/^([A-Z]+)/) {
	477	$doit->();
	478	($op,$title)=split /\s:\s/,$_,2;
	479	@txt=@strs=();
	480	} else {
	481	push @txt, "\t$_";
	482	s/#.*$//;
	483	if (/^0x/) {
	484	push @strs,map { chr $_ } eval $_;
	485	} elsif (/^[""'']/) {
	486	push @strs,eval $_;
	487	}
	488	}
	489	}
	490	$doit->();
58fbde93	491	print $out_fh "/* ex: set ro: */\n";
12b72891	492	print "$path has been updated\n";
	493
	494	__DATA__
	495	LNBREAK: Line Break: \R
	496	"\x0D\x0A" # CRLF - Network (Windows) line ending
	497	0x0A # LF \| LINE FEED
	498	0x0B # VT \| VERTICAL TAB
	499	0x0C # FF \| FORM FEED
	500	0x0D # CR \| CARRIAGE RETURN
	501	0x85 # NEL \| NEXT LINE
	502	0x2028 # LINE SEPARATOR
	503	0x2029 # PARAGRAPH SEPARATOR
	504
	505	HORIZWS: Horizontal Whitespace: \h \H
	506	0x09 # HT
	507	0x20 # SPACE
	508	0xa0 # NBSP
	509	0x1680 # OGHAM SPACE MARK
	510	0x180e # MONGOLIAN VOWEL SEPARATOR
	511	0x2000 # EN QUAD
	512	0x2001 # EM QUAD
	513	0x2002 # EN SPACE
	514	0x2003 # EM SPACE
	515	0x2004 # THREE-PER-EM SPACE
	516	0x2005 # FOUR-PER-EM SPACE
	517	0x2006 # SIX-PER-EM SPACE
	518	0x2007 # FIGURE SPACE
	519	0x2008 # PUNCTUATION SPACE
	520	0x2009 # THIN SPACE
	521	0x200A # HAIR SPACE
	522	0x202f # NARROW NO-BREAK SPACE
	523	0x205f # MEDIUM MATHEMATICAL SPACE
	524	0x3000 # IDEOGRAPHIC SPACE
	525
	526	VERTWS: Vertical Whitespace: \v \V
	527	0x0A # LF
	528	0x0B # VT
	529	0x0C # FF
	530	0x0D # CR
	531	0x85 # NEL
	532	0x2028 # LINE SEPARATOR
	533	0x2029 # PARAGRAPH SEPARATOR
	534