[p5sagit/p5-mst-13.2.git] / Porting / regcharclass.pl

package UTF8::Matcher;
use strict;
use warnings;
use Text::Wrap qw(wrap);
use Encode;
use Data::Dumper;

our $hex_fmt="0x%02X";

# Author: Yves Orton (demerphq) 2007.

=pod

Dynamically generates macros for detecting special charclasses
in both latin-1, utf8, and codepoint forms.

To regenerate regcharclass.h, run this script from perl-root. No arguments
are necessary.

Each charclass handler is constructed as follows:
Each string the charclass must match  is rendered as unicode (codepoints>255),
and if possible  as latin1 (codepoints>127), and if possible as "neutral"
(all codepoints<128).

The rendered strings are then inserted into digit-tries by type and length.
With shorter strings being added to tries that are allowed to contain longer
strings, but not vice versa.  Thus the "longest" trie contains all strings
for that charclass.

The following types of trie are generated:

  n - Neutral only. All strings in this type have codepoints<128
  l - Latin1 only. All strings in this type have a codepoint>127 in them
  u - UTF8 only.   All strings in this type have a codepoint>255 in them
  L - Latin1. All strings in 'n' and 'l'
  U - UTF8.   All string in 'n' and 'u'
  c - Codepoint. All strings in U but in codepoint and not utf8 form.

The ternary() routine is responsible for converting the trie data into a
ternary conditional that matches the required set of strings. The generated
macro normally takes at least the argument 's' which is expected to be a
pointer of type C<char *> or C<U8 *>. The condition generated will be
optimised to match the string as efficiently as possible, with range lookups
being used where possible, and in some situations relying on "true" to be 1.

ternary() takes two optional arguments, $type which is one of the above
characters and $ext which is used to add an extra extension to the macro name.

If $type is omitted or false then the generated macro will take an additional
argument, 'is_utf8'.

If $ext has the string 'safe' in it then the generated macro will take an extra
argument 'e' for the end of the string, and all lookups will be length checked
to prevent lookups past e. If 'safe' is not used then the lookup is assumed to
be guaranteed safe, and no 'e' argument is provided  and no length checks are
made during execution.

The 'c' type is different as compared to the rest. Instead of producing
a condition that does octet comparisons of a string array, the 'c' type
produces a macro that takes a single codepoint as an argument (instead of a
char* or U8*) and does the lookup based on only that char, thus it cannot be
used to match multi-codepoint sequences like "\r\n" in the LNBREAK charclass.
This is primarily used for populating charclass bitmaps for codepoints 0..255
but will also match codepoints in the unicode range if necessary.

Using LNBREAK as an example the following macros will be produced:

=over 4

=item is_LNBREAK(s,is_utf8)

=item is_LNBREAK_safe(s,e,is_utf8)

Do a lookup as apporpriate based on the is_utf8 flag. When possible
comparisons involving octect<128 are done before checking the is_utf8
flag, hopefully saving time.

=item is_LNBREAK_utf8(s)

=item is_LNBREAK_utf8_safe(s,e)

Do a lookup assuming the string is encoded in (normalized) UTF8.

=item is_LNBREAK_latin1(s)

=item is_LNBREAK_latin1_safe(s,e)

Do a lookup assuming the string is encoded in latin-1 (aka plan octets).

=item is_LNBREAK_cp(cp)

Check to see if the string matches a given codepoint (hypotethically a
U32). The condition is constructed as as to "break out" as early as
possible if the codepoint is out of range of the condition.

IOW:

  (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))

Thus if the character is X+1 only two comparisons will be done. Making
matching lookups slower, but non-matching faster.

=back

=cut

# store a list of numbers into a hash based trie.
sub _trie_store {
    my $root= shift;
    foreach my $b ( @_ ) {
        $root->{$b} ||= {};
        $root= $root->{$b};
    }
    $root->{''}++;
}

# Convert a string into its neutral, latin1, utf8 forms, where
# the form is undefined unless the string can be completely represented
# in that form. The string is then decomposed into the octects representing
# it. A list is returned for each. Additional a list of codepoints making
# up the string.
# returns (\@n,\@u,\@l,\@cp)
#
sub _uni_latin1 {
    my $str= shift;
    my $u= eval { Encode::encode( "utf8",       "$str", Encode::FB_CROAK ) };
    my $l= eval { Encode::encode( "iso-8859-1", "$str", Encode::FB_CROAK ) };
    my $n= $l;
    undef $n if defined( $n ) && $str =~ /[^\x00-\x7F]/;
    return ((map { $_ ? [ unpack "U0C*", $_ ] : $_ } ( $n, $u, $l )),
            [map { ord $_ } split //,$str]);
}

# store an array ref of char data into the appropriate
# type bins, tracking sizes as we go.
sub _store {
    my ( $self, $r, @k )= @_;
    for my $z ( @k ) {
        $self->{size}{$z}{ 0 + @$r }++;
        push @{ $self->{data}{$z} }, $r;
    }
}

# construct a new charclass constructor object.
# $title ends up in the code a as a comment.
# $opcode is the name of the operation the charclass implements.
# the rest of the arguments are strings that the charclass
# can match.
sub new {
    my $class= shift;
    my $title= shift;
    my $opcode= shift;
    my $self= bless { op => $opcode, title => $title }, $class;
    my %seen;
    # convert the strings to the numeric equivelents and store
    # them for later insertion while tracking their sizes.
    foreach my $seq ( @_ ) {
        next if $seen{$seq}++;
        push @{$self->{seq}},$seq;
        my ( $n, $u, $l,$cp )= _uni_latin1( $seq );
        if ( $n ) {
            _store( $self, $n, qw(n U L) );
        } else {
            if ( $l ) {
                _store( $self, $l, qw(l L) );
            }
            _store( $self, $u, qw(u U) );
        }
        _store($self,$cp,'c');
    }
    #
    # now construct the tries. For each type of data we insert
    # the data into all the tries of length $size and smaller.
    #

    my %allsize;
    foreach my $k ( keys %{ $self->{data} } ) {
        my @size= sort { $b <=> $a } keys %{ $self->{size}{$k} };
        $self->{size}{$k}=\@size;
        undef @allsize{@size};
        foreach my $d ( @{ $self->{data}{$k} } ) {
            foreach my $sz ( @size ) {
                last if $sz < @$d;
                $self->{trie}{$k}{$sz} ||= {};
                _trie_store( $self->{trie}{$k}{$sz}, @$d );
            }
        }
        #delete $self->{data}{$k};
    }
    my @size= sort { $b <=> $a } keys %allsize;
    $self->{size}{''}= \@size;
    return $self;
}

#
# _cond([$v1,$v2,$v2...],$ofs)
#
# converts an array of codepoints into a conditional expression
# consequtive codepoints are merged into a range test
# returns a string containing the conditional expression in the form
# '( li[x]==v || li[x]==y )' When possible we also use range lookups.

sub _cond {
    my ( $c, $ofs,$fmt )= @_;
    $fmt||='((U8*)s)[%d]';
    # cheapo rangification routine.
    # Convert the first element into a singleton represented
    # as [$x,$x] and then merge the rest in as we go.
    my @v= sort { $a <=> $b } @$c;
    my @r= ( [ ( shift @v ) x 2 ] );
    for my $n ( @v ) {
        if ( $n == $r[-1][1] + 1 ) {
            $r[-1][1]++;
        } else {
            push @r, [ $n, $n ];
        }
    }
    @r = map { $_->[0]==$_->[1]-1 ? ([$_->[0],$_->[0]],[$_->[1],$_->[1]]) : $_} @r;
    # sort the ranges by size and order.
    @r= sort { $a->[0] <=> $b->[0] }  @r;
    my $alu= sprintf $fmt,$ofs;    # C array look up

    if ($fmt=~/%d/) {
        # map the ranges into conditions
        @r= map {
            # singleton
            $_->[0] == $_->[1] ? sprintf("$alu == $hex_fmt",$_->[0]) :
            # range
            sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_)
        } @r;
        # return the joined results.
        return '( ' . join( " || ", @r ) . ' )';
    } else {
        return combine($alu,@r);
    }
}

#
# Do the condition in such a way that we break out early if the value
# we are looking at is in between two elements in the list.
# Currently used only for codepoint macros (depth 1)
#
sub combine {
    my $alu=shift;
    local $_ = shift;
    my $txt= $_->[0] == $_->[1]
           ? sprintf("$alu == $hex_fmt",$_->[0])
           : sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_);
    return $txt unless @_;
    return sprintf "( %s ||( %s > 0x%02X &&\n%s ) )",
        $txt,$alu,$_->[1],combine($alu,@_);
}

# recursively convert a trie to an optree represented by
# [condition,yes,no] where  yes and no can be a ref to another optree
# or a scalar representing code.
# called by make_optree

sub _trie_to_optree {
    my ( $node, $ofs, $else, $fmt )= @_;
    return $else unless $node;
    $ofs ||= 0;
    if ( $node->{''} ) {
        $else= $ofs;
    } else {
        $else ||= 0;
    }
    my @k= sort { $b->[1] cmp $a->[1] || $a->[0] <=> $b->[0] }
      map { [ $_, Dumper( $node->{$_} ), $node->{$_} ] }
      grep length, keys %$node;

    return $ofs if !@k;

    my ( $root, $expr );
    while ( @k ) {
        my @cond= ( $k[0][0] );
        my $d= $k[0][1];
        my $r= $k[0][2];
        shift @k;
        while ( @k && $k[0][1] eq $d ) {
            push @cond, $k[0][0];
            shift @k;
        }
        my $op=
          [ _cond( \@cond, $ofs, $fmt ), _trie_to_optree( $r, $ofs + 1, $else, $fmt ) ];
        if ( !$root ) {
            $root= $expr= $op;
        } else {
            push @$expr, $op;
            $expr= $op;
        }
    }
    push @$expr, $else;
    return $root;
}

# construct the optree for a type.
# handles the special logic of type ''.
sub make_optree {
    my ( $self, $type, $size, $fmt )= @_;
    my $else= 0;
    $size||=$self->{size}{$type}[0];
    $size=1 if $type eq 'c';
    if ( !$type ) {
        my ( $u, $l );
        if ($self->{trie}{u}) {
            for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) {
                $u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt );
            }
        }
        if ($self->{trie}{l}) {
            for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) {
                $l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt );
            }
        }
        if ( $u ) {
            $else= [ '(is_utf8)', $u, $l || 0 ];
        } elsif ( $l ) {
            $else= [ '(!is_utf8)', $l, 0 ];
        }
        $type= 'n';
    }
    if (!$self->{trie}{$type}) {
        return $else;
    } else {
        $size-- while $size>0 && !$self->{trie}{$type}{$size};
        return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt );
    }
}

# construct the optree for a type with length checks to prevent buffer
# overruns. Only one length check is performed per lookup trading code
# size for speed.
sub length_optree {
    my ( $self, $type,$fmt )= @_;
    $type ||= '';
    return $self->{len_op}{$type} if $self->{len_op}{$type};
    my @size = @{$self->{size}{$type}};

    my ( $root, $expr );
    foreach my $size ( @size ) {
        my $op= [
            "( (e) - (s) > " . ( $size - 1 ) . " )",
            $self->make_optree( $type, $size ),
        ];
        if ( !$root ) {
            $root= $expr= $op;
        } else {
            push @$expr, $op;
            $expr= $op;
        }
    }
    push @$expr, 0;
    return $self->{len_op}{$type}= $root ? $root : $expr->[0];
}

#
# recursively walk an optree and covert it to a huge nested ternary expression.
#
sub _optree_to_ternary {
    my ( $node )= @_;
    return $node
      if !ref $node;
    my $depth = 0;
    if ( $node->[0] =~ /\[(\d+)\]/ ) {
        $depth= $1 + 1;
    }
    return sprintf "\n%s( %s ? %s : %s )", "  " x $depth, $node->[0],
      _optree_to_ternary( $node->[1] ), _optree_to_ternary( $node->[2] );
}

# add \\ to the end of strings in a reasonable neat way.
sub _macro($) {
    my $str= shift;
    my @lines= split /[^\S\n]*\n/, $str;
    my $macro = join( "\\\n", map { sprintf "%-76s", $_ } @lines );
    $macro =~ s/  *$//;
    return $macro . "\n\n";
}

# default type extensions. 'uln' dont have one because normally
# they are used only as part of type '' which doesnt get an extension
my %ext= (
    U => '_utf8',
    L => '_latin1',
    c => '_cp',

);

# produce the ternary, handling arguments and putting on the macro headers
# and boiler plate
sub ternary {
    my ( $self, $type, $ext )= @_;
    $type ||= '';
    $ext = ($ext{$type} || '') . ($ext||"");
    my ($root,$fmt,$arg);
    if ($type eq 'c') {
        $arg= $fmt= 'cp';
    } else {
        $arg= 's';
    }
    if ( $type eq 'c' || $ext !~ /safe/) {
        $root= $self->make_optree( $type, 0, $fmt );
    } else {
        $root= $self->length_optree( $type, $fmt );
    }

    our $parens;
    $parens= qr/ \( (?: (?> [^()]+? ) | (??{$parens}) )+? \) /x;
    my $expr= qr/
        \( \s*
        ($parens)
        \s* \? \s*
        \( \s*
        ($parens)
        \s* \? \s*
        (\d+|$parens)
        \s* : \s*
        (\d+|$parens)
        \s* \)
        \s* : \s*
        \4
        \s* \)
    /x;
    my $code= _optree_to_ternary( $root );
    for ( $code ) {
        s/^\s*//;
        1 while s/\(\s*($parens)\s*\?\s*1\s*:\s*0\s*\)/$1/g
          || s<$expr><(($1 && $2) ? $3 : $4)>g
          || s<\(\s*($parens)\s*\)><$1>g;
    }
    my @args=($arg);
    push @args,'e' if $ext=~/safe/;
    push @args,'is_utf8' if !$type;
    my $args=join ",",@args;
    return "/*** GENERATED CODE ***/\n"
          . _macro "#define is_$self->{op}$ext($args)\n$code";
}
$|++;
my $path=shift @ARGV;

if (!$path) {
    $path= "regcharclass.h";
    if (!-e $path) { $path="../$path" }
    if (!-e $path) { die "Can't find regcharclass.h to update!\n" };
}
my $out_fh;
if ($path eq '-') {
    $out_fh= \*STDOUT;
} else {
    rename $path,"$path.bak";
    open $out_fh,">",$path
        or die "Can't write to '$path':$!";
    binmode $out_fh; # want unix line endings even when run on win32.
}
my ($zero) = $0=~/([^\\\/]+)$/;
print $out_fh <<"HEADER";
/*  -*- buffer-read-only: t -*-
 *
 *    regcharclass.h
 *
 *    Copyright (C) 2007, by Larry Wall and others
 *
 *    You may distribute under the terms of either the GNU General Public
 *    License or the Artistic License, as specified in the README file.
 *
 * !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 * This file is built by Porting/$zero.
 * (Generated at: @{[ scalar gmtime ]} GMT)
 * Any changes made here will be lost!
 */

HEADER

my ($op,$title,@strs,@txt,$type);
my $doit= sub {
    return unless $op;
    my $o= __PACKAGE__->new($title,$op,@strs);
    print $out_fh "/*\n\t$o->{op}: $o->{title}\n\n";
    print $out_fh join "\n",@txt,"*/","";
    $type||="U L c _safe";
    my @ext=("");
    my @types=("",map{ if (length $_>1) { push @ext,$_; () } else { $_ } }
              split /\s+/,$type);
    for my $type (@types) {
        for my $ext (@ext) {
            next if $type eq 'c' and $ext eq '_safe';
            print $out_fh $o->ternary( $type,$ext );
        }
    }
};
while (<DATA>) {
    next unless /\S/;
    chomp;
    if (/^([A-Z]+)/) {
        $doit->();
        ($op,$title)=split /\s*:\s*/,$_,2;
        @txt=@strs=();
        $type="";
    } elsif (/^=(.*)/) {
        $type.=$1;
    } else {
        push @txt, "\t$_";
        s/#.*$//;
        if (/^0x/) {
            push @strs,map { chr $_ } eval $_;
        } elsif (/^[""'']/) {
            push @strs,eval $_;
        }
    }
}
$doit->();
print $out_fh "/* ex: set ro: */\n";

__DATA__
LNBREAK: Line Break: \R
"\x0D\x0A"      # CRLF - Network (Windows) line ending
0x0A            # LF  | LINE FEED
0x0B            # VT  | VERTICAL TAB
0x0C            # FF  | FORM FEED
0x0D            # CR  | CARRIAGE RETURN
0x85            # NEL | NEXT LINE
0x2028          # LINE SEPARATOR
0x2029          # PARAGRAPH SEPARATOR

HORIZWS: Horizontal Whitespace: \h \H
0x09            # HT
0x20            # SPACE
0xa0            # NBSP
0x1680          # OGHAM SPACE MARK
0x180e          # MONGOLIAN VOWEL SEPARATOR
0x2000          # EN QUAD
0x2001          # EM QUAD
0x2002          # EN SPACE
0x2003          # EM SPACE
0x2004          # THREE-PER-EM SPACE
0x2005          # FOUR-PER-EM SPACE
0x2006          # SIX-PER-EM SPACE
0x2007          # FIGURE SPACE
0x2008          # PUNCTUATION SPACE
0x2009          # THIN SPACE
0x200A          # HAIR SPACE
0x202f          # NARROW NO-BREAK SPACE
0x205f          # MEDIUM MATHEMATICAL SPACE
0x3000          # IDEOGRAPHIC SPACE

VERTWS: Vertical Whitespace: \v \V
0x0A            # LF
0x0B            # VT
0x0C            # FF
0x0D            # CR
0x85            # NEL
0x2028          # LINE SEPARATOR
0x2029          # PARAGRAPH SEPARATOR

TRICKYFOLD: Problematic fold case letters.
0x00DF	# LATIN SMALL LETTER SHARP S
0x0390	# GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
0x03B0	# GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
Commit	Line	Data
12b72891	1	package UTF8::Matcher;
	2	use strict;
	3	use warnings;
	4	use Text::Wrap qw(wrap);
	5	use Encode;
	6	use Data::Dumper;
	7
0ccab2bc	8	our $hex_fmt="0x%02X";
0ccab2bc	9
12b72891	10	# Author: Yves Orton (demerphq) 2007.
	11
	12	=pod
	13
	14	Dynamically generates macros for detecting special charclasses
	15	in both latin-1, utf8, and codepoint forms.
	16
	17	To regenerate regcharclass.h, run this script from perl-root. No arguments
	18	are necessary.
	19
	20	Each charclass handler is constructed as follows:
	21	Each string the charclass must match is rendered as unicode (codepoints>255),
	22	and if possible as latin1 (codepoints>127), and if possible as "neutral"
	23	(all codepoints<128).
	24
	25	The rendered strings are then inserted into digit-tries by type and length.
	26	With shorter strings being added to tries that are allowed to contain longer
	27	strings, but not vice versa. Thus the "longest" trie contains all strings
	28	for that charclass.
	29
	30	The following types of trie are generated:
	31
	32	n - Neutral only. All strings in this type have codepoints<128
	33	l - Latin1 only. All strings in this type have a codepoint>127 in them
	34	u - UTF8 only. All strings in this type have a codepoint>255 in them
	35	L - Latin1. All strings in 'n' and 'l'
	36	U - UTF8. All string in 'n' and 'u'
	37	c - Codepoint. All strings in U but in codepoint and not utf8 form.
	38
	39	The ternary() routine is responsible for converting the trie data into a
	40	ternary conditional that matches the required set of strings. The generated
	41	macro normally takes at least the argument 's' which is expected to be a
	42	pointer of type C<char > or C<U8 >. The condition generated will be
	43	optimised to match the string as efficiently as possible, with range lookups
	44	being used where possible, and in some situations relying on "true" to be 1.
	45
	46	ternary() takes two optional arguments, $type which is one of the above
	47	characters and $ext which is used to add an extra extension to the macro name.
	48
	49	If $type is omitted or false then the generated macro will take an additional
	50	argument, 'is_utf8'.
	51
	52	If $ext has the string 'safe' in it then the generated macro will take an extra
	53	argument 'e' for the end of the string, and all lookups will be length checked
	54	to prevent lookups past e. If 'safe' is not used then the lookup is assumed to
	55	be guaranteed safe, and no 'e' argument is provided and no length checks are
	56	made during execution.
	57
	58	The 'c' type is different as compared to the rest. Instead of producing
	59	a condition that does octet comparisons of a string array, the 'c' type
	60	produces a macro that takes a single codepoint as an argument (instead of a
	61	char* or U8*) and does the lookup based on only that char, thus it cannot be
	62	used to match multi-codepoint sequences like "\r\n" in the LNBREAK charclass.
	63	This is primarily used for populating charclass bitmaps for codepoints 0..255
	64	but will also match codepoints in the unicode range if necessary.
	65
	66	Using LNBREAK as an example the following macros will be produced:
	67
	68	=over 4
	69
	70	=item is_LNBREAK(s,is_utf8)
	71
	72	=item is_LNBREAK_safe(s,e,is_utf8)
	73
74	Do a lookup as apporpriate based on the is_utf8 flag. When possible
75	comparisons involving octect<128 are done before checking the is_utf8
76	flag, hopefully saving time.
77
78	=item is_LNBREAK_utf8(s)
79
80	=item is_LNBREAK_utf8_safe(s,e)
81
82	Do a lookup assuming the string is encoded in (normalized) UTF8.
83
84	=item is_LNBREAK_latin1(s)
85
86	=item is_LNBREAK_latin1_safe(s,e)
87
88	Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
89
90	=item is_LNBREAK_cp(cp)
91
92	Check to see if the string matches a given codepoint (hypotethically a
93	U32). The condition is constructed as as to "break out" as early as
94	possible if the codepoint is out of range of the condition.
95
96	IOW:
97
98	(cp==X \|\| (cp>X && (cp==Y \|\| (cp>Y && ...))))
99
100	Thus if the character is X+1 only two comparisons will be done. Making
101	matching lookups slower, but non-matching faster.
102
103	=back
104
105	=cut
106
107	# store a list of numbers into a hash based trie.
108	sub _trie_store {
109	my $root= shift;
110	foreach my $b ( @_ ) {
111	$root->{$b} \|\|= {};
112	$root= $root->{$b};
113	}
114	$root->{''}++;
115	}
116
117	# Convert a string into its neutral, latin1, utf8 forms, where
118	# the form is undefined unless the string can be completely represented
119	# in that form. The string is then decomposed into the octects representing
120	# it. A list is returned for each. Additional a list of codepoints making
121	# up the string.
122	# returns (\@n,\@u,\@l,\@cp)
123	#
124	sub _uni_latin1 {
125	my $str= shift;
126	my $u= eval { Encode::encode( "utf8", "$str", Encode::FB_CROAK ) };
127	my $l= eval { Encode::encode( "iso-8859-1", "$str", Encode::FB_CROAK ) };
128	my $n= $l;
129	undef $n if defined( $n ) && $str =~ /[^\x00-\x7F]/;
130	return ((map { $_ ? [ unpack "U0C*", $_ ] : $_ } ( $n, $u, $l )),
131	[map { ord $_ } split //,$str]);
132	}
133
134	# store an array ref of char data into the appropriate
135	# type bins, tracking sizes as we go.
136	sub _store {
137	my ( $self, $r, @k )= @_;
138	for my $z ( @k ) {
139	$self->{size}{$z}{ 0 + @$r }++;
140	push @{ $self->{data}{$z} }, $r;
141	}
142	}
143
144	# construct a new charclass constructor object.
145	# $title ends up in the code a as a comment.
146	# $opcode is the name of the operation the charclass implements.
147	# the rest of the arguments are strings that the charclass
148	# can match.
149	sub new {
150	my $class= shift;
151	my $title= shift;
152	my $opcode= shift;
153	my $self= bless { op => $opcode, title => $title }, $class;
154	my %seen;
155	# convert the strings to the numeric equivelents and store
156	# them for later insertion while tracking their sizes.
157	foreach my $seq ( @_ ) {
158	next if $seen{$seq}++;
159	push @{$self->{seq}},$seq;
160	my ( $n, $u, $l,$cp )= _uni_latin1( $seq );
161	if ( $n ) {
162	_store( $self, $n, qw(n U L) );
163	} else {
164	if ( $l ) {
165	_store( $self, $l, qw(l L) );
166	}
167	_store( $self, $u, qw(u U) );
168	}
169	_store($self,$cp,'c');
170	}
171	#
172	# now construct the tries. For each type of data we insert
173	# the data into all the tries of length $size and smaller.
174	#
175
176	my %allsize;
177	foreach my $k ( keys %{ $self->{data} } ) {
178	my @size= sort { $b <=> $a } keys %{ $self->{size}{$k} };
179	$self->{size}{$k}=\@size;
180	undef @allsize{@size};
181	foreach my $d ( @{ $self->{data}{$k} } ) {
182	foreach my $sz ( @size ) {
183	last if $sz < @$d;
184	$self->{trie}{$k}{$sz} \|\|= {};
185	_trie_store( $self->{trie}{$k}{$sz}, @$d );
186	}
187	}
188	#delete $self->{data}{$k};
189	}
190	my @size= sort { $b <=> $a } keys %allsize;
191	$self->{size}{''}= \@size;
192	return $self;
193	}
194
195	#
196	# _cond([$v1,$v2,$v2...],$ofs)
197	#
198	# converts an array of codepoints into a conditional expression
199	# consequtive codepoints are merged into a range test
200	# returns a string containing the conditional expression in the form
201	# '( li[x]==v \|\| li[x]==y )' When possible we also use range lookups.
202
203	sub _cond {
204	my ( $c, $ofs,$fmt )= @_;
205	$fmt\|\|='((U8*)s)[%d]';
206	# cheapo rangification routine.
207	# Convert the first element into a singleton represented
208	# as [$x,$x] and then merge the rest in as we go.
209	my @v= sort { $a <=> $b } @$c;
210	my @r= ( [ ( shift @v ) x 2 ] );
211	for my $n ( @v ) {
212	if ( $n == $r[-1][1] + 1 ) {
213	$r[-1][1]++;
214	} else {
215	push @r, [ $n, $n ];
216	}
217	}
218	@r = map { $_->[0]==$_->[1]-1 ? ([$_->[0],$_->[0]],[$_->[1],$_->[1]]) : $_} @r;
219	# sort the ranges by size and order.
220	@r= sort { $a->[0] <=> $b->[0] } @r;
221	my $alu= sprintf $fmt,$ofs; # C array look up
222
223	if ($fmt=~/%d/) {
224	# map the ranges into conditions
225	@r= map {
226	# singleton
0ccab2bc	227	$_->[0] == $_->[1] ? sprintf("$alu == $hex_fmt",$_->[0]) :
12b72891	228	# range
0ccab2bc	229	sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_)
12b72891	230	} @r;
	231	# return the joined results.
	232	return '( ' . join( " \|\| ", @r ) . ' )';
	233	} else {
	234	return combine($alu,@r);
	235	}
	236	}
	237
	238	#
	239	# Do the condition in such a way that we break out early if the value
	240	# we are looking at is in between two elements in the list.
	241	# Currently used only for codepoint macros (depth 1)
	242	#
	243	sub combine {
	244	my $alu=shift;
	245	local $_ = shift;
	246	my $txt= $_->[0] == $_->[1]
0ccab2bc	247	? sprintf("$alu == $hex_fmt",$_->[0])
0ccab2bc	248	: sprintf("($hex_fmt <= $alu && $alu <= $hex_fmt)",@$_);
12b72891	249	return $txt unless @_;
32e6a07c	250	return sprintf "( %s \|\|( %s > 0x%02X &&\n%s ) )",
32e6a07c	251	$txt,$alu,$_->[1],combine($alu,@_);
12b72891	252	}
	253
	254	# recursively convert a trie to an optree represented by
	255	# [condition,yes,no] where yes and no can be a ref to another optree
	256	# or a scalar representing code.
	257	# called by make_optree
	258
	259	sub _trie_to_optree {
	260	my ( $node, $ofs, $else, $fmt )= @_;
	261	return $else unless $node;
	262	$ofs \|\|= 0;
	263	if ( $node->{''} ) {
	264	$else= $ofs;
	265	} else {
	266	$else \|\|= 0;
	267	}
	268	my @k= sort { $b->[1] cmp $a->[1] \|\| $a->[0] <=> $b->[0] }
	269	map { [ $_, Dumper( $node->{$_} ), $node->{$_} ] }
	270	grep length, keys %$node;
	271
	272	return $ofs if !@k;
	273
	274	my ( $root, $expr );
	275	while ( @k ) {
	276	my @cond= ( $k[0][0] );
	277	my $d= $k[0][1];
	278	my $r= $k[0][2];
	279	shift @k;
	280	while ( @k && $k[0][1] eq $d ) {
	281	push @cond, $k[0][0];
	282	shift @k;
	283	}
	284	my $op=
	285	[ _cond( \@cond, $ofs, $fmt ), _trie_to_optree( $r, $ofs + 1, $else, $fmt ) ];
	286	if ( !$root ) {
	287	$root= $expr= $op;
	288	} else {
	289	push @$expr, $op;
	290	$expr= $op;
	291	}
	292	}
	293	push @$expr, $else;
	294	return $root;
	295	}
	296
	297	# construct the optree for a type.
	298	# handles the special logic of type ''.
	299	sub make_optree {
	300	my ( $self, $type, $size, $fmt )= @_;
	301	my $else= 0;
	302	$size\|\|=$self->{size}{$type}[0];
	303	$size=1 if $type eq 'c';
	304	if ( !$type ) {
	305	my ( $u, $l );
32e6a07c	306	if ($self->{trie}{u}) {
	307	for ( my $sz= $size ; !$u && $sz > 0 ; $sz-- ) {
	308	$u= _trie_to_optree( $self->{trie}{u}{$sz}, 0, 0, $fmt );
	309	}
12b72891	310	}
32e6a07c	311	if ($self->{trie}{l}) {
	312	for ( my $sz= $size ; !$l && $sz > 0 ; $sz-- ) {
	313	$l= _trie_to_optree( $self->{trie}{l}{$sz}, 0, 0, $fmt );
	314	}
12b72891	315	}
	316	if ( $u ) {
	317	$else= [ '(is_utf8)', $u, $l \|\| 0 ];
	318	} elsif ( $l ) {
	319	$else= [ '(!is_utf8)', $l, 0 ];
	320	}
	321	$type= 'n';
12b72891	322	}
32e6a07c	323	if (!$self->{trie}{$type}) {
	324	return $else;
	325	} else {
	326	$size-- while $size>0 && !$self->{trie}{$type}{$size};
	327	return _trie_to_optree( $self->{trie}{$type}{$size}, 0, $else, $fmt );
	328	}
12b72891	329	}
	330
	331	# construct the optree for a type with length checks to prevent buffer
	332	# overruns. Only one length check is performed per lookup trading code
	333	# size for speed.
	334	sub length_optree {
	335	my ( $self, $type,$fmt )= @_;
	336	$type \|\|= '';
	337	return $self->{len_op}{$type} if $self->{len_op}{$type};
	338	my @size = @{$self->{size}{$type}};
	339
	340	my ( $root, $expr );
	341	foreach my $size ( @size ) {
	342	my $op= [
	343	"( (e) - (s) > " . ( $size - 1 ) . " )",
	344	$self->make_optree( $type, $size ),
	345	];
	346	if ( !$root ) {
	347	$root= $expr= $op;
	348	} else {
	349	push @$expr, $op;
	350	$expr= $op;
	351	}
	352	}
	353	push @$expr, 0;
	354	return $self->{len_op}{$type}= $root ? $root : $expr->[0];
	355	}
	356
	357	#
	358	# recursively walk an optree and covert it to a huge nested ternary expression.
	359	#
	360	sub _optree_to_ternary {
	361	my ( $node )= @_;
	362	return $node
	363	if !ref $node;
	364	my $depth = 0;
	365	if ( $node->[0] =~ /\[(\d+)\]/ ) {
	366	$depth= $1 + 1;
	367	}
	368	return sprintf "\n%s( %s ? %s : %s )", " " x $depth, $node->[0],
	369	_optree_to_ternary( $node->[1] ), _optree_to_ternary( $node->[2] );
	370	}
	371
	372	# add \\ to the end of strings in a reasonable neat way.
	373	sub _macro($) {
	374	my $str= shift;
	375	my @lines= split /[^\S\n]*\n/, $str;
0ccab2bc	376	my $macro = join( "\\\n", map { sprintf "%-76s", $_ } @lines );
	377	$macro =~ s/ *$//;
	378	return $macro . "\n\n";
12b72891	379	}
	380
	381	# default type extensions. 'uln' dont have one because normally
	382	# they are used only as part of type '' which doesnt get an extension
	383	my %ext= (
	384	U => '_utf8',
	385	L => '_latin1',
	386	c => '_cp',
	387
	388	);
	389
	390	# produce the ternary, handling arguments and putting on the macro headers
	391	# and boiler plate
	392	sub ternary {
	393	my ( $self, $type, $ext )= @_;
	394	$type \|\|= '';
	395	$ext = ($ext{$type} \|\| '') . ($ext\|\|"");
	396	my ($root,$fmt,$arg);
	397	if ($type eq 'c') {
	398	$arg= $fmt= 'cp';
	399	} else {
	400	$arg= 's';
	401	}
	402	if ( $type eq 'c' \|\| $ext !~ /safe/) {
	403	$root= $self->make_optree( $type, 0, $fmt );
	404	} else {
	405	$root= $self->length_optree( $type, $fmt );
	406	}
	407
	408	our $parens;
	409	$parens= qr/ \( (?: (?> [^()]+? ) \| (??{$parens}) )+? \) /x;
	410	my $expr= qr/
	411	\( \s*
	412	($parens)
	413	\s* \? \s*
	414	\( \s*
	415	($parens)
	416	\s* \? \s*
	417	(\d+\|$parens)
	418	\s* : \s*
	419	(\d+\|$parens)
	420	\s* \)
	421	\s* : \s*
	422	\4
	423	\s* \)
	424	/x;
	425	my $code= _optree_to_ternary( $root );
	426	for ( $code ) {
	427	s/^\s*//;
	428	1 while s/\(\s($parens)\s\?\s1\s:\s0\s\)/$1/g
	429	\|\| s<$expr><(($1 && $2) ? $3 : $4)>g
	430	\|\| s<\(\s($parens)\s\)><$1>g;
	431	}
	432	my @args=($arg);
	433	push @args,'e' if $ext=~/safe/;
	434	push @args,'is_utf8' if !$type;
	435	my $args=join ",",@args;
	436	return "/* GENERATED CODE */\n"
	437	. _macro "#define is_$self->{op}$ext($args)\n$code";
	438	}
32e6a07c	439	$\|++;
12b72891	440	my $path=shift @ARGV;
32e6a07c	441
12b72891	442	if (!$path) {
	443	$path= "regcharclass.h";
	444	if (!-e $path) { $path="../$path" }
	445	if (!-e $path) { die "Can't find regcharclass.h to update!\n" };
	446	}
32e6a07c	447	my $out_fh;
	448	if ($path eq '-') {
	449	$out_fh= \*STDOUT;
	450	} else {
	451	rename $path,"$path.bak";
	452	open $out_fh,">",$path
	453	or die "Can't write to '$path':$!";
	454	binmode $out_fh; # want unix line endings even when run on win32.
	455	}
58fbde93	456	my ($zero) = $0=~/([^\\\/]+)$/;
12b72891	457	print $out_fh <<"HEADER";
58fbde93	458	/* -- buffer-read-only: t --
	459	*
	460	* regcharclass.h
	461	*
	462	* Copyright (C) 2007, by Larry Wall and others
	463	*
	464	* You may distribute under the terms of either the GNU General Public
	465	* License or the Artistic License, as specified in the README file.
	466	*
	467	* !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
	468	* This file is built by Porting/$zero.
	469	* (Generated at: @{[ scalar gmtime ]} GMT)
	470	* Any changes made here will be lost!
	471	*/
12b72891	472
	473	HEADER
	474
32e6a07c	475	my ($op,$title,@strs,@txt,$type);
12b72891	476	my $doit= sub {
	477	return unless $op;
	478	my $o= __PACKAGE__->new($title,$op,@strs);
	479	print $out_fh "/*\n\t$o->{op}: $o->{title}\n\n";
	480	print $out_fh join "\n",@txt,"*/","";
32e6a07c	481	$type\|\|="U L c _safe";
	482	my @ext=("");
	483	my @types=("",map{ if (length $_>1) { push @ext,$_; () } else { $_ } }
	484	split /\s+/,$type);
	485	for my $type (@types) {
	486	for my $ext (@ext) {
	487	next if $type eq 'c' and $ext eq '_safe';
	488	print $out_fh $o->ternary( $type,$ext );
	489	}
12b72891	490	}
12b72891	491	};
	492	while (<DATA>) {
	493	next unless /\S/;
	494	chomp;
	495	if (/^([A-Z]+)/) {
	496	$doit->();
	497	($op,$title)=split /\s:\s/,$_,2;
	498	@txt=@strs=();
32e6a07c	499	$type="";
	500	} elsif (/^=(.*)/) {
	501	$type.=$1;
12b72891	502	} else {
	503	push @txt, "\t$_";
	504	s/#.*$//;
	505	if (/^0x/) {
	506	push @strs,map { chr $_ } eval $_;
	507	} elsif (/^[""'']/) {
	508	push @strs,eval $_;
	509	}
	510	}
	511	}
	512	$doit->();
58fbde93	513	print $out_fh "/* ex: set ro: */\n";
12b72891	514
	515	__DATA__
	516	LNBREAK: Line Break: \R
	517	"\x0D\x0A" # CRLF - Network (Windows) line ending
	518	0x0A # LF \| LINE FEED
	519	0x0B # VT \| VERTICAL TAB
	520	0x0C # FF \| FORM FEED
	521	0x0D # CR \| CARRIAGE RETURN
	522	0x85 # NEL \| NEXT LINE
	523	0x2028 # LINE SEPARATOR
	524	0x2029 # PARAGRAPH SEPARATOR
	525
	526	HORIZWS: Horizontal Whitespace: \h \H
	527	0x09 # HT
	528	0x20 # SPACE
	529	0xa0 # NBSP
	530	0x1680 # OGHAM SPACE MARK
	531	0x180e # MONGOLIAN VOWEL SEPARATOR
	532	0x2000 # EN QUAD
	533	0x2001 # EM QUAD
	534	0x2002 # EN SPACE
	535	0x2003 # EM SPACE
	536	0x2004 # THREE-PER-EM SPACE
	537	0x2005 # FOUR-PER-EM SPACE
	538	0x2006 # SIX-PER-EM SPACE
	539	0x2007 # FIGURE SPACE
	540	0x2008 # PUNCTUATION SPACE
	541	0x2009 # THIN SPACE
	542	0x200A # HAIR SPACE
	543	0x202f # NARROW NO-BREAK SPACE
	544	0x205f # MEDIUM MATHEMATICAL SPACE
	545	0x3000 # IDEOGRAPHIC SPACE
	546
	547	VERTWS: Vertical Whitespace: \v \V
	548	0x0A # LF
	549	0x0B # VT
	550	0x0C # FF
	551	0x0D # CR
	552	0x85 # NEL
	553	0x2028 # LINE SEPARATOR
	554	0x2029 # PARAGRAPH SEPARATOR
	555
32e6a07c	556	TRICKYFOLD: Problematic fold case letters.
	557	0x00DF # LATIN SMALL LETTER SHARP S
	558	0x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
	559	0x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS