#!/usr/bin/perl -w

#
# mktables -- create the runtime Perl Unicode files (lib/unicore/**/*.pl)
# from the Unicode database files (lib/unicore/*.txt).
#

use strict;

my $LastUnicodeCodepoint = 0x10FFFF; # As of Unicode 3.1.1.

mkdir("In", 0755);
mkdir("Is", 0755);
mkdir("To", 0755);

sub extend {
    my ($table, $last) = @_;

    $table->[-1]->[1] = $last;
}

sub append {
    my ($table, $code, $name) = @_;
    if (@$table &&
	hex($table->[-1]->[1]) == hex($code) - 1 &&
	(!defined $name || $table->[-1]->[2] eq $name)) {
	extend($table, $code);
    } else {
	push @$table, [$code, $code, $name];
    }
}

sub inverse {
    my ($table) = @_;
    my $inverse = [];
    my ($first, $last);
    if ($table->[0]->[0]) {
	$last = hex($table->[0]->[0]);
	push @$inverse, [ "0000",
			  sprintf("%04X", $last - 1) ];
    }
    for my $i (0..$#$table-1) {
	$first = defined $table->[$i    ]->[1] ?
	             hex($table->[$i    ]->[1]) : 0;
	$last  = defined $table->[$i + 1]->[0] ?
	             hex($table->[$i + 1]->[0]) : $first;
	push @$inverse, [ sprintf("%04X", $first + 1),
			  sprintf("%04X", $last  - 1) ]
			      unless $first + 1 == $last;
    }
    return $inverse;
}

sub header {
    my $fh = shift;

    print $fh <<EOT;
# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!! 
# This file is built by $0 from e.g. Unicode.txt.
# Any changes made here will be lost!
EOT
}

sub begin {
    my $fh = shift;

    print $fh <<EOT;
return <<'END';
EOT
}

sub end {
    my $fh = shift;
    
    print $fh <<EOT;
END
EOT
}

sub flush {
    my ($table, $file) = @_;
    print "$file\n";
    if (open(my $fh, ">$file")) {
	header($fh);
	begin($fh);
	for my $i (@$table) {
	    print $fh $i->[0], "\t",
                      $i->[1] ne $i->[0] ? $i->[1] : "", "\t",
                      defined $i->[2] ? $i->[2] : "", "\n";
	}
	end($fh);
	close($fh);
    } else {
	die "$0: $file: $!\n";
    }
}

#
# The %In contains the mapping of the script/block name into a number.
#

my %In;
my $InId = 0;
my %InIn;

#
# Read in the Unicode.txt, the main Unicode database.
#

my %Cat;
my %General;
my @General;

if (open(my $Unicode, "Unicode.txt")) {
    my @Name;
    my @Bidi;
    my %Bidi;
    my @Comb;
    my @Deco;
    my %Deco;
    my %DC;
    my @Number;
    my @Mirrored;
    my %To;
    while (<$Unicode>) {
	next unless /^[0-9A-Fa-f]+;/;
	s/\s+$//;

	my ($code, $name, $cat, $comb, $bidi, $deco,
	    $decimal, $digit, $number,
	    $mirrored, $unicode10, $comment,
	    $upper, $lower, $title) = split(/\s*;\s*/);

	if ($name =~ /^<(.+), (First|Last)>$/) {
	    $name = $1;
	    if ($2 eq 'First') {
		append($General{$name} ||= [], $code, $name);
	    } else {
		extend($General{$name}       , $code);
	    }
	    unless (defined $In{$name}) {
		$In{$name}   = $InId++;
		$InIn{$name} = $General{$name};
	    }
	    append($Cat{$cat}       ||= [], $code);
	    append($Cat{substr($cat, 0, 1)}
		                    ||= [], $code);
	} else {
	    append(\@Name,                  $code, $name);

	    append(\@General,               $code, $cat);

	    append($Cat{$cat}       ||= [], $code);
	    append($Cat{substr($cat, 0, 1)}
		                    ||= [], $code);
	    # 005F: SPACING UNDERSCORE
	    append($Cat{Word}       ||= [], $code)
		if $cat =~ /^[LMN]/ or $code eq "005F";
	    append($Cat{Alnum}      ||= [], $code)
		if $cat =~ /^[LMN]/;
	    append($Cat{Alpha}      ||= [], $code)
		if $cat =~ /^[LM]/;
	    # 0009: HORIZONTAL TABULATION
	    # 000A: LINE FEED
	    # 000B: VERTICAL TABULATION
	    # 000C: FORM FEED
	    # 000D: CARRIAGE RETURN
	    # 0020: SPACE
	    append($Cat{Space}      ||= [], $code)
		if $cat  =~ /^Z/ ||
		    $code =~ /^(0009|000A|000B|000C|000D)$/;
	    append($Cat{SpacePerl}  ||= [], $code)
		if $cat  =~ /^Z/ ||
		    $code =~ /^(0009|000A|000C|000D)$/;
	    append($Cat{Blank}      ||= [], $code)
		if $code =~ /^(0020|0009)$/ ||
		    $cat  =~ /^Z[^lp]$/;
	    append($Cat{Digit}      ||= [], $code) if $cat eq "Nd";
	    append($Cat{Upper}      ||= [], $code) if $cat eq "Lu";
	    append($Cat{Lower}      ||= [], $code) if $cat eq "Ll";
	    append($Cat{Title}      ||= [], $code) if $cat eq "Lt";
	    append($Cat{ASCII}      ||= [], $code) if $code le "007F";
	    append($Cat{Cntrl}      ||= [], $code) if $cat =~ /^C/;
	    append($Cat{Graph}      ||= [], $code) if $cat =~ /^([LMNPS]|Co)/;
	    append($Cat{Print}      ||= [], $code) if $cat =~ /^([LMNPS]|Co|Zs)/;
	    append($Cat{Punct}      ||= [], $code) if $cat =~ /^P/;
	    # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f
	    append($Cat{XDigit}     ||= [], $code)
		if $code =~ /^00(3[0-9]|[46][1-6])$/;
	    
	    append($To{Upper}       ||= [], $code, $upper)   if $upper;
	    append($To{Lower}       ||= [], $code, $lower)   if $lower;
	    append($To{Title}       ||= [], $code, $title)   if $title;
	    append($To{Digit}       ||= [], $code, $decimal) if $decimal;
	    
	    append(\@Bidi,                  $code, $bidi);
	    append($Bidi{$bidi}     ||= [], $code);
	    
	    append(\@Comb,                  $code, $comb) if $comb;
	    
	    if ($deco) {
		append(\@Deco,                  $code, $deco);
		if ($deco =~/^<(\w+)>/) {
		    append($Deco{Compat} ||= [], $code);
		    append($DC{$1}       ||= [], $code);
		} else {
		    append($Deco{Canon}  ||= [], $code);
		}
	    }
	    
	    append(\@Number,                     $code, $number) if $number;
	    
	    append(\@Mirrored,                   $code) if $mirrored eq "Y";
	}
    }

    flush(\@Name, "Name.pl");

    foreach my $cat (sort keys %Cat) {
	flush($Cat{$cat}, "Is/$cat.pl");
    }

    foreach my $to (sort keys %To) {
	flush($To{$to}, "To/$to.pl");
    }

    flush(\@Bidi, "Bidirectional.pl");
    foreach my $bidi (sort keys %Bidi) {
	flush($Bidi{$bidi}, "Is/Bidi$bidi.pl");
    }

    flush(\@Comb, "CombiningClass.pl");

    flush(\@Deco, "Decomposition.pl");
    foreach my $deco (sort keys %Deco) {
	flush($Deco{$deco}, "Is/Deco$deco.pl");
    }
    foreach my $dc (sort keys %DC) {
	flush($DC{$dc}, "Is/DC$dc.pl");
    }

    flush(\@Number, "Number.pl");

    flush(\@Mirrored, "Is/Mirrored.pl");
} else {
    die "$0: Unicode.txt: $!\n";
}

#  The general cateory can be written out already now.

flush(\@General, "Category.pl");

#
# Read in the LineBrk.txt.
#

if (open(my $LineBrk, "LineBrk.txt")) {
    my @Lbrk;
    my %Lbrk;

    while (<$LineBrk>) {
        next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)/;

	my ($first, $last, $lbrk) = ($1, $2, $3);

	append(\@Lbrk,              $first, $lbrk);
	append($Lbrk{$lbrk} ||= [], $first);
	if (defined $last) {
	    extend(\@Lbrk,          $last);
	    extend($Lbrk{$lbrk},    $last);
	}
    }

    flush(\@Lbrk, "Lbrk.pl");
    foreach my $lbrk (sort keys %Lbrk) {
	flush($Lbrk{$lbrk}, "Is/Lbrk$lbrk.pl");
    }
} else {
    die "$0: LineBrk.txt: $!\n";
}

#
# Read in the ArabShap.txt.
#

if (open(my $ArabShap, "ArabShap.txt")) {
    my @ArabLink;
    my @ArabLinkGroup;

    while (<$ArabShap>) {
	next unless /^[0-9A-Fa-f]+;/;
	s/\s+$//;

	my ($code, $name, $link, $linkgroup) = split(/\s*;\s*/);

	append(\@ArabLink,      $code, $link);
	append(\@ArabLinkGroup, $code, $linkgroup);
    }

    flush(\@ArabLink,      "ArabLink.pl");
    flush(\@ArabLinkGroup, "ArabLnkGrp.pl");
} else {
    die "$0: ArabShap.txt: $!\n";
}

#
# Read in the Jamo.txt.
#

if (open(my $Jamo, "Jamo.txt")) {
    my @Short;

    while (<$Jamo>) {
	next unless /^([0-9A-Fa-f]+)\s*;\s*(\w*)/;

	my ($code, $short) = ($1, $2);

	append(\@Short, $code, $short);
    }

    flush(\@Short, "JamoShort.pl");
} else {
    die "$0: Jamo.txt: $!\n";
}

#
# Read in the Scripts.txt.
#

my @Scripts;

if (open(my $Scripts, "Scripts.txt")) {
    while (<$Scripts>) {
	next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/;

	# Wait until all the scripts have been read since
	# they are not listed in numeric order.
	push @Scripts, [ hex($1), $1, $2, $3 ];
    }
} else {
    die "$0: Scripts.txt: $!\n";
}

# Now append the scripts properties in their code point order.

my %Script;
my $Scripts = [];

for my $script (sort { $a->[0] <=> $b->[0] } @Scripts) {
    my ($code, $first, $last, $name) = @$script;
    append($Scripts,              $first, $name);
    append($Script{$name} ||= [], $first, $name);
    if (defined $last) {
	extend($Scripts,       $last);
	extend($Script{$name}, $last);
    }
    unless (defined $In{$name}) {
	$In{$name}   = $InId++;
	$InIn{$name} = $Script{$name};
    }
}

# Scripts.pl can be written out already now.

flush(\@Scripts, "Scripts.pl");

# Common is everything not explicitly assigned to a Script

$In{Common} = $InId++;
my $Common = inverse($Scripts);
$InIn{Common} = $Common;

#
# Read in the Blocks.txt.
#

my @Blocks;
my %Blocks;

if (open(my $Blocks, "Blocks.txt")) {
    while (<$Blocks>) {
	next unless /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.+?)\s*$/;
	
	my ($first, $last, $name) = ($1, $2, $3);

	# If there's a naming conflict (the script names are
	# in uppercase), the name of the block has " Block"
	# appended to it.
	$name = "$name Block" if defined $In{"\U$name"};

	append(\@Blocks,              $first, $name);
	append($Blocks{$name} ||= [], $first, $name);
	if (defined $last) {
	    extend(\@Blocks,       $last);
	    extend($Blocks{$name}, $last);
	}
	unless (defined $In{$name}) {
	    $In{$name}   = $InId++;
	    $InIn{$name} = $Blocks{$name};
	}
    }
} else {
    die "$0: Blocks.txt: $!\n";
}

# Blocks.pl can be written out already now.

flush(\@Blocks, "Blocks.pl");

#
# Read in the PropList.txt.  It contains extended properties not
# listed in the Unicode.txt, such as 'Other_Alphabetic':
# alphabetic but not of the general category L; many modifiers
# belong to this extended property category: while they are not
# alphabets, they are alphabetic in nature.
#

my @Props;

if (open(my $Props, "PropList.txt")) {
    while (<$Props>) {
	next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/;

	# Wait until all the extended properties have been read since
	# they are not listed in numeric order.
	push @Props, [ hex($1), $1, $2, $3 ];
    }
} else {
    die "$0: PropList.txt: $!\n";
}

# Now append the extended properties in their code point order.

my %Prop;
my $Props = [];

for my $prop (sort { $a->[0] <=> $b->[0] } @Props) {
    my ($code, $first, $last, $name) = @$prop;
    append($Props,              $first, $name);
    append($Prop{$name} ||= [], $first, $name);
    if (defined $last) {
	extend($Props,       $last);
	extend($Prop{$name}, $last);
    }
    unless (defined $In{$name}) {
	$In{$name}   = $InId++;
	$InIn{$name} = $Prop{$name};
    }
}

# Assigned is everything not Cn aka Noncharacter_Code_Point

$In{Assigned} = $InId++;
my $Assigned = inverse($Prop{Noncharacter_Code_Point});
$InIn{Assigned} = $Assigned;

sub merge_general_and_extended {
    my ($name, $general, $extended) = @_;
    my $merged;

    push @$merged,
         map { pop @{$_}; $_ }
             sort { $a->[2] <=> $b->[2] }
                  map { [ $_->[0], $_->[1], hex($_->[0]) ] }
                      ($general ?
		         map { ref $_ ? @$_ : $_ }
                             @Cat {ref $general  ? @$general  : $general } :
		         (),
                       $extended ?
		         map { ref $_ ? @$_ : $_ }
                             @Prop{ref $extended ? @$extended : $extended} :
		         ());

    $In{$name}   = $InId++;
    $InIn{$name} = $merged;
    
    return $merged;
}

# Alphabetic is L and Other_Alphabetic.

my $Alphabetic =
    merge_general_and_extended('Alphabetic', 'L', 'Other_Alphabetic');

# Lowercase is Ll and Other_Lowercase.

my $Lowercase =
    merge_general_and_extended('Lowercase', 'Ll', 'Other_Lowercase');

# Uppercase is Lu and Other_Uppercase.

my $Uppercase =
    merge_general_and_extended('Uppercase', 'Lu', 'Other_Uppercase');

# Math is Sm and Other_Math.

my $Math =
    merge_general_and_extended('Math', 'Sm', 'Other_Math');

# Lampersand is Ll, Lu, and Lt.

my $Lampersand =
    merge_general_and_extended('Lampersand', [ qw(Ll Lu Lt) ]);

# ID_Start is Ll, Lu, Lt, Lm, Lo, and Nl.

my $ID_Start =
    merge_general_and_extended('ID_Start', [ qw(Ll Lu Lt Lm Lo Nl) ]);

# ID_Continue is ID_Start, Mn, Mc, Nd, and Pc.

my $ID_Continue =
    merge_general_and_extended('ID_Continue', [ qw(Ll Lu Lt Lm Lo Nl
						   Mn Mc Nd Pc) ]);

#
# Any is any.
#

$In{Any} = $InId++;
my $Any = [ [ 0, sprintf("%04X", $LastUnicodeCodepoint) ] ];
$InIn{Any} = $Any;

#
# mapping() will be used to write out the In and Is virtual mappings.
#

sub mapping {
    my ($map, $name) = @_;

    if (open(my $fh, ">$name.pl")) {
	print "$name.pl\n";
	header($fh);

	# The %pat will hold a hash that maps the first two
	# lowercased letters of a class to a 'fuzzified' regular
	# expression that points to the real mapping.

	my %pat;

	# But first write out the offical name to real name
	# (the filename) mapping.

	print $fh <<EOT;
%utf8::${name} =
(
EOT
        for my $i (sort keys %$map) {
	    my $pat = $i;
	    # Here is the 'fuzzification': accept any space,
	    # dash, or underbar where in the official name
	    # there is space or a dash (or underbar, but
	    # there never is).
	    $pat =~ s/([- _])/(?:[-_]|\\s+)?/g;
	    # The prefix length of 2 is enough spread,
	    # and besides, we have 'Yi' as an In category.
	    push @{$pat{lc(substr($i, 0, 2))}}, [ $i, $pat ];
	    print $fh "'$i' => '$map->{$i}',\n";
	}
        print $fh <<EOT;
);
EOT

	# Now write out the %pat mapping.

        print $fh <<EOT;
%utf8::${name}Pat =
(
EOT
        foreach my $prefix (sort keys %pat) {
	    print $fh "'$prefix' => {\n";
	    foreach my $ipat (@{$pat{$prefix}}) {
		my ($i, $pat) = @$ipat;
		print $fh "\t'$pat' => '$map->{$i}',\n";
	    }
	    print $fh "},\n";
	}
        print $fh <<EOT;
);
EOT

        close($fh);
    } else {
        die "$0: $name.pl: $!\n";
    }
}

#
# Write out the virtual In mappings.
#

mapping(\%In, "In");

# Easy low-calorie cheat.
use File::Copy;
copy("In/$In{Noncharacter_Code_Point}.pl", "Is/Cn.pl");

#
# Write out the real In mappings
# (the In.pl written out just above has the virtual In mappings)
#

foreach my $in (sort { $In{$a} <=> $In{$b} } keys %In) {
    flush($InIn{$in}, "In/$In{$in}.pl");
}

#
# The mapping from General Category long forms to short forms is
# currently hardwired here since no simple data file in the UCD
# seems to do that.
#

my %Is = (
	'Letter'			=>	'L',
	'Uppercase Letter'		=>	'Lu',
	'Lowercase Letter'		=>	'Ll',
	'Titlecase Letter'		=>	'Lt',
	'Modifier Letter'		=>	'Lm',
	'Other Letter'			=>	'Lo',

	'Mark'				=>	'M',
	'Non-Spacing Mark'		=>	'Mn',
	'Spacing Combining Mark'	=>	'Mc',
	'Enclosing Mark'		=>	'Me',

	'Separator'			=>	'Z',
	'Space Separator'		=>	'Zs',
	'Line Separator'		=>	'Zl',
	'Paragraph Separator'		=>	'Zp',

	'Number'			=>	'N',
	'Decimal Digit Number'		=>	'Nd',
	'Letter Number'			=>	'Nl',
	'Other Number'			=>	'No',

	'Punctuation'			=>	'P',
	'Connector Punctuation'		=>	'Pc',
	'Dash Punctuation'		=>	'Pd',
	'Open Punctuation'		=>	'Ps',
	'Close Punctuation'		=>	'Pe',
	'Initial Punctuation'		=>	'Pi',
	'Final Punctuation'		=>	'Pf',
	'Other Punctuation'		=>	'Po',

	'Symbol'			=>	'S',
	'Math Symbol'			=>	'Sm',
	'Currency Symbol'		=>	'Sc',
	'Modifier Symbol'		=>	'Sk',
	'Other Symbol'			=>	'So',

	'Other'				=>	'C',
	'Control'			=>	'Cc',
	'Format'			=>	'Cf',
	'Surrogate'			=>	'Cs',
	'Private Use'			=>	'Co',
	'Not Assigned'			=>	'Cn',
	# 'Other' aliases
	'Other Control'			=>	'Cc',
	'Other Format'			=>	'Cf',
	'Other Surrogate'		=>	'Cs',
	'Other Private Use'		=>	'Co',
	'Other Not Assigned'		=>	'Cn',
);

#
# Write out the virtual Is mappings.
#

mapping(\%Is, "Is");

#
# Read in the special cases.
#

my %Case;

if (open(my $SpecCase, "SpecCase.txt")) {
    while (<$SpecCase>) {
	next unless /^[0-9A-Fa-f]+;/;
	s/\#.*//;
	s/\s+$//;

	my ($code, $lower, $title, $upper, $condition) = split(/\s*;\s*/);

	if ($condition) { # not implemented yet
	    print "# SKIPPING $_\n";
	    next;
	}

	# Wait until all the special cases have been read since
	# they are not listed in numeric order.
	my $ix = hex($code);
	push @{$Case{Lower}}, [ $ix, $code, $lower ];
	push @{$Case{Title}}, [ $ix, $code, $title ];
	push @{$Case{Upper}}, [ $ix, $code, $upper ];
    }
} else {
    die "$0: SpecCase.txt: $!\n";
}

# Now write out the special cases properties in their code point order.
# The To/Spec{Lower,Title,Upper}.pl are unused for now since the swash
# routines do not do returning multiple characters.

for my $case (qw(Lower Title Upper)) {
    my @case;
    for my $prop (sort { $a->[0] <=> $b->[0] } @{$Case{$case}}) {
        my ($ix, $code, $to) = @$prop;
        append(\@case, $code, $to);
    }
    flush(\@case, "To/Spec$case.pl");
}

# That's all, folks!