Add the special casing mappings (from SpecCase.txt)
[p5sagit/p5-mst-13.2.git] / lib / unicore / mktables
CommitLineData
d73e5302 1#!/usr/bin/perl -w
2
3#
4# mktables -- create the runtime Perl Unicode files (lib/unicore/**/*.pl)
5# from the Unicode database files (lib/unicore/*.txt).
6#
7
d73e5302 8use strict;
9
d2d499f5 10my $LastUnicodeCodepoint = 0x10FFFF; # As of Unicode 3.1.1.
11
d73e5302 12mkdir("In", 0755);
13mkdir("Is", 0755);
14mkdir("To", 0755);
15
16sub extend {
17 my ($table, $last) = @_;
18
19 $table->[-1]->[1] = $last;
20}
21
22sub append {
23 my ($table, $code, $name) = @_;
24 if (@$table &&
25 hex($table->[-1]->[1]) == hex($code) - 1 &&
26 (!defined $name || $table->[-1]->[2] eq $name)) {
27 extend($table, $code);
28 } else {
29 push @$table, [$code, $code, $name];
30 }
31}
32
33sub inverse {
34 my ($table) = @_;
35 my $inverse = [];
36 my ($first, $last);
37 if ($table->[0]->[0]) {
38 $last = hex($table->[0]->[0]);
39 push @$inverse, [ "0000",
40 sprintf("%04X", $last - 1) ];
41 }
42 for my $i (0..$#$table-1) {
43 $first = defined $table->[$i ]->[1] ?
44 hex($table->[$i ]->[1]) : 0;
45 $last = defined $table->[$i + 1]->[0] ?
46 hex($table->[$i + 1]->[0]) : $first;
47 push @$inverse, [ sprintf("%04X", $first + 1),
48 sprintf("%04X", $last - 1) ]
49 unless $first + 1 == $last;
50 }
51 return $inverse;
52}
53
54sub header {
55 my $fh = shift;
56
57 print $fh <<EOT;
58# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
59# This file is built by $0 from e.g. Unicode.txt.
60# Any changes made here will be lost!
61EOT
62}
63
64sub begin {
65 my $fh = shift;
66
67 print $fh <<EOT;
68return <<'END';
69EOT
70}
71
72sub end {
73 my $fh = shift;
74
75 print $fh <<EOT;
76END
77EOT
78}
79
80sub flush {
81 my ($table, $file) = @_;
82 print "$file\n";
83 if (open(my $fh, ">$file")) {
84 header($fh);
85 begin($fh);
86 for my $i (@$table) {
87 print $fh $i->[0], "\t",
88 $i->[1] ne $i->[0] ? $i->[1] : "", "\t",
89 defined $i->[2] ? $i->[2] : "", "\n";
90 }
91 end($fh);
92 close($fh);
93 } else {
94 die "$0: $file: $!\n";
95 }
96}
97
98#
99# The %In contains the mapping of the script/block name into a number.
100#
101
102my %In;
103my $InId = 0;
104my %InIn;
105
106#
107# Read in the Unicode.txt, the main Unicode database.
108#
109
110my %Cat;
111my %General;
112my @General;
113
114if (open(my $Unicode, "Unicode.txt")) {
115 my @Name;
116 my @Bidi;
117 my %Bidi;
118 my @Comb;
119 my @Deco;
120 my %Deco;
121 my %DC;
122 my @Number;
123 my @Mirrored;
124 my %To;
125 while (<$Unicode>) {
d2d499f5 126 next unless /^[0-9A-Fa-f]+;/;
d73e5302 127 s/\s+$//;
d2d499f5 128
d73e5302 129 my ($code, $name, $cat, $comb, $bidi, $deco,
130 $decimal, $digit, $number,
131 $mirrored, $unicode10, $comment,
132 $upper, $lower, $title) = split(/\s*;\s*/);
133
134 if ($name =~ /^<(.+), (First|Last)>$/) {
135 $name = $1;
136 if ($2 eq 'First') {
137 append($General{$name} ||= [], $code, $name);
138 } else {
139 extend($General{$name} , $code);
140 }
141 unless (defined $In{$name}) {
142 $In{$name} = $InId++;
143 $InIn{$name} = $General{$name};
144 }
145 append($Cat{$cat} ||= [], $code);
146 append($Cat{substr($cat, 0, 1)}
147 ||= [], $code);
148 } else {
149 append(\@Name, $code, $name);
150
151 append(\@General, $code, $cat);
152
153 append($Cat{$cat} ||= [], $code);
154 append($Cat{substr($cat, 0, 1)}
155 ||= [], $code);
156 # 005F: SPACING UNDERSCORE
157 append($Cat{Word} ||= [], $code)
158 if $cat =~ /^[LMN]/ or $code eq "005F";
159 append($Cat{Alnum} ||= [], $code)
160 if $cat =~ /^[LMN]/;
161 append($Cat{Alpha} ||= [], $code)
162 if $cat =~ /^[LM]/;
163 # 0009: HORIZONTAL TABULATION
164 # 000A: LINE FEED
165 # 000B: VERTICAL TABULATION
166 # 000C: FORM FEED
167 # 000D: CARRIAGE RETURN
168 # 0020: SPACE
169 append($Cat{Space} ||= [], $code)
170 if $cat =~ /^Z/ ||
171 $code =~ /^(0009|000A|000B|000C|000D)$/;
172 append($Cat{SpacePerl} ||= [], $code)
173 if $cat =~ /^Z/ ||
174 $code =~ /^(0009|000A|000C|000D)$/;
175 append($Cat{Blank} ||= [], $code)
176 if $code =~ /^(0020|0009)$/ ||
177 $cat =~ /^Z[^lp]$/;
178 append($Cat{Digit} ||= [], $code) if $cat eq "Nd";
179 append($Cat{Upper} ||= [], $code) if $cat eq "Lu";
180 append($Cat{Lower} ||= [], $code) if $cat eq "Ll";
181 append($Cat{Title} ||= [], $code) if $cat eq "Lt";
182 append($Cat{ASCII} ||= [], $code) if $code le "007F";
183 append($Cat{Cntrl} ||= [], $code) if $cat =~ /^C/;
184 append($Cat{Graph} ||= [], $code) if $cat =~ /^([LMNPS]|Co)/;
185 append($Cat{Print} ||= [], $code) if $cat =~ /^([LMNPS]|Co|Zs)/;
186 append($Cat{Punct} ||= [], $code) if $cat =~ /^P/;
187 # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f
188 append($Cat{XDigit} ||= [], $code)
189 if $code =~ /^00(3[0-9]|[46][1-6])$/;
190
191 append($To{Upper} ||= [], $code, $upper) if $upper;
192 append($To{Lower} ||= [], $code, $lower) if $lower;
193 append($To{Title} ||= [], $code, $title) if $title;
194 append($To{Digit} ||= [], $code, $decimal) if $decimal;
195
196 append(\@Bidi, $code, $bidi);
197 append($Bidi{$bidi} ||= [], $code);
198
199 append(\@Comb, $code, $comb) if $comb;
200
201 if ($deco) {
202 append(\@Deco, $code, $deco);
203 if ($deco =~/^<(\w+)>/) {
204 append($Deco{Compat} ||= [], $code);
205 append($DC{$1} ||= [], $code);
206 } else {
207 append($Deco{Canon} ||= [], $code);
208 }
209 }
210
211 append(\@Number, $code, $number) if $number;
212
213 append(\@Mirrored, $code) if $mirrored eq "Y";
214 }
215 }
216
217 flush(\@Name, "Name.pl");
218
219 foreach my $cat (sort keys %Cat) {
220 flush($Cat{$cat}, "Is/$cat.pl");
221 }
222
223 foreach my $to (sort keys %To) {
224 flush($To{$to}, "To/$to.pl");
225 }
226
227 flush(\@Bidi, "Bidirectional.pl");
228 foreach my $bidi (sort keys %Bidi) {
229 flush($Bidi{$bidi}, "Is/Bidi$bidi.pl");
230 }
231
232 flush(\@Comb, "CombiningClass.pl");
233
234 flush(\@Deco, "Decomposition.pl");
235 foreach my $deco (sort keys %Deco) {
236 flush($Deco{$deco}, "Is/Deco$deco.pl");
237 }
238 foreach my $dc (sort keys %DC) {
239 flush($DC{$dc}, "Is/DC$dc.pl");
240 }
241
242 flush(\@Number, "Number.pl");
243
244 flush(\@Mirrored, "Is/Mirrored.pl");
245} else {
246 die "$0: Unicode.txt: $!\n";
247}
248
249# The general cateory can be written out already now.
250
251flush(\@General, "Category.pl");
252
253#
254# Read in the LineBrk.txt.
255#
256
257if (open(my $LineBrk, "LineBrk.txt")) {
258 my @Lbrk;
259 my %Lbrk;
260
261 while (<$LineBrk>) {
d2d499f5 262 next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)/;
d73e5302 263
264 my ($first, $last, $lbrk) = ($1, $2, $3);
265
266 append(\@Lbrk, $first, $lbrk);
267 append($Lbrk{$lbrk} ||= [], $first);
268 if (defined $last) {
269 extend(\@Lbrk, $last);
270 extend($Lbrk{$lbrk}, $last);
271 }
272 }
273
274 flush(\@Lbrk, "Lbrk.pl");
275 foreach my $lbrk (sort keys %Lbrk) {
276 flush($Lbrk{$lbrk}, "Is/Lbrk$lbrk.pl");
277 }
278} else {
279 die "$0: LineBrk.txt: $!\n";
280}
281
282#
283# Read in the ArabShap.txt.
284#
285
286if (open(my $ArabShap, "ArabShap.txt")) {
287 my @ArabLink;
288 my @ArabLinkGroup;
289
290 while (<$ArabShap>) {
d2d499f5 291 next unless /^[0-9A-Fa-f]+;/;
d73e5302 292 s/\s+$//;
d2d499f5 293
d73e5302 294 my ($code, $name, $link, $linkgroup) = split(/\s*;\s*/);
295
296 append(\@ArabLink, $code, $link);
297 append(\@ArabLinkGroup, $code, $linkgroup);
298 }
299
300 flush(\@ArabLink, "ArabLink.pl");
301 flush(\@ArabLinkGroup, "ArabLnkGrp.pl");
302} else {
303 die "$0: ArabShap.txt: $!\n";
304}
305
306#
307# Read in the Jamo.txt.
308#
309
310if (open(my $Jamo, "Jamo.txt")) {
311 my @Short;
312
313 while (<$Jamo>) {
d2d499f5 314 next unless /^([0-9A-Fa-f]+)\s*;\s*(\w*)/;
315
316 my ($code, $short) = ($1, $2);
d73e5302 317
318 append(\@Short, $code, $short);
319 }
320
321 flush(\@Short, "JamoShort.pl");
322} else {
323 die "$0: Jamo.txt: $!\n";
324}
325
326#
327# Read in the Scripts.txt.
328#
329
330my @Scripts;
331
332if (open(my $Scripts, "Scripts.txt")) {
333 while (<$Scripts>) {
d2d499f5 334 next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/;
d73e5302 335
336 # Wait until all the scripts have been read since
337 # they are not listed in numeric order.
338 push @Scripts, [ hex($1), $1, $2, $3 ];
339 }
340} else {
341 die "$0: Scripts.txt: $!\n";
342}
343
344# Now append the scripts properties in their code point order.
345
346my %Script;
347my $Scripts = [];
348
349for my $script (sort { $a->[0] <=> $b->[0] } @Scripts) {
350 my ($code, $first, $last, $name) = @$script;
351 append($Scripts, $first, $name);
352 append($Script{$name} ||= [], $first, $name);
353 if (defined $last) {
354 extend($Scripts, $last);
355 extend($Script{$name}, $last);
356 }
357 unless (defined $In{$name}) {
358 $In{$name} = $InId++;
359 $InIn{$name} = $Script{$name};
360 }
361}
362
363# Scripts.pl can be written out already now.
364
365flush(\@Scripts, "Scripts.pl");
366
367# Common is everything not explicitly assigned to a Script
368
369$In{Common} = $InId++;
370my $Common = inverse($Scripts);
371$InIn{Common} = $Common;
372
373#
374# Read in the Blocks.txt.
375#
376
377my @Blocks;
378my %Blocks;
379
380if (open(my $Blocks, "Blocks.txt")) {
381 while (<$Blocks>) {
d2d499f5 382 next unless /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.+?)\s*$/;
383
d73e5302 384 my ($first, $last, $name) = ($1, $2, $3);
385
386 # If there's a naming conflict (the script names are
387 # in uppercase), the name of the block has " Block"
388 # appended to it.
389 $name = "$name Block" if defined $In{"\U$name"};
390
391 append(\@Blocks, $first, $name);
392 append($Blocks{$name} ||= [], $first, $name);
393 if (defined $last) {
394 extend(\@Blocks, $last);
395 extend($Blocks{$name}, $last);
396 }
397 unless (defined $In{$name}) {
398 $In{$name} = $InId++;
399 $InIn{$name} = $Blocks{$name};
400 }
401 }
402} else {
403 die "$0: Blocks.txt: $!\n";
404}
405
406# Blocks.pl can be written out already now.
407
408flush(\@Blocks, "Blocks.pl");
409
410#
411# Read in the PropList.txt. It contains extended properties not
412# listed in the Unicode.txt, such as 'Other_Alphabetic':
413# alphabetic but not of the general category L; many modifiers
414# belong to this extended property category: while they are not
415# alphabets, they are alphabetic in nature.
416#
417
418my @Props;
419
420if (open(my $Props, "PropList.txt")) {
421 while (<$Props>) {
d2d499f5 422 next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/;
d73e5302 423
424 # Wait until all the extended properties have been read since
425 # they are not listed in numeric order.
426 push @Props, [ hex($1), $1, $2, $3 ];
427 }
428} else {
429 die "$0: PropList.txt: $!\n";
430}
431
432# Now append the extended properties in their code point order.
433
434my %Prop;
435my $Props = [];
436
437for my $prop (sort { $a->[0] <=> $b->[0] } @Props) {
438 my ($code, $first, $last, $name) = @$prop;
439 append($Props, $first, $name);
440 append($Prop{$name} ||= [], $first, $name);
441 if (defined $last) {
442 extend($Props, $last);
443 extend($Prop{$name}, $last);
444 }
445 unless (defined $In{$name}) {
446 $In{$name} = $InId++;
447 $InIn{$name} = $Prop{$name};
448 }
449}
450
451# Assigned is everything not Cn aka Noncharacter_Code_Point
452
453$In{Assigned} = $InId++;
454my $Assigned = inverse($Prop{Noncharacter_Code_Point});
455$InIn{Assigned} = $Assigned;
456
457sub merge_general_and_extended {
458 my ($name, $general, $extended) = @_;
459 my $merged;
460
461 push @$merged,
462 map { pop @{$_}; $_ }
463 sort { $a->[2] <=> $b->[2] }
464 map { [ $_->[0], $_->[1], hex($_->[0]) ] }
465 ($general ?
466 map { ref $_ ? @$_ : $_ }
467 @Cat {ref $general ? @$general : $general } :
468 (),
469 $extended ?
470 map { ref $_ ? @$_ : $_ }
471 @Prop{ref $extended ? @$extended : $extended} :
472 ());
473
474 $In{$name} = $InId++;
475 $InIn{$name} = $merged;
476
477 return $merged;
478}
479
480# Alphabetic is L and Other_Alphabetic.
481
482my $Alphabetic =
483 merge_general_and_extended('Alphabetic', 'L', 'Other_Alphabetic');
484
485# Lowercase is Ll and Other_Lowercase.
486
487my $Lowercase =
488 merge_general_and_extended('Lowercase', 'Ll', 'Other_Lowercase');
489
490# Uppercase is Lu and Other_Uppercase.
491
492my $Uppercase =
493 merge_general_and_extended('Uppercase', 'Lu', 'Other_Uppercase');
494
495# Math is Sm and Other_Math.
496
497my $Math =
498 merge_general_and_extended('Math', 'Sm', 'Other_Math');
499
500# Lampersand is Ll, Lu, and Lt.
501
502my $Lampersand =
503 merge_general_and_extended('Lampersand', [ qw(Ll Lu Lt) ]);
504
505# ID_Start is Ll, Lu, Lt, Lm, Lo, and Nl.
506
507my $ID_Start =
508 merge_general_and_extended('ID_Start', [ qw(Ll Lu Lt Lm Lo Nl) ]);
509
510# ID_Continue is ID_Start, Mn, Mc, Nd, and Pc.
511
512my $ID_Continue =
513 merge_general_and_extended('ID_Continue', [ qw(Ll Lu Lt Lm Lo Nl
514 Mn Mc Nd Pc) ]);
515
516#
517# Any is any.
518#
519
520$In{Any} = $InId++;
521my $Any = [ [ 0, sprintf("%04X", $LastUnicodeCodepoint) ] ];
522$InIn{Any} = $Any;
523
524#
525# mapping() will be used to write out the In and Is virtual mappings.
526#
527
528sub mapping {
529 my ($map, $name) = @_;
530
531 if (open(my $fh, ">$name.pl")) {
532 print "$name.pl\n";
533 header($fh);
534
535 # The %pat will hold a hash that maps the first two
536 # lowercased letters of a class to a 'fuzzified' regular
537 # expression that points to the real mapping.
538
539 my %pat;
540
541 # But first write out the offical name to real name
542 # (the filename) mapping.
543
544 print $fh <<EOT;
545%utf8::${name} =
546(
547EOT
548 for my $i (sort keys %$map) {
549 my $pat = $i;
550 # Here is the 'fuzzification': accept any space,
551 # dash, or underbar where in the official name
552 # there is space or a dash (or underbar, but
553 # there never is).
554 $pat =~ s/([- _])/(?:[-_]|\\s+)?/g;
555 # The prefix length of 2 is enough spread,
556 # and besides, we have 'Yi' as an In category.
557 push @{$pat{lc(substr($i, 0, 2))}}, [ $i, $pat ];
558 print $fh "'$i' => '$map->{$i}',\n";
559 }
560 print $fh <<EOT;
561);
562EOT
563
564 # Now write out the %pat mapping.
565
566 print $fh <<EOT;
567%utf8::${name}Pat =
568(
569EOT
570 foreach my $prefix (sort keys %pat) {
571 print $fh "'$prefix' => {\n";
572 foreach my $ipat (@{$pat{$prefix}}) {
573 my ($i, $pat) = @$ipat;
574 print $fh "\t'$pat' => '$map->{$i}',\n";
575 }
576 print $fh "},\n";
577 }
578 print $fh <<EOT;
579);
580EOT
581
582 close($fh);
583 } else {
584 die "$0: $name.pl: $!\n";
585 }
586}
587
588#
589# Write out the virtual In mappings.
590#
591
592mapping(\%In, "In");
593
594# Easy low-calorie cheat.
595use File::Copy;
596copy("In/$In{Noncharacter_Code_Point}.pl", "Is/Cn.pl");
597
598#
599# Write out the real In mappings
600# (the In.pl written out just above has the virtual In mappings)
601#
602
603foreach my $in (sort { $In{$a} <=> $In{$b} } keys %In) {
604 flush($InIn{$in}, "In/$In{$in}.pl");
605}
606
607#
608# The mapping from General Category long forms to short forms is
609# currently hardwired here since no simple data file in the UCD
610# seems to do that.
611#
612
613my %Is = (
614 'Letter' => 'L',
615 'Uppercase Letter' => 'Lu',
616 'Lowercase Letter' => 'Ll',
617 'Titlecase Letter' => 'Lt',
618 'Modifier Letter' => 'Lm',
619 'Other Letter' => 'Lo',
620
621 'Mark' => 'M',
622 'Non-Spacing Mark' => 'Mn',
623 'Spacing Combining Mark' => 'Mc',
624 'Enclosing Mark' => 'Me',
625
626 'Separator' => 'Z',
627 'Space Separator' => 'Zs',
628 'Line Separator' => 'Zl',
629 'Paragraph Separator' => 'Zp',
630
631 'Number' => 'N',
632 'Decimal Digit Number' => 'Nd',
633 'Letter Number' => 'Nl',
634 'Other Number' => 'No',
635
636 'Punctuation' => 'P',
637 'Connector Punctuation' => 'Pc',
638 'Dash Punctuation' => 'Pd',
639 'Open Punctuation' => 'Ps',
640 'Close Punctuation' => 'Pe',
641 'Initial Punctuation' => 'Pi',
642 'Final Punctuation' => 'Pf',
643 'Other Punctuation' => 'Po',
644
645 'Symbol' => 'S',
646 'Math Symbol' => 'Sm',
647 'Currency Symbol' => 'Sc',
648 'Modifier Symbol' => 'Sk',
649 'Other Symbol' => 'So',
650
651 'Other' => 'C',
652 'Control' => 'Cc',
653 'Format' => 'Cf',
654 'Surrogate' => 'Cs',
655 'Private Use' => 'Co',
656 'Not Assigned' => 'Cn',
657 # 'Other' aliases
658 'Other Control' => 'Cc',
659 'Other Format' => 'Cf',
660 'Other Surrogate' => 'Cs',
661 'Other Private Use' => 'Co',
662 'Other Not Assigned' => 'Cn',
663);
664
665#
666# Write out the virtual Is mappings.
667#
668
669mapping(\%Is, "Is");
670
d2d499f5 671#
672# Read in the special cases.
673#
674
675my %Case;
676
677if (open(my $SpecCase, "SpecCase.txt")) {
678 while (<$SpecCase>) {
679 next unless /^[0-9A-Fa-f]+;/;
680 s/\#.*//;
681 s/\s+$//;
682
683 my ($code, $lower, $title, $upper, $condition) = split(/\s*;\s*/);
684
685 if ($condition) { # not implemented yet
686 print "# SKIPPING $_\n";
687 next;
688 }
689
690 # Wait until all the special cases have been read since
691 # they are not listed in numeric order.
692 my $ix = hex($code);
693 push @{$Case{Lower}}, [ $ix, $code, $lower ];
694 push @{$Case{Title}}, [ $ix, $code, $title ];
695 push @{$Case{Upper}}, [ $ix, $code, $upper ];
696 }
697} else {
698 die "$0: SpecCase.txt: $!\n";
699}
700
701# Now write out the special cases properties in their code point order.
702# The To/Spec{Lower,Title,Upper}.pl are unused for now since the swash
703# routines do not do returning multiple characters.
704
705for my $case (qw(Lower Title Upper)) {
706 my @case;
707 for my $prop (sort { $a->[0] <=> $b->[0] } @{$Case{$case}}) {
708 my ($ix, $code, $to) = @$prop;
709 append(\@case, $code, $to);
710 }
711 flush(\@case, "To/Spec$case.pl");
712}
713
d73e5302 714# That's all, folks!
715