Don't expect ASCII ordering.
[p5sagit/p5-mst-13.2.git] / lib / unicore / mktables
CommitLineData
d73e5302 1#!/usr/bin/perl -w
2
3#
4# mktables -- create the runtime Perl Unicode files (lib/unicore/**/*.pl)
5# from the Unicode database files (lib/unicore/*.txt).
6#
7
d73e5302 8use strict;
9
d2d499f5 10my $LastUnicodeCodepoint = 0x10FFFF; # As of Unicode 3.1.1.
11
d73e5302 12mkdir("In", 0755);
13mkdir("Is", 0755);
14mkdir("To", 0755);
15
16sub extend {
17 my ($table, $last) = @_;
18
19 $table->[-1]->[1] = $last;
20}
21
22sub append {
23 my ($table, $code, $name) = @_;
24 if (@$table &&
25 hex($table->[-1]->[1]) == hex($code) - 1 &&
26 (!defined $name || $table->[-1]->[2] eq $name)) {
27 extend($table, $code);
28 } else {
29 push @$table, [$code, $code, $name];
30 }
31}
32
33sub inverse {
34 my ($table) = @_;
35 my $inverse = [];
36 my ($first, $last);
37 if ($table->[0]->[0]) {
38 $last = hex($table->[0]->[0]);
39 push @$inverse, [ "0000",
40 sprintf("%04X", $last - 1) ];
41 }
42 for my $i (0..$#$table-1) {
43 $first = defined $table->[$i ]->[1] ?
44 hex($table->[$i ]->[1]) : 0;
45 $last = defined $table->[$i + 1]->[0] ?
46 hex($table->[$i + 1]->[0]) : $first;
47 push @$inverse, [ sprintf("%04X", $first + 1),
48 sprintf("%04X", $last - 1) ]
49 unless $first + 1 == $last;
50 }
51 return $inverse;
52}
53
54sub header {
55 my $fh = shift;
56
57 print $fh <<EOT;
58# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
59# This file is built by $0 from e.g. Unicode.txt.
60# Any changes made here will be lost!
61EOT
62}
63
64sub begin {
65 my $fh = shift;
66
67 print $fh <<EOT;
68return <<'END';
69EOT
70}
71
72sub end {
73 my $fh = shift;
74
75 print $fh <<EOT;
76END
77EOT
78}
79
80sub flush {
81 my ($table, $file) = @_;
82 print "$file\n";
83 if (open(my $fh, ">$file")) {
84 header($fh);
85 begin($fh);
86 for my $i (@$table) {
87 print $fh $i->[0], "\t",
88 $i->[1] ne $i->[0] ? $i->[1] : "", "\t",
89 defined $i->[2] ? $i->[2] : "", "\n";
90 }
91 end($fh);
92 close($fh);
93 } else {
94 die "$0: $file: $!\n";
95 }
96}
97
98#
99# The %In contains the mapping of the script/block name into a number.
100#
101
102my %In;
103my $InId = 0;
104my %InIn;
105
71d929cb 106my %InScript;
107my %InBlock;
108
d73e5302 109#
110# Read in the Unicode.txt, the main Unicode database.
111#
112
113my %Cat;
114my %General;
115my @General;
116
117if (open(my $Unicode, "Unicode.txt")) {
118 my @Name;
119 my @Bidi;
120 my %Bidi;
121 my @Comb;
122 my @Deco;
123 my %Deco;
124 my %DC;
125 my @Number;
126 my @Mirrored;
127 my %To;
128 while (<$Unicode>) {
d2d499f5 129 next unless /^[0-9A-Fa-f]+;/;
d73e5302 130 s/\s+$//;
d2d499f5 131
d73e5302 132 my ($code, $name, $cat, $comb, $bidi, $deco,
133 $decimal, $digit, $number,
134 $mirrored, $unicode10, $comment,
135 $upper, $lower, $title) = split(/\s*;\s*/);
136
137 if ($name =~ /^<(.+), (First|Last)>$/) {
138 $name = $1;
139 if ($2 eq 'First') {
140 append($General{$name} ||= [], $code, $name);
141 } else {
142 extend($General{$name} , $code);
143 }
144 unless (defined $In{$name}) {
145 $In{$name} = $InId++;
146 $InIn{$name} = $General{$name};
147 }
148 append($Cat{$cat} ||= [], $code);
149 append($Cat{substr($cat, 0, 1)}
150 ||= [], $code);
151 } else {
152 append(\@Name, $code, $name);
153
154 append(\@General, $code, $cat);
155
156 append($Cat{$cat} ||= [], $code);
157 append($Cat{substr($cat, 0, 1)}
158 ||= [], $code);
159 # 005F: SPACING UNDERSCORE
160 append($Cat{Word} ||= [], $code)
161 if $cat =~ /^[LMN]/ or $code eq "005F";
162 append($Cat{Alnum} ||= [], $code)
163 if $cat =~ /^[LMN]/;
164 append($Cat{Alpha} ||= [], $code)
165 if $cat =~ /^[LM]/;
166 # 0009: HORIZONTAL TABULATION
167 # 000A: LINE FEED
168 # 000B: VERTICAL TABULATION
169 # 000C: FORM FEED
170 # 000D: CARRIAGE RETURN
171 # 0020: SPACE
172 append($Cat{Space} ||= [], $code)
173 if $cat =~ /^Z/ ||
174 $code =~ /^(0009|000A|000B|000C|000D)$/;
175 append($Cat{SpacePerl} ||= [], $code)
176 if $cat =~ /^Z/ ||
177 $code =~ /^(0009|000A|000C|000D)$/;
178 append($Cat{Blank} ||= [], $code)
179 if $code =~ /^(0020|0009)$/ ||
180 $cat =~ /^Z[^lp]$/;
181 append($Cat{Digit} ||= [], $code) if $cat eq "Nd";
182 append($Cat{Upper} ||= [], $code) if $cat eq "Lu";
183 append($Cat{Lower} ||= [], $code) if $cat eq "Ll";
184 append($Cat{Title} ||= [], $code) if $cat eq "Lt";
185 append($Cat{ASCII} ||= [], $code) if $code le "007F";
186 append($Cat{Cntrl} ||= [], $code) if $cat =~ /^C/;
187 append($Cat{Graph} ||= [], $code) if $cat =~ /^([LMNPS]|Co)/;
188 append($Cat{Print} ||= [], $code) if $cat =~ /^([LMNPS]|Co|Zs)/;
189 append($Cat{Punct} ||= [], $code) if $cat =~ /^P/;
190 # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f
191 append($Cat{XDigit} ||= [], $code)
192 if $code =~ /^00(3[0-9]|[46][1-6])$/;
193
194 append($To{Upper} ||= [], $code, $upper) if $upper;
195 append($To{Lower} ||= [], $code, $lower) if $lower;
196 append($To{Title} ||= [], $code, $title) if $title;
197 append($To{Digit} ||= [], $code, $decimal) if $decimal;
198
199 append(\@Bidi, $code, $bidi);
200 append($Bidi{$bidi} ||= [], $code);
201
202 append(\@Comb, $code, $comb) if $comb;
203
204 if ($deco) {
205 append(\@Deco, $code, $deco);
206 if ($deco =~/^<(\w+)>/) {
207 append($Deco{Compat} ||= [], $code);
208 append($DC{$1} ||= [], $code);
209 } else {
210 append($Deco{Canon} ||= [], $code);
211 }
212 }
213
214 append(\@Number, $code, $number) if $number;
215
216 append(\@Mirrored, $code) if $mirrored eq "Y";
217 }
218 }
219
220 flush(\@Name, "Name.pl");
221
222 foreach my $cat (sort keys %Cat) {
223 flush($Cat{$cat}, "Is/$cat.pl");
224 }
225
226 foreach my $to (sort keys %To) {
227 flush($To{$to}, "To/$to.pl");
228 }
229
230 flush(\@Bidi, "Bidirectional.pl");
231 foreach my $bidi (sort keys %Bidi) {
232 flush($Bidi{$bidi}, "Is/Bidi$bidi.pl");
233 }
234
235 flush(\@Comb, "CombiningClass.pl");
236
237 flush(\@Deco, "Decomposition.pl");
238 foreach my $deco (sort keys %Deco) {
239 flush($Deco{$deco}, "Is/Deco$deco.pl");
240 }
241 foreach my $dc (sort keys %DC) {
242 flush($DC{$dc}, "Is/DC$dc.pl");
243 }
244
245 flush(\@Number, "Number.pl");
246
247 flush(\@Mirrored, "Is/Mirrored.pl");
248} else {
249 die "$0: Unicode.txt: $!\n";
250}
251
252# The general cateory can be written out already now.
253
254flush(\@General, "Category.pl");
255
256#
257# Read in the LineBrk.txt.
258#
259
260if (open(my $LineBrk, "LineBrk.txt")) {
261 my @Lbrk;
262 my %Lbrk;
263
264 while (<$LineBrk>) {
d2d499f5 265 next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)/;
d73e5302 266
267 my ($first, $last, $lbrk) = ($1, $2, $3);
268
269 append(\@Lbrk, $first, $lbrk);
270 append($Lbrk{$lbrk} ||= [], $first);
271 if (defined $last) {
272 extend(\@Lbrk, $last);
273 extend($Lbrk{$lbrk}, $last);
274 }
275 }
276
277 flush(\@Lbrk, "Lbrk.pl");
278 foreach my $lbrk (sort keys %Lbrk) {
279 flush($Lbrk{$lbrk}, "Is/Lbrk$lbrk.pl");
280 }
281} else {
282 die "$0: LineBrk.txt: $!\n";
283}
284
285#
286# Read in the ArabShap.txt.
287#
288
289if (open(my $ArabShap, "ArabShap.txt")) {
290 my @ArabLink;
291 my @ArabLinkGroup;
292
293 while (<$ArabShap>) {
d2d499f5 294 next unless /^[0-9A-Fa-f]+;/;
d73e5302 295 s/\s+$//;
d2d499f5 296
d73e5302 297 my ($code, $name, $link, $linkgroup) = split(/\s*;\s*/);
298
299 append(\@ArabLink, $code, $link);
300 append(\@ArabLinkGroup, $code, $linkgroup);
301 }
302
303 flush(\@ArabLink, "ArabLink.pl");
304 flush(\@ArabLinkGroup, "ArabLnkGrp.pl");
305} else {
306 die "$0: ArabShap.txt: $!\n";
307}
308
309#
310# Read in the Jamo.txt.
311#
312
313if (open(my $Jamo, "Jamo.txt")) {
314 my @Short;
315
316 while (<$Jamo>) {
d2d499f5 317 next unless /^([0-9A-Fa-f]+)\s*;\s*(\w*)/;
318
319 my ($code, $short) = ($1, $2);
d73e5302 320
321 append(\@Short, $code, $short);
322 }
323
324 flush(\@Short, "JamoShort.pl");
325} else {
326 die "$0: Jamo.txt: $!\n";
327}
328
329#
330# Read in the Scripts.txt.
331#
332
333my @Scripts;
334
335if (open(my $Scripts, "Scripts.txt")) {
336 while (<$Scripts>) {
d2d499f5 337 next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/;
d73e5302 338
339 # Wait until all the scripts have been read since
340 # they are not listed in numeric order.
341 push @Scripts, [ hex($1), $1, $2, $3 ];
342 }
343} else {
344 die "$0: Scripts.txt: $!\n";
345}
346
347# Now append the scripts properties in their code point order.
348
349my %Script;
350my $Scripts = [];
351
352for my $script (sort { $a->[0] <=> $b->[0] } @Scripts) {
353 my ($code, $first, $last, $name) = @$script;
354 append($Scripts, $first, $name);
355 append($Script{$name} ||= [], $first, $name);
356 if (defined $last) {
357 extend($Scripts, $last);
358 extend($Script{$name}, $last);
359 }
360 unless (defined $In{$name}) {
71d929cb 361 $InScript{$InId} = $name;
362 $In{$name} = $InId++;
363 $InIn{$name} = $Script{$name};
d73e5302 364 }
365}
366
367# Scripts.pl can be written out already now.
368
369flush(\@Scripts, "Scripts.pl");
370
371# Common is everything not explicitly assigned to a Script
372
373$In{Common} = $InId++;
374my $Common = inverse($Scripts);
375$InIn{Common} = $Common;
376
377#
378# Read in the Blocks.txt.
379#
380
381my @Blocks;
382my %Blocks;
383
384if (open(my $Blocks, "Blocks.txt")) {
385 while (<$Blocks>) {
d2d499f5 386 next unless /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.+?)\s*$/;
387
d73e5302 388 my ($first, $last, $name) = ($1, $2, $3);
71d929cb 389 my $origname = $name;
d73e5302 390
391 # If there's a naming conflict (the script names are
392 # in uppercase), the name of the block has " Block"
393 # appended to it.
71d929cb 394 my $pat = $name;
395 $pat =~ s/([- _])/(?:[-_]|\\s+)?/g;
396 for my $i (values %InScript) {
397 if ($i =~ /^$pat$/i) {
398 $name .= " Block";
399 last;
400 }
401 }
d73e5302 402
403 append(\@Blocks, $first, $name);
404 append($Blocks{$name} ||= [], $first, $name);
405 if (defined $last) {
406 extend(\@Blocks, $last);
407 extend($Blocks{$name}, $last);
408 }
409 unless (defined $In{$name}) {
71d929cb 410 $InBlock{$InId} = $origname;
411 $In{$name} = $InId++;
412 $InIn{$name} = $Blocks{$name};
d73e5302 413 }
414 }
415} else {
416 die "$0: Blocks.txt: $!\n";
417}
418
419# Blocks.pl can be written out already now.
420
421flush(\@Blocks, "Blocks.pl");
422
423#
424# Read in the PropList.txt. It contains extended properties not
425# listed in the Unicode.txt, such as 'Other_Alphabetic':
426# alphabetic but not of the general category L; many modifiers
427# belong to this extended property category: while they are not
428# alphabets, they are alphabetic in nature.
429#
430
431my @Props;
432
433if (open(my $Props, "PropList.txt")) {
434 while (<$Props>) {
d2d499f5 435 next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/;
d73e5302 436
437 # Wait until all the extended properties have been read since
438 # they are not listed in numeric order.
439 push @Props, [ hex($1), $1, $2, $3 ];
440 }
441} else {
442 die "$0: PropList.txt: $!\n";
443}
444
445# Now append the extended properties in their code point order.
446
447my %Prop;
448my $Props = [];
449
450for my $prop (sort { $a->[0] <=> $b->[0] } @Props) {
451 my ($code, $first, $last, $name) = @$prop;
452 append($Props, $first, $name);
453 append($Prop{$name} ||= [], $first, $name);
454 if (defined $last) {
455 extend($Props, $last);
456 extend($Prop{$name}, $last);
457 }
458 unless (defined $In{$name}) {
459 $In{$name} = $InId++;
460 $InIn{$name} = $Prop{$name};
461 }
462}
463
464# Assigned is everything not Cn aka Noncharacter_Code_Point
465
466$In{Assigned} = $InId++;
467my $Assigned = inverse($Prop{Noncharacter_Code_Point});
468$InIn{Assigned} = $Assigned;
469
470sub merge_general_and_extended {
471 my ($name, $general, $extended) = @_;
472 my $merged;
473
474 push @$merged,
475 map { pop @{$_}; $_ }
476 sort { $a->[2] <=> $b->[2] }
477 map { [ $_->[0], $_->[1], hex($_->[0]) ] }
478 ($general ?
479 map { ref $_ ? @$_ : $_ }
480 @Cat {ref $general ? @$general : $general } :
481 (),
482 $extended ?
483 map { ref $_ ? @$_ : $_ }
484 @Prop{ref $extended ? @$extended : $extended} :
485 ());
486
487 $In{$name} = $InId++;
488 $InIn{$name} = $merged;
489
490 return $merged;
491}
492
493# Alphabetic is L and Other_Alphabetic.
494
495my $Alphabetic =
496 merge_general_and_extended('Alphabetic', 'L', 'Other_Alphabetic');
497
498# Lowercase is Ll and Other_Lowercase.
499
500my $Lowercase =
501 merge_general_and_extended('Lowercase', 'Ll', 'Other_Lowercase');
502
503# Uppercase is Lu and Other_Uppercase.
504
505my $Uppercase =
506 merge_general_and_extended('Uppercase', 'Lu', 'Other_Uppercase');
507
508# Math is Sm and Other_Math.
509
510my $Math =
511 merge_general_and_extended('Math', 'Sm', 'Other_Math');
512
513# Lampersand is Ll, Lu, and Lt.
514
515my $Lampersand =
516 merge_general_and_extended('Lampersand', [ qw(Ll Lu Lt) ]);
517
518# ID_Start is Ll, Lu, Lt, Lm, Lo, and Nl.
519
520my $ID_Start =
521 merge_general_and_extended('ID_Start', [ qw(Ll Lu Lt Lm Lo Nl) ]);
522
523# ID_Continue is ID_Start, Mn, Mc, Nd, and Pc.
524
525my $ID_Continue =
526 merge_general_and_extended('ID_Continue', [ qw(Ll Lu Lt Lm Lo Nl
527 Mn Mc Nd Pc) ]);
528
529#
530# Any is any.
531#
532
533$In{Any} = $InId++;
534my $Any = [ [ 0, sprintf("%04X", $LastUnicodeCodepoint) ] ];
535$InIn{Any} = $Any;
536
537#
538# mapping() will be used to write out the In and Is virtual mappings.
539#
540
541sub mapping {
542 my ($map, $name) = @_;
543
544 if (open(my $fh, ">$name.pl")) {
545 print "$name.pl\n";
546 header($fh);
547
548 # The %pat will hold a hash that maps the first two
549 # lowercased letters of a class to a 'fuzzified' regular
550 # expression that points to the real mapping.
551
552 my %pat;
553
554 # But first write out the offical name to real name
555 # (the filename) mapping.
556
557 print $fh <<EOT;
558%utf8::${name} =
559(
560EOT
67765ba6 561 for my $i (sort { lc $a cmp lc $b } keys %$map) {
d73e5302 562 my $pat = $i;
563 # Here is the 'fuzzification': accept any space,
564 # dash, or underbar where in the official name
565 # there is space or a dash (or underbar, but
566 # there never is).
567 $pat =~ s/([- _])/(?:[-_]|\\s+)?/g;
568 # The prefix length of 2 is enough spread,
569 # and besides, we have 'Yi' as an In category.
570 push @{$pat{lc(substr($i, 0, 2))}}, [ $i, $pat ];
67765ba6 571 printf $fh "%-45s => '$map->{$i}',\n", "'$i'";
d73e5302 572 }
573 print $fh <<EOT;
574);
575EOT
576
577 # Now write out the %pat mapping.
578
579 print $fh <<EOT;
580%utf8::${name}Pat =
581(
582EOT
583 foreach my $prefix (sort keys %pat) {
584 print $fh "'$prefix' => {\n";
585 foreach my $ipat (@{$pat{$prefix}}) {
586 my ($i, $pat) = @$ipat;
587 print $fh "\t'$pat' => '$map->{$i}',\n";
588 }
589 print $fh "},\n";
590 }
591 print $fh <<EOT;
592);
593EOT
594
595 close($fh);
596 } else {
597 die "$0: $name.pl: $!\n";
598 }
599}
600
601#
602# Write out the virtual In mappings.
603#
604
605mapping(\%In, "In");
606
71d929cb 607#
608# Append the InScript and InBlock mappings.
609# These are needed only if Script= and Block= syntaxes are used.
610#
611
612if (open(my $In, ">>In.pl")) {
613 print $In <<EOT;
614
615%utf8::InScript =
616(
617EOT
618 for my $i (sort { $a <=> $b } keys %InScript) {
619 printf $In "%4d => '$InScript{$i}',\n", $i;
620 }
621 print $In <<EOT;
622);
623EOT
624
625 print $In <<EOT;
626
627%utf8::InBlock =
628(
629EOT
630 for my $i (sort { $a <=> $b } keys %InBlock) {
631 printf $In "%4d => '$InBlock{$i}',\n", $i;
632 }
633 print $In <<EOT;
634);
635EOT
636} else {
637 die "$0: In.pl: $!\n";
638}
639
d73e5302 640# Easy low-calorie cheat.
641use File::Copy;
642copy("In/$In{Noncharacter_Code_Point}.pl", "Is/Cn.pl");
643
644#
645# Write out the real In mappings
646# (the In.pl written out just above has the virtual In mappings)
647#
648
649foreach my $in (sort { $In{$a} <=> $In{$b} } keys %In) {
650 flush($InIn{$in}, "In/$In{$in}.pl");
651}
652
653#
654# The mapping from General Category long forms to short forms is
655# currently hardwired here since no simple data file in the UCD
656# seems to do that.
657#
658
659my %Is = (
660 'Letter' => 'L',
e150c829 661 'Uppercase_Letter' => 'Lu',
662 'Lowercase_Letter' => 'Ll',
663 'Titlecase_Letter' => 'Lt',
664 'Modifier_Letter' => 'Lm',
665 'Other_Letter' => 'Lo',
d73e5302 666
667 'Mark' => 'M',
e150c829 668 'Non_Spacing_Mark' => 'Mn',
669 'Spacing_Mark' => 'Mc',
670 'Enclosing_Mark' => 'Me',
d73e5302 671
672 'Separator' => 'Z',
e150c829 673 'Space_Separator' => 'Zs',
674 'Line_Separator' => 'Zl',
675 'Paragraph_Separator' => 'Zp',
d73e5302 676
677 'Number' => 'N',
e150c829 678 'Decimal_Number' => 'Nd',
679 'Letter_Number' => 'Nl',
680 'Other_Number' => 'No',
d73e5302 681
682 'Punctuation' => 'P',
e150c829 683 'Connector_Punctuation' => 'Pc',
684 'Dash_Punctuation' => 'Pd',
685 'Open_Punctuation' => 'Ps',
686 'Close_Punctuation' => 'Pe',
687 'Initial_Punctuation' => 'Pi',
688 'Final_Punctuation' => 'Pf',
689 'Other_Punctuation' => 'Po',
d73e5302 690
691 'Symbol' => 'S',
e150c829 692 'Math_Symbol' => 'Sm',
693 'Currency_Symbol' => 'Sc',
694 'Modifier_Symbol' => 'Sk',
695 'Other_Symbol' => 'So',
d73e5302 696
697 'Other' => 'C',
698 'Control' => 'Cc',
699 'Format' => 'Cf',
700 'Surrogate' => 'Cs',
701 'Private Use' => 'Co',
e150c829 702 'Unassigned' => 'Cn',
d73e5302 703);
704
705#
706# Write out the virtual Is mappings.
707#
708
709mapping(\%Is, "Is");
710
d2d499f5 711#
712# Read in the special cases.
713#
714
715my %Case;
716
717if (open(my $SpecCase, "SpecCase.txt")) {
718 while (<$SpecCase>) {
719 next unless /^[0-9A-Fa-f]+;/;
720 s/\#.*//;
721 s/\s+$//;
722
723 my ($code, $lower, $title, $upper, $condition) = split(/\s*;\s*/);
724
725 if ($condition) { # not implemented yet
726 print "# SKIPPING $_\n";
727 next;
728 }
729
730 # Wait until all the special cases have been read since
731 # they are not listed in numeric order.
732 my $ix = hex($code);
733 push @{$Case{Lower}}, [ $ix, $code, $lower ];
734 push @{$Case{Title}}, [ $ix, $code, $title ];
735 push @{$Case{Upper}}, [ $ix, $code, $upper ];
736 }
737} else {
738 die "$0: SpecCase.txt: $!\n";
739}
740
741# Now write out the special cases properties in their code point order.
983ffd37 742# Prepend them to the To/{Upper,Lower,Title}.pl.
d2d499f5 743
744for my $case (qw(Lower Title Upper)) {
c4051cc5 745 my $NormalCase = do "To/$case.pl" || die "$0: To/$case.pl: $!\n";
983ffd37 746 if (open(my $Case, ">To/$case.pl")) {
747 header($Case);
748 print $Case <<EOT;
749
750%utf8::ToSpec$case = (
751EOT
752 for my $prop (sort { $a->[0] <=> $b->[0] } @{$Case{$case}}) {
753 my ($ix, $code, $to) = @$prop;
754 my $tostr =
755 join "", map { sprintf "\\x{%s}", $_ } split ' ', $to;
6d47b937 756 printf $Case qq['%04X' => "$tostr",\n], $ix;
983ffd37 757 }
758 print $Case <<EOT;
759);
760
761EOT
762 begin($Case);
763 print $Case $NormalCase;
764 end($Case);
765 } else {
766 die "$0: To/$case.txt: $!\n";
d2d499f5 767 }
d2d499f5 768}
769
c4051cc5 770#
771# Read in the case foldings.
772#
773# We will do full case folding, C + F + I (see CaseFold.txt).
774#
775
776if (open(my $CaseFold, "CaseFold.txt")) {
777 my @Fold;
778 my %Fold;
779
780 while (<$CaseFold>) {
781 next unless /^([0-9A-Fa-f]+)\s*;\s*([CFI])\s*;\s*([0-9A-Fa-f]+(?: [0-9A-Fa-f]+)*)\s*;/;
782
783 my ($code, $status, $fold) = ($1, $2, $3);
784
785 if ($status eq 'C') { # Common: one-to-one folding
786 append(\@Fold, $code, $fold);
787 } else { # F: full, or I: dotted uppercase I -> dotless lowercase I
788 $Fold{hex($code)} = $fold;
789 }
790 }
791
792 flush(\@Fold, "To/Fold.pl");
793
794 #
795 # Prepend the special foldings to the common foldings.
796 #
797
798 my $CommonFold = do "To/Fold.pl" || die "$0: To/Fold.pl: $!\n";
799 if (open(my $Fold, ">To/Fold.pl")) {
800 header($Fold);
801 print $Fold <<EOT;
802
803%utf8::ToSpecFold = (
804EOT
805 for my $code (sort { $a <=> $b } keys %Fold) {
806 my $foldstr =
807 join "", map { sprintf "\\x{%s}", $_ } split ' ', $Fold{$code};
6d47b937 808 printf $Fold qq['%04X' => "$foldstr",\n], $code;
c4051cc5 809 }
810 print $Fold <<EOT;
811);
812
813EOT
814 begin($Fold);
815 print $Fold $CommonFold;
816 end($Fold);
817 } else {
818 die "$0: To/Fold.pl: $!\n";
819 }
820} else {
821 die "$0: CaseFold.txt: $!\n";
822}
823
d73e5302 824# That's all, folks!
825