Commit | Line | Data |
d73e5302 |
1 | #!/usr/bin/perl -w |
2 | |
3 | # |
4 | # mktables -- create the runtime Perl Unicode files (lib/unicore/**/*.pl) |
5 | # from the Unicode database files (lib/unicore/*.txt). |
6 | # |
7 | |
d73e5302 |
8 | use strict; |
9 | |
d2d499f5 |
10 | my $LastUnicodeCodepoint = 0x10FFFF; # As of Unicode 3.1.1. |
11 | |
d73e5302 |
12 | mkdir("In", 0755); |
13 | mkdir("Is", 0755); |
14 | mkdir("To", 0755); |
15 | |
16 | sub extend { |
17 | my ($table, $last) = @_; |
18 | |
19 | $table->[-1]->[1] = $last; |
20 | } |
21 | |
22 | sub append { |
23 | my ($table, $code, $name) = @_; |
24 | if (@$table && |
25 | hex($table->[-1]->[1]) == hex($code) - 1 && |
26 | (!defined $name || $table->[-1]->[2] eq $name)) { |
27 | extend($table, $code); |
28 | } else { |
29 | push @$table, [$code, $code, $name]; |
30 | } |
31 | } |
32 | |
33 | sub inverse { |
34 | my ($table) = @_; |
35 | my $inverse = []; |
36 | my ($first, $last); |
37 | if ($table->[0]->[0]) { |
38 | $last = hex($table->[0]->[0]); |
39 | push @$inverse, [ "0000", |
40 | sprintf("%04X", $last - 1) ]; |
41 | } |
42 | for my $i (0..$#$table-1) { |
43 | $first = defined $table->[$i ]->[1] ? |
44 | hex($table->[$i ]->[1]) : 0; |
45 | $last = defined $table->[$i + 1]->[0] ? |
46 | hex($table->[$i + 1]->[0]) : $first; |
47 | push @$inverse, [ sprintf("%04X", $first + 1), |
48 | sprintf("%04X", $last - 1) ] |
49 | unless $first + 1 == $last; |
50 | } |
51 | return $inverse; |
52 | } |
53 | |
54 | sub header { |
55 | my $fh = shift; |
56 | |
57 | print $fh <<EOT; |
58 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
59 | # This file is built by $0 from e.g. Unicode.txt. |
60 | # Any changes made here will be lost! |
61 | EOT |
62 | } |
63 | |
64 | sub begin { |
65 | my $fh = shift; |
66 | |
67 | print $fh <<EOT; |
68 | return <<'END'; |
69 | EOT |
70 | } |
71 | |
72 | sub end { |
73 | my $fh = shift; |
74 | |
75 | print $fh <<EOT; |
76 | END |
77 | EOT |
78 | } |
79 | |
80 | sub flush { |
81 | my ($table, $file) = @_; |
82 | print "$file\n"; |
83 | if (open(my $fh, ">$file")) { |
84 | header($fh); |
85 | begin($fh); |
86 | for my $i (@$table) { |
87 | print $fh $i->[0], "\t", |
88 | $i->[1] ne $i->[0] ? $i->[1] : "", "\t", |
89 | defined $i->[2] ? $i->[2] : "", "\n"; |
90 | } |
91 | end($fh); |
92 | close($fh); |
93 | } else { |
94 | die "$0: $file: $!\n"; |
95 | } |
96 | } |
97 | |
98 | # |
99 | # The %In contains the mapping of the script/block name into a number. |
100 | # |
101 | |
102 | my %In; |
103 | my $InId = 0; |
104 | my %InIn; |
105 | |
71d929cb |
106 | my %InScript; |
107 | my %InBlock; |
108 | |
d73e5302 |
109 | # |
110 | # Read in the Unicode.txt, the main Unicode database. |
111 | # |
112 | |
113 | my %Cat; |
114 | my %General; |
115 | my @General; |
116 | |
117 | if (open(my $Unicode, "Unicode.txt")) { |
118 | my @Name; |
119 | my @Bidi; |
120 | my %Bidi; |
121 | my @Comb; |
122 | my @Deco; |
123 | my %Deco; |
124 | my %DC; |
125 | my @Number; |
126 | my @Mirrored; |
127 | my %To; |
128 | while (<$Unicode>) { |
d2d499f5 |
129 | next unless /^[0-9A-Fa-f]+;/; |
d73e5302 |
130 | s/\s+$//; |
d2d499f5 |
131 | |
d73e5302 |
132 | my ($code, $name, $cat, $comb, $bidi, $deco, |
133 | $decimal, $digit, $number, |
134 | $mirrored, $unicode10, $comment, |
135 | $upper, $lower, $title) = split(/\s*;\s*/); |
136 | |
137 | if ($name =~ /^<(.+), (First|Last)>$/) { |
138 | $name = $1; |
139 | if ($2 eq 'First') { |
140 | append($General{$name} ||= [], $code, $name); |
141 | } else { |
142 | extend($General{$name} , $code); |
143 | } |
144 | unless (defined $In{$name}) { |
145 | $In{$name} = $InId++; |
146 | $InIn{$name} = $General{$name}; |
147 | } |
148 | append($Cat{$cat} ||= [], $code); |
149 | append($Cat{substr($cat, 0, 1)} |
150 | ||= [], $code); |
151 | } else { |
152 | append(\@Name, $code, $name); |
153 | |
154 | append(\@General, $code, $cat); |
155 | |
156 | append($Cat{$cat} ||= [], $code); |
157 | append($Cat{substr($cat, 0, 1)} |
158 | ||= [], $code); |
159 | # 005F: SPACING UNDERSCORE |
160 | append($Cat{Word} ||= [], $code) |
161 | if $cat =~ /^[LMN]/ or $code eq "005F"; |
162 | append($Cat{Alnum} ||= [], $code) |
163 | if $cat =~ /^[LMN]/; |
164 | append($Cat{Alpha} ||= [], $code) |
165 | if $cat =~ /^[LM]/; |
166 | # 0009: HORIZONTAL TABULATION |
167 | # 000A: LINE FEED |
168 | # 000B: VERTICAL TABULATION |
169 | # 000C: FORM FEED |
170 | # 000D: CARRIAGE RETURN |
171 | # 0020: SPACE |
172 | append($Cat{Space} ||= [], $code) |
173 | if $cat =~ /^Z/ || |
174 | $code =~ /^(0009|000A|000B|000C|000D)$/; |
175 | append($Cat{SpacePerl} ||= [], $code) |
176 | if $cat =~ /^Z/ || |
177 | $code =~ /^(0009|000A|000C|000D)$/; |
178 | append($Cat{Blank} ||= [], $code) |
179 | if $code =~ /^(0020|0009)$/ || |
180 | $cat =~ /^Z[^lp]$/; |
181 | append($Cat{Digit} ||= [], $code) if $cat eq "Nd"; |
182 | append($Cat{Upper} ||= [], $code) if $cat eq "Lu"; |
183 | append($Cat{Lower} ||= [], $code) if $cat eq "Ll"; |
184 | append($Cat{Title} ||= [], $code) if $cat eq "Lt"; |
185 | append($Cat{ASCII} ||= [], $code) if $code le "007F"; |
186 | append($Cat{Cntrl} ||= [], $code) if $cat =~ /^C/; |
187 | append($Cat{Graph} ||= [], $code) if $cat =~ /^([LMNPS]|Co)/; |
188 | append($Cat{Print} ||= [], $code) if $cat =~ /^([LMNPS]|Co|Zs)/; |
189 | append($Cat{Punct} ||= [], $code) if $cat =~ /^P/; |
190 | # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f |
191 | append($Cat{XDigit} ||= [], $code) |
192 | if $code =~ /^00(3[0-9]|[46][1-6])$/; |
193 | |
194 | append($To{Upper} ||= [], $code, $upper) if $upper; |
195 | append($To{Lower} ||= [], $code, $lower) if $lower; |
196 | append($To{Title} ||= [], $code, $title) if $title; |
197 | append($To{Digit} ||= [], $code, $decimal) if $decimal; |
198 | |
199 | append(\@Bidi, $code, $bidi); |
200 | append($Bidi{$bidi} ||= [], $code); |
201 | |
202 | append(\@Comb, $code, $comb) if $comb; |
203 | |
204 | if ($deco) { |
205 | append(\@Deco, $code, $deco); |
206 | if ($deco =~/^<(\w+)>/) { |
207 | append($Deco{Compat} ||= [], $code); |
208 | append($DC{$1} ||= [], $code); |
209 | } else { |
210 | append($Deco{Canon} ||= [], $code); |
211 | } |
212 | } |
213 | |
214 | append(\@Number, $code, $number) if $number; |
215 | |
216 | append(\@Mirrored, $code) if $mirrored eq "Y"; |
217 | } |
218 | } |
219 | |
220 | flush(\@Name, "Name.pl"); |
221 | |
222 | foreach my $cat (sort keys %Cat) { |
223 | flush($Cat{$cat}, "Is/$cat.pl"); |
224 | } |
225 | |
226 | foreach my $to (sort keys %To) { |
227 | flush($To{$to}, "To/$to.pl"); |
228 | } |
229 | |
230 | flush(\@Bidi, "Bidirectional.pl"); |
231 | foreach my $bidi (sort keys %Bidi) { |
232 | flush($Bidi{$bidi}, "Is/Bidi$bidi.pl"); |
233 | } |
234 | |
235 | flush(\@Comb, "CombiningClass.pl"); |
236 | |
237 | flush(\@Deco, "Decomposition.pl"); |
238 | foreach my $deco (sort keys %Deco) { |
239 | flush($Deco{$deco}, "Is/Deco$deco.pl"); |
240 | } |
241 | foreach my $dc (sort keys %DC) { |
242 | flush($DC{$dc}, "Is/DC$dc.pl"); |
243 | } |
244 | |
245 | flush(\@Number, "Number.pl"); |
246 | |
247 | flush(\@Mirrored, "Is/Mirrored.pl"); |
248 | } else { |
249 | die "$0: Unicode.txt: $!\n"; |
250 | } |
251 | |
252 | # The general cateory can be written out already now. |
253 | |
254 | flush(\@General, "Category.pl"); |
255 | |
256 | # |
257 | # Read in the LineBrk.txt. |
258 | # |
259 | |
260 | if (open(my $LineBrk, "LineBrk.txt")) { |
261 | my @Lbrk; |
262 | my %Lbrk; |
263 | |
264 | while (<$LineBrk>) { |
d2d499f5 |
265 | next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)/; |
d73e5302 |
266 | |
267 | my ($first, $last, $lbrk) = ($1, $2, $3); |
268 | |
269 | append(\@Lbrk, $first, $lbrk); |
270 | append($Lbrk{$lbrk} ||= [], $first); |
271 | if (defined $last) { |
272 | extend(\@Lbrk, $last); |
273 | extend($Lbrk{$lbrk}, $last); |
274 | } |
275 | } |
276 | |
277 | flush(\@Lbrk, "Lbrk.pl"); |
278 | foreach my $lbrk (sort keys %Lbrk) { |
279 | flush($Lbrk{$lbrk}, "Is/Lbrk$lbrk.pl"); |
280 | } |
281 | } else { |
282 | die "$0: LineBrk.txt: $!\n"; |
283 | } |
284 | |
285 | # |
286 | # Read in the ArabShap.txt. |
287 | # |
288 | |
289 | if (open(my $ArabShap, "ArabShap.txt")) { |
290 | my @ArabLink; |
291 | my @ArabLinkGroup; |
292 | |
293 | while (<$ArabShap>) { |
d2d499f5 |
294 | next unless /^[0-9A-Fa-f]+;/; |
d73e5302 |
295 | s/\s+$//; |
d2d499f5 |
296 | |
d73e5302 |
297 | my ($code, $name, $link, $linkgroup) = split(/\s*;\s*/); |
298 | |
299 | append(\@ArabLink, $code, $link); |
300 | append(\@ArabLinkGroup, $code, $linkgroup); |
301 | } |
302 | |
303 | flush(\@ArabLink, "ArabLink.pl"); |
304 | flush(\@ArabLinkGroup, "ArabLnkGrp.pl"); |
305 | } else { |
306 | die "$0: ArabShap.txt: $!\n"; |
307 | } |
308 | |
309 | # |
310 | # Read in the Jamo.txt. |
311 | # |
312 | |
313 | if (open(my $Jamo, "Jamo.txt")) { |
314 | my @Short; |
315 | |
316 | while (<$Jamo>) { |
d2d499f5 |
317 | next unless /^([0-9A-Fa-f]+)\s*;\s*(\w*)/; |
318 | |
319 | my ($code, $short) = ($1, $2); |
d73e5302 |
320 | |
321 | append(\@Short, $code, $short); |
322 | } |
323 | |
324 | flush(\@Short, "JamoShort.pl"); |
325 | } else { |
326 | die "$0: Jamo.txt: $!\n"; |
327 | } |
328 | |
329 | # |
330 | # Read in the Scripts.txt. |
331 | # |
332 | |
333 | my @Scripts; |
334 | |
335 | if (open(my $Scripts, "Scripts.txt")) { |
336 | while (<$Scripts>) { |
d2d499f5 |
337 | next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/; |
d73e5302 |
338 | |
339 | # Wait until all the scripts have been read since |
340 | # they are not listed in numeric order. |
341 | push @Scripts, [ hex($1), $1, $2, $3 ]; |
342 | } |
343 | } else { |
344 | die "$0: Scripts.txt: $!\n"; |
345 | } |
346 | |
347 | # Now append the scripts properties in their code point order. |
348 | |
349 | my %Script; |
350 | my $Scripts = []; |
351 | |
352 | for my $script (sort { $a->[0] <=> $b->[0] } @Scripts) { |
353 | my ($code, $first, $last, $name) = @$script; |
354 | append($Scripts, $first, $name); |
355 | append($Script{$name} ||= [], $first, $name); |
356 | if (defined $last) { |
357 | extend($Scripts, $last); |
358 | extend($Script{$name}, $last); |
359 | } |
360 | unless (defined $In{$name}) { |
71d929cb |
361 | $InScript{$InId} = $name; |
362 | $In{$name} = $InId++; |
363 | $InIn{$name} = $Script{$name}; |
d73e5302 |
364 | } |
365 | } |
366 | |
367 | # Scripts.pl can be written out already now. |
368 | |
369 | flush(\@Scripts, "Scripts.pl"); |
370 | |
371 | # Common is everything not explicitly assigned to a Script |
372 | |
373 | $In{Common} = $InId++; |
374 | my $Common = inverse($Scripts); |
375 | $InIn{Common} = $Common; |
376 | |
377 | # |
378 | # Read in the Blocks.txt. |
379 | # |
380 | |
381 | my @Blocks; |
382 | my %Blocks; |
383 | |
384 | if (open(my $Blocks, "Blocks.txt")) { |
385 | while (<$Blocks>) { |
d2d499f5 |
386 | next unless /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.+?)\s*$/; |
387 | |
d73e5302 |
388 | my ($first, $last, $name) = ($1, $2, $3); |
71d929cb |
389 | my $origname = $name; |
d73e5302 |
390 | |
391 | # If there's a naming conflict (the script names are |
392 | # in uppercase), the name of the block has " Block" |
393 | # appended to it. |
71d929cb |
394 | my $pat = $name; |
395 | $pat =~ s/([- _])/(?:[-_]|\\s+)?/g; |
396 | for my $i (values %InScript) { |
397 | if ($i =~ /^$pat$/i) { |
398 | $name .= " Block"; |
399 | last; |
400 | } |
401 | } |
d73e5302 |
402 | |
403 | append(\@Blocks, $first, $name); |
404 | append($Blocks{$name} ||= [], $first, $name); |
405 | if (defined $last) { |
406 | extend(\@Blocks, $last); |
407 | extend($Blocks{$name}, $last); |
408 | } |
409 | unless (defined $In{$name}) { |
71d929cb |
410 | $InBlock{$InId} = $origname; |
411 | $In{$name} = $InId++; |
412 | $InIn{$name} = $Blocks{$name}; |
d73e5302 |
413 | } |
414 | } |
415 | } else { |
416 | die "$0: Blocks.txt: $!\n"; |
417 | } |
418 | |
419 | # Blocks.pl can be written out already now. |
420 | |
421 | flush(\@Blocks, "Blocks.pl"); |
422 | |
423 | # |
424 | # Read in the PropList.txt. It contains extended properties not |
425 | # listed in the Unicode.txt, such as 'Other_Alphabetic': |
426 | # alphabetic but not of the general category L; many modifiers |
427 | # belong to this extended property category: while they are not |
428 | # alphabets, they are alphabetic in nature. |
429 | # |
430 | |
431 | my @Props; |
432 | |
433 | if (open(my $Props, "PropList.txt")) { |
434 | while (<$Props>) { |
d2d499f5 |
435 | next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/; |
d73e5302 |
436 | |
437 | # Wait until all the extended properties have been read since |
438 | # they are not listed in numeric order. |
439 | push @Props, [ hex($1), $1, $2, $3 ]; |
440 | } |
441 | } else { |
442 | die "$0: PropList.txt: $!\n"; |
443 | } |
444 | |
445 | # Now append the extended properties in their code point order. |
446 | |
447 | my %Prop; |
448 | my $Props = []; |
449 | |
450 | for my $prop (sort { $a->[0] <=> $b->[0] } @Props) { |
451 | my ($code, $first, $last, $name) = @$prop; |
452 | append($Props, $first, $name); |
453 | append($Prop{$name} ||= [], $first, $name); |
454 | if (defined $last) { |
455 | extend($Props, $last); |
456 | extend($Prop{$name}, $last); |
457 | } |
458 | unless (defined $In{$name}) { |
459 | $In{$name} = $InId++; |
460 | $InIn{$name} = $Prop{$name}; |
461 | } |
462 | } |
463 | |
464 | # Assigned is everything not Cn aka Noncharacter_Code_Point |
465 | |
466 | $In{Assigned} = $InId++; |
467 | my $Assigned = inverse($Prop{Noncharacter_Code_Point}); |
468 | $InIn{Assigned} = $Assigned; |
469 | |
470 | sub merge_general_and_extended { |
471 | my ($name, $general, $extended) = @_; |
472 | my $merged; |
473 | |
474 | push @$merged, |
475 | map { pop @{$_}; $_ } |
476 | sort { $a->[2] <=> $b->[2] } |
477 | map { [ $_->[0], $_->[1], hex($_->[0]) ] } |
478 | ($general ? |
479 | map { ref $_ ? @$_ : $_ } |
480 | @Cat {ref $general ? @$general : $general } : |
481 | (), |
482 | $extended ? |
483 | map { ref $_ ? @$_ : $_ } |
484 | @Prop{ref $extended ? @$extended : $extended} : |
485 | ()); |
486 | |
487 | $In{$name} = $InId++; |
488 | $InIn{$name} = $merged; |
489 | |
490 | return $merged; |
491 | } |
492 | |
493 | # Alphabetic is L and Other_Alphabetic. |
494 | |
495 | my $Alphabetic = |
496 | merge_general_and_extended('Alphabetic', 'L', 'Other_Alphabetic'); |
497 | |
498 | # Lowercase is Ll and Other_Lowercase. |
499 | |
500 | my $Lowercase = |
501 | merge_general_and_extended('Lowercase', 'Ll', 'Other_Lowercase'); |
502 | |
503 | # Uppercase is Lu and Other_Uppercase. |
504 | |
505 | my $Uppercase = |
506 | merge_general_and_extended('Uppercase', 'Lu', 'Other_Uppercase'); |
507 | |
508 | # Math is Sm and Other_Math. |
509 | |
510 | my $Math = |
511 | merge_general_and_extended('Math', 'Sm', 'Other_Math'); |
512 | |
513 | # Lampersand is Ll, Lu, and Lt. |
514 | |
515 | my $Lampersand = |
516 | merge_general_and_extended('Lampersand', [ qw(Ll Lu Lt) ]); |
517 | |
518 | # ID_Start is Ll, Lu, Lt, Lm, Lo, and Nl. |
519 | |
520 | my $ID_Start = |
521 | merge_general_and_extended('ID_Start', [ qw(Ll Lu Lt Lm Lo Nl) ]); |
522 | |
523 | # ID_Continue is ID_Start, Mn, Mc, Nd, and Pc. |
524 | |
525 | my $ID_Continue = |
526 | merge_general_and_extended('ID_Continue', [ qw(Ll Lu Lt Lm Lo Nl |
527 | Mn Mc Nd Pc) ]); |
528 | |
529 | # |
530 | # Any is any. |
531 | # |
532 | |
533 | $In{Any} = $InId++; |
534 | my $Any = [ [ 0, sprintf("%04X", $LastUnicodeCodepoint) ] ]; |
535 | $InIn{Any} = $Any; |
536 | |
537 | # |
538 | # mapping() will be used to write out the In and Is virtual mappings. |
539 | # |
540 | |
541 | sub mapping { |
542 | my ($map, $name) = @_; |
543 | |
544 | if (open(my $fh, ">$name.pl")) { |
545 | print "$name.pl\n"; |
546 | header($fh); |
547 | |
548 | # The %pat will hold a hash that maps the first two |
549 | # lowercased letters of a class to a 'fuzzified' regular |
550 | # expression that points to the real mapping. |
551 | |
552 | my %pat; |
553 | |
554 | # But first write out the offical name to real name |
555 | # (the filename) mapping. |
556 | |
557 | print $fh <<EOT; |
558 | %utf8::${name} = |
559 | ( |
560 | EOT |
67765ba6 |
561 | for my $i (sort { lc $a cmp lc $b } keys %$map) { |
d73e5302 |
562 | my $pat = $i; |
563 | # Here is the 'fuzzification': accept any space, |
564 | # dash, or underbar where in the official name |
565 | # there is space or a dash (or underbar, but |
566 | # there never is). |
567 | $pat =~ s/([- _])/(?:[-_]|\\s+)?/g; |
568 | # The prefix length of 2 is enough spread, |
569 | # and besides, we have 'Yi' as an In category. |
570 | push @{$pat{lc(substr($i, 0, 2))}}, [ $i, $pat ]; |
67765ba6 |
571 | printf $fh "%-45s => '$map->{$i}',\n", "'$i'"; |
d73e5302 |
572 | } |
573 | print $fh <<EOT; |
574 | ); |
575 | EOT |
576 | |
577 | # Now write out the %pat mapping. |
578 | |
579 | print $fh <<EOT; |
580 | %utf8::${name}Pat = |
581 | ( |
582 | EOT |
583 | foreach my $prefix (sort keys %pat) { |
584 | print $fh "'$prefix' => {\n"; |
585 | foreach my $ipat (@{$pat{$prefix}}) { |
586 | my ($i, $pat) = @$ipat; |
587 | print $fh "\t'$pat' => '$map->{$i}',\n"; |
588 | } |
589 | print $fh "},\n"; |
590 | } |
591 | print $fh <<EOT; |
592 | ); |
593 | EOT |
594 | |
595 | close($fh); |
596 | } else { |
597 | die "$0: $name.pl: $!\n"; |
598 | } |
599 | } |
600 | |
601 | # |
602 | # Write out the virtual In mappings. |
603 | # |
604 | |
605 | mapping(\%In, "In"); |
606 | |
71d929cb |
607 | # |
608 | # Append the InScript and InBlock mappings. |
609 | # These are needed only if Script= and Block= syntaxes are used. |
610 | # |
611 | |
612 | if (open(my $In, ">>In.pl")) { |
613 | print $In <<EOT; |
614 | |
615 | %utf8::InScript = |
616 | ( |
617 | EOT |
618 | for my $i (sort { $a <=> $b } keys %InScript) { |
619 | printf $In "%4d => '$InScript{$i}',\n", $i; |
620 | } |
621 | print $In <<EOT; |
622 | ); |
623 | EOT |
624 | |
625 | print $In <<EOT; |
626 | |
627 | %utf8::InBlock = |
628 | ( |
629 | EOT |
630 | for my $i (sort { $a <=> $b } keys %InBlock) { |
631 | printf $In "%4d => '$InBlock{$i}',\n", $i; |
632 | } |
633 | print $In <<EOT; |
634 | ); |
635 | EOT |
636 | } else { |
637 | die "$0: In.pl: $!\n"; |
638 | } |
639 | |
d73e5302 |
640 | # Easy low-calorie cheat. |
641 | use File::Copy; |
642 | copy("In/$In{Noncharacter_Code_Point}.pl", "Is/Cn.pl"); |
643 | |
644 | # |
645 | # Write out the real In mappings |
646 | # (the In.pl written out just above has the virtual In mappings) |
647 | # |
648 | |
649 | foreach my $in (sort { $In{$a} <=> $In{$b} } keys %In) { |
650 | flush($InIn{$in}, "In/$In{$in}.pl"); |
651 | } |
652 | |
653 | # |
654 | # The mapping from General Category long forms to short forms is |
655 | # currently hardwired here since no simple data file in the UCD |
656 | # seems to do that. |
657 | # |
658 | |
659 | my %Is = ( |
660 | 'Letter' => 'L', |
661 | 'Uppercase Letter' => 'Lu', |
662 | 'Lowercase Letter' => 'Ll', |
663 | 'Titlecase Letter' => 'Lt', |
664 | 'Modifier Letter' => 'Lm', |
665 | 'Other Letter' => 'Lo', |
666 | |
667 | 'Mark' => 'M', |
668 | 'Non-Spacing Mark' => 'Mn', |
669 | 'Spacing Combining Mark' => 'Mc', |
670 | 'Enclosing Mark' => 'Me', |
671 | |
672 | 'Separator' => 'Z', |
673 | 'Space Separator' => 'Zs', |
674 | 'Line Separator' => 'Zl', |
675 | 'Paragraph Separator' => 'Zp', |
676 | |
677 | 'Number' => 'N', |
678 | 'Decimal Digit Number' => 'Nd', |
679 | 'Letter Number' => 'Nl', |
680 | 'Other Number' => 'No', |
681 | |
682 | 'Punctuation' => 'P', |
683 | 'Connector Punctuation' => 'Pc', |
684 | 'Dash Punctuation' => 'Pd', |
685 | 'Open Punctuation' => 'Ps', |
686 | 'Close Punctuation' => 'Pe', |
687 | 'Initial Punctuation' => 'Pi', |
688 | 'Final Punctuation' => 'Pf', |
689 | 'Other Punctuation' => 'Po', |
690 | |
691 | 'Symbol' => 'S', |
692 | 'Math Symbol' => 'Sm', |
693 | 'Currency Symbol' => 'Sc', |
694 | 'Modifier Symbol' => 'Sk', |
695 | 'Other Symbol' => 'So', |
696 | |
697 | 'Other' => 'C', |
698 | 'Control' => 'Cc', |
699 | 'Format' => 'Cf', |
700 | 'Surrogate' => 'Cs', |
701 | 'Private Use' => 'Co', |
702 | 'Not Assigned' => 'Cn', |
703 | # 'Other' aliases |
704 | 'Other Control' => 'Cc', |
705 | 'Other Format' => 'Cf', |
706 | 'Other Surrogate' => 'Cs', |
707 | 'Other Private Use' => 'Co', |
708 | 'Other Not Assigned' => 'Cn', |
709 | ); |
710 | |
711 | # |
712 | # Write out the virtual Is mappings. |
713 | # |
714 | |
715 | mapping(\%Is, "Is"); |
716 | |
d2d499f5 |
717 | # |
718 | # Read in the special cases. |
719 | # |
720 | |
721 | my %Case; |
722 | |
723 | if (open(my $SpecCase, "SpecCase.txt")) { |
724 | while (<$SpecCase>) { |
725 | next unless /^[0-9A-Fa-f]+;/; |
726 | s/\#.*//; |
727 | s/\s+$//; |
728 | |
729 | my ($code, $lower, $title, $upper, $condition) = split(/\s*;\s*/); |
730 | |
731 | if ($condition) { # not implemented yet |
732 | print "# SKIPPING $_\n"; |
733 | next; |
734 | } |
735 | |
736 | # Wait until all the special cases have been read since |
737 | # they are not listed in numeric order. |
738 | my $ix = hex($code); |
739 | push @{$Case{Lower}}, [ $ix, $code, $lower ]; |
740 | push @{$Case{Title}}, [ $ix, $code, $title ]; |
741 | push @{$Case{Upper}}, [ $ix, $code, $upper ]; |
742 | } |
743 | } else { |
744 | die "$0: SpecCase.txt: $!\n"; |
745 | } |
746 | |
747 | # Now write out the special cases properties in their code point order. |
983ffd37 |
748 | # Prepend them to the To/{Upper,Lower,Title}.pl. |
d2d499f5 |
749 | |
750 | for my $case (qw(Lower Title Upper)) { |
c4051cc5 |
751 | my $NormalCase = do "To/$case.pl" || die "$0: To/$case.pl: $!\n"; |
983ffd37 |
752 | if (open(my $Case, ">To/$case.pl")) { |
753 | header($Case); |
754 | print $Case <<EOT; |
755 | |
756 | %utf8::ToSpec$case = ( |
757 | EOT |
758 | for my $prop (sort { $a->[0] <=> $b->[0] } @{$Case{$case}}) { |
759 | my ($ix, $code, $to) = @$prop; |
760 | my $tostr = |
761 | join "", map { sprintf "\\x{%s}", $_ } split ' ', $to; |
6d47b937 |
762 | printf $Case qq['%04X' => "$tostr",\n], $ix; |
983ffd37 |
763 | } |
764 | print $Case <<EOT; |
765 | ); |
766 | |
767 | EOT |
768 | begin($Case); |
769 | print $Case $NormalCase; |
770 | end($Case); |
771 | } else { |
772 | die "$0: To/$case.txt: $!\n"; |
d2d499f5 |
773 | } |
d2d499f5 |
774 | } |
775 | |
c4051cc5 |
776 | # |
777 | # Read in the case foldings. |
778 | # |
779 | # We will do full case folding, C + F + I (see CaseFold.txt). |
780 | # |
781 | |
782 | if (open(my $CaseFold, "CaseFold.txt")) { |
783 | my @Fold; |
784 | my %Fold; |
785 | |
786 | while (<$CaseFold>) { |
787 | next unless /^([0-9A-Fa-f]+)\s*;\s*([CFI])\s*;\s*([0-9A-Fa-f]+(?: [0-9A-Fa-f]+)*)\s*;/; |
788 | |
789 | my ($code, $status, $fold) = ($1, $2, $3); |
790 | |
791 | if ($status eq 'C') { # Common: one-to-one folding |
792 | append(\@Fold, $code, $fold); |
793 | } else { # F: full, or I: dotted uppercase I -> dotless lowercase I |
794 | $Fold{hex($code)} = $fold; |
795 | } |
796 | } |
797 | |
798 | flush(\@Fold, "To/Fold.pl"); |
799 | |
800 | # |
801 | # Prepend the special foldings to the common foldings. |
802 | # |
803 | |
804 | my $CommonFold = do "To/Fold.pl" || die "$0: To/Fold.pl: $!\n"; |
805 | if (open(my $Fold, ">To/Fold.pl")) { |
806 | header($Fold); |
807 | print $Fold <<EOT; |
808 | |
809 | %utf8::ToSpecFold = ( |
810 | EOT |
811 | for my $code (sort { $a <=> $b } keys %Fold) { |
812 | my $foldstr = |
813 | join "", map { sprintf "\\x{%s}", $_ } split ' ', $Fold{$code}; |
6d47b937 |
814 | printf $Fold qq['%04X' => "$foldstr",\n], $code; |
c4051cc5 |
815 | } |
816 | print $Fold <<EOT; |
817 | ); |
818 | |
819 | EOT |
820 | begin($Fold); |
821 | print $Fold $CommonFold; |
822 | end($Fold); |
823 | } else { |
824 | die "$0: To/Fold.pl: $!\n"; |
825 | } |
826 | } else { |
827 | die "$0: CaseFold.txt: $!\n"; |
828 | } |
829 | |
d73e5302 |
830 | # That's all, folks! |
831 | |