Commit | Line | Data |
d73e5302 |
1 | #!/usr/bin/perl -w |
2 | |
3 | # |
4 | # mktables -- create the runtime Perl Unicode files (lib/unicore/**/*.pl) |
5 | # from the Unicode database files (lib/unicore/*.txt). |
6 | # |
7 | |
d73e5302 |
8 | use strict; |
9 | |
d2d499f5 |
10 | my $LastUnicodeCodepoint = 0x10FFFF; # As of Unicode 3.1.1. |
11 | |
d73e5302 |
12 | mkdir("In", 0755); |
13 | mkdir("Is", 0755); |
14 | mkdir("To", 0755); |
15 | |
16 | sub extend { |
17 | my ($table, $last) = @_; |
18 | |
19 | $table->[-1]->[1] = $last; |
20 | } |
21 | |
22 | sub append { |
23 | my ($table, $code, $name) = @_; |
24 | if (@$table && |
25 | hex($table->[-1]->[1]) == hex($code) - 1 && |
26 | (!defined $name || $table->[-1]->[2] eq $name)) { |
27 | extend($table, $code); |
28 | } else { |
29 | push @$table, [$code, $code, $name]; |
30 | } |
31 | } |
32 | |
33 | sub inverse { |
34 | my ($table) = @_; |
35 | my $inverse = []; |
36 | my ($first, $last); |
37 | if ($table->[0]->[0]) { |
38 | $last = hex($table->[0]->[0]); |
39 | push @$inverse, [ "0000", |
40 | sprintf("%04X", $last - 1) ]; |
41 | } |
42 | for my $i (0..$#$table-1) { |
43 | $first = defined $table->[$i ]->[1] ? |
44 | hex($table->[$i ]->[1]) : 0; |
45 | $last = defined $table->[$i + 1]->[0] ? |
46 | hex($table->[$i + 1]->[0]) : $first; |
47 | push @$inverse, [ sprintf("%04X", $first + 1), |
48 | sprintf("%04X", $last - 1) ] |
49 | unless $first + 1 == $last; |
50 | } |
51 | return $inverse; |
52 | } |
53 | |
54 | sub header { |
55 | my $fh = shift; |
56 | |
57 | print $fh <<EOT; |
58 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
59 | # This file is built by $0 from e.g. Unicode.txt. |
60 | # Any changes made here will be lost! |
61 | EOT |
62 | } |
63 | |
64 | sub begin { |
65 | my $fh = shift; |
66 | |
67 | print $fh <<EOT; |
68 | return <<'END'; |
69 | EOT |
70 | } |
71 | |
72 | sub end { |
73 | my $fh = shift; |
74 | |
75 | print $fh <<EOT; |
76 | END |
77 | EOT |
78 | } |
79 | |
80 | sub flush { |
81 | my ($table, $file) = @_; |
82 | print "$file\n"; |
83 | if (open(my $fh, ">$file")) { |
84 | header($fh); |
85 | begin($fh); |
86 | for my $i (@$table) { |
87 | print $fh $i->[0], "\t", |
88 | $i->[1] ne $i->[0] ? $i->[1] : "", "\t", |
89 | defined $i->[2] ? $i->[2] : "", "\n"; |
90 | } |
91 | end($fh); |
92 | close($fh); |
93 | } else { |
94 | die "$0: $file: $!\n"; |
95 | } |
96 | } |
97 | |
98 | # |
99 | # The %In contains the mapping of the script/block name into a number. |
100 | # |
101 | |
102 | my %In; |
103 | my $InId = 0; |
104 | my %InIn; |
105 | |
106 | # |
107 | # Read in the Unicode.txt, the main Unicode database. |
108 | # |
109 | |
110 | my %Cat; |
111 | my %General; |
112 | my @General; |
113 | |
114 | if (open(my $Unicode, "Unicode.txt")) { |
115 | my @Name; |
116 | my @Bidi; |
117 | my %Bidi; |
118 | my @Comb; |
119 | my @Deco; |
120 | my %Deco; |
121 | my %DC; |
122 | my @Number; |
123 | my @Mirrored; |
124 | my %To; |
125 | while (<$Unicode>) { |
d2d499f5 |
126 | next unless /^[0-9A-Fa-f]+;/; |
d73e5302 |
127 | s/\s+$//; |
d2d499f5 |
128 | |
d73e5302 |
129 | my ($code, $name, $cat, $comb, $bidi, $deco, |
130 | $decimal, $digit, $number, |
131 | $mirrored, $unicode10, $comment, |
132 | $upper, $lower, $title) = split(/\s*;\s*/); |
133 | |
134 | if ($name =~ /^<(.+), (First|Last)>$/) { |
135 | $name = $1; |
136 | if ($2 eq 'First') { |
137 | append($General{$name} ||= [], $code, $name); |
138 | } else { |
139 | extend($General{$name} , $code); |
140 | } |
141 | unless (defined $In{$name}) { |
142 | $In{$name} = $InId++; |
143 | $InIn{$name} = $General{$name}; |
144 | } |
145 | append($Cat{$cat} ||= [], $code); |
146 | append($Cat{substr($cat, 0, 1)} |
147 | ||= [], $code); |
148 | } else { |
149 | append(\@Name, $code, $name); |
150 | |
151 | append(\@General, $code, $cat); |
152 | |
153 | append($Cat{$cat} ||= [], $code); |
154 | append($Cat{substr($cat, 0, 1)} |
155 | ||= [], $code); |
156 | # 005F: SPACING UNDERSCORE |
157 | append($Cat{Word} ||= [], $code) |
158 | if $cat =~ /^[LMN]/ or $code eq "005F"; |
159 | append($Cat{Alnum} ||= [], $code) |
160 | if $cat =~ /^[LMN]/; |
161 | append($Cat{Alpha} ||= [], $code) |
162 | if $cat =~ /^[LM]/; |
163 | # 0009: HORIZONTAL TABULATION |
164 | # 000A: LINE FEED |
165 | # 000B: VERTICAL TABULATION |
166 | # 000C: FORM FEED |
167 | # 000D: CARRIAGE RETURN |
168 | # 0020: SPACE |
169 | append($Cat{Space} ||= [], $code) |
170 | if $cat =~ /^Z/ || |
171 | $code =~ /^(0009|000A|000B|000C|000D)$/; |
172 | append($Cat{SpacePerl} ||= [], $code) |
173 | if $cat =~ /^Z/ || |
174 | $code =~ /^(0009|000A|000C|000D)$/; |
175 | append($Cat{Blank} ||= [], $code) |
176 | if $code =~ /^(0020|0009)$/ || |
177 | $cat =~ /^Z[^lp]$/; |
178 | append($Cat{Digit} ||= [], $code) if $cat eq "Nd"; |
179 | append($Cat{Upper} ||= [], $code) if $cat eq "Lu"; |
180 | append($Cat{Lower} ||= [], $code) if $cat eq "Ll"; |
181 | append($Cat{Title} ||= [], $code) if $cat eq "Lt"; |
182 | append($Cat{ASCII} ||= [], $code) if $code le "007F"; |
183 | append($Cat{Cntrl} ||= [], $code) if $cat =~ /^C/; |
184 | append($Cat{Graph} ||= [], $code) if $cat =~ /^([LMNPS]|Co)/; |
185 | append($Cat{Print} ||= [], $code) if $cat =~ /^([LMNPS]|Co|Zs)/; |
186 | append($Cat{Punct} ||= [], $code) if $cat =~ /^P/; |
187 | # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f |
188 | append($Cat{XDigit} ||= [], $code) |
189 | if $code =~ /^00(3[0-9]|[46][1-6])$/; |
190 | |
191 | append($To{Upper} ||= [], $code, $upper) if $upper; |
192 | append($To{Lower} ||= [], $code, $lower) if $lower; |
193 | append($To{Title} ||= [], $code, $title) if $title; |
194 | append($To{Digit} ||= [], $code, $decimal) if $decimal; |
195 | |
196 | append(\@Bidi, $code, $bidi); |
197 | append($Bidi{$bidi} ||= [], $code); |
198 | |
199 | append(\@Comb, $code, $comb) if $comb; |
200 | |
201 | if ($deco) { |
202 | append(\@Deco, $code, $deco); |
203 | if ($deco =~/^<(\w+)>/) { |
204 | append($Deco{Compat} ||= [], $code); |
205 | append($DC{$1} ||= [], $code); |
206 | } else { |
207 | append($Deco{Canon} ||= [], $code); |
208 | } |
209 | } |
210 | |
211 | append(\@Number, $code, $number) if $number; |
212 | |
213 | append(\@Mirrored, $code) if $mirrored eq "Y"; |
214 | } |
215 | } |
216 | |
217 | flush(\@Name, "Name.pl"); |
218 | |
219 | foreach my $cat (sort keys %Cat) { |
220 | flush($Cat{$cat}, "Is/$cat.pl"); |
221 | } |
222 | |
223 | foreach my $to (sort keys %To) { |
224 | flush($To{$to}, "To/$to.pl"); |
225 | } |
226 | |
227 | flush(\@Bidi, "Bidirectional.pl"); |
228 | foreach my $bidi (sort keys %Bidi) { |
229 | flush($Bidi{$bidi}, "Is/Bidi$bidi.pl"); |
230 | } |
231 | |
232 | flush(\@Comb, "CombiningClass.pl"); |
233 | |
234 | flush(\@Deco, "Decomposition.pl"); |
235 | foreach my $deco (sort keys %Deco) { |
236 | flush($Deco{$deco}, "Is/Deco$deco.pl"); |
237 | } |
238 | foreach my $dc (sort keys %DC) { |
239 | flush($DC{$dc}, "Is/DC$dc.pl"); |
240 | } |
241 | |
242 | flush(\@Number, "Number.pl"); |
243 | |
244 | flush(\@Mirrored, "Is/Mirrored.pl"); |
245 | } else { |
246 | die "$0: Unicode.txt: $!\n"; |
247 | } |
248 | |
249 | # The general cateory can be written out already now. |
250 | |
251 | flush(\@General, "Category.pl"); |
252 | |
253 | # |
254 | # Read in the LineBrk.txt. |
255 | # |
256 | |
257 | if (open(my $LineBrk, "LineBrk.txt")) { |
258 | my @Lbrk; |
259 | my %Lbrk; |
260 | |
261 | while (<$LineBrk>) { |
d2d499f5 |
262 | next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)/; |
d73e5302 |
263 | |
264 | my ($first, $last, $lbrk) = ($1, $2, $3); |
265 | |
266 | append(\@Lbrk, $first, $lbrk); |
267 | append($Lbrk{$lbrk} ||= [], $first); |
268 | if (defined $last) { |
269 | extend(\@Lbrk, $last); |
270 | extend($Lbrk{$lbrk}, $last); |
271 | } |
272 | } |
273 | |
274 | flush(\@Lbrk, "Lbrk.pl"); |
275 | foreach my $lbrk (sort keys %Lbrk) { |
276 | flush($Lbrk{$lbrk}, "Is/Lbrk$lbrk.pl"); |
277 | } |
278 | } else { |
279 | die "$0: LineBrk.txt: $!\n"; |
280 | } |
281 | |
282 | # |
283 | # Read in the ArabShap.txt. |
284 | # |
285 | |
286 | if (open(my $ArabShap, "ArabShap.txt")) { |
287 | my @ArabLink; |
288 | my @ArabLinkGroup; |
289 | |
290 | while (<$ArabShap>) { |
d2d499f5 |
291 | next unless /^[0-9A-Fa-f]+;/; |
d73e5302 |
292 | s/\s+$//; |
d2d499f5 |
293 | |
d73e5302 |
294 | my ($code, $name, $link, $linkgroup) = split(/\s*;\s*/); |
295 | |
296 | append(\@ArabLink, $code, $link); |
297 | append(\@ArabLinkGroup, $code, $linkgroup); |
298 | } |
299 | |
300 | flush(\@ArabLink, "ArabLink.pl"); |
301 | flush(\@ArabLinkGroup, "ArabLnkGrp.pl"); |
302 | } else { |
303 | die "$0: ArabShap.txt: $!\n"; |
304 | } |
305 | |
306 | # |
307 | # Read in the Jamo.txt. |
308 | # |
309 | |
310 | if (open(my $Jamo, "Jamo.txt")) { |
311 | my @Short; |
312 | |
313 | while (<$Jamo>) { |
d2d499f5 |
314 | next unless /^([0-9A-Fa-f]+)\s*;\s*(\w*)/; |
315 | |
316 | my ($code, $short) = ($1, $2); |
d73e5302 |
317 | |
318 | append(\@Short, $code, $short); |
319 | } |
320 | |
321 | flush(\@Short, "JamoShort.pl"); |
322 | } else { |
323 | die "$0: Jamo.txt: $!\n"; |
324 | } |
325 | |
326 | # |
327 | # Read in the Scripts.txt. |
328 | # |
329 | |
330 | my @Scripts; |
331 | |
332 | if (open(my $Scripts, "Scripts.txt")) { |
333 | while (<$Scripts>) { |
d2d499f5 |
334 | next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/; |
d73e5302 |
335 | |
336 | # Wait until all the scripts have been read since |
337 | # they are not listed in numeric order. |
338 | push @Scripts, [ hex($1), $1, $2, $3 ]; |
339 | } |
340 | } else { |
341 | die "$0: Scripts.txt: $!\n"; |
342 | } |
343 | |
344 | # Now append the scripts properties in their code point order. |
345 | |
346 | my %Script; |
347 | my $Scripts = []; |
348 | |
349 | for my $script (sort { $a->[0] <=> $b->[0] } @Scripts) { |
350 | my ($code, $first, $last, $name) = @$script; |
351 | append($Scripts, $first, $name); |
352 | append($Script{$name} ||= [], $first, $name); |
353 | if (defined $last) { |
354 | extend($Scripts, $last); |
355 | extend($Script{$name}, $last); |
356 | } |
357 | unless (defined $In{$name}) { |
358 | $In{$name} = $InId++; |
359 | $InIn{$name} = $Script{$name}; |
360 | } |
361 | } |
362 | |
363 | # Scripts.pl can be written out already now. |
364 | |
365 | flush(\@Scripts, "Scripts.pl"); |
366 | |
367 | # Common is everything not explicitly assigned to a Script |
368 | |
369 | $In{Common} = $InId++; |
370 | my $Common = inverse($Scripts); |
371 | $InIn{Common} = $Common; |
372 | |
373 | # |
374 | # Read in the Blocks.txt. |
375 | # |
376 | |
377 | my @Blocks; |
378 | my %Blocks; |
379 | |
380 | if (open(my $Blocks, "Blocks.txt")) { |
381 | while (<$Blocks>) { |
d2d499f5 |
382 | next unless /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.+?)\s*$/; |
383 | |
d73e5302 |
384 | my ($first, $last, $name) = ($1, $2, $3); |
385 | |
386 | # If there's a naming conflict (the script names are |
387 | # in uppercase), the name of the block has " Block" |
388 | # appended to it. |
389 | $name = "$name Block" if defined $In{"\U$name"}; |
390 | |
391 | append(\@Blocks, $first, $name); |
392 | append($Blocks{$name} ||= [], $first, $name); |
393 | if (defined $last) { |
394 | extend(\@Blocks, $last); |
395 | extend($Blocks{$name}, $last); |
396 | } |
397 | unless (defined $In{$name}) { |
398 | $In{$name} = $InId++; |
399 | $InIn{$name} = $Blocks{$name}; |
400 | } |
401 | } |
402 | } else { |
403 | die "$0: Blocks.txt: $!\n"; |
404 | } |
405 | |
406 | # Blocks.pl can be written out already now. |
407 | |
408 | flush(\@Blocks, "Blocks.pl"); |
409 | |
410 | # |
411 | # Read in the PropList.txt. It contains extended properties not |
412 | # listed in the Unicode.txt, such as 'Other_Alphabetic': |
413 | # alphabetic but not of the general category L; many modifiers |
414 | # belong to this extended property category: while they are not |
415 | # alphabets, they are alphabetic in nature. |
416 | # |
417 | |
418 | my @Props; |
419 | |
420 | if (open(my $Props, "PropList.txt")) { |
421 | while (<$Props>) { |
d2d499f5 |
422 | next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/; |
d73e5302 |
423 | |
424 | # Wait until all the extended properties have been read since |
425 | # they are not listed in numeric order. |
426 | push @Props, [ hex($1), $1, $2, $3 ]; |
427 | } |
428 | } else { |
429 | die "$0: PropList.txt: $!\n"; |
430 | } |
431 | |
432 | # Now append the extended properties in their code point order. |
433 | |
434 | my %Prop; |
435 | my $Props = []; |
436 | |
437 | for my $prop (sort { $a->[0] <=> $b->[0] } @Props) { |
438 | my ($code, $first, $last, $name) = @$prop; |
439 | append($Props, $first, $name); |
440 | append($Prop{$name} ||= [], $first, $name); |
441 | if (defined $last) { |
442 | extend($Props, $last); |
443 | extend($Prop{$name}, $last); |
444 | } |
445 | unless (defined $In{$name}) { |
446 | $In{$name} = $InId++; |
447 | $InIn{$name} = $Prop{$name}; |
448 | } |
449 | } |
450 | |
451 | # Assigned is everything not Cn aka Noncharacter_Code_Point |
452 | |
453 | $In{Assigned} = $InId++; |
454 | my $Assigned = inverse($Prop{Noncharacter_Code_Point}); |
455 | $InIn{Assigned} = $Assigned; |
456 | |
457 | sub merge_general_and_extended { |
458 | my ($name, $general, $extended) = @_; |
459 | my $merged; |
460 | |
461 | push @$merged, |
462 | map { pop @{$_}; $_ } |
463 | sort { $a->[2] <=> $b->[2] } |
464 | map { [ $_->[0], $_->[1], hex($_->[0]) ] } |
465 | ($general ? |
466 | map { ref $_ ? @$_ : $_ } |
467 | @Cat {ref $general ? @$general : $general } : |
468 | (), |
469 | $extended ? |
470 | map { ref $_ ? @$_ : $_ } |
471 | @Prop{ref $extended ? @$extended : $extended} : |
472 | ()); |
473 | |
474 | $In{$name} = $InId++; |
475 | $InIn{$name} = $merged; |
476 | |
477 | return $merged; |
478 | } |
479 | |
480 | # Alphabetic is L and Other_Alphabetic. |
481 | |
482 | my $Alphabetic = |
483 | merge_general_and_extended('Alphabetic', 'L', 'Other_Alphabetic'); |
484 | |
485 | # Lowercase is Ll and Other_Lowercase. |
486 | |
487 | my $Lowercase = |
488 | merge_general_and_extended('Lowercase', 'Ll', 'Other_Lowercase'); |
489 | |
490 | # Uppercase is Lu and Other_Uppercase. |
491 | |
492 | my $Uppercase = |
493 | merge_general_and_extended('Uppercase', 'Lu', 'Other_Uppercase'); |
494 | |
495 | # Math is Sm and Other_Math. |
496 | |
497 | my $Math = |
498 | merge_general_and_extended('Math', 'Sm', 'Other_Math'); |
499 | |
500 | # Lampersand is Ll, Lu, and Lt. |
501 | |
502 | my $Lampersand = |
503 | merge_general_and_extended('Lampersand', [ qw(Ll Lu Lt) ]); |
504 | |
505 | # ID_Start is Ll, Lu, Lt, Lm, Lo, and Nl. |
506 | |
507 | my $ID_Start = |
508 | merge_general_and_extended('ID_Start', [ qw(Ll Lu Lt Lm Lo Nl) ]); |
509 | |
510 | # ID_Continue is ID_Start, Mn, Mc, Nd, and Pc. |
511 | |
512 | my $ID_Continue = |
513 | merge_general_and_extended('ID_Continue', [ qw(Ll Lu Lt Lm Lo Nl |
514 | Mn Mc Nd Pc) ]); |
515 | |
516 | # |
517 | # Any is any. |
518 | # |
519 | |
520 | $In{Any} = $InId++; |
521 | my $Any = [ [ 0, sprintf("%04X", $LastUnicodeCodepoint) ] ]; |
522 | $InIn{Any} = $Any; |
523 | |
524 | # |
525 | # mapping() will be used to write out the In and Is virtual mappings. |
526 | # |
527 | |
528 | sub mapping { |
529 | my ($map, $name) = @_; |
530 | |
531 | if (open(my $fh, ">$name.pl")) { |
532 | print "$name.pl\n"; |
533 | header($fh); |
534 | |
535 | # The %pat will hold a hash that maps the first two |
536 | # lowercased letters of a class to a 'fuzzified' regular |
537 | # expression that points to the real mapping. |
538 | |
539 | my %pat; |
540 | |
541 | # But first write out the offical name to real name |
542 | # (the filename) mapping. |
543 | |
544 | print $fh <<EOT; |
545 | %utf8::${name} = |
546 | ( |
547 | EOT |
67765ba6 |
548 | for my $i (sort { lc $a cmp lc $b } keys %$map) { |
d73e5302 |
549 | my $pat = $i; |
550 | # Here is the 'fuzzification': accept any space, |
551 | # dash, or underbar where in the official name |
552 | # there is space or a dash (or underbar, but |
553 | # there never is). |
554 | $pat =~ s/([- _])/(?:[-_]|\\s+)?/g; |
555 | # The prefix length of 2 is enough spread, |
556 | # and besides, we have 'Yi' as an In category. |
557 | push @{$pat{lc(substr($i, 0, 2))}}, [ $i, $pat ]; |
67765ba6 |
558 | printf $fh "%-45s => '$map->{$i}',\n", "'$i'"; |
d73e5302 |
559 | } |
560 | print $fh <<EOT; |
561 | ); |
562 | EOT |
563 | |
564 | # Now write out the %pat mapping. |
565 | |
566 | print $fh <<EOT; |
567 | %utf8::${name}Pat = |
568 | ( |
569 | EOT |
570 | foreach my $prefix (sort keys %pat) { |
571 | print $fh "'$prefix' => {\n"; |
572 | foreach my $ipat (@{$pat{$prefix}}) { |
573 | my ($i, $pat) = @$ipat; |
574 | print $fh "\t'$pat' => '$map->{$i}',\n"; |
575 | } |
576 | print $fh "},\n"; |
577 | } |
578 | print $fh <<EOT; |
579 | ); |
580 | EOT |
581 | |
582 | close($fh); |
583 | } else { |
584 | die "$0: $name.pl: $!\n"; |
585 | } |
586 | } |
587 | |
588 | # |
589 | # Write out the virtual In mappings. |
590 | # |
591 | |
592 | mapping(\%In, "In"); |
593 | |
594 | # Easy low-calorie cheat. |
595 | use File::Copy; |
596 | copy("In/$In{Noncharacter_Code_Point}.pl", "Is/Cn.pl"); |
597 | |
598 | # |
599 | # Write out the real In mappings |
600 | # (the In.pl written out just above has the virtual In mappings) |
601 | # |
602 | |
603 | foreach my $in (sort { $In{$a} <=> $In{$b} } keys %In) { |
604 | flush($InIn{$in}, "In/$In{$in}.pl"); |
605 | } |
606 | |
607 | # |
608 | # The mapping from General Category long forms to short forms is |
609 | # currently hardwired here since no simple data file in the UCD |
610 | # seems to do that. |
611 | # |
612 | |
613 | my %Is = ( |
614 | 'Letter' => 'L', |
615 | 'Uppercase Letter' => 'Lu', |
616 | 'Lowercase Letter' => 'Ll', |
617 | 'Titlecase Letter' => 'Lt', |
618 | 'Modifier Letter' => 'Lm', |
619 | 'Other Letter' => 'Lo', |
620 | |
621 | 'Mark' => 'M', |
622 | 'Non-Spacing Mark' => 'Mn', |
623 | 'Spacing Combining Mark' => 'Mc', |
624 | 'Enclosing Mark' => 'Me', |
625 | |
626 | 'Separator' => 'Z', |
627 | 'Space Separator' => 'Zs', |
628 | 'Line Separator' => 'Zl', |
629 | 'Paragraph Separator' => 'Zp', |
630 | |
631 | 'Number' => 'N', |
632 | 'Decimal Digit Number' => 'Nd', |
633 | 'Letter Number' => 'Nl', |
634 | 'Other Number' => 'No', |
635 | |
636 | 'Punctuation' => 'P', |
637 | 'Connector Punctuation' => 'Pc', |
638 | 'Dash Punctuation' => 'Pd', |
639 | 'Open Punctuation' => 'Ps', |
640 | 'Close Punctuation' => 'Pe', |
641 | 'Initial Punctuation' => 'Pi', |
642 | 'Final Punctuation' => 'Pf', |
643 | 'Other Punctuation' => 'Po', |
644 | |
645 | 'Symbol' => 'S', |
646 | 'Math Symbol' => 'Sm', |
647 | 'Currency Symbol' => 'Sc', |
648 | 'Modifier Symbol' => 'Sk', |
649 | 'Other Symbol' => 'So', |
650 | |
651 | 'Other' => 'C', |
652 | 'Control' => 'Cc', |
653 | 'Format' => 'Cf', |
654 | 'Surrogate' => 'Cs', |
655 | 'Private Use' => 'Co', |
656 | 'Not Assigned' => 'Cn', |
657 | # 'Other' aliases |
658 | 'Other Control' => 'Cc', |
659 | 'Other Format' => 'Cf', |
660 | 'Other Surrogate' => 'Cs', |
661 | 'Other Private Use' => 'Co', |
662 | 'Other Not Assigned' => 'Cn', |
663 | ); |
664 | |
665 | # |
666 | # Write out the virtual Is mappings. |
667 | # |
668 | |
669 | mapping(\%Is, "Is"); |
670 | |
d2d499f5 |
671 | # |
672 | # Read in the special cases. |
673 | # |
674 | |
675 | my %Case; |
676 | |
677 | if (open(my $SpecCase, "SpecCase.txt")) { |
678 | while (<$SpecCase>) { |
679 | next unless /^[0-9A-Fa-f]+;/; |
680 | s/\#.*//; |
681 | s/\s+$//; |
682 | |
683 | my ($code, $lower, $title, $upper, $condition) = split(/\s*;\s*/); |
684 | |
685 | if ($condition) { # not implemented yet |
686 | print "# SKIPPING $_\n"; |
687 | next; |
688 | } |
689 | |
690 | # Wait until all the special cases have been read since |
691 | # they are not listed in numeric order. |
692 | my $ix = hex($code); |
693 | push @{$Case{Lower}}, [ $ix, $code, $lower ]; |
694 | push @{$Case{Title}}, [ $ix, $code, $title ]; |
695 | push @{$Case{Upper}}, [ $ix, $code, $upper ]; |
696 | } |
697 | } else { |
698 | die "$0: SpecCase.txt: $!\n"; |
699 | } |
700 | |
701 | # Now write out the special cases properties in their code point order. |
702 | # The To/Spec{Lower,Title,Upper}.pl are unused for now since the swash |
703 | # routines do not do returning multiple characters. |
704 | |
705 | for my $case (qw(Lower Title Upper)) { |
706 | my @case; |
707 | for my $prop (sort { $a->[0] <=> $b->[0] } @{$Case{$case}}) { |
708 | my ($ix, $code, $to) = @$prop; |
709 | append(\@case, $code, $to); |
710 | } |
711 | flush(\@case, "To/Spec$case.pl"); |
712 | } |
713 | |
d73e5302 |
714 | # That's all, folks! |
715 | |