Memoize tests
[p5sagit/p5-mst-13.2.git] / lib / unicore / mktables.PL
CommitLineData
a0ed51b3 1#!../../miniperl
2
7c6f5cd2 3use bytes;
4
190eec7c 5$UnicodeData = "Unicode.txt";
7c6f5cd2 6$SyllableData = "syllables.txt";
8836d2a5 7$PropData = "PropList.txt";
7c6f5cd2 8
1ac13f9a 9my $UnicodeLastHex = '10FFFF';
11695a73 10
a0ed51b3 11# Note: we try to keep filenames unique within first 8 chars. Using
12# subdirectories for the following helps.
289d4f09 13mkdir "In", 0755;
14mkdir "Is", 0755;
15mkdir "To", 0755;
a0ed51b3 16
17@todo = (
18# typical
19
f59877d4 20 # 005F: SPACING UNDERSCROE
21 ['IsWord', '$cat =~ /^[LMN]/ or $code eq "005F"', ''],
22 ['IsAlnum', '$cat =~ /^[LMN]/', ''],
23 ['IsAlpha', '$cat =~ /^[LM]/', ''],
24 # 0009: HORIZONTAL TABULATION
25 # 000A: LINE FEED
26 # 000B: VERTICAL TABULATION
27 # 000C: FORM FEED
28 # 000D: CARRIAGE RETURN
3bec3564 29 # 0020: SPACE
f59877d4 30 ['IsSpace', '$cat =~ /^Z/ ||
31 $code =~ /^(0009|000A|000B|000C|000D)$/', ''],
3bec3564 32 ['IsSpacePerl',
33 '$cat =~ /^Z/ ||
34 $code =~ /^(0009|000A|000C|000D)$/', ''],
f5868911 35 ['IsBlank', '$code =~ /^(0020|0009)$/ ||
36 $cat =~ /^Z[^lp]$/', ''],
b8c5462f 37 ['IsDigit', '$cat =~ /^Nd$/', ''],
7c6f5cd2 38 ['IsUpper', '$cat =~ /^L[ut]$/', ''],
b8c5462f 39 ['IsLower', '$cat =~ /^Ll$/', ''],
f59877d4 40 ['IsASCII', '$code le "007f"', ''],
b8c5462f 41 ['IsCntrl', '$cat =~ /^C/', ''],
f59877d4 42 ['IsGraph', '$cat =~ /^([LMNPS]|Co)/', ''],
43 ['IsPrint', '$cat =~ /^([LMNPS]|Co|Zs)/', ''],
44 ['IsPunct', '$cat =~ /^P/', ''],
45 # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f
b8c5462f 46 ['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''],
47 ['ToUpper', '$up', '$up'],
48 ['ToLower', '$down', '$down'],
49 ['ToTitle', '$title', '$title'],
50 ['ToDigit', '$dec ne ""', '$dec'],
a0ed51b3 51
52# Name
53
54 ['Name', '$name', '$name'],
55
56# Category
57
58 ['Category', '$cat', '$cat'],
59
60# Normative
61
62 ['IsM', '$cat =~ /^M/', ''], # Mark
63 ['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing
64 ['IsMc', '$cat eq "Mc"', ''], # Mark, Combining
7c6f5cd2 65 ['IsMe', '$cat eq "Me"', ''], # Mark, Enclosing
a0ed51b3 66
67 ['IsN', '$cat =~ /^N/', ''], # Number
68 ['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit
69 ['IsNo', '$cat eq "No"', ''], # Number, Other
7c6f5cd2 70 ['IsNl', '$cat eq "Nl"', ''], # Number, Letter
a0ed51b3 71
7c6f5cd2 72 ['IsZ', '$cat =~ /^Z/', ''], # Separator
a0ed51b3 73 ['IsZs', '$cat eq "Zs"', ''], # Separator, Space
74 ['IsZl', '$cat eq "Zl"', ''], # Separator, Line
75 ['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph
76
77 ['IsC', '$cat =~ /^C/', ''], # Crazy
78 ['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format
79 ['IsCo', '$cat eq "Co"', ''], # Other, Private Use
80 ['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned
7c6f5cd2 81 ['IsCf', '$cat eq "Cf"', ''], # Other, Format
82 ['IsCs', '$cat eq "Cs"', ''], # Other, Surrogate
83 ['IsCn', 'Unassigned Code Value',$PropData], # Other, Not Assigned
a0ed51b3 84
85# Informative
86
87 ['IsL', '$cat =~ /^L/', ''], # Letter
88 ['IsLu', '$cat eq "Lu"', ''], # Letter, Uppercase
89 ['IsLl', '$cat eq "Ll"', ''], # Letter, Lowercase
90 ['IsLt', '$cat eq "Lt"', ''], # Letter, Titlecase
91 ['IsLm', '$cat eq "Lm"', ''], # Letter, Modifier
92 ['IsLo', '$cat eq "Lo"', ''], # Letter, Other
93
94 ['IsP', '$cat =~ /^P/', ''], # Punctuation
95 ['IsPd', '$cat eq "Pd"', ''], # Punctuation, Dash
96 ['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open
97 ['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close
98 ['IsPo', '$cat eq "Po"', ''], # Punctuation, Other
7c6f5cd2 99 ['IsPc', '$cat eq "Pc"', ''], # Punctuation, Connector
100 ['IsPi', '$cat eq "Pi"', ''], # Punctuation, Initial quote
101 ['IsPf', '$cat eq "Pf"', ''], # Punctuation, Final quote
a0ed51b3 102
103 ['IsS', '$cat =~ /^S/', ''], # Symbol
104 ['IsSm', '$cat eq "Sm"', ''], # Symbol, Math
7c6f5cd2 105 ['IsSk', '$cat eq "Sk"', ''], # Symbol, Modifier
a0ed51b3 106 ['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency
107 ['IsSo', '$cat eq "So"', ''], # Symbol, Other
108
109# Combining class
110 ['CombiningClass', '$comb', '$comb'],
111
112# BIDIRECTIONAL PROPERTIES
113
114 ['Bidirectional', '$bid', '$bid'],
115
116# Strong types:
117
118 ['IsBidiL', '$bid eq "L"', ''], # Left-Right; Most alphabetic,
119 # syllabic, and logographic
120 # characters (e.g., CJK
121 # ideographs)
122 ['IsBidiR', '$bid eq "R"', ''], # Right-Left; Arabic, Hebrew,
123 # and punctuation specific to
124 # those scripts
125
7c6f5cd2 126 ['IsBidiLRE', '$bid eq "LRE"', ''], # Left-to-Right Embedding
127 ['IsBidiLRO', '$bid eq "LRO"', ''], # Left-to-Right Override
128 ['IsBidiAL', '$bid eq "AL"', ''], # Right-to-Left Arabic
129 ['IsBidiRLE', '$bid eq "RLE"', ''], # Right-to-Left Embedding
130 ['IsBidiRLO', '$bid eq "RLO"', ''], # Right-to-Left Override
131 ['IsBidiPDF', '$bid eq "PDF"', ''], # Pop Directional Format
132 ['IsBidiNSM', '$bid eq "NSM"', ''], # Non-Spacing Mark
133 ['IsBidiBN', '$bid eq "BN"', ''], # Boundary Neutral
134
a0ed51b3 135# Weak types:
136
137 ['IsBidiEN','$bid eq "EN"', ''], # European Number
138 ['IsBidiES','$bid eq "ES"', ''], # European Number Separator
139 ['IsBidiET','$bid eq "ET"', ''], # European Number Terminator
140 ['IsBidiAN','$bid eq "AN"', ''], # Arabic Number
141 ['IsBidiCS','$bid eq "CS"', ''], # Common Number Separator
142
143# Separators:
144
145 ['IsBidiB', '$bid eq "B"', ''], # Block Separator
146 ['IsBidiS', '$bid eq "S"', ''], # Segment Separator
147
148# Neutrals:
149
150 ['IsBidiWS','$bid eq "WS"', ''], # Whitespace
151 ['IsBidiON','$bid eq "ON"', ''], # Other Neutrals ; All other
152 # characters: punctuation,
153 # symbols
154
155# Decomposition
156
157 ['Decomposition', '$decomp', '$decomp'],
158 ['IsDecoCanon', '$decomp && $decomp !~ /^</', ''],
159 ['IsDecoCompat', '$decomp =~ /^</', ''],
160 ['IsDCfont', '$decomp =~ /^<font>/', ''],
161 ['IsDCnoBreak', '$decomp =~ /^<noBreak>/', ''],
162 ['IsDCinitial', '$decomp =~ /^<initial>/', ''],
f59877d4 163 ['IsDCmedial', '$decomp =~ /^<medial>/', ''],
a0ed51b3 164 ['IsDCfinal', '$decomp =~ /^<final>/', ''],
165 ['IsDCisolated', '$decomp =~ /^<isolated>/', ''],
166 ['IsDCcircle', '$decomp =~ /^<circle>/', ''],
167 ['IsDCsuper', '$decomp =~ /^<super>/', ''],
168 ['IsDCsub', '$decomp =~ /^<sub>/', ''],
169 ['IsDCvertical', '$decomp =~ /^<vertical>/', ''],
170 ['IsDCwide', '$decomp =~ /^<wide>/', ''],
171 ['IsDCnarrow', '$decomp =~ /^<narrow>/', ''],
172 ['IsDCsmall', '$decomp =~ /^<small>/', ''],
173 ['IsDCsquare', '$decomp =~ /^<square>/', ''],
7c6f5cd2 174 ['IsDCfraction', '$decomp =~ /^<fraction>/', ''],
a0ed51b3 175 ['IsDCcompat', '$decomp =~ /^<compat>/', ''],
176
177# Number
178
e1b504a6 179 ['Number', '$num ne ""', '$num'],
a0ed51b3 180
181# Mirrored
182
183 ['IsMirrored', '$mir eq "Y"', ''],
184
185# Arabic
186
187 ['ArabLink', '1', '$link'],
188 ['ArabLnkGrp', '1', '$linkgroup'],
189
190# Jamo
191
192 ['JamoShort', '1', '$short'],
499bfa7a 193
194# Syllables
195
7c6f5cd2 196 syllable_defs(),
197
a77b4ae5 198# Line break properties - Normative
199
200 ['IsLbrkBK','$brk eq "BK"', ''], # Mandatory Break
201 ['IsLbrkCR','$brk eq "CR"', ''], # Carriage Return
202 ['IsLbrkLF','$brk eq "LF"', ''], # Line Feed
203 ['IsLbrkCM','$brk eq "CM"', ''], # Attached Characters and Combining Marks
204 ['IsLbrkSG','$brk eq "SG"', ''], # Surrogates
205 ['IsLbrkGL','$brk eq "GL"', ''], # Non-breaking (Glue)
206 ['IsLbrkCB','$brk eq "CB"', ''], # Contingent Break Opportunity
207 ['IsLbrkSP','$brk eq "SP"', ''], # Space
208 ['IsLbrkZW','$brk eq "ZW"', ''], # Zero Width Space
209
210# Line break properties - Informative
211 ['IsLbrkXX','$brk eq "XX"', ''], # Unknown
212 ['IsLbrkOP','$brk eq "OP"', ''], # Opening Punctuation
213 ['IsLbrkCL','$brk eq "CL"', ''], # Closing Punctuation
214 ['IsLbrkQU','$brk eq "QU"', ''], # Ambiguous Quotation
215 ['IsLbrkNS','$brk eq "NS"', ''], # Non Starter
216 ['IsLbrkEX','$brk eq "EX"', ''], # Exclamation/Interrogation
217 ['IsLbrkSY','$brk eq "SY"', ''], # Symbols Allowing Breaks
218 ['IsLbrkIS','$brk eq "IS"', ''], # Infix Separator (Numeric)
219 ['IsLbrkPR','$brk eq "PR"', ''], # Prefix (Numeric)
220 ['IsLbrkPO','$brk eq "PO"', ''], # Postfix (Numeric)
221 ['IsLbrkNU','$brk eq "NU"', ''], # Numeric
222 ['IsLbrkAL','$brk eq "AL"', ''], # Ordinary Alphabetic and Symbol Characters
223 ['IsLbrkID','$brk eq "ID"', ''], # Ideographic
224 ['IsLbrkIN','$brk eq "IN"', ''], # Inseparable
225 ['IsLbrkHY','$brk eq "HY"', ''], # Hyphen
226 ['IsLbrkBB','$brk eq "BB"', ''], # Break Opportunity Before
227 ['IsLbrkBA','$brk eq "BA"', ''], # Break Opportunity After
228 ['IsLbrkSA','$brk eq "SA"', ''], # Complex Context (South East Asian)
229 ['IsLbrkAI','$brk eq "AI"', ''], # Ambiguous (Alphabetic or Ideographic)
230 ['IsLbrkB2','$brk eq "B2"', ''], # Break Opportunity Before and After
a0ed51b3 231);
232
233# This is not written for speed...
234
d9efae67 235my %InIdScript;
236my %InIdBlock;
9fdf68be 237my $InId = 0;
238
a0ed51b3 239foreach $file (@todo) {
240 my ($table, $wanted, $val) = @$file;
241 next if @ARGV and not grep { $_ eq $table } @ARGV;
9fdf68be 242 print $table, "\n";
243 $table =~ s/\W+//g;
2796c109 244 if ($table =~ /^(Is|To)(.+)/) {
a0ed51b3 245 open(OUT, ">$1/$2.pl") or die "Can't create $1/$2.pl: $!\n";
246 }
247 else {
248 open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n";
249 }
14055466 250 print OUT <<EOH;
251# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
252# This file is built by $0 from e.g. $UnicodeData.
253# Any changes made here will be lost!
254EOH
a0ed51b3 255 print OUT <<"END";
256return <<'END';
257END
258 print OUT proplist($table, $wanted, $val);
259 print OUT "END\n";
260 close OUT;
261}
262
2796c109 263print "Scripts\n";
264open(UD, 'Scripts.txt') or die "Can't open Scripts.txt: $!\n";
265open(OUT, ">Scripts.pl") or die "Can't create Scripts.pl: $!\n";
266print OUT <<EOH;
267# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
268# This file is built by $0 from e.g. $UnicodeData.
269# Any changes made here will be lost!
270EOH
271print OUT <<"END";
272return <<'END';
273END
274
275my %Scripts;
c602af67 276my $ScriptsVec = '';
277my $lastlast = 0;
2796c109 278
279while (<UD>) {
280 next if /^#/;
281 next if /^$/;
282 chomp;
5f9563ea 283 ($code, $last, $name) = /^([0-9a-f]+)(?:\.\.([0-9a-f]+))?\s+;\s+(.+)\s+\#/i;
2796c109 284 if ($name) {
d9efae67 285 my $InName = $name;
2796c109 286 my $id;
d9efae67 287 unless (exists $InIdScript{$InName}) {
2796c109 288 print "\t$InName\n";
d9efae67 289 $id = $Scripts{$InName} = $InIdScript{$InName} = $InId++;
2796c109 290 open(SCRIPT, ">In/$id.pl") or die "create In/$id.pl: $!\n";
291 print SCRIPT <<EOH;
292# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
293# This file is built by $0 from e.g. $UnicodeData.
294# Any changes made here will be lost!
295return <<'END';
296EOH
297 close(SCRIPT);
298 } else {
d9efae67 299 $id = $InIdScript{$InName};
2796c109 300 }
5f9563ea 301 $last = "" unless defined $last;
d9efae67 302 print OUT "$code\t$last\t$name\t# In/$id.pl\n";
2796c109 303 open(SCRIPT, ">>In/$id.pl");
304 print SCRIPT <<END;
305$code $last
306END
307 close SCRIPT;
308 }
c602af67 309 my $firsti = hex($code);
310 my $lasti = $last ? hex($last) : $firsti;
311 for my $i ($firsti..$lasti) {
312 vec($ScriptsVec, $i, 1) = 1;
313 }
314 $lastlast = $lasti if $lasti > $lastlast;
315 print "\t\t$code..$last\n";
2796c109 316}
317
d9efae67 318for my $id (values %InIdScript) {
2796c109 319 open(SCRIPT, ">>In/$id.pl");
320 print SCRIPT <<END2;
321END
322END2
323 close(SCRIPT);
324}
325
326print OUT "END\n";
327close OUT;
328
a0ed51b3 329# Must treat blocks specially.
330
331exit if @ARGV and not grep { $_ eq Block } @ARGV;
9fdf68be 332print "Blocks\n";
7c6f5cd2 333open(UD, 'Blocks.txt') or die "Can't open Blocks.txt: $!\n";
9fdf68be 334open(OUT, ">Blocks.pl") or die "Can't create Blocks.pl: $!\n";
14055466 335print OUT <<EOH;
336# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
337# This file is built by $0 from e.g. $UnicodeData.
338# Any changes made here will be lost!
339EOH
a0ed51b3 340print OUT <<"END";
341return <<'END';
342END
343
344while (<UD>) {
345 next if /^#/;
346 next if /^$/;
347 chomp;
d9efae67 348 ($code, $last, $name) = /^([0-9a-f]+)\.\.([0-9a-f]+); (.+?)\s*$/i;
a0ed51b3 349 if ($name) {
2796c109 350 my $InName = $name;
2796c109 351 print "\t$InName\n";
9fdf68be 352 my $id;
2796c109 353 # TODO: only the first one of Private Use blocks qualifies
d9efae67 354 unless (exists $InIdBlock{$InName}) {
355 $InIdBlock{$InName} = $InId++;
9fdf68be 356 }
d9efae67 357 $id = $InIdBlock{$InName};
2796c109 358 open(BLOCK, ">In/$id.pl") or die "create In/$id.pl: $!\n";
d9efae67 359 print OUT "$code\t$last\t$name\t# In/$id.pl\n";
14055466 360 print BLOCK <<EOH;
361# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
362# This file is built by $0 from e.g. $UnicodeData.
363# Any changes made here will be lost!
364EOH
a0ed51b3 365 print BLOCK <<"END2";
366return <<'END';
367$code $last
368END
369END2
370 close BLOCK;
371 }
372}
373
374print OUT "END\n";
375close OUT;
376
1ac13f9a 377#
378# \p{Common} is any code point not assigned to a script
379#
380
381my $first;
382
c8b5a1e3 383sub flush_zero_range {
384 my ($i) = @_;
1ac13f9a 385 if (defined $first) {
386 my $last = $i - 1;
387 $last = $last == $first ? "" : sprintf("%04x", $last);
c8b5a1e3 388 printf SCRIPT "%04x\t$last\n", $first;
1ac13f9a 389 printf "\t\t%04x..$last\n", $first;
390 undef $first;
391 }
392}
393
c602af67 394print "\tCommon\n";
395my $CommonId = $Scripts{Common} = $InIdScript{Common} = $InId++;
396open(SCRIPT, ">In/$CommonId.pl") or die "create In/$CommonId.pl: $!\n";
397print SCRIPT <<EOH;
398# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
399# This file is built by $0 from e.g. $UnicodeData.
400# Any changes made here will be lost!
401return <<'END';
402EOH
1ac13f9a 403
404undef $first;
c602af67 405for my $i (0..$lastlast) {
406 if (vec($ScriptsVec, $i, 1)) {
c8b5a1e3 407 defined $first && flush_zero_range($i);
1ac13f9a 408 } else {
409 $first = $i unless defined $first;
410 }
411}
c8b5a1e3 412flush_zero_range($lastlast+1);
1ac13f9a 413print SCRIPT "END\n";
414close(SCRIPT);
415
416#
417# \p{Any} is 0..10FFFF (in Unicode 3.1.1)
418#
419
420print "\tAny\n";
421my $AnyId = $Scripts{Any} = $InIdScript{Any} = $InId++;
422open(SCRIPT, ">In/$AnyId.pl") or die "create In/$AnyId.pl: $!\n";
423print SCRIPT <<EOH;
424# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
425# This file is built by $0 from e.g. $UnicodeData.
426# Any changes made here will be lost!
427return <<END;
4280000 $UnicodeLastHex
429END
430EOH
431
432my $CnVec = '';
433
434open(UD, 'PropList.txt') or die "Can't open PropList.txt: $!\n";
435
436my $InIdProp;
437while (<UD>) {
438 next if /^#/;
439 next if /^$/;
440 chomp;
441 ($code, $last, $name) = /^([0-9a-f]+)(?:\.\.([0-9a-f]+))?\s+; (\w+)\s/i;
442 $last = "" unless defined $last;
443 if ($name) {
444 my $InName = $name;
445 my $id;
446 unless (exists $InIdScript{$InName}) {
447 print "\t$InName\n";
448 print PROP <<EOH if defined $InIdProp;
449END
450EOH
451 $id = $InIdProp = $InIdScript{$InName} = $InId++;
452 open(PROP, ">In/$id.pl") or die "create In/$id.pl: $!\n";
453 print PROP <<EOH;
454# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
455# This file is built by $0 from e.g. $UnicodeData.
456# Any changes made here will be lost!
457return <<END;
458EOH
459 }
460 $id = $InIdScript{$InName};
461 print PROP "\L$code\t\L$last\n";
c8b5a1e3 462 if ($InName eq 'Noncharacter_Code_Point') {
1ac13f9a 463 my $firsti = hex($code);
464 my $lasti = $last ? hex($last) : $firsti;
c8b5a1e3 465 for my $i ($firsti..$lasti) {
1ac13f9a 466 vec($CnVec, $i, 1) = 1;
467 }
468 }
469 }
470}
c8b5a1e3 471print PROP "END\n";
1ac13f9a 472
473print "\tAssigned\n";
474my $AssignedId = $Scripts{Assigned} = $InIdScript{Assigned} = $InId++;
475open(SCRIPT, ">In/$AssignedId.pl") or die "create In/$AssignedId.pl: $!\n";
476print SCRIPT <<EOH;
477# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
478# This file is built by $0 from e.g. $UnicodeData.
479# Any changes made here will be lost!
480return <<'END';
481EOH
482
483undef $first;
484for my $i (0..hex($UnicodeLastHex)) {
485 if (vec($CnVec, $i, 1)) {
c8b5a1e3 486 defined $first && flush_zero_range($i);
c602af67 487 } else {
488 $first = $i unless defined $first;
489 }
490}
c8b5a1e3 491flush_zero_range(hex($UnicodeLastHex)+1);
c602af67 492print SCRIPT "END\n";
c602af67 493
1ac13f9a 494#
495# \p{Alphabetic} is \pL and \p{Other_Alphabetic}
496#
497
498print "\tAlphabetic\n";
499my @Alphabetic;
500push @Alphabetic, split(/\n/, do "Is/L.pl");
501push @Alphabetic, split(/\n/, do "In/$InIdScript{Other_Alphabetic}.pl");
502$id = $InIdScript{Alphabetic} = $InId++;
503open(PROP, ">In/$id.pl") or die "create In/$id.pl: $!\n";
504print PROP <<EOH;
505# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
506# This file is built by $0 from e.g. $UnicodeData.
507# Any changes made here will be lost!
508return <<END;
509EOH
510for (sort { hex($a) <=> hex($b) } @Alphabetic) {
511 print PROP "$_\n";
512}
513print PROP <<EOH;
514END
515EOH
516
517#
518# \p{Lowercase} is \p{Ll} and \p{Other_Lowercase}
519#
520
521print "\tLowercase\n";
522my @Lowercase;
523push @Lowercase, split(/\n/, do "Is/Ll.pl");
524push @Lowercase, split(/\n/, do "In/$InIdScript{Other_Lowercase}.pl");
525$id = $InIdScript{Lowercase} = $InId++;
526open(PROP, ">In/$id.pl") or die "create In/$id.pl: $!\n";
527print PROP <<EOH;
528# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
529# This file is built by $0 from e.g. $UnicodeData.
530# Any changes made here will be lost!
531return <<END;
532EOH
533for (sort { hex($a) <=> hex($b) } @Lowercase) {
534 print PROP "$_\n";
535}
536print PROP <<EOH;
537END
538EOH
539
540#
541# \p{Uppercase} is \p{Lu} and \p{Other_Uppercase}
542#
543
544print "\tUppercase\n";
545my @Uppercase;
546push @Uppercase, split(/\n/, do "Is/Lu.pl");
547push @Uppercase, split(/\n/, do "In/$InIdScript{Other_Uppercase}.pl");
548$id = $InIdScript{Uppercase} = $InId++;
549open(PROP, ">In/$id.pl") or die "create In/$id.pl: $!\n";
550print PROP <<EOH;
551# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
552# This file is built by $0 from e.g. $UnicodeData.
553# Any changes made here will be lost!
554return <<END;
555EOH
556for (sort { hex($a) <=> hex($b) } @Uppercase) {
557 print PROP "$_\n";
558}
559print PROP <<EOH;
560END
561EOH
562
563#
564# \p{Math} is \p{Sm} and \p{Other_Math}
565#
566
567print "\tMath\n";
568my @Math;
569push @Math, split(/\n/, do "Is/Sm.pl");
570push @Math, split(/\n/, do "In/$InIdScript{Other_Math}.pl");
571$id = $InIdScript{Math} = $InId++;
572open(PROP, ">In/$id.pl") or die "create In/$id.pl: $!\n";
573print PROP <<EOH;
574# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
575# This file is built by $0 from e.g. $UnicodeData.
576# Any changes made here will be lost!
577return <<END;
578EOH
579for (sort { hex($a) <=> hex($b) } @Math) {
580 print PROP "$_\n";
581}
582print PROP <<EOH;
583END
584EOH
585
586#
587# \p{L&} is \p{Ll}, \p{Lu} and \p{Lt}
588#
589
590print "\tLampersand\n";
591my @Lampersand;
592push @Lampersand, split(/\n/, do "Is/Ll.pl");
593push @Lampersand, split(/\n/, do "Is/Lu.pl");
594push @Lampersand, split(/\n/, do "Is/Lt.pl");
595$id = $InIdScript{Lampersand} = $InId++;
596open(PROP, ">In/$id.pl") or die "create In/$id.pl: $!\n";
597print PROP <<EOH;
598# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
599# This file is built by $0 from e.g. $UnicodeData.
600# Any changes made here will be lost!
601return <<END;
602EOH
603for (sort { hex($a) <=> hex($b) } @Lampersand) {
604 print PROP "$_\n";
605}
606print PROP <<EOH;
607END
608EOH
609
610#
611# \p{ID_Start} is \p{Ll}, \p{Lu}, \p{Lt}, \p{Lm}, \p{Lo}, and \p{Nl}
612#
613
614print "\tID_Start\n";
615my @ID_Start;
616push @ID_Start, split(/\n/, do "Is/Ll.pl");
617push @ID_Start, split(/\n/, do "Is/Lu.pl");
618push @ID_Start, split(/\n/, do "Is/Lt.pl");
619push @ID_Start, split(/\n/, do "Is/Lm.pl");
620push @ID_Start, split(/\n/, do "Is/Lo.pl");
621push @ID_Start, split(/\n/, do "Is/Nl.pl");
622$id = $InIdScript{ID_Start} = $InId++;
623open(PROP, ">In/$id.pl") or die "create In/$id.pl: $!\n";
624print PROP <<EOH;
625# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
626# This file is built by $0 from e.g. $UnicodeData.
627# Any changes made here will be lost!
628return <<END;
629EOH
630for (sort { hex($a) <=> hex($b) } @ID_Start) {
631 print PROP "$_\n";
632}
633print PROP <<EOH;
634END
635EOH
636
637#
638# \p{ID_Continue} is \p{ID_Start}, \p{Mn}, \p{Mc}, \p{Nd}, and \p{Pc}
639#
640
641print "\tID_Continue\n";
642my @ID_Continue;
643push @ID_Continue, split(/\n/, do "In/$InIdScript{ID_Start}.pl");
644push @ID_Continue, split(/\n/, do "Is/Mn.pl");
645push @ID_Continue, split(/\n/, do "Is/Mc.pl");
646push @ID_Continue, split(/\n/, do "Is/Nd.pl");
647push @ID_Continue, split(/\n/, do "Is/Pc.pl");
648$id = $InIdScript{ID_Continue} = $InId++;
649open(PROP, ">In/$id.pl") or die "create In/$id.pl: $!\n";
650print PROP <<EOH;
651# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
652# This file is built by $0 from e.g. $UnicodeData.
653# Any changes made here will be lost!
654return <<END;
655EOH
656for (sort { hex($a) <=> hex($b) } @ID_Continue) {
657 print PROP "$_\n";
658}
659print PROP <<EOH;
660END
661EOH
662
9fdf68be 663open(INID, ">In.pl");
664
665print INID <<EOH;
666# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
667# This file is built by $0 from e.g. $UnicodeData.
668# Any changes made here will be lost!
669%utf8::In = (
670EOH
671
d9efae67 672my %InIdScriptById = reverse %InIdScript;
673my %InIdBlockById = reverse %InIdBlock;
674
675my @InIdScriptById = sort { $a <=> $b } keys %InIdScriptById;
676my @InIdBlockById = sort { $a <=> $b } keys %InIdBlockById;
677
678my %InId;
679my %IdIdLcName;
680
681for my $id (@InIdScriptById) {
682 my $name = $InIdScriptById{$id};
683 my $lcname = lc($name);
684 $InId{$name} = $id;
685 $IdIdLcName{$lcname} = $id;
686}
687
688for my $id (@InIdBlockById) {
689 my $name = $InIdBlockById{$id};
690 my $lcname = lc($name);
691 if (exists $IdIdLcName{$lcname}) {
692 $InId{"$name Block"} = $id;
693 } else {
694 $InId{$name} = $id;
695 }
696 $IdIdLcName{$lcname} = $id;
697}
698
699my @InId = sort { $InId{$a} <=> $InId{$b} } keys %InId;
700
701my %InIdPrefix;
702
703foreach my $in (@InId) {
704 my $inpat = $in;
f173cd49 705 $inpat =~ s/([- _])/(?:[-_]|\\s+)?/g;
c8b5a1e3 706 my $inprefix = lc(substr($in, 0, 2));
707 push @{$InIdPrefix{$inprefix}}, [ $in, $inpat ];
d9efae67 708 printf INID "%-45s => %3d,\n", "'$in'", $InId{$in};
709}
710
711print INID ");\n";
712
713print INID <<EOH;
714%utf8::InPat = (
715EOH
716
717foreach my $prefix (sort keys %InIdPrefix) {
718 printf INID "'$prefix' => {\n";
719 foreach my $ininpat (@{$InIdPrefix{$prefix}}) {
720 my ($in, $inpat) = @$ininpat;
721 printf INID "\t'$inpat' => '$in',\n";
722 }
723 printf INID "},\n";
9fdf68be 724}
725
726print INID ");\n";
727
728close(INID);
729
a0ed51b3 730##################################################
731
732sub proplist {
733 my ($table, $wanted, $val) = @_;
734 my @wanted;
735 my $out;
736 my $split;
737
7c6f5cd2 738 return listFromPropFile($wanted) if $val eq $PropData;
739
a0ed51b3 740 if ($table =~ /^Arab/) {
d357d9fe 741 open(UD, "ArabShap.txt") or warn "Can't open $table: $!";
a0ed51b3 742
743 $split = '($code, $name, $link, $linkgroup) = split(/; */);';
744 }
745 elsif ($table =~ /^Jamo/) {
505afebf 746 open(UD, "Jamo.txt") or warn "Can't open $table: $!";
a0ed51b3 747
748 $split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;';
749 }
499bfa7a 750 elsif ($table =~ /^IsSyl/) {
7c6f5cd2 751 open(UD, $SyllableData) or warn "Can't open $table: $!";
499bfa7a 752
753 $split = '($code, $short, $syl) = split(/; */); $code =~ s/^U\+//;';
754 }
a77b4ae5 755 elsif ($table =~ /^IsLbrk/) {
756 open(UD, "LineBrk.txt") or warn "Can't open $table: $!";
757
50fc4248 758 $split = '($code, $brk, $name) = /^([0-9a-f]+);(\w+) # (.+)/i;';
a77b4ae5 759 }
a0ed51b3 760 else {
11695a73 761 open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!";
a0ed51b3 762
763 $split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1,
764 $comment, $up, $down, $title) = split(/;/);';
765 }
766
767 if ($table =~ /^(?:To|Is)[A-Z]/) {
768 eval <<"END";
769 while (<UD>) {
770 next if /^#/;
7c6f5cd2 771 next if /^\\s/;
772 s/\\s+\$//;
a0ed51b3 773 $split
774 if ($wanted) {
775 push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]);
776 }
777 }
778END
779 die $@ if $@;
780
781 while (@wanted) {
782 $beg = shift @wanted;
783 $last = $beg;
784 while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
785 (not $val or $wanted[0]->[1] == $last->[1] + 1)) {
786 $last = shift @wanted;
787 }
788 $out .= sprintf "%04x", $beg->[0];
789 if ($beg->[2]) {
790 $last = shift @wanted;
791 }
792 if ($beg == $last) {
793 $out .= "\t";
794 }
795 else {
796 $out .= sprintf "\t%04x", $last->[0];
797 }
798 $out .= sprintf "\t%04x", $beg->[1] if $val;
799 $out .= "\n";
800 }
801 }
802 else {
803 eval <<"END";
804 while (<UD>) {
805 next if /^#/;
7c6f5cd2 806 next if /^\\s*\$/;
a0ed51b3 807 chop;
808 $split
809 if ($wanted) {
810 push(\@wanted, [hex \$code, $val, \$name =~ /, First>\$/]);
811 }
812 }
813END
814 die $@ if $@;
815
816 while (@wanted) {
817 $beg = shift @wanted;
818 $last = $beg;
819 while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
820 ($wanted[0]->[1] eq $last->[1])) {
821 $last = shift @wanted;
822 }
823 $out .= sprintf "%04x", $beg->[0];
824 if ($beg->[2]) {
825 $last = shift @wanted;
826 }
827 if ($beg == $last) {
828 $out .= "\t";
829 }
830 else {
831 $out .= sprintf "\t%04x", $last->[0];
832 }
833 $out .= sprintf "\t%s\n", $beg->[1];
834 }
835 }
836 $out;
837}
11695a73 838
7c6f5cd2 839sub listFromPropFile {
840 my ($wanted) = @_;
841 my $out;
842
843 open (UD, $PropData) or die "Can't open $PropData: $!\n";
844 local($/) = "\n" . '*' x 43 . "\n\nProperty dump for:"; # not 42?
845
846 <UD>;
847 while (<UD>) {
848 chomp;
849 if (s/0x[\d\w]+\s+\((.*?)\)// and $wanted eq $1) {
850 s/\(\d+ chars\)//g;
851 s/^\s+//mg;
852 s/\s+$//mg;
853 s/\.\./\t/g;
854 $out = lc $_;
855 last;
856 }
857 }
858 close (UD);
859 "$out\n";
860}
861
862sub syllable_defs {
863 my @defs;
864 my %seen;
865
866 open (SD, $SyllableData) or die "Can't open $SyllableData: $!\n";
867 while (<SD>) {
868 next if /^\s*(#|$)/;
869 s/\s+$//;
870 ($code, $name, $syl) = split /; */;
871 next unless $syl;
872 push (@defs, ["IsSyl$syl", qq{\$syl eq "$syl"}, ''])
873 unless $seen{$syl}++;
874 }
875 close (SD);
876 return (@defs);
877}
878
6dd159d1 879# eof