add note about the handling of negative indices to tied arrays
[p5sagit/p5-mst-13.2.git] / lib / unicode / mktables.PL
CommitLineData
a0ed51b3 1#!../../miniperl
2
505afebf 3$UnicodeData = "Unicode.300";
11695a73 4
a0ed51b3 5# Note: we try to keep filenames unique within first 8 chars. Using
6# subdirectories for the following helps.
7mkdir "In", 0777;
8mkdir "Is", 0777;
9mkdir "To", 0777;
10
11@todo = (
12# typical
13
6ab16782 14 ['IsWord', '$cat =~ /^L[ulot]|^Nd/ or $code eq "005F"', ''],
15 ['IsAlnum', '$cat =~ /^L[ulot]|^Nd/', ''],
16 ['IsAlpha', '$cat =~ /^L[ulot]/', ''],
17 # XXX broken: recursive definition (/\s/ will look up IsSpace in future)
b8c5462f 18 ['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''],
19 ['IsDigit', '$cat =~ /^Nd$/', ''],
20 ['IsUpper', '$cat =~ /^Lu$/', ''],
21 ['IsLower', '$cat =~ /^Ll$/', ''],
22 ['IsASCII', 'hex $code <= 127', ''],
23 ['IsCntrl', '$cat =~ /^C/', ''],
24 ['IsGraph', '$cat =~ /^[^C]/ and $code ne "0020"', ''],
25 ['IsPrint', '$cat =~ /^[^C]/', ''],
26 ['IsPunct', '$cat =~ /^P/', ''],
27 ['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''],
28 ['ToUpper', '$up', '$up'],
29 ['ToLower', '$down', '$down'],
30 ['ToTitle', '$title', '$title'],
31 ['ToDigit', '$dec ne ""', '$dec'],
a0ed51b3 32
33# Name
34
35 ['Name', '$name', '$name'],
36
37# Category
38
39 ['Category', '$cat', '$cat'],
40
41# Normative
42
43 ['IsM', '$cat =~ /^M/', ''], # Mark
44 ['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing
45 ['IsMc', '$cat eq "Mc"', ''], # Mark, Combining
46
47 ['IsN', '$cat =~ /^N/', ''], # Number
48 ['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit
49 ['IsNo', '$cat eq "No"', ''], # Number, Other
50
51 ['IsZ', '$cat =~ /^Z/', ''], # Zeparator
52 ['IsZs', '$cat eq "Zs"', ''], # Separator, Space
53 ['IsZl', '$cat eq "Zl"', ''], # Separator, Line
54 ['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph
55
56 ['IsC', '$cat =~ /^C/', ''], # Crazy
57 ['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format
58 ['IsCo', '$cat eq "Co"', ''], # Other, Private Use
59 ['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned
60
61# Informative
62
63 ['IsL', '$cat =~ /^L/', ''], # Letter
64 ['IsLu', '$cat eq "Lu"', ''], # Letter, Uppercase
65 ['IsLl', '$cat eq "Ll"', ''], # Letter, Lowercase
66 ['IsLt', '$cat eq "Lt"', ''], # Letter, Titlecase
67 ['IsLm', '$cat eq "Lm"', ''], # Letter, Modifier
68 ['IsLo', '$cat eq "Lo"', ''], # Letter, Other
69
70 ['IsP', '$cat =~ /^P/', ''], # Punctuation
71 ['IsPd', '$cat eq "Pd"', ''], # Punctuation, Dash
72 ['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open
73 ['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close
74 ['IsPo', '$cat eq "Po"', ''], # Punctuation, Other
75
76 ['IsS', '$cat =~ /^S/', ''], # Symbol
77 ['IsSm', '$cat eq "Sm"', ''], # Symbol, Math
78 ['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency
79 ['IsSo', '$cat eq "So"', ''], # Symbol, Other
80
81# Combining class
82 ['CombiningClass', '$comb', '$comb'],
83
84# BIDIRECTIONAL PROPERTIES
85
86 ['Bidirectional', '$bid', '$bid'],
87
88# Strong types:
89
90 ['IsBidiL', '$bid eq "L"', ''], # Left-Right; Most alphabetic,
91 # syllabic, and logographic
92 # characters (e.g., CJK
93 # ideographs)
94 ['IsBidiR', '$bid eq "R"', ''], # Right-Left; Arabic, Hebrew,
95 # and punctuation specific to
96 # those scripts
97
98# Weak types:
99
100 ['IsBidiEN','$bid eq "EN"', ''], # European Number
101 ['IsBidiES','$bid eq "ES"', ''], # European Number Separator
102 ['IsBidiET','$bid eq "ET"', ''], # European Number Terminator
103 ['IsBidiAN','$bid eq "AN"', ''], # Arabic Number
104 ['IsBidiCS','$bid eq "CS"', ''], # Common Number Separator
105
106# Separators:
107
108 ['IsBidiB', '$bid eq "B"', ''], # Block Separator
109 ['IsBidiS', '$bid eq "S"', ''], # Segment Separator
110
111# Neutrals:
112
113 ['IsBidiWS','$bid eq "WS"', ''], # Whitespace
114 ['IsBidiON','$bid eq "ON"', ''], # Other Neutrals ; All other
115 # characters: punctuation,
116 # symbols
117
118# Decomposition
119
120 ['Decomposition', '$decomp', '$decomp'],
121 ['IsDecoCanon', '$decomp && $decomp !~ /^</', ''],
122 ['IsDecoCompat', '$decomp =~ /^</', ''],
123 ['IsDCfont', '$decomp =~ /^<font>/', ''],
124 ['IsDCnoBreak', '$decomp =~ /^<noBreak>/', ''],
125 ['IsDCinitial', '$decomp =~ /^<initial>/', ''],
126 ['IsDCinital', '$decomp =~ /^<medial>/', ''],
127 ['IsDCfinal', '$decomp =~ /^<final>/', ''],
128 ['IsDCisolated', '$decomp =~ /^<isolated>/', ''],
129 ['IsDCcircle', '$decomp =~ /^<circle>/', ''],
130 ['IsDCsuper', '$decomp =~ /^<super>/', ''],
131 ['IsDCsub', '$decomp =~ /^<sub>/', ''],
132 ['IsDCvertical', '$decomp =~ /^<vertical>/', ''],
133 ['IsDCwide', '$decomp =~ /^<wide>/', ''],
134 ['IsDCnarrow', '$decomp =~ /^<narrow>/', ''],
135 ['IsDCsmall', '$decomp =~ /^<small>/', ''],
136 ['IsDCsquare', '$decomp =~ /^<square>/', ''],
137 ['IsDCcompat', '$decomp =~ /^<compat>/', ''],
138
139# Number
140
141 ['Number', '$num', '$num'],
142
143# Mirrored
144
145 ['IsMirrored', '$mir eq "Y"', ''],
146
147# Arabic
148
149 ['ArabLink', '1', '$link'],
150 ['ArabLnkGrp', '1', '$linkgroup'],
151
152# Jamo
153
154 ['JamoShort', '1', '$short'],
499bfa7a 155
156# Syllables
157
158 ['IsSylV', '$syl eq "V"', ''],
159 ['IsSylU', '$syl eq "U"', ''],
160 ['IsSylI', '$syl eq "I"', ''],
161 ['IsSylA', '$syl eq "A"', ''],
162 ['IsSylE', '$syl eq "E"', ''],
163 ['IsSylC', '$syl eq "C"', ''],
164 ['IsSylO', '$syl eq "O"', ''],
165 ['IsSylWV', '$syl eq "V"', ''],
166 ['IsSylWI', '$syl eq "I"', ''],
167 ['IsSylWA', '$syl eq "A"', ''],
168 ['IsSylWE', '$syl eq "E"', ''],
169 ['IsSylWC', '$syl eq "C"', ''],
a77b4ae5 170
171# Line break properties - Normative
172
173 ['IsLbrkBK','$brk eq "BK"', ''], # Mandatory Break
174 ['IsLbrkCR','$brk eq "CR"', ''], # Carriage Return
175 ['IsLbrkLF','$brk eq "LF"', ''], # Line Feed
176 ['IsLbrkCM','$brk eq "CM"', ''], # Attached Characters and Combining Marks
177 ['IsLbrkSG','$brk eq "SG"', ''], # Surrogates
178 ['IsLbrkGL','$brk eq "GL"', ''], # Non-breaking (Glue)
179 ['IsLbrkCB','$brk eq "CB"', ''], # Contingent Break Opportunity
180 ['IsLbrkSP','$brk eq "SP"', ''], # Space
181 ['IsLbrkZW','$brk eq "ZW"', ''], # Zero Width Space
182
183# Line break properties - Informative
184 ['IsLbrkXX','$brk eq "XX"', ''], # Unknown
185 ['IsLbrkOP','$brk eq "OP"', ''], # Opening Punctuation
186 ['IsLbrkCL','$brk eq "CL"', ''], # Closing Punctuation
187 ['IsLbrkQU','$brk eq "QU"', ''], # Ambiguous Quotation
188 ['IsLbrkNS','$brk eq "NS"', ''], # Non Starter
189 ['IsLbrkEX','$brk eq "EX"', ''], # Exclamation/Interrogation
190 ['IsLbrkSY','$brk eq "SY"', ''], # Symbols Allowing Breaks
191 ['IsLbrkIS','$brk eq "IS"', ''], # Infix Separator (Numeric)
192 ['IsLbrkPR','$brk eq "PR"', ''], # Prefix (Numeric)
193 ['IsLbrkPO','$brk eq "PO"', ''], # Postfix (Numeric)
194 ['IsLbrkNU','$brk eq "NU"', ''], # Numeric
195 ['IsLbrkAL','$brk eq "AL"', ''], # Ordinary Alphabetic and Symbol Characters
196 ['IsLbrkID','$brk eq "ID"', ''], # Ideographic
197 ['IsLbrkIN','$brk eq "IN"', ''], # Inseparable
198 ['IsLbrkHY','$brk eq "HY"', ''], # Hyphen
199 ['IsLbrkBB','$brk eq "BB"', ''], # Break Opportunity Before
200 ['IsLbrkBA','$brk eq "BA"', ''], # Break Opportunity After
201 ['IsLbrkSA','$brk eq "SA"', ''], # Complex Context (South East Asian)
202 ['IsLbrkAI','$brk eq "AI"', ''], # Ambiguous (Alphabetic or Ideographic)
203 ['IsLbrkB2','$brk eq "B2"', ''], # Break Opportunity Before and After
a0ed51b3 204);
205
206# This is not written for speed...
207
208foreach $file (@todo) {
209 my ($table, $wanted, $val) = @$file;
210 next if @ARGV and not grep { $_ eq $table } @ARGV;
211 print $table,"\n";
212 if ($table =~ /^(Is|In|To)(.*)/) {
213 open(OUT, ">$1/$2.pl") or die "Can't create $1/$2.pl: $!\n";
214 }
215 else {
216 open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n";
217 }
14055466 218 print OUT <<EOH;
219# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
220# This file is built by $0 from e.g. $UnicodeData.
221# Any changes made here will be lost!
222EOH
a0ed51b3 223 print OUT <<"END";
224return <<'END';
225END
226 print OUT proplist($table, $wanted, $val);
227 print OUT "END\n";
228 close OUT;
229}
230
231# Must treat blocks specially.
232
233exit if @ARGV and not grep { $_ eq Block } @ARGV;
234print "Block\n";
d357d9fe 235open(UD, 'Blocks.txt') or die "Can't open blocks.txt: $!\n";
a0ed51b3 236open(OUT, ">Block.pl") or die "Can't create $table.pl: $!\n";
14055466 237print OUT <<EOH;
238# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
239# This file is built by $0 from e.g. $UnicodeData.
240# Any changes made here will be lost!
241EOH
a0ed51b3 242print OUT <<"END";
243return <<'END';
244END
245
246while (<UD>) {
247 next if /^#/;
248 next if /^$/;
249 chomp;
250 ($code, $last, $name) = split(/; */);
251 if ($name) {
252 print OUT "$code $last $name\n";
253 $name =~ s/\s+//g;
254 open(BLOCK, ">In/$name.pl");
14055466 255 print BLOCK <<EOH;
256# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
257# This file is built by $0 from e.g. $UnicodeData.
258# Any changes made here will be lost!
259EOH
a0ed51b3 260 print BLOCK <<"END2";
261return <<'END';
262$code $last
263END
264END2
265 close BLOCK;
266 }
267}
268
269print OUT "END\n";
270close OUT;
271
272##################################################
273
274sub proplist {
275 my ($table, $wanted, $val) = @_;
276 my @wanted;
277 my $out;
278 my $split;
279
280 if ($table =~ /^Arab/) {
d357d9fe 281 open(UD, "ArabShap.txt") or warn "Can't open $table: $!";
a0ed51b3 282
283 $split = '($code, $name, $link, $linkgroup) = split(/; */);';
284 }
285 elsif ($table =~ /^Jamo/) {
505afebf 286 open(UD, "Jamo.txt") or warn "Can't open $table: $!";
a0ed51b3 287
288 $split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;';
289 }
499bfa7a 290 elsif ($table =~ /^IsSyl/) {
291 open(UD, "syllables.txt") or warn "Can't open $table: $!";
292
293 $split = '($code, $short, $syl) = split(/; */); $code =~ s/^U\+//;';
294 }
a77b4ae5 295 elsif ($table =~ /^IsLbrk/) {
296 open(UD, "LineBrk.txt") or warn "Can't open $table: $!";
297
298 $split = '($code, $brk, $name) = split(/;/);';
299 }
a0ed51b3 300 else {
11695a73 301 open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!";
a0ed51b3 302
303 $split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1,
304 $comment, $up, $down, $title) = split(/;/);';
305 }
306
307 if ($table =~ /^(?:To|Is)[A-Z]/) {
308 eval <<"END";
309 while (<UD>) {
310 next if /^#/;
311 next if /^\s/;
312 chop;
313 $split
314 if ($wanted) {
315 push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]);
316 }
317 }
318END
319 die $@ if $@;
320
321 while (@wanted) {
322 $beg = shift @wanted;
323 $last = $beg;
324 while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
325 (not $val or $wanted[0]->[1] == $last->[1] + 1)) {
326 $last = shift @wanted;
327 }
328 $out .= sprintf "%04x", $beg->[0];
329 if ($beg->[2]) {
330 $last = shift @wanted;
331 }
332 if ($beg == $last) {
333 $out .= "\t";
334 }
335 else {
336 $out .= sprintf "\t%04x", $last->[0];
337 }
338 $out .= sprintf "\t%04x", $beg->[1] if $val;
339 $out .= "\n";
340 }
341 }
342 else {
343 eval <<"END";
344 while (<UD>) {
345 next if /^#/;
346 next if /^\s*\$/;
347 chop;
348 $split
349 if ($wanted) {
350 push(\@wanted, [hex \$code, $val, \$name =~ /, First>\$/]);
351 }
352 }
353END
354 die $@ if $@;
355
356 while (@wanted) {
357 $beg = shift @wanted;
358 $last = $beg;
359 while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
360 ($wanted[0]->[1] eq $last->[1])) {
361 $last = shift @wanted;
362 }
363 $out .= sprintf "%04x", $beg->[0];
364 if ($beg->[2]) {
365 $last = shift @wanted;
366 }
367 if ($beg == $last) {
368 $out .= "\t";
369 }
370 else {
371 $out .= sprintf "\t%04x", $last->[0];
372 }
373 $out .= sprintf "\t%s\n", $beg->[1];
374 }
375 }
376 $out;
377}
11695a73 378
6dd159d1 379# eof