extend "Unrecognized escape" warning to \8, \9, and \_ as well
[p5sagit/p5-mst-13.2.git] / lib / unicode / mktables.PL
CommitLineData
a0ed51b3 1#!../../miniperl
2
505afebf 3$UnicodeData = "Unicode.300";
11695a73 4
a0ed51b3 5# Note: we try to keep filenames unique within first 8 chars. Using
6# subdirectories for the following helps.
7mkdir "In", 0777;
8mkdir "Is", 0777;
9mkdir "To", 0777;
10
11@todo = (
12# typical
13
b8c5462f 14 ['IsWord', '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"', ''],
15 ['IsAlnum', '$cat =~ /^L[ulo]|^Nd/', ''],
16 ['IsAlpha', '$cat =~ /^L[ulo]/', ''],
17 ['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''],
18 ['IsDigit', '$cat =~ /^Nd$/', ''],
19 ['IsUpper', '$cat =~ /^Lu$/', ''],
20 ['IsLower', '$cat =~ /^Ll$/', ''],
21 ['IsASCII', 'hex $code <= 127', ''],
22 ['IsCntrl', '$cat =~ /^C/', ''],
23 ['IsGraph', '$cat =~ /^[^C]/ and $code ne "0020"', ''],
24 ['IsPrint', '$cat =~ /^[^C]/', ''],
25 ['IsPunct', '$cat =~ /^P/', ''],
26 ['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''],
27 ['ToUpper', '$up', '$up'],
28 ['ToLower', '$down', '$down'],
29 ['ToTitle', '$title', '$title'],
30 ['ToDigit', '$dec ne ""', '$dec'],
a0ed51b3 31
32# Name
33
34 ['Name', '$name', '$name'],
35
36# Category
37
38 ['Category', '$cat', '$cat'],
39
40# Normative
41
42 ['IsM', '$cat =~ /^M/', ''], # Mark
43 ['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing
44 ['IsMc', '$cat eq "Mc"', ''], # Mark, Combining
45
46 ['IsN', '$cat =~ /^N/', ''], # Number
47 ['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit
48 ['IsNo', '$cat eq "No"', ''], # Number, Other
49
50 ['IsZ', '$cat =~ /^Z/', ''], # Zeparator
51 ['IsZs', '$cat eq "Zs"', ''], # Separator, Space
52 ['IsZl', '$cat eq "Zl"', ''], # Separator, Line
53 ['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph
54
55 ['IsC', '$cat =~ /^C/', ''], # Crazy
56 ['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format
57 ['IsCo', '$cat eq "Co"', ''], # Other, Private Use
58 ['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned
59
60# Informative
61
62 ['IsL', '$cat =~ /^L/', ''], # Letter
63 ['IsLu', '$cat eq "Lu"', ''], # Letter, Uppercase
64 ['IsLl', '$cat eq "Ll"', ''], # Letter, Lowercase
65 ['IsLt', '$cat eq "Lt"', ''], # Letter, Titlecase
66 ['IsLm', '$cat eq "Lm"', ''], # Letter, Modifier
67 ['IsLo', '$cat eq "Lo"', ''], # Letter, Other
68
69 ['IsP', '$cat =~ /^P/', ''], # Punctuation
70 ['IsPd', '$cat eq "Pd"', ''], # Punctuation, Dash
71 ['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open
72 ['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close
73 ['IsPo', '$cat eq "Po"', ''], # Punctuation, Other
74
75 ['IsS', '$cat =~ /^S/', ''], # Symbol
76 ['IsSm', '$cat eq "Sm"', ''], # Symbol, Math
77 ['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency
78 ['IsSo', '$cat eq "So"', ''], # Symbol, Other
79
80# Combining class
81 ['CombiningClass', '$comb', '$comb'],
82
83# BIDIRECTIONAL PROPERTIES
84
85 ['Bidirectional', '$bid', '$bid'],
86
87# Strong types:
88
89 ['IsBidiL', '$bid eq "L"', ''], # Left-Right; Most alphabetic,
90 # syllabic, and logographic
91 # characters (e.g., CJK
92 # ideographs)
93 ['IsBidiR', '$bid eq "R"', ''], # Right-Left; Arabic, Hebrew,
94 # and punctuation specific to
95 # those scripts
96
97# Weak types:
98
99 ['IsBidiEN','$bid eq "EN"', ''], # European Number
100 ['IsBidiES','$bid eq "ES"', ''], # European Number Separator
101 ['IsBidiET','$bid eq "ET"', ''], # European Number Terminator
102 ['IsBidiAN','$bid eq "AN"', ''], # Arabic Number
103 ['IsBidiCS','$bid eq "CS"', ''], # Common Number Separator
104
105# Separators:
106
107 ['IsBidiB', '$bid eq "B"', ''], # Block Separator
108 ['IsBidiS', '$bid eq "S"', ''], # Segment Separator
109
110# Neutrals:
111
112 ['IsBidiWS','$bid eq "WS"', ''], # Whitespace
113 ['IsBidiON','$bid eq "ON"', ''], # Other Neutrals ; All other
114 # characters: punctuation,
115 # symbols
116
117# Decomposition
118
119 ['Decomposition', '$decomp', '$decomp'],
120 ['IsDecoCanon', '$decomp && $decomp !~ /^</', ''],
121 ['IsDecoCompat', '$decomp =~ /^</', ''],
122 ['IsDCfont', '$decomp =~ /^<font>/', ''],
123 ['IsDCnoBreak', '$decomp =~ /^<noBreak>/', ''],
124 ['IsDCinitial', '$decomp =~ /^<initial>/', ''],
125 ['IsDCinital', '$decomp =~ /^<medial>/', ''],
126 ['IsDCfinal', '$decomp =~ /^<final>/', ''],
127 ['IsDCisolated', '$decomp =~ /^<isolated>/', ''],
128 ['IsDCcircle', '$decomp =~ /^<circle>/', ''],
129 ['IsDCsuper', '$decomp =~ /^<super>/', ''],
130 ['IsDCsub', '$decomp =~ /^<sub>/', ''],
131 ['IsDCvertical', '$decomp =~ /^<vertical>/', ''],
132 ['IsDCwide', '$decomp =~ /^<wide>/', ''],
133 ['IsDCnarrow', '$decomp =~ /^<narrow>/', ''],
134 ['IsDCsmall', '$decomp =~ /^<small>/', ''],
135 ['IsDCsquare', '$decomp =~ /^<square>/', ''],
136 ['IsDCcompat', '$decomp =~ /^<compat>/', ''],
137
138# Number
139
140 ['Number', '$num', '$num'],
141
142# Mirrored
143
144 ['IsMirrored', '$mir eq "Y"', ''],
145
146# Arabic
147
148 ['ArabLink', '1', '$link'],
149 ['ArabLnkGrp', '1', '$linkgroup'],
150
151# Jamo
152
153 ['JamoShort', '1', '$short'],
499bfa7a 154
155# Syllables
156
157 ['IsSylV', '$syl eq "V"', ''],
158 ['IsSylU', '$syl eq "U"', ''],
159 ['IsSylI', '$syl eq "I"', ''],
160 ['IsSylA', '$syl eq "A"', ''],
161 ['IsSylE', '$syl eq "E"', ''],
162 ['IsSylC', '$syl eq "C"', ''],
163 ['IsSylO', '$syl eq "O"', ''],
164 ['IsSylWV', '$syl eq "V"', ''],
165 ['IsSylWI', '$syl eq "I"', ''],
166 ['IsSylWA', '$syl eq "A"', ''],
167 ['IsSylWE', '$syl eq "E"', ''],
168 ['IsSylWC', '$syl eq "C"', ''],
a77b4ae5 169
170# Line break properties - Normative
171
172 ['IsLbrkBK','$brk eq "BK"', ''], # Mandatory Break
173 ['IsLbrkCR','$brk eq "CR"', ''], # Carriage Return
174 ['IsLbrkLF','$brk eq "LF"', ''], # Line Feed
175 ['IsLbrkCM','$brk eq "CM"', ''], # Attached Characters and Combining Marks
176 ['IsLbrkSG','$brk eq "SG"', ''], # Surrogates
177 ['IsLbrkGL','$brk eq "GL"', ''], # Non-breaking (Glue)
178 ['IsLbrkCB','$brk eq "CB"', ''], # Contingent Break Opportunity
179 ['IsLbrkSP','$brk eq "SP"', ''], # Space
180 ['IsLbrkZW','$brk eq "ZW"', ''], # Zero Width Space
181
182# Line break properties - Informative
183 ['IsLbrkXX','$brk eq "XX"', ''], # Unknown
184 ['IsLbrkOP','$brk eq "OP"', ''], # Opening Punctuation
185 ['IsLbrkCL','$brk eq "CL"', ''], # Closing Punctuation
186 ['IsLbrkQU','$brk eq "QU"', ''], # Ambiguous Quotation
187 ['IsLbrkNS','$brk eq "NS"', ''], # Non Starter
188 ['IsLbrkEX','$brk eq "EX"', ''], # Exclamation/Interrogation
189 ['IsLbrkSY','$brk eq "SY"', ''], # Symbols Allowing Breaks
190 ['IsLbrkIS','$brk eq "IS"', ''], # Infix Separator (Numeric)
191 ['IsLbrkPR','$brk eq "PR"', ''], # Prefix (Numeric)
192 ['IsLbrkPO','$brk eq "PO"', ''], # Postfix (Numeric)
193 ['IsLbrkNU','$brk eq "NU"', ''], # Numeric
194 ['IsLbrkAL','$brk eq "AL"', ''], # Ordinary Alphabetic and Symbol Characters
195 ['IsLbrkID','$brk eq "ID"', ''], # Ideographic
196 ['IsLbrkIN','$brk eq "IN"', ''], # Inseparable
197 ['IsLbrkHY','$brk eq "HY"', ''], # Hyphen
198 ['IsLbrkBB','$brk eq "BB"', ''], # Break Opportunity Before
199 ['IsLbrkBA','$brk eq "BA"', ''], # Break Opportunity After
200 ['IsLbrkSA','$brk eq "SA"', ''], # Complex Context (South East Asian)
201 ['IsLbrkAI','$brk eq "AI"', ''], # Ambiguous (Alphabetic or Ideographic)
202 ['IsLbrkB2','$brk eq "B2"', ''], # Break Opportunity Before and After
a0ed51b3 203);
204
205# This is not written for speed...
206
207foreach $file (@todo) {
208 my ($table, $wanted, $val) = @$file;
209 next if @ARGV and not grep { $_ eq $table } @ARGV;
210 print $table,"\n";
211 if ($table =~ /^(Is|In|To)(.*)/) {
212 open(OUT, ">$1/$2.pl") or die "Can't create $1/$2.pl: $!\n";
213 }
214 else {
215 open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n";
216 }
14055466 217 print OUT <<EOH;
218# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
219# This file is built by $0 from e.g. $UnicodeData.
220# Any changes made here will be lost!
221EOH
a0ed51b3 222 print OUT <<"END";
223return <<'END';
224END
225 print OUT proplist($table, $wanted, $val);
226 print OUT "END\n";
227 close OUT;
228}
229
230# Must treat blocks specially.
231
232exit if @ARGV and not grep { $_ eq Block } @ARGV;
233print "Block\n";
d357d9fe 234open(UD, 'Blocks.txt') or die "Can't open blocks.txt: $!\n";
a0ed51b3 235open(OUT, ">Block.pl") or die "Can't create $table.pl: $!\n";
14055466 236print OUT <<EOH;
237# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
238# This file is built by $0 from e.g. $UnicodeData.
239# Any changes made here will be lost!
240EOH
a0ed51b3 241print OUT <<"END";
242return <<'END';
243END
244
245while (<UD>) {
246 next if /^#/;
247 next if /^$/;
248 chomp;
249 ($code, $last, $name) = split(/; */);
250 if ($name) {
251 print OUT "$code $last $name\n";
252 $name =~ s/\s+//g;
253 open(BLOCK, ">In/$name.pl");
14055466 254 print BLOCK <<EOH;
255# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
256# This file is built by $0 from e.g. $UnicodeData.
257# Any changes made here will be lost!
258EOH
a0ed51b3 259 print BLOCK <<"END2";
260return <<'END';
261$code $last
262END
263END2
264 close BLOCK;
265 }
266}
267
268print OUT "END\n";
269close OUT;
270
271##################################################
272
273sub proplist {
274 my ($table, $wanted, $val) = @_;
275 my @wanted;
276 my $out;
277 my $split;
278
279 if ($table =~ /^Arab/) {
d357d9fe 280 open(UD, "ArabShap.txt") or warn "Can't open $table: $!";
a0ed51b3 281
282 $split = '($code, $name, $link, $linkgroup) = split(/; */);';
283 }
284 elsif ($table =~ /^Jamo/) {
505afebf 285 open(UD, "Jamo.txt") or warn "Can't open $table: $!";
a0ed51b3 286
287 $split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;';
288 }
499bfa7a 289 elsif ($table =~ /^IsSyl/) {
290 open(UD, "syllables.txt") or warn "Can't open $table: $!";
291
292 $split = '($code, $short, $syl) = split(/; */); $code =~ s/^U\+//;';
293 }
a77b4ae5 294 elsif ($table =~ /^IsLbrk/) {
295 open(UD, "LineBrk.txt") or warn "Can't open $table: $!";
296
297 $split = '($code, $brk, $name) = split(/;/);';
298 }
a0ed51b3 299 else {
11695a73 300 open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!";
a0ed51b3 301
302 $split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1,
303 $comment, $up, $down, $title) = split(/;/);';
304 }
305
306 if ($table =~ /^(?:To|Is)[A-Z]/) {
307 eval <<"END";
308 while (<UD>) {
309 next if /^#/;
310 next if /^\s/;
311 chop;
312 $split
313 if ($wanted) {
314 push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]);
315 }
316 }
317END
318 die $@ if $@;
319
320 while (@wanted) {
321 $beg = shift @wanted;
322 $last = $beg;
323 while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
324 (not $val or $wanted[0]->[1] == $last->[1] + 1)) {
325 $last = shift @wanted;
326 }
327 $out .= sprintf "%04x", $beg->[0];
328 if ($beg->[2]) {
329 $last = shift @wanted;
330 }
331 if ($beg == $last) {
332 $out .= "\t";
333 }
334 else {
335 $out .= sprintf "\t%04x", $last->[0];
336 }
337 $out .= sprintf "\t%04x", $beg->[1] if $val;
338 $out .= "\n";
339 }
340 }
341 else {
342 eval <<"END";
343 while (<UD>) {
344 next if /^#/;
345 next if /^\s*\$/;
346 chop;
347 $split
348 if ($wanted) {
349 push(\@wanted, [hex \$code, $val, \$name =~ /, First>\$/]);
350 }
351 }
352END
353 die $@ if $@;
354
355 while (@wanted) {
356 $beg = shift @wanted;
357 $last = $beg;
358 while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
359 ($wanted[0]->[1] eq $last->[1])) {
360 $last = shift @wanted;
361 }
362 $out .= sprintf "%04x", $beg->[0];
363 if ($beg->[2]) {
364 $last = shift @wanted;
365 }
366 if ($beg == $last) {
367 $out .= "\t";
368 }
369 else {
370 $out .= sprintf "\t%04x", $last->[0];
371 }
372 $out .= sprintf "\t%s\n", $beg->[1];
373 }
374 }
375 $out;
376}
11695a73 377
6dd159d1 378# eof