Commit | Line | Data |
a0ed51b3 |
1 | #!../../miniperl |
2 | |
7c6f5cd2 |
3 | use bytes; |
4 | |
190eec7c |
5 | $UnicodeData = "Unicode.txt"; |
7c6f5cd2 |
6 | $SyllableData = "syllables.txt"; |
8836d2a5 |
7 | $PropData = "PropList.txt"; |
7c6f5cd2 |
8 | |
1ac13f9a |
9 | my $UnicodeLastHex = '10FFFF'; |
11695a73 |
10 | |
a0ed51b3 |
11 | # Note: we try to keep filenames unique within first 8 chars. Using |
12 | # subdirectories for the following helps. |
289d4f09 |
13 | mkdir "In", 0755; |
14 | mkdir "Is", 0755; |
15 | mkdir "To", 0755; |
a0ed51b3 |
16 | |
17 | @todo = ( |
18 | # typical |
19 | |
f59877d4 |
20 | # 005F: SPACING UNDERSCROE |
21 | ['IsWord', '$cat =~ /^[LMN]/ or $code eq "005F"', ''], |
22 | ['IsAlnum', '$cat =~ /^[LMN]/', ''], |
23 | ['IsAlpha', '$cat =~ /^[LM]/', ''], |
24 | # 0009: HORIZONTAL TABULATION |
25 | # 000A: LINE FEED |
26 | # 000B: VERTICAL TABULATION |
27 | # 000C: FORM FEED |
28 | # 000D: CARRIAGE RETURN |
3bec3564 |
29 | # 0020: SPACE |
f59877d4 |
30 | ['IsSpace', '$cat =~ /^Z/ || |
31 | $code =~ /^(0009|000A|000B|000C|000D)$/', ''], |
3bec3564 |
32 | ['IsSpacePerl', |
33 | '$cat =~ /^Z/ || |
34 | $code =~ /^(0009|000A|000C|000D)$/', ''], |
f5868911 |
35 | ['IsBlank', '$code =~ /^(0020|0009)$/ || |
36 | $cat =~ /^Z[^lp]$/', ''], |
b8c5462f |
37 | ['IsDigit', '$cat =~ /^Nd$/', ''], |
7c6f5cd2 |
38 | ['IsUpper', '$cat =~ /^L[ut]$/', ''], |
b8c5462f |
39 | ['IsLower', '$cat =~ /^Ll$/', ''], |
f59877d4 |
40 | ['IsASCII', '$code le "007f"', ''], |
b8c5462f |
41 | ['IsCntrl', '$cat =~ /^C/', ''], |
f59877d4 |
42 | ['IsGraph', '$cat =~ /^([LMNPS]|Co)/', ''], |
43 | ['IsPrint', '$cat =~ /^([LMNPS]|Co|Zs)/', ''], |
44 | ['IsPunct', '$cat =~ /^P/', ''], |
45 | # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f |
b8c5462f |
46 | ['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''], |
47 | ['ToUpper', '$up', '$up'], |
48 | ['ToLower', '$down', '$down'], |
49 | ['ToTitle', '$title', '$title'], |
50 | ['ToDigit', '$dec ne ""', '$dec'], |
a0ed51b3 |
51 | |
52 | # Name |
53 | |
54 | ['Name', '$name', '$name'], |
55 | |
56 | # Category |
57 | |
58 | ['Category', '$cat', '$cat'], |
59 | |
60 | # Normative |
61 | |
62 | ['IsM', '$cat =~ /^M/', ''], # Mark |
63 | ['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing |
64 | ['IsMc', '$cat eq "Mc"', ''], # Mark, Combining |
7c6f5cd2 |
65 | ['IsMe', '$cat eq "Me"', ''], # Mark, Enclosing |
a0ed51b3 |
66 | |
67 | ['IsN', '$cat =~ /^N/', ''], # Number |
68 | ['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit |
69 | ['IsNo', '$cat eq "No"', ''], # Number, Other |
7c6f5cd2 |
70 | ['IsNl', '$cat eq "Nl"', ''], # Number, Letter |
a0ed51b3 |
71 | |
7c6f5cd2 |
72 | ['IsZ', '$cat =~ /^Z/', ''], # Separator |
a0ed51b3 |
73 | ['IsZs', '$cat eq "Zs"', ''], # Separator, Space |
74 | ['IsZl', '$cat eq "Zl"', ''], # Separator, Line |
75 | ['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph |
76 | |
77 | ['IsC', '$cat =~ /^C/', ''], # Crazy |
78 | ['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format |
79 | ['IsCo', '$cat eq "Co"', ''], # Other, Private Use |
80 | ['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned |
7c6f5cd2 |
81 | ['IsCf', '$cat eq "Cf"', ''], # Other, Format |
82 | ['IsCs', '$cat eq "Cs"', ''], # Other, Surrogate |
83 | ['IsCn', 'Unassigned Code Value',$PropData], # Other, Not Assigned |
a0ed51b3 |
84 | |
85 | # Informative |
86 | |
87 | ['IsL', '$cat =~ /^L/', ''], # Letter |
88 | ['IsLu', '$cat eq "Lu"', ''], # Letter, Uppercase |
89 | ['IsLl', '$cat eq "Ll"', ''], # Letter, Lowercase |
90 | ['IsLt', '$cat eq "Lt"', ''], # Letter, Titlecase |
91 | ['IsLm', '$cat eq "Lm"', ''], # Letter, Modifier |
92 | ['IsLo', '$cat eq "Lo"', ''], # Letter, Other |
93 | |
94 | ['IsP', '$cat =~ /^P/', ''], # Punctuation |
95 | ['IsPd', '$cat eq "Pd"', ''], # Punctuation, Dash |
96 | ['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open |
97 | ['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close |
98 | ['IsPo', '$cat eq "Po"', ''], # Punctuation, Other |
7c6f5cd2 |
99 | ['IsPc', '$cat eq "Pc"', ''], # Punctuation, Connector |
100 | ['IsPi', '$cat eq "Pi"', ''], # Punctuation, Initial quote |
101 | ['IsPf', '$cat eq "Pf"', ''], # Punctuation, Final quote |
a0ed51b3 |
102 | |
103 | ['IsS', '$cat =~ /^S/', ''], # Symbol |
104 | ['IsSm', '$cat eq "Sm"', ''], # Symbol, Math |
7c6f5cd2 |
105 | ['IsSk', '$cat eq "Sk"', ''], # Symbol, Modifier |
a0ed51b3 |
106 | ['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency |
107 | ['IsSo', '$cat eq "So"', ''], # Symbol, Other |
108 | |
109 | # Combining class |
110 | ['CombiningClass', '$comb', '$comb'], |
111 | |
112 | # BIDIRECTIONAL PROPERTIES |
113 | |
114 | ['Bidirectional', '$bid', '$bid'], |
115 | |
116 | # Strong types: |
117 | |
118 | ['IsBidiL', '$bid eq "L"', ''], # Left-Right; Most alphabetic, |
119 | # syllabic, and logographic |
120 | # characters (e.g., CJK |
121 | # ideographs) |
122 | ['IsBidiR', '$bid eq "R"', ''], # Right-Left; Arabic, Hebrew, |
123 | # and punctuation specific to |
124 | # those scripts |
125 | |
7c6f5cd2 |
126 | ['IsBidiLRE', '$bid eq "LRE"', ''], # Left-to-Right Embedding |
127 | ['IsBidiLRO', '$bid eq "LRO"', ''], # Left-to-Right Override |
128 | ['IsBidiAL', '$bid eq "AL"', ''], # Right-to-Left Arabic |
129 | ['IsBidiRLE', '$bid eq "RLE"', ''], # Right-to-Left Embedding |
130 | ['IsBidiRLO', '$bid eq "RLO"', ''], # Right-to-Left Override |
131 | ['IsBidiPDF', '$bid eq "PDF"', ''], # Pop Directional Format |
132 | ['IsBidiNSM', '$bid eq "NSM"', ''], # Non-Spacing Mark |
133 | ['IsBidiBN', '$bid eq "BN"', ''], # Boundary Neutral |
134 | |
a0ed51b3 |
135 | # Weak types: |
136 | |
137 | ['IsBidiEN','$bid eq "EN"', ''], # European Number |
138 | ['IsBidiES','$bid eq "ES"', ''], # European Number Separator |
139 | ['IsBidiET','$bid eq "ET"', ''], # European Number Terminator |
140 | ['IsBidiAN','$bid eq "AN"', ''], # Arabic Number |
141 | ['IsBidiCS','$bid eq "CS"', ''], # Common Number Separator |
142 | |
143 | # Separators: |
144 | |
145 | ['IsBidiB', '$bid eq "B"', ''], # Block Separator |
146 | ['IsBidiS', '$bid eq "S"', ''], # Segment Separator |
147 | |
148 | # Neutrals: |
149 | |
150 | ['IsBidiWS','$bid eq "WS"', ''], # Whitespace |
151 | ['IsBidiON','$bid eq "ON"', ''], # Other Neutrals ; All other |
152 | # characters: punctuation, |
153 | # symbols |
154 | |
155 | # Decomposition |
156 | |
157 | ['Decomposition', '$decomp', '$decomp'], |
158 | ['IsDecoCanon', '$decomp && $decomp !~ /^</', ''], |
159 | ['IsDecoCompat', '$decomp =~ /^</', ''], |
160 | ['IsDCfont', '$decomp =~ /^<font>/', ''], |
161 | ['IsDCnoBreak', '$decomp =~ /^<noBreak>/', ''], |
162 | ['IsDCinitial', '$decomp =~ /^<initial>/', ''], |
f59877d4 |
163 | ['IsDCmedial', '$decomp =~ /^<medial>/', ''], |
a0ed51b3 |
164 | ['IsDCfinal', '$decomp =~ /^<final>/', ''], |
165 | ['IsDCisolated', '$decomp =~ /^<isolated>/', ''], |
166 | ['IsDCcircle', '$decomp =~ /^<circle>/', ''], |
167 | ['IsDCsuper', '$decomp =~ /^<super>/', ''], |
168 | ['IsDCsub', '$decomp =~ /^<sub>/', ''], |
169 | ['IsDCvertical', '$decomp =~ /^<vertical>/', ''], |
170 | ['IsDCwide', '$decomp =~ /^<wide>/', ''], |
171 | ['IsDCnarrow', '$decomp =~ /^<narrow>/', ''], |
172 | ['IsDCsmall', '$decomp =~ /^<small>/', ''], |
173 | ['IsDCsquare', '$decomp =~ /^<square>/', ''], |
7c6f5cd2 |
174 | ['IsDCfraction', '$decomp =~ /^<fraction>/', ''], |
a0ed51b3 |
175 | ['IsDCcompat', '$decomp =~ /^<compat>/', ''], |
176 | |
177 | # Number |
178 | |
e1b504a6 |
179 | ['Number', '$num ne ""', '$num'], |
a0ed51b3 |
180 | |
181 | # Mirrored |
182 | |
183 | ['IsMirrored', '$mir eq "Y"', ''], |
184 | |
185 | # Arabic |
186 | |
187 | ['ArabLink', '1', '$link'], |
188 | ['ArabLnkGrp', '1', '$linkgroup'], |
189 | |
190 | # Jamo |
191 | |
192 | ['JamoShort', '1', '$short'], |
499bfa7a |
193 | |
194 | # Syllables |
195 | |
7c6f5cd2 |
196 | syllable_defs(), |
197 | |
a77b4ae5 |
198 | # Line break properties - Normative |
199 | |
200 | ['IsLbrkBK','$brk eq "BK"', ''], # Mandatory Break |
201 | ['IsLbrkCR','$brk eq "CR"', ''], # Carriage Return |
202 | ['IsLbrkLF','$brk eq "LF"', ''], # Line Feed |
203 | ['IsLbrkCM','$brk eq "CM"', ''], # Attached Characters and Combining Marks |
204 | ['IsLbrkSG','$brk eq "SG"', ''], # Surrogates |
205 | ['IsLbrkGL','$brk eq "GL"', ''], # Non-breaking (Glue) |
206 | ['IsLbrkCB','$brk eq "CB"', ''], # Contingent Break Opportunity |
207 | ['IsLbrkSP','$brk eq "SP"', ''], # Space |
208 | ['IsLbrkZW','$brk eq "ZW"', ''], # Zero Width Space |
209 | |
210 | # Line break properties - Informative |
211 | ['IsLbrkXX','$brk eq "XX"', ''], # Unknown |
212 | ['IsLbrkOP','$brk eq "OP"', ''], # Opening Punctuation |
213 | ['IsLbrkCL','$brk eq "CL"', ''], # Closing Punctuation |
214 | ['IsLbrkQU','$brk eq "QU"', ''], # Ambiguous Quotation |
215 | ['IsLbrkNS','$brk eq "NS"', ''], # Non Starter |
216 | ['IsLbrkEX','$brk eq "EX"', ''], # Exclamation/Interrogation |
217 | ['IsLbrkSY','$brk eq "SY"', ''], # Symbols Allowing Breaks |
218 | ['IsLbrkIS','$brk eq "IS"', ''], # Infix Separator (Numeric) |
219 | ['IsLbrkPR','$brk eq "PR"', ''], # Prefix (Numeric) |
220 | ['IsLbrkPO','$brk eq "PO"', ''], # Postfix (Numeric) |
221 | ['IsLbrkNU','$brk eq "NU"', ''], # Numeric |
222 | ['IsLbrkAL','$brk eq "AL"', ''], # Ordinary Alphabetic and Symbol Characters |
223 | ['IsLbrkID','$brk eq "ID"', ''], # Ideographic |
224 | ['IsLbrkIN','$brk eq "IN"', ''], # Inseparable |
225 | ['IsLbrkHY','$brk eq "HY"', ''], # Hyphen |
226 | ['IsLbrkBB','$brk eq "BB"', ''], # Break Opportunity Before |
227 | ['IsLbrkBA','$brk eq "BA"', ''], # Break Opportunity After |
228 | ['IsLbrkSA','$brk eq "SA"', ''], # Complex Context (South East Asian) |
229 | ['IsLbrkAI','$brk eq "AI"', ''], # Ambiguous (Alphabetic or Ideographic) |
230 | ['IsLbrkB2','$brk eq "B2"', ''], # Break Opportunity Before and After |
a0ed51b3 |
231 | ); |
232 | |
233 | # This is not written for speed... |
234 | |
d9efae67 |
235 | my %InIdScript; |
236 | my %InIdBlock; |
9fdf68be |
237 | my $InId = 0; |
238 | |
a0ed51b3 |
239 | foreach $file (@todo) { |
240 | my ($table, $wanted, $val) = @$file; |
241 | next if @ARGV and not grep { $_ eq $table } @ARGV; |
9fdf68be |
242 | print $table, "\n"; |
243 | $table =~ s/\W+//g; |
2796c109 |
244 | if ($table =~ /^(Is|To)(.+)/) { |
a0ed51b3 |
245 | open(OUT, ">$1/$2.pl") or die "Can't create $1/$2.pl: $!\n"; |
246 | } |
247 | else { |
248 | open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n"; |
249 | } |
14055466 |
250 | print OUT <<EOH; |
251 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
252 | # This file is built by $0 from e.g. $UnicodeData. |
253 | # Any changes made here will be lost! |
254 | EOH |
a0ed51b3 |
255 | print OUT <<"END"; |
256 | return <<'END'; |
257 | END |
258 | print OUT proplist($table, $wanted, $val); |
259 | print OUT "END\n"; |
260 | close OUT; |
261 | } |
262 | |
2796c109 |
263 | print "Scripts\n"; |
264 | open(UD, 'Scripts.txt') or die "Can't open Scripts.txt: $!\n"; |
265 | open(OUT, ">Scripts.pl") or die "Can't create Scripts.pl: $!\n"; |
266 | print OUT <<EOH; |
267 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
268 | # This file is built by $0 from e.g. $UnicodeData. |
269 | # Any changes made here will be lost! |
270 | EOH |
271 | print OUT <<"END"; |
272 | return <<'END'; |
273 | END |
274 | |
275 | my %Scripts; |
c602af67 |
276 | my $ScriptsVec = ''; |
277 | my $lastlast = 0; |
2796c109 |
278 | |
279 | while (<UD>) { |
280 | next if /^#/; |
281 | next if /^$/; |
282 | chomp; |
5f9563ea |
283 | ($code, $last, $name) = /^([0-9a-f]+)(?:\.\.([0-9a-f]+))?\s+;\s+(.+)\s+\#/i; |
2796c109 |
284 | if ($name) { |
d9efae67 |
285 | my $InName = $name; |
2796c109 |
286 | my $id; |
d9efae67 |
287 | unless (exists $InIdScript{$InName}) { |
2796c109 |
288 | print "\t$InName\n"; |
d9efae67 |
289 | $id = $Scripts{$InName} = $InIdScript{$InName} = $InId++; |
2796c109 |
290 | open(SCRIPT, ">In/$id.pl") or die "create In/$id.pl: $!\n"; |
291 | print SCRIPT <<EOH; |
292 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
293 | # This file is built by $0 from e.g. $UnicodeData. |
294 | # Any changes made here will be lost! |
295 | return <<'END'; |
296 | EOH |
297 | close(SCRIPT); |
298 | } else { |
d9efae67 |
299 | $id = $InIdScript{$InName}; |
2796c109 |
300 | } |
5f9563ea |
301 | $last = "" unless defined $last; |
d9efae67 |
302 | print OUT "$code\t$last\t$name\t# In/$id.pl\n"; |
2796c109 |
303 | open(SCRIPT, ">>In/$id.pl"); |
304 | print SCRIPT <<END; |
305 | $code $last |
306 | END |
307 | close SCRIPT; |
308 | } |
c602af67 |
309 | my $firsti = hex($code); |
310 | my $lasti = $last ? hex($last) : $firsti; |
311 | for my $i ($firsti..$lasti) { |
312 | vec($ScriptsVec, $i, 1) = 1; |
313 | } |
314 | $lastlast = $lasti if $lasti > $lastlast; |
315 | print "\t\t$code..$last\n"; |
2796c109 |
316 | } |
317 | |
d9efae67 |
318 | for my $id (values %InIdScript) { |
2796c109 |
319 | open(SCRIPT, ">>In/$id.pl"); |
320 | print SCRIPT <<END2; |
321 | END |
322 | END2 |
323 | close(SCRIPT); |
324 | } |
325 | |
326 | print OUT "END\n"; |
327 | close OUT; |
328 | |
a0ed51b3 |
329 | # Must treat blocks specially. |
330 | |
331 | exit if @ARGV and not grep { $_ eq Block } @ARGV; |
9fdf68be |
332 | print "Blocks\n"; |
7c6f5cd2 |
333 | open(UD, 'Blocks.txt') or die "Can't open Blocks.txt: $!\n"; |
9fdf68be |
334 | open(OUT, ">Blocks.pl") or die "Can't create Blocks.pl: $!\n"; |
14055466 |
335 | print OUT <<EOH; |
336 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
337 | # This file is built by $0 from e.g. $UnicodeData. |
338 | # Any changes made here will be lost! |
339 | EOH |
a0ed51b3 |
340 | print OUT <<"END"; |
341 | return <<'END'; |
342 | END |
343 | |
344 | while (<UD>) { |
345 | next if /^#/; |
346 | next if /^$/; |
347 | chomp; |
d9efae67 |
348 | ($code, $last, $name) = /^([0-9a-f]+)\.\.([0-9a-f]+); (.+?)\s*$/i; |
a0ed51b3 |
349 | if ($name) { |
2796c109 |
350 | my $InName = $name; |
2796c109 |
351 | print "\t$InName\n"; |
9fdf68be |
352 | my $id; |
2796c109 |
353 | # TODO: only the first one of Private Use blocks qualifies |
d9efae67 |
354 | unless (exists $InIdBlock{$InName}) { |
355 | $InIdBlock{$InName} = $InId++; |
9fdf68be |
356 | } |
d9efae67 |
357 | $id = $InIdBlock{$InName}; |
2796c109 |
358 | open(BLOCK, ">In/$id.pl") or die "create In/$id.pl: $!\n"; |
d9efae67 |
359 | print OUT "$code\t$last\t$name\t# In/$id.pl\n"; |
14055466 |
360 | print BLOCK <<EOH; |
361 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
362 | # This file is built by $0 from e.g. $UnicodeData. |
363 | # Any changes made here will be lost! |
364 | EOH |
a0ed51b3 |
365 | print BLOCK <<"END2"; |
366 | return <<'END'; |
367 | $code $last |
368 | END |
369 | END2 |
370 | close BLOCK; |
371 | } |
372 | } |
373 | |
374 | print OUT "END\n"; |
375 | close OUT; |
376 | |
1ac13f9a |
377 | # |
378 | # \p{Common} is any code point not assigned to a script |
379 | # |
380 | |
381 | my $first; |
382 | |
c8b5a1e3 |
383 | sub flush_zero_range { |
384 | my ($i) = @_; |
1ac13f9a |
385 | if (defined $first) { |
386 | my $last = $i - 1; |
387 | $last = $last == $first ? "" : sprintf("%04x", $last); |
c8b5a1e3 |
388 | printf SCRIPT "%04x\t$last\n", $first; |
1ac13f9a |
389 | printf "\t\t%04x..$last\n", $first; |
390 | undef $first; |
391 | } |
392 | } |
393 | |
c602af67 |
394 | print "\tCommon\n"; |
395 | my $CommonId = $Scripts{Common} = $InIdScript{Common} = $InId++; |
396 | open(SCRIPT, ">In/$CommonId.pl") or die "create In/$CommonId.pl: $!\n"; |
397 | print SCRIPT <<EOH; |
398 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
399 | # This file is built by $0 from e.g. $UnicodeData. |
400 | # Any changes made here will be lost! |
401 | return <<'END'; |
402 | EOH |
1ac13f9a |
403 | |
404 | undef $first; |
c602af67 |
405 | for my $i (0..$lastlast) { |
406 | if (vec($ScriptsVec, $i, 1)) { |
c8b5a1e3 |
407 | defined $first && flush_zero_range($i); |
1ac13f9a |
408 | } else { |
409 | $first = $i unless defined $first; |
410 | } |
411 | } |
c8b5a1e3 |
412 | flush_zero_range($lastlast+1); |
1ac13f9a |
413 | print SCRIPT "END\n"; |
414 | close(SCRIPT); |
415 | |
416 | # |
417 | # \p{Any} is 0..10FFFF (in Unicode 3.1.1) |
418 | # |
419 | |
420 | print "\tAny\n"; |
421 | my $AnyId = $Scripts{Any} = $InIdScript{Any} = $InId++; |
422 | open(SCRIPT, ">In/$AnyId.pl") or die "create In/$AnyId.pl: $!\n"; |
423 | print SCRIPT <<EOH; |
424 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
425 | # This file is built by $0 from e.g. $UnicodeData. |
426 | # Any changes made here will be lost! |
427 | return <<END; |
428 | 0000 $UnicodeLastHex |
429 | END |
430 | EOH |
431 | |
432 | my $CnVec = ''; |
433 | |
434 | open(UD, 'PropList.txt') or die "Can't open PropList.txt: $!\n"; |
435 | |
436 | my $InIdProp; |
437 | while (<UD>) { |
438 | next if /^#/; |
439 | next if /^$/; |
440 | chomp; |
441 | ($code, $last, $name) = /^([0-9a-f]+)(?:\.\.([0-9a-f]+))?\s+; (\w+)\s/i; |
442 | $last = "" unless defined $last; |
443 | if ($name) { |
444 | my $InName = $name; |
445 | my $id; |
446 | unless (exists $InIdScript{$InName}) { |
447 | print "\t$InName\n"; |
448 | print PROP <<EOH if defined $InIdProp; |
449 | END |
450 | EOH |
451 | $id = $InIdProp = $InIdScript{$InName} = $InId++; |
452 | open(PROP, ">In/$id.pl") or die "create In/$id.pl: $!\n"; |
453 | print PROP <<EOH; |
454 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
455 | # This file is built by $0 from e.g. $UnicodeData. |
456 | # Any changes made here will be lost! |
457 | return <<END; |
458 | EOH |
459 | } |
460 | $id = $InIdScript{$InName}; |
461 | print PROP "\L$code\t\L$last\n"; |
c8b5a1e3 |
462 | if ($InName eq 'Noncharacter_Code_Point') { |
1ac13f9a |
463 | my $firsti = hex($code); |
464 | my $lasti = $last ? hex($last) : $firsti; |
c8b5a1e3 |
465 | for my $i ($firsti..$lasti) { |
1ac13f9a |
466 | vec($CnVec, $i, 1) = 1; |
467 | } |
468 | } |
469 | } |
470 | } |
c8b5a1e3 |
471 | print PROP "END\n"; |
1ac13f9a |
472 | |
473 | print "\tAssigned\n"; |
474 | my $AssignedId = $Scripts{Assigned} = $InIdScript{Assigned} = $InId++; |
475 | open(SCRIPT, ">In/$AssignedId.pl") or die "create In/$AssignedId.pl: $!\n"; |
476 | print SCRIPT <<EOH; |
477 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
478 | # This file is built by $0 from e.g. $UnicodeData. |
479 | # Any changes made here will be lost! |
480 | return <<'END'; |
481 | EOH |
482 | |
483 | undef $first; |
484 | for my $i (0..hex($UnicodeLastHex)) { |
485 | if (vec($CnVec, $i, 1)) { |
c8b5a1e3 |
486 | defined $first && flush_zero_range($i); |
c602af67 |
487 | } else { |
488 | $first = $i unless defined $first; |
489 | } |
490 | } |
c8b5a1e3 |
491 | flush_zero_range(hex($UnicodeLastHex)+1); |
c602af67 |
492 | print SCRIPT "END\n"; |
c602af67 |
493 | |
1ac13f9a |
494 | # |
495 | # \p{Alphabetic} is \pL and \p{Other_Alphabetic} |
496 | # |
497 | |
498 | print "\tAlphabetic\n"; |
499 | my @Alphabetic; |
500 | push @Alphabetic, split(/\n/, do "Is/L.pl"); |
501 | push @Alphabetic, split(/\n/, do "In/$InIdScript{Other_Alphabetic}.pl"); |
502 | $id = $InIdScript{Alphabetic} = $InId++; |
503 | open(PROP, ">In/$id.pl") or die "create In/$id.pl: $!\n"; |
504 | print PROP <<EOH; |
505 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
506 | # This file is built by $0 from e.g. $UnicodeData. |
507 | # Any changes made here will be lost! |
508 | return <<END; |
509 | EOH |
510 | for (sort { hex($a) <=> hex($b) } @Alphabetic) { |
511 | print PROP "$_\n"; |
512 | } |
513 | print PROP <<EOH; |
514 | END |
515 | EOH |
516 | |
517 | # |
518 | # \p{Lowercase} is \p{Ll} and \p{Other_Lowercase} |
519 | # |
520 | |
521 | print "\tLowercase\n"; |
522 | my @Lowercase; |
523 | push @Lowercase, split(/\n/, do "Is/Ll.pl"); |
524 | push @Lowercase, split(/\n/, do "In/$InIdScript{Other_Lowercase}.pl"); |
525 | $id = $InIdScript{Lowercase} = $InId++; |
526 | open(PROP, ">In/$id.pl") or die "create In/$id.pl: $!\n"; |
527 | print PROP <<EOH; |
528 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
529 | # This file is built by $0 from e.g. $UnicodeData. |
530 | # Any changes made here will be lost! |
531 | return <<END; |
532 | EOH |
533 | for (sort { hex($a) <=> hex($b) } @Lowercase) { |
534 | print PROP "$_\n"; |
535 | } |
536 | print PROP <<EOH; |
537 | END |
538 | EOH |
539 | |
540 | # |
541 | # \p{Uppercase} is \p{Lu} and \p{Other_Uppercase} |
542 | # |
543 | |
544 | print "\tUppercase\n"; |
545 | my @Uppercase; |
546 | push @Uppercase, split(/\n/, do "Is/Lu.pl"); |
547 | push @Uppercase, split(/\n/, do "In/$InIdScript{Other_Uppercase}.pl"); |
548 | $id = $InIdScript{Uppercase} = $InId++; |
549 | open(PROP, ">In/$id.pl") or die "create In/$id.pl: $!\n"; |
550 | print PROP <<EOH; |
551 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
552 | # This file is built by $0 from e.g. $UnicodeData. |
553 | # Any changes made here will be lost! |
554 | return <<END; |
555 | EOH |
556 | for (sort { hex($a) <=> hex($b) } @Uppercase) { |
557 | print PROP "$_\n"; |
558 | } |
559 | print PROP <<EOH; |
560 | END |
561 | EOH |
562 | |
563 | # |
564 | # \p{Math} is \p{Sm} and \p{Other_Math} |
565 | # |
566 | |
567 | print "\tMath\n"; |
568 | my @Math; |
569 | push @Math, split(/\n/, do "Is/Sm.pl"); |
570 | push @Math, split(/\n/, do "In/$InIdScript{Other_Math}.pl"); |
571 | $id = $InIdScript{Math} = $InId++; |
572 | open(PROP, ">In/$id.pl") or die "create In/$id.pl: $!\n"; |
573 | print PROP <<EOH; |
574 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
575 | # This file is built by $0 from e.g. $UnicodeData. |
576 | # Any changes made here will be lost! |
577 | return <<END; |
578 | EOH |
579 | for (sort { hex($a) <=> hex($b) } @Math) { |
580 | print PROP "$_\n"; |
581 | } |
582 | print PROP <<EOH; |
583 | END |
584 | EOH |
585 | |
586 | # |
587 | # \p{L&} is \p{Ll}, \p{Lu} and \p{Lt} |
588 | # |
589 | |
590 | print "\tLampersand\n"; |
591 | my @Lampersand; |
592 | push @Lampersand, split(/\n/, do "Is/Ll.pl"); |
593 | push @Lampersand, split(/\n/, do "Is/Lu.pl"); |
594 | push @Lampersand, split(/\n/, do "Is/Lt.pl"); |
595 | $id = $InIdScript{Lampersand} = $InId++; |
596 | open(PROP, ">In/$id.pl") or die "create In/$id.pl: $!\n"; |
597 | print PROP <<EOH; |
598 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
599 | # This file is built by $0 from e.g. $UnicodeData. |
600 | # Any changes made here will be lost! |
601 | return <<END; |
602 | EOH |
603 | for (sort { hex($a) <=> hex($b) } @Lampersand) { |
604 | print PROP "$_\n"; |
605 | } |
606 | print PROP <<EOH; |
607 | END |
608 | EOH |
609 | |
610 | # |
611 | # \p{ID_Start} is \p{Ll}, \p{Lu}, \p{Lt}, \p{Lm}, \p{Lo}, and \p{Nl} |
612 | # |
613 | |
614 | print "\tID_Start\n"; |
615 | my @ID_Start; |
616 | push @ID_Start, split(/\n/, do "Is/Ll.pl"); |
617 | push @ID_Start, split(/\n/, do "Is/Lu.pl"); |
618 | push @ID_Start, split(/\n/, do "Is/Lt.pl"); |
619 | push @ID_Start, split(/\n/, do "Is/Lm.pl"); |
620 | push @ID_Start, split(/\n/, do "Is/Lo.pl"); |
621 | push @ID_Start, split(/\n/, do "Is/Nl.pl"); |
622 | $id = $InIdScript{ID_Start} = $InId++; |
623 | open(PROP, ">In/$id.pl") or die "create In/$id.pl: $!\n"; |
624 | print PROP <<EOH; |
625 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
626 | # This file is built by $0 from e.g. $UnicodeData. |
627 | # Any changes made here will be lost! |
628 | return <<END; |
629 | EOH |
630 | for (sort { hex($a) <=> hex($b) } @ID_Start) { |
631 | print PROP "$_\n"; |
632 | } |
633 | print PROP <<EOH; |
634 | END |
635 | EOH |
636 | |
637 | # |
638 | # \p{ID_Continue} is \p{ID_Start}, \p{Mn}, \p{Mc}, \p{Nd}, and \p{Pc} |
639 | # |
640 | |
641 | print "\tID_Continue\n"; |
642 | my @ID_Continue; |
643 | push @ID_Continue, split(/\n/, do "In/$InIdScript{ID_Start}.pl"); |
644 | push @ID_Continue, split(/\n/, do "Is/Mn.pl"); |
645 | push @ID_Continue, split(/\n/, do "Is/Mc.pl"); |
646 | push @ID_Continue, split(/\n/, do "Is/Nd.pl"); |
647 | push @ID_Continue, split(/\n/, do "Is/Pc.pl"); |
648 | $id = $InIdScript{ID_Continue} = $InId++; |
649 | open(PROP, ">In/$id.pl") or die "create In/$id.pl: $!\n"; |
650 | print PROP <<EOH; |
651 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
652 | # This file is built by $0 from e.g. $UnicodeData. |
653 | # Any changes made here will be lost! |
654 | return <<END; |
655 | EOH |
656 | for (sort { hex($a) <=> hex($b) } @ID_Continue) { |
657 | print PROP "$_\n"; |
658 | } |
659 | print PROP <<EOH; |
660 | END |
661 | EOH |
662 | |
9fdf68be |
663 | open(INID, ">In.pl"); |
664 | |
665 | print INID <<EOH; |
666 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! |
667 | # This file is built by $0 from e.g. $UnicodeData. |
668 | # Any changes made here will be lost! |
669 | %utf8::In = ( |
670 | EOH |
671 | |
d9efae67 |
672 | my %InIdScriptById = reverse %InIdScript; |
673 | my %InIdBlockById = reverse %InIdBlock; |
674 | |
675 | my @InIdScriptById = sort { $a <=> $b } keys %InIdScriptById; |
676 | my @InIdBlockById = sort { $a <=> $b } keys %InIdBlockById; |
677 | |
678 | my %InId; |
679 | my %IdIdLcName; |
680 | |
681 | for my $id (@InIdScriptById) { |
682 | my $name = $InIdScriptById{$id}; |
683 | my $lcname = lc($name); |
684 | $InId{$name} = $id; |
685 | $IdIdLcName{$lcname} = $id; |
686 | } |
687 | |
688 | for my $id (@InIdBlockById) { |
689 | my $name = $InIdBlockById{$id}; |
690 | my $lcname = lc($name); |
691 | if (exists $IdIdLcName{$lcname}) { |
692 | $InId{"$name Block"} = $id; |
693 | } else { |
694 | $InId{$name} = $id; |
695 | } |
696 | $IdIdLcName{$lcname} = $id; |
697 | } |
698 | |
699 | my @InId = sort { $InId{$a} <=> $InId{$b} } keys %InId; |
700 | |
701 | my %InIdPrefix; |
702 | |
703 | foreach my $in (@InId) { |
704 | my $inpat = $in; |
f173cd49 |
705 | $inpat =~ s/([- _])/(?:[-_]|\\s+)?/g; |
c8b5a1e3 |
706 | my $inprefix = lc(substr($in, 0, 2)); |
707 | push @{$InIdPrefix{$inprefix}}, [ $in, $inpat ]; |
d9efae67 |
708 | printf INID "%-45s => %3d,\n", "'$in'", $InId{$in}; |
709 | } |
710 | |
711 | print INID ");\n"; |
712 | |
713 | print INID <<EOH; |
714 | %utf8::InPat = ( |
715 | EOH |
716 | |
717 | foreach my $prefix (sort keys %InIdPrefix) { |
718 | printf INID "'$prefix' => {\n"; |
719 | foreach my $ininpat (@{$InIdPrefix{$prefix}}) { |
720 | my ($in, $inpat) = @$ininpat; |
721 | printf INID "\t'$inpat' => '$in',\n"; |
722 | } |
723 | printf INID "},\n"; |
9fdf68be |
724 | } |
725 | |
726 | print INID ");\n"; |
727 | |
728 | close(INID); |
729 | |
a0ed51b3 |
730 | ################################################## |
731 | |
732 | sub proplist { |
733 | my ($table, $wanted, $val) = @_; |
734 | my @wanted; |
735 | my $out; |
736 | my $split; |
737 | |
7c6f5cd2 |
738 | return listFromPropFile($wanted) if $val eq $PropData; |
739 | |
a0ed51b3 |
740 | if ($table =~ /^Arab/) { |
d357d9fe |
741 | open(UD, "ArabShap.txt") or warn "Can't open $table: $!"; |
a0ed51b3 |
742 | |
743 | $split = '($code, $name, $link, $linkgroup) = split(/; */);'; |
744 | } |
745 | elsif ($table =~ /^Jamo/) { |
505afebf |
746 | open(UD, "Jamo.txt") or warn "Can't open $table: $!"; |
a0ed51b3 |
747 | |
748 | $split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;'; |
749 | } |
499bfa7a |
750 | elsif ($table =~ /^IsSyl/) { |
7c6f5cd2 |
751 | open(UD, $SyllableData) or warn "Can't open $table: $!"; |
499bfa7a |
752 | |
753 | $split = '($code, $short, $syl) = split(/; */); $code =~ s/^U\+//;'; |
754 | } |
a77b4ae5 |
755 | elsif ($table =~ /^IsLbrk/) { |
756 | open(UD, "LineBrk.txt") or warn "Can't open $table: $!"; |
757 | |
50fc4248 |
758 | $split = '($code, $brk, $name) = /^([0-9a-f]+);(\w+) # (.+)/i;'; |
a77b4ae5 |
759 | } |
a0ed51b3 |
760 | else { |
11695a73 |
761 | open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!"; |
a0ed51b3 |
762 | |
763 | $split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1, |
764 | $comment, $up, $down, $title) = split(/;/);'; |
765 | } |
766 | |
767 | if ($table =~ /^(?:To|Is)[A-Z]/) { |
768 | eval <<"END"; |
769 | while (<UD>) { |
770 | next if /^#/; |
7c6f5cd2 |
771 | next if /^\\s/; |
772 | s/\\s+\$//; |
a0ed51b3 |
773 | $split |
774 | if ($wanted) { |
775 | push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]); |
776 | } |
777 | } |
778 | END |
779 | die $@ if $@; |
780 | |
781 | while (@wanted) { |
782 | $beg = shift @wanted; |
783 | $last = $beg; |
784 | while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and |
785 | (not $val or $wanted[0]->[1] == $last->[1] + 1)) { |
786 | $last = shift @wanted; |
787 | } |
788 | $out .= sprintf "%04x", $beg->[0]; |
789 | if ($beg->[2]) { |
790 | $last = shift @wanted; |
791 | } |
792 | if ($beg == $last) { |
793 | $out .= "\t"; |
794 | } |
795 | else { |
796 | $out .= sprintf "\t%04x", $last->[0]; |
797 | } |
798 | $out .= sprintf "\t%04x", $beg->[1] if $val; |
799 | $out .= "\n"; |
800 | } |
801 | } |
802 | else { |
803 | eval <<"END"; |
804 | while (<UD>) { |
805 | next if /^#/; |
7c6f5cd2 |
806 | next if /^\\s*\$/; |
a0ed51b3 |
807 | chop; |
808 | $split |
809 | if ($wanted) { |
810 | push(\@wanted, [hex \$code, $val, \$name =~ /, First>\$/]); |
811 | } |
812 | } |
813 | END |
814 | die $@ if $@; |
815 | |
816 | while (@wanted) { |
817 | $beg = shift @wanted; |
818 | $last = $beg; |
819 | while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and |
820 | ($wanted[0]->[1] eq $last->[1])) { |
821 | $last = shift @wanted; |
822 | } |
823 | $out .= sprintf "%04x", $beg->[0]; |
824 | if ($beg->[2]) { |
825 | $last = shift @wanted; |
826 | } |
827 | if ($beg == $last) { |
828 | $out .= "\t"; |
829 | } |
830 | else { |
831 | $out .= sprintf "\t%04x", $last->[0]; |
832 | } |
833 | $out .= sprintf "\t%s\n", $beg->[1]; |
834 | } |
835 | } |
836 | $out; |
837 | } |
11695a73 |
838 | |
7c6f5cd2 |
839 | sub listFromPropFile { |
840 | my ($wanted) = @_; |
841 | my $out; |
842 | |
843 | open (UD, $PropData) or die "Can't open $PropData: $!\n"; |
844 | local($/) = "\n" . '*' x 43 . "\n\nProperty dump for:"; # not 42? |
845 | |
846 | <UD>; |
847 | while (<UD>) { |
848 | chomp; |
849 | if (s/0x[\d\w]+\s+\((.*?)\)// and $wanted eq $1) { |
850 | s/\(\d+ chars\)//g; |
851 | s/^\s+//mg; |
852 | s/\s+$//mg; |
853 | s/\.\./\t/g; |
854 | $out = lc $_; |
855 | last; |
856 | } |
857 | } |
858 | close (UD); |
859 | "$out\n"; |
860 | } |
861 | |
862 | sub syllable_defs { |
863 | my @defs; |
864 | my %seen; |
865 | |
866 | open (SD, $SyllableData) or die "Can't open $SyllableData: $!\n"; |
867 | while (<SD>) { |
868 | next if /^\s*(#|$)/; |
869 | s/\s+$//; |
870 | ($code, $name, $syl) = split /; */; |
871 | next unless $syl; |
872 | push (@defs, ["IsSyl$syl", qq{\$syl eq "$syl"}, '']) |
873 | unless $seen{$syl}++; |
874 | } |
875 | close (SD); |
876 | return (@defs); |
877 | } |
878 | |
6dd159d1 |
879 | # eof |