Commit | Line | Data |
84281c31 |
1 | #!./perl |
2 | # |
3 | # Tests that have to do with checking whether characters have (or not have) |
4 | # certain Unicode properties; belong (or not belong) to blocks, scripts, etc. |
5 | # |
6 | |
7 | use strict; |
8 | use warnings; |
9 | use 5.010; |
10 | |
11 | my $IS_EBCDIC = ord ('A') == 193; |
12 | |
13 | sub run_tests; |
14 | |
15 | # |
16 | # This is the data to test. |
17 | # |
18 | # This is a hash; keys are the property to test. |
19 | # Values are arrays containing characters to test. The characters can |
20 | # have the following formats: |
21 | # '\N{CHARACTER NAME}' - Use character with that name |
22 | # '\x{1234}' - Use character with that hex escape |
23 | # '0x1234' - Use chr() to get that character |
24 | # "a" - Character to use |
25 | # |
26 | # If a character entry starts with ! the character does not belong to the class |
27 | # |
28 | # If the class is just single letter, we use both \pL and \p{L} |
29 | # |
30 | |
31 | use charnames ':full'; |
32 | |
33 | my @CLASSES = ( |
34 | L => ["a", "A"], |
35 | Ll => ["b", "!B"], |
36 | Lu => ["!c", "C"], |
37 | IsLl => ["d", "!D"], |
38 | IsLu => ["!e", "E"], |
39 | LC => ["f", "!1"], |
40 | 'L&' => ["g", "!2"], |
41 | 'Lowercase Letter' => ["h", "!H"], |
42 | |
43 | Common => ["!i", "3"], |
44 | Inherited => ["!j", '\x{300}'], |
45 | |
46 | InBasicLatin => ['\N{LATIN CAPITAL LETTER A}'], |
47 | InLatin1Supplement => ['\N{LATIN CAPITAL LETTER A WITH GRAVE}'], |
48 | InLatinExtendedA => ['\N{LATIN CAPITAL LETTER A WITH MACRON}'], |
49 | InLatinExtendedB => ['\N{LATIN SMALL LETTER B WITH STROKE}'], |
50 | InKatakana => ['\N{KATAKANA LETTER SMALL A}'], |
51 | IsLatin => ["0x100", "0x212b"], |
52 | IsHebrew => ["0x5d0", "0xfb4f"], |
53 | IsGreek => ["0x37a", "0x386", "!0x387", "0x388", |
54 | "0x38a", "!0x38b", "0x38c"], |
55 | HangulSyllables => ['\x{AC00}'], |
56 | 'Script=Latin' => ['\x{0100}'], |
57 | 'Block=LatinExtendedA' => ['\x{0100}'], |
58 | 'Category=UppercaseLetter' => ['\x{0100}'], |
59 | |
60 | # |
61 | # It's ok to repeat class names. |
62 | # |
63 | InLatin1Supplement => |
64 | $IS_EBCDIC ? ['!\x{7f}', '\x{80}', '!\x{100}'] |
65 | : ['!\x{7f}', '\x{80}', '\x{ff}', '!\x{100}'], |
66 | InLatinExtendedA => |
67 | ['!\x{7f}', '!\x{80}', '!\x{ff}', '\x{100}'], |
68 | |
69 | # |
70 | # Properties are case-insensitive, and may have whitespace, |
71 | # dashes and underscores. |
72 | # |
73 | 'in-latin1_SUPPLEMENT' => ['\x{80}', |
74 | '\N{LATIN SMALL LETTER Y WITH DIAERESIS}'], |
75 | ' ^ In Latin 1 Supplement ' |
76 | => ['!\x{80}', '\N{COFFIN}'], |
77 | 'latin-1 supplement' => ['\x{80}', "0xDF"], |
78 | |
79 | ); |
80 | |
81 | my @USER_DEFINED_PROPERTIES = ( |
82 | # |
83 | # User defined properties |
84 | # |
85 | InKana1 => ['\x{3040}', '!\x{303F}'], |
86 | InKana2 => ['\x{3040}', '!\x{303F}'], |
87 | InKana3 => ['\x{3041}', '!\x{3040}'], |
88 | InNotKana => ['\x{3040}', '!\x{3041}'], |
89 | InConsonant => ['d', '!e'], |
90 | IsSyriac1 => ['\x{0712}', '!\x{072F}'], |
91 | Syriac1 => ['\x{0712}', '!\x{072F}'], |
92 | '# User-defined character properties my lack \n at the end', |
93 | InGreekSmall => ['\N{GREEK SMALL LETTER PI}', |
94 | '\N{GREEK SMALL LETTER FINAL SIGMA}'], |
95 | InGreekCapital => ['\N{GREEK CAPITAL LETTER PI}', '!\x{03A2}'], |
96 | Dash => ['-'], |
97 | ASCII_Hex_Digit => ['!-', 'A'], |
98 | AsciiHexAndDash => ['-', 'A'], |
99 | ); |
100 | |
101 | |
102 | # |
103 | # From the short properties we populate POSIX-like classes. |
104 | # |
105 | my %SHORT_PROPERTIES = ( |
106 | 'Ll' => ['m', '\N{CYRILLIC SMALL LETTER A}'], |
107 | 'Lu' => ['M', '\N{GREEK CAPITAL LETTER ALPHA}'], |
108 | 'Lo' => ['\N{HIRAGANA LETTER SMALL A}'], |
109 | 'Mn' => ['\N{COMBINING GRAVE ACCENT}'], |
110 | 'Nd' => ["0", '\N{ARABIC-INDIC DIGIT ZERO}'], |
111 | 'Pc' => ["_"], |
112 | 'Po' => ["!"], |
113 | 'Zs' => [" "], |
114 | 'Cc' => ['\x{00}'], |
115 | ); |
116 | |
117 | # |
118 | # Illegal properties |
119 | # |
120 | my @ILLEGAL_PROPERTIES = qw [q qrst]; |
121 | |
122 | my %d; |
123 | |
124 | while (my ($class, $chars) = each %SHORT_PROPERTIES) { |
125 | push @{$d {IsAlpha}} => map {$class =~ /^[LM]/ ? $_ : "!$_"} @$chars; |
126 | push @{$d {IsAlnum}} => map {$class =~ /^[LMN]./ ? $_ : "!$_"} @$chars; |
127 | push @{$d {IsASCII}} => map {length ($_) == 1 || $_ eq '\x{00}' |
128 | ? $_ : "!$_"} @$chars; |
129 | push @{$d {IsCntrl}} => map {$class =~ /^C/ ? $_ : "!$_"} @$chars; |
130 | push @{$d {IsBlank}} => map {$class =~ /^Z[lps]/ ? $_ : "!$_"} @$chars; |
131 | push @{$d {IsDigit}} => map {$class =~ /^Nd$/ ? $_ : "!$_"} @$chars; |
132 | push @{$d {IsGraph}} => map {$class =~ /^([LMNPS]|Co)/ |
133 | ? $_ : "!$_"} @$chars; |
134 | push @{$d {IsPrint}} => map {$class =~ /^([LMNPS]|Co|Zs)/ |
135 | ? $_ : "!$_"} @$chars; |
136 | push @{$d {IsLower}} => map {$class =~ /^Ll$/ ? $_ : "!$_"} @$chars; |
137 | push @{$d {IsUpper}} => map {$class =~ /^L[ut]/ ? $_ : "!$_"} @$chars; |
138 | push @{$d {IsPunct}} => map {$class =~ /^P/ ? $_ : "!$_"} @$chars; |
139 | push @{$d {IsWord}} => map {$class =~ /^[LMN]/ || $_ eq "_" |
140 | ? $_ : "!$_"} @$chars; |
141 | push @{$d {IsSpace}} => map {$class =~ /^Z/ || |
142 | length ($_) == 1 && ord ($_) >= 0x09 |
143 | && ord ($_) <= 0x0D |
144 | ? $_ : "!$_"} @$chars; |
145 | } |
146 | |
147 | delete $d {IsASCII} if $IS_EBCDIC; |
148 | |
149 | push @CLASSES => "# Short properties" => %SHORT_PROPERTIES, |
150 | "# POSIX like properties" => %d, |
151 | "# User defined properties" => @USER_DEFINED_PROPERTIES; |
152 | |
153 | |
154 | # |
155 | # Calculate the number of tests. |
156 | # |
157 | my $count = 0; |
158 | for (my $i = 0; $i < @CLASSES; $i += 2) { |
159 | $i ++, redo if $CLASSES [$i] =~ /^\h*#\h*(.*)/; |
160 | $count += (length $CLASSES [$i] == 1 ? 4 : 2) * @{$CLASSES [$i + 1]}; |
161 | } |
162 | $count += 2 * @ILLEGAL_PROPERTIES; |
163 | $count += 2 * grep {length $_ == 1} @ILLEGAL_PROPERTIES; |
164 | |
165 | my $tests = 0; |
166 | |
167 | say "1..$count"; |
168 | |
169 | run_tests unless caller (); |
170 | |
171 | sub match { |
172 | my ($char, $match, $nomatch) = @_; |
173 | |
174 | my ($str, $name); |
175 | |
176 | given ($char) { |
177 | when (/^\\/) { |
178 | $str = eval qq ["$char"]; |
179 | $name = qq ["$char"]; |
180 | } |
181 | when (/^0x([0-9A-Fa-f]+)$/) { |
182 | $str = chr hex $1; |
183 | $name = "chr ($char)"; |
184 | } |
185 | default { |
186 | $str = $char; |
187 | $name = qq ["$char"]; |
188 | } |
189 | } |
190 | |
191 | print "not " unless $str =~ /$match/; |
192 | print "ok ", ++ $tests, " - $name =~ /$match/\n"; |
193 | print "not " unless $str !~ /$nomatch/; |
194 | print "ok ", ++ $tests, " - $name !~ /$nomatch/\n"; |
195 | } |
196 | |
197 | sub run_tests { |
198 | |
199 | while (@CLASSES) { |
200 | my $class = shift @CLASSES; |
201 | if ($class =~ /^\h*#\h*(.*)/) { |
202 | print "# $1\n"; |
203 | next; |
204 | } |
205 | last unless @CLASSES; |
206 | my $chars = shift @CLASSES; |
207 | my @in = grep {!/^!./} @$chars; |
208 | my @out = map {s/^!(?=.)//; $_} grep { /^!./} @$chars; |
209 | my $in_pat = eval qq ['\\p{$class}']; |
210 | my $out_pat = eval qq ['\\P{$class}']; |
211 | |
212 | match $_, $in_pat, $out_pat for @in; |
213 | match $_, $out_pat, $in_pat for @out; |
214 | |
215 | if (1 == length $class) { |
216 | my $in_pat = eval qq ['\\p$class']; |
217 | my $out_pat = eval qq ['\\P$class']; |
218 | |
219 | match $_, $in_pat, $out_pat for @in; |
220 | match $_, $out_pat, $in_pat for @out; |
221 | } |
222 | } |
223 | |
224 | |
225 | my $pat = qr /^Can't find Unicode property definition/; |
226 | print "# Illegal properties\n"; |
227 | foreach my $p (@ILLEGAL_PROPERTIES) { |
228 | undef $@; |
229 | my $r = eval "'a' =~ /\\p{$p}/; 1"; |
230 | print "not " unless !$r && $@ && $@ =~ $pat; |
231 | print "ok ", ++ $tests, " - Unknown Unicode property \\p{$p}\n"; |
232 | undef $@; |
233 | my $s = eval "'a' =~ /\\P{$p}/; 1"; |
234 | print "not " unless !$s && $@ && $@ =~ $pat; |
235 | print "ok ", ++ $tests, " - Unknown Unicode property \\P{$p}\n"; |
236 | if (length $p == 1) { |
237 | undef $@; |
238 | my $r = eval "'a' =~ /\\p$p/; 1"; |
239 | print "not " unless !$r && $@ && $@ =~ $pat; |
240 | print "ok ", ++ $tests, " - Unknown Unicode property \\p$p\n"; |
241 | undef $@; |
242 | my $s = eval "'a' =~ /\\P$p/; 1"; |
243 | print "not " unless !$s && $@ && $@ =~ $pat; |
244 | print "ok ", ++ $tests, " - Unknown Unicode property \\P$p\n"; |
245 | } |
246 | } |
247 | } |
248 | |
249 | |
250 | # |
251 | # User defined properties |
252 | # |
253 | |
254 | sub InKana1 {<<'--'} |
255 | 3040 309F |
256 | 30A0 30FF |
257 | -- |
258 | |
259 | sub InKana2 {<<'--'} |
260 | +utf8::InHiragana |
261 | +utf8::InKatakana |
262 | -- |
263 | |
264 | sub InKana3 {<<'--'} |
265 | +utf8::InHiragana |
266 | +utf8::InKatakana |
267 | -utf8::IsCn |
268 | -- |
269 | |
270 | sub InNotKana {<<'--'} |
271 | !utf8::InHiragana |
272 | -utf8::InKatakana |
273 | +utf8::IsCn |
274 | -- |
275 | |
276 | sub InConsonant {<<'--'} # Not EBCDIC-aware. |
277 | 0061 007f |
278 | -0061 |
279 | -0065 |
280 | -0069 |
281 | -006f |
282 | -0075 |
283 | -- |
284 | |
285 | sub IsSyriac1 {<<'--'} |
286 | 0712 072C |
287 | 0730 074A |
288 | -- |
289 | |
290 | sub Syriac1 {<<'--'} |
291 | 0712 072C |
292 | 0730 074A |
293 | -- |
294 | |
295 | sub InGreekSmall {return "03B1\t03C9"} |
296 | sub InGreekCapital {return "0391\t03A9\n-03A2"} |
297 | |
298 | sub AsciiHexAndDash {<<'--'} |
299 | +utf8::ASCII_Hex_Digit |
300 | +utf8::Dash |
301 | -- |
302 | |
303 | __END__ |