Commit | Line | Data |
e425a60b |
1 | #!./perl |
2 | # |
3 | # This is a home for regular expression tests that don't fit into |
4 | # the format supported by re/regexp.t. If you want to add a test |
5 | # that does fit that format, add it to re/re_tests, not here. |
6 | |
7 | use strict; |
8 | use warnings; |
9 | use 5.010; |
10 | |
11 | |
12 | sub run_tests; |
13 | |
14 | $| = 1; |
15 | |
e425a60b |
16 | |
17 | BEGIN { |
18 | chdir 't' if -d 't'; |
9d45b377 |
19 | @INC = ('../lib','.'); |
20 | do "re/ReTest.pl" or die $@; |
e425a60b |
21 | } |
e425a60b |
22 | |
e425a60b |
23 | |
6182169b |
24 | plan tests => 1143; # Update this when adding/deleting tests. |
e425a60b |
25 | |
9d45b377 |
26 | run_tests() unless caller; |
e425a60b |
27 | |
28 | # |
29 | # Tests start here. |
30 | # |
31 | sub run_tests { |
32 | |
e425a60b |
33 | SKIP: |
34 | { |
35 | local $Message = '\C matches octet'; |
36 | $_ = "a\x{100}b"; |
37 | ok /(.)(\C)(\C)(.)/ or skip q [\C doesn't match], 4; |
38 | iseq $1, "a"; |
39 | if ($IS_ASCII) { # ASCII (or equivalent), should be UTF-8 |
40 | iseq $2, "\xC4"; |
41 | iseq $3, "\x80"; |
42 | } |
43 | elsif ($IS_EBCDIC) { # EBCDIC (or equivalent), should be UTF-EBCDIC |
44 | iseq $2, "\x8C"; |
45 | iseq $3, "\x41"; |
46 | } |
47 | else { |
48 | SKIP: { |
49 | ok 0, "Unexpected platform", "ord ('A') = $ordA"; |
50 | skip "Unexpected platform"; |
51 | } |
52 | } |
53 | iseq $4, "b"; |
54 | } |
55 | |
56 | |
57 | SKIP: |
58 | { |
59 | local $Message = '\C matches octet'; |
60 | $_ = "\x{100}"; |
61 | ok /(\C)/g or skip q [\C doesn't match], 2; |
62 | if ($IS_ASCII) { |
63 | iseq $1, "\xC4"; |
64 | } |
65 | elsif ($IS_EBCDIC) { |
66 | iseq $1, "\x8C"; |
67 | } |
68 | else { |
69 | ok 0, "Unexpected platform", "ord ('A') = $ordA"; |
70 | } |
71 | ok /(\C)/g or skip q [\C doesn't match]; |
72 | if ($IS_ASCII) { |
73 | iseq $1, "\x80"; |
74 | } |
75 | elsif ($IS_EBCDIC) { |
76 | iseq $1, "\x41"; |
77 | } |
78 | else { |
79 | ok 0, "Unexpected platform", "ord ('A') = $ordA"; |
80 | } |
81 | } |
82 | |
83 | |
84 | { |
85 | # Japhy -- added 03/03/2001 |
86 | () = (my $str = "abc") =~ /(...)/; |
87 | $str = "def"; |
88 | iseq $1, "abc", 'Changing subject does not modify $1'; |
89 | } |
90 | |
91 | |
92 | SKIP: |
93 | { |
94 | # The trick is that in EBCDIC the explicit numeric range should |
95 | # match (as also in non-EBCDIC) but the explicit alphabetic range |
96 | # should not match. |
97 | ok "\x8e" =~ /[\x89-\x91]/, '"\x8e" =~ /[\x89-\x91]/'; |
98 | ok "\xce" =~ /[\xc9-\xd1]/, '"\xce" =~ /[\xc9-\xd1]/'; |
99 | |
100 | skip "Not an EBCDIC platform", 2 unless ord ('i') == 0x89 && |
101 | ord ('J') == 0xd1; |
102 | |
103 | # In most places these tests would succeed since \x8e does not |
104 | # in most character sets match 'i' or 'j' nor would \xce match |
105 | # 'I' or 'J', but strictly speaking these tests are here for |
106 | # the good of EBCDIC, so let's test these only there. |
107 | nok "\x8e" !~ /[i-j]/, '"\x8e" !~ /[i-j]/'; |
108 | nok "\xce" !~ /[I-J]/, '"\xce" !~ /[I-J]/'; |
109 | } |
110 | |
111 | |
112 | { |
113 | ok "\x{ab}" =~ /\x{ab}/, '"\x{ab}" =~ /\x{ab}/ '; |
114 | ok "\x{abcd}" =~ /\x{abcd}/, '"\x{abcd}" =~ /\x{abcd}/'; |
115 | } |
116 | |
117 | |
118 | { |
119 | local $Message = 'bug id 20001008.001'; |
120 | |
121 | my @x = ("stra\337e 138", "stra\337e 138"); |
122 | for (@x) { |
123 | ok s/(\d+)\s*([\w\-]+)/$1 . uc $2/e; |
124 | ok my ($latin) = /^(.+)(?:\s+\d)/; |
125 | iseq $latin, "stra\337e"; |
0f289c68 |
126 | ok $latin =~ s/stra\337e/straße/; |
e425a60b |
127 | # |
128 | # Previous code follows, but outcommented - there were no tests. |
129 | # |
130 | # $latin =~ s/stra\337e/straße/; # \303\237 after the 2nd a |
131 | # use utf8; # needed for the raw UTF-8 |
132 | # $latin =~ s!(s)tr(?:aß|s+e)!$1tr.!; # \303\237 after the a |
133 | } |
134 | } |
135 | |
136 | |
137 | { |
138 | local $Message = 'Test \x escapes'; |
139 | ok "ba\xd4c" =~ /([a\xd4]+)/ && $1 eq "a\xd4"; |
140 | ok "ba\xd4c" =~ /([a\xd4]+)/ && $1 eq "a\x{d4}"; |
141 | ok "ba\x{d4}c" =~ /([a\xd4]+)/ && $1 eq "a\x{d4}"; |
142 | ok "ba\x{d4}c" =~ /([a\xd4]+)/ && $1 eq "a\xd4"; |
143 | ok "ba\xd4c" =~ /([a\x{d4}]+)/ && $1 eq "a\xd4"; |
144 | ok "ba\xd4c" =~ /([a\x{d4}]+)/ && $1 eq "a\x{d4}"; |
145 | ok "ba\x{d4}c" =~ /([a\x{d4}]+)/ && $1 eq "a\x{d4}"; |
146 | ok "ba\x{d4}c" =~ /([a\x{d4}]+)/ && $1 eq "a\xd4"; |
147 | } |
148 | |
149 | |
e425a60b |
150 | SKIP: |
151 | { |
152 | local $Message = 'Match code points > 255'; |
153 | $_ = "abc\x{100}\x{200}\x{300}\x{380}\x{400}defg"; |
154 | ok /(.\x{300})./ or skip "No match", 4; |
155 | ok $` eq "abc\x{100}" && length ($`) == 4; |
156 | ok $& eq "\x{200}\x{300}\x{380}" && length ($&) == 3; |
157 | ok $' eq "\x{400}defg" && length ($') == 5; |
158 | ok $1 eq "\x{200}\x{300}" && length ($1) == 2; |
159 | } |
160 | |
161 | |
e425a60b |
162 | |
163 | { |
164 | my $x = "\x{10FFFD}"; |
165 | $x =~ s/(.)/$1/g; |
166 | ok ord($x) == 0x10FFFD && length($x) == 1, "From Robin Houston"; |
167 | } |
168 | |
169 | |
170 | { |
171 | my %d = ( |
172 | "7f" => [0, 0, 0], |
173 | "80" => [1, 1, 0], |
174 | "ff" => [1, 1, 0], |
175 | "100" => [0, 1, 1], |
176 | ); |
177 | SKIP: |
178 | while (my ($code, $match) = each %d) { |
179 | local $Message = "Properties of \\x$code"; |
180 | my $char = eval qq ["\\x{$code}"]; |
181 | my $i = 0; |
182 | ok (($char =~ /[\x80-\xff]/) xor !$$match [$i ++]); |
183 | ok (($char =~ /[\x80-\x{100}]/) xor !$$match [$i ++]); |
184 | ok (($char =~ /[\x{100}]/) xor !$$match [$i ++]); |
185 | } |
186 | } |
187 | |
188 | |
189 | { |
190 | # From Japhy |
191 | local $Message; |
192 | must_warn 'qr/(?c)/', '^Useless \(\?c\)'; |
193 | must_warn 'qr/(?-c)/', '^Useless \(\?-c\)'; |
194 | must_warn 'qr/(?g)/', '^Useless \(\?g\)'; |
195 | must_warn 'qr/(?-g)/', '^Useless \(\?-g\)'; |
196 | must_warn 'qr/(?o)/', '^Useless \(\?o\)'; |
197 | must_warn 'qr/(?-o)/', '^Useless \(\?-o\)'; |
198 | |
199 | # Now test multi-error regexes |
200 | must_warn 'qr/(?g-o)/', '^Useless \(\?g\).*\nUseless \(\?-o\)'; |
201 | must_warn 'qr/(?g-c)/', '^Useless \(\?g\).*\nUseless \(\?-c\)'; |
202 | # (?c) means (?g) error won't be thrown |
203 | must_warn 'qr/(?o-cg)/', '^Useless \(\?o\).*\nUseless \(\?-c\)'; |
204 | must_warn 'qr/(?ogc)/', '^Useless \(\?o\).*\nUseless \(\?g\).*\n' . |
205 | 'Useless \(\?c\)'; |
206 | } |
207 | |
208 | |
209 | { |
210 | local $Message = "/x tests"; |
211 | $_ = "foo"; |
212 | eval_ok <<" --"; |
213 | /f |
214 | o\r |
215 | o |
216 | \$ |
217 | /x |
218 | -- |
219 | eval_ok <<" --"; |
220 | /f |
221 | o |
222 | o |
223 | \$\r |
224 | /x |
225 | -- |
226 | } |
227 | |
228 | |
229 | { |
230 | local $Message = "/o feature"; |
231 | sub test_o {$_ [0] =~ /$_[1]/o; return $1} |
232 | iseq test_o ('abc', '(.)..'), 'a'; |
233 | iseq test_o ('abc', '..(.)'), 'a'; |
234 | } |
235 | |
e425a60b |
236 | { |
237 | # Test basic $^N usage outside of a regex |
238 | local $Message = '$^N usage outside of a regex'; |
239 | my $x = "abcdef"; |
240 | ok ($x =~ /cde/ and !defined $^N); |
241 | ok ($x =~ /(cde)/ and $^N eq "cde"); |
242 | ok ($x =~ /(c)(d)(e)/ and $^N eq "e"); |
243 | ok ($x =~ /(c(d)e)/ and $^N eq "cde"); |
244 | ok ($x =~ /(foo)|(c(d)e)/ and $^N eq "cde"); |
245 | ok ($x =~ /(c(d)e)|(foo)/ and $^N eq "cde"); |
246 | ok ($x =~ /(c(d)e)|(abc)/ and $^N eq "abc"); |
247 | ok ($x =~ /(c(d)e)|(abc)x/ and $^N eq "cde"); |
248 | ok ($x =~ /(c(d)e)(abc)?/ and $^N eq "cde"); |
249 | ok ($x =~ /(?:c(d)e)/ and $^N eq "d"); |
250 | ok ($x =~ /(?:c(d)e)(?:f)/ and $^N eq "d"); |
251 | ok ($x =~ /(?:([abc])|([def]))*/ and $^N eq "f"); |
252 | ok ($x =~ /(?:([ace])|([bdf]))*/ and $^N eq "f"); |
253 | ok ($x =~ /(([ace])|([bd]))*/ and $^N eq "e"); |
254 | {ok ($x =~ /(([ace])|([bdf]))*/ and $^N eq "f");} |
255 | ## Test to see if $^N is automatically localized -- it should now |
256 | ## have the value set in the previous test. |
257 | iseq $^N, "e", '$^N is automatically localized'; |
258 | |
259 | # Now test inside (?{ ... }) |
260 | local $Message = '$^N usage inside (?{ ... })'; |
261 | our ($y, $z); |
262 | ok ($x =~ /a([abc])(?{$y=$^N})c/ and $y eq "b"); |
263 | ok ($x =~ /a([abc]+)(?{$y=$^N})d/ and $y eq "bc"); |
264 | ok ($x =~ /a([abcdefg]+)(?{$y=$^N})d/ and $y eq "bc"); |
265 | ok ($x =~ /(a([abcdefg]+)(?{$y=$^N})d)(?{$z=$^N})e/ and $y eq "bc" |
266 | and $z eq "abcd"); |
267 | ok ($x =~ /(a([abcdefg]+)(?{$y=$^N})de)(?{$z=$^N})/ and $y eq "bc" |
268 | and $z eq "abcde"); |
269 | |
270 | } |
271 | |
272 | |
273 | SKIP: |
274 | { |
275 | ## Should probably put in tests for all the POSIX stuff, |
276 | ## but not sure how to guarantee a specific locale...... |
277 | |
278 | skip "Not an ASCII platform", 2 unless $IS_ASCII; |
279 | local $Message = 'Test [[:cntrl:]]'; |
280 | my $AllBytes = join "" => map {chr} 0 .. 255; |
281 | (my $x = $AllBytes) =~ s/[[:cntrl:]]//g; |
282 | iseq $x, join "", map {chr} 0x20 .. 0x7E, 0x80 .. 0xFF; |
283 | |
284 | ($x = $AllBytes) =~ s/[^[:cntrl:]]//g; |
285 | iseq $x, join "", map {chr} 0x00 .. 0x1F, 0x7F; |
286 | } |
287 | |
288 | |
289 | { |
290 | # With /s modifier UTF8 chars were interpreted as bytes |
291 | local $Message = "UTF-8 chars aren't bytes"; |
292 | my $a = "Hello \x{263A} World"; |
293 | my @a = ($a =~ /./gs); |
294 | iseq $#a, 12; |
295 | } |
296 | |
297 | |
298 | { |
299 | local $Message = '. matches \n with /s'; |
300 | my $str1 = "foo\nbar"; |
301 | my $str2 = "foo\n\x{100}bar"; |
302 | my ($a, $b) = map {chr} $IS_ASCII ? (0xc4, 0x80) : (0x8c, 0x41); |
303 | my @a; |
304 | @a = $str1 =~ /./g; iseq @a, 6; iseq "@a", "f o o b a r"; |
305 | @a = $str1 =~ /./gs; iseq @a, 7; iseq "@a", "f o o \n b a r"; |
306 | @a = $str1 =~ /\C/g; iseq @a, 7; iseq "@a", "f o o \n b a r"; |
307 | @a = $str1 =~ /\C/gs; iseq @a, 7; iseq "@a", "f o o \n b a r"; |
308 | @a = $str2 =~ /./g; iseq @a, 7; iseq "@a", "f o o \x{100} b a r"; |
309 | @a = $str2 =~ /./gs; iseq @a, 8; iseq "@a", "f o o \n \x{100} b a r"; |
310 | @a = $str2 =~ /\C/g; iseq @a, 9; iseq "@a", "f o o \n $a $b b a r"; |
311 | @a = $str2 =~ /\C/gs; iseq @a, 9; iseq "@a", "f o o \n $a $b b a r"; |
312 | } |
313 | |
314 | |
315 | { |
e425a60b |
316 | no warnings 'digit'; |
317 | # Check that \x## works. 5.6.1 and 5.005_03 fail some of these. |
318 | my $x; |
319 | $x = "\x4e" . "E"; |
320 | ok ($x =~ /^\x4EE$/, "Check only 2 bytes of hex are matched."); |
321 | |
322 | $x = "\x4e" . "i"; |
323 | ok ($x =~ /^\x4Ei$/, "Check that invalid hex digit stops it (2)"); |
324 | |
325 | $x = "\x4" . "j"; |
326 | ok ($x =~ /^\x4j$/, "Check that invalid hex digit stops it (1)"); |
327 | |
328 | $x = "\x0" . "k"; |
329 | ok ($x =~ /^\xk$/, "Check that invalid hex digit stops it (0)"); |
330 | |
331 | $x = "\x0" . "x"; |
332 | ok ($x =~ /^\xx$/, "\\xx isn't to be treated as \\0"); |
333 | |
334 | $x = "\x0" . "xa"; |
335 | ok ($x =~ /^\xxa$/, "\\xxa isn't to be treated as \\xa"); |
336 | |
337 | $x = "\x9" . "_b"; |
338 | ok ($x =~ /^\x9_b$/, "\\x9_b isn't to be treated as \\x9b"); |
339 | |
340 | # and now again in [] ranges |
341 | |
342 | $x = "\x4e" . "E"; |
343 | ok ($x =~ /^[\x4EE]{2}$/, "Check only 2 bytes of hex are matched."); |
344 | |
345 | $x = "\x4e" . "i"; |
346 | ok ($x =~ /^[\x4Ei]{2}$/, "Check that invalid hex digit stops it (2)"); |
347 | |
348 | $x = "\x4" . "j"; |
349 | ok ($x =~ /^[\x4j]{2}$/, "Check that invalid hex digit stops it (1)"); |
350 | |
351 | $x = "\x0" . "k"; |
352 | ok ($x =~ /^[\xk]{2}$/, "Check that invalid hex digit stops it (0)"); |
353 | |
354 | $x = "\x0" . "x"; |
355 | ok ($x =~ /^[\xx]{2}$/, "\\xx isn't to be treated as \\0"); |
356 | |
357 | $x = "\x0" . "xa"; |
358 | ok ($x =~ /^[\xxa]{3}$/, "\\xxa isn't to be treated as \\xa"); |
359 | |
360 | $x = "\x9" . "_b"; |
361 | ok ($x =~ /^[\x9_b]{3}$/, "\\x9_b isn't to be treated as \\x9b"); |
362 | |
363 | # Check that \x{##} works. 5.6.1 fails quite a few of these. |
364 | |
365 | $x = "\x9b"; |
366 | ok ($x =~ /^\x{9_b}$/, "\\x{9_b} is to be treated as \\x9b"); |
367 | |
368 | $x = "\x9b" . "y"; |
369 | ok ($x =~ /^\x{9_b}y$/, "\\x{9_b} is to be treated as \\x9b (again)"); |
370 | |
371 | $x = "\x9b" . "y"; |
372 | ok ($x =~ /^\x{9b_}y$/, "\\x{9b_} is to be treated as \\x9b"); |
373 | |
374 | $x = "\x9b" . "y"; |
375 | ok ($x =~ /^\x{9_bq}y$/, "\\x{9_bc} is to be treated as \\x9b"); |
376 | |
377 | $x = "\x0" . "y"; |
378 | ok ($x =~ /^\x{x9b}y$/, "\\x{x9b} is to be treated as \\x0"); |
379 | |
380 | $x = "\x0" . "y"; |
381 | ok ($x =~ /^\x{0x9b}y$/, "\\x{0x9b} is to be treated as \\x0"); |
382 | |
383 | $x = "\x9b" . "y"; |
384 | ok ($x =~ /^\x{09b}y$/, "\\x{09b} is to be treated as \\x9b"); |
385 | |
386 | $x = "\x9b"; |
387 | ok ($x =~ /^[\x{9_b}]$/, "\\x{9_b} is to be treated as \\x9b"); |
388 | |
389 | $x = "\x9b" . "y"; |
390 | ok ($x =~ /^[\x{9_b}y]{2}$/, |
391 | "\\x{9_b} is to be treated as \\x9b (again)"); |
392 | |
393 | $x = "\x9b" . "y"; |
394 | ok ($x =~ /^[\x{9b_}y]{2}$/, "\\x{9b_} is to be treated as \\x9b"); |
395 | |
396 | $x = "\x9b" . "y"; |
397 | ok ($x =~ /^[\x{9_bq}y]{2}$/, "\\x{9_bc} is to be treated as \\x9b"); |
398 | |
399 | $x = "\x0" . "y"; |
400 | ok ($x =~ /^[\x{x9b}y]{2}$/, "\\x{x9b} is to be treated as \\x0"); |
401 | |
402 | $x = "\x0" . "y"; |
403 | ok ($x =~ /^[\x{0x9b}y]{2}$/, "\\x{0x9b} is to be treated as \\x0"); |
404 | |
405 | $x = "\x9b" . "y"; |
406 | ok ($x =~ /^[\x{09b}y]{2}$/, "\\x{09b} is to be treated as \\x9b"); |
407 | |
408 | } |
409 | |
410 | |
411 | { |
412 | # High bit bug -- japhy |
413 | my $x = "ab\200d"; |
414 | ok $x =~ /.*?\200/, "High bit fine"; |
415 | } |
416 | |
417 | |
418 | { |
419 | # The basic character classes and Unicode |
420 | ok "\x{0100}" =~ /\w/, 'LATIN CAPITAL LETTER A WITH MACRON in /\w/'; |
421 | ok "\x{0660}" =~ /\d/, 'ARABIC-INDIC DIGIT ZERO in /\d/'; |
422 | ok "\x{1680}" =~ /\s/, 'OGHAM SPACE MARK in /\s/'; |
423 | } |
424 | |
425 | |
426 | { |
427 | local $Message = "Folding matches and Unicode"; |
428 | ok "a\x{100}" =~ /A/i; |
429 | ok "A\x{100}" =~ /a/i; |
430 | ok "a\x{100}" =~ /a/i; |
431 | ok "A\x{100}" =~ /A/i; |
432 | ok "\x{101}a" =~ /\x{100}/i; |
433 | ok "\x{100}a" =~ /\x{100}/i; |
434 | ok "\x{101}a" =~ /\x{101}/i; |
435 | ok "\x{100}a" =~ /\x{101}/i; |
436 | ok "a\x{100}" =~ /A\x{100}/i; |
437 | ok "A\x{100}" =~ /a\x{100}/i; |
438 | ok "a\x{100}" =~ /a\x{100}/i; |
439 | ok "A\x{100}" =~ /A\x{100}/i; |
440 | ok "a\x{100}" =~ /[A]/i; |
441 | ok "A\x{100}" =~ /[a]/i; |
442 | ok "a\x{100}" =~ /[a]/i; |
443 | ok "A\x{100}" =~ /[A]/i; |
444 | ok "\x{101}a" =~ /[\x{100}]/i; |
445 | ok "\x{100}a" =~ /[\x{100}]/i; |
446 | ok "\x{101}a" =~ /[\x{101}]/i; |
447 | ok "\x{100}a" =~ /[\x{101}]/i; |
448 | } |
449 | |
450 | |
451 | { |
452 | use charnames ':full'; |
453 | local $Message = "Folding 'LATIN LETTER A WITH GRAVE'"; |
454 | |
455 | my $lower = "\N{LATIN SMALL LETTER A WITH GRAVE}"; |
456 | my $UPPER = "\N{LATIN CAPITAL LETTER A WITH GRAVE}"; |
0f289c68 |
457 | |
e425a60b |
458 | ok $lower =~ m/$UPPER/i; |
459 | ok $UPPER =~ m/$lower/i; |
460 | ok $lower =~ m/[$UPPER]/i; |
461 | ok $UPPER =~ m/[$lower]/i; |
462 | |
463 | local $Message = "Folding 'GREEK LETTER ALPHA WITH VRACHY'"; |
464 | |
465 | $lower = "\N{GREEK CAPITAL LETTER ALPHA WITH VRACHY}"; |
466 | $UPPER = "\N{GREEK SMALL LETTER ALPHA WITH VRACHY}"; |
467 | |
468 | ok $lower =~ m/$UPPER/i; |
469 | ok $UPPER =~ m/$lower/i; |
470 | ok $lower =~ m/[$UPPER]/i; |
471 | ok $UPPER =~ m/[$lower]/i; |
472 | |
473 | local $Message = "Folding 'LATIN LETTER Y WITH DIAERESIS'"; |
474 | |
475 | $lower = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}"; |
476 | $UPPER = "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}"; |
477 | |
478 | ok $lower =~ m/$UPPER/i; |
479 | ok $UPPER =~ m/$lower/i; |
480 | ok $lower =~ m/[$UPPER]/i; |
481 | ok $UPPER =~ m/[$lower]/i; |
482 | } |
483 | |
484 | |
485 | { |
486 | use charnames ':full'; |
487 | local $PatchId = "13843"; |
488 | local $Message = "GREEK CAPITAL LETTER SIGMA vs " . |
489 | "COMBINING GREEK PERISPOMENI"; |
490 | |
491 | my $SIGMA = "\N{GREEK CAPITAL LETTER SIGMA}"; |
492 | my $char = "\N{COMBINING GREEK PERISPOMENI}"; |
493 | |
494 | may_not_warn sub {ok "_:$char:_" !~ m/_:$SIGMA:_/i}; |
495 | } |
496 | |
497 | |
498 | { |
499 | local $Message = '\X'; |
500 | use charnames ':full'; |
501 | |
502 | ok "a!" =~ /^(\X)!/ && $1 eq "a"; |
503 | ok "\xDF!" =~ /^(\X)!/ && $1 eq "\xDF"; |
504 | ok "\x{100}!" =~ /^(\X)!/ && $1 eq "\x{100}"; |
505 | ok "\x{100}\x{300}!" =~ /^(\X)!/ && $1 eq "\x{100}\x{300}"; |
506 | ok "\N{LATIN CAPITAL LETTER E}!" =~ /^(\X)!/ && |
507 | $1 eq "\N{LATIN CAPITAL LETTER E}"; |
508 | ok "\N{LATIN CAPITAL LETTER E}\N{COMBINING GRAVE ACCENT}!" |
509 | =~ /^(\X)!/ && |
510 | $1 eq "\N{LATIN CAPITAL LETTER E}\N{COMBINING GRAVE ACCENT}"; |
511 | |
512 | local $Message = '\C and \X'; |
513 | ok "!abc!" =~ /a\Cc/; |
514 | ok "!abc!" =~ /a\Xc/; |
515 | } |
516 | |
517 | |
518 | { |
519 | local $Message = "Final Sigma"; |
520 | |
521 | my $SIGMA = "\x{03A3}"; # CAPITAL |
522 | my $Sigma = "\x{03C2}"; # SMALL FINAL |
523 | my $sigma = "\x{03C3}"; # SMALL |
524 | |
525 | ok $SIGMA =~ /$SIGMA/i; |
526 | ok $SIGMA =~ /$Sigma/i; |
527 | ok $SIGMA =~ /$sigma/i; |
528 | |
529 | ok $Sigma =~ /$SIGMA/i; |
530 | ok $Sigma =~ /$Sigma/i; |
531 | ok $Sigma =~ /$sigma/i; |
532 | |
533 | ok $sigma =~ /$SIGMA/i; |
534 | ok $sigma =~ /$Sigma/i; |
535 | ok $sigma =~ /$sigma/i; |
0f289c68 |
536 | |
e425a60b |
537 | ok $SIGMA =~ /[$SIGMA]/i; |
538 | ok $SIGMA =~ /[$Sigma]/i; |
539 | ok $SIGMA =~ /[$sigma]/i; |
540 | |
541 | ok $Sigma =~ /[$SIGMA]/i; |
542 | ok $Sigma =~ /[$Sigma]/i; |
543 | ok $Sigma =~ /[$sigma]/i; |
544 | |
545 | ok $sigma =~ /[$SIGMA]/i; |
546 | ok $sigma =~ /[$Sigma]/i; |
547 | ok $sigma =~ /[$sigma]/i; |
548 | |
549 | local $Message = "More final Sigma"; |
550 | |
551 | my $S3 = "$SIGMA$Sigma$sigma"; |
552 | |
553 | ok ":$S3:" =~ /:(($SIGMA)+):/i && $1 eq $S3 && $2 eq $sigma; |
554 | ok ":$S3:" =~ /:(($Sigma)+):/i && $1 eq $S3 && $2 eq $sigma; |
555 | ok ":$S3:" =~ /:(($sigma)+):/i && $1 eq $S3 && $2 eq $sigma; |
556 | |
557 | ok ":$S3:" =~ /:(([$SIGMA])+):/i && $1 eq $S3 && $2 eq $sigma; |
558 | ok ":$S3:" =~ /:(([$Sigma])+):/i && $1 eq $S3 && $2 eq $sigma; |
559 | ok ":$S3:" =~ /:(([$sigma])+):/i && $1 eq $S3 && $2 eq $sigma; |
560 | } |
561 | |
562 | |
563 | { |
564 | use charnames ':full'; |
565 | local $Message = "Parlez-Vous " . |
566 | "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais?"; |
567 | |
568 | ok "Fran\N{LATIN SMALL LETTER C}ais" =~ /Fran.ais/ && |
569 | $& eq "Francais"; |
570 | ok "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais" =~ /Fran.ais/ && |
571 | $& eq "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais"; |
572 | ok "Fran\N{LATIN SMALL LETTER C}ais" =~ /Fran\Cais/ && |
573 | $& eq "Francais"; |
574 | # COMBINING CEDILLA is two bytes when encoded |
575 | ok "Franc\N{COMBINING CEDILLA}ais" =~ /Franc\C\Cais/; |
576 | ok "Fran\N{LATIN SMALL LETTER C}ais" =~ /Fran\Xais/ && |
577 | $& eq "Francais"; |
578 | ok "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais" =~ /Fran\Xais/ && |
579 | $& eq "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais"; |
580 | ok "Franc\N{COMBINING CEDILLA}ais" =~ /Fran\Xais/ && |
581 | $& eq "Franc\N{COMBINING CEDILLA}ais"; |
582 | ok "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais" =~ |
583 | /Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais/ && |
584 | $& eq "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais"; |
585 | ok "Franc\N{COMBINING CEDILLA}ais" =~ /Franc\N{COMBINING CEDILLA}ais/ && |
586 | $& eq "Franc\N{COMBINING CEDILLA}ais"; |
587 | |
588 | my @f = ( |
589 | ["Fran\N{LATIN SMALL LETTER C}ais", "Francais"], |
590 | ["Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais", |
591 | "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais"], |
592 | ["Franc\N{COMBINING CEDILLA}ais", "Franc\N{COMBINING CEDILLA}ais"], |
593 | ); |
594 | foreach my $entry (@f) { |
595 | my ($subject, $match) = @$entry; |
596 | ok $subject =~ /Fran(?:c\N{COMBINING CEDILLA}?| |
597 | \N{LATIN SMALL LETTER C WITH CEDILLA})ais/x && |
598 | $& eq $match; |
599 | } |
600 | } |
601 | |
602 | |
603 | { |
604 | local $Message = "Lingering (and useless) UTF8 flag doesn't mess up /i"; |
605 | my $pat = "ABcde"; |
606 | my $str = "abcDE\x{100}"; |
607 | chop $str; |
608 | ok $str =~ /$pat/i; |
609 | |
610 | $pat = "ABcde\x{100}"; |
611 | $str = "abcDE"; |
612 | chop $pat; |
613 | ok $str =~ /$pat/i; |
614 | |
615 | $pat = "ABcde\x{100}"; |
616 | $str = "abcDE\x{100}"; |
617 | chop $pat; |
618 | chop $str; |
619 | ok $str =~ /$pat/i; |
620 | } |
621 | |
622 | |
623 | { |
624 | use charnames ':full'; |
625 | local $Message = "LATIN SMALL LETTER SHARP S " . |
626 | "(\N{LATIN SMALL LETTER SHARP S})"; |
627 | |
628 | ok "\N{LATIN SMALL LETTER SHARP S}" =~ |
629 | /\N{LATIN SMALL LETTER SHARP S}/; |
630 | ok "\N{LATIN SMALL LETTER SHARP S}" =~ |
631 | /\N{LATIN SMALL LETTER SHARP S}/i; |
632 | ok "\N{LATIN SMALL LETTER SHARP S}" =~ |
633 | /[\N{LATIN SMALL LETTER SHARP S}]/; |
634 | ok "\N{LATIN SMALL LETTER SHARP S}" =~ |
635 | /[\N{LATIN SMALL LETTER SHARP S}]/i; |
636 | |
637 | ok "ss" =~ /\N{LATIN SMALL LETTER SHARP S}/i; |
638 | ok "SS" =~ /\N{LATIN SMALL LETTER SHARP S}/i; |
639 | ok "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i; |
640 | ok "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i; |
641 | |
642 | ok "\N{LATIN SMALL LETTER SHARP S}" =~ /ss/i; |
643 | ok "\N{LATIN SMALL LETTER SHARP S}" =~ /SS/i; |
0f289c68 |
644 | |
e425a60b |
645 | local $Message = "Unoptimized named sequence in class"; |
646 | ok "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}x]/i; |
647 | ok "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}x]/i; |
648 | ok "\N{LATIN SMALL LETTER SHARP S}" =~ |
649 | /[\N{LATIN SMALL LETTER SHARP S}x]/; |
650 | ok "\N{LATIN SMALL LETTER SHARP S}" =~ |
651 | /[\N{LATIN SMALL LETTER SHARP S}x]/i; |
652 | } |
653 | |
654 | |
655 | { |
656 | # More whitespace: U+0085, U+2028, U+2029\n"; |
657 | |
658 | # U+0085, U+00A0 need to be forced to be Unicode, the \x{100} does that. |
659 | SKIP: { |
660 | skip "EBCDIC platform", 4 if $IS_EBCDIC; |
661 | # Do \x{0015} and \x{0041} match \s in EBCDIC? |
662 | ok "<\x{100}\x{0085}>" =~ /<\x{100}\s>/, '\x{0085} in \s'; |
663 | ok "<\x{0085}>" =~ /<\v>/, '\x{0085} in \v'; |
664 | ok "<\x{100}\x{00A0}>" =~ /<\x{100}\s>/, '\x{00A0} in \s'; |
665 | ok "<\x{00A0}>" =~ /<\h>/, '\x{00A0} in \h'; |
666 | } |
667 | my @h = map {sprintf "%05x" => $_} 0x01680, 0x0180E, 0x02000 .. 0x0200A, |
668 | 0x0202F, 0x0205F, 0x03000; |
669 | my @v = map {sprintf "%05x" => $_} 0x02028, 0x02029; |
670 | |
671 | my @H = map {sprintf "%05x" => $_} 0x01361, 0x0200B, 0x02408, 0x02420, |
672 | 0x0303F, 0xE0020; |
673 | my @V = map {sprintf "%05x" => $_} 0x0008A .. 0x0008D, 0x00348, 0x10100, |
674 | 0xE005F, 0xE007C; |
675 | |
676 | for my $hex (@h) { |
677 | my $str = eval qq ["<\\x{$hex}>"]; |
678 | ok $str =~ /<\s>/, "\\x{$hex} in \\s"; |
679 | ok $str =~ /<\h>/, "\\x{$hex} in \\h"; |
680 | ok $str !~ /<\v>/, "\\x{$hex} not in \\v"; |
681 | } |
682 | |
683 | for my $hex (@v) { |
684 | my $str = eval qq ["<\\x{$hex}>"]; |
685 | ok $str =~ /<\s>/, "\\x{$hex} in \\s"; |
686 | ok $str =~ /<\v>/, "\\x{$hex} in \\v"; |
687 | ok $str !~ /<\h>/, "\\x{$hex} not in \\h"; |
688 | } |
689 | |
690 | for my $hex (@H) { |
691 | my $str = eval qq ["<\\x{$hex}>"]; |
692 | ok $str =~ /<\S>/, "\\x{$hex} in \\S"; |
693 | ok $str =~ /<\H>/, "\\x{$hex} in \\H"; |
694 | } |
695 | |
696 | for my $hex (@V) { |
697 | my $str = eval qq ["<\\x{$hex}>"]; |
698 | ok $str =~ /<\S>/, "\\x{$hex} in \\S"; |
699 | ok $str =~ /<\V>/, "\\x{$hex} in \\V"; |
700 | } |
701 | } |
702 | |
703 | |
704 | { |
705 | # . with /s should work on characters, as opposed to bytes |
706 | local $Message = ". with /s works on characters, not bytes"; |
707 | |
708 | my $s = "\x{e4}\x{100}"; |
709 | # This is not expected to match: the point is that |
710 | # neither should we get "Malformed UTF-8" warnings. |
711 | may_not_warn sub {$s =~ /\G(.+?)\n/gcs}, "No 'Malformed UTF-8' warning"; |
712 | |
713 | my @c; |
714 | push @c => $1 while $s =~ /\G(.)/gs; |
715 | |
716 | local $" = ""; |
717 | iseq "@c", $s; |
718 | |
719 | # Test only chars < 256 |
720 | my $t1 = "Q003\n\n\x{e4}\x{f6}\n\nQ004\n\n\x{e7}"; |
721 | my $r1 = ""; |
722 | while ($t1 =~ / \G ( .+? ) \n\s+ ( .+? ) ( $ | \n\s+ ) /xgcs) { |
0f289c68 |
723 | $r1 .= $1 . $2; |
e425a60b |
724 | } |
725 | |
726 | my $t2 = $t1 . "\x{100}"; # Repeat with a larger char |
727 | my $r2 = ""; |
728 | while ($t2 =~ / \G ( .+? ) \n\s+ ( .+? ) ( $ | \n\s+ ) /xgcs) { |
0f289c68 |
729 | $r2 .= $1 . $2; |
e425a60b |
730 | } |
731 | $r2 =~ s/\x{100}//; |
732 | |
733 | iseq $r1, $r2; |
734 | } |
735 | |
736 | |
737 | { |
738 | local $Message = "Unicode lookbehind"; |
739 | ok "A\x{100}B" =~ /(?<=A.)B/; |
740 | ok "A\x{200}\x{300}B" =~ /(?<=A..)B/; |
741 | ok "\x{400}AB" =~ /(?<=\x{400}.)B/; |
742 | ok "\x{500}\x{600}B" =~ /(?<=\x{500}.)B/; |
743 | |
744 | # Original code also contained: |
745 | # ok "\x{500\x{600}}B" =~ /(?<=\x{500}.)B/; |
746 | # but that looks like a typo. |
747 | } |
748 | |
749 | |
750 | { |
751 | local $Message = 'UTF-8 hash keys and /$/'; |
752 | # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters |
753 | # /2002-01/msg01327.html |
754 | |
755 | my $u = "a\x{100}"; |
756 | my $v = substr ($u, 0, 1); |
757 | my $w = substr ($u, 1, 1); |
758 | my %u = ($u => $u, $v => $v, $w => $w); |
759 | for (keys %u) { |
760 | my $m1 = /^\w*$/ ? 1 : 0; |
761 | my $m2 = $u {$_} =~ /^\w*$/ ? 1 : 0; |
762 | iseq $m1, $m2; |
763 | } |
764 | } |
765 | |
766 | |
767 | { |
e425a60b |
768 | local $Message = "No SEGV in s/// and UTF-8"; |
769 | my $s = "s#\x{100}" x 4; |
770 | ok $s =~ s/[^\w]/ /g; |
2e84be61 |
771 | if ( 1 or $ENV{PERL_TEST_LEGACY_POSIX_CC} ) { |
e425a60b |
772 | iseq $s, "s \x{100}" x 4; |
773 | } |
774 | else { |
775 | iseq $s, "s " x 4; |
776 | } |
777 | } |
778 | |
779 | |
780 | { |
781 | local $Message = "UTF-8 bug (maybe already known?)"; |
782 | my $u = "foo"; |
783 | $u =~ s/./\x{100}/g; |
784 | iseq $u, "\x{100}\x{100}\x{100}"; |
785 | |
786 | $u = "foobar"; |
787 | $u =~ s/[ao]/\x{100}/g; |
788 | iseq $u, "f\x{100}\x{100}b\x{100}r"; |
789 | |
790 | $u =~ s/\x{100}/e/g; |
791 | iseq $u, "feeber"; |
792 | } |
793 | |
794 | |
795 | { |
796 | local $Message = "UTF-8 bug with s///"; |
797 | # check utf8/non-utf8 mixtures |
798 | # try to force all float/anchored check combinations |
799 | |
800 | my $c = "\x{100}"; |
801 | my $subst; |
802 | for my $re ("xx.*$c", "x.*$c$c", "$c.*xx", "$c$c.*x", |
803 | "xx.*(?=$c)", "(?=$c).*xx",) { |
804 | ok "xxx" !~ /$re/; |
805 | ok +($subst = "xxx") !~ s/$re//; |
806 | } |
807 | for my $re ("xx.*$c*", "$c*.*xx") { |
808 | ok "xxx" =~ /$re/; |
809 | ok +($subst = "xxx") =~ s/$re//; |
810 | iseq $subst, ""; |
811 | } |
812 | for my $re ("xxy*", "y*xx") { |
813 | ok "xx$c" =~ /$re/; |
814 | ok +($subst = "xx$c") =~ s/$re//; |
815 | iseq $subst, $c; |
816 | ok "xy$c" !~ /$re/; |
817 | ok +($subst = "xy$c") !~ s/$re//; |
818 | } |
819 | for my $re ("xy$c*z", "x$c*yz") { |
820 | ok "xyz" =~ /$re/; |
821 | ok +($subst = "xyz") =~ s/$re//; |
822 | iseq $subst, ""; |
823 | } |
824 | } |
825 | |
826 | |
827 | { |
828 | local $Message = "qr /.../x"; |
829 | my $R = qr / A B C # D E/x; |
830 | ok "ABCDE" =~ $R && $& eq "ABC"; |
831 | ok "ABCDE" =~ /$R/ && $& eq "ABC"; |
832 | ok "ABCDE" =~ m/$R/ && $& eq "ABC"; |
833 | ok "ABCDE" =~ /($R)/ && $1 eq "ABC"; |
834 | ok "ABCDE" =~ m/($R)/ && $1 eq "ABC"; |
835 | } |
836 | |
837 | |
e425a60b |
838 | |
0f289c68 |
839 | |
e425a60b |
840 | { |
841 | local $\; |
842 | $_ = 'aaaaaaaaaa'; |
843 | utf8::upgrade($_); chop $_; $\="\n"; |
844 | ok /[^\s]+/, 'm/[^\s]/ utf8'; |
845 | ok /[^\d]+/, 'm/[^\d]/ utf8'; |
846 | ok +($a = $_, $_ =~ s/[^\s]+/./g), 's/[^\s]/ utf8'; |
847 | ok +($a = $_, $a =~ s/[^\d]+/./g), 's/[^\s]/ utf8'; |
848 | } |
849 | |
850 | |
e425a60b |
851 | |
852 | |
853 | { |
854 | # Subject: Odd regexp behavior |
855 | # From: Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk> |
856 | # Date: Wed, 26 Feb 2003 16:53:12 +0000 |
857 | # Message-Id: <E18o4nw-0008Ly-00@wisbech.cl.cam.ac.uk> |
858 | # To: perl-unicode@perl.org |
859 | |
860 | local $Message = 'Markus Kuhn 2003-02-26'; |
0f289c68 |
861 | |
e425a60b |
862 | my $x = "\x{2019}\nk"; |
863 | ok $x =~ s/(\S)\n(\S)/$1 $2/sg; |
864 | ok $x eq "\x{2019} k"; |
865 | |
866 | $x = "b\nk"; |
867 | ok $x =~ s/(\S)\n(\S)/$1 $2/sg; |
868 | ok $x eq "b k"; |
869 | |
870 | ok "\x{2019}" =~ /\S/; |
871 | } |
872 | |
873 | |
874 | { |
e425a60b |
875 | # XXX DAPM 13-Apr-06. Recursive split is still broken. It's only luck it |
876 | # hasn't been crashing. Disable this test until it is fixed properly. |
877 | # XXX also check what it returns rather than just doing ok(1,...) |
878 | # split /(?{ split "" })/, "abc"; |
879 | local $TODO = "Recursive split is still broken"; |
880 | ok 0, 'cache_re & "(?{": it dumps core in 5.6.1 & 5.8.0'; |
881 | } |
882 | |
883 | |
884 | { |
885 | ok "\x{100}\n" =~ /\x{100}\n$/, "UTF-8 length cache and fbm_compile"; |
886 | } |
887 | |
888 | |
889 | { |
890 | package Str; |
891 | use overload q /""/ => sub {${$_ [0]};}; |
892 | sub new {my ($c, $v) = @_; bless \$v, $c;} |
893 | |
894 | package main; |
895 | $_ = Str -> new ("a\x{100}/\x{100}b"); |
896 | ok join (":", /\b(.)\x{100}/g) eq "a:/", "re_intuit_start and PL_bostr"; |
897 | } |
898 | |
e425a60b |
899 | { |
900 | my $re = qq /^([^X]*)X/; |
901 | utf8::upgrade ($re); |
902 | ok "\x{100}X" =~ /$re/, "S_cl_and ANYOF_UNICODE & ANYOF_INVERTED"; |
903 | } |
904 | |
e425a60b |
905 | { |
906 | ok "123\x{100}" =~ /^.*1.*23\x{100}$/, |
907 | 'UTF-8 + multiple floating substr'; |
908 | } |
909 | |
e425a60b |
910 | { |
911 | local $Message = '<20030808193656.5109.1@llama.ni-s.u-net.com>'; |
912 | |
913 | # LATIN SMALL/CAPITAL LETTER A WITH MACRON |
914 | ok " \x{101}" =~ qr/\x{100}/i; |
915 | |
916 | # LATIN SMALL/CAPITAL LETTER A WITH RING BELOW |
917 | ok " \x{1E01}" =~ qr/\x{1E00}/i; |
918 | |
919 | # DESERET SMALL/CAPITAL LETTER LONG I |
920 | ok " \x{10428}" =~ qr/\x{10400}/i; |
921 | |
922 | # LATIN SMALL/CAPITAL LETTER A WITH RING BELOW + 'X' |
923 | ok " \x{1E01}x" =~ qr/\x{1E00}X/i; |
924 | } |
925 | |
e425a60b |
926 | { |
927 | for (120 .. 130) { |
928 | my $head = 'x' x $_; |
929 | local $Message = q [Don't misparse \x{...} in regexp ] . |
930 | q [near 127 char EXACT limit]; |
931 | for my $tail ('\x{0061}', '\x{1234}', '\x61') { |
932 | eval_ok qq ["$head$tail" =~ /$head$tail/]; |
933 | } |
934 | local $Message = q [Don't misparse \N{...} in regexp ] . |
935 | q [near 127 char EXACT limit]; |
936 | for my $tail ('\N{SNOWFLAKE}') { |
937 | eval_ok qq [use charnames ':full'; |
938 | "$head$tail" =~ /$head$tail/]; |
939 | } |
940 | } |
941 | } |
942 | |
e425a60b |
943 | { # TRIE related |
944 | our @got = (); |
945 | "words" =~ /(word|word|word)(?{push @got, $1})s$/; |
946 | iseq @got, 1, "TRIE optimation"; |
947 | |
948 | @got = (); |
949 | "words" =~ /(word|word|word)(?{push @got,$1})s$/i; |
950 | iseq @got, 1,"TRIEF optimisation"; |
951 | |
952 | my @nums = map {int rand 1000} 1 .. 100; |
953 | my $re = "(" . (join "|", @nums) . ")"; |
954 | $re = qr/\b$re\b/; |
955 | |
956 | foreach (@nums) { |
957 | ok $_ =~ /$re/, "Trie nums"; |
958 | } |
959 | |
960 | $_ = join " ", @nums; |
961 | @got = (); |
962 | push @got, $1 while /$re/g; |
963 | |
964 | my %count; |
965 | $count {$_} ++ for @got; |
966 | my $ok = 1; |
967 | for (@nums) { |
968 | $ok = 0 if --$count {$_} < 0; |
969 | } |
970 | ok $ok, "Trie min count matches"; |
971 | } |
972 | |
973 | |
974 | { |
975 | # TRIE related |
976 | # LATIN SMALL/CAPITAL LETTER A WITH MACRON |
977 | ok "foba \x{101}foo" =~ qr/(foo|\x{100}foo|bar)/i && |
978 | $1 eq "\x{101}foo", |
979 | "TRIEF + LATIN SMALL/CAPITAL LETTER A WITH MACRON"; |
980 | |
981 | # LATIN SMALL/CAPITAL LETTER A WITH RING BELOW |
982 | ok "foba \x{1E01}foo" =~ qr/(foo|\x{1E00}foo|bar)/i && |
983 | $1 eq "\x{1E01}foo", |
984 | "TRIEF + LATIN SMALL/CAPITAL LETTER A WITH RING BELOW"; |
985 | |
986 | # DESERET SMALL/CAPITAL LETTER LONG I |
987 | ok "foba \x{10428}foo" =~ qr/(foo|\x{10400}foo|bar)/i && |
988 | $1 eq "\x{10428}foo", |
989 | "TRIEF + DESERET SMALL/CAPITAL LETTER LONG I"; |
990 | |
991 | # LATIN SMALL/CAPITAL LETTER A WITH RING BELOW + 'X' |
992 | ok "foba \x{1E01}xfoo" =~ qr/(foo|\x{1E00}Xfoo|bar)/i && |
993 | $1 eq "\x{1E01}xfoo", |
994 | "TRIEF + LATIN SMALL/CAPITAL LETTER A WITH RING BELOW + 'X'"; |
995 | |
996 | use charnames ':full'; |
997 | |
998 | my $s = "\N{LATIN SMALL LETTER SHARP S}"; |
999 | ok "foba ba$s" =~ qr/(foo|Ba$s|bar)/i && $1 eq "ba$s", |
1000 | "TRIEF + LATIN SMALL LETTER SHARP S =~ ss"; |
1001 | ok "foba ba$s" =~ qr/(Ba$s|foo|bar)/i && $1 eq "ba$s", |
1002 | "TRIEF + LATIN SMALL LETTER SHARP S =~ ss"; |
1003 | ok "foba ba$s" =~ qr/(foo|bar|Ba$s)/i && $1 eq "ba$s", |
1004 | "TRIEF + LATIN SMALL LETTER SHARP S =~ ss"; |
1005 | |
1006 | ok "foba ba$s" =~ qr/(foo|Bass|bar)/i && $1 eq "ba$s", |
1007 | "TRIEF + LATIN SMALL LETTER SHARP S =~ ss"; |
1008 | |
1009 | ok "foba ba$s" =~ qr/(foo|BaSS|bar)/i && $1 eq "ba$s", |
1010 | "TRIEF + LATIN SMALL LETTER SHARP S =~ SS"; |
1011 | |
1012 | ok "foba ba${s}pxySS$s$s" =~ qr/(b(?:a${s}t|a${s}f|a${s}p)[xy]+$s*)/i |
1013 | && $1 eq "ba${s}pxySS$s$s", |
1014 | "COMMON PREFIX TRIEF + LATIN SMALL LETTER SHARP S"; |
1015 | } |
1016 | |
1017 | |
e425a60b |
1018 | |
1019 | |
1020 | { |
0f289c68 |
1021 | BEGIN { |
1022 | unshift @INC, 'lib'; |
1023 | } |
e425a60b |
1024 | use Cname; |
0f289c68 |
1025 | |
e425a60b |
1026 | ok 'fooB' =~ /\N{foo}[\N{B}\N{b}]/, "Passthrough charname"; |
1027 | my $test = 1233; |
1028 | # |
1029 | # Why doesn't must_warn work here? |
1030 | # |
1031 | my $w; |
1032 | local $SIG {__WARN__} = sub {$w .= "@_"}; |
1033 | eval 'q(xxWxx) =~ /[\N{WARN}]/'; |
1034 | ok $w && $w =~ /^Ignoring excess chars from/, |
1035 | "Ignoring excess chars warning"; |
1036 | |
1037 | undef $w; |
1038 | eval q [ok "\0" !~ /[\N{EMPTY-STR}XY]/, |
1039 | "Zerolength charname in charclass doesn't match \\0"]; |
1040 | ok $w && $w =~ /^Ignoring zero length/, |
1041 | 'Ignoring zero length \N{%} in character class warning'; |
1042 | |
1043 | ok 'AB' =~ /(\N{EVIL})/ && $1 eq 'A', 'Charname caching $1'; |
1044 | ok 'ABC' =~ /(\N{EVIL})/, 'Charname caching $1'; |
1045 | ok 'xy' =~ /x\N{EMPTY-STR}y/, |
1046 | 'Empty string charname produces NOTHING node'; |
1047 | ok '' =~ /\N{EMPTY-STR}/, |
1048 | 'Empty string charname produces NOTHING node'; |
0f289c68 |
1049 | |
e425a60b |
1050 | } |
1051 | |
1052 | |
1053 | { |
1054 | use charnames ':full'; |
1055 | |
1056 | ok 'aabc' !~ /a\N{PLUS SIGN}b/, '/a\N{PLUS SIGN}b/ against aabc'; |
1057 | ok 'a+bc' =~ /a\N{PLUS SIGN}b/, '/a\N{PLUS SIGN}b/ against a+bc'; |
1058 | |
1059 | ok ' A B' =~ /\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}/, |
1060 | 'Intermixed named and unicode escapes'; |
1061 | ok "\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}" =~ |
1062 | /\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}/, |
1063 | 'Intermixed named and unicode escapes'; |
1064 | ok "\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}" =~ |
1065 | /[\N{SPACE}\N{U+0041}][\N{SPACE}\N{U+0042}]/, |
0f289c68 |
1066 | 'Intermixed named and unicode escapes'; |
e425a60b |
1067 | } |
1068 | |
1069 | |
1070 | { |
1071 | our $brackets; |
1072 | $brackets = qr{ |
1073 | { (?> [^{}]+ | (??{ $brackets }) )* } |
1074 | }x; |
1075 | |
1076 | ok "{b{c}d" !~ m/^((??{ $brackets }))/, "Bracket mismatch"; |
1077 | |
1078 | SKIP: { |
1079 | our @stack = (); |
1080 | my @expect = qw( |
1081 | stuff1 |
1082 | stuff2 |
1083 | <stuff1>and<stuff2> |
1084 | right |
1085 | <right> |
1086 | <<right>> |
1087 | <<<right>>> |
1088 | <<stuff1>and<stuff2>><<<<right>>>> |
1089 | ); |
1090 | |
1091 | local $_ = '<<<stuff1>and<stuff2>><<<<right>>>>>'; |
1092 | ok /^(<((?:(?>[^<>]+)|(?1))*)>(?{push @stack, $2 }))$/, |
1093 | "Recursion matches"; |
1094 | iseq @stack, @expect, "Right amount of matches" |
1095 | or skip "Won't test individual results as count isn't equal", |
1096 | 0 + @expect; |
1097 | my $idx = 0; |
1098 | foreach my $expect (@expect) { |
1099 | iseq $stack [$idx], $expect, |
1100 | "Expecting '$expect' at stack pos #$idx"; |
1101 | $idx ++; |
1102 | } |
1103 | } |
1104 | } |
1105 | |
1106 | |
1107 | { |
1108 | my $s = '123453456'; |
1109 | $s =~ s/(?<digits>\d+)\k<digits>/$+{digits}/; |
1110 | ok $s eq '123456', 'Named capture (angle brackets) s///'; |
1111 | $s = '123453456'; |
1112 | $s =~ s/(?'digits'\d+)\k'digits'/$+{digits}/; |
0f289c68 |
1113 | ok $s eq '123456', 'Named capture (single quotes) s///'; |
e425a60b |
1114 | } |
1115 | |
1116 | |
1117 | { |
1118 | my @ary = ( |
1119 | pack('U', 0x00F1), # n-tilde |
1120 | '_'.pack('U', 0x00F1), # _ + n-tilde |
1121 | 'c'.pack('U', 0x0327), # c + cedilla |
1122 | pack('U*', 0x00F1, 0x0327), # n-tilde + cedilla |
e425a60b |
1123 | pack('U', 0x0391), # ALPHA |
1124 | pack('U', 0x0391).'2', # ALPHA + 2 |
1125 | pack('U', 0x0391).'_', # ALPHA + _ |
1126 | ); |
1127 | |
1128 | for my $uni (@ary) { |
1129 | my ($r1, $c1, $r2, $c2) = eval qq { |
1130 | use utf8; |
1131 | scalar ("..foo foo.." =~ /(?'${uni}'foo) \\k'${uni}'/), |
1132 | \$+{${uni}}, |
1133 | scalar ("..bar bar.." =~ /(?<${uni}>bar) \\k<${uni}>/), |
1134 | \$+{${uni}}; |
1135 | }; |
1136 | ok $r1, "Named capture UTF (?'')"; |
1137 | ok defined $c1 && $c1 eq 'foo', "Named capture UTF \%+"; |
1138 | ok $r2, "Named capture UTF (?<>)"; |
1139 | ok defined $c2 && $c2 eq 'bar', "Named capture UTF \%+"; |
1140 | } |
1141 | } |
1142 | |
e425a60b |
1143 | { |
1144 | my $s = 'foo bar baz'; |
1145 | my @res; |
1146 | if ('1234' =~ /(?<A>1)(?<B>2)(?<A>3)(?<B>4)/) { |
1147 | foreach my $name (sort keys(%-)) { |
1148 | my $ary = $- {$name}; |
1149 | foreach my $idx (0 .. $#$ary) { |
1150 | push @res, "$name:$idx:$ary->[$idx]"; |
1151 | } |
1152 | } |
1153 | } |
1154 | my @expect = qw (A:0:1 A:1:3 B:0:2 B:1:4); |
1155 | iseq "@res", "@expect", "Check %-"; |
1156 | eval' |
1157 | no warnings "uninitialized"; |
1158 | print for $- {this_key_doesnt_exist}; |
1159 | '; |
1160 | ok !$@,'lvalue $- {...} should not throw an exception'; |
1161 | } |
1162 | |
e425a60b |
1163 | { |
1164 | # \, breaks {3,4} |
1165 | ok "xaaay" !~ /xa{3\,4}y/, '\, in a pattern'; |
1166 | ok "xa{3,4}y" =~ /xa{3\,4}y/, '\, in a pattern'; |
1167 | |
1168 | # \c\ followed by _ |
1169 | ok "x\c_y" !~ /x\c\_y/, '\_ in a pattern'; |
1170 | ok "x\c\_y" =~ /x\c\_y/, '\_ in a pattern'; |
1171 | |
1172 | # \c\ followed by other characters |
1173 | for my $c ("z", "\0", "!", chr(254), chr(256)) { |
1174 | my $targ = "a\034$c"; |
1175 | my $reg = "a\\c\\$c"; |
1176 | ok eval ("qq/$targ/ =~ /$reg/"), "\\c\\ in pattern"; |
1177 | } |
1178 | } |
1179 | |
e425a60b |
1180 | { # Test the (*PRUNE) pattern |
1181 | our $count = 0; |
1182 | 'aaab' =~ /a+b?(?{$count++})(*FAIL)/; |
1183 | iseq $count, 9, "Expect 9 for no (*PRUNE)"; |
1184 | $count = 0; |
1185 | 'aaab' =~ /a+b?(*PRUNE)(?{$count++})(*FAIL)/; |
1186 | iseq $count, 3, "Expect 3 with (*PRUNE)"; |
1187 | local $_ = 'aaab'; |
1188 | $count = 0; |
1189 | 1 while /.(*PRUNE)(?{$count++})(*FAIL)/g; |
1190 | iseq $count, 4, "/.(*PRUNE)/"; |
1191 | $count = 0; |
1192 | 'aaab' =~ /a+b?(??{'(*PRUNE)'})(?{$count++})(*FAIL)/; |
1193 | iseq $count, 3, "Expect 3 with (*PRUNE)"; |
1194 | local $_ = 'aaab'; |
1195 | $count = 0; |
1196 | 1 while /.(??{'(*PRUNE)'})(?{$count++})(*FAIL)/g; |
1197 | iseq $count, 4, "/.(*PRUNE)/"; |
1198 | } |
1199 | |
1200 | |
1201 | { # Test the (*SKIP) pattern |
1202 | our $count = 0; |
1203 | 'aaab' =~ /a+b?(*SKIP)(?{$count++})(*FAIL)/; |
1204 | iseq $count, 1, "Expect 1 with (*SKIP)"; |
1205 | local $_ = 'aaab'; |
1206 | $count = 0; |
1207 | 1 while /.(*SKIP)(?{$count++})(*FAIL)/g; |
1208 | iseq $count, 4, "/.(*SKIP)/"; |
1209 | $_ = 'aaabaaab'; |
1210 | $count = 0; |
1211 | our @res = (); |
1212 | 1 while /(a+b?)(*SKIP)(?{$count++; push @res,$1})(*FAIL)/g; |
1213 | iseq $count, 2, "Expect 2 with (*SKIP)"; |
1214 | iseq "@res", "aaab aaab", "Adjacent (*SKIP) works as expected"; |
1215 | } |
1216 | |
1217 | |
1218 | { # Test the (*SKIP) pattern |
1219 | our $count = 0; |
1220 | 'aaab' =~ /a+b?(*MARK:foo)(*SKIP)(?{$count++})(*FAIL)/; |
1221 | iseq $count, 1, "Expect 1 with (*SKIP)"; |
1222 | local $_ = 'aaab'; |
1223 | $count = 0; |
1224 | 1 while /.(*MARK:foo)(*SKIP)(?{$count++})(*FAIL)/g; |
1225 | iseq $count, 4, "/.(*SKIP)/"; |
1226 | $_ = 'aaabaaab'; |
1227 | $count = 0; |
1228 | our @res = (); |
1229 | 1 while /(a+b?)(*MARK:foo)(*SKIP)(?{$count++; push @res,$1})(*FAIL)/g; |
1230 | iseq $count, 2, "Expect 2 with (*SKIP)"; |
1231 | iseq "@res", "aaab aaab", "Adjacent (*SKIP) works as expected"; |
1232 | } |
1233 | |
1234 | |
1235 | { # Test the (*SKIP) pattern |
1236 | our $count = 0; |
1237 | 'aaab' =~ /a*(*MARK:a)b?(*MARK:b)(*SKIP:a)(?{$count++})(*FAIL)/; |
1238 | iseq $count, 3, "Expect 3 with *MARK:a)b?(*MARK:b)(*SKIP:a)"; |
1239 | local $_ = 'aaabaaab'; |
1240 | $count = 0; |
1241 | our @res = (); |
1242 | 1 while |
1243 | /(a*(*MARK:a)b?)(*MARK:x)(*SKIP:a)(?{$count++; push @res,$1})(*FAIL)/g; |
1244 | iseq $count, 5, "Expect 5 with (*MARK:a)b?)(*MARK:x)(*SKIP:a)"; |
1245 | iseq "@res", "aaab b aaab b ", |
1246 | "Adjacent (*MARK:a)b?)(*MARK:x)(*SKIP:a) works as expected"; |
1247 | } |
1248 | |
1249 | |
1250 | { # Test the (*COMMIT) pattern |
1251 | our $count = 0; |
1252 | 'aaabaaab' =~ /a+b?(*COMMIT)(?{$count++})(*FAIL)/; |
1253 | iseq $count, 1, "Expect 1 with (*COMMIT)"; |
1254 | local $_ = 'aaab'; |
1255 | $count = 0; |
1256 | 1 while /.(*COMMIT)(?{$count++})(*FAIL)/g; |
1257 | iseq $count, 1, "/.(*COMMIT)/"; |
1258 | $_ = 'aaabaaab'; |
1259 | $count = 0; |
1260 | our @res = (); |
1261 | 1 while /(a+b?)(*COMMIT)(?{$count++; push @res,$1})(*FAIL)/g; |
1262 | iseq $count, 1, "Expect 1 with (*COMMIT)"; |
1263 | iseq "@res", "aaab", "Adjacent (*COMMIT) works as expected"; |
1264 | } |
1265 | |
1266 | |
1267 | { |
1268 | # Test named commits and the $REGERROR var |
1269 | our $REGERROR; |
1270 | for my $name ('', ':foo') { |
1271 | for my $pat ("(*PRUNE$name)", |
1272 | ($name ? "(*MARK$name)" : "") . "(*SKIP$name)", |
0f289c68 |
1273 | "(*COMMIT$name)") { |
e425a60b |
1274 | for my $suffix ('(*FAIL)', '') { |
1275 | 'aaaab' =~ /a+b$pat$suffix/; |
1276 | iseq $REGERROR, |
1277 | ($suffix ? ($name ? 'foo' : "1") : ""), |
1278 | "Test $pat and \$REGERROR $suffix"; |
1279 | } |
1280 | } |
1281 | } |
1282 | } |
1283 | |
1284 | |
1285 | { |
1286 | # Test named commits and the $REGERROR var |
1287 | package Fnorble; |
1288 | our $REGERROR; |
1289 | for my $name ('', ':foo') { |
1290 | for my $pat ("(*PRUNE$name)", |
1291 | ($name ? "(*MARK$name)" : "") . "(*SKIP$name)", |
0f289c68 |
1292 | "(*COMMIT$name)") { |
e425a60b |
1293 | for my $suffix ('(*FAIL)','') { |
1294 | 'aaaab' =~ /a+b$pat$suffix/; |
1295 | ::iseq $REGERROR, |
1296 | ($suffix ? ($name ? 'foo' : "1") : ""), |
1297 | "Test $pat and \$REGERROR $suffix"; |
1298 | } |
1299 | } |
0f289c68 |
1300 | } |
1301 | } |
e425a60b |
1302 | |
1303 | |
1304 | { |
1305 | # Test named commits and the $REGERROR var |
1306 | local $Message = '$REGERROR'; |
1307 | our $REGERROR; |
1308 | for my $word (qw (bar baz bop)) { |
1309 | $REGERROR = ""; |
1310 | "aaaaa$word" =~ |
1311 | /a+(?:bar(*COMMIT:bar)|baz(*COMMIT:baz)|bop(*COMMIT:bop))(*FAIL)/; |
1312 | iseq $REGERROR, $word; |
0f289c68 |
1313 | } |
e425a60b |
1314 | } |
1315 | |
e425a60b |
1316 | { |
1317 | #Mindnumbingly simple test of (*THEN) |
1318 | for ("ABC","BAX") { |
1319 | ok /A (*THEN) X | B (*THEN) C/x, "Simple (*THEN) test"; |
1320 | } |
1321 | } |
1322 | |
1323 | |
1324 | { |
1325 | local $Message = "Relative Recursion"; |
1326 | my $parens = qr/(\((?:[^()]++|(?-1))*+\))/; |
1327 | local $_ = 'foo((2*3)+4-3) + bar(2*(3+4)-1*(2-3))'; |
1328 | my ($all, $one, $two) = ('', '', ''); |
9d45b377 |
1329 | ok m/foo $parens \s* \+ \s* bar $parens/x; |
e425a60b |
1330 | iseq $1, '((2*3)+4-3)'; |
1331 | iseq $2, '(2*(3+4)-1*(2-3))'; |
1332 | iseq $&, 'foo((2*3)+4-3) + bar(2*(3+4)-1*(2-3))'; |
1333 | iseq $&, $_; |
1334 | } |
1335 | |
1336 | { |
1337 | my $spaces=" "; |
1338 | local $_ = join 'bar', $spaces, $spaces; |
1339 | our $count = 0; |
1340 | s/(?>\s+bar)(?{$count++})//g; |
1341 | iseq $_, $spaces, "SUSPEND final string"; |
1342 | iseq $count, 1, "Optimiser should have prevented more than one match"; |
1343 | } |
1344 | |
e425a60b |
1345 | |
1346 | { |
1347 | # From Message-ID: <877ixs6oa6.fsf@k75.linux.bogus> |
1348 | my $dow_name = "nada"; |
1349 | my $parser = "(\$dow_name) = \$time_string =~ /(D\x{e9}\\ " . |
1350 | "C\x{e9}adaoin|D\x{e9}\\ Sathairn|\\w+|\x{100})/"; |
1351 | my $time_string = "D\x{e9} C\x{e9}adaoin"; |
1352 | eval $parser; |
1353 | ok !$@, "Test Eval worked"; |
1354 | iseq $dow_name, $time_string, "UTF-8 trie common prefix extraction"; |
1355 | } |
1356 | |
1357 | |
1358 | { |
1359 | my $v; |
1360 | ($v = 'bar') =~ /(\w+)/g; |
1361 | $v = 'foo'; |
1362 | iseq "$1", 'bar', '$1 is safe after /g - may fail due ' . |
1363 | 'to specialized config in pp_hot.c' |
1364 | } |
1365 | |
1366 | |
1367 | { |
1368 | local $Message = "http://nntp.perl.org/group/perl.perl5.porters/118663"; |
1369 | my $qr_barR1 = qr/(bar)\g-1/; |
1370 | ok "foobarbarxyz" =~ $qr_barR1; |
1371 | ok "foobarbarxyz" =~ qr/foo${qr_barR1}xyz/; |
1372 | ok "foobarbarxyz" =~ qr/(foo)${qr_barR1}xyz/; |
1373 | ok "foobarbarxyz" =~ qr/(foo)(bar)\g{-1}xyz/; |
1374 | ok "foobarbarxyz" =~ qr/(foo${qr_barR1})xyz/; |
1375 | ok "foobarbarxyz" =~ qr/(foo(bar)\g{-1})xyz/; |
0f289c68 |
1376 | } |
e425a60b |
1377 | |
e425a60b |
1378 | { |
1379 | local $Message = '$REGMARK'; |
1380 | our @r = (); |
1381 | our ($REGMARK, $REGERROR); |
1382 | ok 'foofoo' =~ /foo (*MARK:foo) (?{push @r,$REGMARK}) /x; |
0f289c68 |
1383 | iseq "@r","foo"; |
e425a60b |
1384 | iseq $REGMARK, "foo"; |
1385 | ok 'foofoo' !~ /foo (*MARK:foo) (*FAIL) /x; |
1386 | ok !$REGMARK; |
1387 | iseq $REGERROR, 'foo'; |
1388 | } |
1389 | |
1390 | |
1391 | { |
1392 | local $Message = '\K test'; |
1393 | my $x; |
1394 | $x = "abc.def.ghi.jkl"; |
1395 | $x =~ s/.*\K\..*//; |
1396 | iseq $x, "abc.def.ghi"; |
0f289c68 |
1397 | |
e425a60b |
1398 | $x = "one two three four"; |
1399 | $x =~ s/o+ \Kthree//g; |
1400 | iseq $x, "one two four"; |
0f289c68 |
1401 | |
e425a60b |
1402 | $x = "abcde"; |
1403 | $x =~ s/(.)\K/$1/g; |
1404 | iseq $x, "aabbccddee"; |
1405 | } |
1406 | |
1407 | |
1408 | { |
1409 | sub kt { |
1410 | return '4' if $_[0] eq '09028623'; |
1411 | } |
1412 | # Nested EVAL using PL_curpm (via $1 or friends) |
1413 | my $re; |
1414 | our $grabit = qr/ ([0-6][0-9]{7}) (??{ kt $1 }) [890] /x; |
1415 | $re = qr/^ ( (??{ $grabit }) ) $ /x; |
1416 | my @res = '0902862349' =~ $re; |
1417 | iseq join ("-", @res), "0902862349", |
1418 | 'PL_curpm is set properly on nested eval'; |
1419 | |
1420 | our $qr = qr/ (o) (??{ $1 }) /x; |
1421 | ok 'boob'=~/( b (??{ $qr }) b )/x && 1, "PL_curpm, nested eval"; |
1422 | } |
1423 | |
1424 | |
1425 | { |
1426 | use charnames ":full"; |
1427 | ok "\N{ROMAN NUMERAL ONE}" =~ /\p{Alphabetic}/, "I =~ Alphabetic"; |
1428 | ok "\N{ROMAN NUMERAL ONE}" =~ /\p{Uppercase}/, "I =~ Uppercase"; |
1429 | ok "\N{ROMAN NUMERAL ONE}" !~ /\p{Lowercase}/, "I !~ Lowercase"; |
1430 | ok "\N{ROMAN NUMERAL ONE}" =~ /\p{IDStart}/, "I =~ ID_Start"; |
1431 | ok "\N{ROMAN NUMERAL ONE}" =~ /\p{IDContinue}/, "I =~ ID_Continue"; |
1432 | ok "\N{SMALL ROMAN NUMERAL ONE}" =~ /\p{Alphabetic}/, "i =~ Alphabetic"; |
1433 | ok "\N{SMALL ROMAN NUMERAL ONE}" !~ /\p{Uppercase}/, "i !~ Uppercase"; |
1434 | ok "\N{SMALL ROMAN NUMERAL ONE}" =~ /\p{Lowercase}/, "i =~ Lowercase"; |
1435 | ok "\N{SMALL ROMAN NUMERAL ONE}" =~ /\p{IDStart}/, "i =~ ID_Start"; |
1436 | ok "\N{SMALL ROMAN NUMERAL ONE}" =~ /\p{IDContinue}/, "i =~ ID_Continue" |
1437 | } |
1438 | |
1439 | |
1440 | { |
1441 | # requirement of Unicode Technical Standard #18, 1.7 Code Points |
1442 | # cf. http://www.unicode.org/reports/tr18/#Supplementary_Characters |
1443 | for my $u (0x7FF, 0x800, 0xFFFF, 0x10000) { |
1444 | no warnings 'utf8'; # oops |
1445 | my $c = chr $u; |
1446 | my $x = sprintf '%04X', $u; |
1447 | ok "A${c}B" =~ /A[\0-\x{10000}]B/, "Unicode range - $x"; |
1448 | } |
1449 | } |
1450 | |
1451 | |
1452 | { |
1453 | my $res=""; |
1454 | |
1455 | if ('1' =~ /(?|(?<digit>1)|(?<digit>2))/) { |
1456 | $res = "@{$- {digit}}"; |
1457 | } |
1458 | iseq $res, "1", |
1459 | "Check that (?|...) doesnt cause dupe entries in the names array"; |
0f289c68 |
1460 | |
e425a60b |
1461 | $res = ""; |
1462 | if ('11' =~ /(?|(?<digit>1)|(?<digit>2))(?&digit)/) { |
1463 | $res = "@{$- {digit}}"; |
1464 | } |
1465 | iseq $res, "1", "Check that (?&..) to a buffer inside " . |
1466 | "a (?|...) goes to the leftmost"; |
1467 | } |
1468 | |
1469 | |
1470 | { |
1471 | use warnings; |
1472 | local $Message = "ASCII pattern that really is UTF-8"; |
1473 | my @w; |
1474 | local $SIG {__WARN__} = sub {push @w, "@_"}; |
0f289c68 |
1475 | my $c = qq (\x{DF}); |
e425a60b |
1476 | ok $c =~ /${c}|\x{100}/; |
1477 | ok @w == 0; |
0f289c68 |
1478 | } |
e425a60b |
1479 | |
1480 | |
1481 | { |
1482 | local $Message = "Corruption of match results of qr// across scopes"; |
1483 | my $qr = qr/(fo+)(ba+r)/; |
1484 | 'foobar' =~ /$qr/; |
1485 | iseq "$1$2", "foobar"; |
1486 | { |
1487 | 'foooooobaaaaar' =~ /$qr/; |
0f289c68 |
1488 | iseq "$1$2", 'foooooobaaaaar'; |
e425a60b |
1489 | } |
1490 | iseq "$1$2", "foobar"; |
1491 | } |
1492 | |
1493 | |
1494 | { |
1495 | local $Message = "HORIZWS"; |
1496 | local $_ = "\t \r\n \n \t".chr(11)."\n"; |
1497 | s/\H/H/g; |
1498 | s/\h/h/g; |
1499 | iseq $_, "hhHHhHhhHH"; |
1500 | $_ = "\t \r\n \n \t" . chr (11) . "\n"; |
1501 | utf8::upgrade ($_); |
1502 | s/\H/H/g; |
1503 | s/\h/h/g; |
1504 | iseq $_, "hhHHhHhhHH"; |
0f289c68 |
1505 | } |
e425a60b |
1506 | |
1507 | |
1508 | { |
1509 | local $Message = "Various whitespace special patterns"; |
1510 | my @h = map {chr $_} 0x09, 0x20, 0xa0, 0x1680, 0x180e, 0x2000, |
1511 | 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, |
1512 | 0x2007, 0x2008, 0x2009, 0x200a, 0x202f, 0x205f, |
1513 | 0x3000; |
1514 | my @v = map {chr $_} 0x0a, 0x0b, 0x0c, 0x0d, 0x85, 0x2028, |
1515 | 0x2029; |
1516 | my @lb = ("\x0D\x0A", map {chr $_} 0x0A .. 0x0D, 0x85, 0x2028, 0x2029); |
1517 | foreach my $t ([\@h, qr/\h/, qr/\h+/], |
1518 | [\@v, qr/\v/, qr/\v+/], |
1519 | [\@lb, qr/\R/, qr/\R+/],) { |
1520 | my $ary = shift @$t; |
1521 | foreach my $pat (@$t) { |
1522 | foreach my $str (@$ary) { |
1523 | ok $str =~ /($pat)/, $pat; |
1524 | iseq $1, $str, $pat; |
1525 | utf8::upgrade ($str); |
1526 | ok $str =~ /($pat)/, "Upgraded string - $pat"; |
1527 | iseq $1, $str, "Upgraded string - $pat"; |
1528 | } |
1529 | } |
1530 | } |
1531 | } |
1532 | |
1533 | |
1534 | { |
1535 | local $Message = "Check that \\xDF match properly in its various forms"; |
1536 | # Test that \xDF matches properly. this is pretty hacky stuff, |
1537 | # but its actually needed. The malarky with '-' is to prevent |
1538 | # compilation caching from playing any role in the test. |
1539 | my @df = (chr (0xDF), '-', chr (0xDF)); |
1540 | utf8::upgrade ($df [2]); |
1541 | my @strs = ('ss', 'sS', 'Ss', 'SS', chr (0xDF)); |
1542 | my @ss = map {("$_", "$_")} @strs; |
1543 | utf8::upgrade ($ss [$_ * 2 + 1]) for 0 .. $#strs; |
1544 | |
1545 | for my $ssi (0 .. $#ss) { |
1546 | for my $dfi (0 .. $#df) { |
1547 | my $pat = $df [$dfi]; |
1548 | my $str = $ss [$ssi]; |
1549 | my $utf_df = ($dfi > 1) ? 'utf8' : ''; |
1550 | my $utf_ss = ($ssi % 2) ? 'utf8' : ''; |
1551 | (my $sstr = $str) =~ s/\xDF/\\xDF/; |
1552 | |
1553 | if ($utf_df || $utf_ss || length ($ss [$ssi]) == 1) { |
1554 | my $ret = $str =~ /$pat/i; |
1555 | next if $pat eq '-'; |
1556 | ok $ret, "\"$sstr\" =~ /\\xDF/i " . |
1557 | "(str is @{[$utf_ss||'latin']}, pat is " . |
1558 | "@{[$utf_df||'latin']})"; |
1559 | } |
1560 | else { |
1561 | my $ret = $str !~ /$pat/i; |
1562 | next if $pat eq '-'; |
1563 | ok $ret, "\"$sstr\" !~ /\\xDF/i " . |
1564 | "(str is @{[$utf_ss||'latin']}, pat is " . |
1565 | "@{[$utf_df||'latin']})"; |
1566 | } |
1567 | } |
1568 | } |
1569 | } |
1570 | |
1571 | |
1572 | { |
1573 | local $Message = "BBC(Bleadperl Breaks CPAN) Today: String::Multibyte"; |
1574 | my $re = qr/(?:[\x00-\xFF]{4})/; |
1575 | my $hyp = "\0\0\0-"; |
1576 | my $esc = "\0\0\0\\"; |
1577 | |
1578 | my $str = "$esc$hyp$hyp$esc$esc"; |
1579 | my @a = ($str =~ /\G(?:\Q$esc$esc\E|\Q$esc$hyp\E|$re)/g); |
1580 | |
1581 | iseq @a,3; |
1582 | local $" = "="; |
1583 | iseq "@a","$esc$hyp=$hyp=$esc$esc"; |
1584 | } |
1585 | |
1586 | |
1587 | { |
1588 | # Test for keys in %+ and %- |
1589 | local $Message = 'Test keys in %+ and %-'; |
1590 | no warnings 'uninitialized'; |
1591 | my $_ = "abcdef"; |
1592 | /(?<foo>a)|(?<foo>b)/; |
1593 | iseq ((join ",", sort keys %+), "foo"); |
1594 | iseq ((join ",", sort keys %-), "foo"); |
1595 | iseq ((join ",", sort values %+), "a"); |
1596 | iseq ((join ",", sort map "@$_", values %-), "a "); |
1597 | /(?<bar>a)(?<bar>b)(?<quux>.)/; |
1598 | iseq ((join ",", sort keys %+), "bar,quux"); |
1599 | iseq ((join ",", sort keys %-), "bar,quux"); |
1600 | iseq ((join ",", sort values %+), "a,c"); # leftmost |
1601 | iseq ((join ",", sort map "@$_", values %-), "a b,c"); |
1602 | /(?<un>a)(?<deux>c)?/; # second buffer won't capture |
1603 | iseq ((join ",", sort keys %+), "un"); |
1604 | iseq ((join ",", sort keys %-), "deux,un"); |
1605 | iseq ((join ",", sort values %+), "a"); |
1606 | iseq ((join ",", sort map "@$_", values %-), ",a"); |
1607 | } |
1608 | |
1609 | |
1610 | { |
1611 | # length() on captures, the numbered ones end up in Perl_magic_len |
1612 | my $_ = "aoeu \xe6var ook"; |
1613 | /^ \w+ \s (?<eek>\S+)/x; |
1614 | |
1615 | iseq length ($`), 0, q[length $`]; |
1616 | iseq length ($'), 4, q[length $']; |
1617 | iseq length ($&), 9, q[length $&]; |
1618 | iseq length ($1), 4, q[length $1]; |
1619 | iseq length ($+{eek}), 4, q[length $+{eek} == length $1]; |
1620 | } |
1621 | |
1622 | |
1623 | { |
1624 | my $ok = -1; |
1625 | |
1626 | $ok = exists ($-{x}) ? 1 : 0 if 'bar' =~ /(?<x>foo)|bar/; |
1627 | iseq $ok, 1, '$-{x} exists after "bar"=~/(?<x>foo)|bar/'; |
1628 | iseq scalar (%+), 0, 'scalar %+ == 0 after "bar"=~/(?<x>foo)|bar/'; |
1629 | iseq scalar (%-), 1, 'scalar %- == 1 after "bar"=~/(?<x>foo)|bar/'; |
1630 | |
1631 | $ok = -1; |
1632 | $ok = exists ($+{x}) ? 1 : 0 if 'bar' =~ /(?<x>foo)|bar/; |
1633 | iseq $ok, 0, '$+{x} not exists after "bar"=~/(?<x>foo)|bar/'; |
1634 | iseq scalar (%+), 0, 'scalar %+ == 0 after "bar"=~/(?<x>foo)|bar/'; |
1635 | iseq scalar (%-), 1, 'scalar %- == 1 after "bar"=~/(?<x>foo)|bar/'; |
1636 | |
1637 | $ok = -1; |
1638 | $ok = exists ($-{x}) ? 1 : 0 if 'foo' =~ /(?<x>foo)|bar/; |
1639 | iseq $ok, 1, '$-{x} exists after "foo"=~/(?<x>foo)|bar/'; |
1640 | iseq scalar (%+), 1, 'scalar %+ == 1 after "foo"=~/(?<x>foo)|bar/'; |
1641 | iseq scalar (%-), 1, 'scalar %- == 1 after "foo"=~/(?<x>foo)|bar/'; |
1642 | |
1643 | $ok = -1; |
1644 | $ok = exists ($+{x}) ? 1 : 0 if 'foo'=~/(?<x>foo)|bar/; |
1645 | iseq $ok, 1, '$+{x} exists after "foo"=~/(?<x>foo)|bar/'; |
1646 | } |
1647 | |
1648 | |
1649 | { |
1650 | local $_; |
1651 | ($_ = 'abc') =~ /(abc)/g; |
0f289c68 |
1652 | $_ = '123'; |
e425a60b |
1653 | iseq "$1", 'abc', "/g leads to unsafe match vars: $1"; |
1654 | } |
1655 | |
1656 | |
1657 | { |
1658 | local $Message = 'Message-ID: <20070818091501.7eff4831@r2d2>'; |
1659 | my $str = ""; |
1660 | for (0 .. 5) { |
1661 | my @x; |
1662 | $str .= "@x"; # this should ALWAYS be the empty string |
1663 | 'a' =~ /(a|)/; |
1664 | push @x, 1; |
1665 | } |
1666 | iseq length ($str), 0, "Trie scope error, string should be empty"; |
1667 | $str = ""; |
1668 | my @foo = ('a') x 5; |
1669 | for (@foo) { |
1670 | my @bar; |
1671 | $str .= "@bar"; |
1672 | s/a|/push @bar, 1/e; |
1673 | } |
1674 | iseq length ($str), 0, "Trie scope error, string should be empty"; |
1675 | } |
1676 | |
1677 | |
1678 | { |
e425a60b |
1679 | # more TRIE/AHOCORASICK problems with mixed utf8 / latin-1 and case folding |
0f289c68 |
1680 | for my $chr (160 .. 255) { |
1681 | my $chr_byte = chr($chr); |
1682 | my $chr_utf8 = chr($chr); utf8::upgrade($chr_utf8); |
1683 | my $rx = qr{$chr_byte|X}i; |
1684 | ok($chr_utf8 =~ $rx, "utf8/latin, codepoint $chr"); |
1685 | } |
e425a60b |
1686 | } |
1687 | |
1688 | { |
e425a60b |
1689 | our $a = 3; "" =~ /(??{ $a })/; |
1690 | our $b = $a; |
1691 | iseq $b, $a, "Copy of scalar used for postponed subexpression"; |
1692 | } |
1693 | |
1694 | |
1695 | { |
e425a60b |
1696 | our @ctl_n = (); |
1697 | our @plus = (); |
1698 | our $nested_tags; |
1699 | $nested_tags = qr{ |
1700 | < |
1701 | (\w+) |
1702 | (?{ |
1703 | push @ctl_n,$^N; |
1704 | push @plus,$+; |
1705 | }) |
1706 | > |
1707 | (??{$nested_tags})* |
1708 | </\s* \w+ \s*> |
1709 | }x; |
1710 | |
1711 | my $match = '<bla><blubb></blubb></bla>' =~ m/^$nested_tags$/; |
1712 | ok $match, 'nested construct matches'; |
1713 | iseq "@ctl_n", "bla blubb", '$^N inside of (?{}) works as expected'; |
1714 | iseq "@plus", "bla blubb", '$+ inside of (?{}) works as expected'; |
1715 | } |
1716 | |
1717 | |
e425a60b |
1718 | SKIP: { |
1719 | # XXX: This set of tests is essentially broken, POSIX character classes |
0f289c68 |
1720 | # should not have differing definitions under Unicode. |
e425a60b |
1721 | # There are property names for that. |
1722 | skip "Tests assume ASCII", 4 unless $IS_ASCII; |
1723 | |
1724 | my @notIsPunct = grep {/[[:punct:]]/ and not /\p{IsPunct}/} |
1725 | map {chr} 0x20 .. 0x7f; |
1726 | iseq join ('', @notIsPunct), '$+<=>^`|~', |
1727 | '[:punct:] disagress with IsPunct on Symbols'; |
1728 | |
1729 | my @isPrint = grep {not /[[:print:]]/ and /\p{IsPrint}/} |
1730 | map {chr} 0 .. 0x1f, 0x7f .. 0x9f; |
99870f4d |
1731 | iseq join ('', @isPrint), "", |
1732 | 'IsPrint agrees with [:print:] on control characters'; |
e425a60b |
1733 | |
1734 | my @isPunct = grep {/[[:punct:]]/ != /\p{IsPunct}/} |
1735 | map {chr} 0x80 .. 0xff; |
0f289c68 |
1736 | iseq join ('', @isPunct), "\xa1\xab\xb7\xbb\xbf", # ¡ « · » ¿ |
e425a60b |
1737 | 'IsPunct disagrees with [:punct:] outside ASCII'; |
1738 | |
1739 | my @isPunctLatin1 = eval q { |
1740 | use encoding 'latin1'; |
1741 | grep {/[[:punct:]]/ != /\p{IsPunct}/} map {chr} 0x80 .. 0xff; |
1742 | }; |
1743 | skip "Eval failed ($@)", 1 if $@; |
1744 | skip "PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS set to 0", 1 |
1745 | if !$ENV{PERL_TEST_LEGACY_POSIX_CC}; |
0f289c68 |
1746 | iseq join ('', @isPunctLatin1), '', |
e425a60b |
1747 | 'IsPunct agrees with [:punct:] with explicit Latin1'; |
0f289c68 |
1748 | } |
e425a60b |
1749 | |
e425a60b |
1750 | # |
1751 | # Keep the following tests last -- they may crash perl |
1752 | # |
1753 | print "# Tests that follow may crash perl\n"; |
e425a60b |
1754 | { |
1755 | eval '/\k/'; |
1756 | ok $@ =~ /\QSequence \k... not terminated in regex;\E/, |
1757 | 'Lone \k not allowed'; |
1758 | } |
1759 | |
e425a60b |
1760 | { |
1761 | local $Message = "Substitution with lookahead (possible segv)"; |
1762 | $_ = "ns1ns1ns1"; |
1763 | s/ns(?=\d)/ns_/g; |
1764 | iseq $_, "ns_1ns_1ns_1"; |
1765 | $_ = "ns1"; |
1766 | s/ns(?=\d)/ns_/; |
1767 | iseq $_, "ns_1"; |
1768 | $_ = "123"; |
1769 | s/(?=\d+)|(?<=\d)/!Bang!/g; |
1770 | iseq $_, "!Bang!1!Bang!2!Bang!3!Bang!"; |
1771 | } |
1772 | |
6182169b |
1773 | { |
1774 | # Earlier versions of Perl said this was fatal. |
1775 | local $Message = "U+0FFFF shouldn't crash the regex engine"; |
1776 | no warnings 'utf8'; |
1777 | my $a = eval "chr(65535)"; |
1778 | use warnings; |
1779 | my $warning_message; |
1780 | local $SIG{__WARN__} = sub { $warning_message = $_[0] }; |
1781 | eval $a =~ /[a-z]/; |
1782 | ok(1); # If it didn't crash, it worked. |
1783 | } |
e425a60b |
1784 | } # End of sub run_tests |
1785 | |
1786 | 1; |