Commit | Line | Data |
e425a60b |
1 | #!./perl |
2 | # |
3 | # This is a home for regular expression tests that don't fit into |
4 | # the format supported by re/regexp.t. If you want to add a test |
5 | # that does fit that format, add it to re/re_tests, not here. |
6 | |
7 | use strict; |
8 | use warnings; |
9 | use 5.010; |
10 | |
11 | |
12 | sub run_tests; |
13 | |
14 | $| = 1; |
15 | |
e425a60b |
16 | |
17 | BEGIN { |
18 | chdir 't' if -d 't'; |
9d45b377 |
19 | @INC = ('../lib','.'); |
20 | do "re/ReTest.pl" or die $@; |
e425a60b |
21 | } |
e425a60b |
22 | |
e425a60b |
23 | |
0f289c68 |
24 | plan tests => 2510; # Update this when adding/deleting tests. |
e425a60b |
25 | |
9d45b377 |
26 | run_tests() unless caller; |
e425a60b |
27 | |
28 | # |
29 | # Tests start here. |
30 | # |
31 | sub run_tests { |
32 | |
e425a60b |
33 | |
34 | { |
35 | local $BugId = '20000731.001'; |
36 | ok "A \x{263a} B z C" =~ /A . B (??{ "z" }) C/, |
37 | "Match UTF-8 char in presense of (??{ })"; |
38 | } |
39 | |
40 | |
41 | { |
42 | local $BugId = '20001021.005'; |
43 | no warnings 'uninitialized'; |
44 | ok undef =~ /^([^\/]*)(.*)$/, "Used to cause a SEGV"; |
45 | } |
46 | |
e425a60b |
47 | { |
48 | local $Message = 'bug id 20001008.001'; |
49 | |
50 | my @x = ("stra\337e 138", "stra\337e 138"); |
51 | for (@x) { |
52 | ok s/(\d+)\s*([\w\-]+)/$1 . uc $2/e; |
53 | ok my ($latin) = /^(.+)(?:\s+\d)/; |
54 | iseq $latin, "stra\337e"; |
55 | ok $latin =~ s/stra\337e/straße/; |
56 | # |
57 | # Previous code follows, but outcommented - there were no tests. |
58 | # |
59 | # $latin =~ s/stra\337e/straße/; # \303\237 after the 2nd a |
60 | # use utf8; # needed for the raw UTF-8 |
61 | # $latin =~ s!(s)tr(?:aß|s+e)!$1tr.!; # \303\237 after the a |
62 | } |
63 | } |
64 | |
65 | |
66 | { |
e425a60b |
67 | local $BugId = '20001028.003'; |
68 | |
69 | # Fist half of the bug. |
70 | local $Message = 'HEBREW ACCENT QADMA matched by .*'; |
71 | my $X = chr (1448); |
72 | ok my ($Y) = $X =~ /(.*)/; |
73 | iseq $Y, v1448; |
74 | iseq length ($Y), 1; |
75 | |
76 | # Second half of the bug. |
77 | $Message = 'HEBREW ACCENT QADMA in replacement'; |
78 | $X = ''; |
79 | $X =~ s/^/chr(1488)/e; |
80 | iseq length $X, 1; |
81 | iseq ord ($X), 1488; |
82 | } |
83 | |
84 | |
85 | { |
86 | local $BugId = '20001108.001'; |
87 | local $Message = 'Repeated s///'; |
88 | my $X = "Szab\x{f3},Bal\x{e1}zs"; |
89 | my $Y = $X; |
90 | $Y =~ s/(B)/$1/ for 0 .. 3; |
91 | iseq $Y, $X; |
92 | iseq $X, "Szab\x{f3},Bal\x{e1}zs"; |
93 | } |
94 | |
95 | |
96 | { |
97 | local $BugId = '20000517.001'; |
98 | local $Message = 's/// on UTF-8 string'; |
99 | my $x = "\x{100}A"; |
100 | $x =~ s/A/B/; |
101 | iseq $x, "\x{100}B"; |
102 | iseq length $x, 2; |
103 | } |
104 | |
105 | |
106 | { |
107 | local $BugId = '20001230.002'; |
108 | local $Message = '\C and É'; |
109 | ok "École" =~ /^\C\C(.)/ && $1 eq 'c'; |
110 | ok "École" =~ /^\C\C(c)/; |
111 | } |
112 | |
113 | |
e425a60b |
114 | { |
115 | # The original bug report had 'no utf8' here but that was irrelevant. |
116 | local $BugId = '20010306.008'; |
117 | local $Message = "Don't dump core"; |
118 | my $a = "a\x{1234}"; |
119 | ok $a =~ m/\w/; # used to core dump. |
120 | } |
121 | |
122 | |
123 | { |
124 | local $BugId = '20010410.006'; |
125 | local $Message = '/g in scalar context'; |
126 | for my $rx ('/(.*?)\{(.*?)\}/csg', |
127 | '/(.*?)\{(.*?)\}/cg', |
128 | '/(.*?)\{(.*?)\}/sg', |
129 | '/(.*?)\{(.*?)\}/g', |
130 | '/(.+?)\{(.+?)\}/csg',) { |
131 | my $i = 0; |
132 | my $input = "a{b}c{d}"; |
133 | eval <<" --"; |
134 | while (eval \$input =~ $rx) { |
135 | \$i ++; |
136 | } |
137 | -- |
138 | iseq $i, 2; |
139 | } |
140 | } |
141 | |
e425a60b |
142 | { |
143 | local $BugId = "20010619.003"; |
144 | # Amazingly vertical tabulator is the same in ASCII and EBCDIC. |
145 | for ("\n", "\t", "\014", "\r") { |
146 | ok !/[[:print:]]/, "'$_' not in [[:print:]]"; |
147 | } |
148 | for (" ") { |
149 | ok /[[:print:]]/, "'$_' in [[:print:]]"; |
150 | } |
151 | } |
152 | |
153 | |
e425a60b |
154 | |
155 | { |
156 | # [ID 20010814.004] pos() doesn't work when using =~m// in list context |
157 | local $BugId = '20010814.004'; |
158 | $_ = "ababacadaea"; |
159 | my $a = join ":", /b./gc; |
160 | my $b = join ":", /a./gc; |
161 | my $c = pos; |
162 | iseq "$a $b $c", 'ba:ba ad:ae 10', "pos() works with () = m//"; |
163 | } |
164 | |
165 | |
166 | { |
167 | # [ID 20010407.006] matching utf8 return values from |
168 | # functions does not work |
169 | local $BugId = '20010407.006'; |
170 | local $Message = 'UTF-8 return values from functions'; |
171 | package ID_20010407_006; |
172 | sub x {"a\x{1234}"} |
173 | my $x = x; |
174 | my $y; |
175 | ::ok $x =~ /(..)/; |
176 | $y = $1; |
177 | ::ok length ($y) == 2 && $y eq $x; |
178 | ::ok x =~ /(..)/; |
179 | $y = $1; |
180 | ::ok length ($y) == 2 && $y eq $x; |
181 | } |
182 | |
e425a60b |
183 | { |
184 | # High bit bug -- japhy |
185 | my $x = "ab\200d"; |
186 | ok $x =~ /.*?\200/, "High bit fine"; |
187 | } |
188 | |
189 | |
190 | { |
e425a60b |
191 | local $Message = 'UTF-8 hash keys and /$/'; |
192 | # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters |
193 | # /2002-01/msg01327.html |
194 | |
195 | my $u = "a\x{100}"; |
196 | my $v = substr ($u, 0, 1); |
197 | my $w = substr ($u, 1, 1); |
198 | my %u = ($u => $u, $v => $v, $w => $w); |
199 | for (keys %u) { |
200 | my $m1 = /^\w*$/ ? 1 : 0; |
201 | my $m2 = $u {$_} =~ /^\w*$/ ? 1 : 0; |
202 | iseq $m1, $m2; |
203 | } |
204 | } |
205 | |
206 | |
207 | { |
208 | local $BugId = "20020124.005"; |
209 | local $PatchId = "14795"; |
210 | local $Message = "s///eg"; |
211 | |
212 | for my $char ("a", "\x{df}", "\x{100}") { |
213 | my $x = "$char b $char"; |
214 | $x =~ s{($char)}{ |
215 | "c" =~ /c/; |
216 | "x"; |
217 | }ge; |
218 | iseq substr ($x, 0, 1), substr ($x, -1, 1); |
219 | } |
220 | } |
221 | |
222 | |
223 | { |
e425a60b |
224 | local $BugId = "20020412.005"; |
225 | local $Message = "Correct pmop flags checked when empty pattern"; |
226 | |
227 | # Requires reuse of last successful pattern. |
228 | my $num = 123; |
229 | $num =~ /\d/; |
230 | for (0 .. 1) { |
231 | my $match = ?? + 0; |
232 | ok $match != $_, $Message, |
233 | sprintf "'match one' %s on %s iteration" => |
234 | $match ? 'succeeded' : 'failed', |
235 | $_ ? 'second' : 'first'; |
236 | } |
237 | $num =~ /(\d)/; |
238 | my $result = join "" => $num =~ //g; |
239 | iseq $result, $num; |
240 | } |
241 | |
242 | |
243 | { |
244 | local $BugId = '20020630.002'; |
245 | local $Message = 'UTF-8 regex matches above 32k'; |
246 | for (['byte', "\x{ff}"], ['utf8', "\x{1ff}"]) { |
247 | my ($type, $char) = @$_; |
248 | for my $len (32000, 32768, 33000) { |
249 | my $s = $char . "f" x $len; |
250 | my $r = $s =~ /$char([f]*)/gc; |
251 | ok $r, $Message, "<$type x $len>"; |
252 | ok !$r || pos ($s) == $len + 1, $Message, |
253 | "<$type x $len>; pos = @{[pos $s]}"; |
254 | } |
255 | } |
256 | } |
257 | |
e425a60b |
258 | { |
259 | local $PatchId = '18179'; |
260 | my $s = "\x{100}" x 5; |
261 | my $ok = $s =~ /(\x{100}{4})/; |
262 | my ($ord, $len) = (ord $1, length $1); |
263 | ok $ok && $ord == 0x100 && $len == 4, "No panic: end_shift"; |
264 | } |
265 | |
266 | |
267 | { |
268 | local $BugId = '15763'; |
269 | our $a = "x\x{100}"; |
270 | chop $a; # Leaves the UTF-8 flag |
271 | $a .= "y"; # 1 byte before 'y'. |
272 | |
273 | ok $a =~ /^\C/, 'match one \C on 1-byte UTF-8'; |
274 | ok $a =~ /^\C{1}/, 'match \C{1}'; |
275 | |
276 | ok $a =~ /^\Cy/, 'match \Cy'; |
277 | ok $a =~ /^\C{1}y/, 'match \C{1}y'; |
278 | |
279 | ok $a !~ /^\C\Cy/, q {don't match two \Cy}; |
280 | ok $a !~ /^\C{2}y/, q {don't match \C{2}y}; |
281 | |
282 | $a = "\x{100}y"; # 2 bytes before "y" |
283 | |
284 | ok $a =~ /^\C/, 'match one \C on 2-byte UTF-8'; |
285 | ok $a =~ /^\C{1}/, 'match \C{1}'; |
286 | ok $a =~ /^\C\C/, 'match two \C'; |
287 | ok $a =~ /^\C{2}/, 'match \C{2}'; |
288 | |
289 | ok $a =~ /^\C\C\C/, 'match three \C on 2-byte UTF-8 and a byte'; |
290 | ok $a =~ /^\C{3}/, 'match \C{3}'; |
291 | |
292 | ok $a =~ /^\C\Cy/, 'match two \C'; |
293 | ok $a =~ /^\C{2}y/, 'match \C{2}'; |
294 | |
295 | ok $a !~ /^\C\C\Cy/, q {don't match three \Cy}; |
296 | ok $a !~ /^\C{2}\Cy/, q {don't match \C{2}\Cy}; |
297 | ok $a !~ /^\C{3}y/, q {don't match \C{3}y}; |
298 | |
299 | $a = "\x{1000}y"; # 3 bytes before "y" |
300 | |
301 | ok $a =~ /^\C/, 'match one \C on three-byte UTF-8'; |
302 | ok $a =~ /^\C{1}/, 'match \C{1}'; |
303 | ok $a =~ /^\C\C/, 'match two \C'; |
304 | ok $a =~ /^\C{2}/, 'match \C{2}'; |
305 | ok $a =~ /^\C\C\C/, 'match three \C'; |
306 | ok $a =~ /^\C{3}/, 'match \C{3}'; |
307 | |
308 | ok $a =~ /^\C\C\C\C/, 'match four \C on three-byte UTF-8 and a byte'; |
309 | ok $a =~ /^\C{4}/, 'match \C{4}'; |
310 | |
311 | ok $a =~ /^\C\C\Cy/, 'match three \Cy'; |
312 | ok $a =~ /^\C{3}y/, 'match \C{3}y'; |
313 | |
314 | ok $a !~ /^\C\C\C\Cy/, q {don't match four \Cy}; |
315 | ok $a !~ /^\C{4}y/, q {don't match \C{4}y}; |
316 | } |
317 | |
318 | |
319 | { |
e425a60b |
320 | local $BugId = '15397'; |
321 | local $Message = 'UTF-8 matching'; |
322 | ok "\x{100}" =~ /\x{100}/; |
323 | ok "\x{100}" =~ /(\x{100})/; |
324 | ok "\x{100}" =~ /(\x{100}){1}/; |
325 | ok "\x{100}\x{100}" =~ /(\x{100}){2}/; |
326 | ok "\x{100}\x{100}" =~ /(\x{100})(\x{100})/; |
327 | } |
328 | |
329 | |
330 | { |
331 | local $BugId = '7471'; |
332 | local $Message = 'Neither ()* nor ()*? sets $1 when matched 0 times'; |
333 | local $_ = 'CD'; |
334 | ok /(AB)*?CD/ && !defined $1; |
335 | ok /(AB)*CD/ && !defined $1; |
336 | } |
337 | |
338 | |
339 | { |
340 | local $BugId = '3547'; |
341 | local $Message = "Caching shouldn't prevent match"; |
342 | my $pattern = "^(b+?|a){1,2}c"; |
343 | ok "bac" =~ /$pattern/ && $1 eq 'a'; |
344 | ok "bbac" =~ /$pattern/ && $1 eq 'a'; |
345 | ok "bbbac" =~ /$pattern/ && $1 eq 'a'; |
346 | ok "bbbbac" =~ /$pattern/ && $1 eq 'a'; |
347 | } |
348 | |
349 | |
350 | |
351 | { |
352 | local $BugId = '18232'; |
353 | local $Message = '$1 should keep UTF-8 ness'; |
354 | ok "\x{100}" =~ /(.)/; |
355 | iseq $1, "\x{100}", '$1 is UTF-8'; |
356 | { 'a' =~ /./; } |
357 | iseq $1, "\x{100}", '$1 is still UTF-8'; |
358 | isneq $1, "\xC4\x80", '$1 is not non-UTF-8'; |
359 | } |
360 | |
361 | |
362 | { |
363 | local $BugId = '19767'; |
364 | local $Message = "Optimizer doesn't prematurely reject match"; |
365 | use utf8; |
366 | |
367 | my $attr = 'Name-1'; |
368 | my $NormalChar = qr /[\p{IsDigit}\p{IsLower}\p{IsUpper}]/; |
369 | my $NormalWord = qr /${NormalChar}+?/; |
370 | my $PredNameHyphen = qr /^${NormalWord}(\-${NormalWord})*?$/; |
371 | |
372 | $attr =~ /^$/; |
373 | ok $attr =~ $PredNameHyphen; # Original test. |
374 | |
375 | "a" =~ m/[b]/; |
376 | ok "0" =~ /\p{N}+\z/; # Variant. |
377 | } |
378 | |
379 | |
380 | { |
381 | local $BugId = '20683'; |
382 | local $Message = "(??{ }) doesn't return stale values"; |
383 | our $p = 1; |
384 | foreach (1, 2, 3, 4) { |
385 | $p ++ if /(??{ $p })/ |
386 | } |
387 | iseq $p, 5; |
388 | |
389 | { |
390 | package P; |
391 | $a = 1; |
392 | sub TIESCALAR {bless []} |
393 | sub FETCH {$a ++} |
394 | } |
395 | tie $p, "P"; |
396 | foreach (1, 2, 3, 4) { |
397 | /(??{ $p })/ |
398 | } |
399 | iseq $p, 5; |
400 | } |
401 | |
402 | |
403 | { |
404 | # Subject: Odd regexp behavior |
405 | # From: Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk> |
406 | # Date: Wed, 26 Feb 2003 16:53:12 +0000 |
407 | # Message-Id: <E18o4nw-0008Ly-00@wisbech.cl.cam.ac.uk> |
408 | # To: perl-unicode@perl.org |
409 | |
410 | local $Message = 'Markus Kuhn 2003-02-26'; |
411 | |
412 | my $x = "\x{2019}\nk"; |
413 | ok $x =~ s/(\S)\n(\S)/$1 $2/sg; |
414 | ok $x eq "\x{2019} k"; |
415 | |
416 | $x = "b\nk"; |
417 | ok $x =~ s/(\S)\n(\S)/$1 $2/sg; |
418 | ok $x eq "b k"; |
419 | |
420 | ok "\x{2019}" =~ /\S/; |
421 | } |
422 | |
423 | |
424 | { |
425 | local $BugId = '21411'; |
426 | local $Message = "(??{ .. }) in split doesn't corrupt its stack"; |
427 | our $i; |
428 | ok '-1-3-5-' eq join '', split /((??{$i++}))/, '-1-3-5-'; |
429 | no warnings 'syntax'; |
430 | @_ = split /(?{'WOW'})/, 'abc'; |
431 | local $" = "|"; |
432 | iseq "@_", "a|b|c"; |
433 | } |
434 | |
435 | |
436 | { |
437 | # XXX DAPM 13-Apr-06. Recursive split is still broken. It's only luck it |
438 | # hasn't been crashing. Disable this test until it is fixed properly. |
439 | # XXX also check what it returns rather than just doing ok(1,...) |
440 | # split /(?{ split "" })/, "abc"; |
441 | local $TODO = "Recursive split is still broken"; |
442 | ok 0, 'cache_re & "(?{": it dumps core in 5.6.1 & 5.8.0'; |
443 | } |
444 | |
445 | |
446 | { |
e425a60b |
447 | local $BugId = '17757'; |
448 | $_ = "code: 'x' { '...' }\n"; study; |
449 | my @x; push @x, $& while m/'[^\']*'/gx; |
450 | local $" = ":"; |
451 | iseq "@x", "'x':'...'", "Parse::RecDescent triggered infinite loop"; |
452 | } |
453 | |
454 | |
455 | { |
e425a60b |
456 | local $BugId = '22354'; |
457 | sub func ($) { |
458 | ok "a\nb" !~ /^b/, "Propagated modifier; $_[0]"; |
459 | ok "a\nb" =~ /^b/m, "Propagated modifier; $_[0] - with /m"; |
460 | } |
461 | func "standalone"; |
462 | $_ = "x"; s/x/func "in subst"/e; |
463 | $_ = "x"; s/x/func "in multiline subst"/em; |
464 | |
465 | # |
466 | # Next two give 'panic: malloc'. |
467 | # Outcommented, using two TODOs. |
468 | # |
469 | local $TODO = 'panic: malloc'; |
470 | local $Message = 'Postponed regexp and propaged modifier'; |
471 | # ok 0 for 1 .. 2; |
472 | SKIP: { |
473 | skip "panic: malloc", 2; |
474 | $_ = "x"; /x(?{func "in regexp"})/; |
475 | $_ = "x"; /x(?{func "in multiline regexp"})/m; |
476 | } |
477 | } |
478 | |
479 | |
480 | { |
481 | local $BugId = '19049'; |
482 | $_ = "abcdef\n"; |
483 | my @x = m/./g; |
484 | iseq "abcde", $`, 'Global match sets $`'; |
485 | } |
486 | |
487 | |
488 | { |
e425a60b |
489 | # [perl #23769] Unicode regex broken on simple example |
490 | # regrepeat() didn't handle UTF-8 EXACT case right. |
491 | local $BugId = '23769'; |
492 | my $Mess = 'regrepeat() handles UTF-8 EXACT case right'; |
493 | local $Message = $Mess; |
494 | |
495 | my $s = "\x{a0}\x{a0}\x{a0}\x{100}"; chop $s; |
496 | |
497 | ok $s =~ /\x{a0}/; |
498 | ok $s =~ /\x{a0}+/; |
499 | ok $s =~ /\x{a0}\x{a0}/; |
500 | |
501 | $Message = "$Mess (easy variant)"; |
502 | ok "aaa\x{100}" =~ /(a+)/; |
503 | iseq $1, "aaa"; |
504 | |
505 | $Message = "$Mess (easy invariant)"; |
506 | ok "aaa\x{100} " =~ /(a+?)/; |
507 | iseq $1, "a"; |
508 | |
509 | $Message = "$Mess (regrepeat variant)"; |
510 | ok "\xa0\xa0\xa0\x{100} " =~ /(\xa0+?)/; |
511 | iseq $1, "\xa0"; |
512 | |
513 | $Message = "$Mess (regrepeat invariant)"; |
514 | ok "\xa0\xa0\xa0\x{100}" =~ /(\xa0+)/; |
515 | iseq $1, "\xa0\xa0\xa0"; |
516 | |
517 | $Message = "$Mess (hard variant)"; |
518 | ok "\xa0\xa1\xa0\xa1\xa0\xa1\x{100}" =~ /((?:\xa0\xa1)+?)/; |
519 | iseq $1, "\xa0\xa1"; |
520 | |
521 | $Message = "$Mess (hard invariant)"; |
522 | ok "ababab\x{100} " =~ /((?:ab)+)/; |
523 | iseq $1, 'ababab'; |
524 | |
525 | ok "\xa0\xa1\xa0\xa1\xa0\xa1\x{100}" =~ /((?:\xa0\xa1)+)/; |
526 | iseq $1, "\xa0\xa1\xa0\xa1\xa0\xa1"; |
527 | |
528 | ok "ababab\x{100} " =~ /((?:ab)+?)/; |
529 | iseq $1, "ab"; |
530 | |
531 | $Message = "Don't match first byte of UTF-8 representation"; |
532 | ok "\xc4\xc4\xc4" !~ /(\x{100}+)/; |
533 | ok "\xc4\xc4\xc4" !~ /(\x{100}+?)/; |
534 | ok "\xc4\xc4\xc4" !~ /(\x{100}++)/; |
535 | } |
536 | |
537 | |
538 | { |
e425a60b |
539 | # perl panic: pp_match start/end pointers |
540 | local $BugId = '25269'; |
541 | iseq "a-bc", eval {my ($x, $y) = "bca" =~ /^(?=.*(a)).*(bc)/; "$x-$y"}, |
542 | 'Captures can move backwards in string'; |
543 | } |
544 | |
545 | |
546 | { |
547 | local $BugId = '27940'; # \cA not recognized in character classes |
548 | ok "a\cAb" =~ /\cA/, '\cA in pattern'; |
549 | ok "a\cAb" =~ /[\cA]/, '\cA in character class'; |
550 | ok "a\cAb" =~ /[\cA-\cB]/, '\cA in character class range'; |
551 | ok "abc" =~ /[^\cA-\cB]/, '\cA in negated character class range'; |
552 | ok "a\cBb" =~ /[\cA-\cC]/, '\cB in character class range'; |
553 | ok "a\cCbc" =~ /[^\cA-\cB]/, '\cC in negated character class range'; |
554 | ok "a\cAb" =~ /(??{"\cA"})/, '\cA in ??{} pattern'; |
555 | ok "ab" !~ /a\cIb/x, '\cI in pattern'; |
556 | } |
557 | |
558 | |
559 | { |
9d45b377 |
560 | # perl #28532: optional zero-width match at end of string is ignored |
561 | local $BugId = '28532'; |
562 | ok "abc" =~ /^abc(\z)?/ && defined($1), |
563 | 'Optional zero-width match at end of string'; |
564 | ok "abc" =~ /^abc(\z)??/ && !defined($1), |
565 | 'Optional zero-width match at end of string'; |
e425a60b |
566 | } |
567 | |
568 | |
e425a60b |
569 | |
570 | { |
571 | local $BugId = '36207'; |
572 | my $utf8 = "\xe9\x{100}"; chop $utf8; |
573 | my $latin1 = "\xe9"; |
574 | |
575 | ok $utf8 =~ /\xe9/i, "utf8/latin"; |
576 | ok $utf8 =~ /$latin1/i, "utf8/latin runtime"; |
577 | ok $utf8 =~ /(abc|\xe9)/i, "utf8/latin trie"; |
578 | ok $utf8 =~ /(abc|$latin1)/i, "utf8/latin trie runtime"; |
579 | |
580 | ok "\xe9" =~ /$utf8/i, "latin/utf8"; |
581 | ok "\xe9" =~ /(abc|$utf8)/i, "latin/utf8 trie"; |
582 | ok $latin1 =~ /$utf8/i, "latin/utf8 runtime"; |
583 | ok $latin1 =~ /(abc|$utf8)/i, "latin/utf8 trie runtime"; |
584 | } |
585 | |
586 | |
587 | { |
588 | local $BugId = '37038'; |
589 | my $s = "abcd"; |
590 | $s =~ /(..)(..)/g; |
591 | $s = $1; |
592 | $s = $2; |
593 | iseq $2, 'cd', |
594 | "Assigning to original string does not corrupt match vars"; |
595 | } |
596 | |
597 | |
598 | { |
9d45b377 |
599 | local $PatchId = '26410'; |
e425a60b |
600 | { |
601 | package wooosh; |
602 | sub gloople {"!"} |
603 | } |
604 | my $aeek = bless {} => 'wooosh'; |
605 | eval_ok sub {$aeek -> gloople () =~ /(.)/g}, |
606 | "//g match against return value of sub"; |
607 | |
608 | sub gloople {"!"} |
609 | eval_ok sub {gloople () =~ /(.)/g}, |
610 | "26410 didn't affect sub calls for some reason"; |
611 | } |
612 | |
613 | |
614 | { |
615 | local $TODO = "See changes 26925-26928, which reverted change 26410"; |
616 | { |
617 | package lv; |
618 | our $var = "abc"; |
619 | sub variable : lvalue {$var} |
620 | } |
621 | my $o = bless [] => 'lv'; |
622 | my $f = ""; |
623 | my $r = eval { |
624 | for (1 .. 2) { |
625 | $f .= $1 if $o -> variable =~ /(.)/g; |
626 | } |
627 | 1; |
628 | }; |
629 | if ($r) { |
630 | iseq $f, "ab", "pos() retained between calls"; |
631 | } |
632 | else { |
633 | local $TODO; |
634 | ok 0, "Code failed: $@"; |
635 | } |
636 | |
637 | our $var = "abc"; |
638 | sub variable : lvalue {$var} |
639 | my $g = ""; |
640 | my $s = eval { |
641 | for (1 .. 2) { |
642 | $g .= $1 if variable =~ /(.)/g; |
643 | } |
644 | 1; |
645 | }; |
646 | if ($s) { |
647 | iseq $g, "ab", "pos() retained between calls"; |
648 | } |
649 | else { |
650 | local $TODO; |
651 | ok 0, "Code failed: $@"; |
652 | } |
653 | } |
654 | |
655 | |
656 | SKIP: |
657 | { |
658 | local $BugId = '37836'; |
659 | skip "In EBCDIC" if $IS_EBCDIC; |
660 | no warnings 'utf8'; |
661 | $_ = pack 'U0C2', 0xa2, 0xf8; # Ill-formed UTF-8 |
662 | my $ret = 0; |
663 | eval_ok sub {!($ret = s/[\0]+//g)}, |
664 | "Ill-formed UTF-8 doesn't match NUL in class"; |
665 | } |
666 | |
667 | |
668 | { |
669 | # chr(65535) should be allowed in regexes |
670 | local $BugId = '38293'; |
671 | no warnings 'utf8'; # To allow non-characters |
672 | my ($c, $r, $s); |
673 | |
674 | $c = chr 0xffff; |
675 | $c =~ s/$c//g; |
676 | ok $c eq "", "U+FFFF, parsed as atom"; |
677 | |
678 | $c = chr 0xffff; |
679 | $r = "\\$c"; |
680 | $c =~ s/$r//g; |
681 | ok $c eq "", "U+FFFF backslashed, parsed as atom"; |
682 | |
683 | $c = chr 0xffff; |
684 | $c =~ s/[$c]//g; |
685 | ok $c eq "", "U+FFFF, parsed in class"; |
686 | |
687 | $c = chr 0xffff; |
688 | $r = "[\\$c]"; |
689 | $c =~ s/$r//g; |
690 | ok $c eq "", "U+FFFF backslashed, parsed in class"; |
691 | |
692 | $s = "A\x{ffff}B"; |
693 | $s =~ s/\x{ffff}//i; |
694 | ok $s eq "AB", "U+FFFF, EXACTF"; |
695 | |
696 | $s = "\x{ffff}A"; |
697 | $s =~ s/\bA//; |
698 | ok $s eq "\x{ffff}", "U+FFFF, BOUND"; |
699 | |
700 | $s = "\x{ffff}!"; |
701 | $s =~ s/\B!//; |
702 | ok $s eq "\x{ffff}", "U+FFFF, NBOUND"; |
703 | } |
704 | |
705 | |
706 | { |
707 | local $BugId = '39583'; |
708 | |
709 | # The printing characters |
710 | my @chars = ("A" .. "Z"); |
711 | my $delim = ","; |
712 | my $size = 32771 - 4; |
713 | my $str = ''; |
714 | |
715 | # Create some random junk. Inefficient, but it works. |
716 | for (my $i = 0; $i < $size; $ i++) { |
717 | $str .= $chars [rand @chars]; |
718 | } |
719 | |
720 | $str .= ($delim x 4); |
721 | my $res; |
722 | my $matched; |
723 | ok $str =~ s/^(.*?)${delim}{4}//s, "Pattern matches"; |
724 | iseq $str, "", "Empty string"; |
725 | ok defined $1 && length ($1) == $size, '$1 is correct size'; |
726 | } |
727 | |
728 | |
729 | { |
730 | local $BugId = '27940'; |
731 | ok "\0-A" =~ /\c@-A/, '@- should not be interpolated in a pattern'; |
732 | ok "\0\0A" =~ /\c@+A/, '@+ should not be interpolated in a pattern'; |
733 | ok "X\@-A" =~ /X@-A/, '@- should not be interpolated in a pattern'; |
734 | ok "X\@\@A" =~ /X@+A/, '@+ should not be interpolated in a pattern'; |
735 | |
736 | ok "X\0A" =~ /X\c@?A/, '\c@?'; |
737 | ok "X\0A" =~ /X\c@*A/, '\c@*'; |
738 | ok "X\0A" =~ /X\c@(A)/, '\c@('; |
739 | ok "X\0A" =~ /X(\c@)A/, '\c@)'; |
740 | ok "X\0A" =~ /X\c@|ZA/, '\c@|'; |
741 | |
742 | ok "X\@A" =~ /X@?A/, '@?'; |
743 | ok "X\@A" =~ /X@*A/, '@*'; |
744 | ok "X\@A" =~ /X@(A)/, '@('; |
745 | ok "X\@A" =~ /X(@)A/, '@)'; |
746 | ok "X\@A" =~ /X@|ZA/, '@|'; |
747 | |
748 | local $" = ','; # non-whitespace and non-RE-specific |
749 | ok 'abc' =~ /(.)(.)(.)/, 'The last successful match is bogus'; |
750 | ok "A@+B" =~ /A@{+}B/, 'Interpolation of @+ in /@{+}/'; |
751 | ok "A@-B" =~ /A@{-}B/, 'Interpolation of @- in /@{-}/'; |
752 | ok "A@+B" =~ /A@{+}B/x, 'Interpolation of @+ in /@{+}/x'; |
753 | ok "A@-B" =~ /A@{-}B/x, 'Interpolation of @- in /@{-}/x'; |
754 | } |
755 | |
756 | |
757 | { |
9d45b377 |
758 | local $BugId = '50496'; |
e425a60b |
759 | my $s = 'foo bar baz'; |
760 | my (@k, @v, @fetch, $res); |
761 | my $count = 0; |
762 | my @names = qw ($+{A} $+{B} $+{C}); |
763 | if ($s =~ /(?<A>foo)\s+(?<B>bar)?\s+(?<C>baz)/) { |
764 | while (my ($k, $v) = each (%+)) { |
765 | $count++; |
766 | } |
767 | @k = sort keys (%+); |
768 | @v = sort values (%+); |
769 | $res = 1; |
770 | push @fetch, |
771 | ["$+{A}", "$1"], |
772 | ["$+{B}", "$2"], |
773 | ["$+{C}", "$3"], |
774 | ; |
775 | } |
776 | foreach (0 .. 2) { |
777 | if ($fetch [$_]) { |
778 | iseq $fetch [$_] [0], $fetch [$_] [1], $names [$_]; |
779 | } else { |
780 | ok 0, $names[$_]; |
781 | } |
782 | } |
783 | iseq $res, 1, "'$s' =~ /(?<A>foo)\\s+(?<B>bar)?\\s+(?<C>baz)/"; |
784 | iseq $count, 3, "Got 3 keys in %+ via each"; |
785 | iseq 0 + @k, 3, 'Got 3 keys in %+ via keys'; |
786 | iseq "@k", "A B C", "Got expected keys"; |
787 | iseq "@v", "bar baz foo", "Got expected values"; |
788 | eval ' |
789 | no warnings "uninitialized"; |
790 | print for $+ {this_key_doesnt_exist}; |
791 | '; |
792 | ok !$@, 'lvalue $+ {...} should not throw an exception'; |
793 | } |
794 | |
795 | |
796 | { |
797 | # |
798 | # Almost the same as the block above, except that the capture is nested. |
799 | # |
800 | local $BugId = '50496'; |
801 | my $s = 'foo bar baz'; |
802 | my (@k, @v, @fetch, $res); |
803 | my $count = 0; |
804 | my @names = qw ($+{A} $+{B} $+{C} $+{D}); |
805 | if ($s =~ /(?<D>(?<A>foo)\s+(?<B>bar)?\s+(?<C>baz))/) { |
806 | while (my ($k,$v) = each(%+)) { |
807 | $count++; |
808 | } |
809 | @k = sort keys (%+); |
810 | @v = sort values (%+); |
811 | $res = 1; |
812 | push @fetch, |
813 | ["$+{A}", "$2"], |
814 | ["$+{B}", "$3"], |
815 | ["$+{C}", "$4"], |
816 | ["$+{D}", "$1"], |
817 | ; |
818 | } |
819 | foreach (0 .. 3) { |
820 | if ($fetch [$_]) { |
821 | iseq $fetch [$_] [0], $fetch [$_] [1], $names [$_]; |
822 | } else { |
823 | ok 0, $names [$_]; |
824 | } |
825 | } |
826 | iseq $res, 1, "'$s' =~ /(?<D>(?<A>foo)\\s+(?<B>bar)?\\s+(?<C>baz))/"; |
827 | iseq $count, 4, "Got 4 keys in %+ via each"; |
828 | iseq @k, 4, 'Got 4 keys in %+ via keys'; |
829 | iseq "@k", "A B C D", "Got expected keys"; |
830 | iseq "@v", "bar baz foo foo bar baz", "Got expected values"; |
831 | eval ' |
832 | no warnings "uninitialized"; |
833 | print for $+ {this_key_doesnt_exist}; |
834 | '; |
835 | ok !$@,'lvalue $+ {...} should not throw an exception'; |
836 | } |
837 | |
838 | |
839 | { |
e425a60b |
840 | local $BugId = '36046'; |
841 | my $str = 'abc'; |
842 | my $count = 0; |
843 | my $mval = 0; |
844 | my $pval = 0; |
845 | while ($str =~ /b/g) {$mval = $#-; $pval = $#+; $count ++} |
846 | iseq $mval, 0, '@- should be empty'; |
847 | iseq $pval, 0, '@+ should be empty'; |
848 | iseq $count, 1, 'Should have matched once only'; |
849 | } |
850 | |
851 | |
e425a60b |
852 | |
853 | |
854 | { |
855 | local $BugId = '40684'; |
856 | local $Message = '/m in precompiled regexp'; |
857 | my $s = "abc\ndef"; |
858 | my $rex = qr'^abc$'m; |
859 | ok $s =~ m/$rex/; |
860 | ok $s =~ m/^abc$/m; |
861 | } |
862 | |
863 | |
864 | { |
e425a60b |
865 | local $BugId = '36909'; |
866 | local $Message = '(?: ... )? should not lose $^R'; |
867 | $^R = 'Nothing'; |
868 | { |
869 | local $^R = "Bad"; |
870 | ok 'x foofoo y' =~ m { |
871 | (foo) # $^R correctly set |
872 | (?{ "last regexp code result" }) |
873 | }x; |
874 | iseq $^R, 'last regexp code result'; |
875 | } |
876 | iseq $^R, 'Nothing'; |
877 | |
878 | { |
879 | local $^R = "Bad"; |
880 | |
881 | ok 'x foofoo y' =~ m { |
882 | (?:foo|bar)+ # $^R correctly set |
883 | (?{ "last regexp code result" }) |
884 | }x; |
885 | iseq $^R, 'last regexp code result'; |
886 | } |
887 | iseq $^R, 'Nothing'; |
888 | |
889 | { |
890 | local $^R = "Bad"; |
891 | ok 'x foofoo y' =~ m { |
892 | (foo|bar)\1+ # $^R undefined |
893 | (?{ "last regexp code result" }) |
894 | }x; |
895 | iseq $^R, 'last regexp code result'; |
896 | } |
897 | iseq $^R, 'Nothing'; |
898 | |
899 | { |
900 | local $^R = "Bad"; |
901 | ok 'x foofoo y' =~ m { |
902 | (foo|bar)\1 # This time without the + |
903 | (?{"last regexp code result"}) |
904 | }x; |
905 | iseq $^R, 'last regexp code result'; |
906 | } |
907 | iseq $^R, 'Nothing'; |
908 | } |
909 | |
910 | |
911 | { |
912 | local $BugId = '22395'; |
913 | local $Message = 'Match is linear, not quadratic'; |
914 | our $count; |
915 | for my $l (10, 100, 1000) { |
916 | $count = 0; |
917 | ('a' x $l) =~ /(.*)(?{$count++})[bc]/; |
918 | local $TODO = "Should be L+1 not L*(L+3)/2 (L=$l)"; |
919 | iseq $count, $l + 1; |
920 | } |
921 | } |
922 | |
923 | |
924 | { |
925 | local $BugId = '22614'; |
926 | local $Message = '@-/@+ should not have undefined values'; |
927 | local $_ = 'ab'; |
928 | our @len = (); |
929 | /(.){1,}(?{push @len,0+@-})(.){1,}(?{})^/; |
930 | iseq "@len", "2 2 2"; |
931 | } |
932 | |
933 | |
934 | { |
935 | local $BugId = '18209'; |
936 | local $Message = '$& set on s///'; |
937 | my $text = ' word1 word2 word3 word4 word5 word6 '; |
938 | |
939 | my @words = ('word1', 'word3', 'word5'); |
940 | my $count; |
941 | foreach my $word (@words) { |
942 | $text =~ s/$word\s//gi; # Leave a space to seperate words |
943 | # in the resultant str. |
944 | # The following block is not working. |
945 | if ($&) { |
946 | $count ++; |
947 | } |
948 | # End bad block |
949 | } |
950 | iseq $count, 3; |
951 | iseq $text, ' word2 word4 word6 '; |
952 | } |
953 | |
954 | |
955 | { |
956 | # RT#6893 |
957 | local $BugId = '6893'; |
958 | local $_ = qq (A\nB\nC\n); |
959 | my @res; |
960 | while (m#(\G|\n)([^\n]*)\n#gsx) { |
961 | push @res, "$2"; |
962 | last if @res > 3; |
963 | } |
964 | iseq "@res", "A B C", "/g pattern shouldn't infinite loop"; |
965 | } |
966 | |
967 | |
e425a60b |
968 | |
969 | { |
970 | local $BugId = '41010'; |
971 | local $Message = 'No optimizer bug'; |
972 | my @tails = ('', '(?(1))', '(|)', '()?'); |
973 | my @quants = ('*','+'); |
974 | my $doit = sub { |
975 | my $pats = shift; |
976 | for (@_) { |
977 | for my $pat (@$pats) { |
978 | for my $quant (@quants) { |
979 | for my $tail (@tails) { |
980 | my $re = "($pat$quant\$)$tail"; |
981 | ok /$re/ && $1 eq $_, "'$_' =~ /$re/"; |
982 | ok /$re/m && $1 eq $_, "'$_' =~ /$re/m"; |
983 | } |
984 | } |
985 | } |
986 | } |
987 | }; |
988 | |
989 | my @dpats = ('\d', |
990 | '[1234567890]', |
991 | '(1|[23]|4|[56]|[78]|[90])', |
992 | '(?:1|[23]|4|[56]|[78]|[90])', |
993 | '(1|2|3|4|5|6|7|8|9|0)', |
994 | '(?:1|2|3|4|5|6|7|8|9|0)'); |
995 | my @spats = ('[ ]', ' ', '( |\t)', '(?: |\t)', '[ \t]', '\s'); |
996 | my @sstrs = (' '); |
997 | my @dstrs = ('12345'); |
998 | $doit -> (\@spats, @sstrs); |
999 | $doit -> (\@dpats, @dstrs); |
1000 | } |
1001 | |
1002 | |
e425a60b |
1003 | |
1004 | { |
1005 | local $BugId = '45605'; |
1006 | # [perl #45605] Regexp failure with utf8-flagged and byte-flagged string |
1007 | |
1008 | my $utf_8 = "\xd6schel"; |
1009 | utf8::upgrade ($utf_8); |
1010 | $utf_8 =~ m {(\xd6|Ö)schel}; |
1011 | iseq $1, "\xd6", "Upgrade error"; |
1012 | } |
1013 | |
1014 | { |
e425a60b |
1015 | # Regardless of utf8ness any character matches itself when |
1016 | # doing a case insensitive match. See also [perl #36207] |
1017 | local $BugId = '36207'; |
1018 | for my $o (0 .. 255) { |
1019 | my @ch = (chr ($o), chr ($o)); |
1020 | utf8::upgrade ($ch [1]); |
1021 | for my $u_str (0, 1) { |
1022 | for my $u_pat (0, 1) { |
1023 | ok $ch [$u_str] =~ /\Q$ch[$u_pat]\E/i, |
1024 | "\$c =~ /\$c/i : chr ($o) : u_str = $u_str u_pat = $u_pat"; |
1025 | ok $ch [$u_str] =~ /\Q$ch[$u_pat]\E|xyz/i, |
1026 | "\$c=~/\$c|xyz/i : chr($o) : u_str = $u_str u_pat = $u_pat"; |
1027 | } |
1028 | } |
1029 | } |
1030 | } |
1031 | |
1032 | |
1033 | { |
e425a60b |
1034 | local $BugId = '49190'; |
1035 | local $Message = '$REGMARK in replacement'; |
1036 | our $REGMARK; |
1037 | my $_ = "A"; |
1038 | ok s/(*:B)A/$REGMARK/; |
1039 | iseq $_, "B"; |
1040 | $_ = "CCCCBAA"; |
1041 | ok s/(*:X)A+|(*:Y)B+|(*:Z)C+/$REGMARK/g; |
1042 | iseq $_, "ZYX"; |
1043 | } |
1044 | |
1045 | |
1046 | { |
e425a60b |
1047 | local $BugId = '52658'; |
1048 | local $Message = 'Substitution evaluation in list context'; |
1049 | my $reg = '../xxx/'; |
1050 | my @te = ($reg =~ m{^(/?(?:\.\./)*)}, |
1051 | $reg =~ s/(x)/'b'/eg > 1 ? '##' : '++'); |
1052 | iseq $reg, '../bbb/'; |
1053 | iseq $te [0], '../'; |
1054 | } |
1055 | |
1056 | # This currently has to come before any "use encoding" in this file. |
1057 | { |
1058 | local $Message; |
1059 | local $BugId = '59342'; |
1060 | must_warn 'qr/\400/', '^Use of octal value above 377'; |
1061 | } |
1062 | |
1063 | |
e425a60b |
1064 | |
1065 | { |
1066 | local $BugId = '60034'; |
1067 | my $a = "xyzt" x 8192; |
1068 | ok $a =~ /\A(?>[a-z])*\z/, |
1069 | '(?>) does not cause wrongness on long string'; |
1070 | my $b = $a . chr 256; |
1071 | chop $b; |
1072 | { |
1073 | iseq $a, $b; |
1074 | } |
1075 | ok $b =~ /\A(?>[a-z])*\z/, |
1076 | '(?>) does not cause wrongness on long string with UTF-8'; |
1077 | } |
1078 | |
1079 | |
1080 | # |
1081 | # Keep the following tests last -- they may crash perl |
1082 | # |
1083 | print "# Tests that follow may crash perl\n"; |
1084 | { |
1085 | local $BugId = '19049/38869'; |
1086 | local $Message = 'Pattern in a loop, failure should not ' . |
1087 | 'affect previous success'; |
1088 | my @list = ( |
1089 | 'ab cdef', # Matches regex |
1090 | ('e' x 40000 ) .'ab c' # Matches not, but 'ab c' matches part of it |
1091 | ); |
1092 | my $y; |
1093 | my $x; |
1094 | foreach (@list) { |
1095 | m/ab(.+)cd/i; # The ignore-case seems to be important |
1096 | $y = $1; # Use $1, which might not be from the last match! |
1097 | $x = substr ($list [0], $- [0], $+ [0] - $- [0]); |
1098 | } |
1099 | iseq $y, ' '; |
1100 | iseq $x, 'ab cd'; |
1101 | } |
1102 | |
1103 | |
1104 | { |
1105 | local $BugId = '24274'; |
1106 | |
1107 | ok (("a" x (2 ** 15 - 10)) =~ /^()(a|bb)*$/, "Recursive stack cracker"); |
1108 | ok ((q(a)x 100) =~ /^(??{'(.)'x 100})/, |
1109 | "Regexp /^(??{'(.)'x 100})/ crashes older perls"); |
1110 | } |
1111 | |
1112 | |
1113 | { |
e425a60b |
1114 | # [perl #45337] utf8 + "[a]a{2}" + /$.../ = panic: sv_len_utf8 cache |
1115 | local $BugId = '45337'; |
1116 | local ${^UTF8CACHE} = -1; |
1117 | local $Message = "Shouldn't panic"; |
1118 | my $s = "[a]a{2}"; |
1119 | utf8::upgrade $s; |
1120 | ok "aaa" =~ /$s/; |
1121 | } |
1122 | { |
1123 | local $BugId = '57042'; |
1124 | local $Message = "Check if tree logic breaks \$^R"; |
1125 | my $cond_re = qr/\s* |
1126 | \s* (?: |
1127 | \( \s* A (?{1}) |
1128 | | \( \s* B (?{2}) |
1129 | ) |
1130 | /x; |
1131 | my @res; |
1132 | for my $line ("(A)","(B)") { |
1133 | if ($line =~ m/$cond_re/) { |
1134 | push @res, $^R ? "#$^R" : "UNDEF"; |
1135 | } |
1136 | } |
1137 | iseq "@res","#1 #2"; |
1138 | } |
1139 | { |
1140 | no warnings 'closure'; |
1141 | my $re = qr/A(??{"1"})/; |
1142 | ok "A1B" =~ m/^((??{ $re }))((??{"B"}))$/; |
1143 | ok $1 eq "A1"; |
1144 | ok $2 eq "B"; |
1145 | } |
1146 | |
1147 | |
e425a60b |
1148 | |
1149 | # This only works under -DEBUGGING because it relies on an assert(). |
1150 | { |
1151 | local $BugId = '60508'; |
1152 | local $Message = "Check capture offset re-entrancy of utf8 code."; |
1153 | |
1154 | sub fswash { $_[0] =~ s/([>X])//g; } |
1155 | |
1156 | my $k1 = "." x 4 . ">>"; |
1157 | fswash($k1); |
1158 | |
1159 | my $k2 = "\x{f1}\x{2022}"; |
1160 | $k2 =~ s/([\360-\362])/>/g; |
1161 | fswash($k2); |
1162 | |
1163 | iseq($k2, "\x{2022}", "utf8::SWASHNEW doesn't cause capture leaks"); |
1164 | } |
1165 | |
1166 | |
1167 | { |
1168 | local $BugId = 65372; # minimal CURLYM limited to 32767 matches |
1169 | my @pat = ( |
1170 | qr{a(x|y)*b}, # CURLYM |
1171 | qr{a(x|y)*?b}, # .. with minmod |
1172 | qr{a([wx]|[yz])*b}, # .. and without tries |
1173 | qr{a([wx]|[yz])*?b}, |
1174 | ); |
1175 | my $len = 32768; |
1176 | my $s = join '', 'a', 'x' x $len, 'b'; |
1177 | for my $pat (@pat) { |
1178 | ok($s =~ $pat, $pat); |
1179 | } |
1180 | } |
e425a60b |
1181 | } # End of sub run_tests |
1182 | |
1183 | 1; |