X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=t%2Fop%2Fpat.t;h=dff0b65bf105ca95da3d8efc091d7b2a52b45c90;hb=57c348a981665d6305f7f38920ab85e57a77ae65;hp=a00e624bf4333e8c2fda4f98bbdbbf4b9277f36f;hpb=f272994b1ce9066a01ab0ed3d6c5353b37057838;p=p5sagit%2Fp5-mst-13.2.git diff --git a/t/op/pat.t b/t/op/pat.t index a00e624..dff0b65 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -6,7 +6,7 @@ $| = 1; -print "1..864\n"; +print "1..1195\n"; BEGIN { chdir 't' if -d 't'; @@ -20,9 +20,8 @@ $x = "abc\ndef\n"; if ($x =~ /^abc/) {print "ok 1\n";} else {print "not ok 1\n";} if ($x !~ /^def/) {print "ok 2\n";} else {print "not ok 2\n";} -$* = 1; -if ($x =~ /^def/) {print "ok 3\n";} else {print "not ok 3\n";} -$* = 0; +# used to be a test for $* +if ($x =~ /^def/m) {print "ok 3\n";} else {print "not ok 3\n";} $_ = '123'; if (/^([0-9][0-9]*)/) {print "ok 4\n";} else {print "not ok 4\n";} @@ -69,9 +68,8 @@ if (m|bc/*d|) {print "ok 22\n";} else {print "not ok 22\n";} if (/^$_$/) {print "ok 23\n";} else {print "not ok 23\n";} -$* = 1; # test 3 only tested the optimized version--this one is for real -if ("ab\ncd\n" =~ /^cd/) {print "ok 24\n";} else {print "not ok 24\n";} -$* = 0; +# used to be a test for $* +if ("ab\ncd\n" =~ /^cd/m) {print "ok 24\n";} else {print "not ok 24\n";} $XXX{123} = 123; $XXX{234} = 234; @@ -81,12 +79,21 @@ $XXX{345} = 345; while ($_ = shift(@XXX)) { ?(.*)? && (print $1,"\n"); /not/ && reset; - /not ok 26/ && reset 'X'; + if (/not ok 26/) { + if ($^O eq 'VMS') { + $_ = shift(@XXX); + } + else { + reset 'X'; + } + } } -while (($key,$val) = each(%XXX)) { +if ($^O ne 'VMS') { + while (($key,$val) = each(%XXX)) { print "not ok 27\n"; exit; + } } print "ok 27\n"; @@ -1367,10 +1374,10 @@ print "ok 247\n"; print "ok 263\n"; } -{ +SKIP: { my $test = 264; # till 575 - use charnames ':full'; + use charnames ":full"; # This is far from complete testing, there are dozens of character # classes in Unicode. The mixing of literals and \N{...} is @@ -1691,10 +1698,11 @@ EOT print "not " if $x =~ /[\x{100}]/; print "ok 604\n"; - print "not " unless $x =~ /\p{InLatin1Supplement}/; + # the next two tests must be ignored on EBCDIC + print "not " unless $x =~ /\p{InLatin1Supplement}/ or ord("A") == 193; print "ok 605\n"; - print "not " if $x =~ /\P{InLatin1Supplement}/; + print "not " if $x =~ /\P{InLatin1Supplement}/ and ord("A") != 193; print "ok 606\n"; print "not " if $x =~ /\p{InLatinExtendedA}/; @@ -1909,8 +1917,10 @@ print "ok 663\n"; print "not " unless chr(0xfb4f) =~ /\p{IsHebrew}/; # outside InHebrew print "ok 664\n"; -print "not " unless chr(0xb5) =~ /\p{IsGreek}/; # singleton (not in a range) -print "ok 665\n"; +# # singleton (not in a range, this test must be ignored on EBCDIC) +# print "not " unless chr(0xb5) =~ /\p{IsGreek}/ or ord("A") == 193; +# print "ok 665\n"; +print "ok 665 # 0xb5 moved from Greek to Common with Unicode 4.0.1\n"; print "not " unless chr(0x37a) =~ /\p{IsGreek}/; # singleton print "ok 666\n"; @@ -2235,10 +2245,11 @@ print "# some Unicode properties\n"; } { - print "not " unless "a" =~ /\p{L&}/; + # L& and LC are the same + print "not " unless "a" =~ /\p{LC}/ and "a" =~ /\p{L&}/; print "ok 743\n"; - print "not " if "1" =~ /\p{L&}/; + print "not " if "1" =~ /\p{LC}/ or "1" =~ /\p{L&}/; print "ok 744\n"; } @@ -2730,3 +2741,687 @@ print "# some Unicode properties\n"; print $u eq "feeber" ? "ok 864\n" : "not ok 864\n"; } +{ + print "# UTF-8 bug with s///\n"; + # check utf8/non-utf8 mixtures + # try to force all float/anchored check combinations + my $c = "\x{100}"; + my $test = 865; + my $subst; + for my $re ( + "xx.*$c", "x.*$c$c", "$c.*xx", "$c$c.*x", "xx.*(?=$c)", "(?=$c).*xx", + ) { + print "xxx" =~ /$re/ ? "not ok $test\n" : "ok $test\n"; + ++$test; + print +($subst = "xxx") =~ s/$re// ? "not ok $test\n" : "ok $test\n"; + ++$test; + } + for my $re ("xx.*$c*", "$c*.*xx") { + print "xxx" =~ /$re/ ? "ok $test\n" : "not ok $test\n"; + ++$test; + ($subst = "xxx") =~ s/$re//; + print $subst eq '' ? "ok $test\n" : "not ok $test\t# $subst\n"; + ++$test; + } + for my $re ("xxy*", "y*xx") { + print "xx$c" =~ /$re/ ? "ok $test\n" : "not ok $test\n"; + ++$test; + ($subst = "xx$c") =~ s/$re//; + print $subst eq $c ? "ok $test\n" : "not ok $test\n"; + ++$test; + print "xy$c" =~ /$re/ ? "not ok $test\n" : "ok $test\n"; + ++$test; + print +($subst = "xy$c") =~ /$re/ ? "not ok $test\n" : "ok $test\n"; + ++$test; + } + for my $re ("xy$c*z", "x$c*yz") { + print "xyz" =~ /$re/ ? "ok $test\n" : "not ok $test\n"; + ++$test; + ($subst = "xyz") =~ s/$re//; + print $subst eq '' ? "ok $test\n" : "not ok $test\n"; + ++$test; + } +} + +{ + print "# qr/.../x\n"; + my $test = 893; + + my $R = qr/ A B C # D E/x; + + print eval {"ABCDE" =~ $R} ? "ok $test\n" : "not ok $test\n"; + $test++; + + print eval {"ABCDE" =~ m/$R/} ? "ok $test\n" : "not ok $test\n"; + $test++; + + print eval {"ABCDE" =~ m/($R)/} ? "ok $test\n" : "not ok $test\n"; + $test++; +} + +{ + print "# illegal Unicode properties\n"; + my $test = 896; + + print eval { "a" =~ /\pq / } ? "not ok $test\n" : "ok $test\n"; + $test++; + + print eval { "a" =~ /\p{qrst} / } ? "not ok $test\n" : "ok $test\n"; + $test++; +} + +{ + print "# [ID 20020412.005] wrong pmop flags checked when empty pattern\n"; + # requires reuse of last successful pattern + my $test = 898; + $test =~ /\d/; + for (0 .. 1) { + my $match = ?? + 0; + if ($match != $_) { + print "ok $test\n"; + } else { + printf "not ok %s\t# 'match once' %s on %s iteration\n", $test, + $match ? 'succeeded' : 'failed', $_ ? 'second' : 'first'; + } + ++$test; + } + $test =~ /(\d)/; + my $result = join '', $test =~ //g; + if ($result eq $test) { + print "ok $test\n"; + } else { + printf "not ok %s\t# expected '%s', got '%s'\n", $test, $test, $result; + } + ++$test; +} + +print "# user-defined character properties\n"; + +sub InKana1 { + return <<'END'; +3040 309F +30A0 30FF +END +} + +sub InKana2 { + return <<'END'; ++utf8::InHiragana ++utf8::InKatakana +END +} + +sub InKana3 { + return <<'END'; ++utf8::InHiragana ++utf8::InKatakana +-utf8::IsCn +END +} + +sub InNotKana { + return <<'END'; +!utf8::InHiragana +-utf8::InKatakana ++utf8::IsCn +END +} + +$test = 901; + +print "\x{3040}" =~ /\p{InKana1}/ ? "ok $test\n" : "not ok $test\n"; $test++; +print "\x{303F}" =~ /\P{InKana1}/ ? "ok $test\n" : "not ok $test\n"; $test++; + +print "\x{3040}" =~ /\p{InKana2}/ ? "ok $test\n" : "not ok $test\n"; $test++; +print "\x{303F}" =~ /\P{InKana2}/ ? "ok $test\n" : "not ok $test\n"; $test++; + +print "\x{3041}" =~ /\p{InKana3}/ ? "ok $test\n" : "not ok $test\n"; $test++; +print "\x{3040}" =~ /\P{InKana3}/ ? "ok $test\n" : "not ok $test\n"; $test++; + +print "\x{3040}" =~ /\p{InNotKana}/ ? "ok $test\n" : "not ok $test\n"; $test++; +print "\x{3041}" =~ /\P{InNotKana}/ ? "ok $test\n" : "not ok $test\n"; $test++; + +sub InConsonant { # Not EBCDIC-aware. + return < fail\n"; + ++$test; + print +(!$r or pos($s) == $len + 1) ? "ok $test\n" + : "not ok $test\t# <$type x $len> pos @{[ pos($s) ]}\n"; + ++$test; + } + } +} + +$test = 923; + +$a = bless qr/foo/, 'Foo'; +print(('goodfood' =~ $a ? '' : 'not '), + "ok $test\t# reblessed qr// matches\n"); +++$test; + +print(($a eq '(?-xism:foo)' ? '' : 'not '), + "ok $test\t# reblessed qr// stringizes\n"); +++$test; + +$x = "\x{3fe}"; +$z=$y = "\317\276"; # $y is byte representation of $x + +$a = qr/$x/; +print(($x =~ $a ? '' : 'not '), "ok $test - utf8 interpolation in qr//\n"); +++$test; + +print(("a$a" =~ $x ? '' : 'not '), + "ok $test - stringifed qr// preserves utf8\n"); +++$test; + +print(("a$x" =~ /^a$a\z/ ? '' : 'not '), + "ok $test - interpolated qr// preserves utf8\n"); +++$test; + +print(("a$x" =~ /^a(??{$a})\z/ ? '' : 'not '), + "ok $test - postponed interpolation of qr// preserves utf8\n"); +++$test; + +print((length(qr/##/x) == 12 ? '' : 'not '), + "ok $test - ## in qr// doesn't corrupt memory [perl #17776]\n"); +++$test; + +{ use re 'eval'; + +print(("$x$x" =~ /^$x(??{$x})\z/ ? '' : 'not '), + "ok $test - postponed utf8 string in utf8 re matches utf8\n"); +++$test; + +print(("$y$x" =~ /^$y(??{$x})\z/ ? '' : 'not '), + "ok $test - postponed utf8 string in non-utf8 re matches utf8\n"); +++$test; + +print(("$y$x" !~ /^$y(??{$y})\z/ ? '' : 'not '), + "ok $test - postponed non-utf8 string in non-utf8 re doesn't match utf8\n"); +++$test; + +print(("$x$x" !~ /^$x(??{$y})\z/ ? '' : 'not '), + "ok $test - postponed non-utf8 string in utf8 re doesn't match utf8\n"); +++$test; + +print(("$y$y" =~ /^$y(??{$y})\z/ ? '' : 'not '), + "ok $test - postponed non-utf8 string in non-utf8 re matches non-utf8\n"); +++$test; + +print(("$x$y" =~ /^$x(??{$y})\z/ ? '' : 'not '), + "ok $test - postponed non-utf8 string in utf8 re matches non-utf8\n"); +++$test; +$y = $z; # reset $y after upgrade + +print(("$x$y" !~ /^$x(??{$x})\z/ ? '' : 'not '), + "ok $test - postponed utf8 string in utf8 re doesn't match non-utf8\n"); +++$test; +$y = $z; # reset $y after upgrade + +print(("$y$y" !~ /^$y(??{$x})\z/ ? '' : 'not '), + "ok $test - postponed utf8 string in non-utf8 re doesn't match non-utf8\n"); +++$test; + +} # no re 'eval' + +print "# more user-defined character properties\n"; + +sub IsSyriac1 { + return <<'END'; +0712 072C +0730 074A +END +} + +ok("\x{0712}" =~ /\p{IsSyriac1}/, '\x{0712}, \p{IsSyriac1}'); +ok("\x{072F}" =~ /\P{IsSyriac1}/, '\x{072F}, \P{IsSyriac1}'); + +sub Syriac1 { + return <<'END'; +0712 072C +0730 074A +END +} + +ok("\x{0712}" =~ /\p{Syriac1}/, '\x{0712}, \p{Syriac1}'); +ok("\x{072F}" =~ /\P{Syriac1}/, '\x{072F}, \p{Syriac1}'); + +print "# user-defined character properties may lack \\n at the end\n"; +sub InGreekSmall { return "03B1\t03C9" } +sub InGreekCapital { return "0391\t03A9\n-03A2" } + +ok("\x{03C0}" =~ /\p{InGreekSmall}/, "Small pi"); +ok("\x{03C2}" =~ /\p{InGreekSmall}/, "Final sigma"); +ok("\x{03A0}" =~ /\p{InGreekCapital}/, "Capital PI"); +ok("\x{03A2}" =~ /\P{InGreekCapital}/, "Reserved"); + +sub AsciiHexAndDash { + return <<'END'; ++utf8::ASCII_Hex_Digit ++utf8::Dash +END +} + +ok("-" =~ /\p{Dash}/, "'-' is Dash"); +ok("A" =~ /\p{ASCII_Hex_Digit}/, "'A' is ASCII_Hex_Digit"); +ok("-" =~ /\p{AsciiHexAndDash}/, "'-' is AsciiHexAndDash"); +ok("A" =~ /\p{AsciiHexAndDash}/, "'A' is AsciiHexAndDash"); + +{ + print "# Change #18179\n"; + # previously failed with "panic: end_shift + my $s = "\x{100}" x 5; + my $ok = $s =~ /(\x{100}{4})/; + my($ord, $len) = (ord $1, length $1); + print +($ok && $ord == 0x100 && $len == 4) + ? "ok $test\n" : "not ok $test\t# $ok/$ord/$len\n"; + ++$test; +} + +{ + print "# [perl #15763]\n"; + + $a = "x\x{100}"; + chop $a; # but leaves the UTF-8 flag + $a .= "y"; # 1 byte before "y" + + ok($a =~ /^\C/, 'match one \C on 1-byte UTF-8'); + ok($a =~ /^\C{1}/, 'match \C{1}'); + + ok($a =~ /^\Cy/, 'match \Cy'); + ok($a =~ /^\C{1}y/, 'match \C{1}y'); + + $a = "\x{100}y"; # 2 bytes before "y" + + ok($a =~ /^\C/, 'match one \C on 2-byte UTF-8'); + ok($a =~ /^\C{1}/, 'match \C{1}'); + ok($a =~ /^\C\C/, 'match two \C'); + ok($a =~ /^\C{2}/, 'match \C{2}'); + + ok($a =~ /^\C\C\C/, 'match three \C on 2-byte UTF-8 and a byte'); + ok($a =~ /^\C{3}/, 'match \C{3}'); + + ok($a =~ /^\C\Cy/, 'match two \C'); + ok($a =~ /^\C{2}y/, 'match \C{2}'); + + ok($a !~ /^\C\C\Cy/, q{don't match three \Cy}); + ok($a !~ /^\C{2}\Cy/, q{don't match \C{3}y}); + + $a = "\x{1000}y"; # 3 bytes before "y" + + ok($a =~ /^\C/, 'match one \C on three-byte UTF-8'); + ok($a =~ /^\C{1}/, 'match \C{1}'); + ok($a =~ /^\C\C/, 'match two \C'); + ok($a =~ /^\C{2}/, 'match \C{2}'); + ok($a =~ /^\C\C\C/, 'match three \C'); + ok($a =~ /^\C{3}/, 'match \C{3}'); + + ok($a =~ /^\C\C\C\C/, 'match four \C on three-byte UTF-8 and a byte'); + ok($a =~ /^\C{4}/, 'match \C{4}'); + + ok($a =~ /^\C\C\Cy/, 'match three \Cy'); + ok($a =~ /^\C{3}y/, 'match \C{3}y'); + + ok($a !~ /^\C\C\C\C\y/, q{don't match four \Cy}); + ok($a !~ /^\C{4}y/, q{don't match \C{4}y}); +} + +$_ = 'aaaaaaaaaa'; +utf8::upgrade($_); chop $_; $\="\n"; +ok(/[^\s]+/, "m/[^\s]/ utf8"); +ok(/[^\d]+/, "m/[^\d]/ utf8"); +ok(($a = $_, $_ =~ s/[^\s]+/./g), "s/[^\s]/ utf8"); +ok(($a = $_, $a =~ s/[^\d]+/./g), "s/[^\s]/ utf8"); + +ok("\x{100}" =~ /\x{100}/, "[perl #15397]"); +ok("\x{100}" =~ /(\x{100})/, "[perl #15397]"); +ok("\x{100}" =~ /(\x{100}){1}/, "[perl #15397]"); +ok("\x{100}\x{100}" =~ /(\x{100}){2}/, "[perl #15397]"); +ok("\x{100}\x{100}" =~ /(\x{100})(\x{100})/, "[perl #15397]"); + +$x = "CD"; +$x =~ /(AB)*?CD/; +ok(!defined $1, "[perl #7471]"); + +$x = "CD"; +$x =~ /(AB)*CD/; +ok(!defined $1, "[perl #7471]"); + +$pattern = "^(b+?|a){1,2}c"; +ok("bac" =~ /$pattern/ && $1 eq 'a', "[perl #3547]"); +ok("bbac" =~ /$pattern/ && $1 eq 'a', "[perl #3547]"); +ok("bbbac" =~ /$pattern/ && $1 eq 'a', "[perl #3547]"); +ok("bbbbac" =~ /$pattern/ && $1 eq 'a', "[perl #3547]"); + +{ + # [perl #18232] + "\x{100}" =~ /(.)/; + ok( $1 eq "\x{100}", '$1 is utf-8 [perl #18232]' ); + { 'a' =~ /./; } + ok( $1 eq "\x{100}", '$1 is still utf-8' ); + ok( $1 ne "\xC4\x80", '$1 is not non-utf-8' ); +} + +{ + use utf8; + my $attr = 'Name-1' ; + + my $NormalChar = qr/[\p{IsDigit}\p{IsLower}\p{IsUpper}]/; + my $NormalWord = qr/${NormalChar}+?/; + my $PredNameHyphen = qr/^${NormalWord}(\-${NormalWord})*?$/; + + $attr =~ /^$/; + ok( $attr =~ $PredNameHyphen, "[perl #19767] original test" ); +} + +{ + use utf8; + "a" =~ m/[b]/; + ok ( "0" =~ /\p{N}+\z/, "[perl #19767] variant test" ); +} + +{ + + $p = 1; + foreach (1,2,3,4) { + $p++ if /(??{ $p })/ + } + ok ($p == 5, "[perl #20683] (??{ }) returns stale values"); + { package P; $a=1; sub TIESCALAR { bless[] } sub FETCH { $a++ } } + tie $p, P; + foreach (1,2,3,4) { + /(??{ $p })/ + } + ok ( $p == 5, "(??{ }) returns stale values"); +} + +{ + # Subject: Odd regexp behavior + # From: Markus Kuhn + # Date: Wed, 26 Feb 2003 16:53:12 +0000 + # Message-Id: + # To: perl-unicode@perl.org + + $x = "\x{2019}\nk"; $x =~ s/(\S)\n(\S)/$1 $2/sg; + ok($x eq "\x{2019} k", "Markus Kuhn 2003-02-26"); + + $x = "b\nk"; $x =~ s/(\S)\n(\S)/$1 $2/sg; + ok($x eq "b k", "Markus Kuhn 2003-02-26"); + + ok("\x{2019}" =~ /\S/, "Markus Kuhn 2003-02-26"); +} + +{ + my $i; + ok('-1-3-5-' eq join('', split /((??{$i++}))/, '-1-3-5-'), + "[perl #21411] (??{ .. }) corrupts split's stack"); + split /(?{'WOW'})/, 'abc'; + ok('a|b|c' eq join ('|', @_), + "[perl #21411] (?{ .. }) version of the above"); +} + +{ + split /(?{ split "" })/, "abc"; + ok(1,'cache_re & "(?{": it dumps core in 5.6.1 & 5.8.0'); +} + +{ + ok("\x{100}\n" =~ /\x{100}\n$/, "UTF8 length cache and fbm_compile"); +} + +{ + package Str; + use overload q/""/ => sub { ${$_[0]}; }; + sub new { my ($c, $v) = @_; bless \$v, $c; } + + package main; + $_ = Str->new("a\x{100}/\x{100}b"); + ok(join(":", /\b(.)\x{100}/g) eq "a:/", "re_intuit_start and PL_bostr"); +} + +{ + $_ = "code: 'x' { '...' }\n"; study; + my @x; push @x, $& while m/'[^\']*'/gx; + ok(join(":", @x) eq "'x':'...'", + "[perl #17757] Parse::RecDescent triggers infinite loop"); +} + +{ + my $re = qq/^([^X]*)X/; + utf8::upgrade($re); + ok("\x{100}X" =~ /$re/, "S_cl_and ANYOF_UNICODE & ANYOF_INVERTED"); +} + +# bug #22354 +sub func ($) { + ok( "a\nb" !~ /^b/, $_[0] ); + ok( "a\nb" =~ /^b/m, "$_[0] - with /m" ); +} +func "standalone"; +$_ = "x"; s/x/func "in subst"/e; +$_ = "x"; s/x/func "in multiline subst"/em; +#$_ = "x"; /x(?{func "in regexp"})/; +#$_ = "x"; /x(?{func "in multiline regexp"})/m; + +# bug #19049 +$_="abcdef\n"; +@x = m/./g; +ok("abcde" eq "$`", '# TODO #19049 - global match not setting $`'); + +ok("123\x{100}" =~ /^.*1.*23\x{100}$/, 'uft8 + multiple floating substr'); + +# LATIN SMALL/CAPITAL LETTER A WITH MACRON +ok(" \x{101}" =~ qr/\x{100}/i, + "<20030808193656.5109.1@llama.ni-s.u-net.com>"); + +# LATIN SMALL/CAPITAL LETTER A WITH RING BELOW +ok(" \x{1E01}" =~ qr/\x{1E00}/i, + "<20030808193656.5109.1@llama.ni-s.u-net.com>"); + +# DESERET SMALL/CAPITAL LETTER LONG I +ok(" \x{10428}" =~ qr/\x{10400}/i, + "<20030808193656.5109.1@llama.ni-s.u-net.com>"); + +# LATIN SMALL/CAPITAL LETTER A WITH RING BELOW + 'X' +ok(" \x{1E01}x" =~ qr/\x{1E00}X/i, + "<20030808193656.5109.1@llama.ni-s.u-net.com>"); + +{ + # [perl #23769] Unicode regex broken on simple example + # regrepeat() didn't handle UTF-8 EXACT case right. + + my $s = "\x{a0}\x{a0}\x{a0}\x{100}"; chop $s; + + ok($s =~ /\x{a0}/, "[perl #23769]"); + ok($s =~ /\x{a0}+/, "[perl #23769]"); + ok($s =~ /\x{a0}\x{a0}/, "[perl #23769]"); + + ok("aaa\x{100}" =~ /(a+)/, "[perl #23769] easy invariant"); + ok($1 eq "aaa", "[perl #23769]"); + + ok("\xa0\xa0\xa0\x{100}" =~ /(\xa0+)/, "[perl #23769] regrepeat invariant"); + ok($1 eq "\xa0\xa0\xa0", "[perl #23769]"); + + ok("ababab\x{100} " =~ /((?:ab)+)/, "[perl #23769] hard invariant"); + ok($1 eq "ababab", "[perl #23769]"); + + ok("\xa0\xa1\xa0\xa1\xa0\xa1\x{100}" =~ /((?:\xa0\xa1)+)/, "[perl #23769] hard variant"); + ok($1 eq "\xa0\xa1\xa0\xa1\xa0\xa1", "[perl #23769]"); + + ok("aaa\x{100} " =~ /(a+?)/, "[perl #23769] easy invariant"); + ok($1 eq "a", "[perl #23769]"); + + ok("\xa0\xa0\xa0\x{100} " =~ /(\xa0+?)/, "[perl #23769] regrepeat variant"); + ok($1 eq "\xa0", "[perl #23769]"); + + ok("ababab\x{100} " =~ /((?:ab)+?)/, "[perl #23769] hard invariant"); + ok($1 eq "ab", "[perl #23769]"); + + ok("\xa0\xa1\xa0\xa1\xa0\xa1\x{100}" =~ /((?:\xa0\xa1)+?)/, "[perl #23769] hard variant"); + ok($1 eq "\xa0\xa1", "[perl #23769]"); + + ok("\xc4\xc4\xc4" !~ /(\x{100}+)/, "[perl #23769] don't match first byte of utf8 representation"); + ok("\xc4\xc4\xc4" !~ /(\x{100}+?)/, "[perl #23769] don't match first byte of utf8 representation"); +} + +for (120 .. 130) { + my $head = 'x' x $_; + for my $tail ('\x{0061}', '\x{1234}') { + ok( + eval qq{ "$head$tail" =~ /$head$tail/ }, + '\x{...} misparsed in regexp near 127 char EXACT limit' + ); + } +} + +# perl #25269: panic: pp_match start/end pointers +ok("a-bc" eq eval { + my($x, $y) = "bca" =~ /^(?=.*(a)).*(bc)/; + "$x-$y"; +}, 'captures can move backwards in string'); + +# perl #27940: \cA not recognized in character classes +ok("a\cAb" =~ /\cA/, '\cA in pattern'); +ok("a\cAb" =~ /[\cA]/, '\cA in character class'); +ok("a\cAb" =~ /[\cA-\cB]/, '\cA in character class range'); +ok("abc" =~ /[^\cA-\cB]/, '\cA in negated character class range'); +ok("a\cBb" =~ /[\cA-\cC]/, '\cB in character class range'); +ok("a\cCbc" =~ /[^\cA-\cB]/, '\cC in negated character class range'); +ok("a\cAb" =~ /(??{"\cA"})/, '\cA in ??{} pattern'); + +# perl #28532: optional zero-width match at end of string is ignored +ok(("abc" =~ /^abc(\z)?/) && defined($1), + 'optional zero-width match at end of string'); +ok(("abc" =~ /^abc(\z)??/) && !defined($1), + 'optional zero-width match at end of string'); + + + +{ # TRIE related + my @got=(); + "words"=~/(word|word|word)(?{push @got,$1})s$/; + ok(@got==1,"TRIE optimation is working") or warn "# @got"; + @got=(); + "words"=~/(word|word|word)(?{push @got,$1})s$/i; + ok(@got==1,"TRIEF optimisation is working") or warn "# @got"; + + my @nums=map {int rand 1000} 1..100; + my $re="(".(join "|",@nums).")"; + $re=qr/\b$re\b/; + + foreach (@nums) { + ok($_=~/$re/,"Trie nums"); + } + $_=join " ", @nums; + @got=(); + push @got,$1 while /$re/g; + + my %count; + $count{$_}++ for @got; + my $ok=1; + for (@nums) { + $ok=0 if --$count{$_}<0; + } + ok($ok,"Trie min count matches"); +} + + +# TRIE related +# LATIN SMALL/CAPITAL LETTER A WITH MACRON +ok(("foba \x{101}foo" =~ qr/(foo|\x{100}foo|bar)/i) && $1 eq "\x{101}foo", + "TRIEF + LATIN SMALL/CAPITAL LETTER A WITH MACRON"); + +# LATIN SMALL/CAPITAL LETTER A WITH RING BELOW +ok(("foba \x{1E01}foo" =~ qr/(foo|\x{1E00}foo|bar)/i) && $1 eq "\x{1E01}foo", + "TRIEF + LATIN SMALL/CAPITAL LETTER A WITH RING BELOW"); + +# DESERET SMALL/CAPITAL LETTER LONG I +ok(("foba \x{10428}foo" =~ qr/(foo|\x{10400}foo|bar)/i) && $1 eq "\x{10428}foo", + "TRIEF + DESERET SMALL/CAPITAL LETTER LONG I"); + +# LATIN SMALL/CAPITAL LETTER A WITH RING BELOW + 'X' +ok(("foba \x{1E01}xfoo" =~ qr/(foo|\x{1E00}Xfoo|bar)/i) && $1 eq "\x{1E01}xfoo", + "TRIEF + LATIN SMALL/CAPITAL LETTER A WITH RING BELOW + 'X'"); + +{# TRIE related + +use charnames ':full'; + +$s="\N{LATIN SMALL LETTER SHARP S}"; +ok(("foba ba$s" =~ qr/(foo|Ba$s|bar)/i) + && $1 eq "ba$s", + "TRIEF + LATIN SMALL LETTER SHARP S =~ ss"); +ok(("foba ba$s" =~ qr/(Ba$s|foo|bar)/i) + && $1 eq "ba$s", + "TRIEF + LATIN SMALL LETTER SHARP S =~ ss"); +ok(("foba ba$s" =~ qr/(foo|bar|Ba$s)/i) + && $1 eq "ba$s", + "TRIEF + LATIN SMALL LETTER SHARP S =~ ss"); + +ok(("foba ba$s" =~ qr/(foo|Bass|bar)/i) + && $1 eq "ba$s", + "TRIEF + LATIN SMALL LETTER SHARP S =~ ss"); + +ok(("foba ba$s" =~ qr/(foo|BaSS|bar)/i) + && $1 eq "ba$s", + "TRIEF + LATIN SMALL LETTER SHARP S =~ SS"); +} + + + +{ + my @normal=qw(these are some normal words); + my $psycho=join "|",@normal,map chr $_,255..20000; + ok(('these'=~/($psycho)/) && $1 eq 'these','Pyscho'); +} + +# [perl #36207] mixed utf8 / latin-1 and case folding + +{ + my $utf8 = "\xe9\x{100}"; chop $utf8; + my $latin1 = "\xe9"; + + ok($utf8 =~ /\xe9/i, "utf8/latin"); + ok($utf8 =~ /$latin1/i, "utf8/latin runtime"); + ok($utf8 =~ /(abc|\xe9)/i, "utf8/latin trie"); + ok($utf8 =~ /(abc|$latin1)/i, "utf8/latin trie runtime"); + + ok("\xe9" =~ /$utf8/i, "# TODO latin/utf8"); + ok("\xe9" =~ /(abc|$utf8)/i, "# latin/utf8 trie"); + ok($latin1 =~ /$utf8/i, "# TODO latin/utf8 runtime"); + ok($latin1 =~ /(abc|$utf8)/i, "# latin/utf8 trie runtime"); +} + +# [perl #37038] Global regular matches generate invalid pointers + +{ + my $s = "abcd"; + $s =~ /(..)(..)/g; + $s = $1; + $s = $2; + ok($s eq 'cd', + "# TODO assigning to original string should not corrupt match vars"); +} + +# last test 1195 +