From: Rafael Garcia-Suarez Date: Thu, 12 Dec 2002 20:35:29 +0000 (+0000) Subject: Integrate from maint-5.8 : changes 18290-1, 18293-5, 18297 X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=f14c76ed18fcf3fc609cea29294703220581a43a;p=p5sagit%2Fp5-mst-13.2.git Integrate from maint-5.8 : changes 18290-1, 18293-5, 18297 p4raw-id: //depot/perl@18299 p4raw-integrated: from //depot/maint-5.8/perl@18298 'copy in' pod/perlretut.pod (@17645..) pod/perlre.pod (@18080..) ext/POSIX/t/is.t (@18189..) t/op/subst.t (@18214..) ext/POSIX/t/posix.t (@18271..) t/op/pat.t (@18276..) ext/POSIX/POSIX.pod (@18294..) 'merge in' regexec.c (@18095..) --- diff --git a/ext/POSIX/POSIX.pod b/ext/POSIX/POSIX.pod index a455085..4d1ded6 100644 --- a/ext/POSIX/POSIX.pod +++ b/ext/POSIX/POSIX.pod @@ -580,15 +580,20 @@ see L<perlfunc/ioctl>. =item isalnum -This is identical to the C function, except that it can apply to a single -character or to a whole string. Consider using regular expressions and the -C</[[:alnum:]]/> construct instead, or possibly the C</\w/> construct. +This is identical to the C function, except that it can apply to a +single character or to a whole string. Note that locale settings may +affect what characters are considered C<isalnum>. Does not work on +Unicode characters code point 256 or higher. Consider using regular +expressions and the C</[[:alnum:]]/> construct instead, or possibly +the C</\w/> construct. =item isalpha -This is identical to the C function, except that it can apply to a single -character or to a whole string. Consider using regular expressions and the -C</[[:alpha:]]/> construct instead. +This is identical to the C function, except that it can apply to +a single character or to a whole string. Note that locale settings +may affect what characters are considered C<isalpha>. Does not work +on Unicode characters code point 256 or higher. Consider using regular +expressions and the C</[[:alpha:]]/> construct instead. =item isatty @@ -597,60 +602,82 @@ to a tty. Similar to the C<-t> operator, see L<perlfunc/-X>. 
=item iscntrl -This is identical to the C function, except that it can apply to a single -character or to a whole string. Consider using regular expressions and the -C</[[:cntrl:]]/> construct instead. +This is identical to the C function, except that it can apply to +a single character or to a whole string. Note that locale settings +may affect what characters are considered C<iscntrl>. Does not work +on Unicode characters code point 256 or higher. Consider using regular +expressions and the C</[[:cntrl:]]/> construct instead. =item isdigit -This is identical to the C function, except that it can apply to a single -character or to a whole string. Consider using regular expressions and the -C</[[:digit:]]/> construct instead, or the C</\d/> construct. +This is identical to the C function, except that it can apply to +a single character or to a whole string. Note that locale settings +may affect what characters are considered C<isdigit> (unlikely, but +still possible). Does not work on Unicode characters code point 256 +or higher. Consider using regular expressions and the C</[[:digit:]]/> +construct instead, or the C</\d/> construct. =item isgraph -This is identical to the C function, except that it can apply to a single -character or to a whole string. Consider using regular expressions and the -C</[[:graph:]]/> construct instead. +This is identical to the C function, except that it can apply to +a single character or to a whole string. Note that locale settings +may affect what characters are considered C<isgraph>. Does not work +on Unicode characters code point 256 or higher. Consider using regular +expressions and the C</[[:graph:]]/> construct instead. =item islower -This is identical to the C function, except that it can apply to a single -character or to a whole string. Consider using regular expressions and the -C</[[:lower:]]/> construct instead. Do B<not> use C</[a-z]/>. +This is identical to the C function, except that it can apply to +a single character or to a whole string. Note that locale settings +may affect what characters are considered C<islower>. Does not work +on Unicode characters code point 256 or higher. 
Consider using regular +expressions and the C</[[:lower:]]/> construct instead. Do B<not> use +C</[a-z]/>. =item isprint -This is identical to the C function, except that it can apply to a single -character or to a whole string. Consider using regular expressions and the -C</[[:print:]]/> construct instead. +This is identical to the C function, except that it can apply to +a single character or to a whole string. Note that locale settings +may affect what characters are considered C<isprint>. Does not work +on Unicode characters code point 256 or higher. Consider using regular +expressions and the C</[[:print:]]/> construct instead. =item ispunct -This is identical to the C function, except that it can apply to a single -character or to a whole string. Consider using regular expressions and the -C</[[:punct:]]/> construct instead. +This is identical to the C function, except that it can apply to +a single character or to a whole string. Note that locale settings +may affect what characters are considered C<ispunct>. Does not work +on Unicode characters code point 256 or higher. Consider using regular +expressions and the C</[[:punct:]]/> construct instead. =item isspace -This is identical to the C function, except that it can apply to a single -character or to a whole string. Consider using regular expressions and the -C</[[:space:]]/> construct instead, or the C</\s/> construct. -(Note that C</[[:space:]]/> and C</\s/> are slightly different in that -C</[[:space:]]/> can normally match a vertical tab, while C</\s/> does -not.) +This is identical to the C function, except that it can apply to +a single character or to a whole string. Note that locale settings +may affect what characters are considered C<isspace>. Does not work +on Unicode characters code point 256 or higher. Consider using regular +expressions and the C</[[:space:]]/> construct instead, or the C</\s/> +construct. (Note that C</[[:space:]]/> and C</\s/> are slightly +different in that C</[[:space:]]/> can normally match a vertical tab, +while C</\s/> does not.) =item isupper -This is identical to the C function, except that it can apply to a single -character or to a whole string. 
Consider using regular expressions and the -C</[[:upper:]]/> construct instead. Do B<not> use C</[A-Z]/>. +This is identical to the C function, except that it can apply to +a single character or to a whole string. Note that locale settings +may affect what characters are considered C<isupper>. Does not work +on Unicode characters code point 256 or higher. Consider using regular +expressions and the C</[[:upper:]]/> construct instead. Do B<not> use +C</[A-Z]/>. =item isxdigit This is identical to the C function, except that it can apply to a single -character or to a whole string. Consider using regular expressions and the -C</[[:xdigit:]]/> construct instead, or simply C</[0-9a-f]/i>. +character or to a whole string. Note that locale settings may affect what +characters are considered C<isxdigit> (unlikely, but still possible). +Does not work on Unicode characters code point 256 or higher. +Consider using regular expressions and the C</[[:xdigit:]]/> +construct instead, or simply C</[0-9a-f]/i>. =item kill @@ -1224,12 +1251,23 @@ I.e. January is 0, not 1; Sunday is 0, not 1; January 1st is 0, not 1. The year (C<year>) is given in years since 1900. I.e., the year 1995 is 95; the year 2001 is 101. Consult your system's C<strftime()> manpage for details about these and the other arguments. + If you want your code to be portable, your format (C<fmt>) argument should use only the conversion specifiers defined by the ANSI C -standard. These are C<aAbBcdHIjmMpSUwWxXyYZ%>. -The given arguments are made consistent -as though by calling C<mktime()> before calling your system's -C<strftime()> function, except that the C<isdst> value is not affected. +standard (C89, to play safe). These are C<aAbBcdHIjmMpSUwWxXyYZ%>. +But even then, the B<results> of some of the conversion specifiers are +non-portable. For example, the specifiers C<aAbBcpZ> change according +to the locale settings of the user, and both how to set locales (the +locale names) and what output to expect are non-standard. +The specifier C<c> changes according to the timezone settings of the +user and the timezone computation rules of the operating system. +The C<Z> specifier is notoriously unportable since the names of +timezones are non-standard. 
Sticking to the numeric specifiers is the +safest route. + +The given arguments are made consistent as though by calling +C<mktime()> before calling your system's C<strftime()> function, +except that the C<isdst> value is not affected. The string for Tuesday, December 12, 1995. diff --git a/ext/POSIX/t/is.t b/ext/POSIX/t/is.t index 6aa96f0..9ab851c 100644 --- a/ext/POSIX/t/is.t +++ b/ext/POSIX/t/is.t @@ -10,12 +10,14 @@ BEGIN { } } - use POSIX; use strict ; -$| = 1; +# E.g. \t might or might not be isprint() depending on the locale, +# so let's reset to the default. +setlocale(LC_ALL, 'C') if $Config{d_setlocale}; +$| = 1; # List of characters (and strings) to feed to the is functions. # diff --git a/ext/POSIX/t/posix.t b/ext/POSIX/t/posix.t index 9b0a751..6ce418c 100644 --- a/ext/POSIX/t/posix.t +++ b/ext/POSIX/t/posix.t @@ -11,8 +11,7 @@ BEGIN { } require "./test.pl"; -plan(tests => 66); - +plan(tests => 61); use POSIX qw(fcntl_h signal_h limits_h _exit getcwd open read strftime write errno); @@ -183,26 +182,6 @@ try_strftime("Wed Mar 01 00:00:00 2000 061", 0,0,0, 1,2,100); try_strftime("Fri Mar 31 00:00:00 2000 091", 0,0,0, 31,2,100); &POSIX::setlocale(&POSIX::LC_TIME, $lc) if $Config{d_setlocale}; -SKIP: { - # XXX wait for smokers to see which OSs else to skip - skip("No mktime and/or tm_gmtoff", 5) - if !$Config{d_mktime} || !$Config{d_tm_tm_gmtoff} || !$Config{d_tm_tm_zone}; - local $ENV{TZ} = "Europe/Berlin"; - - # May fail for ancient FreeBSD versions. - # %z is not included in POSIX, but valid on Linux and FreeBSD. - foreach $def ([1000,'Sun Sep 9 03:46:40 2001 +0200 CEST'], - [900, 'Thu Jul 9 18:00:00 1998 +0200 CEST'], - [800, 'Tue May 9 08:13:20 1995 +0200 CEST'], - [700, 'Sat Mar 7 21:26:40 1992 +0100 CET'], - [600, 'Thu Jan 5 11:40:00 1989 +0100 CET'], - ) { - my($t, $expected) = @$def; - my @tm = localtime($t*1000000); - is(strftime("%c %z %Z",@tm), $expected, "validating zone setting: $expected"); - } -} - { for my $test (0, 1) { $! 
= 0; diff --git a/pod/perlre.pod b/pod/perlre.pod index 5e99fd3..85ce658 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -188,6 +188,7 @@ In addition, Perl defines the following: \C Match a single C char (octet) even under Unicode. NOTE: breaks up characters into their UTF-8 bytes, so you may end up with malformed pieces of UTF-8. + Unsupported in lookbehind. A C<\w> matches a single alphanumeric character (an alphabetic character, or a decimal digit) or C<_>, not a whole word. Use C<\w+> diff --git a/pod/perlretut.pod b/pod/perlretut.pod index f0b5d1d..57fc772 100644 --- a/pod/perlretut.pod +++ b/pod/perlretut.pod @@ -1707,7 +1707,7 @@ it matches I<any> byte 0-255. So The last regexp matches, but is dangerous because the string I<character> position is no longer synchronized to the string I<byte> position. This generates the warning 'Malformed UTF-8 -character'. C<\C> is best used for matching the binary data in strings +character'. The C<\C> is best used for matching the binary data in strings with binary data intermixed with Unicode characters. Let us now discuss the rest of the character classes. Just as with @@ -2004,6 +2004,10 @@ They evaluate true if the regexps do I<not> match: $x =~ /foo(?!baz)/; # matches, 'baz' doesn't follow 'foo' $x =~ /(?<!\s)foo/; # matches, there is no \s before 'foo' +The C<\C> is unsupported in lookbehind, because the already +treacherous definition of C<\C> would become even more so +when going backwards. 
+ =head2 Using independent subexpressions to prevent backtracking The last few extended patterns in this tutorial are experimental as of diff --git a/regexec.c b/regexec.c index f69c360..53e4015 100644 --- a/regexec.c +++ b/regexec.c @@ -1882,9 +1882,12 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * goto phooey; } else if ((c = prog->regstclass)) { - if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT) + if (minlen) { + I32 op = (U8)OP(prog->regstclass); /* don't bother with what can't match */ - strend = HOPc(strend, -(minlen - 1)); + if (PL_regkind[op] != EXACT && op != CANY) + strend = HOPc(strend, -(minlen - 1)); + } DEBUG_r({ SV *prop = sv_newmortal(); char *s0; @@ -2269,17 +2272,17 @@ S_regmatch(pTHX_ regnode *prog) regprop(prop, scan); { char *s0 = - do_utf8 ? + do_utf8 && OP(scan) != CANY ? pv_uni_display(dsv0, (U8*)(locinput - pref_len), pref0_len, 60, UNI_DISPLAY_REGEX) : locinput - pref_len; int len0 = do_utf8 ? strlen(s0) : pref0_len; - char *s1 = do_utf8 ? + char *s1 = do_utf8 && OP(scan) != CANY ? pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len), pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) : locinput - pref_len + pref0_len; int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len; - char *s2 = do_utf8 ? + char *s2 = do_utf8 && OP(scan) != CANY ? pv_uni_display(dsv2, (U8*)locinput, PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) : locinput; diff --git a/t/op/pat.t b/t/op/pat.t index 20763e4..62520dd 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -6,7 +6,7 @@ $| = 1; -print "1..942\n"; +print "1..968\n"; BEGIN { chdir 't' if -d 't'; @@ -3006,4 +3006,53 @@ print "\x{072F}" =~ /\P{Syriac1}/ ? 
"ok $test\n" : "not ok $test\n"; $test++; ++$test; } -# last test 942 +{ + print "# [perl #15763]\n"; + + $a = "x\x{100}"; + chop $a; # but leaves the UTF-8 flag + $a .= "y"; # 1 byte before "y" + + ok($a =~ /^\C/, 'match one \C on 1-byte UTF-8'); + ok($a =~ /^\C{1}/, 'match \C{1}'); + + ok($a =~ /^\Cy/, 'match \Cy'); + ok($a =~ /^\C{1}y/, 'match \C{1}y'); + + $a = "\x{100}y"; # 2 bytes before "y" + + ok($a =~ /^\C/, 'match one \C on 2-byte UTF-8'); + ok($a =~ /^\C{1}/, 'match \C{1}'); + ok($a =~ /^\C\C/, 'match two \C'); + ok($a =~ /^\C{2}/, 'match \C{2}'); + + ok($a =~ /^\C\C\C/, 'match three \C on 2-byte UTF-8 and a byte'); + ok($a =~ /^\C{3}/, 'match \C{3}'); + + ok($a =~ /^\C\Cy/, 'match two \C'); + ok($a =~ /^\C{2}y/, 'match \C{2}'); + + ok($a !~ /^\C\C\Cy/, 'not match three \Cy'); + ok($a !~ /^\C{2}\Cy/, 'not match \C{3}y'); + + $a = "\x{1000}y"; # 3 bytes before "y" + + ok($a =~ /^\C/, 'match one \C on three-byte UTF-8'); + ok($a =~ /^\C{1}/, 'match \C{1}'); + ok($a =~ /^\C\C/, 'match two \C'); + ok($a =~ /^\C{2}/, 'match \C{2}'); + ok($a =~ /^\C\C\C/, 'match three \C'); + ok($a =~ /^\C{3}/, 'match \C{3}'); + + ok($a =~ /^\C\C\C\C/, 'match four \C on three-byte UTF-8 and a byte'); + ok($a =~ /^\C{4}/, 'match \C{4}'); + + ok($a =~ /^\C\C\Cy/, 'match three \Cy'); + ok($a =~ /^\C{3}y/, 'match \C{3}y'); + + ok($a !~ /^\C\C\C\C\y/, 'not match four \Cy'); + ok($a !~ /^\C{4}y/, 'not match \C{4}y'); +} + +# last test 968 + diff --git a/t/op/subst.t b/t/op/subst.t index ef0ae0a..797f241 100755 --- a/t/op/subst.t +++ b/t/op/subst.t @@ -7,7 +7,7 @@ BEGIN { } require './test.pl'; -plan( tests => 124 ); +plan( tests => 125 ); $x = 'foo'; $_ = "x"; @@ -494,9 +494,19 @@ SKIP: { $_ = 'aaaa'; $r = 'x'; $s = s/a(?{})/$r/g; -is("<$_> <$s>", "<xxxx> <4>", "perl #7806"); +is("<$_> <$s>", "<xxxx> <4>", "[perl #7806]"); $_ = 'aaaa'; $s = s/a(?{})//g; -is("<$_> <$s>", "<> <4>", "perl #7806"); +is("<$_> <$s>", "<> <4>", "[perl #7806]"); +# [perl #19048] Coredump in silly replacement +{ + local 
$^W = 0; + $_="abcdef\n"; + s!.!!eg; + is($_, "\n", "[perl #19048]"); +} + + +