From: Jarkko Hietaniemi Date: Tue, 12 Oct 1999 15:30:05 +0000 (+0000) Subject: Revert the parts of #3926 that outlawed character ranges X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=1209ba901e0b2880eea69ad70613848af5543517;p=p5sagit%2Fp5-mst-13.2.git Revert the parts of #3926 that outlawed character ranges that have character classes such as \w as either endpoint. This change re-establishes the old behavior which meant that such ranges weren't really ranges, the "-" was literal. Moreover, this change also fixes the old behavior to be more consistent: [\w-.] and [\s-\w] worked, but [.-\w] didn't. Now they all do work as described above. The #3926 outlawed all of those. p4raw-id: //depot/cfgperl@4355 --- diff --git a/pod/perldiag.pod b/pod/perldiag.pod index d0f1be8..fb5c7e6 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -1681,8 +1681,7 @@ by Perl or by a user-supplied handler. See L<attributes>. =item invalid [] range in regexp (F) The range specified in a character class had a minimum character -greater than the maximum character, or the range didn't start/end with -a literal character. See L<perlre>. +greater than the maximum character. See L<perlre>. =item Invalid conversion in %s: "%s" diff --git a/pod/perlre.pod b/pod/perlre.pod index 9a06305..1610254 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -185,8 +185,9 @@ Use C<\w+> to match a string of Perl-identifier characters (which isn't the same as matching an English word). If C<use locale> is in effect, the list of alphabetic characters generated by C<\w> is taken from the current locale. See L<perllocale>. You may use C<\w>, C<\W>, C<\s>, C<\S>, -C<\d>, and C<\D> within character classes (though not as either end of -a range). See L<utf8> for details about C<\pP>, C<\PP>, and C<\X>. +C<\d>, and C<\D> within character classes, but if you try to use them +as endpoints of a range, that's not a range, the "-" is understood literally. +See L<utf8> for details about C<\pP>, C<\PP>, and C<\X>.
The POSIX character class syntax @@ -940,6 +941,9 @@ at the start or end of the list, or escape it with a backslash. (The following all specify the same class of three characters: C<[-az]>, C<[az-]>, and C<[a\-z]>. All are different from C<[a-z]>, which specifies a class containing twenty-six characters.) +Also, if you try to use the character classes C<\w>, C<\W>, C<\s>, +C<\S>, C<\d>, or C<\D> as endpoints of a range, that's not a range, +the "-" is understood literally. Note also that the whole range idea is rather unportable between character sets--and even within character sets they may cause results diff --git a/regcomp.c b/regcomp.c index 99423e1..02dca51 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2364,8 +2364,10 @@ S_regclass(pTHX) } } if (!SIZE_ONLY && namedclass > OOB_NAMEDCLASS) { - if (range) - FAIL("invalid [] range in regexp"); /* [a-\w], [a-[:word:]] */ + if (range) { + ANYOF_BITMAP_SET(opnd, lastvalue); + ANYOF_BITMAP_SET(opnd, '-'); + } switch (namedclass) { case ANYOF_ALNUM: if (LOC) @@ -2608,6 +2610,8 @@ S_regclass(pTHX) ANYOF_FLAGS(opnd) |= ANYOF_CLASS; continue; } + if (range && namedclass > OOB_NAMEDCLASS) + range = 0; /* [a-\d], [a-[:digit:]], not a true range. 
*/ if (range) { if (lastvalue > value) FAIL("invalid [] range in regexp"); /* [b-a] */ @@ -2617,8 +2621,6 @@ S_regclass(pTHX) lastvalue = value; if (*PL_regcomp_parse == '-' && PL_regcomp_parse+1 < PL_regxend && PL_regcomp_parse[1] != ']') { - if (namedclass > OOB_NAMEDCLASS) - FAIL("invalid [] range in regexp"); /* [\w-a] */ PL_regcomp_parse++; range = 1; continue; /* do it next time */ @@ -2777,9 +2779,10 @@ S_regclassutf8(pTHX) } } if (!SIZE_ONLY && namedclass > OOB_NAMEDCLASS) { - if (range) - FAIL("invalid [] range in regexp"); /* [a-\w], [a-[:word:]] */ - switch (namedclass) { + if (range) /* [a-\d], [a-[:digit:]] */ + Perl_sv_catpvf(aTHX_ listsv, /* 0x002D is Unicode for '-' */ + "%04"UVxf"\n%002D\n", (UV)lastvalue); + switch (namedclass) { case ANYOF_ALNUM: Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); break; case ANYOF_NALNUM: @@ -2835,6 +2838,8 @@ S_regclassutf8(pTHX) } continue; } + if (range && namedclass > OOB_NAMEDCLASS) + range = 0; /* [a-\d], [a-[:digit:]], not a true range. 
*/ if (range) { if (lastvalue > value) FAIL("invalid [] range in regexp"); /* [b-a] */ @@ -2846,8 +2851,6 @@ S_regclassutf8(pTHX) lastvalue = value; if (*PL_regcomp_parse == '-' && PL_regcomp_parse+1 < PL_regxend && PL_regcomp_parse[1] != ']') { - if (namedclass > OOB_NAMEDCLASS) - FAIL("invalid [] range in regexp"); /* [\w-a] */ PL_regcomp_parse++; range = 1; continue; /* do it next time */ diff --git a/t/op/re_tests b/t/op/re_tests index 695672d..974bec5 100644 --- a/t/op/re_tests +++ b/t/op/re_tests @@ -735,8 +735,10 @@ foo.bart foo.bart y - - .[X](.+)+[X][X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - - .[X][X](.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - - tt+$ xxxtt y - - -[a-\w] - c - /[a-\w]/: invalid [] range in regexp -[\w-z] - c - /[\w-z]/: invalid [] range in regexp -[0-[:digit:]] - c - /[0-[:digit:]]/: invalid [] range in regexp -[[:digit:]-9] - c - /[[:digit:]-9]/: invalid [] range in regexp +([a-\d]+) za-9z y $1 a-9 +([\d-\s]+) a0- z y $1 0- +([\d-z]+) a0-za y $1 0-z +([a-[:digit:]]+) za-9z y $1 a-9 +([[:digit:]-[:alpha:]]+) =0-z= y $1 0-z +([[:digit:]-z]+) =0-z= y $1 0-z \GX.*X aaaXbX n - -