From: Jarkko Hietaniemi Date: Thu, 5 Aug 1999 17:25:19 +0000 (+0000) Subject: Fix regex charclass parsing so that bogus ranges X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=9373385978348b5aad2d5fb3ea78b7db27600a69;p=p5sagit%2Fp5-mst-13.2.git Fix regex charclass parsing so that bogus ranges like [0-\d] and [[:word:]-z] are no more allowed. The anomaly was noticed by Guy Decoux. p4raw-id: //depot/cfgperl@3926 --- diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 0ff865d..8e686ba 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -1595,7 +1595,8 @@ rebuild Perl. =item invalid [] range in regexp (F) The range specified in a character class had a minimum character -greater than the maximum character. See L. +greater than the maximum character, or the range didn't start/end with +a literal character. See L. =item Invalid conversion in %s: "%s" diff --git a/pod/perlre.pod b/pod/perlre.pod index 85b2a94..a1a118f 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -217,7 +217,7 @@ character class. For example: matches one, zero, any alphabetic character, and the percentage sign. The exact meanings of the above classes depend from many things: -if the C pragma is used, the following equivalenced to Unicode +if the C pragma is used, the following equivalences to Unicode \p{} constructs hold: alpha IsAlpha @@ -238,7 +238,7 @@ For example, [:lower:] and \p{IsLower} are equivalent. If the C pragma is not used but the C pragma is, the classes correlate with the isalpha(3) interface (except for `word', -which is a Perl extension). +which is a Perl extension, mirroring \w). The assumedly non-obviously named classes are: @@ -249,6 +249,8 @@ The assumedly non-obviously named classes are: Any control character. Usually characters that don't produce output as such but instead control the terminal somehow: for example newline and backspace are control characters. + All characters with ord() less than 32 are most often control + classified as characters. =item graph @@ -275,9 +277,11 @@ The assumedly non-obviously named classes are: You can negate the [::] character classes by prefixing the class name with a '^'. This is a Perl extension. For example: - ^digit \D \P{IsDigit} - ^space \S \P{IsSpace} - ^word \W \P{IsWord} + POSIX trad. Perl utf8 Perl + + [:^digit:] \D \P{IsDigit} + [:^space:] \S \P{IsSpace} + [:^word:] \W \P{IsWord} The POSIX character classes [.cc.] and [=cc=] are B supported and trying to use them will cause an error. diff --git a/regcomp.c b/regcomp.c index df2fc0c..9846022 100644 --- a/regcomp.c +++ b/regcomp.c @@ -192,6 +192,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define OOB_CHAR8 1234 #define OOB_UTF8 123456 +#define OOB_NAMEDCLASS -1 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) @@ -2246,7 +2247,7 @@ S_regpposixcc(pTHX_ I32 value) } break; } - if ((namedclass == -1 || + if ((namedclass == OOB_NAMEDCLASS || !(posixcc + skip + 2 < PL_regxend && (posixcc[skip] == ':' && posixcc[skip + 1] == ']')))) @@ -2275,7 +2276,7 @@ S_checkposixcc(pTHX) *PL_regcomp_parse == '=' || *PL_regcomp_parse == '.')) { char *s = PL_regcomp_parse; - char c = *s++; + char c = *s++; while(*s && isALNUM(*s)) s++; @@ -2329,7 +2330,7 @@ S_regclass(pTHX) goto skipcond; /* allow 1st char to be ] or - */ while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') { skipcond: - namedclass = -1; + namedclass = OOB_NAMEDCLASS; value = UCHARAT(PL_regcomp_parse++); if (value == '[') namedclass = regpposixcc(value); @@ -2364,7 +2365,9 @@ S_regclass(pTHX) break; } } - if (!SIZE_ONLY && namedclass > -1) { + if (!SIZE_ONLY && namedclass > OOB_NAMEDCLASS) { + if (range) + FAIL("invalid [] range in regexp"); /* [a-\w], [a-[:word:]] */ switch (namedclass) { case ANYOF_ALNUM: if (LOC) @@ -2605,25 +2608,27 @@ S_regclass(pTHX) } if (LOC) ANYOF_FLAGS(opnd) |= ANYOF_CLASS; - lastvalue = OOB_CHAR8; + continue; } - else if (range) { if (lastvalue > value) - FAIL("invalid [] range in regexp"); + FAIL("invalid [] range in regexp"); /* [b-a] */ range = 0; } else { lastvalue = value; if (*PL_regcomp_parse == '-' && PL_regcomp_parse+1 < PL_regxend && - PL_regcomp_parse[1] != ']') { + PL_regcomp_parse[1] != ']') { + if (namedclass > OOB_NAMEDCLASS) + FAIL("invalid [] range in regexp"); /* [\w-a] */ PL_regcomp_parse++; range = 1; continue; /* do it next time */ } } + /* now is the next time */ if (!SIZE_ONLY) { -#ifndef ASCIIish +#ifndef ASCIIish /* EBCDIC, for example. */ if ((isLOWER(lastvalue) && isLOWER(value)) || (isUPPER(lastvalue) && isUPPER(value))) { @@ -2643,7 +2648,7 @@ S_regclass(pTHX) for ( ; lastvalue <= value; lastvalue++) ANYOF_BITMAP_SET(opnd, lastvalue); } - lastvalue = value; + range = 0; } /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */ if (!SIZE_ONLY && @@ -2701,7 +2706,7 @@ S_regclassutf8(pTHX) while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') { skipcond: - namedclass = -1; + namedclass = OOB_NAMEDCLASS; value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen); PL_regcomp_parse += numlen; @@ -2773,7 +2778,9 @@ S_regclassutf8(pTHX) break; } } - if (!SIZE_ONLY && namedclass > -1) { + if (!SIZE_ONLY && namedclass > OOB_NAMEDCLASS) { + if (range) + FAIL("invalid [] range in regexp"); /* [a-\w], [a-[:word:]] */ switch (namedclass) { case ANYOF_ALNUM: Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); break; @@ -2828,11 +2835,11 @@ S_regclassutf8(pTHX) case ANYOF_NXDIGIT: Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n"); break; } + continue; } - else if (range) { if (lastvalue > value) - FAIL("invalid [] range in regexp"); + FAIL("invalid [] range in regexp"); /* [b-a] */ #ifdef UV_IS_QUAD if (!SIZE_ONLY) Perl_sv_catpvf(aTHX_ listsv, "%04" PERL_PRIx64 "\t%04" PERL_PRIx64 "\n", (UV)lastvalue, (UV)value); @@ -2840,25 +2847,28 @@ S_regclassutf8(pTHX) if (!SIZE_ONLY) Perl_sv_catpvf(aTHX_ listsv, "%04x\t%04x\n", lastvalue, value); #endif - lastvalue = value; range = 0; } else { lastvalue = value; if (*PL_regcomp_parse == '-' && PL_regcomp_parse+1 < PL_regxend && - PL_regcomp_parse[1] != ']') { + PL_regcomp_parse[1] != ']') { + if (namedclass > OOB_NAMEDCLASS) + FAIL("invalid [] range in regexp"); /* [\w-a] */ PL_regcomp_parse++; range = 1; continue; /* do it next time */ } + } + /* now is the next time */ #ifdef UV_IS_QUAD - if (!SIZE_ONLY) - Perl_sv_catpvf(aTHX_ listsv, "%04" PERL_PRIx64 "\n", (UV)value); + if (!SIZE_ONLY) + Perl_sv_catpvf(aTHX_ listsv, "%04" PERL_PRIx64 "\n", (UV)value); #else - if (!SIZE_ONLY) - Perl_sv_catpvf(aTHX_ listsv, "%04x\n", value); + if (!SIZE_ONLY) + Perl_sv_catpvf(aTHX_ listsv, "%04x\n", value); #endif - } + range = 0; } ret = reganode(ANYOFUTF8, 0); diff --git a/t/op/re_tests b/t/op/re_tests index 899b35e..b35e964 100644 --- a/t/op/re_tests +++ b/t/op/re_tests @@ -735,3 +735,7 @@ foo.bart foo.bart y - - .[X](.+)+[X][X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - - .[X][X](.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa n - - tt+$ xxxtt y - - +[a-\w] - c - /[a-\w]/: invalid [] range in regexp +[\w-z] - c - /[\w-z]/: invalid [] range in regexp +[0-[:digit:]] - c - /[0-[:digit:]]/: invalid [] range in regexp +[[:digit:]-9] - c - /[[:digit:]-9]/: invalid [] range in regexp