matches one, zero, any alphabetic character, and the percentage sign.
The exact meanings of the above classes depend on many things:
-if the C<utf8> pragma is used, the following equivalenced to Unicode
+if the C<utf8> pragma is used, the following equivalences to Unicode
\p{} constructs hold:
alpha IsAlpha
If the C<utf8> pragma is not used but the C<locale> pragma is, the
classes correlate with the isalpha(3) interface (except for `word',
-which is a Perl extension).
+which is a Perl extension, mirroring \w).
The assumedly non-obviously named classes are:
Any control character. Usually characters that don't produce
output as such but instead control the terminal somehow:
for example newline and backspace are control characters.
+ All characters with ord() less than 32 are most often
+ classified as control characters.
=item graph
You can negate the [::] character classes by prefixing the class name
with a '^'. This is a Perl extension. For example:
- ^digit \D \P{IsDigit}
- ^space \S \P{IsSpace}
- ^word \W \P{IsWord}
+ POSIX trad. Perl utf8 Perl
+
+ [:^digit:] \D \P{IsDigit}
+ [:^space:] \S \P{IsSpace}
+ [:^word:] \W \P{IsWord}
The POSIX character classes [.cc.] and [=cc=] are B<not> supported
and trying to use them will cause an error.
#define OOB_CHAR8 1234
#define OOB_UTF8 123456
+#define OOB_NAMEDCLASS -1
#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
}
break;
}
- if ((namedclass == -1 ||
+ if ((namedclass == OOB_NAMEDCLASS ||
!(posixcc + skip + 2 < PL_regxend &&
(posixcc[skip] == ':' &&
posixcc[skip + 1] == ']'))))
*PL_regcomp_parse == '=' ||
*PL_regcomp_parse == '.')) {
char *s = PL_regcomp_parse;
- char c = *s++;
+ char c = *s++;
while(*s && isALNUM(*s))
s++;
goto skipcond; /* allow 1st char to be ] or - */
while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') {
skipcond:
- namedclass = -1;
+ namedclass = OOB_NAMEDCLASS;
value = UCHARAT(PL_regcomp_parse++);
if (value == '[')
namedclass = regpposixcc(value);
break;
}
}
- if (!SIZE_ONLY && namedclass > -1) {
+ if (!SIZE_ONLY && namedclass > OOB_NAMEDCLASS) {
+ if (range)
+ FAIL("invalid [] range in regexp"); /* [a-\w], [a-[:word:]] */
switch (namedclass) {
case ANYOF_ALNUM:
if (LOC)
}
if (LOC)
ANYOF_FLAGS(opnd) |= ANYOF_CLASS;
- lastvalue = OOB_CHAR8;
+ continue;
}
- else
if (range) {
if (lastvalue > value)
- FAIL("invalid [] range in regexp");
+ FAIL("invalid [] range in regexp"); /* [b-a] */
range = 0;
}
else {
lastvalue = value;
if (*PL_regcomp_parse == '-' && PL_regcomp_parse+1 < PL_regxend &&
- PL_regcomp_parse[1] != ']') {
+ PL_regcomp_parse[1] != ']') {
+ if (namedclass > OOB_NAMEDCLASS)
+ FAIL("invalid [] range in regexp"); /* [\w-a] */
PL_regcomp_parse++;
range = 1;
continue; /* do it next time */
}
}
+ /* now is the next time */
if (!SIZE_ONLY) {
-#ifndef ASCIIish
+#ifndef ASCIIish /* EBCDIC, for example. */
if ((isLOWER(lastvalue) && isLOWER(value)) ||
(isUPPER(lastvalue) && isUPPER(value)))
{
for ( ; lastvalue <= value; lastvalue++)
ANYOF_BITMAP_SET(opnd, lastvalue);
}
- lastvalue = value;
+ range = 0;
}
/* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
if (!SIZE_ONLY &&
while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') {
skipcond:
- namedclass = -1;
+ namedclass = OOB_NAMEDCLASS;
value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen);
PL_regcomp_parse += numlen;
break;
}
}
- if (!SIZE_ONLY && namedclass > -1) {
+ if (!SIZE_ONLY && namedclass > OOB_NAMEDCLASS) {
+ if (range)
+ FAIL("invalid [] range in regexp"); /* [a-\w], [a-[:word:]] */
switch (namedclass) {
case ANYOF_ALNUM:
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); break;
case ANYOF_NXDIGIT:
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n"); break;
}
+ continue;
}
- else
if (range) {
if (lastvalue > value)
- FAIL("invalid [] range in regexp");
+ FAIL("invalid [] range in regexp"); /* [b-a] */
#ifdef UV_IS_QUAD
if (!SIZE_ONLY)
Perl_sv_catpvf(aTHX_ listsv, "%04" PERL_PRIx64 "\t%04" PERL_PRIx64 "\n", (UV)lastvalue, (UV)value);
if (!SIZE_ONLY)
Perl_sv_catpvf(aTHX_ listsv, "%04x\t%04x\n", lastvalue, value);
#endif
- lastvalue = value;
range = 0;
}
else {
lastvalue = value;
if (*PL_regcomp_parse == '-' && PL_regcomp_parse+1 < PL_regxend &&
- PL_regcomp_parse[1] != ']') {
+ PL_regcomp_parse[1] != ']') {
+ if (namedclass > OOB_NAMEDCLASS)
+ FAIL("invalid [] range in regexp"); /* [\w-a] */
PL_regcomp_parse++;
range = 1;
continue; /* do it next time */
}
+ }
+ /* now is the next time */
#ifdef UV_IS_QUAD
- if (!SIZE_ONLY)
- Perl_sv_catpvf(aTHX_ listsv, "%04" PERL_PRIx64 "\n", (UV)value);
+ if (!SIZE_ONLY)
+ Perl_sv_catpvf(aTHX_ listsv, "%04" PERL_PRIx64 "\n", (UV)value);
#else
- if (!SIZE_ONLY)
- Perl_sv_catpvf(aTHX_ listsv, "%04x\n", value);
+ if (!SIZE_ONLY)
+ Perl_sv_catpvf(aTHX_ listsv, "%04x\n", value);
#endif
- }
+ range = 0;
}
ret = reganode(ANYOFUTF8, 0);