regcomp.h Private declarations for above
regcomp.pl Builder of regnodes.h
regcomp.sym Data for regnodes.h
+regcharclass.h Match various character classes efficiently
regen_lib.pl Common file routines for generator scripts
regen_perly.pl generate perly.{act,h,tab} from perly.y
regen.pl Run all scripts that (re)generate files
Table->New(Is => 'Word', Desc => "[[:Word:]]", Fuzzy => 0);
$Cat{SpacePerl} =
Table->New(Is => 'SpacePerl', Desc => '\s', Fuzzy => 0);
-
+ $Cat{VertSpace} =
+ Table->New(Is => 'VertSpace', Desc => '\v', Fuzzy => 0);
+ $Cat{HorizSpace} =
+ Table->New(Is => 'HorizSpace', Desc => '\h', Fuzzy => 0);
my %To;
$To{Upper} = Table->New();
$To{Lower} = Table->New();
$Cat{SpacePerl}->$op($code) if $isspace
&& $code != 0x000B; # Backward compat.
+ $Cat{VertSpace}->$op($code) if grep {$code == $_}
+ ( 0x0A..0x0D,0x85,0x2028,0x2029 );
+
+ $Cat{HorizSpace}->$op($code) if grep {$code == $_} (
+ 0x09, 0x20, 0xa0, 0x1680, 0x180e, 0x2000, 0x2001, 0x2002,
+ 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200a,
+ 0x202f, 0x205f, 0x3000
+ );
+
$Cat{Blank}->$op($code) if $isspace
&& !($code == 0x000A ||
$code == 0x000B ||
while escaping will cause the literal string C<\$> to be matched.
You'll need to write something like C<m/\Quser\E\@\Qhost/>.
-=head3 Character classes
+=head3 Character Classes and other Special Escapes
In addition, Perl defines the following:
X<\w> X<\W> X<\s> X<\S> X<\d> X<\D> X<\X> X<\p> X<\P> X<\C>
\x12 Hexadecimal escape sequence
\x{1234} Long hexadecimal escape sequence
\K Keep the stuff left of the \K, don't include it in $&
- \v Shortcut for (*PRUNE)
- \V Shortcut for (*SKIP)
+ \v Vertical whitespace
+ \V Not vertical whitespace
+ \h Horizontal whitespace
+ \H Not horizontal whitespace
+ \R Linebreak (not special inside a charclass; use \v there)
A C<\w> matches a single alphanumeric character (an alphabetic
character, or a decimal digit) or C<_>, not a whole word. Use C<\w+>
in general.
X<\w> X<\W> X<word>
+C<\R> will atomically match a linebreak, including the network line-ending
+"\x0D\x0A". Specifically, C<\R> is exactly equivalent to
+
+ (?>\x0D\x0A?|[\x0A-\x0C\x85\x{2028}\x{2029}])
+
+B<Note:> C<\R> has no special meaning inside of a character class;
+use C<\v> instead (vertical whitespace).
+X<\R>
+
The POSIX character class syntax
X<character class>
--- /dev/null
+/****** WARNING WARNING WARNING ********/
+/* */
+/* Autogenerated code, do not modify! */
+/* */
+/****** WARNING WARNING WARNING ********/
+
+/* LNBREAK Line Break: \j \J
+ Codepoints: 0x0A, 0x0B, 0x0C, 0x0D, 0x0D.0x0A, 0x85, 0x2028, 0x2029
+ */
+#define is_LNBREAK(s,is_utf8) /*** Line Break: \j \J ***/ \
+( ( ((U8*)s)[0]==13 ) ? \
+ ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) : \
+ ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) ? 1 : \
+( (is_utf8) ? \
+ ( ( ((U8*)s)[0]==194 ) ? \
+ ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \
+ (((( ((U8*)s)[0]==226 ) && ( ((U8*)s)[1]==128 )) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) ) :\
+ ( ((U8*)s)[0]==133 ) ) ) )
+
+#define is_LNBREAK_safe(s,e,is_utf8) /*** Line Break: \j \J ***/ \
+( ( (e) - (s) > 2 ) ? \
+ ( ( ((U8*)s)[0]==13 ) ? \
+ ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) : \
+ ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) ? 1 : \
+( (is_utf8) ? \
+ ( ( ((U8*)s)[0]==194 ) ? \
+ ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \
+ (((( ((U8*)s)[0]==226 ) && ( ((U8*)s)[1]==128 )) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) ) :\
+ ( ((U8*)s)[0]==133 ) ) ) ) : \
+( ( (e) - (s) > 1 ) ? \
+ ( ( ((U8*)s)[0]==13 ) ? \
+ ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) : \
+ ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) ? 1 : \
+( (is_utf8) ? \
+ ((( ((U8*)s)[0]==194 ) && ( ((U8*)s)[1]==133 )) ? 2 : 0) : \
+ ( ((U8*)s)[0]==133 ) ) ) ) : \
+( ( (e) - (s) > 0 ) ? \
+ ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ? 1 : \
+( (!is_utf8) ? \
+ ( ((U8*)s)[0]==133 ) : 0 ) ) : 0 ) ) )
+
+#define is_LNBREAK_utf8(s) /*** Line Break: \j \J ***/ \
+( ( ((U8*)s)[0]==194 ) ? \
+ ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \
+ ( ( ((U8*)s)[0]==226 ) ? \
+ ((( ((U8*)s)[1]==128 ) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) :\
+ ( ( ((U8*)s)[0]==13 ) ? \
+ ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) : \
+ (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) ) ) )
+
+#define is_LNBREAK_utf8_safe(s,e) /*** Line Break: \j \J ***/ \
+( ( (e) - (s) > 2 ) ? \
+ ( ( ((U8*)s)[0]==194 ) ? \
+ ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \
+ ( ( ((U8*)s)[0]==226 ) ? \
+ ((( ((U8*)s)[1]==128 ) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) :\
+ ( ( ((U8*)s)[0]==13 ) ? \
+ ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) : \
+ (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) ) ) ) : \
+( ( (e) - (s) > 1 ) ? \
+ ( ( ((U8*)s)[0]==194 ) ? \
+ ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \
+ ( ( ((U8*)s)[0]==13 ) ? \
+ ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) : \
+ (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) ) ) : \
+( ( (e) - (s) > 0 ) ? \
+ (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) : 0 ) ) )
+
+#define is_LNBREAK_latin1(s) /*** Line Break: \j \J ***/ \
+( ( ((U8*)s)[0]==13 ) ? \
+ ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) : \
+ ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) || ((U8*)s)[0]==133 ) )
+
+#define is_LNBREAK_latin1_safe(s,e) /*** Line Break: \j \J ***/ \
+( ( (e) - (s) > 1 ) ? \
+ ( ( ((U8*)s)[0]==13 ) ? \
+ ( ( ((U8*)s)[1]==10 ) ? 2 : 1 ) : \
+ ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=12) || ((U8*)s)[0]==133 ) ) : \
+( ( (e) - (s) > 0 ) ? \
+ ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) || ((U8*)s)[0]==133 ) : 0 ) )
+
+#define is_LNBREAK_cp(cp) /*** Line Break: \j \J ***/ \
+( (10<=cp && cp<=13) || ( cp > 13 && ( cp==133 || ( cp > 133 && ( cp==8232 || ( cp > 8232 && cp==8233 ) ) ) ) ) )
+
+/****** WARNING WARNING WARNING ********/
+/* */
+/* Autogenerated code, do not modify! */
+/* */
+/****** WARNING WARNING WARNING ********/
+
+/* HORIZWS Horizontal Whitespace: \h \H
+ Codepoints: 0x09, 0x20, 0xA0, 0x1680, 0x180E, 0x2000, 0x2001, 0x2002,
+ 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202F,
+ 0x205F, 0x3000
+ */
+#define is_HORIZWS(s,is_utf8) /*** Horizontal Whitespace: \h \H ***/ \
+( ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ? 1 : \
+( (is_utf8) ? \
+ ( ( ((U8*)s)[0]==194 ) ? \
+ ( ( ((U8*)s)[1]==160 ) ? 2 : 0 ) : \
+ ( ( ((U8*)s)[0]==225 ) ? \
+ ( ( ((U8*)s)[1]==160 ) ? \
+ ( ( ((U8*)s)[2]==142 ) ? 3 : 0 ) : \
+ ((( ((U8*)s)[1]==154 ) && ( ((U8*)s)[2]==128 )) ? 3 : 0) ) : \
+ ( ( ((U8*)s)[0]==226 ) ? \
+ ( ( ((U8*)s)[1]==129 ) ? \
+ ( ( ((U8*)s)[2]==159 ) ? 3 : 0 ) : \
+ ((( ((U8*)s)[1]==128 ) && ( (128<=((U8*)s)[2] && ((U8*)s)[2]<=138) || ((U8*)s)[2]==175 )) ? 3 : 0) ) :\
+ (((( ((U8*)s)[0]==227 ) && ( ((U8*)s)[1]==128 )) && ( ((U8*)s)[2]==128 )) ? 3 : 0) ) ) ) :\
+ ( ((U8*)s)[0]==160 ) ) )
+
+#define is_HORIZWS_safe(s,e,is_utf8) /*** Horizontal Whitespace: \h \H ***/ \
+( ( (e) - (s) > 2 ) ? \
+ ( ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ? 1 : \
+( (is_utf8) ? \
+ ( ( ((U8*)s)[0]==194 ) ? \
+ ( ( ((U8*)s)[1]==160 ) ? 2 : 0 ) : \
+ ( ( ((U8*)s)[0]==225 ) ? \
+ ( ( ((U8*)s)[1]==160 ) ? \
+ ( ( ((U8*)s)[2]==142 ) ? 3 : 0 ) : \
+ ((( ((U8*)s)[1]==154 ) && ( ((U8*)s)[2]==128 )) ? 3 : 0) ) : \
+ ( ( ((U8*)s)[0]==226 ) ? \
+ ( ( ((U8*)s)[1]==129 ) ? \
+ ( ( ((U8*)s)[2]==159 ) ? 3 : 0 ) : \
+ ((( ((U8*)s)[1]==128 ) && ( (128<=((U8*)s)[2] && ((U8*)s)[2]<=138) || ((U8*)s)[2]==175 )) ? 3 : 0) ) :\
+ (((( ((U8*)s)[0]==227 ) && ( ((U8*)s)[1]==128 )) && ( ((U8*)s)[2]==128 )) ? 3 : 0) ) ) ) :\
+ ( ((U8*)s)[0]==160 ) ) ) : \
+( ( (e) - (s) > 1 ) ? \
+ ( ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ? 1 : \
+( (is_utf8) ? \
+ ((( ((U8*)s)[0]==194 ) && ( ((U8*)s)[1]==160 )) ? 2 : 0) : \
+ ( ((U8*)s)[0]==160 ) ) ) : \
+( ( (e) - (s) > 0 ) ? \
+ ( ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ? 1 : \
+( (!is_utf8) ? \
+ ( ((U8*)s)[0]==160 ) : 0 ) ) : 0 ) ) )
+
+#define is_HORIZWS_utf8(s) /*** Horizontal Whitespace: \h \H ***/ \
+( ( ((U8*)s)[0]==194 ) ? \
+ ( ( ((U8*)s)[1]==160 ) ? 2 : 0 ) : \
+ ( ( ((U8*)s)[0]==225 ) ? \
+ ( ( ((U8*)s)[1]==160 ) ? \
+ ( ( ((U8*)s)[2]==142 ) ? 3 : 0 ) : \
+ ((( ((U8*)s)[1]==154 ) && ( ((U8*)s)[2]==128 )) ? 3 : 0) ) : \
+ ( ( ((U8*)s)[0]==226 ) ? \
+ ( ( ((U8*)s)[1]==129 ) ? \
+ ( ( ((U8*)s)[2]==159 ) ? 3 : 0 ) : \
+ ((( ((U8*)s)[1]==128 ) && ( (128<=((U8*)s)[2] && ((U8*)s)[2]<=138) || ((U8*)s)[2]==175 )) ? 3 : 0) ) :\
+ ( ( ((U8*)s)[0]==227 ) ? \
+ ((( ((U8*)s)[1]==128 ) && ( ((U8*)s)[2]==128 )) ? 3 : 0) : \
+ ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ) ) ) )
+
+#define is_HORIZWS_utf8_safe(s,e) /*** Horizontal Whitespace: \h \H ***/ \
+( ( (e) - (s) > 2 ) ? \
+ ( ( ((U8*)s)[0]==194 ) ? \
+ ( ( ((U8*)s)[1]==160 ) ? 2 : 0 ) : \
+ ( ( ((U8*)s)[0]==225 ) ? \
+ ( ( ((U8*)s)[1]==160 ) ? \
+ ( ( ((U8*)s)[2]==142 ) ? 3 : 0 ) : \
+ ((( ((U8*)s)[1]==154 ) && ( ((U8*)s)[2]==128 )) ? 3 : 0) ) : \
+ ( ( ((U8*)s)[0]==226 ) ? \
+ ( ( ((U8*)s)[1]==129 ) ? \
+ ( ( ((U8*)s)[2]==159 ) ? 3 : 0 ) : \
+ ((( ((U8*)s)[1]==128 ) && ( (128<=((U8*)s)[2] && ((U8*)s)[2]<=138) || ((U8*)s)[2]==175 )) ? 3 : 0) ) :\
+ ( ( ((U8*)s)[0]==227 ) ? \
+ ((( ((U8*)s)[1]==128 ) && ( ((U8*)s)[2]==128 )) ? 3 : 0) : \
+ ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ) ) ) ) : \
+( ( (e) - (s) > 1 ) ? \
+ ( ( ((U8*)s)[0]==194 ) ? \
+ ( ( ((U8*)s)[1]==160 ) ? 2 : 0 ) : \
+ ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) ) : \
+( ( (e) - (s) > 0 ) ? \
+ ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 ) : 0 ) ) )
+
+#define is_HORIZWS_latin1(s) /*** Horizontal Whitespace: \h \H ***/ \
+( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 || ((U8*)s)[0]==160 )
+
+#define is_HORIZWS_latin1_safe(s,e) /*** Horizontal Whitespace: \h \H ***/ \
+( ( (e) - (s) > 0 ) ? \
+ ( ((U8*)s)[0]==9 || ((U8*)s)[0]==32 || ((U8*)s)[0]==160 ) : 0 )
+
+#define is_HORIZWS_cp(cp) /*** Horizontal Whitespace: \h \H ***/ \
+( cp==9 || ( cp > 9 && ( cp==32 || ( cp > 32 && ( cp==160 || ( cp > 160 && ( cp==5760 || ( cp > 5760 && ( cp==6158 || ( cp > 6158 && ( (8192<=cp && cp<=8202) || ( cp > 8202 && ( cp==8239 || ( cp > 8239 && ( cp==8287 || ( cp > 8287 && cp==12288 ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) )
+
+/****** WARNING WARNING WARNING ********/
+/* */
+/* Autogenerated code, do not modify! */
+/* */
+/****** WARNING WARNING WARNING ********/
+
+/* VERTWS Vertical Whitespace: \v \V
+ Codepoints: 0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0x2028, 0x2029
+ */
+#define is_VERTWS(s,is_utf8) /*** Vertical Whitespace: \v \V ***/ \
+( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ? 1 : \
+( (is_utf8) ? \
+ ( ( ((U8*)s)[0]==194 ) ? \
+ ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \
+ (((( ((U8*)s)[0]==226 ) && ( ((U8*)s)[1]==128 )) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) ) :\
+ ( ((U8*)s)[0]==133 ) ) )
+
+#define is_VERTWS_safe(s,e,is_utf8) /*** Vertical Whitespace: \v \V ***/ \
+( ( (e) - (s) > 2 ) ? \
+ ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ? 1 : \
+( (is_utf8) ? \
+ ( ( ((U8*)s)[0]==194 ) ? \
+ ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \
+ (((( ((U8*)s)[0]==226 ) && ( ((U8*)s)[1]==128 )) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) ) :\
+ ( ((U8*)s)[0]==133 ) ) ) : \
+( ( (e) - (s) > 1 ) ? \
+ ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ? 1 : \
+( (is_utf8) ? \
+ ((( ((U8*)s)[0]==194 ) && ( ((U8*)s)[1]==133 )) ? 2 : 0) : \
+ ( ((U8*)s)[0]==133 ) ) ) : \
+( ( (e) - (s) > 0 ) ? \
+ ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ? 1 : \
+( (!is_utf8) ? \
+ ( ((U8*)s)[0]==133 ) : 0 ) ) : 0 ) ) )
+
+#define is_VERTWS_utf8(s) /*** Vertical Whitespace: \v \V ***/ \
+( ( ((U8*)s)[0]==194 ) ? \
+ ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \
+ ( ( ((U8*)s)[0]==226 ) ? \
+ ((( ((U8*)s)[1]==128 ) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) :\
+ (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ) )
+
+#define is_VERTWS_utf8_safe(s,e) /*** Vertical Whitespace: \v \V ***/ \
+( ( (e) - (s) > 2 ) ? \
+ ( ( ((U8*)s)[0]==194 ) ? \
+ ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \
+ ( ( ((U8*)s)[0]==226 ) ? \
+ ((( ((U8*)s)[1]==128 ) && ( ((U8*)s)[2]==168 || ((U8*)s)[2]==169 )) ? 3 : 0) :\
+ (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ) ) : \
+( ( (e) - (s) > 1 ) ? \
+ ( ( ((U8*)s)[0]==194 ) ? \
+ ( ( ((U8*)s)[1]==133 ) ? 2 : 0 ) : \
+ (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) ) : \
+( ( (e) - (s) > 0 ) ? \
+ (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) : 0 ) ) )
+
+#define is_VERTWS_latin1(s) /*** Vertical Whitespace: \v \V ***/ \
+( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) || ((U8*)s)[0]==133 )
+
+#define is_VERTWS_latin1_safe(s,e) /*** Vertical Whitespace: \v \V ***/ \
+( ( (e) - (s) > 0 ) ? \
+ ( (10<=((U8*)s)[0] && ((U8*)s)[0]<=13) || ((U8*)s)[0]==133 ) : 0 )
+
+#define is_VERTWS_cp(cp) /*** Vertical Whitespace: \v \V ***/ \
+( (10<=cp && cp<=13) || ( cp > 13 && ( cp==133 || ( cp > 133 && ( cp==8232 || ( cp > 8232 && cp==8233 ) ) ) ) ) )
+
#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
+/* CASE_SYNST_FNC(nAmE)
+ *
+ * Expands to two arms of a switch: `case nAmE:` and `case N<nAmE>:`.
+ * Each arm walks the 256 byte values and tests them with the
+ * is_<nAmE>_cp() macro, updating the bitmap of data->start_class:
+ *
+ *   - with SCF_DO_STCLASS_AND set in `flags`, values that cannot start a
+ *     match are cleared (non-members for nAmE, members for N<nAmE>);
+ *   - otherwise, values that can start a match are set.
+ *
+ * The expansion uses `flags`, `value` and `data` from the enclosing
+ * scope.  The final `break` has no semicolon; the caller supplies it.
+ */
+#define CASE_SYNST_FNC(nAmE) \
+case nAmE: \
+ if (flags & SCF_DO_STCLASS_AND) { \
+ for (value = 0; value < 256; value++) \
+ if (!is_ ## nAmE ## _cp(value)) \
+ ANYOF_BITMAP_CLEAR(data->start_class, value); \
+ } \
+ else { \
+ for (value = 0; value < 256; value++) \
+ if (is_ ## nAmE ## _cp(value)) \
+ ANYOF_BITMAP_SET(data->start_class, value); \
+ } \
+ break; \
+case N ## nAmE: \
+ if (flags & SCF_DO_STCLASS_AND) { \
+ for (value = 0; value < 256; value++) \
+ if (is_ ## nAmE ## _cp(value)) \
+ ANYOF_BITMAP_CLEAR(data->start_class, value); \
+ } \
+ else { \
+ for (value = 0; value < 256; value++) \
+ if (!is_ ## nAmE ## _cp(value)) \
+ ANYOF_BITMAP_SET(data->start_class, value); \
+ } \
+ break
+
+
+
STATIC I32
S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
I32 *minlenp, I32 *deltap,
break;
}
}
+ /* LNBREAK: a node that always consumes input but whose width is not
+  * fixed, so both the start class and the length bookkeeping need
+  * updating. */
+ else if (OP(scan) == LNBREAK) {
+ if (flags & SCF_DO_STCLASS) {
+ int value = 0;
+ data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
+ /* AND mode: drop every byte value that is not a linebreak. */
+ if (flags & SCF_DO_STCLASS_AND) {
+ for (value = 0; value < 256; value++)
+ if (!is_LNBREAK_cp(value))
+ ANYOF_BITMAP_CLEAR(data->start_class, value);
+ }
+ /* Otherwise: add every byte value that is a linebreak. */
+ else {
+ for (value = 0; value < 256; value++)
+ if (is_LNBREAK_cp(value))
+ ANYOF_BITMAP_SET(data->start_class, value);
+ }
+ if (flags & SCF_DO_STCLASS_OR)
+ cl_and(data->start_class, and_withp);
+ /* The start class is settled by this node; stop collecting. */
+ flags &= ~SCF_DO_STCLASS;
+ }
+ /* NOTE(review): the minimum grows by 1 while the spread grows by 2;
+  * presumably a deliberately loose upper bound for the CR LF pair --
+  * confirm whether +1 would be sufficient. */
+ min += 1;
+ delta += 2;
+ if (flags & SCF_DO_SUBSTR) {
+ SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
+ data->pos_min += 1;
+ data->pos_delta += 2;
+ data->longest = &(data->longest_float);
+ }
+
+ }
else if (strchr((const char*)PL_simple,OP(scan))) {
int value = 0;
}
}
break;
+ CASE_SYNST_FNC(VERTWS);
+ CASE_SYNST_FNC(HORIZWS);
+
}
if (flags & SCF_DO_STCLASS_OR)
cl_and(data->start_class, and_withp);
}
#endif /* old or new */
#endif /* TRIE_STUDY_OPT */
+
/* Else: zero-length, ignore. */
scan = regnext(scan);
}
ret = reg_node(pRExC_state, NDIGIT);
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
+ case 'R':
+ ret = reg_node(pRExC_state, LNBREAK);
+ *flagp |= HASWIDTH|SIMPLE;
+ goto finish_meta_pat;
+ case 'h':
+ ret = reg_node(pRExC_state, HORIZWS);
+ *flagp |= HASWIDTH|SIMPLE;
+ goto finish_meta_pat;
+ case 'H':
+ ret = reg_node(pRExC_state, NHORIZWS);
+ *flagp |= HASWIDTH|SIMPLE;
+ goto finish_meta_pat;
case 'v':
- ret = reganode(pRExC_state, PRUNE, 0);
- ret->flags = 1;
- *flagp |= SIMPLE;
+ ret = reg_node(pRExC_state, VERTWS);
+ *flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'V':
- ret = reganode(pRExC_state, SKIP, 0);
- ret->flags = 1;
- *flagp |= SIMPLE;
+ ret = reg_node(pRExC_state, NVERTWS);
+ *flagp |= HASWIDTH|SIMPLE;
finish_meta_pat:
nextchar(pRExC_state);
Set_Node_Length(ret, 2); /* MJD */
case 'C': /* Single char !DANGEROUS! */
case 'd': case 'D': /* digit class */
case 'g': case 'G': /* generic-backref, pos assertion */
+ case 'h': case 'H': /* HORIZWS */
case 'k': case 'K': /* named backref, keep marker */
case 'N': /* named char sequence */
case 'p': case 'P': /* unicode property */
+ case 'R': /* LNBREAK */
case 's': case 'S': /* space class */
- case 'v': case 'V': /* (*PRUNE) and (*SKIP) */
+ case 'v': case 'V': /* VERTWS */
case 'w': case 'W': /* word class */
case 'X': /* eXtended Unicode "combining character sequence" */
case 'z': case 'Z': /* End of line/string assertion */
what = WORD; \
break
+/* _C_C_T_NOLOC_(NAME,TEST,WORD)
+ *
+ * Used as `case _C_C_T_NOLOC_(...);` -- the caller writes the leading
+ * `case` and the trailing semicolon.  Expands to the arms for
+ * ANYOF_<NAME> and ANYOF_N<NAME>: every byte value 0..255 for which TEST
+ * holds (or, for the negated class, does not hold) is set in the bitmap
+ * of `ret`, and `yesno` / `what` are assigned '+' or '!' and WORD.
+ *
+ * TEST is an expression over `value` from the enclosing scope.  It is
+ * parenthesised on expansion so that negating it cannot bind to only
+ * the first operand of a compound expression.
+ */
+#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
+ANYOF_##NAME: \
+ for (value = 0; value < 256; value++) \
+ if ((TEST)) \
+ ANYOF_BITMAP_SET(ret, value); \
+ yesno = '+'; \
+ what = WORD; \
+ break; \
+case ANYOF_N##NAME: \
+ for (value = 0; value < 256; value++) \
+ if (!(TEST)) \
+ ANYOF_BITMAP_SET(ret, value); \
+ yesno = '!'; \
+ what = WORD; \
+ break
/*
parse a class specification and produce either an ANYOF node that
S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
{
dVAR;
- register UV value = 0;
register UV nextvalue;
register IV prevvalue = OOB_UNICODE;
register IV range = 0;
+ UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
register regnode *ret;
STRLEN numlen;
IV namedclass;
case 'S': namedclass = ANYOF_NSPACE; break;
case 'd': namedclass = ANYOF_DIGIT; break;
case 'D': namedclass = ANYOF_NDIGIT; break;
+ case 'v': namedclass = ANYOF_VERTWS; break;
+ case 'V': namedclass = ANYOF_NVERTWS; break;
+ case 'h': namedclass = ANYOF_HORIZWS; break;
+ case 'H': namedclass = ANYOF_NHORIZWS; break;
case 'N': /* Handle \N{NAME} in class */
{
/* We only pay attention to the first char of
case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
case _C_C_T_(UPPER, isUPPER(value), "Upper");
case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
+ /* Test the code point directly.  Passing &value to the *_latin1
+  * macros would inspect only the first byte of the UV in memory,
+  * which is the low-order byte on little-endian hosts only. */
+ case _C_C_T_NOLOC_(VERTWS, is_VERTWS_cp(value), "VertSpace");
+ case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_cp(value), "HorizSpace");
case ANYOF_ASCII:
if (LOC)
ANYOF_CLASS_SET(ret, ANYOF_ASCII);
* License or the Artistic License, as specified in the README file.
*
*/
+#include "regcharclass.h"
typedef OP OP_4tree; /* Will be redefined later. */
#define ANYOF_BITMAP_SIZE 32 /* 256 b/(8 b/B) */
-#define ANYOF_CLASSBITMAP_SIZE 4 /* up to 32 (8*4) named classes */
+#define ANYOF_CLASSBITMAP_SIZE 4 /* up to 32 (8*4) named classes */
/* also used by trie */
struct regnode_charclass {
#define ANYOF_MAX 32
+/* pseudo classes, not stored in the class bitmap, but used as flags
+ during compilation of char classes */
+
+#define ANYOF_VERTWS (ANYOF_MAX+1)
+#define ANYOF_NVERTWS (ANYOF_MAX+2)
+#define ANYOF_HORIZWS (ANYOF_MAX+3)
+#define ANYOF_NHORIZWS (ANYOF_MAX+4)
+
/* Backward source code compatibility. */
#define ANYOF_ALNUML ANYOF_ALNUM
SPACE, SPACEL,
NSPACE, NSPACEL,
DIGIT, NDIGIT,
+ VERTWS, NVERTWS,
+ HORIZWS, NHORIZWS,
0
};
#endif
#endif /* DEBUG RELATED DEFINES */
+
#*Control what to keep in $&.
KEEPS KEEPS, no $& begins here.
-# NEW STUFF ABOVE THIS LINE -- Please update counts below.
+#*New charclass like patterns
+LNBREAK LNBREAK, none generic newline pattern
+VERTWS VERTWS, none vertical whitespace (Perl 6)
+NVERTWS NVERTWS, none not vertical whitespace (Perl 6)
+HORIZWS HORIZWS, none horizontal whitespace (Perl 6)
+NHORIZWS NHORIZWS, none not horizontal whitespace (Perl 6)
+
+# NEW STUFF ABOVE THIS LINE
################################################################################
if ((!reginfo || regtry(reginfo, &s))) \
goto got_it
+/* REXEC_FBC_CSCAN(CoNdUtF8,CoNd)
+ *
+ * Body of a switch arm: when `do_utf8` is true it runs
+ * REXEC_FBC_UTF8_CLASS_SCAN with the UTF-8 condition, otherwise
+ * REXEC_FBC_CLASS_SCAN with the byte condition, then leaves the switch.
+ * The final `break` has no semicolon; the caller supplies it.
+ */
+#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
+ if (do_utf8) { \
+ REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
+ } \
+ else { \
+ REXEC_FBC_CLASS_SCAN(CoNd); \
+ } \
+ break
+
#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
if (do_utf8) { \
UtFpReLoAd; \
!isDIGIT_LC_utf8((U8*)s),
!isDIGIT_LC(*s)
);
+ case LNBREAK:
+ REXEC_FBC_CSCAN(
+ is_LNBREAK_utf8(s),
+ is_LNBREAK_latin1(s)
+ );
+ case VERTWS:
+ REXEC_FBC_CSCAN(
+ is_VERTWS_utf8(s),
+ is_VERTWS_latin1(s)
+ );
+ case NVERTWS:
+ REXEC_FBC_CSCAN(
+ !is_VERTWS_utf8(s),
+ !is_VERTWS_latin1(s)
+ );
+ case HORIZWS:
+ REXEC_FBC_CSCAN(
+ is_HORIZWS_utf8(s),
+ is_HORIZWS_latin1(s)
+ );
+ case NHORIZWS:
+ REXEC_FBC_CSCAN(
+ !is_HORIZWS_utf8(s),
+ !is_HORIZWS_latin1(s)
+ );
case AHOCORASICKC:
case AHOCORASICK:
{
* pack("U0U*", 0xDF) =~ /ss/i,
* the 0xC3 0x9F are the UTF-8
* byte sequence for the U+00DF. */
+
if (!(do_utf8 &&
- toLOWER(s[0]) == 's' &&
+ toLOWER(s[0]) == 's' &&
ln >= 2 &&
toLOWER(s[1]) == 's' &&
(U8)l[0] == 0xC3 &&
/* NOTREACHED */
#undef ST
+ case LNBREAK:
+ if ((n=is_LNBREAK(locinput,do_utf8))) {
+ locinput += n;
+ nextchr = UCHARAT(locinput);
+ } else
+ sayNO;
+ break;
+
+/* CASE_CLASS(nAmE)
+ *
+ * Expands to the `case nAmE:` and `case N<nAmE>:` arms.
+ *
+ * Positive arm: is_<nAmE>() yields the number of bytes matched at
+ * locinput (0 for no match); on a match the input pointer moves past
+ * them, otherwise the node fails.
+ *
+ * Negated arm: fails when is_<nAmE>() matches, otherwise steps over
+ * exactly one character.  Two details matter here:
+ *   - at end of input there is no character to consume, so the node
+ *     must fail instead of stepping over the terminating NUL;
+ *   - UTF8SKIP() is meaningful only for UTF-8 targets; for byte strings
+ *     a character is always one byte, whatever its value.
+ */
+#define CASE_CLASS(nAmE) \
+ case nAmE: \
+ if ((n=is_##nAmE(locinput,do_utf8))) { \
+ locinput += n; \
+ nextchr = UCHARAT(locinput); \
+ } else \
+ sayNO; \
+ break; \
+ case N##nAmE: \
+ if (!nextchr && locinput >= PL_regeol) \
+ sayNO; \
+ if ((n=is_##nAmE(locinput,do_utf8))) { \
+ sayNO; \
+ } else { \
+ locinput += do_utf8 ? UTF8SKIP(locinput) : 1; \
+ nextchr = UCHARAT(locinput); \
+ } \
+ break
+
+ CASE_CLASS(VERTWS);
+ CASE_CLASS(HORIZWS);
+#undef CASE_CLASS
+
default:
PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
PTR2UV(scan), OP(scan));
while (scan < loceol && !isDIGIT(*scan))
scan++;
}
+ break; /* terminate the preceding case; it must not fall through into LNBREAK */
+ case LNBREAK:
+ /* Count consecutive linebreaks.  A single match may span more than
+  * one byte, so matches are tallied in hardcount in both branches
+  * rather than derived from the distance scanned. */
+ if (do_utf8) {
+ loceol = PL_regeol;
+ while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
+ scan += c;
+ hardcount++;
+ }
+ } else {
+ /*
+ LNBREAK can match two latin chars, which is ok,
+ because we have a null terminated string, but we
+ have to use hardcount in this situation
+ */
+ while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
+ scan+=c;
+ hardcount++;
+ }
+ }
+ break;
+ case HORIZWS:
+ if (do_utf8) {
+ loceol = PL_regeol;
+ while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
+ scan += c;
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && is_HORIZWS_latin1(scan))
+ scan++;
+ }
break;
+ case NHORIZWS:
+ if (do_utf8) {
+ loceol = PL_regeol;
+ while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && !is_HORIZWS_latin1(scan))
+ scan++;
+
+ }
+ break;
+ case VERTWS:
+ if (do_utf8) {
+ loceol = PL_regeol;
+ while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
+ scan += c;
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && is_VERTWS_latin1(scan))
+ scan++;
+
+ }
+ break;
+ case NVERTWS:
+ if (do_utf8) {
+ loceol = PL_regeol;
+ while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && !is_VERTWS_latin1(scan))
+ scan++;
+
+ }
+ break;
+
default: /* Called on something of 0 width. */
break; /* So match right here or not at all. */
}
/* Regops and State definitions */
-#define REGNODE_MAX 84
-#define REGMATCH_STATE_MAX 124
+#define REGNODE_MAX 89
+#define REGMATCH_STATE_MAX 129
#define END 0 /* 0000 End of program. */
#define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */
#define COMMIT 80 /* 0x50 Pattern fails outright if backtracking through this */
#define CUTGROUP 81 /* 0x51 On failure go to the next alternation in the group */
#define KEEPS 82 /* 0x52 $& begins here. */
-#define OPTIMIZED 83 /* 0x53 Placeholder for dump. */
-#define PSEUDO 84 /* 0x54 Pseudo opcode for internal use. */
+#define LNBREAK 83 /* 0x53 generic newline pattern */
+#define VERTWS 84 /* 0x54 vertical whitespace (Perl 6) */
+#define NVERTWS 85 /* 0x55 not vertical whitespace (Perl 6) */
+#define HORIZWS 86 /* 0x56 horizontal whitespace (Perl 6) */
+#define NHORIZWS 87 /* 0x57 not horizontal whitespace (Perl 6) */
+#define OPTIMIZED 88 /* 0x58 Placeholder for dump. */
+#define PSEUDO 89 /* 0x59 Pseudo opcode for internal use. */
/* ------------ States ------------- */
#define TRIE_next (REGNODE_MAX + 1) /* state for TRIE */
#define TRIE_next_fail (REGNODE_MAX + 2) /* state for TRIE */
VERB, /* COMMIT */
VERB, /* CUTGROUP */
KEEPS, /* KEEPS */
+ LNBREAK, /* LNBREAK */
+ VERTWS, /* VERTWS */
+ NVERTWS, /* NVERTWS */
+ HORIZWS, /* HORIZWS */
+ NHORIZWS, /* NHORIZWS */
NOTHING, /* OPTIMIZED */
PSEUDO, /* PSEUDO */
/* ------------ States ------------- */
EXTRA_SIZE(struct regnode_1), /* COMMIT */
EXTRA_SIZE(struct regnode_1), /* CUTGROUP */
0, /* KEEPS */
+ 0, /* LNBREAK */
+ 0, /* VERTWS */
+ 0, /* NVERTWS */
+ 0, /* HORIZWS */
+ 0, /* NHORIZWS */
0, /* OPTIMIZED */
0, /* PSEUDO */
};
0, /* COMMIT */
0, /* CUTGROUP */
0, /* KEEPS */
+ 0, /* LNBREAK */
+ 0, /* VERTWS */
+ 0, /* NVERTWS */
+ 0, /* HORIZWS */
+ 0, /* NHORIZWS */
0, /* OPTIMIZED */
0, /* PSEUDO */
};
"COMMIT", /* 0x50 */
"CUTGROUP", /* 0x51 */
"KEEPS", /* 0x52 */
- "OPTIMIZED", /* 0x53 */
- "PSEUDO", /* 0x54 */
+ "LNBREAK", /* 0x53 */
+ "VERTWS", /* 0x54 */
+ "NVERTWS", /* 0x55 */
+ "HORIZWS", /* 0x56 */
+ "NHORIZWS", /* 0x57 */
+ "OPTIMIZED", /* 0x58 */
+ "PSEUDO", /* 0x59 */
/* ------------ States ------------- */
"TRIE_next", /* REGNODE_MAX +0x01 */
"TRIE_next_fail", /* REGNODE_MAX +0x02 */
}
-
+print "# set PERL_SKIP_PSYCHO_TEST to skip this test\n";
if (!$ENV{PERL_SKIP_PSYCHO_TEST}){
my @normal=qw(these are some normal words);
my $psycho=join "|",@normal,map chr $_,255..20000;
if ($ENV{PERL_SKIP_PSYCHO_TEST}){
printf "ok %d Skip: No psycho tests\n", $test++;
} else {
+ print "# set PERL_SKIP_PSYCHO_TEST to skip this test\n";
my $r = qr/^
(?:
( (?:a|z+)+ )
1 while /.(??{'(*PRUNE)'})(?{$count++})(*FAIL)/g;
iseq($count,4,"/.(*PRUNE)/");
}
-{ # Test the \v form of the (*PRUNE) pattern
- our $count = 0;
- 'aaab'=~/a+b?(?{$count++})(*FAIL)/;
- iseq($count,9,"expect 9 for no \\v");
- $count = 0;
- 'aaab'=~/a+b?\v(?{$count++})(*FAIL)/;
- iseq($count,3,"expect 3 with \\v");
- local $_='aaab';
- $count=0;
- 1 while /.\v(?{$count++})(*FAIL)/g;
- iseq($count,4,"/.\\v/");
- $count = 0;
- 'aaab'=~/a+b?(??{'\v'})(?{$count++})(*FAIL)/;
- iseq($count,3,"expect 3 with \\v");
- local $_='aaab';
- $count=0;
- 1 while /.(??{'\v'})(?{$count++})(*FAIL)/g;
- iseq($count,4,"/.\\v/");
-}
{ # Test the (*SKIP) pattern
our $count = 0;
'aaab'=~/a+b?(*SKIP)(?{$count++})(*FAIL)/;
iseq($count,2,"Expect 2 with (*SKIP)" );
iseq("@res","aaab aaab","adjacent (*SKIP) works as expected" );
}
-{ # Test the \V form of the (*SKIP) pattern
- our $count = 0;
- 'aaab'=~/a+b?\V(?{$count++})(*FAIL)/;
- iseq($count,1,"expect 1 with \\V");
- local $_='aaab';
- $count=0;
- 1 while /.\V(?{$count++})(*FAIL)/g;
- iseq($count,4,"/.\\V/");
- $_='aaabaaab';
- $count=0;
- our @res=();
- 1 while /(a+b?)\V(?{$count++; push @res,$1})(*FAIL)/g;
- iseq($count,2,"Expect 2 with \\V" );
- iseq("@res","aaab aaab","adjacent \\V works as expected" );
-}
{ # Test the (*SKIP) pattern
our $count = 0;
'aaab'=~/a+b?(*MARK:foo)(*SKIP)(?{$count++})(*FAIL)/;
iseq("$1$2",'foooooobaaaaar');
}
iseq("$1$2","foobar");
+}
+{
+ local $_="\t \r\n \n \t".chr(11)."\n";
+ s/\H/H/g;
+ s/\h/h/g;
+ iseq($_,"hhHHhHhhHH");
+ $_="\t \r\n \n \t".chr(11)."\n";
+ utf8::upgrade($_);
+ s/\H/H/g;
+ s/\h/h/g;
+ iseq($_,"hhHHhHhhHH");
}
+{
+ # Each listed character (plus the CR LF pair for \R) must be matched
+ # in full by the single and by the quantified form of its escape,
+ # both before and after the string is upgraded to UTF-8.
+ my @h=map { chr( $_ ) } (
+ 0x09, 0x20, 0xa0, 0x1680, 0x180e, 0x2000, 0x2001, 0x2002,
+ 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200a,
+ 0x202f, 0x205f, 0x3000
+ );
+ my @v=map { chr( $_ ) } ( 0x0a, 0x0b, 0x0c, 0x0d, 0x85, 0x2028, 0x2029 );
+ my @lb=( "\x0D\x0A",
+ map { chr( $_ ) } ( 0x0A..0x0D,0x85,0x2028,0x2029 ));
+ # Each entry: [ list of strings, single-match pattern, repeated pattern ]
+ foreach my $t ([\@h,qr/\h/,qr/\h+/],[\@v,qr/\v/,qr/\v+/],[\@lb,qr/\R/,qr/\R+/],){
+ my $ary=shift @$t; # what remains in @$t are the patterns
+ foreach my $pat (@$t) {
+ foreach my $str (@$ary) {
+ ok($str=~/($pat)/);
+ iseq($1,$str);
+ utf8::upgrade($str);
+ ok($str=~/($pat)/);
+ iseq($1,$str);
+ }
+ }
+ }
+}
+
# Test counter is at bottom of file. Put new tests above here.
#-------------------------------------------------------------------
# Keep the following tests last -- they may crash perl
iseq(0+$::test,$::TestCount,"Got the right number of tests!");
# Don't forget to update this!
BEGIN {
- $::TestCount = 1663;
+ $::TestCount = 1928;
print "1..$::TestCount\n";
}
+
(?(DEFINE)(?<A>(?&B)+)(?<B>a))(?&A) a y $& a
(?(DEFINE)(?<A>(?&B)+)(?<B>a))(?&A) aa y $& aa
\x{100}?(??{""})xxx xxx y $& xxx
+
+foo(\R)bar foo\r\nbar y $1 \r\n
+foo(\R)bar foo\nbar y $1 \n
+foo(\R)bar foo\rbar y $1 \r
+
+foo(\R+)bar foo\r\n\x{85}\r\n\nbar y $1 \r\n\x{85}\r\n\n
+(\V+)(\R) foo\r\n\x{85}\r\n\nbar y $1-$2 foo-\r\n
+(\R+)(\V) foo\r\n\x{85}\r\n\nbar y $1-$2 \r\n\x{85}\r\n\n-b
+foo(\R)bar foo\x{85}bar y $1 \x{85}
+(\V)(\R) foo\x{85}bar y $1-$2 o-\x{85}
+(\R)(\V) foo\x{85}bar y $1-$2 \x{85}-b
+foo(\R)bar foo\r\nbar y $1 \r\n
+(\V)(\R) foo\r\nbar y $1-$2 o-\r\n
+(\R)(\V) foo\r\nbar y $1-$2 \r\n-b
+foo(\R)bar foo\r\nbar y $1 \r\n
+(\V)(\R) foo\r\nbar y $1-$2 o-\r\n
+(\R)(\V) foo\r\nbar y $1-$2 \r\n-b
+foo(\R)bar foo\rbar y $1 \r
+(\V)(\R) foo\rbar y $1-$2 o-\r
+(\R)(\V) foo\rbar y $1-$2 \r-b
+
+foo(\v+)bar foo\r\n\x{85}\r\n\nbar y $1 \r\n\x{85}\r\n\n
+(\V+)(\v) foo\r\n\x{85}\r\n\nbar y $1-$2 foo-\r
+(\v+)(\V) foo\r\n\x{85}\r\n\nbar y $1-$2 \r\n\x{85}\r\n\n-b
+foo(\v)bar foo\x{85}bar y $1 \x{85}
+(\V)(\v) foo\x{85}bar y $1-$2 o-\x{85}
+(\v)(\V) foo\x{85}bar y $1-$2 \x{85}-b
+foo(\v)bar foo\rbar y $1 \r
+(\V)(\v) foo\rbar y $1-$2 o-\r
+(\v)(\V) foo\rbar y $1-$2 \r-b
+
+
+foo(\h+)bar foo\t\x{A0}bar y $1 \t\x{A0}
+(\H+)(\h) foo\t\x{A0}bar y $1-$2 foo-\t
+(\h+)(\H) foo\t\x{A0}bar y $1-$2 \t\x{A0}-b
+foo(\h)bar foo\x{A0}bar y $1 \x{A0}
+(\H)(\h) foo\x{A0}bar y $1-$2 o-\x{A0}
+(\h)(\H) foo\x{A0}bar y $1-$2 \x{A0}-b
+foo(\h)bar foo\tbar y $1 \t
+(\H)(\h) foo\tbar y $1-$2 o-\t
+(\h)(\H) foo\tbar y $1-$2 \t-b
\$got = "$repl";
EOFCODE
}
+ #$code.=qq[\n\$expect="$expect";\n];
+ #use Devel::Peek;
+ #die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/;
{
# Probably we should annotate specific tests with which warnings
# categories they're known to trigger, and hence should be