$VERSION = sprintf("%d.%02d", q$Revision: 2.3 $ =~ /(\d+)\.(\d+)/);
+use re 'asciirange'; # ranges in regular expressions refer to ASCII
sub encode_qp ($)
{
When C<use re 'eval'> is in effect, a regex is allowed to contain
C<(?{ ... })> zero-width assertions even if regular expression contains
-variable interpolation. That is normally disallowed, since it is a
+variable interpolation. That is normally disallowed, since it is a
potential security risk. Note that this pragma is ignored when the regular
expression is obtained from tainted data, i.e. evaluation is always
disallowed with tainted regular expresssions. See L<perlre/(?{ code })>.
-For the purpose of this pragma, interpolation of precompiled regular
+For the purpose of this pragma, interpolation of precompiled regular
expressions (i.e., the result of C<qr//>) is I<not> considered variable
interpolation. Thus:
/foo${pat}bar/
-I<is> allowed if $pat is a precompiled regular expression, even
+I<is> allowed if $pat is a precompiled regular expression, even
if $pat contains C<(?{ ... })> assertions.
-When C<use re 'debug'> is in effect, perl emits debugging messages when
+When C<use re 'debug'> is in effect, perl emits debugging messages when
compiling and using regular expressions. The output is the same as that
obtained by running a C<-DDEBUGGING>-enabled perl interpreter with the
B<-Dr> switch. It may be quite voluminous depending on the complexity
form of output that can be used to get a colorful display on terminals
that understand termcap color sequences. Set C<$ENV{PERL_RE_TC}> to a
comma-separated list of C<termcap> properties to use for highlighting
-strings on/off, pre-point part on/off.
+strings on/off, pre-point part on/off.
See L<perldebug/"Debugging regular expressions"> for additional info.
The directive C<use re 'debug'> is I<not lexically scoped>, as the
# N.B. File::Basename contains a literal for 'taint' as a fallback. If
# taint is changed here, File::Basename must be updated as well.
my %bitmask = (
-taint => 0x00100000,
-eval => 0x00200000,
+taint => 0x00100000,
+eval => 0x00200000,
+asciirange => 0x02000000,
);
sub setcolor {
#define HINT_FILETEST_ACCESS 0x00400000
#define HINT_UTF8 0x00800000
#define HINT_UTF8_DISTINCT 0x01000000
+#define HINT_RE_ASCIIR 0x02000000
/* Various states of an input record separator SV (rs, nrs) */
#define RsSNARF(sv) (! SvOK(sv))
for (value = 0; value < 128; value++)
ANYOF_BITMAP_SET(ret, value);
#else /* EBCDIC */
- for (value = 0; value < 256; value++)
- if (isASCII(value))
- ANYOF_BITMAP_SET(ret, value);
+ for (value = 0; value < 256; value++) {
+ if (PL_hints & HINT_RE_ASCIIR) {
+ if (NATIVE_TO_ASCII(value) < 128)
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ else {
+ if (isASCII(value))
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ }
#endif /* EBCDIC */
}
dont_optimize_invert = TRUE;
for (value = 128; value < 256; value++)
ANYOF_BITMAP_SET(ret, value);
#else /* EBCDIC */
- for (value = 0; value < 256; value++)
- if (!isASCII(value))
- ANYOF_BITMAP_SET(ret, value);
+ for (value = 0; value < 256; value++) {
+ if (PL_hints & HINT_RE_ASCIIR) {
+ if (NATIVE_TO_ASCII(value) >= 128)
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ else {
+ if (!isASCII(value))
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ }
#endif /* EBCDIC */
}
dont_optimize_invert = TRUE;
} /* end of namedclass \blah */
if (range) {
- if (lastvalue > value) /* b-a */ {
+ if (((lastvalue > value) && !(PL_hints & HINT_RE_ASCIIR)) ||
+ ((NATIVE_TO_UNI(lastvalue) > NATIVE_TO_UNI(value)) && (PL_hints & HINT_RE_ASCIIR))) /* b-a */ {
Simple_vFAIL4("Invalid [] range \"%*.*s\"",
RExC_parse - rangebegin,
RExC_parse - rangebegin,
if (!SIZE_ONLY) {
if (lastvalue < 256 && value < 256) {
#ifdef EBCDIC /* EBCDIC, for example. */
- if ((isLOWER(lastvalue) && isLOWER(value)) ||
+ if (PL_hints & HINT_RE_ASCIIR) {
+ IV i;
+ /* New style scheme for ranges:
+ * after :
+ * use re 'asciir';
+ * do ranges in ASCII/Unicode space
+ */
+ for (i = NATIVE_TO_ASCII(lastvalue) ; i <= NATIVE_TO_ASCII(value); i++)
+ ANYOF_BITMAP_SET(ret, ASCII_TO_NATIVE(i));
+ }
+ else if ((isLOWER(lastvalue) && isLOWER(value)) ||
(isUPPER(lastvalue) && isUPPER(value)))
{
IV i;
{
ReREFCNT_dec((regexp *)r);
}
+
use bytes ();
-$d = pack("U*", 0xe3, 0x81, 0x82);
+# on EBCDIC chars are mapped differently so pick something that needs encoding
+# there too.
+$d = pack("U*", 0xe3, 0x81, 0xAF);
$ol = bytes::length($d);
print "not " unless $ol > 3;
print "ok 25\n";
%u = ($d => "downgrade");
for (keys %u) {
use bytes;
- print "not " if length ne 3 or $_ ne "\xe3\x81\x82";
+ print "not " if length ne 3 or $_ ne "\xe3\x81\xAF";
print "ok 26\n";
}
{
chdir 't' if -d 't';
@INC = '../lib';
}
+
+use re 'asciirange'; # Compute ranges in ASCII space
+
eval 'use Config'; # Defaults assumed if this fails
$x = "abc\ndef\n";
@INC = '../lib';
}
+use re 'asciirange'; # ranges are computed in ASCII
+
$iters = shift || 1; # Poor man performance suite, 10000 is OK.
open(TESTS,'op/re_tests') || open(TESTS,'t/op/re_tests') ||
BEGIN {
chdir 't' if -d 't';
@INC = '../lib';
+
+}
+
+{
+ my $wide = v256;
+ use bytes;
+ print STDERR ord($wide),"\n";
+ if (ord($wide) == 140) {
+ print "1..0 # Skip: UTF-EBCDIC (not UTF-8) used here\n";
+ exit 0;
+ }
+ elsif (ord($wide) != 196) {
+ warn sprintf("v256 starts with %02X\n",ord($wide));
+ }
}
no utf8;
# This table is based on Markus Kuhn's UTF-8 Decode Stress Tester,
# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
-# version dated 2000-09-02.
+# version dated 2000-09-02.
# We use the \x notation instead of raw binary bytes for \x00-\x1f\x7f-\xff
# because e.g. many patch programs have issues with binary data.
my @MK = split(/\n/, <<__EOMK__);
1 Correct UTF-8
1.1.1 y "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5" - 11 ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5 5
-2 Boundary conditions
+2 Boundary conditions
2.1 First possible sequence of certain length
2.1.1 y "\x00" 0 1 00 1
2.1.2 y "\xc2\x80" 80 2 c2:80 1
sub moan {
print "$id: @_";
}
-
+
sub test_unpack_U {
$WARNCNT = 0;
$WARNMSG = "";
PUSHMARK(SP);
EXTEND(SP,3);
PUSHs((SV*)sv);
- PUSHs(sv_2mortal(newSViv(code_point & ~(needents - 1))));
+ /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
+ PUSHs(sv_2mortal(newSViv((klen) ? (code_point & ~(needents - 1)) : 0)));
PUSHs(sv_2mortal(newSViv(needents)));
PUTBACK;
if (call_method("SWASHGET", G_SCALAR))