I32 ch;
/* Need to check this, otherwise 128..255 won't match */
- const UV c = utf8n_to_uvchr(s, send - s, &ulen, 0);
+ const UV c = utf8n_to_uvchr(s, send - s, &ulen, UTF8_ALLOW_DEFAULT);
if (c < 0x100 && (ch = tbl[c]) >= 0) {
matches++;
d = uvchr_to_utf8(d, ch);
const I32 complement = PL_op->op_private & OPpTRANS_COMPLEMENT;
while (s < send) {
STRLEN ulen;
- const UV c = utf8n_to_uvchr(s, send - s, &ulen, 0);
+ const UV c = utf8n_to_uvchr(s, send - s, &ulen, UTF8_ALLOW_DEFAULT);
if (c < 0x100) {
if (tbl[c] >= 0)
matches++;
UV pch = 0xfeedface;
while (s < send) {
STRLEN len;
- const UV comp = utf8_to_uvchr(s, &len);
+ const UV comp = utf8n_to_uvchr(s, send - s, &len,
+ UTF8_ALLOW_DEFAULT);
I32 ch;
if (comp > 0xff) {
else {
while (s < send) {
STRLEN len;
- const UV comp = utf8_to_uvchr(s, &len);
+ const UV comp = utf8n_to_uvchr(s, send - s, &len,
+ UTF8_ALLOW_DEFAULT);
I32 ch;
if (comp > 0xff) {
if (!complement) {
}
else {
STRLEN len;
- uv = utf8_to_uvuni(s, &len);
+ uv = utf8n_to_uvuni(s, send - s, &len, UTF8_ALLOW_DEFAULT);
if (uv != puv) {
Move(s, d, len, U8);
d += len;
const I32 to_utf = o->op_private & OPpTRANS_TO_UTF;
U8* tsave = NULL;
U8* rsave = NULL;
+ const U32 flags = UTF8_ALLOW_DEFAULT;
if (!from_utf) {
STRLEN len = tlen;
i = 0;
transv = newSVpvs("");
while (t < tend) {
- cp[2*i] = utf8n_to_uvuni(t, tend-t, &ulen, 0);
+ cp[2*i] = utf8n_to_uvuni(t, tend-t, &ulen, flags);
t += ulen;
if (t < tend && NATIVE_TO_UTF(*t) == 0xff) {
t++;
- cp[2*i+1] = utf8n_to_uvuni(t, tend-t, &ulen, 0);
+ cp[2*i+1] = utf8n_to_uvuni(t, tend-t, &ulen, flags);
t += ulen;
}
else {
while (t < tend || tfirst <= tlast) {
/* see if we need more "t" chars */
if (tfirst > tlast) {
- tfirst = (I32)utf8n_to_uvuni(t, tend - t, &ulen, 0);
+ tfirst = (I32)utf8n_to_uvuni(t, tend - t, &ulen, flags);
t += ulen;
if (t < tend && NATIVE_TO_UTF(*t) == 0xff) { /* illegal utf8 val indicates range */
t++;
- tlast = (I32)utf8n_to_uvuni(t, tend - t, &ulen, 0);
+ tlast = (I32)utf8n_to_uvuni(t, tend - t, &ulen, flags);
t += ulen;
}
else
/* now see if we need more "r" chars */
if (rfirst > rlast) {
if (r < rend) {
- rfirst = (I32)utf8n_to_uvuni(r, rend - r, &ulen, 0);
+ rfirst = (I32)utf8n_to_uvuni(r, rend - r, &ulen, flags);
r += ulen;
if (r < rend && NATIVE_TO_UTF(*r) == 0xff) { /* illegal utf8 val indicates range */
r++;
- rlast = (I32)utf8n_to_uvuni(r, rend - r, &ulen, 0);
+ rlast = (I32)utf8n_to_uvuni(r, rend - r, &ulen, flags);
r += ulen;
}
else
/* first pass, loop through and scan words */
reg_trie_data *trie;
regnode *cur;
- const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
+ const U32 uniflags = UTF8_ALLOW_DEFAULT;
STRLEN len = 0;
UV uvc = 0;
U16 curword = 0;
if (UTF8_IS_START(*p) && UTF) {
STRLEN numlen;
ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
- &numlen, 0);
+ &numlen, UTF8_ALLOW_DEFAULT);
p += numlen;
}
else
if (UTF) {
value = utf8n_to_uvchr((U8*)RExC_parse,
RExC_end - RExC_parse,
- &numlen, 0);
+ &numlen, UTF8_ALLOW_DEFAULT);
RExC_parse += numlen;
}
else
if (UTF) {
value = utf8n_to_uvchr((U8*)RExC_parse,
RExC_end - RExC_parse,
- &numlen, 0);
+ &numlen, UTF8_ALLOW_DEFAULT);
RExC_parse += numlen;
}
else
U8 *sm = (U8 *) m;
U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
- const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
+ const U32 uniflags = UTF8_ALLOW_DEFAULT;
to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
UV c, f;
U8 tmpbuf [UTF8_MAXBYTES+1];
STRLEN len, foldlen;
- const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
+ const U32 uniflags = UTF8_ALLOW_DEFAULT;
if (c1 == c2) {
/* Upper and lower of 1st char are equal -
* probably not a "letter". */
tmp = '\n';
else {
U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
- tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
+ tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
}
tmp = ((OP(c) == BOUND ?
isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
tmp = '\n';
else {
U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
- tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
+ tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
}
tmp = ((OP(c) == NBOUND ?
isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
{
dVAR;
register const bool do_utf8 = PL_reg_match_utf8;
- const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
+ const U32 uniflags = UTF8_ALLOW_DEFAULT;
regmatch_slab *orig_slab;
regmatch_state *orig_state;
else {
const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
- st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
+ st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
}
if (OP(scan) == BOUND || OP(scan) == NBOUND) {
st->ln = isALNUM_uni(st->ln);
if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
- ckWARN(WARN_UTF8) ? UTF8_CHECK_ONLY :
- UTF8_ALLOW_ANYUV|UTF8_CHECK_ONLY);
+ (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
+ /* see [perl #37836] for UTF8_ALLOW_ANYUV */
if (len == (STRLEN)-1)
Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
}
$| = 1;
-print "1..1200\n";
+print "1..1208\n";
BEGIN {
chdir 't' if -d 't';
ok($f eq "ab", "pos retained between calls # TODO") or print "# $@\n";
}
+# [perl #37836] Simple Regex causes SEGV when run on specific data
+if ($ordA == 193) {
+ print "ok $test # Skip: in EBCDIC\n"; $test++;
+} else {
+ no warnings 'utf8';
+ $_ = pack('U0C2', 0xa2, 0xf8); # ill-formed UTF-8
+ my $ret = 0;
+ eval { $ret = s/[\0]+//g };
+ ok($ret == 0, "ill-formed UTF-8 doesn't match NUL in class");
+}
+
+{ # [perl #38293] chr(65535) should be allowed in regexes
+ no warnings 'utf8'; # to allow non-characters
+ my($c, $r, $s);
+
+ $c = chr 0xffff;
+ $c =~ s/$c//g;
+ ok($c eq "", "U+FFFF, parsed as atom");
+
+ $c = chr 0xffff;
+ $r = "\\$c";
+ $c =~ s/$r//g;
+ ok($c eq "", "U+FFFF backslashed, parsed as atom");
+
+ $c = chr 0xffff;
+ $c =~ s/[$c]//g;
+ ok($c eq "", "U+FFFF, parsed in class");
+
+ $c = chr 0xffff;
+ $r = "[\\$c]";
+ $c =~ s/$r//g;
+ ok($c eq "", "U+FFFF backslashed, parsed in class");
+
+ $s = "A\x{ffff}B";
+ $s =~ s/\x{ffff}//i;
+ ok($s eq "AB", "U+FFFF, EXACTF");
+
+ $s = "\x{ffff}A";
+ $s =~ s/\bA//;
+ ok($s eq "\x{ffff}", "U+FFFF, BOUND");
+
+ $s = "\x{ffff}!";
+ $s =~ s/\B!//;
+ ok($s eq "\x{ffff}", "U+FFFF, NBOUND");
+} # non-characters end
+
+
# Keep the following test last -- it may crash perl
ok(("a" x (2**15 - 10)) =~ /^()(a|bb)*$/, "Recursive stack cracker: #24274")
require './test.pl';
}
-plan tests => 100;
+plan tests => 116;
my $Is_EBCDIC = (ord('i') == 0x89 & ord('J') == 0xd1);
# rt.perl.org 36622. Perl didn't like a y/// at end of file. No trailing
# newline allowed.
fresh_perl_is(q[$_ = "foo"; y/A-Z/a-z/], '');
+
+
+{ # [perl #38293] chr(65535) should be allowed in regexes
+no warnings 'utf8'; # to allow non-characters
+
+$s = "\x{d800}\x{ffff}";
+$s =~ tr/\0/A/;
+is($s, "\x{d800}\x{ffff}", "do_trans_simple");
+
+$s = "\x{d800}\x{ffff}";
+$i = $s =~ tr/\0//;
+is($i, 0, "do_trans_count");
+
+$s = "\x{d800}\x{ffff}";
+$s =~ tr/\0/A/s;
+is($s, "\x{d800}\x{ffff}", "do_trans_complex, SQUASH");
+
+$s = "\x{d800}\x{ffff}";
+$s =~ tr/\0/A/c;
+is($s, "AA", "do_trans_complex, COMPLEMENT");
+
+$s = "A\x{ffff}B";
+$s =~ tr/\x{ffff}/\x{1ffff}/;
+is($s, "A\x{1ffff}B", "utf8, SEARCHLIST");
+
+$s = "\x{fffd}\x{fffe}\x{ffff}";
+$s =~ tr/\x{fffd}-\x{ffff}/ABC/;
+is($s, "ABC", "utf8, SEARCHLIST range");
+
+$s = "ABC";
+$s =~ tr/ABC/\x{ffff}/;
+is($s, "\x{ffff}"x3, "utf8, REPLACEMENTLIST");
+
+$s = "ABC";
+$s =~ tr/ABC/\x{fffd}-\x{ffff}/;
+is($s, "\x{fffd}\x{fffe}\x{ffff}", "utf8, REPLACEMENTLIST range");
+
+$s = "A\x{ffff}B\x{100}\0\x{fffe}\x{ffff}";
+$i = $s =~ tr/\x{ffff}//;
+is($i, 2, "utf8, count");
+
+$s = "A\x{ffff}\x{ffff}C";
+$s =~ tr/\x{ffff}/\x{100}/s;
+is($s, "A\x{100}C", "utf8, SQUASH");
+
+$s = "A\x{ffff}\x{ffff}\x{fffe}\x{fffe}\x{fffe}C";
+$s =~ tr/\x{fffe}\x{ffff}//s;
+is($s, "A\x{ffff}\x{fffe}C", "utf8, SQUASH");
+
+$s = "xAABBBy";
+$s =~ tr/AB/\x{ffff}/s;
+is($s, "x\x{ffff}y", "utf8, SQUASH");
+
+$s = "xAABBBy";
+$s =~ tr/AB/\x{fffe}\x{ffff}/s;
+is($s, "x\x{fffe}\x{ffff}y", "utf8, SQUASH");
+
+$s = "A\x{ffff}B\x{fffe}C";
+$s =~ tr/\x{fffe}\x{ffff}/x/c;
+is($s, "x\x{ffff}x\x{fffe}x", "utf8, COMPLEMENT");
+
+$s = "A\x{10000}B\x{2abcd}C";
+$s =~ tr/\0-\x{ffff}/x/c;
+is($s, "AxBxC", "utf8, COMPLEMENT range");
+
+$s = "A\x{fffe}B\x{ffff}C";
+$s =~ tr/\x{fffe}\x{ffff}/x/d;
+is($s, "AxBC", "utf8, DELETE");
+
+} # non-characters end
+
#define UTF8_ALLOW_EMPTY 0x0001
#define UTF8_ALLOW_CONTINUATION 0x0002
#define UTF8_ALLOW_NON_CONTINUATION 0x0004
-#define UTF8_ALLOW_FE_FF 0x0008
+#define UTF8_ALLOW_FE_FF 0x0008 /* Allow above 0x7fffFFFF */
#define UTF8_ALLOW_SHORT 0x0010
#define UTF8_ALLOW_SURROGATE 0x0020
-#define UTF8_ALLOW_FFFF 0x0040 /* Allows also FFFE. */
+#define UTF8_ALLOW_FFFF 0x0040 /* Allow UNICODE_ILLEGAL */
#define UTF8_ALLOW_LONG 0x0080
#define UTF8_ALLOW_ANYUV (UTF8_ALLOW_EMPTY|UTF8_ALLOW_FE_FF|\
UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
#define UTF8_ALLOW_ANY 0x00FF
#define UTF8_CHECK_ONLY 0x0200
+#define UTF8_ALLOW_DEFAULT (ckWARN(WARN_UTF8) ? 0 : \
+ UTF8_ALLOW_ANYUV)
#define UNICODE_SURROGATE_FIRST 0xD800
#define UNICODE_SURROGATE_LAST 0xDFFF
#define UNICODE_ALLOW_SURROGATE 0x0001 /* Allow UTF-16 surrogates (EVIL) */
#define UNICODE_ALLOW_FDD0 0x0002 /* Allow the U+FDD0...U+FDEF */
-#define UNICODE_ALLOW_FFFF 0x0004 /* Allow 0xFFF[EF], 0x1FFF[EF], ... */
-#define UNICODE_ALLOW_SUPER 0x0008 /* Allow past 10xFFFF */
+#define UNICODE_ALLOW_FFFF 0x0004 /* Allow U+FFF[EF], U+1FFF[EF], ... */
+#define UNICODE_ALLOW_SUPER 0x0008 /* Allow past 0x10FFFF */
#define UNICODE_ALLOW_ANY 0x000F
#define UNICODE_IS_SURROGATE(c) ((c) >= UNICODE_SURROGATE_FIRST && \