Newz(0, d, len*2+1, U8);
dstart = d;
while (s < send) {
- I32 ulen;
+ STRLEN ulen;
short c;
ulen = 1;
/* Need to check this, otherwise 128..255 won't match */
- c = utf8_to_uv_chk(s, &ulen, 0);
+ c = utf8_to_uv_chk(s, send - s, &ulen, 0);
if (c < 0x100 && (ch = tbl[(short)c]) >= 0) {
matches++;
if (ch < 0x80)
s += UTF8SKIP(s);
else {
UV c;
- I32 ulen;
+ STRLEN ulen;
ulen = 1;
if (hasutf)
- c = utf8_to_uv_chk(s,&ulen, 0);
+ c = utf8_to_uv_chk(s, send - s, &ulen, 0);
else
c = *s;
if (c < 0x100 && tbl[c] >= 0)
continue;
}
else if (uv == none) { /* "none" is unmapped character */
- I32 ulen;
- *d++ = (U8)utf8_to_uv_chk(s, &ulen, 0);
+ STRLEN ulen;
+ *d++ = (U8)utf8_to_uv_chk(s, send - s, &ulen, 0);
s += ulen;
puv = 0xfeedface;
continue;
continue;
}
else if (uv == none) { /* "none" is unmapped character */
- I32 ulen;
- *d++ = (U8)utf8_to_uv_chk(s, &ulen, 0);
+ STRLEN ulen;
+ *d++ = (U8)utf8_to_uv_chk(s, send - s, &ulen, 0);
s += ulen;
continue;
}
char *dcsave = dc;
STRLEN lulen = leftlen;
STRLEN rulen = rightlen;
- I32 ulen;
+ STRLEN ulen;
switch (optype) {
case OP_BIT_AND:
while (lulen && rulen) {
- luc = utf8_to_uv_chk((U8*)lc, &ulen, 0);
+ luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0);
lc += ulen;
lulen -= ulen;
- ruc = utf8_to_uv_chk((U8*)rc, &ulen, 0);
+ ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0);
rc += ulen;
rulen -= ulen;
duc = luc & ruc;
break;
case OP_BIT_XOR:
while (lulen && rulen) {
- luc = utf8_to_uv_chk((U8*)lc, &ulen, 0);
+ luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0);
lc += ulen;
lulen -= ulen;
- ruc = utf8_to_uv_chk((U8*)rc, &ulen, 0);
+ ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0);
rc += ulen;
rulen -= ulen;
duc = luc ^ ruc;
goto mop_up_utf;
case OP_BIT_OR:
while (lulen && rulen) {
- luc = utf8_to_uv_chk((U8*)lc, &ulen, 0);
+ luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0);
lc += ulen;
lulen -= ulen;
- ruc = utf8_to_uv_chk((U8*)rc, &ulen, 0);
+ ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0);
rc += ulen;
rulen -= ulen;
duc = luc | ruc;
#define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b)
#define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b)
#define utf8_to_uv(a,b) Perl_utf8_to_uv(aTHX_ a,b)
-#define utf8_to_uv_chk(a,b,c) Perl_utf8_to_uv_chk(aTHX_ a,b,c)
+#define utf8_to_uv_chk(a,b,c,d) Perl_utf8_to_uv_chk(aTHX_ a,b,c,d)
#define uv_to_utf8(a,b) Perl_uv_to_utf8(aTHX_ a,b)
#define vivify_defelem(a) Perl_vivify_defelem(aTHX_ a)
#define vivify_ref(a,b) Perl_vivify_ref(aTHX_ a,b)
p |OP* |scalarkids |OP* o
p |OP* |scalarseq |OP* o
p |OP* |scalarvoid |OP* o
-Ap |NV |scan_bin |char* start|I32 len|I32* retlen
-Ap |NV |scan_hex |char* start|I32 len|I32* retlen
+Ap |NV |scan_bin |char* start|STRLEN len|STRLEN* retlen
+Ap |NV |scan_hex |char* start|STRLEN len|STRLEN* retlen
Ap |char* |scan_num |char* s|YYSTYPE *lvalp
-Ap |NV |scan_oct |char* start|I32 len|I32* retlen
+Ap |NV |scan_oct |char* start|STRLEN len|STRLEN* retlen
p |OP* |scope |OP* o
Ap |char* |screaminstr |SV* bigsv|SV* littlesv|I32 start_shift \
|I32 end_shift|I32 *state|I32 last
Ap |U8* |utf8_hop |U8 *s|I32 off
ApM |U8* |utf8_to_bytes |U8 *s|STRLEN *len
ApM |U8* |bytes_to_utf8 |U8 *s|STRLEN *len
-Ap |UV |utf8_to_uv |U8 *s|I32* retlen
-Ap |UV |utf8_to_uv_chk |U8 *s|I32* retlen|bool checking
+Ap |UV |utf8_to_uv |U8 *s|STRLEN* retlen
+Ap |UV |utf8_to_uv_chk |U8 *s|STRLEN curlen|STRLEN* retlen|bool checking
Ap |U8* |uv_to_utf8 |U8 *d|UV uv
p |void |vivify_defelem |SV* sv
p |void |vivify_ref |SV* sv|U32 to_what
s |regnode*|reganode |U8|U32
s |regnode*|regatom |I32 *
s |regnode*|regbranch |I32 *|I32
-s |void |reguni |UV|char *|I32*
+s |void |reguni |UV|char *|STRLEN*
s |regnode*|regclass
s |regnode*|regclassutf8
s |I32 |regcurly |char *
#define isPSXSPC_utf8(c) (isSPACE_utf8(c) ||(c) == '\f')
#define isBLANK_utf8(c) isBLANK(c) /* could be wrong */
-#define isALNUM_LC_utf8(p) isALNUM_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isALPHA_LC_utf8(p) isALPHA_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isSPACE_LC_utf8(p) isSPACE_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isDIGIT_LC_utf8(p) isDIGIT_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isUPPER_LC_utf8(p) isUPPER_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isLOWER_LC_utf8(p) isLOWER_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isALNUMC_LC_utf8(p) isALNUMC_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isCNTRL_LC_utf8(p) isCNTRL_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isGRAPH_LC_utf8(p) isGRAPH_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isPRINT_LC_utf8(p) isPRINT_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isPUNCT_LC_utf8(p) isPUNCT_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define toUPPER_LC_utf8(p) toUPPER_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define toTITLE_LC_utf8(p) toTITLE_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define toLOWER_LC_utf8(p) toLOWER_LC_uni(utf8_to_uv_chk(p, 0, 0))
+#define STRLEN_MAX ((STRLEN)-1)
+
+#define isALNUM_LC_utf8(p) isALNUM_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isALPHA_LC_utf8(p) isALPHA_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isSPACE_LC_utf8(p) isSPACE_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isDIGIT_LC_utf8(p) isDIGIT_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isUPPER_LC_utf8(p) isUPPER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isLOWER_LC_utf8(p) isLOWER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isALNUMC_LC_utf8(p) isALNUMC_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isCNTRL_LC_utf8(p) isCNTRL_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isGRAPH_LC_utf8(p) isGRAPH_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isPRINT_LC_utf8(p) isPRINT_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isPUNCT_LC_utf8(p) isPUNCT_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define toUPPER_LC_utf8(p) toUPPER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define toTITLE_LC_utf8(p) toTITLE_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define toLOWER_LC_utf8(p) toLOWER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
#define isPSXSPC_LC_utf8(c) (isSPACE_LC_utf8(c) ||(c) == '\f')
#define isBLANK_LC_utf8(c) isBLANK(c) /* could be wrong */
SV* transv = 0;
U8* tend = t + tlen;
U8* rend = r + rlen;
- I32 ulen;
+ STRLEN ulen;
U32 tfirst = 1;
U32 tlast = 0;
I32 tdiff;
if (complement) {
U8 tmpbuf[UTF8_MAXLEN];
U8** cp;
+ I32* cl;
UV nextmin = 0;
New(1109, cp, tlen, U8*);
i = 0;
qsort(cp, i, sizeof(U8*), utf8compare);
for (j = 0; j < i; j++) {
U8 *s = cp[j];
- UV val = utf8_to_uv_chk(s, &ulen, 0);
+ I32 cur = j < i ? cp[j+1] - s : tend - s;
+ UV val = utf8_to_uv_chk(s, cur, &ulen, 0);
s += ulen;
diff = val - nextmin;
if (diff > 0) {
}
}
if (*s == 0xff)
- val = utf8_to_uv_chk(s+1, &ulen, 0);
+ val = utf8_to_uv_chk(s+1, cur - 1, &ulen, 0);
if (val >= nextmin)
nextmin = val + 1;
}
while (t < tend || tfirst <= tlast) {
/* see if we need more "t" chars */
if (tfirst > tlast) {
- tfirst = (I32)utf8_to_uv_chk(t, &ulen, 0);
+ tfirst = (I32)utf8_to_uv_chk(t, tend - t, &ulen, 0);
t += ulen;
if (t < tend && *t == 0xff) { /* illegal utf8 val indicates range */
- tlast = (I32)utf8_to_uv_chk(++t, &ulen, 0);
+ t++;
+ tlast = (I32)utf8_to_uv_chk(t, tend - t, &ulen, 0);
t += ulen;
}
else
/* now see if we need more "r" chars */
if (rfirst > rlast) {
if (r < rend) {
- rfirst = (I32)utf8_to_uv_chk(r, &ulen, 0);
+ rfirst = (I32)utf8_to_uv_chk(r, rend - r, &ulen, 0);
r += ulen;
if (r < rend && *r == 0xff) { /* illegal utf8 val indicates range */
- rlast = (I32)utf8_to_uv_chk(++r, &ulen, 0);
+ r++;
+ rlast = (I32)utf8_to_uv_chk(r, rend - r, &ulen, 0);
r += ulen;
}
else
char *
Perl_moreswitches(pTHX_ char *s)
{
- I32 numlen;
+ STRLEN numlen;
U32 rschar;
switch (*s) {
#undef Perl_scan_bin
NV
-Perl_scan_bin(pTHXo_ char* start, I32 len, I32* retlen)
+Perl_scan_bin(pTHXo_ char* start, STRLEN len, STRLEN* retlen)
{
return ((CPerlObj*)pPerl)->Perl_scan_bin(start, len, retlen);
}
#undef Perl_scan_hex
NV
-Perl_scan_hex(pTHXo_ char* start, I32 len, I32* retlen)
+Perl_scan_hex(pTHXo_ char* start, STRLEN len, STRLEN* retlen)
{
return ((CPerlObj*)pPerl)->Perl_scan_hex(start, len, retlen);
}
#undef Perl_scan_oct
NV
-Perl_scan_oct(pTHXo_ char* start, I32 len, I32* retlen)
+Perl_scan_oct(pTHXo_ char* start, STRLEN len, STRLEN* retlen)
{
return ((CPerlObj*)pPerl)->Perl_scan_oct(start, len, retlen);
}
#undef Perl_utf8_to_uv
UV
-Perl_utf8_to_uv(pTHXo_ U8 *s, I32* retlen)
+Perl_utf8_to_uv(pTHXo_ U8 *s, STRLEN* retlen)
{
return ((CPerlObj*)pPerl)->Perl_utf8_to_uv(s, retlen);
}
#undef Perl_utf8_to_uv_chk
UV
-Perl_utf8_to_uv_chk(pTHXo_ U8 *s, I32* retlen, bool checking)
+Perl_utf8_to_uv_chk(pTHXo_ U8 *s, STRLEN curlen, STRLEN* retlen, bool checking)
{
- return ((CPerlObj*)pPerl)->Perl_utf8_to_uv_chk(s, retlen, checking);
+ return ((CPerlObj*)pPerl)->Perl_utf8_to_uv_chk(s, curlen, retlen, checking);
}
#undef Perl_uv_to_utf8
If C<s> does not point to a well-formed UTF8 character, an optional UTF8
warning is produced.
- U8* s utf8_to_uv(I32 *retlen)
+ U8* s utf8_to_uv(STRLEN *retlen)
=for hackers
Found in file utf8.c
=item utf8_to_uv_chk
Returns the character value of the first character in the string C<s>
-which is assumed to be in UTF8 encoding; C<retlen> will be set to the
-length, in bytes, of that character, and the pointer C<s> will be
-advanced to the end of the character.
+which is assumed to be in UTF8 encoding and no longer than C<curlen>;
+C<retlen> will be set to the length, in bytes, of that character,
+and the pointer C<s> will be advanced to the end of the character.
If C<s> does not point to a well-formed UTF8 character, the behaviour
is dependent on the value of C<checking>: if this is true, it is
set C<retlen> to C<-1> and return. If C<checking> is not true, an optional UTF8
warning is produced.
- U8* s utf8_to_uv_chk(I32 *retlen, I32 checking)
+ U8* s utf8_to_uv_chk(STRLEN curlen, I32 *retlen, I32 checking)
=for hackers
Found in file utf8.c
appear if components are not found, or are too long. See
"PERLLIB_PREFIX" in L<perlos2>.
+=item Malformed UTF-8 character (%s)
+
+Perl detected something that didn't comply with UTF-8 encoding rules.
+
=item Malformed UTF-16 surrogate
Perl thought it was reading UTF-16 encoded character data but while
Regardless of the above, the C<bytes> pragma can always be used to force
byte semantics in a particular lexical scope. See L<bytes>.
+One effect of the C<utf8> pragma is that the internal UTF-8 decoding
+becomes stricter so that the character 0xFFFF (UTF-8 bytes 0xEF 0xBF
+0xBF), and the bytes 0xFE and 0xFF, start to cause warnings if they
+appear in the data.
+
The C<utf8> pragma is primarily a compatibility device that enables
recognition of UTF-8 in literals encountered by the parser. It may also
be used for enabling some of the more experimental Unicode support features.
STRLEN targlen = 0;
U8 *result;
U8 *send;
- I32 l;
+ STRLEN l;
send = tmps + len;
while (tmps < send) {
{
djSP; dTARGET;
char *tmps;
- I32 argtype;
+ STRLEN argtype;
STRLEN n_a;
tmps = POPpx;
{
djSP; dTARGET;
NV value;
- I32 argtype;
+ STRLEN argtype;
char *tmps;
STRLEN n_a;
{
djSP; dTARGET;
UV value;
- STRLEN n_a;
SV *tmpsv = POPs;
- U8 *tmps = (U8*)SvPVx(tmpsv,n_a);
- I32 retlen;
+ STRLEN len;
+ U8 *tmps = (U8*)SvPVx(tmpsv, len);
+ STRLEN retlen;
if ((*tmps & 0x80) && DO_UTF8(tmpsv))
- value = utf8_to_uv_chk(tmps, &retlen, 0);
+ value = utf8_to_uv_chk(tmps, len, &retlen, 0);
else
value = (UV)(*tmps & 255);
XPUSHu(value);
STRLEN slen;
if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) {
- I32 ulen;
+ STRLEN ulen;
U8 tmpbuf[UTF8_MAXLEN];
U8 *tend;
- UV uv = utf8_to_uv_chk(s, &ulen, 0);
+ UV uv = utf8_to_uv_chk(s, slen, &ulen, 0);
if (PL_op->op_private & OPpLOCALE) {
TAINT;
STRLEN slen;
if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) {
- I32 ulen;
+ STRLEN ulen;
U8 tmpbuf[UTF8_MAXLEN];
U8 *tend;
- UV uv = utf8_to_uv_chk(s, &ulen, 0);
+ UV uv = utf8_to_uv_chk(s, slen, &ulen, 0);
if (PL_op->op_private & OPpLOCALE) {
TAINT;
if (DO_UTF8(sv)) {
dTARGET;
- I32 ulen;
+ STRLEN ulen;
register U8 *d;
U8 *send;
TAINT;
SvTAINTED_on(TARG);
while (s < send) {
- d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv_chk(s, &ulen, 0)));
+ d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv_chk(s, len, &ulen, 0)));
s += ulen;
}
}
if (DO_UTF8(sv)) {
dTARGET;
- I32 ulen;
+ STRLEN ulen;
register U8 *d;
U8 *send;
TAINT;
SvTAINTED_on(TARG);
while (s < send) {
- d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv_chk(s, &ulen, 0)));
+ d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv_chk(s, len, &ulen, 0)));
s += ulen;
}
}
/* These must not be in registers: */
I16 ashort;
int aint;
- I32 along;
+ STRLEN along;
#ifdef HAS_QUAD
Quad_t aquad;
#endif
len = strend - s;
if (checksum) {
while (len-- > 0 && s < strend) {
- auint = utf8_to_uv_chk((U8*)s, &along, 0);
+ auint = utf8_to_uv_chk((U8*)s, strend - s, &along, 0);
s += along;
if (checksum > 32)
cdouble += (NV)auint;
EXTEND(SP, len);
EXTEND_MORTAL(len);
while (len-- > 0 && s < strend) {
- auint = utf8_to_uv_chk((U8*)s, &along, 0);
+ auint = utf8_to_uv_chk((U8*)s, strend - s, &along, 0);
s += along;
sv = NEWSV(37, 0);
sv_setuv(sv, (UV)auint);
if (SvNIOKp(sv)) {
if (SvPOK(sv) && SvNOK(sv)) { /* require v5.6.1 */
UV rev = 0, ver = 0, sver = 0;
- I32 len;
+ STRLEN len;
U8 *s = (U8*)SvPVX(sv);
U8 *end = (U8*)SvPVX(sv) + SvCUR(sv);
if (s < end) {
- rev = utf8_to_uv_chk(s, &len, 0);
+ rev = utf8_to_uv_chk(s, end - s, &len, 0);
s += len;
if (s < end) {
- ver = utf8_to_uv_chk(s, &len, 0);
+ ver = utf8_to_uv_chk(s, end - s, &len, 0);
s += len;
if (s < end)
- sver = utf8_to_uv_chk(s, &len, 0);
+ sver = utf8_to_uv_chk(s, end - s, &len, 0);
}
}
if (PERL_REVISION < rev
PERL_CALLCONV OP* Perl_scalarkids(pTHX_ OP* o);
PERL_CALLCONV OP* Perl_scalarseq(pTHX_ OP* o);
PERL_CALLCONV OP* Perl_scalarvoid(pTHX_ OP* o);
-PERL_CALLCONV NV Perl_scan_bin(pTHX_ char* start, I32 len, I32* retlen);
-PERL_CALLCONV NV Perl_scan_hex(pTHX_ char* start, I32 len, I32* retlen);
+PERL_CALLCONV NV Perl_scan_bin(pTHX_ char* start, STRLEN len, STRLEN* retlen);
+PERL_CALLCONV NV Perl_scan_hex(pTHX_ char* start, STRLEN len, STRLEN* retlen);
PERL_CALLCONV char* Perl_scan_num(pTHX_ char* s, YYSTYPE *lvalp);
-PERL_CALLCONV NV Perl_scan_oct(pTHX_ char* start, I32 len, I32* retlen);
+PERL_CALLCONV NV Perl_scan_oct(pTHX_ char* start, STRLEN len, STRLEN* retlen);
PERL_CALLCONV OP* Perl_scope(pTHX_ OP* o);
PERL_CALLCONV char* Perl_screaminstr(pTHX_ SV* bigsv, SV* littlesv, I32 start_shift, I32 end_shift, I32 *state, I32 last);
#if !defined(VMS)
PERL_CALLCONV U8* Perl_utf8_hop(pTHX_ U8 *s, I32 off);
PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len);
PERL_CALLCONV U8* Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len);
-PERL_CALLCONV UV Perl_utf8_to_uv(pTHX_ U8 *s, I32* retlen);
-PERL_CALLCONV UV Perl_utf8_to_uv_chk(pTHX_ U8 *s, I32* retlen, bool checking);
+PERL_CALLCONV UV Perl_utf8_to_uv(pTHX_ U8 *s, STRLEN* retlen);
+PERL_CALLCONV UV Perl_utf8_to_uv_chk(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, bool checking);
PERL_CALLCONV U8* Perl_uv_to_utf8(pTHX_ U8 *d, UV uv);
PERL_CALLCONV void Perl_vivify_defelem(pTHX_ SV* sv);
PERL_CALLCONV void Perl_vivify_ref(pTHX_ SV* sv, U32 to_what);
STATIC regnode* S_reganode(pTHX_ U8, U32);
STATIC regnode* S_regatom(pTHX_ I32 *);
STATIC regnode* S_regbranch(pTHX_ I32 *, I32);
-STATIC void S_reguni(pTHX_ UV, char *, I32*);
+STATIC void S_reguni(pTHX_ UV, char *, STRLEN*);
STATIC regnode* S_regclass(pTHX);
STATIC regnode* S_regclassutf8(pTHX);
STATIC I32 S_regcurly(pTHX_ char *);
/* FALL THROUGH */
default: {
- register I32 len;
+ register STRLEN len;
register UV ender;
register char *p;
char *oldp, *s;
- I32 numlen;
+ STRLEN numlen;
PL_regcomp_parse++;
default:
normal_default:
if ((*p & 0xc0) == 0xc0 && UTF) {
- ender = utf8_to_uv_chk((U8*)p, &numlen, 0);
+ ender = utf8_to_uv_chk((U8*)p, PL_regxend - p,
+ &numlen, 0);
p += numlen;
}
else
register I32 lastvalue = OOB_CHAR8;
register I32 range = 0;
register regnode *ret;
- I32 numlen;
+ STRLEN numlen;
I32 namedclass;
char *rangebegin;
bool need_class = 0;
register U32 lastvalue = OOB_UTF8;
register I32 range = 0;
register regnode *ret;
- I32 numlen;
+ STRLEN numlen;
I32 n;
SV *listsv;
U8 flags = 0;
namedclass = OOB_NAMEDCLASS;
if (!range)
rangebegin = PL_regcomp_parse;
- value = utf8_to_uv_chk((U8*)PL_regcomp_parse, &numlen, 0);
+ value = utf8_to_uv_chk((U8*)PL_regcomp_parse,
+ PL_regxend - PL_regcomp_parse,
+ &numlen, 0);
PL_regcomp_parse += numlen;
if (value == '[')
namedclass = regpposixcc(value);
else if (value == '\\') {
- value = (U32)utf8_to_uv_chk((U8*)PL_regcomp_parse, &numlen, 0);
+ value = (U32)utf8_to_uv_chk((U8*)PL_regcomp_parse,
+ PL_regxend - PL_regcomp_parse,
+ &numlen, 0);
PL_regcomp_parse += numlen;
/* Some compilers cannot handle switching on 64-bit integer
* values, therefore value cannot be an UV. Yes, this will
- reguni - emit (if appropriate) a Unicode character
*/
STATIC void
-S_reguni(pTHX_ UV uv, char* s, I32* lenp)
+S_reguni(pTHX_ UV uv, char* s, STRLEN* lenp)
{
dTHR;
if (SIZE_ONLY) {
PL_reg_flags |= RF_tainted;
/* FALL THROUGH */
case BOUNDUTF8:
- tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), 0, 0) : '\n';
+ tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1),
+ strend - s,
+ 0, 0) : '\n';
tmp = ((OP(c) == BOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
while (s < strend) {
if (tmp == !(OP(c) == BOUNDUTF8 ?
PL_reg_flags |= RF_tainted;
/* FALL THROUGH */
case NBOUNDUTF8:
- tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), 0, 0) : '\n';
+ tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1),
+ strend - s,
+ 0, 0) : '\n';
tmp = ((OP(c) == NBOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
while (s < strend) {
if (tmp == !(OP(c) == NBOUNDUTF8 ?
while (s < e) {
if (l >= PL_regeol)
sayNO;
- if (utf8_to_uv_chk((U8*)s, 0, 0) != (c1 ?
+ if (utf8_to_uv_chk((U8*)s, e - s, 0, 0) != (c1 ?
toLOWER_utf8((U8*)l) :
toLOWER_LC_utf8((U8*)l)))
{
case NBOUNDUTF8:
/* was last char in word? */
ln = (locinput != PL_regbol)
- ? utf8_to_uv_chk(reghop((U8*)locinput, -1), 0, 0) : PL_regprev;
+ ? utf8_to_uv_chk(reghop((U8*)locinput, -1),
+ PL_regeol - locinput, 0, 0) : PL_regprev;
if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
ln = isALNUM_uni(ln);
n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
case 'd':
case 'i':
if (vectorize) {
- I32 ulen;
+ STRLEN ulen;
if (!veclen) {
vectorize = FALSE;
break;
}
if (utf)
- iv = (IV)utf8_to_uv_chk(vecstr, &ulen, 0);
+ iv = (IV)utf8_to_uv_chk(vecstr, veclen, &ulen, 0);
else {
iv = *vecstr;
ulen = 1;
uns_integer:
if (vectorize) {
- I32 ulen;
+ STRLEN ulen;
vector:
if (!veclen) {
vectorize = FALSE;
break;
}
if (utf)
- uv = utf8_to_uv_chk(vecstr, &ulen, 0);
+ uv = utf8_to_uv_chk(vecstr, veclen, &ulen, 0);
else {
uv = *vecstr;
ulen = 1;
}
}
-print "1..103\n";
+print "1..181\n";
my $test = 1;
print "ok $test\n";
$test++;
}
+
+# This table is based on Markus Kuhn's UTF-8 Decode Stress Tester,
+# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
+# version dated 2000-09-02.
+
+my @MK = split(/\n/, <<__EOMK__);
+1 Correct UTF-8
+1.1.1 y "κόσμε" - 11 ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5 5
+2 Boundary conditions
+2.1 First possible sequence of certain length
+2.1.1 y "\0" 0 1 00 1
+2.1.2 y "\80" 80 2 c2:80 1
+2.1.3 y "ࠀ" 800 3 e0:a0:80 1
+2.1.4 y "𐀀" 10000 4 f0:90:80:80 1
+2.1.5 y "" 200000 5 f8:88:80:80:80 1
+2.1.6 y "" 4000000 6 fc:84:80:80:80:80 1
+2.2 Last possible sequence of certain length
+2.2.1 y "\7f" 7f 1 7f 1
+2.2.2 y "߿" 7ff 2 df:bf 1
+# The ffff is legal unless under use utf8
Software error:
Malformed UTF-8 character (fatal) at /var/www/git.shadowcat.co.uk/docroot/gitweb/gitweb.cgi line 1024, <$fd> line 809.
For help, please send mail to the webmaster (chrisj@shadowcatsystems.co.uk), giving this error message
and the time and date of the error.