From: Eric Brine Date: Fri, 12 Feb 2010 01:28:29 +0000 (-0500) Subject: Removes 32-bit limit on substr arguments. The full range of IV and UV is available... X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=777f7c561610dee641c77666e5a4a0d9ac1d4230;p=p5sagit%2Fp5-mst-13.2.git Removes 32-bit limit on substr arguments. The full range of IV and UV is available for the pos and len arguments, with safe conversion to STRLEN where it's smaller than an IV. --- diff --git a/embed.fnc b/embed.fnc index 7463274..7e450aa 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1165,6 +1165,7 @@ ApdR |SV* |sv_newmortal Apd |SV* |sv_newref |NULLOK SV *const sv Ap |char* |sv_peek |NULLOK SV* sv Apd |void |sv_pos_u2b |NULLOK SV *const sv|NN I32 *const offsetp|NULLOK I32 *const lenp +Apd |void |sv_pos_u2b_proper|NULLOK SV *const sv|NN STRLEN *const offsetp|NULLOK STRLEN *const lenp Apd |void |sv_pos_b2u |NULLOK SV *const sv|NN I32 *const offsetp Amdb |char* |sv_pvn_force |NN SV* sv|NULLOK STRLEN* lp Apd |char* |sv_pvutf8n_force|NN SV *const sv|NULLOK STRLEN *const lp diff --git a/embed.h b/embed.h index 246106b..1281fcc 100644 --- a/embed.h +++ b/embed.h @@ -967,6 +967,7 @@ #define sv_newref Perl_sv_newref #define sv_peek Perl_sv_peek #define sv_pos_u2b Perl_sv_pos_u2b +#define sv_pos_u2b_proper Perl_sv_pos_u2b_proper #define sv_pos_b2u Perl_sv_pos_b2u #define sv_pvutf8n_force Perl_sv_pvutf8n_force #define sv_pvbyten_force Perl_sv_pvbyten_force @@ -3371,6 +3372,7 @@ #define sv_newref(a) Perl_sv_newref(aTHX_ a) #define sv_peek(a) Perl_sv_peek(aTHX_ a) #define sv_pos_u2b(a,b,c) Perl_sv_pos_u2b(aTHX_ a,b,c) +#define sv_pos_u2b_proper(a,b,c) Perl_sv_pos_u2b_proper(aTHX_ a,b,c) #define sv_pos_b2u(a,b) Perl_sv_pos_b2u(aTHX_ a,b) #define sv_pvutf8n_force(a,b) Perl_sv_pvutf8n_force(aTHX_ a,b) #define sv_pvbyten_force(a,b) Perl_sv_pvbyten_force(aTHX_ a,b) diff --git a/global.sym b/global.sym index f0361df..f0e462e 100644 --- a/global.sym +++ b/global.sym @@ -567,6 +567,7 @@ Perl_sv_newmortal Perl_sv_newref Perl_sv_peek Perl_sv_pos_u2b +Perl_sv_pos_u2b_proper Perl_sv_pos_b2u Perl_sv_pvn_force Perl_sv_pvutf8n_force diff --git a/mg.c b/mg.c index b9a1464..4f8207c 100644 --- a/mg.c +++ b/mg.c @@ -2008,17 +2008,17 @@ Perl_magic_getsubstr(pTHX_ SV *sv, MAGIC *mg) STRLEN len; SV * const lsv = LvTARG(sv); const char * const tmps = SvPV_const(lsv,len); - I32 offs = LvTARGOFF(sv); - I32 rem = LvTARGLEN(sv); + STRLEN offs = LvTARGOFF(sv); + STRLEN rem = LvTARGLEN(sv); PERL_ARGS_ASSERT_MAGIC_GETSUBSTR; PERL_UNUSED_ARG(mg); if (SvUTF8(lsv)) - sv_pos_u2b(lsv, &offs, &rem); - if (offs > (I32)len) + sv_pos_u2b_proper(lsv, &offs, &rem); + if (offs > len) offs = len; - if (rem + offs > (I32)len) + if (rem > len - offs) rem = len - offs; sv_setpvn(sv, tmps + offs, (STRLEN)rem); if (SvUTF8(lsv)) @@ -2033,22 +2033,22 @@ Perl_magic_setsubstr(pTHX_ SV *sv, MAGIC *mg) STRLEN len; const char * const tmps = SvPV_const(sv, len); SV * const lsv = LvTARG(sv); - I32 lvoff = LvTARGOFF(sv); - I32 lvlen = LvTARGLEN(sv); + STRLEN lvoff = LvTARGOFF(sv); + STRLEN lvlen = LvTARGLEN(sv); PERL_ARGS_ASSERT_MAGIC_SETSUBSTR; PERL_UNUSED_ARG(mg); if (DO_UTF8(sv)) { sv_utf8_upgrade(lsv); - sv_pos_u2b(lsv, &lvoff, &lvlen); + sv_pos_u2b_proper(lsv, &lvoff, &lvlen); sv_insert(lsv, lvoff, lvlen, tmps, len); LvTARGLEN(sv) = sv_len_utf8(sv); SvUTF8_on(lsv); } else if (lsv && SvUTF8(lsv)) { const char *utf8; - sv_pos_u2b(lsv, &lvoff, &lvlen); + sv_pos_u2b_proper(lsv, &lvoff, &lvlen); LvTARGLEN(sv) = len; utf8 = (char*)bytes_to_utf8((U8*)tmps, &len); sv_insert(lsv, lvoff, lvlen, utf8, len); @@ -2059,7 +2059,6 @@ Perl_magic_setsubstr(pTHX_ SV *sv, MAGIC *mg) LvTARGLEN(sv) = len; } - return 0; } diff --git a/pp.c b/pp.c index 2f4703b..95dc5fd 100644 --- a/pp.c +++ b/pp.c @@ -3079,15 +3079,19 @@ PP(pp_substr) { dVAR; dSP; dTARGET; SV *sv; - I32 len = 0; STRLEN curlen; STRLEN utf8_curlen; - I32 pos; - I32 rem; - I32 fail; + SV * pos_sv; + IV pos1_iv; + int pos1_is_uv; + IV pos2_iv; + int pos2_is_uv; + SV * len_sv; + IV len_iv = 0; + int len_is_uv = 1; const I32 lvalue = PL_op->op_flags & OPf_MOD || LVRET; const char *tmps; - const I32 arybase = CopARYBASE_get(PL_curcop); + const IV arybase = CopARYBASE_get(PL_curcop); SV *repl_sv = NULL; const char *repl = NULL; STRLEN repl_len; @@ -3103,9 +3107,13 @@ PP(pp_substr) repl = SvPV_const(repl_sv, repl_len); repl_is_utf8 = DO_UTF8(repl_sv) && SvCUR(repl_sv); } - len = POPi; + len_sv = POPs; + len_iv = SvIV(len_sv); + len_is_uv = SvIOK_UV(len_sv); } - pos = POPi; + pos_sv = POPs; + pos1_iv = SvIV(pos_sv); + pos1_is_uv = SvIOK_UV(pos_sv); sv = POPs; PUTBACK; if (repl_sv) { @@ -3127,51 +3135,80 @@ PP(pp_substr) else utf8_curlen = 0; - if (pos >= arybase) { - pos -= arybase; - rem = curlen-pos; - fail = rem; - if (num_args > 2) { - if (len < 0) { - rem += len; - if (rem < 0) - rem = 0; - } - else if (rem > len) - rem = len; + if ( (pos1_is_uv && arybase < 0) || (pos1_iv >= arybase) ) { /* pos >= $[ */ + UV pos1_uv = pos1_iv-arybase; + /* Overflow can occur when $[ < 0 */ + if (arybase < 0 && pos1_uv < (UV)pos1_iv) + goto BOUND_FAIL; + pos1_iv = pos1_uv; + pos1_is_uv = 1; + } + else if (pos1_is_uv ? (UV)pos1_iv > 0 : pos1_iv > 0) { + goto BOUND_FAIL; /* $[=3; substr($_,2,...) */ + } + else { /* pos < $[ */ + if (pos1_iv == 0) { /* $[=1; substr($_,0,...) */ + pos1_iv = curlen; + pos1_is_uv = 1; + } else { + if (curlen) { + pos1_is_uv = curlen-1 > ~(UV)pos1_iv; + pos1_iv += curlen; + } } } - else { - pos += curlen; - if (num_args < 3) - rem = curlen; - else if (len >= 0) { - rem = pos+len; - if (rem > (I32)curlen) - rem = curlen; + if (pos1_is_uv || pos1_iv > 0) { + if ((UV)pos1_iv > curlen) + goto BOUND_FAIL; + } + + if (num_args > 2) { + if (!len_is_uv && len_iv < 0) { + pos2_iv = curlen + len_iv; + if (curlen) + pos2_is_uv = curlen-1 > ~(UV)len_iv; + else + pos2_is_uv = 0; + } else { /* len_iv >= 0 */ + if (!pos1_is_uv && pos1_iv < 0) { + pos2_iv = pos1_iv + len_iv; + pos2_is_uv = (UV)len_iv > (UV)IV_MAX; + } else { + if ((UV)len_iv > curlen-(UV)pos1_iv) + pos2_iv = curlen; + else + pos2_iv = pos1_iv+len_iv; + pos2_is_uv = 1; + } } - else { - rem = curlen+len; - if (rem < pos) - rem = pos; - } - if (pos < 0) - pos = 0; - fail = rem; - rem -= pos; - } - if (fail < 0) { - if (lvalue || repl) - Perl_croak(aTHX_ "substr outside of string"); - Perl_ck_warner(aTHX_ packWARN(WARN_SUBSTR), "substr outside of string"); - RETPUSHUNDEF; } else { - const I32 upos = pos; - const I32 urem = rem; + pos2_iv = curlen; + pos2_is_uv = 1; + } + + if (!pos2_is_uv && pos2_iv < 0) { + if (!pos1_is_uv && pos1_iv < 0) + goto BOUND_FAIL; + pos2_iv = 0; + } + else if (!pos1_is_uv && pos1_iv < 0) + pos1_iv = 0; + + if ((UV)pos2_iv < (UV)pos1_iv) + pos2_iv = pos1_iv; + if ((UV)pos2_iv > curlen) + pos2_iv = curlen; + + { + /* pos1_iv and pos2_iv both in 0..curlen, so the cast is safe */ + const STRLEN pos = (STRLEN)( (UV)pos1_iv ); + const STRLEN len = (STRLEN)( (UV)pos2_iv - (UV)pos1_iv ); + STRLEN byte_pos = pos; + STRLEN byte_len = len; if (utf8_curlen) - sv_pos_u2b(sv, (I32 *)&pos, (I32 *)&rem); - tmps += pos; + sv_pos_u2b_proper(sv, &byte_pos, &byte_len); + tmps += byte_pos; /* we either return a PV or an LV. If the TARG hasn't been used * before, or is of that type, reuse it; otherwise use a mortal * instead. Note that LVs can have an extended lifetime, so also @@ -3185,7 +3222,7 @@ PP(pp_substr) } } - sv_setpvn(TARG, tmps, rem); + sv_setpvn(TARG, tmps, byte_len); #ifdef USE_LOCALE_COLLATE sv_unmagic(TARG, PERL_MAGIC_collxfrm); #endif @@ -3202,7 +3239,7 @@ PP(pp_substr) } if (!SvOK(sv)) sv_setpvs(sv, ""); - sv_insert_flags(sv, pos, rem, repl, repl_len, 0); + sv_insert_flags(sv, byte_pos, byte_len, repl, repl_len, 0); if (repl_is_utf8) SvUTF8_on(sv); SvREFCNT_dec(repl_sv_copy); @@ -3232,13 +3269,19 @@ PP(pp_substr) SvREFCNT_dec(LvTARG(TARG)); LvTARG(TARG) = SvREFCNT_inc_simple(sv); } - LvTARGOFF(TARG) = upos; - LvTARGLEN(TARG) = urem; + LvTARGOFF(TARG) = pos; + LvTARGLEN(TARG) = len; } } SPAGAIN; PUSHs(TARG); /* avoid SvSETMAGIC here */ RETURN; + +BOUND_FAIL: + if (lvalue || repl) + Perl_croak(aTHX_ "substr outside of string"); + Perl_ck_warner(aTHX_ packWARN(WARN_SUBSTR), "substr outside of string"); + RETPUSHUNDEF; } PP(pp_vec) diff --git a/proto.h b/proto.h index 4a343be..ae48597 100644 --- a/proto.h +++ b/proto.h @@ -3374,6 +3374,11 @@ PERL_CALLCONV void Perl_sv_pos_u2b(pTHX_ SV *const sv, I32 *const offsetp, I32 * #define PERL_ARGS_ASSERT_SV_POS_U2B \ assert(offsetp) +PERL_CALLCONV void Perl_sv_pos_u2b_proper(pTHX_ SV *const sv, STRLEN *const offsetp, STRLEN *const lenp) + __attribute__nonnull__(pTHX_2); +#define PERL_ARGS_ASSERT_SV_POS_U2B_PROPER \ + assert(offsetp) + PERL_CALLCONV void Perl_sv_pos_b2u(pTHX_ SV *const sv, I32 *const offsetp) __attribute__nonnull__(pTHX_2); #define PERL_ARGS_ASSERT_SV_POS_B2U \ diff --git a/sv.c b/sv.c index 4ab41f6..02be580 100644 --- a/sv.c +++ b/sv.c @@ -6240,7 +6240,7 @@ S_sv_pos_u2b_cached(pTHX_ SV *const sv, MAGIC **const mgp, const U8 *const start /* -=for apidoc sv_pos_u2b +=for apidoc sv_pos_u2b_proper Converts the value pointed to by offsetp from a count of UTF-8 chars from the start of the string, to a count of the equivalent number of bytes; if @@ -6252,14 +6252,14 @@ type coercion. */ /* - * sv_pos_u2b() uses, like sv_pos_b2u(), the mg_ptr of the potential + * sv_pos_u2b_proper() uses, like sv_pos_b2u(), the mg_ptr of the potential * PERL_MAGIC_utf8 of the sv to store the mapping between UTF-8 and * byte offsets. See also the comments of S_utf8_mg_pos_cache_update(). * */ void -Perl_sv_pos_u2b(pTHX_ register SV *const sv, I32 *const offsetp, I32 *const lenp) +Perl_sv_pos_u2b_proper(pTHX_ register SV *const sv, STRLEN *const offsetp, STRLEN *const lenp) { const U8 *start; STRLEN len; @@ -6271,17 +6271,17 @@ Perl_sv_pos_u2b(pTHX_ register SV *const sv, I32 *const offsetp, I32 *const lenp start = (U8*)SvPV_const(sv, len); if (len) { - STRLEN uoffset = (STRLEN) *offsetp; + STRLEN uoffset = *offsetp; const U8 * const send = start + len; MAGIC *mg = NULL; const STRLEN boffset = sv_pos_u2b_cached(sv, &mg, start, send, uoffset, 0, 0); - *offsetp = (I32) boffset; + *offsetp = boffset; if (lenp) { /* Convert the relative offset to absolute. */ - const STRLEN uoffset2 = uoffset + (STRLEN) *lenp; + const STRLEN uoffset2 = uoffset + *lenp; const STRLEN boffset2 = sv_pos_u2b_cached(sv, &mg, start, send, uoffset2, uoffset, boffset) - boffset; @@ -6298,6 +6298,41 @@ Perl_sv_pos_u2b(pTHX_ register SV *const sv, I32 *const offsetp, I32 *const lenp return; } +/* +=for apidoc sv_pos_u2b + +Converts the value pointed to by offsetp from a count of UTF-8 chars from +the start of the string, to a count of the equivalent number of bytes; if +lenp is non-zero, it does the same to lenp, but this time starting from +the offset, rather than from the start of the string. Handles magic and +type coercion. + +=cut +*/ + +/* + * sv_pos_u2b() uses, like sv_pos_b2u(), the mg_ptr of the potential + * PERL_MAGIC_utf8 of the sv to store the mapping between UTF-8 and + * byte offsets. See also the comments of S_utf8_mg_pos_cache_update(). + * + */ + +/* This function is subject to size and sign problems */ + +void +Perl_sv_pos_u2b(pTHX_ register SV *const sv, I32 *const offsetp, I32 *const lenp) +{ + STRLEN uoffset = (STRLEN)*offsetp; + if (lenp) { + STRLEN ulen = (STRLEN)*lenp; + sv_pos_u2b_proper(sv, &uoffset, &ulen); + *lenp = (I32)ulen; + } else { + sv_pos_u2b_proper(sv, &uoffset, NULL); + } + *offsetp = (I32)uoffset; +} + /* Create and update the UTF8 magic offset cache, with the proffered utf8/ byte length pairing. The (byte) length of the total SV is passed in too, as blen, because for some (more esoteric) SVs, the call to SvPV_const() diff --git a/t/re/substr.t b/t/re/substr.t index c3fa6e1..d0717ba 100644 --- a/t/re/substr.t +++ b/t/re/substr.t @@ -24,7 +24,7 @@ $SIG{__WARN__} = sub { require './test.pl'; -plan(334); +plan(360); run_tests() unless caller; @@ -201,6 +201,11 @@ is($w--, 1); eval{substr($a,1) = "" ; }; # P=R=S Q like($@, $FATAL_MSG); +$b = substr($a,-7,-6) ; # warn # Q R P S +is($w--, 1); +eval{substr($a,-7,-6) = "" ; }; # Q R P S +like($@, $FATAL_MSG); + my $a = 'zxcvbnm'; substr($a,2,0) = ''; is($a, 'zxcvbnm'); @@ -682,4 +687,39 @@ is($x, "\x{100}\x{200}\xFFb"); is(substr($a,1,1), 'b'); } +# [perl #62646] offsets exceeding 32 bits on 64-bit system +SKIP: { + skip("32-bit system", 24) unless ~0 > 0xffffffff; + my $a = "abc"; + my $s; + my $r; + + utf8::downgrade($a); + for (1..2) { + $w = 0; + $r = substr($a, 0xffffffff, 1); + is($r, undef); + is($w, 1); + + $w = 0; + $r = substr($a, 0xffffffff+1, 1); + is($r, undef); + is($w, 1); + + $w = 0; + ok( !eval { $r = substr($s=$a, 0xffffffff, 1, "_"); 1 } ); + is($r, undef); + is($s, $a); + is($w, 0); + + $w = 0; + ok( !eval { $r = substr($s=$a, 0xffffffff+1, 1, "_"); 1 } ); + is($r, undef); + is($s, $a); + is($w, 0); + + utf8::upgrade($a); + } +} + }