From: Karl Williamson Date: Sun, 24 Nov 2019 18:20:11 +0000 (-0700) Subject: Update to work with Perl 5.31.7 X-Git-Tag: v0.006_020~7^2 X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=52790f4ff3b7f4801984e5ef6a7e2604b61a19bd;p=p5sagit%2FDevel-Declare.git Update to work with Perl 5.31.7 This deprecated module was using deprecated macros that recently have been removed. Any use of them should have generated a warning for the past two Perl releases. Nonetheless, no one apparently reported this to the module's bg tracker, and so, the issue was raised only when things stopped working. The code contains the file 'stolen_chunk_of_toke.c'. And this is as scary as it sounds. There was a similar situation a while back with B::Hooks::Parser, and that was solved by making public a few functions in toke.c that should be used outside the perl core only by that module. It makes sense for that module to have access to these, given its purpose, and it becomes innoculated against most future changes to the parser. Devel::Declare, unfortunately is using an earlier version of toke.c, and so the above approach doesn't work, because the parameters to the functions have changed, and the code is highly entwined with various interpreter level lexing variables, expecting them to work in the way it has coded for them, which may not be the case now or at some point in the future if it used the real toke.c functions. So its best to try to get this stolen chunk to work. A problem is that there are bugs in the way toke.c worked at the time this was stolen. I have fixed only the most obvious. There are two main issues. The easiest is that this won't compile because of security-related changes in blead. This is why most of these macros were deprecated: they allow a potential read past the end of the buffer. That means an extra parameter must be passed, giving the upper limit for the buffer. There have been versions of the macros available for several releases that takes the extra parameter, so all we have to do is convert to use those versions when available, and the old versions when not. The harder is that these use is_utf8_mark() to find the ends of identifiers being parsed. This is long obsolete, valid mostly only for Western Latin-based languages. Unicode came up with a better scheme many versions ago, which is to define a property that indicates if a character continues an identifier or not. Perl 5.31.7 finally removed the old deprecated function. I have tried to change the code here that used it without changing the results when using inputs that used to be valid. But there was a bug in the ordering of this. The UTF-8 case must come first, so I reordered it. Also, the new toke.c code has a do-while loop for the non-UTF8 case. I copied that too, without investigating why that change was made in the modern toke.c. This now passes all its tests on blead. I did find some omissions in blead that I fixed to get this to pass. --- diff --git a/stolen_chunk_of_toke.c b/stolen_chunk_of_toke.c index b9e5037..038a91c 100644 --- a/stolen_chunk_of_toke.c +++ b/stolen_chunk_of_toke.c @@ -186,14 +186,6 @@ static SUBLEXINFO *my_Isublex_info_ptr(pTHX) { return &(aTHX->Isublex_info); } #define SvPV_nolen_const SvPV_nolen #endif -/* Name changed in 5.17; use new name in our code. Apparently we're meant - to use something else instead, but no non-underscored way to achieve - this is apparent. */ - -#ifndef _is_utf8_mark -#define _is_utf8_mark is_utf8_mark -#endif - /* utf8_to_uvchr_buf() not defined in earlier perls, but less-capable * substitute is available */ @@ -201,6 +193,18 @@ static SUBLEXINFO *my_Isublex_info_ptr(pTHX) { return &(aTHX->Isublex_info); } #define utf8_to_uvchr_buf(s, e, lp) ((e), utf8_to_uvchr(s, lp)) #endif +#ifndef isIDFIRST_lazy_if_safe +# define isIDFIRST_lazy_if_safe(p,e,UTF) \ + ((! UTF || p > e) ? isIDFIRST_lazy_if(p,UTF) : 0) +#endif +#ifndef isALNUM_lazy_if_safe +# define isALNUM_lazy_if_safe(p,e,UTF) \ + ((! UTF || p > e) ? isALNUM_lazy_if(p,UTF) : 0) +#endif +#ifndef isALNUM_utf8_safe +# define isALNUM_utf8_safe(p,e) ((p > e) ? isALNUM_utf8(p) : 0) +#endif + /* and now we're back to the toke.c stuff again (mst) */ static const char ident_too_long[] = @@ -435,9 +439,30 @@ S_scan_word(pTHX_ register char *s, char *dest, STRLEN destlen, int allow_packag for (;;) { if (d >= e) Perl_croak(aTHX_ ident_too_long); - if (isALNUM(*s)) /* UTF handled below */ - *d++ = *s++; - else if (*s == '\'' && allow_package && isIDFIRST_lazy_if(s+1,UTF)) { + if (UTF && isIDFIRST_utf8_safe((const U8*) s, (const U8*) PL_bufend)) { + /* The UTF-8 case must come first, otherwise things + * like c\N{COMBINING TILDE} would start failing, as the + * isALNUM case below would gobble the 'c' up. + */ + + char *t = s + UTF8SKIP(s); + while (isIDCONT_utf8_safe((const U8*) t, (const U8*) PL_bufend)) { + t += UTF8SKIP(t); + } + if (d + (t - s) > e) + Perl_croak(aTHX_ "%s", ident_too_long); + Copy(s, d, t - s, char); + *d += t - s; + s = t; + } + else if (isALNUM(*s)) + do { + *d++ = *s++; + } while (isWORDCHAR_A(*s) && d < e); + else if ( *s == '\'' + && allow_package + && isIDFIRST_lazy_if_safe(s+1, PL_bufend, UTF)) + { *d++ = ':'; *d++ = ':'; s++; @@ -446,16 +471,6 @@ S_scan_word(pTHX_ register char *s, char *dest, STRLEN destlen, int allow_packag *d++ = *s++; *d++ = *s++; } - else if (UTF && UTF8_IS_START(*s) && isALNUM_utf8((U8*)s)) { - char *t = s + UTF8SKIP(s); - while (UTF8_IS_CONTINUED(*t) && _is_utf8_mark((U8*)t)) - t += UTF8SKIP(t); - if (d + (t - s) > e) - Perl_croak(aTHX_ ident_too_long); - Copy(s, d, t - s, char); - d += t - s; - s = t; - } else { *d = '\0'; *slp = d - dest; @@ -921,9 +936,27 @@ S_scan_ident(pTHX_ register char *s, register const char *send, char *dest, STRL for (;;) { if (d >= e) Perl_croak(aTHX_ ident_too_long); - if (isALNUM(*s)) /* UTF handled below */ - *d++ = *s++; - else if (*s == '\'' && isIDFIRST_lazy_if(s+1,UTF)) { + if (UTF && isIDFIRST_utf8_safe((const U8*) s, (const U8*) PL_bufend)) { + /* The UTF-8 case must come first, otherwise things + * like c\N{COMBINING TILDE} would start failing, as the + * isALNUM case below would gobble the 'c' up. + */ + + char *t = s + UTF8SKIP(s); + while (isIDCONT_utf8_safe((const U8*) t, (const U8*) PL_bufend)) { + t += UTF8SKIP(t); + } + if (d + (t - s) > e) + Perl_croak(aTHX_ "%s", ident_too_long); + Copy(s, d, t - s, char); + *d += t - s; + s = t; + } + else if (isALNUM(*s)) + do { + *d++ = *s++; + } while (isWORDCHAR_A(*s) && d < e); + else if (*s == '\'' && isIDFIRST_lazy_if_safe(s+1,send,UTF)) { *d++ = ':'; *d++ = ':'; s++; @@ -932,16 +965,6 @@ S_scan_ident(pTHX_ register char *s, register const char *send, char *dest, STRL *d++ = *s++; *d++ = *s++; } - else if (UTF && UTF8_IS_START(*s) && isALNUM_utf8((U8*)s)) { - char *t = s + UTF8SKIP(s); - while (UTF8_IS_CONTINUED(*t) && _is_utf8_mark((U8*)t)) - t += UTF8SKIP(t); - if (d + (t - s) > e) - Perl_croak(aTHX_ ident_too_long); - Copy(s, d, t - s, char); - d += t - s; - s = t; - } else break; } @@ -954,7 +977,8 @@ S_scan_ident(pTHX_ register char *s, register const char *send, char *dest, STRL return s; } if (*s == '$' && s[1] && - (isALNUM_lazy_if(s+1,UTF) || s[1] == '$' || s[1] == '{' || strnEQ(s+1,"::",2)) ) + ( isALNUM_lazy_if_safe(s+1,send,UTF) + || s[1] == '$' || s[1] == '{' || strnEQ(s+1,"::",2)) ) { return s; } @@ -982,13 +1006,16 @@ S_scan_ident(pTHX_ register char *s, register const char *send, char *dest, STRL } } } - if (isIDFIRST_lazy_if(d,UTF)) { + if (isIDFIRST_lazy_if_safe(d,d+destlen,UTF)) { d++; if (UTF) { e = s; - while ((e < send && isALNUM_lazy_if(e,UTF)) || *e == ':') { + while ( ( e < send + && isIDFIRST_utf8_safe(e, send) + || *e == ':')) + { e += UTF8SKIP(e); - while (e < send && UTF8_IS_CONTINUED(*e) && _is_utf8_mark((U8*)e)) + while (e < send && isIDFIRST_utf8_safe(e, send)) e += UTF8SKIP(e); } Copy(s, d, e - s, char);