X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=stolen_chunk_of_toke.c;h=038a91c345a2c78626855b6b103d5aea7dd48309;hb=03b20d2d9f560a40f2509ea8af4bc81e106c62d2;hp=6da1a819ee029d5043b692a06aeca0e81f4938b4;hpb=1479b0b1d8ae6404db89395beb373cce9cbb4834;p=p5sagit%2FDevel-Declare.git diff --git a/stolen_chunk_of_toke.c b/stolen_chunk_of_toke.c index 6da1a81..038a91c 100644 --- a/stolen_chunk_of_toke.c +++ b/stolen_chunk_of_toke.c @@ -186,6 +186,25 @@ static SUBLEXINFO *my_Isublex_info_ptr(pTHX) { return &(aTHX->Isublex_info); } #define SvPV_nolen_const SvPV_nolen #endif +/* utf8_to_uvchr_buf() not defined in earlier perls, but less-capable + * substitute is available */ + +#ifndef utf8_to_uvchr_buf +#define utf8_to_uvchr_buf(s, e, lp) ((e), utf8_to_uvchr(s, lp)) +#endif + +#ifndef isIDFIRST_lazy_if_safe +# define isIDFIRST_lazy_if_safe(p,e,UTF) \ + ((! UTF || p > e) ? isIDFIRST_lazy_if(p,UTF) : 0) +#endif +#ifndef isALNUM_lazy_if_safe +# define isALNUM_lazy_if_safe(p,e,UTF) \ + ((! UTF || p > e) ? isALNUM_lazy_if(p,UTF) : 0) +#endif +#ifndef isALNUM_utf8_safe +# define isALNUM_utf8_safe(p,e) ((p > e) ? isALNUM_utf8(p) : 0) +#endif + /* and now we're back to the toke.c stuff again (mst) */ static const char ident_too_long[] = @@ -327,7 +346,7 @@ S_skipspace(pTHX_ register char *s, int incline) * of the buffer, we're not reading from a source filter, and * we're in normal lexing mode */ - if (s < PL_bufend || !PL_rsfp || PL_sublex_info.sub_inwhat || + if (s < PL_bufend || !PL_rsfp || PL_lex_inwhat || PL_lex_state == LEX_FORMLINE) return s; @@ -420,9 +439,30 @@ S_scan_word(pTHX_ register char *s, char *dest, STRLEN destlen, int allow_packag for (;;) { if (d >= e) Perl_croak(aTHX_ ident_too_long); - if (isALNUM(*s)) /* UTF handled below */ - *d++ = *s++; - else if (*s == '\'' && allow_package && isIDFIRST_lazy_if(s+1,UTF)) { + if (UTF && isIDFIRST_utf8_safe((const U8*) s, (const U8*) PL_bufend)) { + /* The UTF-8 case must come first, otherwise things + * like c\N{COMBINING TILDE} would start failing, as the + * isALNUM case below would gobble the 'c' up. + */ + + char *t = s + UTF8SKIP(s); + while (isIDCONT_utf8_safe((const U8*) t, (const U8*) PL_bufend)) { + t += UTF8SKIP(t); + } + if (d + (t - s) > e) + Perl_croak(aTHX_ "%s", ident_too_long); + Copy(s, d, t - s, char); + *d += t - s; + s = t; + } + else if (isALNUM(*s)) + do { + *d++ = *s++; + } while (isWORDCHAR_A(*s) && d < e); + else if ( *s == '\'' + && allow_package + && isIDFIRST_lazy_if_safe(s+1, PL_bufend, UTF)) + { *d++ = ':'; *d++ = ':'; s++; @@ -431,16 +471,6 @@ S_scan_word(pTHX_ register char *s, char *dest, STRLEN destlen, int allow_packag *d++ = *s++; *d++ = *s++; } - else if (UTF && UTF8_IS_START(*s) && isALNUM_utf8((U8*)s)) { - char *t = s + UTF8SKIP(s); - while (UTF8_IS_CONTINUED(*t) && is_utf8_mark((U8*)t)) - t += UTF8SKIP(t); - if (d + (t - s) > e) - Perl_croak(aTHX_ ident_too_long); - Copy(s, d, t - s, char); - d += t - s; - s = t; - } else { *d = '\0'; *slp = d - dest; @@ -629,7 +659,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) termlen = 1; } else { - termcode = utf8_to_uvchr((U8*)s, &termlen); + termcode = utf8_to_uvchr_buf((U8*)s, PL_bufend, &termlen); Copy(s, termstr, termlen, U8); if (!UTF8_IS_INVARIANT(term)) has_utf8 = TRUE; @@ -906,9 +936,27 @@ S_scan_ident(pTHX_ register char *s, register const char *send, char *dest, STRL for (;;) { if (d >= e) Perl_croak(aTHX_ ident_too_long); - if (isALNUM(*s)) /* UTF handled below */ - *d++ = *s++; - else if (*s == '\'' && isIDFIRST_lazy_if(s+1,UTF)) { + if (UTF && isIDFIRST_utf8_safe((const U8*) s, (const U8*) PL_bufend)) { + /* The UTF-8 case must come first, otherwise things + * like c\N{COMBINING TILDE} would start failing, as the + * isALNUM case below would gobble the 'c' up. + */ + + char *t = s + UTF8SKIP(s); + while (isIDCONT_utf8_safe((const U8*) t, (const U8*) PL_bufend)) { + t += UTF8SKIP(t); + } + if (d + (t - s) > e) + Perl_croak(aTHX_ "%s", ident_too_long); + Copy(s, d, t - s, char); + *d += t - s; + s = t; + } + else if (isALNUM(*s)) + do { + *d++ = *s++; + } while (isWORDCHAR_A(*s) && d < e); + else if (*s == '\'' && isIDFIRST_lazy_if_safe(s+1,send,UTF)) { *d++ = ':'; *d++ = ':'; s++; @@ -917,16 +965,6 @@ S_scan_ident(pTHX_ register char *s, register const char *send, char *dest, STRL *d++ = *s++; *d++ = *s++; } - else if (UTF && UTF8_IS_START(*s) && isALNUM_utf8((U8*)s)) { - char *t = s + UTF8SKIP(s); - while (UTF8_IS_CONTINUED(*t) && is_utf8_mark((U8*)t)) - t += UTF8SKIP(t); - if (d + (t - s) > e) - Perl_croak(aTHX_ ident_too_long); - Copy(s, d, t - s, char); - d += t - s; - s = t; - } else break; } @@ -939,7 +977,8 @@ S_scan_ident(pTHX_ register char *s, register const char *send, char *dest, STRL return s; } if (*s == '$' && s[1] && - (isALNUM_lazy_if(s+1,UTF) || s[1] == '$' || s[1] == '{' || strnEQ(s+1,"::",2)) ) + ( isALNUM_lazy_if_safe(s+1,send,UTF) + || s[1] == '$' || s[1] == '{' || strnEQ(s+1,"::",2)) ) { return s; } @@ -967,13 +1006,16 @@ S_scan_ident(pTHX_ register char *s, register const char *send, char *dest, STRL } } } - if (isIDFIRST_lazy_if(d,UTF)) { + if (isIDFIRST_lazy_if_safe(d,d+destlen,UTF)) { d++; if (UTF) { e = s; - while ((e < send && isALNUM_lazy_if(e,UTF)) || *e == ':') { + while ( ( e < send + && isIDFIRST_utf8_safe(e, send) + || *e == ':')) + { e += UTF8SKIP(e); - while (e < send && UTF8_IS_CONTINUED(*e) && is_utf8_mark((U8*)e)) + while (e < send && isIDFIRST_utf8_safe(e, send)) e += UTF8SKIP(e); } Copy(s, d, e - s, char);