X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=toke.c;h=8c019c510d961b3cf1e98348af9397145ba6e3c4;hb=482aa6ffeb230a1cea9d05e9eb425b3d7fc09217;hp=c7ec476b6d041e7f43c24c0bd0fc293287f33f05;hpb=aa6dbd607b0a3d8a49c43b63b63b73ed167692c3;p=p5sagit%2Fp5-mst-13.2.git diff --git a/toke.c b/toke.c index c7ec476..8c019c5 100644 --- a/toke.c +++ b/toke.c @@ -1054,8 +1054,8 @@ S_skipspace(pTHX_ register char *s) curoff = s - SvPVX(PL_linestr); #endif - if ((s = filter_gets(PL_linestr, PL_rsfp, - (prevlen = SvCUR(PL_linestr)))) == NULL) + if ((s = filter_gets(PL_linestr, (prevlen = SvCUR(PL_linestr)))) + == NULL) { #ifdef PERL_MAD if (PL_madskills && curoff != startoff) { @@ -2917,7 +2917,7 @@ Perl_filter_read(pTHX_ int idx, SV *buf_sv, int maxlen) const int old_len = SvCUR(buf_sv); /* ensure buf_sv is large enough */ - SvGROW(buf_sv, (STRLEN)(old_len + correct_length)) ; + SvGROW(buf_sv, (STRLEN)(old_len + correct_length + 1)) ; if ((len = PerlIO_read(PL_rsfp, SvPVX(buf_sv) + old_len, correct_length)) <= 0) { if (PerlIO_error(PL_rsfp)) @@ -2926,6 +2926,7 @@ Perl_filter_read(pTHX_ int idx, SV *buf_sv, int maxlen) return 0 ; /* end of file */ } SvCUR_set(buf_sv, old_len + len) ; + SvPVX(buf_sv)[old_len + len] = '\0'; } else { /* Want a line */ if (sv_gets(buf_sv, PL_rsfp, SvCUR(buf_sv)) == NULL) { @@ -2956,7 +2957,7 @@ Perl_filter_read(pTHX_ int idx, SV *buf_sv, int maxlen) } STATIC char * -S_filter_gets(pTHX_ register SV *sv, register PerlIO *fp, STRLEN append) +S_filter_gets(pTHX_ register SV *sv, STRLEN append) { dVAR; @@ -2976,7 +2977,7 @@ S_filter_gets(pTHX_ register SV *sv, register PerlIO *fp, STRLEN append) return NULL ; } else - return (sv_gets(sv, fp, append)); + return (sv_gets(sv, PL_rsfp, append)); } STATIC HV * @@ -3739,7 +3740,7 @@ Perl_yylex(pTHX) } do { bof = PL_rsfp ? TRUE : FALSE; - if ((s = filter_gets(PL_linestr, PL_rsfp, 0)) == NULL) { + if ((s = filter_gets(PL_linestr, 0)) == NULL) { fake_eof: #ifdef PERL_MAD PL_realtokenstart = -1; @@ -5808,8 +5809,8 @@ Perl_yylex(pTHX) sv_catpvn(PL_endwhite, tstart, PL_bufend - tstart); PL_realtokenstart = -1; } - while ((s = filter_gets(PL_endwhite, PL_rsfp, - SvCUR(PL_endwhite))) != NULL) ; + while ((s = filter_gets(PL_endwhite, SvCUR(PL_endwhite))) + != NULL) ; } #endif PL_rsfp = NULL; @@ -7128,7 +7129,8 @@ S_pending_ident(pTHX) and @foo isn't a variable we can find in the symbol table. */ - if (pit == '@' && PL_lex_state != LEX_NORMAL && !PL_lex_brackets) { + if (ckWARN(WARN_AMBIGUOUS) && + pit == '@' && PL_lex_state != LEX_NORMAL && !PL_lex_brackets) { GV *const gv = gv_fetchpvn_flags(PL_tokenbuf + 1, tokenbuf_len - 1, 0, SVt_PVAV); if ((!gv || ((PL_tokenbuf[0] == '@') ? !GvAV(gv) : !GvHV(gv))) @@ -7138,9 +7140,9 @@ S_pending_ident(pTHX) ) { /* Downgraded from fatal to warning 20000522 mjd */ - Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS), - "Possible unintended interpolation of %s in string", - PL_tokenbuf); + Perl_warner(aTHX_ packWARN(WARN_AMBIGUOUS), + "Possible unintended interpolation of %s in string", + PL_tokenbuf); } } @@ -11421,7 +11423,8 @@ S_scan_heredoc(pTHX_ register char *s) } #endif if (!outer || - !(PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = filter_gets(PL_linestr, PL_rsfp, 0))) { + !(PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart + = filter_gets(PL_linestr, 0))) { CopLINE_set(PL_curcop, (line_t)PL_multi_start); missingterm(PL_tokenbuf); } @@ -11933,7 +11936,8 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims) } #endif if (!PL_rsfp || - !(PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = filter_gets(PL_linestr, PL_rsfp, 0))) { + !(PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart + = filter_gets(PL_linestr, 0))) { sv_free(sv); CopLINE_set(PL_curcop, (line_t)PL_multi_start); return NULL; @@ -12484,7 +12488,7 @@ S_scan_formline(pTHX_ register char *s) PL_thistoken = newSVpvn(tokenstart, PL_bufend - tokenstart); } #endif - s = filter_gets(PL_linestr, PL_rsfp, 0); + s = filter_gets(PL_linestr, 0); #ifdef PERL_MAD tokenstart = PL_oldoldbufptr = PL_oldbufptr = PL_bufptr = PL_linestart = SvPVX(PL_linestr); #else @@ -12703,30 +12707,8 @@ S_swallow_bom(pTHX_ U8 *s) #ifndef PERL_NO_UTF16_FILTER if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF16-LE script encoding (BOM)\n"); s += 2; - utf16le: if (PL_bufend > (char*)s) { - U8 *news; - I32 newlen; - - IoLINES(filter_add(S_utf16_textfilter, NULL)) = 1; - Newx(news, (PL_bufend - (char*)s) * 3 / 2 + 1, U8); - utf16_to_utf8_reversed(s, news, - PL_bufend - (char*)s - 1, - &newlen); - sv_setpvn(PL_linestr, (const char*)news, newlen); -#ifdef PERL_MAD - s = (U8*)SvPVX(PL_linestr); - Copy(news, s, newlen, U8); - s[newlen] = '\0'; -#endif - Safefree(news); - SvUTF8_on(PL_linestr); - s = (U8*)SvPVX(PL_linestr); -#ifdef PERL_MAD - /* FIXME - is this a general bug fix? */ - s[newlen] = '\0'; -#endif - PL_bufend = SvPVX(PL_linestr) + newlen; + s = add_utf16_textfilter(s, TRUE); } #else Perl_croak(aTHX_ "Unsupported script encoding UTF16-LE"); @@ -12738,21 +12720,8 @@ S_swallow_bom(pTHX_ U8 *s) #ifndef PERL_NO_UTF16_FILTER if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16BE script encoding (BOM)\n"); s += 2; - utf16be: if (PL_bufend > (char *)s) { - U8 *news; - I32 newlen; - - filter_add(S_utf16_textfilter, NULL); - Newx(news, (PL_bufend - (char*)s) * 3 / 2 + 1, U8); - utf16_to_utf8(s, news, - PL_bufend - (char*)s, - &newlen); - sv_setpvn(PL_linestr, (const char*)news, newlen); - Safefree(news); - SvUTF8_on(PL_linestr); - s = (U8*)SvPVX(PL_linestr); - PL_bufend = SvPVX(PL_linestr) + newlen; + s = add_utf16_textfilter(s, FALSE); } #else Perl_croak(aTHX_ "Unsupported script encoding UTF16-BE"); @@ -12778,7 +12747,7 @@ S_swallow_bom(pTHX_ U8 *s) * 00 xx 00 xx * are a good indicator of UTF-16BE. */ if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16BE script encoding (no BOM)\n"); - goto utf16be; + s = add_utf16_textfilter(s, FALSE); } } #ifdef EBCDIC @@ -12796,7 +12765,7 @@ S_swallow_bom(pTHX_ U8 *s) * xx 00 xx 00 * are a good indicator of UTF-16LE. */ if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16LE script encoding (no BOM)\n"); - goto utf16le; + s = add_utf16_textfilter(s, TRUE); } } return (char*)s; @@ -12808,35 +12777,140 @@ static I32 S_utf16_textfilter(pTHX_ int idx, SV *sv, int maxlen) { dVAR; - const STRLEN old = SvCUR(sv); - const I32 count = FILTER_READ(idx+1, sv, maxlen); - const int reverse = IoLINES(sv); + SV *const filter = FILTER_DATA(idx); + /* We re-use this each time round, throwing the contents away before we + return. */ + SV *const utf16_buffer = MUTABLE_SV(IoTOP_GV(filter)); + SV *const utf8_buffer = filter; + IV status = IoPAGE(filter); + const bool reverse = IoLINES(filter); + + /* As we're automatically added, at the lowest level, and hence only called + from this file, we can be sure that we're not called in block mode. Hence + don't bother writing code to deal with block mode. */ + if (maxlen) { + Perl_croak(aTHX_ "panic: utf16_textfilter called in block mode (for %d characters)", maxlen); + } + if (status < 0) { + Perl_croak(aTHX_ "panic: utf16_textfilter called after error (status=%"IVdf")", status); + } DEBUG_P(PerlIO_printf(Perl_debug_log, - "utf16%s_textfilter(%p): %d %d (%d)\n", - reverse ? "rev" : "", + "utf16_textfilter(%p,%ce): idx=%d maxlen=%d status=%"IVdf" utf16=%"UVuf" utf8=%"UVuf"\n", FPTR2DPTR(void *, S_utf16_textfilter), - idx, maxlen, (int) count)); - if (count) { - U8* tmps; + reverse ? 'l' : 'b', idx, maxlen, status, + (UV)SvCUR(utf16_buffer), (UV)SvCUR(utf8_buffer))); + + while (1) { + STRLEN chars; + STRLEN have; I32 newlen; - Newx(tmps, SvCUR(sv) * 3 / 2 + 1, U8); - Copy(SvPVX_const(sv), tmps, old, char); + U8 *end; + /* First, look in our buffer of existing UTF-8 data: */ + char *nl = (char *)memchr(SvPVX(utf8_buffer), '\n', SvCUR(utf8_buffer)); + + if (nl) { + ++nl; + } else if (status == 0) { + /* EOF */ + IoPAGE(filter) = 0; + nl = SvEND(utf8_buffer); + } + if (nl) { + sv_catpvn(sv, SvPVX(utf8_buffer), nl - SvPVX(utf8_buffer)); + /* Everything else in this code works just fine if SVp_POK isn't + set. This, however, needs it, and we need it to work, else + we loop infinitely because the buffer is never consumed. */ + sv_chop(utf8_buffer, nl); + break; + } + + /* OK, not a complete line there, so need to read some more UTF-16. + Read an extra octect if the buffer currently has an odd number. */ + while (1) { + if (status <= 0) + break; + if (SvCUR(utf16_buffer) >= 2) { + /* Location of the high octet of the last complete code point. + Gosh, UTF-16 is a pain. All the benefits of variable length, + *coupled* with all the benefits of partial reads and + endianness. */ + const U8 *const last_hi = (U8*)SvPVX(utf16_buffer) + + ((SvCUR(utf16_buffer) & ~1) - (reverse ? 1 : 2)); + + if (*last_hi < 0xd8 || *last_hi > 0xdb) { + break; + } + + /* We have the first half of a surrogate. Read more. */ + DEBUG_P(PerlIO_printf(Perl_debug_log, "utf16_textfilter partial surrogate detected at %p\n", last_hi)); + } + + status = FILTER_READ(idx + 1, utf16_buffer, + 160 + (SvCUR(utf16_buffer) & 1)); + DEBUG_P(PerlIO_printf(Perl_debug_log, "utf16_textfilter status=%"IVdf" SvCUR(sv)=%"UVuf"\n", status, (UV)SvCUR(utf16_buffer))); + DEBUG_P({ sv_dump(utf16_buffer); sv_dump(utf8_buffer);}); + if (status < 0) { + /* Error */ + IoPAGE(filter) = status; + return status; + } + } + + chars = SvCUR(utf16_buffer) >> 1; + have = SvCUR(utf8_buffer); + SvGROW(utf8_buffer, have + chars * 3 + 1); + if (reverse) { - /* You would expect this to be utf16_to_utf8_reversed() - It was, prior to 1de9afcdf18cf98bbdecaa782da93e907be6fe4e - Effectively, right now, UTF-16LE is being read in off-by-one - See RT #69678 */ - utf16_to_utf8((U8*)SvPVX_const(sv) + old, tmps + old, - SvCUR(sv) - old, &newlen); + end = utf16_to_utf8_reversed((U8*)SvPVX(utf16_buffer), + (U8*)SvPVX_const(utf8_buffer) + have, + chars * 2, &newlen); } else { - utf16_to_utf8((U8*)SvPVX_const(sv) + old, tmps + old, - SvCUR(sv) - old, &newlen); + end = utf16_to_utf8((U8*)SvPVX(utf16_buffer), + (U8*)SvPVX_const(utf8_buffer) + have, + chars * 2, &newlen); + } + SvCUR_set(utf8_buffer, have + newlen); + *end = '\0'; + + /* No need to keep this SV "well-formed" with a '\0' after the end, as + it's private to us, and utf16_to_utf8{,reversed} take a + (pointer,length) pair, rather than a NUL-terminated string. */ + if(SvCUR(utf16_buffer) & 1) { + *SvPVX(utf16_buffer) = SvEND(utf16_buffer)[-1]; + SvCUR_set(utf16_buffer, 1); + } else { + SvCUR_set(utf16_buffer, 0); } - sv_usepvn(sv, (char*)tmps, (STRLEN)newlen + old); } - DEBUG_P({sv_dump(sv);}); + DEBUG_P(PerlIO_printf(Perl_debug_log, + "utf16_textfilter: returns, status=%"IVdf" utf16=%"UVuf" utf8=%"UVuf"\n", + status, + (UV)SvCUR(utf16_buffer), (UV)SvCUR(utf8_buffer))); + DEBUG_P({ sv_dump(utf8_buffer); sv_dump(sv);}); return SvCUR(sv); } + +static U8 * +S_add_utf16_textfilter(pTHX_ U8 *const s, bool reversed) +{ + SV *filter = filter_add(S_utf16_textfilter, NULL); + + IoTOP_GV(filter) = MUTABLE_GV(newSVpvn((char *)s, PL_bufend - (char*)s)); + sv_setpvs(filter, ""); + IoLINES(filter) = reversed; + IoPAGE(filter) = 1; /* Not EOF */ + + /* Sadly, we have to return a valid pointer, come what may, so we have to + ignore any error return from this. */ + SvCUR_set(PL_linestr, 0); + if (FILTER_READ(0, PL_linestr, 0)) { + SvUTF8_on(PL_linestr); + } else { + SvUTF8_on(PL_linestr); + } + PL_bufend = SvEND(PL_linestr); + return (U8*)SvPVX(PL_linestr); +} #endif /*