From: Jarkko Hietaniemi Date: Mon, 31 Jul 2000 04:15:02 +0000 (+0000) Subject: The swallow_bom() saga continues. The #23 of require.t X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=dea0fc0b9e5a61b92c4be2ecafe0a8d9396d4cc1;p=p5sagit%2Fp5-mst-13.2.git The swallow_bom() saga continues. The #23 of require.t (UTF16-LE) still fails (silently, no output) but the #22 (UTF16-BE) seems to be working now. The root of the failure may be in sv_gets(): is it UTF-16LE-aware, especially when it comes to line endings? p4raw-id: //depot/perl@6469 --- diff --git a/embed.h b/embed.h index 2969d86..d062f06 100644 --- a/embed.h +++ b/embed.h @@ -2172,8 +2172,8 @@ #define unsharepvn(a,b,c) Perl_unsharepvn(aTHX_ a,b,c) #define unshare_hek(a) Perl_unshare_hek(aTHX_ a) #define utilize(a,b,c,d,e) Perl_utilize(aTHX_ a,b,c,d,e) -#define utf16_to_utf8(a,b,c) Perl_utf16_to_utf8(aTHX_ a,b,c) -#define utf16_to_utf8_reversed(a,b,c) Perl_utf16_to_utf8_reversed(aTHX_ a,b,c) +#define utf16_to_utf8(a,b,c,d) Perl_utf16_to_utf8(aTHX_ a,b,c,d) +#define utf16_to_utf8_reversed(a,b,c,d) Perl_utf16_to_utf8_reversed(aTHX_ a,b,c,d) #define utf8_distance(a,b) Perl_utf8_distance(aTHX_ a,b) #define utf8_hop(a,b) Perl_utf8_hop(aTHX_ a,b) #define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b) diff --git a/embed.pl b/embed.pl index a3adadc..0848eec 100755 --- a/embed.pl +++ b/embed.pl @@ -2064,8 +2064,8 @@ Ap |void |unlock_condpair|void* svv Ap |void |unsharepvn |const char* sv|I32 len|U32 hash p |void |unshare_hek |HEK* hek p |void |utilize |int aver|I32 floor|OP* version|OP* id|OP* arg -Ap |U8* |utf16_to_utf8 |U16* p|U8 *d|I32 bytelen -Ap |U8* |utf16_to_utf8_reversed|U16* p|U8 *d|I32 bytelen +Ap |U8* |utf16_to_utf8 |U8* p|U8 *d|I32 bytelen|I32 *newlen +Ap |U8* |utf16_to_utf8_reversed|U8* p|U8 *d|I32 bytelen|I32 *newlen Ap |I32 |utf8_distance |U8 *a|U8 *b Ap |U8* |utf8_hop |U8 *s|I32 off ApM |U8* |utf8_to_bytes |U8 *s|STRLEN len diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 9522c1f..4ccb671 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -1788,6 +1788,11 @@ a builtin library search path, prefix2 is substituted. The error may appear if components are not found, or are too long. See "PERLLIB_PREFIX" in L. +=item Malformed UTF-16 surrogate + +Perl thought it was reading UTF-16 encoded character data but while +doing it Perl met a malformed Unicode surrogate. + =item %s matches null string many times (W regexp) The pattern you've specified would be an infinite loop if the @@ -2490,6 +2495,11 @@ was string. (P) The lexer got into a bad state while processing a case modifier. +=item panic: utf16_to_utf8: odd bytelen + +(P) Something tried to call utf16_to_utf8 with an odd (as opposed +to even) byte length. + =item Parentheses missing around "%s" list (W parenthesis) You said something like diff --git a/proto.h b/proto.h index f65f898..714c923 100644 --- a/proto.h +++ b/proto.h @@ -807,8 +807,8 @@ PERL_CALLCONV void Perl_unlock_condpair(pTHX_ void* svv); PERL_CALLCONV void Perl_unsharepvn(pTHX_ const char* sv, I32 len, U32 hash); PERL_CALLCONV void Perl_unshare_hek(pTHX_ HEK* hek); PERL_CALLCONV void Perl_utilize(pTHX_ int aver, I32 floor, OP* version, OP* id, OP* arg); -PERL_CALLCONV U8* Perl_utf16_to_utf8(pTHX_ U16* p, U8 *d, I32 bytelen); -PERL_CALLCONV U8* Perl_utf16_to_utf8_reversed(pTHX_ U16* p, U8 *d, I32 bytelen); +PERL_CALLCONV U8* Perl_utf16_to_utf8(pTHX_ U8* p, U8 *d, I32 bytelen, I32 *newlen); +PERL_CALLCONV U8* Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8 *d, I32 bytelen, I32 *newlen); PERL_CALLCONV I32 Perl_utf8_distance(pTHX_ U8 *a, U8 *b); PERL_CALLCONV U8* Perl_utf8_hop(pTHX_ U8 *s, I32 off); PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN len); diff --git a/t/comp/require.t b/t/comp/require.t index 51f513f..418bc3e 100755 --- a/t/comp/require.t +++ b/t/comp/require.t @@ -122,18 +122,19 @@ do "bleah.do"; dofile(); sub dofile { do "bleah.do"; }; print $x; -$i++; # UTF-encoded things my $utf8 = chr(0xFEFF); -my $utf16 = chr(255).chr(254); -do_require("${utf8}print \"ok $i\n\"; 1;\n"); -$i++; -do_require("$utf8\nprint \"ok $i\n\"; 1;\n"); -$i++; -do_require("$utf16\n1;"); -print "not " unless $@ =~ /^Unrecognized character /; -print "ok $i\n"; + +$i++; do_require(qq(${utf8}print "ok $i\n"; 1;\n)); + +sub bytes_to_utf16 { + my $utf16 = pack("$_[0]*", unpack("C*", $_[1])); + return @_ == 3 && $_[2] ? pack("$_[0]", 0xFEFF) . $utf16 : $utf16; +} + +$i++; do_require(bytes_to_utf16('n', qq(print "ok $i\\n"; 1;\n), 1)); # BE +$i++; do_require(bytes_to_utf16('v', qq(print "ok $i\\n"; 1;\n), 1)); # LE END { 1 while unlink 'bleah.pm'; 1 while unlink 'bleah.do'; } diff --git a/toke.c b/toke.c index 2887a21..9d03733 100644 --- a/toke.c +++ b/toke.c @@ -2482,7 +2482,8 @@ Perl_yylex(pTHX) do { bool bof; bof = PL_rsfp && (PerlIO_tell(PL_rsfp) == 0); /* *Before* read! */ - if ((s = filter_gets(PL_linestr, PL_rsfp, 0)) == Nullch) { + s = filter_gets(PL_linestr, PL_rsfp, 0); + if (s == Nullch) { fake_eof: if (PL_rsfp) { if (PL_preprocess && !PL_in_eval) @@ -2505,6 +2506,9 @@ Perl_yylex(pTHX) PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr); sv_setpv(PL_linestr,""); TOKEN(';'); /* not infinite loop because rsfp is NULL now */ + } else if (bof) { + PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr); + s = swallow_bom((U8*)s); } if (PL_doextract) { if (*s == '#' && s[1] == '!' && instr(s,"perl")) @@ -2518,14 +2522,6 @@ Perl_yylex(pTHX) PL_doextract = FALSE; } } - if (bof) - { - PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr); - /* Shouldn't this swallow_bom() be earlier, e.g. - * immediately after where bof is set? Currently you can't - * have e.g. a UTF16 sharpbang line. --Mike Guy */ - s = swallow_bom((U8*)s); - } incline(s); } while (PL_doextract); PL_oldoldbufptr = PL_oldbufptr = PL_bufptr = PL_linestart = s; @@ -7374,26 +7370,31 @@ STATIC char* S_swallow_bom(pTHX_ U8 *s) { STRLEN slen; - U8 *olds = s; slen = SvCUR(PL_linestr); switch (*s) { case 0xFF: if (s[1] == 0xFE) { /* UTF-16 little-endian */ -#ifndef PERL_NO_UTF16_FILTER - U8 *news; -#endif if (s[2] == 0 && s[3] == 0) /* UTF-32 little-endian */ Perl_croak(aTHX_ "Unsupported script encoding"); #ifndef PERL_NO_UTF16_FILTER + DEBUG_p(PerlIO_printf(Perl_debug_log, "UTF-LE script encoding\n")); s += 2; - filter_add(utf16rev_textfilter, NULL); - New(898, news, (PL_bufend - (char*)s) * 3 / 2 + 1, U8); - /* See the notes on utf16_to_utf8() in utf8.c --Mike Guy */ - PL_bufend = (char*)utf16_to_utf8((U16*)s, news, - PL_bufend - (char*)s); - Safefree(olds); - s = news; + if (PL_bufend > (char*)s) { + U8 *news; + I32 newlen; + + filter_add(utf16rev_textfilter, NULL); + New(898, news, (PL_bufend - (char*)s) * 3 / 2 + 1, U8); + PL_bufend = (char*)utf16_to_utf8(s, news, + PL_bufend - (char*)s, + &newlen); + Copy(news, s, newlen, U8); + SvCUR_set(PL_linestr, newlen); + PL_bufend = SvPVX(PL_linestr) + newlen; + news[newlen++] = '\0'; + Safefree(news); + } #else Perl_croak(aTHX_ "Unsupported script encoding"); #endif @@ -7402,14 +7403,23 @@ S_swallow_bom(pTHX_ U8 *s) case 0xFE: if (s[1] == 0xFF) { /* UTF-16 big-endian */ #ifndef PERL_NO_UTF16_FILTER - U8 *news; - filter_add(utf16_textfilter, NULL); - New(898, news, (PL_bufend - (char*)s) * 3 / 2 + 1, U8); - /* See the notes on utf16_to_utf8() in utf8.c --Mike Guy */ - PL_bufend = (char*)utf16_to_utf8((U16*)s, news, - PL_bufend - (char*)s); - Safefree(olds); - s = news; + DEBUG_p(PerlIO_printf(Perl_debug_log, "UTF-16BE script encoding\n")); + s += 2; + if (PL_bufend > (char *)s) { + U8 *news; + I32 newlen; + + filter_add(utf16_textfilter, NULL); + New(898, news, (PL_bufend - (char*)s) * 3 / 2 + 1, U8); + PL_bufend = (char*)utf16_to_utf8(s, news, + PL_bufend - (char*)s, + &newlen); + Copy(news, s, newlen, U8); + SvCUR_set(PL_linestr, newlen); + PL_bufend = SvPVX(PL_linestr) + newlen; + news[newlen++] = '\0'; + Safefree(news); + } #else Perl_croak(aTHX_ "Unsupported script encoding"); #endif @@ -7417,6 +7427,7 @@ S_swallow_bom(pTHX_ U8 *s) break; case 0xEF: if (slen > 2 && s[1] == 0xBB && s[2] == 0xBF) { + DEBUG_p(PerlIO_printf(Perl_debug_log, "UTF-8 script encoding\n")); s += 3; /* UTF-8 */ } break; @@ -7459,8 +7470,9 @@ utf16_textfilter(pTHXo_ int idx, SV *sv, int maxlen) if (count) { U8* tmps; U8* tend; + I32 newlen; New(898, tmps, SvCUR(sv) * 3 / 2 + 1, U8); - tend = utf16_to_utf8((U16*)SvPVX(sv), tmps, SvCUR(sv)); + tend = utf16_to_utf8((U8*)SvPVX(sv), tmps, SvCUR(sv), &newlen); sv_usepvn(sv, (char*)tmps, tend - tmps); } return count; @@ -7473,8 +7485,9 @@ utf16rev_textfilter(pTHXo_ int idx, SV *sv, int maxlen) if (count) { U8* tmps; U8* tend; + I32 newlen; New(898, tmps, SvCUR(sv) * 3 / 2 + 1, U8); - tend = utf16_to_utf8_reversed((U16*)SvPVX(sv), tmps, SvCUR(sv)); + tend = utf16_to_utf8_reversed((U8*)SvPVX(sv), tmps, SvCUR(sv), &newlen); sv_usepvn(sv, (char*)tmps, tend - tmps); } return count; diff --git a/utf8.c b/utf8.c index d00b9f3..6a99d9d 100644 --- a/utf8.c +++ b/utf8.c @@ -321,26 +321,25 @@ Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len) } /* - * Convert native or reversed UTF-16 to UTF-8. + * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8. * * Destination must be pre-extended to 3/2 source. Do not use in-place. * We optimize for native, for obvious reasons. */ -/* There are several problems with utf16_to_utf8(). - * (1) U16 is not necessarily *exactly* two bytes. - * (2) Secondly, no check is made for odd length. - * (3) Thirdly, the "Malformed UTF-16 surrogate" should probably be - * a hard error (and it should be listed in perldiag). - * (4) The tests (in comp/t/require.t) are a joke: the UTF16 BOM - * really ought to be followed by valid UTF16 characters. - * See swallow_bom() in toke.c. - * --Mike Guy */ U8* -Perl_utf16_to_utf8(pTHX_ U16* p, U8* d, I32 bytelen) +Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) { - U16* pend = p + bytelen / 2; + U8* pend; + U8* dstart = d; + + if (bytelen & 1) + Perl_croak("panic: utf16_to_utf8: odd bytelen"); + + pend = p + bytelen; + while (p < pend) { - UV uv = *p++; + UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */ + p += 2; if (uv < 0x80) { *d++ = uv; continue; @@ -352,13 +351,9 @@ Perl_utf16_to_utf8(pTHX_ U16* p, U8* d, I32 bytelen) } if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */ dTHR; - int low = *p++; - if (low < 0xdc00 || low >= 0xdfff) { - if (ckWARN_d(WARN_UTF8)) - Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-16 surrogate"); - p--; - uv = 0xfffd; - } + UV low = *p++; + if (low < 0xdc00 || low >= 0xdfff) + Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000; } if (uv < 0x10000) { @@ -375,13 +370,14 @@ Perl_utf16_to_utf8(pTHX_ U16* p, U8* d, I32 bytelen) continue; } } + *newlen = d - dstart; return d; } /* Note: this one is slightly destructive of the source. */ U8* -Perl_utf16_to_utf8_reversed(pTHX_ U16* p, U8* d, I32 bytelen) +Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) { U8* s = (U8*)p; U8* send = s + bytelen; @@ -391,7 +387,7 @@ Perl_utf16_to_utf8_reversed(pTHX_ U16* p, U8* d, I32 bytelen) s[1] = tmp; s += 2; } - return utf16_to_utf8(p, d, bytelen); + return utf16_to_utf8(p, d, bytelen, newlen); } /* for now these are all defined (inefficiently) in terms of the utf8 versions */