From: Jarkko Hietaniemi Date: Wed, 12 May 2004 21:31:17 +0000 (+0300) Subject: BOM-marked and (BOMless) UTF-16 scripts not working X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=7aa207d6d833e60be59e41514013c4c54b091533;p=p5sagit%2Fp5-mst-13.2.git BOM-marked and (BOMless) UTF-16 scripts not working Message-ID: <40A26D75.8080406@iki.fi> Date: Wed, 12 May 2004 21:31:17 +0300 p4raw-id: //depot/perl@22818 --- diff --git a/MANIFEST b/MANIFEST index eead0b2..19ab326 100644 --- a/MANIFEST +++ b/MANIFEST @@ -2705,6 +2705,7 @@ t/comp/require.t See if require works t/comp/script.t See if script invocation works t/comp/term.t See if more terms work t/comp/use.t See if pragmata work +t/comp/utf.t See if UTFs work t/harness Finer diagnostics from test suite thrdvar.h Per-thread variables thread.h Threading header diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 51d260a..984a170 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -4085,10 +4085,10 @@ Note that under some systems, like OS/2, there may be different flavors of Perl executables, some of which may support fork, some not. Try changing the name you call Perl by to C<perl_>, C<perl__>, and so on. -=item Unsupported script encoding +=item Unsupported script encoding %s (F) Your program file begins with a Unicode Byte Order Mark (BOM) which -declares it to be in a Unicode encoding that Perl cannot yet read. +declares it to be in a Unicode encoding that Perl cannot read. =item Unsupported socket function "%s" called diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 46ea682..23bee6e 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -42,6 +42,14 @@ is needed.> See L<utf8>. You can also use the C<encoding> pragma to change the default encoding of the data in your script; see L<encoding>. 
+=item BOM-marked scripts and UTF-16 scripts autodetected + +If a Perl script begins marked with the Unicode BOM (UTF-16LE, UTF-16BE, +or UTF-8), or if the script looks like non-BOM-marked UTF-16 of either +endianness, Perl will correctly read in the script as Unicode. +(BOMless UTF-8 cannot be effectively recognized or differentiated from +ISO 8859-1 or other eight-bit encodings.) + =item C<use encoding> needed to upgrade non-Latin-1 byte strings By default, there is a fundamental asymmetry in Perl's unicode model: diff --git a/t/comp/utf.t b/t/comp/utf.t new file mode 100644 index 0000000..a7b8566 --- /dev/null +++ b/t/comp/utf.t @@ -0,0 +1,48 @@ +#!./perl + +BEGIN { + chdir 't' if -d 't'; + @INC = '../lib'; + unless (find PerlIO::Layer 'perlio') { + print "1..0 # Skip: not perlio\n"; + exit 0; + } +} + +require "./test.pl"; + +plan(tests => 15); + +my $BOM = chr(0xFEFF); + +sub test { + my ($enc, $tag, $bom) = @_; + open(UTF_PL, ">:encoding($enc)", "utf.pl") + or die "utf.pl($enc,$tag,$bom): $!"; + print UTF_PL $BOM if $bom; + print UTF_PL "$tag\n"; + close(UTF_PL); + my $got = do "./utf.pl"; + is($got, $tag); +} + +test("utf16le", 123, 1); +test("utf16le", 1234, 1); +test("utf16le", 12345, 1); +test("utf16be", 123, 1); +test("utf16be", 1234, 1); +test("utf16be", 12345, 1); +test("utf8", 123, 1); +test("utf8", 1234, 1); +test("utf8", 12345, 1); + +test("utf16le", 123, 0); +test("utf16le", 1234, 0); +test("utf16le", 12345, 0); +test("utf16be", 123, 0); +test("utf16be", 1234, 0); +test("utf16be", 12345, 0); + +END { + 1 while unlink "utf.pl"; +} diff --git a/toke.c b/toke.c index 6899cb4..b113499 100644 --- a/toke.c +++ b/toke.c @@ -2497,8 +2497,13 @@ Perl_yylex(pTHX) sv_setpv(PL_linestr,""); TOKEN(';'); /* not infinite loop because rsfp is NULL now */ } - /* if it looks like the start of a BOM, check if it in fact is */ - else if (bof && (!*s || *(U8*)s == 0xEF || *(U8*)s >= 0xFE)) { + /* If it looks like the start of a BOM or raw UTF-16, + * check if it in fact is. 
*/ + else if (bof && + (*s == 0 || + *(U8*)s == 0xEF || + *(U8*)s >= 0xFE || + s[1] == 0)) { #ifdef PERLIO_IS_STDIO # ifdef __GNU_LIBRARY__ # if __GNU_LIBRARY__ == 1 /* Linux glibc5 */ @@ -7834,72 +7839,94 @@ S_swallow_bom(pTHX_ U8 *s) { STRLEN slen; slen = SvCUR(PL_linestr); - switch (*s) { + switch (s[0]) { case 0xFF: if (s[1] == 0xFE) { - /* UTF-16 little-endian */ + /* UTF-16 little-endian? (or UTF32-LE?) */ if (s[2] == 0 && s[3] == 0) /* UTF-32 little-endian */ - Perl_croak(aTHX_ "Unsupported script encoding"); + Perl_croak(aTHX_ "Unsupported script encoding UTF32-LE"); #ifndef PERL_NO_UTF16_FILTER - DEBUG_p(PerlIO_printf(Perl_debug_log, "UTF-LE script encoding\n")); + if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF16-LE script encoding (BOM)\n"); s += 2; + utf16le: if (PL_bufend > (char*)s) { U8 *news; I32 newlen; filter_add(utf16rev_textfilter, NULL); New(898, news, (PL_bufend - (char*)s) * 3 / 2 + 1, U8); - PL_bufend = (char*)utf16_to_utf8_reversed(s, news, - PL_bufend - (char*)s - 1, - &newlen); - Copy(news, s, newlen, U8); - SvCUR_set(PL_linestr, newlen); - PL_bufend = SvPVX(PL_linestr) + newlen; - news[newlen++] = '\0'; + PL_bufend = + (char*)utf16_to_utf8_reversed(s, news, + PL_bufend - (char*)s - 1, + &newlen); + sv_setpvn(PL_linestr, (const char*)news, newlen); Safefree(news); + SvUTF8_on(PL_linestr); + s = (U8*)SvPVX(PL_linestr); + PL_bufend = SvPVX(PL_linestr) + newlen; } #else - Perl_croak(aTHX_ "Unsupported script encoding"); + Perl_croak(aTHX_ "Unsupported script encoding UTF16-LE"); #endif } break; case 0xFE: - if (s[1] == 0xFF) { /* UTF-16 big-endian */ + if (s[1] == 0xFF) { /* UTF-16 big-endian? 
*/ #ifndef PERL_NO_UTF16_FILTER - DEBUG_p(PerlIO_printf(Perl_debug_log, "UTF-16BE script encoding\n")); + if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16BE script encoding (BOM)\n"); s += 2; + utf16be: if (PL_bufend > (char *)s) { U8 *news; I32 newlen; filter_add(utf16_textfilter, NULL); New(898, news, (PL_bufend - (char*)s) * 3 / 2 + 1, U8); - PL_bufend = (char*)utf16_to_utf8(s, news, - PL_bufend - (char*)s, - &newlen); - Copy(news, s, newlen, U8); - SvCUR_set(PL_linestr, newlen); - PL_bufend = SvPVX(PL_linestr) + newlen; - news[newlen++] = '\0'; + PL_bufend = + (char*)utf16_to_utf8(s, news, + PL_bufend - (char*)s, + &newlen); + sv_setpvn(PL_linestr, (const char*)news, newlen); Safefree(news); + SvUTF8_on(PL_linestr); + s = (U8*)SvPVX(PL_linestr); + PL_bufend = SvPVX(PL_linestr) + newlen; } #else - Perl_croak(aTHX_ "Unsupported script encoding"); + Perl_croak(aTHX_ "Unsupported script encoding UTF16-BE"); #endif } break; case 0xEF: if (slen > 2 && s[1] == 0xBB && s[2] == 0xBF) { - DEBUG_p(PerlIO_printf(Perl_debug_log, "UTF-8 script encoding\n")); + if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-8 script encoding (BOM)\n"); s += 3; /* UTF-8 */ } break; case 0: - if (slen > 3 && s[1] == 0 && /* UTF-32 big-endian */ - s[2] == 0xFE && s[3] == 0xFF) - { - Perl_croak(aTHX_ "Unsupported script encoding"); + if (slen > 3) { + if (s[1] == 0) { + if (s[2] == 0xFE && s[3] == 0xFF) { + /* UTF-32 big-endian */ + Perl_croak(aTHX_ "Unsupported script encoding UTF32-BE"); + } + } + else if (s[2] == 0 && s[3] != 0) { + /* Leading bytes + * 00 xx 00 xx + * are a good indicator of UTF-16BE. */ + if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16BE script encoding (no BOM)\n"); + goto utf16be; + } } + default: + if (slen > 3 && s[1] == 0 && s[2] != 0 && s[3] == 0) { + /* Leading bytes + * xx 00 xx 00 + * are a good indicator of UTF-16LE. 
*/ + if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16LE script encoding (no BOM)\n"); + goto utf16le; + } } return (char*)s; }