From: Jarkko Hietaniemi <jhi@iki.fi>
Date: Wed, 12 May 2004 21:31:17 +0000 (+0300)
Subject: BOM-marked and (BOMless) UTF-16 scripts not working
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=7aa207d6d833e60be59e41514013c4c54b091533;p=p5sagit%2Fp5-mst-13.2.git

BOM-marked and (BOMless) UTF-16 scripts not working
Message-ID: <40A26D75.8080406@iki.fi>
Date: Wed, 12 May 2004 21:31:17 +0300

p4raw-id: //depot/perl@22818
---

diff --git a/MANIFEST b/MANIFEST
index eead0b2..19ab326 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -2705,6 +2705,7 @@ t/comp/require.t		See if require works
 t/comp/script.t			See if script invocation works
 t/comp/term.t			See if more terms work
 t/comp/use.t			See if pragmata work
+t/comp/utf.t			See if UTFs work
 t/harness			Finer diagnostics from test suite
 thrdvar.h			Per-thread variables
 thread.h			Threading header
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 51d260a..984a170 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -4085,10 +4085,10 @@ Note that under some systems, like OS/2, there may be different flavors
 of Perl executables, some of which may support fork, some not. Try
 changing the name you call Perl by to C<perl_>, C<perl__>, and so on.
 
-=item Unsupported script encoding
+=item Unsupported script encoding %s
 
 (F) Your program file begins with a Unicode Byte Order Mark (BOM) which
-declares it to be in a Unicode encoding that Perl cannot yet read.
+declares it to be in a Unicode encoding that Perl cannot read.
 
 =item Unsupported socket function "%s" called
 
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
index 46ea682..23bee6e 100644
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -42,6 +42,14 @@ is needed.>  See L<utf8>.
 You can also use the C<encoding> pragma to change the default encoding
 of the data in your script; see L<encoding>.
 
+=item BOM-marked scripts and UTF-16 scripts autodetected
+
+If a Perl script begins marked with the Unicode BOM (UTF-16LE, UTF16-BE,
+or UTF-8), or if the script looks like non-BOM-marked UTF-16 of either
+endianness, Perl will correctly read in the script as Unicode.
+(BOMless UTF-8 cannot be effectively recognized or differentiated from
+ISO 8859-1 or other eight-bit encodings.)
+
 =item C<use encoding> needed to upgrade non-Latin-1 byte strings
 
 By default, there is a fundamental asymmetry in Perl's unicode model:
diff --git a/t/comp/utf.t b/t/comp/utf.t
new file mode 100644
index 0000000..a7b8566
--- /dev/null
+++ b/t/comp/utf.t
@@ -0,0 +1,48 @@
+#!./perl
+
+BEGIN {
+    chdir 't' if -d 't';
+    @INC = '../lib';
+    unless (find PerlIO::Layer 'perlio') {
+	print "1..0 # Skip: not perlio\n";
+	exit 0;
+    }
+}
+
+require "./test.pl";
+
+plan(tests => 15);
+
+my $BOM = chr(0xFEFF);
+
+sub test {
+    my ($enc, $tag, $bom) = @_;
+    open(UTF_PL, ">:encoding($enc)", "utf.pl")
+	or die "utf.pl($enc,$tag,$bom): $!";
+    print UTF_PL $BOM if $bom;
+    print UTF_PL "$tag\n";
+    close(UTF_PL);
+    my $got = do "./utf.pl";
+    is($got, $tag);
+}
+
+test("utf16le",    123,   1);
+test("utf16le",    1234,  1);
+test("utf16le",    12345, 1);
+test("utf16be",    123,   1);
+test("utf16be",    1234,  1);
+test("utf16be",    12345, 1);
+test("utf8",       123,   1);
+test("utf8",       1234,  1);
+test("utf8",       12345, 1);
+
+test("utf16le",    123,   0);
+test("utf16le",    1234,  0);
+test("utf16le",    12345, 0);
+test("utf16be",    123,   0);
+test("utf16be",    1234,  0);
+test("utf16be",    12345, 0);
+
+END {
+    1 while unlink "utf.pl";
+}
diff --git a/toke.c b/toke.c
index 6899cb4..b113499 100644
--- a/toke.c
+++ b/toke.c
@@ -2497,8 +2497,13 @@ Perl_yylex(pTHX)
 		sv_setpv(PL_linestr,"");
 		TOKEN(';');	/* not infinite loop because rsfp is NULL now */
 	    }
-	    /* if it looks like the start of a BOM, check if it in fact is */
-	    else if (bof && (!*s || *(U8*)s == 0xEF || *(U8*)s >= 0xFE)) {
+	    /* If it looks like the start of a BOM or raw UTF-16,
+	     * check if it in fact is. */
+	    else if (bof &&
+		     (*s == 0 ||
+		      *(U8*)s == 0xEF ||
+		      *(U8*)s >= 0xFE ||
+		      s[1] == 0)) {
 #ifdef PERLIO_IS_STDIO
 #  ifdef __GNU_LIBRARY__
 #    if __GNU_LIBRARY__ == 1 /* Linux glibc5 */
@@ -7834,72 +7839,94 @@ S_swallow_bom(pTHX_ U8 *s)
 {
     STRLEN slen;
     slen = SvCUR(PL_linestr);
-    switch (*s) {
+    switch (s[0]) {
     case 0xFF:
 	if (s[1] == 0xFE) {
-	    /* UTF-16 little-endian */
+	    /* UTF-16 little-endian? (or UTF32-LE?) */
 	    if (s[2] == 0 && s[3] == 0)  /* UTF-32 little-endian */
-		Perl_croak(aTHX_ "Unsupported script encoding");
+		Perl_croak(aTHX_ "Unsupported script encoding UTF32-LE");
 #ifndef PERL_NO_UTF16_FILTER
-	    DEBUG_p(PerlIO_printf(Perl_debug_log, "UTF-LE script encoding\n"));
+	    if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF16-LE script encoding (BOM)\n");
 	    s += 2;
+	utf16le:
 	    if (PL_bufend > (char*)s) {
 		U8 *news;
 		I32 newlen;
 
 		filter_add(utf16rev_textfilter, NULL);
 		New(898, news, (PL_bufend - (char*)s) * 3 / 2 + 1, U8);
-		PL_bufend = (char*)utf16_to_utf8_reversed(s, news,
-						 PL_bufend - (char*)s - 1,
-						 &newlen);
-		Copy(news, s, newlen, U8);
-		SvCUR_set(PL_linestr, newlen);
-		PL_bufend = SvPVX(PL_linestr) + newlen;
-		news[newlen++] = '\0';
+		PL_bufend =
+		     (char*)utf16_to_utf8_reversed(s, news,
+						   PL_bufend - (char*)s - 1,
+						   &newlen);
+		sv_setpvn(PL_linestr, (const char*)news, newlen);
 		Safefree(news);
+		SvUTF8_on(PL_linestr);
+		s = (U8*)SvPVX(PL_linestr);
+		PL_bufend = SvPVX(PL_linestr) + newlen;
 	    }
 #else
-	    Perl_croak(aTHX_ "Unsupported script encoding");
+	    Perl_croak(aTHX_ "Unsupported script encoding UTF16-LE");
 #endif
 	}
 	break;
     case 0xFE:
-	if (s[1] == 0xFF) {   /* UTF-16 big-endian */
+	if (s[1] == 0xFF) {   /* UTF-16 big-endian? */
 #ifndef PERL_NO_UTF16_FILTER
-	    DEBUG_p(PerlIO_printf(Perl_debug_log, "UTF-16BE script encoding\n"));
+	    if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16BE script encoding (BOM)\n");
 	    s += 2;
+	utf16be:
 	    if (PL_bufend > (char *)s) {
 		U8 *news;
 		I32 newlen;
 
 		filter_add(utf16_textfilter, NULL);
 		New(898, news, (PL_bufend - (char*)s) * 3 / 2 + 1, U8);
-		PL_bufend = (char*)utf16_to_utf8(s, news,
-						 PL_bufend - (char*)s,
-						 &newlen);
-		Copy(news, s, newlen, U8);
-		SvCUR_set(PL_linestr, newlen);
-		PL_bufend = SvPVX(PL_linestr) + newlen;
-		news[newlen++] = '\0';
+		PL_bufend =
+		     (char*)utf16_to_utf8(s, news,
+					  PL_bufend - (char*)s,
+					  &newlen);
+		sv_setpvn(PL_linestr, (const char*)news, newlen);
 		Safefree(news);
+		SvUTF8_on(PL_linestr);
+		s = (U8*)SvPVX(PL_linestr);
+		PL_bufend = SvPVX(PL_linestr) + newlen;
 	    }
 #else
-	    Perl_croak(aTHX_ "Unsupported script encoding");
+	    Perl_croak(aTHX_ "Unsupported script encoding UTF16-BE");
 #endif
 	}
 	break;
     case 0xEF:
 	if (slen > 2 && s[1] == 0xBB && s[2] == 0xBF) {
-	    DEBUG_p(PerlIO_printf(Perl_debug_log, "UTF-8 script encoding\n"));
+	    if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-8 script encoding (BOM)\n");
 	    s += 3;                      /* UTF-8 */
 	}
 	break;
     case 0:
-	if (slen > 3 && s[1] == 0 &&  /* UTF-32 big-endian */
-	    s[2] == 0xFE && s[3] == 0xFF)
-	{
-	    Perl_croak(aTHX_ "Unsupported script encoding");
+	if (slen > 3) {
+	     if (s[1] == 0) {
+		  if (s[2] == 0xFE && s[3] == 0xFF) {
+		       /* UTF-32 big-endian */
+		       Perl_croak(aTHX_ "Unsupported script encoding UTF32-BE");
+		  }
+	     }
+	     else if (s[2] == 0 && s[3] != 0) {
+		  /* Leading bytes
+		   * 00 xx 00 xx
+		   * are a good indicator of UTF-16BE. */
+		  if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16BE script encoding (no BOM)\n");
+		  goto utf16be;
+	     }
 	}
+    default:
+	 if (slen > 3 && s[1] == 0 && s[2] != 0 && s[3] == 0) {
+		  /* Leading bytes
+		   * xx 00 xx 00
+		   * are a good indicator of UTF-16LE. */
+	      if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16LE script encoding (no BOM)\n");
+	      goto utf16le;
+	 }
     }
     return (char*)s;
 }