From: M. J. T. Guy Date: Tue, 25 Jul 2000 12:52:45 +0000 (+0100) Subject: Get UTF16 BOMs working. Patch from X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=4ac469dc3d5bdda48d1fc1e1ea92a82232cc5603;p=p5sagit%2Fp5-mst-13.2.git Get UTF16 BOMs working. Patch from Subject: Re: [ID 20000719.001] Problem with bleadperl of 7/18/00 Date: Tue, 25 Jul 2000 12:52:45 +0100 Message-Id: and notes from Subject: Re: [ID 20000719.001] Problem with bleadperl of 7/18/00 From: "M.J.T. Guy" Date: Tue, 25 Jul 2000 11:43:25 +0100 Message-Id: p4raw-id: //depot/perl@6435 --- diff --git a/t/comp/require.t b/t/comp/require.t index 48e3e00..51f513f 100755 --- a/t/comp/require.t +++ b/t/comp/require.t @@ -132,7 +132,8 @@ $i++; do_require("$utf8\nprint \"ok $i\n\"; 1;\n"); $i++; do_require("$utf16\n1;"); -print "ok $i\n" if $@ =~ /Unsupported script encoding/; +print "not " unless $@ =~ /^Unrecognized character /; +print "ok $i\n"; END { 1 while unlink 'bleah.pm'; 1 while unlink 'bleah.do'; } diff --git a/toke.c b/toke.c index 98210a0..f368367 100644 --- a/toke.c +++ b/toke.c @@ -2519,7 +2519,13 @@ Perl_yylex(pTHX) } } if (bof) + { + PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr); + /* Shouldn't this swallow_bom() be earlier, e.g. + * immediately after where bof is set? Currently you can't + * have e.g. a UTF16 sharpbang line. --Mike Guy */ s = swallow_bom((U8*)s); + } incline(s); } while (PL_doextract); PL_oldoldbufptr = PL_oldbufptr = PL_bufptr = PL_linestart = s; diff --git a/utf8.c b/utf8.c index 666ec34..95f457f 100644 --- a/utf8.c +++ b/utf8.c @@ -320,13 +320,20 @@ Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len) return dst; } -/* XXX NOTHING CALLS THE FOLLOWING TWO ROUTINES YET!!! */ /* * Convert native or reversed UTF-16 to UTF-8. * * Destination must be pre-extended to 3/2 source. Do not use in-place. * We optimize for native, for obvious reasons. */ +/* There are several problems with utf16_to_utf8(). + * (1) U16 is not necessarily *exactly* two bytes. 
+ * (2) No check is made for odd length. + * (3) The "Malformed UTF-16 surrogate" should probably be + * a hard error (and it should be listed in perldiag). + * (4) The tests (in t/comp/require.t) are a joke: the UTF16 BOM + * really ought to be followed by valid UTF16 characters. + * --Mike Guy */ U8* Perl_utf16_to_utf8(pTHX_ U16* p, U8* d, I32 bytelen) {