From: Rafael Garcia-Suarez Date: Fri, 8 Oct 2004 08:52:39 +0000 (+0000) Subject: Make the perl interpreter more tolerant of UTF-16-encoded script X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=1de9afcdf18cf98b;p=p5sagit%2Fp5-mst-13.2.git Make the perl interpreter more tolerant of UTF-16-encoded script (patch by Jarkko Hietaniemi) p4raw-id: //depot/perl@23351 --- diff --git a/Makefile.SH b/Makefile.SH index e8fbf3e..3e66a5d 100644 --- a/Makefile.SH +++ b/Makefile.SH @@ -1053,6 +1053,7 @@ makedepend: makedepend.SH config.sh .PHONY: test check test_prep test_prep_nodll test_prep_pre _test_prep \ test_tty test-tty _test_tty test_notty test-notty _test_notty \ utest ucheck test.utf8 check.utf8 test.torture torturetest \ + test.utf16 check.utf16 utest.utf16 ucheck.utf16 \ test.third check.third utest.third ucheck.third test_notty.third \ test.deparse test_notty.deparse test_harness test_harness_notty \ test.bytecompile minitest coretest test.taintwarn @@ -1114,6 +1115,18 @@ test-notty: test_notty test.torture torturetest: test_prep PERL=./perl TEST_ARGS=-torture $(MAKE) _test +# Targets for UTF16 testing: + +minitest.utf16: minitest.prep + - cd t && (rm -f perl$(EXE_EXT); $(LNS) ../miniperl$(EXE_EXT) perl$(EXE_EXT)) \ + && $(LDLIBPTH) ./perl TEST -minitest -utf16 base/*.t comp/*.t cmd/*.t run/*.t io/*.t op/*.t uni/*.t = 0) { $core = 1 if $1 eq 'core'; $verbose = 1 if $1 eq 'v'; $torture = 1 if $1 eq 'torture'; - $with_utf= 1 if $1 eq 'utf8'; + $with_utf8 = 1 if $1 eq 'utf8'; + $with_utf16 = 1 if $1 eq 'utf16'; $bytecompile = 1 if $1 eq 'bytecompile'; $compile = 1 if $1 eq 'compile'; $taintwarn = 1 if $1 eq 'taintwarn'; @@ -134,6 +137,32 @@ elsif( $compile ) { elsif( $bytecompile ) { _testprogs('bytecompile', '', @ARGV); } +elsif ($with_utf16) { + for my $e (0, 1) { + for my $b (0, 1) { + print STDERR "# ENDIAN $e BOM $b\n"; + my @UARGV; + for my $a (@ARGV) { + my $u = $a . "." . ($e ? "l" : "b") . "e" . ($b ? "b" : ""); + my $f = $e ? "v" : "n"; + push @UARGV, $u; + unlink($u); + if (open(A, $a)) { + if (open(U, ">$u")) { + print U pack("$f", 0xFEFF); + while () { + print U pack("$f*", unpack("C*", $_)); + } + close(A); + } + close(B); + } + } + _testprogs('perl', '', @UARGV); + unlink(@UARGV); + } + } +} else { _testprogs('compile', '', @ARGV) if -e "../testcompile"; _testprogs('perl', '', @ARGV); @@ -243,7 +272,7 @@ EOT close(SCRIPT); } - my $utf = $with_utf ? '-I../lib -Mutf8' : ''; + my $utf8 = $with_utf8 ? '-I../lib -Mutf8' : ''; my $testswitch = '-I. -MTestInit'; # -T will strict . from @INC if ($type eq 'deparse') { my $deparse = @@ -275,7 +304,7 @@ EOT my $bytecompile = "$perl $testswitch $switch -I../lib $bswitch". "-o$test.plc $test 2>$null &&". - "$perl $testswitch $switch -I../lib $utf $test.plc |"; + "$perl $testswitch $switch -I../lib $utf8 $test.plc |"; open(RESULTS,$bytecompile) or print "can't byte-compile '$bytecompile': $!.\n"; } @@ -288,7 +317,7 @@ EOT . "--num-callers=50 --logfile-fd=3 $perl"; $redir = "3>$valgrind_log"; } - my $run = "$perl" . _quote_args("$testswitch $switch $utf") . " $test $redir|"; + my $run = "$perl" . _quote_args("$testswitch $switch $utf8") . " $test $redir|"; open(RESULTS,$run) or print "can't run '$run': $!.\n"; } else { @@ -296,7 +325,7 @@ EOT my $pl2c = "$testswitch -I../lib ../utils/perlcc --testsuite " . # -O9 for good measure, -fcog is broken ATM "$switch -Wb=-O9,-fno-cog -L .. " . - "-I \".. ../lib/CORE\" $args $utf $test -o "; + "-I \".. ../lib/CORE\" $args $utf8 $test -o "; if( $^O eq 'MSWin32' ) { $test_executable = "$test.exe"; diff --git a/toke.c b/toke.c index aee151c..80a9ba7 100644 --- a/toke.c +++ b/toke.c @@ -2157,19 +2157,17 @@ Perl_filter_del(pTHX_ filter_t funcp) } -/* Invoke the n'th filter function for the current rsfp. */ +/* Invoke the idxth filter function for the current rsfp. */ +/* maxlen 0 = read one text line */ I32 Perl_filter_read(pTHX_ int idx, SV *buf_sv, int maxlen) - - - /* 0 = read one text line */ { filter_t funcp; SV *datasv = NULL; if (!PL_rsfp_filters) return -1; - if (idx > AvFILLp(PL_rsfp_filters)){ /* Any more filters? */ + if (idx > AvFILLp(PL_rsfp_filters)) { /* Any more filters? */ /* Provide a default input filter to make life easy. */ /* Note that we append to the line. This is handy. */ DEBUG_P(PerlIO_printf(Perl_debug_log, @@ -2200,7 +2198,7 @@ Perl_filter_read(pTHX_ int idx, SV *buf_sv, int maxlen) return SvCUR(buf_sv); } /* Skip this filter slot if filter has been deleted */ - if ( (datasv = FILTER_DATA(idx)) == &PL_sv_undef){ + if ( (datasv = FILTER_DATA(idx)) == &PL_sv_undef) { DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_read %d: skipped (filter deleted)\n", idx)); @@ -2226,7 +2224,6 @@ S_filter_gets(pTHX_ register SV *sv, register PerlIO *fp, STRLEN append) } #endif if (PL_rsfp_filters) { - if (!append) SvCUR_set(sv, 0); /* start with empty line */ if (FILTER_READ(0, sv, 0) > 0) @@ -6834,10 +6831,11 @@ S_scan_heredoc(pTHX_ register char *s) av_store(CopFILEAV(PL_curcop), (I32)CopLINE(PL_curcop),sv); } if (*s == term && memEQ(s,PL_tokenbuf,len)) { - s = PL_bufend - 1; - *s = ' '; + STRLEN off = PL_bufend - 1 - SvPVX(PL_linestr); + *(SvPVX(PL_linestr) + off ) = ' '; sv_catsv(PL_linestr,herewas); PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr); + s = SvPVX(PL_linestr) + off; /* In case PV of PL_linestr moved. */ } else { s = PL_bufend; @@ -7999,10 +7997,9 @@ S_swallow_bom(pTHX_ U8 *s) filter_add(utf16rev_textfilter, NULL); New(898, news, (PL_bufend - (char*)s) * 3 / 2 + 1, U8); - PL_bufend = - (char*)utf16_to_utf8_reversed(s, news, - PL_bufend - (char*)s - 1, - &newlen); + utf16_to_utf8_reversed(s, news, + PL_bufend - (char*)s - 1, + &newlen); sv_setpvn(PL_linestr, (const char*)news, newlen); Safefree(news); SvUTF8_on(PL_linestr); @@ -8026,10 +8023,9 @@ S_swallow_bom(pTHX_ U8 *s) filter_add(utf16_textfilter, NULL); New(898, news, (PL_bufend - (char*)s) * 3 / 2 + 1, U8); - PL_bufend = - (char*)utf16_to_utf8(s, news, - PL_bufend - (char*)s, - &newlen); + utf16_to_utf8(s, news, + PL_bufend - (char*)s, + &newlen); sv_setpvn(PL_linestr, (const char*)news, newlen); Safefree(news); SvUTF8_on(PL_linestr); @@ -8096,38 +8092,42 @@ restore_rsfp(pTHX_ void *f) static I32 utf16_textfilter(pTHX_ int idx, SV *sv, int maxlen) { + STRLEN old = SvCUR(sv); I32 count = FILTER_READ(idx+1, sv, maxlen); + DEBUG_P(PerlIO_printf(Perl_debug_log, + "utf16_textfilter(%p): %d %d (%d)\n", + utf16_textfilter, idx, maxlen, count)); if (count) { U8* tmps; - U8* tend; I32 newlen; New(898, tmps, SvCUR(sv) * 3 / 2 + 1, U8); - if (!*SvPV_nolen(sv)) - /* Game over, but don't feed an odd-length string to utf16_to_utf8 */ - return count; - - tend = utf16_to_utf8((U8*)SvPVX(sv), tmps, SvCUR(sv), &newlen); - sv_usepvn(sv, (char*)tmps, tend - tmps); + Copy(SvPVX(sv), tmps, old, char); + utf16_to_utf8((U8*)SvPVX(sv) + old, tmps + old, + SvCUR(sv) - old, &newlen); + sv_usepvn(sv, (char*)tmps, (STRLEN)newlen + old); } - return count; + DEBUG_P({sv_dump(sv);}); + return SvCUR(sv); } static I32 utf16rev_textfilter(pTHX_ int idx, SV *sv, int maxlen) { + STRLEN old = SvCUR(sv); I32 count = FILTER_READ(idx+1, sv, maxlen); + DEBUG_P(PerlIO_printf(Perl_debug_log, + "utf16rev_textfilter(%p): %d %d (%d)\n", + utf16rev_textfilter, idx, maxlen, count)); if (count) { U8* tmps; - U8* tend; I32 newlen; - if (!*SvPV_nolen(sv)) - /* Game over, but don't feed an odd-length string to utf16_to_utf8 */ - return count; - New(898, tmps, SvCUR(sv) * 3 / 2 + 1, U8); - tend = utf16_to_utf8_reversed((U8*)SvPVX(sv), tmps, SvCUR(sv), &newlen); - sv_usepvn(sv, (char*)tmps, tend - tmps); + Copy(SvPVX(sv), tmps, old, char); + utf16_to_utf8((U8*)SvPVX(sv) + old, tmps + old, + SvCUR(sv) - old, &newlen); + sv_usepvn(sv, (char*)tmps, (STRLEN)newlen + old); } + DEBUG_P({ sv_dump(sv); }); return count; } #endif diff --git a/utf8.c b/utf8.c index 24bf93d..f12696e 100644 --- a/utf8.c +++ b/utf8.c @@ -868,8 +868,14 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) U8* pend; U8* dstart = d; + if (bytelen == 1 && p[0] == 0) { /* Be understanding. */ + d[0] = 0; + *newlen = 1; + return d; + } + if (bytelen & 1) - Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen"); + Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %d", bytelen); pend = p + bytelen;