From: Vincent Pit Date: Mon, 30 Jul 2012 13:53:49 +0000 (+0200) Subject: Properly handle BOMs at the beginning of the file X-Git-Tag: release_1.0.10_001~10 X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=f77c3f08b49a3d3763892adbaa9881e9fbe12754;p=p5sagit%2FModule-Metadata.git Properly handle BOMs at the beginning of the file This applies to new_from_file and new_from_module, but not new_from_handle. This fixes RT #78434. --- diff --git a/lib/Module/Metadata.pm b/lib/Module/Metadata.pm index ce21787..2e70b71 100644 --- a/lib/Module/Metadata.pm +++ b/lib/Module/Metadata.pm @@ -440,9 +440,49 @@ sub _parse_file { my $fh = IO::File->new( $filename ) or croak( "Can't open '$filename': $!" ); + $self->_handle_bom($fh, $filename); + $self->_parse_fh($fh); } +# Look for a UTF-8/UTF-16BE/UTF-16LE BOM at the beginning of the stream. +# If there's one, then skip it and set the :encoding layer appropriately. +sub _handle_bom { + my ($self, $fh, $filename) = @_; + + my $pos = $fh->getpos; + return unless defined $pos; + + my $buf = ' ' x 2; + my $count = $fh->read( $buf, length $buf ); + return unless defined $count and $count >= 2; + + my $encoding; + if ( $buf eq "\x{FE}\x{FF}" ) { + $encoding = 'UTF-16BE'; + } elsif ( $buf eq "\x{FF}\x{FE}" ) { + $encoding = 'UTF-16LE'; + } elsif ( $buf eq "\x{EF}\x{BB}" ) { + $buf = ' '; + $count = $fh->read( $buf, length $buf ); + if ( defined $count and $count >= 1 and $buf eq "\x{BF}" ) { + $encoding = 'UTF-8'; + } + } + + if ( defined $encoding ) { + if ( "$]" >= 5.008 ) { + # $fh->binmode requires perl 5.10 + binmode( $fh, ":encoding($encoding)" ); + } + } else { + $fh->setpos($pos) + or croak( sprintf "Can't reset position to the top of '$filename'" ); + } + + return $encoding; +} + sub _parse_fh { my ($self, $fh) = @_; @@ -742,7 +782,9 @@ Construct a C object given the path to a file. Takes an optional argument C which is a boolean that determines whether POD data is collected and stored for reference. POD data is not collected by default. POD headings are always collected. Returns undef if the filename -does not exist. +does not exist. If the file begins by an UTF-8, UTF-16BE or UTF-16LE +byte-order mark, then it is skipped before processing, and the content of the +file is also decoded appropriately starting from perl 5.8. =item C<< new_from_handle($handle, $filename, collect_pod => 1) >> @@ -750,7 +792,8 @@ This works just like C, except that a handle can be provided as the first argument. Note that there is no validation to confirm that the handle is a handle or something that can act like one. Passing something that isn't a handle will cause a exception when trying to read from it. The -C argument is mandatory or undef will be returned. +C argument is mandatory or undef will be returned. You are +responsible for setting the decoding layers on C<$handle> if required. =item C<< new_from_module($module, collect_pod => 1, inc => \@dirs) >> @@ -759,6 +802,9 @@ to accepting the C argument as described above, this method accepts a C argument which is a reference to an array of of directories to search for the module. If none are given, the default is @INC. Returns undef if the module cannot be found. +If the file that contains the module begins by an UTF-8, UTF-16BE or UTF-16LE +byte-order mark, then it is skipped before processing, and the content of the +file is also decoded appropriately starting from perl 5.8. =item C<< find_module_by_name($module, \@dirs) >> diff --git a/t/encoding.t b/t/encoding.t new file mode 100644 index 0000000..a0970e0 --- /dev/null +++ b/t/encoding.t @@ -0,0 +1,30 @@ +#!perl + +use strict; +use File::Spec; +use Test::More; + +use Module::Metadata; + +if ("$]" < 5.008_003) { + plan skip_all => 'Encoding test needs at least perl 5.8.3'; +} + +my %versions = ( + UTF8 => 3, + UTF16BE => 4, + UTF16LE => 5, +); + +plan tests => 4 * scalar(keys %versions); + +for my $enc (sort keys %versions) { + my $pkg = "BOMTest::$enc"; + my $vers = $versions{$enc}; + my $pm = File::Spec->catfile(qw => "$enc.pm"); + my $info = Module::Metadata->new_from_file($pm); + is( $info->name, $pkg, "$enc: default package was found" ); + is( $info->version, $vers, "$enc: version for default package" ); + is( $info->version('Heart'), '1', 'version for ASCII package' ); + is( $info->version("C\x{153}ur"), '2', 'version for Unicode package' ); +} diff --git a/t/lib/BOMTest/UTF16BE.pm b/t/lib/BOMTest/UTF16BE.pm new file mode 100644 index 0000000..17c6a4a Binary files /dev/null and b/t/lib/BOMTest/UTF16BE.pm differ diff --git a/t/lib/BOMTest/UTF16LE.pm b/t/lib/BOMTest/UTF16LE.pm new file mode 100644 index 0000000..a46de6d Binary files /dev/null and b/t/lib/BOMTest/UTF16LE.pm differ diff --git a/t/lib/BOMTest/UTF8.pm b/t/lib/BOMTest/UTF8.pm new file mode 100644 index 0000000..9062ac6 --- /dev/null +++ b/t/lib/BOMTest/UTF8.pm @@ -0,0 +1,13 @@ +package Heart; + +our $VERSION = 1; + +package BOMTest::UTF8; + +our $VERSION = 3; + +package Cœur; + +our $VERSION = 2; + +1;