### the gnu tar specification:
-### http://www.gnu.org/software/tar/manual/html_mono/tar.html
+### http://www.gnu.org/software/tar/manual/tar.html
###
### and the pax format spec, which tar derives from:
### http://www.opengroup.org/onlinepubs/007904975/utilities/pax.html
$DEBUG = 0;
$WARN = 1;
$FOLLOW_SYMLINK = 0;
-$VERSION = "1.24_01";
+$VERSION = "1.36";
$CHOWN = 1;
$CHMOD = 1;
$DO_NOT_USE_PREFIX = 0;
my $obj = bless { _data => [ ], _file => 'Unknown' }, $class;
if (@_) {
- return unless $obj->read( @_ );
+ unless ( $obj->read( @_ ) ) {
+ $obj->_error(qq[No data could be read from file]);
+ return;
+ }
}
return $obj;
Archive::Tar will warn if you try to pass a compressed file if
IO::Zlib is not available and simply return.
+Note that you can currently B<not> pass a C<gzip> compressed
+filehandle, which is not opened with C<IO::Zlib>, nor a string
+containing the full archive information (either compressed or
+uncompressed). These are worth while features, but not currently
+implemented. See the C<TODO> section.
+
The third argument can be a hash reference with options. Note that
all options are case-sensitive.
### source code (tar.c) to GNU cpio.
next if $chunk eq TAR_END;
+ ### according to the posix spec, the last 12 bytes of the header are
+ ### null bytes, to pad it to a 512 byte block. That means if these
+ ### bytes are NOT null bytes, it's a corrrupt header. See:
+ ### www.koders.com/c/fidCE473AD3D9F835D690259D60AD5654591D91D5BA.aspx
+ ### line 111
+ { my $nulls = join '', "\0" x 12;
+ unless( $nulls eq substr( $chunk, 500, 12 ) ) {
+ $self->_error( qq[Invalid header block at offset $offset] );
+ next LOOP;
+ }
+ }
+
+ ### pass the realname, so we can set it 'proper' right away
+ ### some of the heuristics are done on the name, so important
+ ### to set it ASAP
my $entry;
- unless( $entry = Archive::Tar::File->new( chunk => $chunk ) ) {
- $self->_error( qq[Couldn't read chunk at offset $offset] );
- next;
+ { my %extra_args = ();
+ $extra_args{'name'} = $$real_name if defined $real_name;
+
+ unless( $entry = Archive::Tar::File->new( chunk => $chunk,
+ %extra_args )
+ ) {
+ $self->_error( qq[Couldn't read chunk at offset $offset] );
+ next LOOP;
+ }
}
### ignore labels:
if ( $entry->is_file && !$entry->validate ) {
### sometimes the chunk is rather fux0r3d and a whole 512
- ### bytes ends p in the ->name area.
+ ### bytes ends up in the ->name area.
### clean it up, if need be
my $name = $entry->name;
$name = substr($name, 0, 100) if length $name > 100;
if( $handle->read( $$data, $block ) < $block ) {
$self->_error( qq[Read error on tarfile (missing data) '].
$entry->full_path ."' at offset $offset" );
- next;
+ next LOOP;
}
### throw away trailing garbage ###
- substr ($$data, $entry->size) = "";
+ substr ($$data, $entry->size) = "" if defined $$data;
### part II of the @LongLink munging -- need to do /after/
### the checksum check.
### this is one ugly hack =/ but needed for direct extraction
if( $entry->is_longlink ) {
$real_name = $data;
- next;
+ next LOOP;
} elsif ( defined $real_name ) {
$entry->name( $$real_name );
$entry->prefix('');
sub contains_file {
my $self = shift;
- my $full = shift or return;
+ my $full = shift;
+
+ return unless defined $full;
+ ### don't warn if the entry isn't there.. that's what this function
+ ### is for after all.
+ local $WARN = 0;
return 1 if $self->_find_entry($full);
return;
}
sub extract {
my $self = shift;
+ my @args = @_;
my @files;
+ # use the speed optimization for all extracted files
+ local($self->{cwd}) = cwd() unless $self->{cwd};
+
### you requested the extraction of only certian files
- if( @_ ) {
- for my $file (@_) {
- my $found;
- for my $entry ( @{$self->_data} ) {
- next unless $file eq $entry->full_path;
-
- ### we found the file you're looking for
- push @files, $entry;
- $found++;
- }
+ if( @args ) {
+ for my $file ( @args ) {
+
+ ### it's already an object?
+ if( UNIVERSAL::isa( $file, 'Archive::Tar::File' ) ) {
+ push @files, $file;
+ next;
- unless( $found ) {
- return $self->_error( qq[Could not find '$file' in archive] );
+ ### go find it then
+ } else {
+
+ my $found;
+ for my $entry ( @{$self->_data} ) {
+ next unless $file eq $entry->full_path;
+
+ ### we found the file you're looking for
+ push @files, $entry;
+ $found++;
+ }
+
+ unless( $found ) {
+ return $self->_error(
+ qq[Could not find '$file' in archive] );
+ }
}
}
=head2 $tar->extract_file( $file, [$extract_path] )
Write an entry, whose name is equivalent to the file name provided to
-disk. Optionally takes a second parameter, which is the full (unix)
+disk. Optionally takes a second parameter, which is the full native
path (including filename) the entry will be written to.
For example:
$tar->extract_file( 'name/in/archive', 'name/i/want/to/give/it' );
+ $tar->extract_file( $at_file_object, 'name/i/want/to/give/it' );
+
Returns true on success, false on failure.
=cut
sub extract_file {
my $self = shift;
- my $file = shift or return;
+ my $file = shift; return unless defined $file;
my $alt = shift;
my $entry = $self->_find_entry( $file )
my $self = shift;
my $entry = shift or return;
my $alt = shift;
- my $cwd = cwd();
### you wanted an alternate extraction location ###
my $name = defined $alt ? $alt : $entry->full_path;
### it's a relative path ###
} else {
- my @dirs = File::Spec::Unix->splitdir( $dirs );
- my @cwd = File::Spec->splitdir( $cwd );
- $dir = File::Spec->catdir(@cwd, @dirs);
+ my $cwd = (defined $self->{cwd} ? $self->{cwd} : cwd());
+
+
+
+ my @dirs = defined $alt
+ ? File::Spec->splitdir( $dirs ) # It's a local-OS path
+ : File::Spec::Unix->splitdir( $dirs ); # it's UNIX-style, likely
+ # straight from the tarball
+
+ ### '.' is the directory delimiter, of which the first one has to
+ ### be escaped/changed.
+ map tr/\./_/, @dirs if ON_VMS;
+
+ my ($cwd_vol,$cwd_dir,$cwd_file)
+ = File::Spec->splitpath( $cwd );
+ my @cwd = File::Spec->splitdir( $cwd_dir );
+ push @cwd, $cwd_file if length $cwd_file;
+
+ ### We need to pass '' as the last elemant to catpath. Craig Berry
+ ### explains why (msgid <p0624083dc311ae541393@[172.16.52.1]>):
+ ### The root problem is that splitpath on UNIX always returns the
+ ### final path element as a file even if it is a directory, and of
+ ### course there is no way it can know the difference without checking
+ ### against the filesystem, which it is documented as not doing. When
+ ### you turn around and call catpath, on VMS you have to know which bits
+ ### are directory bits and which bits are file bits. In this case we
+ ### know the result should be a directory. I had thought you could omit
+ ### the file argument to catpath in such a case, but apparently on UNIX
+ ### you can't.
+ $dir = File::Spec->catpath(
+ $cwd_vol, File::Spec->catdir( @cwd, @dirs ), ''
+ );
+
+ ### catdir() returns undef if the path is longer than 255 chars on VMS
+ unless ( defined $dir ) {
+ $^W && $self->_error( qq[Could not compose a path for '$dirs'\n] );
+ return;
+ }
+
}
if( -e $dir && !-d _ ) {
$self->_error( qq[Could not create directory '$dir': $@] );
return;
}
+
+ ### XXX chown here? that might not be the same as in the archive
+ ### as we're only chown'ing to the owner of the file we're extracting
+ ### not to the owner of the directory itself, which may or may not
+ ### be another entry in the archive
+ ### Answer: no, gnu tar doesn't do it either, it'd be the wrong
+ ### way to go.
+ #if( $CHOWN && CAN_CHOWN ) {
+ # chown $entry->uid, $entry->gid, $dir or
+ # $self->_error( qq[Could not set uid/gid on '$dir'] );
+ #}
}
### we're done if we just needed to create a dir ###
return;
}
+ ### it's an object already
+ return $file if UNIVERSAL::isa( $file, 'Archive::Tar::File' );
+
for my $entry ( @{$self->_data} ) {
my $path = $entry->full_path;
return $entry if $path eq $file;
### write the end markers ###
print $handle TAR_END x 2 or
return $self->_error( qq[Could not write tar end markers] );
+
### did you want it written to a file, or returned as a string? ###
- return length($file) ? 1
+ my $rv = length($file) ? 1
: $HAS_PERLIO ? $dummy
- : do { seek $handle, 0, 0; local $/; <$handle> }
+ : do { seek $handle, 0, 0; local $/; <$handle> };
+
+ ### make sure to close the handle;
+ close $handle;
+
+ return $rv;
}
sub _format_tar_entry {
my @rv;
for my $file ( @files ) {
- unless( -e $file ) {
+ unless( -e $file || -l $file ) {
$self->_error( qq[No such file: '$file'] );
next;
}
content C<$data>. Specific properties can be set using C<$opthashref>.
The following list of properties is supported: name, size, mtime
(last modified date), mode, uid, gid, linkname, uname, gname,
-devmajor, devminor, prefix. (On MacOS, the file's path and
+devmajor, devminor, prefix, type. (On MacOS, the file's path and
modification times are converted to Unix equivalents.)
+Valid values for the file type are the following constants defined in
+Archive::Tar::Constants:
+
+=over 4
+
+=item FILE
+
+Regular file.
+
+=item HARDLINK
+
+=item SYMLINK
+
+Hard and symbolic ("soft") links; linkname should specify target.
+
+=item CHARDEV
+
+=item BLOCKDEV
+
+Character and block devices. devmajor and devminor should specify the major
+and minor device numbers.
+
+=item DIR
+
+Directory.
+
+=item FIFO
+
+FIFO (named pipe).
+
+=item SOCKET
+
+Socket.
+
+=back
+
Returns the C<Archive::Tar::File> object that was just added, or
C<undef> on failure.
}
}
+=head2 $tar->setcwd( $cwd );
+
+C<Archive::Tar> needs to know the current directory, and it will run
+C<Cwd::cwd()> I<every> time it extracts a I<relative> entry from the
+tarfile and saves it in the file system. (As of version 1.30, however,
+C<Archive::Tar> will use the speed optimization described below
+automatically, so it's only relevant if you're using C<extract_file()>).
+
+Since C<Archive::Tar> doesn't change the current directory internally
+while it is extracting the items in a tarball, all calls to C<Cwd::cwd()>
+can be avoided if we can guarantee that the current directory doesn't
+get changed externally.
+
+To use this performance boost, set the current directory via
+
+ use Cwd;
+ $tar->setcwd( cwd() );
+
+once before calling a function like C<extract_file> and
+C<Archive::Tar> will use the current directory setting from then on
+and won't call C<Cwd::cwd()> internally.
+
+To switch back to the default behaviour, use
+
+ $tar->setcwd( undef );
+
+and C<Archive::Tar> will call C<Cwd::cwd()> internally again.
+
+If you're using C<Archive::Tar>'s C<exract()> method, C<setcwd()> will
+be called for you.
+
+=cut
+
+sub setcwd {
+ my $self = shift;
+ my $cwd = shift;
+
+ $self->{cwd} = $cwd;
+}
=head2 $bool = $tar->has_io_string
Returns true if we currently have C<IO::String> support loaded.
Either C<IO::String> or C<perlio> support is needed to support writing
-stringified archives. Currently, C<perlio> is the preffered method, if
+stringified archives. Currently, C<perlio> is the preferred method, if
available.
See the C<GLOBAL VARIABLES> section to see how to change this preference.
This requires C<perl-5.8> or higher, compiled with C<perlio>
Either C<IO::String> or C<perlio> support is needed to support writing
-stringified archives. Currently, C<perlio> is the preffered method, if
+stringified archives. Currently, C<perlio> is the preferred method, if
available.
See the C<GLOBAL VARIABLES> section to see how to change this preference.
If C<list_archive()> is passed an array reference as its third
argument it returns a list of hash references containing the requested
properties of each file. The following list of properties is
-supported: name, size, mtime (last modified date), mode, uid, gid,
-linkname, uname, gname, devmajor, devminor, prefix.
+supported: full_path, name, size, mtime (last modified date), mode,
+uid, gid, linkname, uname, gname, devmajor, devminor, prefix.
+
+See C<Archive::Tar::File> for details about supported properties.
Passing an array reference containing only one element, 'name', is
special cased to return a list of names rather than a list of hash
=head2 $Archive::Tar::DO_NOT_USE_PREFIX
-By default, C<Archive::Tar> will try to put paths that are over
-100 characters in the C<prefix> field of your tar header. However,
-some older tar programs do not implement this spec. To retain
-compatibility with these older versions, you can set the
-C<$DO_NOT_USE_PREFIX> variable to a true value, and C<Archive::Tar>
-will use an alternate way of dealing with paths over 100 characters
-by using the C<GNU Extended Header> feature.
+By default, C<Archive::Tar> will try to put paths that are over
+100 characters in the C<prefix> field of your tar header, as
+defined per POSIX-standard. However, some (older) tar programs
+do not implement this spec. To retain compatibility with these older
+or non-POSIX compliant versions, you can set the C<$DO_NOT_USE_PREFIX>
+variable to a true value, and C<Archive::Tar> will use an alternate
+way of dealing with paths over 100 characters by using the
+C<GNU Extended Header> feature.
+
+Note that clients who do not support the C<GNU Extended Header>
+feature will not be able to read these archives. Such clients include
+tars on C<Solaris>, C<Irix> and C<AIX>.
The default is C<0>.
For other filetypes, like C<chardevs> and C<blockdevs> we'll warn that
the extraction of this particular item didn't work.
+=item I'm using WinZip, or some other non-POSIX client, and files are not being extracted properly!
+
+By default, C<Archive::Tar> is in a completely POSIX-compatible
+mode, which uses the POSIX-specification of C<tar> to store files.
+For paths greather than 100 characters, this is done using the
+C<POSIX header prefix>. Non-POSIX-compatible clients may not support
+this part of the specification, and may only support the C<GNU Extended
+Header> functionality. To facilitate those clients, you can set the
+C<$Archive::Tar::DO_NOT_USE_PREFIX> variable to C<true>. See the
+C<GLOBAL VARIABLES> section for details on this variable.
+
+Note that GNU tar earlier than version 1.14 does not cope well with
+the C<POSIX header prefix>. If you use such a version, consider setting
+the C<$Archive::Tar::DO_NOT_USE_PREFIX> variable to C<true>.
+
+=item How do I extract only files that have property X from an archive?
+
+Sometimes, you might not wish to extract a complete archive, just
+the files that are relevant to you, based on some criteria.
+
+You can do this by filtering a list of C<Archive::Tar::File> objects
+based on your criteria. For example, to extract only files that have
+the string C<foo> in their title, you would use:
+
+ $tar->extract(
+ grep { $_->full_path =~ /foo/ } $tar->get_files
+ );
+
+This way, you can filter on any attribute of the files in the archive.
+Consult the C<Archive::Tar::File> documentation on how to use these
+objects.
+
+=item How do I access .tar.Z files?
+
+The C<Archive::Tar> module can optionally use C<Compress::Zlib> (via
+the C<IO::Zlib> module) to access tar files that have been compressed
+with C<gzip>. Unfortunately tar files compressed with the Unix C<compress>
+utility cannot be read by C<Compress::Zlib> and so cannot be directly
+accesses by C<Archive::Tar>.
+
+If the C<uncompress> or C<gunzip> programs are available, you can use
+one of these workarounds to read C<.tar.Z> files from C<Archive::Tar>
+
+Firstly with C<uncompress>
+
+ use Archive::Tar;
+
+ open F, "uncompress -c $filename |";
+ my $tar = Archive::Tar->new(*F);
+ ...
+
+and this with C<gunzip>
+
+ use Archive::Tar;
+
+ open F, "gunzip -c $filename |";
+ my $tar = Archive::Tar->new(*F);
+ ...
+
+Similarly, if the C<compress> program is available, you can use this to
+write a C<.tar.Z> file
+
+ use Archive::Tar;
+ use IO::File;
+
+ my $fh = new IO::File "| compress -c >$filename";
+ my $tar = Archive::Tar->new();
+ ...
+ $tar->write($fh);
+ $fh->close ;
+
+=item How do I handle Unicode strings?
+
+C<Archive::Tar> uses byte semantics for any files it reads from or writes
+to disk. This is not a problem if you only deal with files and never
+look at their content or work solely with byte strings. But if you use
+Unicode strings with character semantics, some additional steps need
+to be taken.
+
+For example, if you add a Unicode string like
+
+ # Problem
+ $tar->add_data('file.txt', "Euro: \x{20AC}");
+
+then there will be a problem later when the tarfile gets written out
+to disk via C<$tar->write()>:
+
+ Wide character in print at .../Archive/Tar.pm line 1014.
+
+The data was added as a Unicode string and when writing it out to disk,
+the C<:utf8> line discipline wasn't set by C<Archive::Tar>, so Perl
+tried to convert the string to ISO-8859 and failed. The written file
+now contains garbage.
+
+For this reason, Unicode strings need to be converted to UTF-8-encoded
+bytestrings before they are handed off to C<add_data()>:
+
+ use Encode;
+ my $data = "Accented character: \x{20AC}";
+ $data = encode('utf8', $data);
+
+ $tar->add_data('file.txt', $data);
+
+A opposite problem occurs if you extract a UTF8-encoded file from a
+tarball. Using C<get_content()> on the C<Archive::Tar::File> object
+will return its content as a bytestring, not as a Unicode string.
+
+If you want it to be a Unicode string (because you want character
+semantics with operations like regular expression matching), you need
+to decode the UTF8-encoded content and have Perl convert it into
+a Unicode string:
+
+ use Encode;
+ my $data = $tar->get_content();
+
+ # Make it a Unicode string
+ $data = decode('utf8', $data);
+
+There is no easy way to provide this functionality in C<Archive::Tar>,
+because a tarball can contain many files, and each of which could be
+encoded in a different way.
+
=back
=head1 TODO
Currently I don't know of any portable pure perl way to do this.
Suggestions welcome.
+=item Allow archives to be passed in as string
+
+Currently, we only allow opened filehandles or filenames, but
+not strings. The internals would need some reworking to facilitate
+stringified archives.
+
+=item Facilitate processing an opened filehandle of a compressed archive
+
+Currently, we only support this if the filehandle is an IO::Zlib object.
+Environments, like apache, will present you with an opened filehandle
+to an uploaded file, which might be a compressed archive.
+
+=back
+
+=head1 SEE ALSO
+
+=over 4
+
+=item The GNU tar specification
+
+C<http://www.gnu.org/software/tar/manual/tar.html>
+
+=item The PAX format specication
+
+The specifcation which tar derives from; C< http://www.opengroup.org/onlinepubs/007904975/utilities/pax.html>
+
+=item A comparison of GNU and POSIX tar standards; C<http://www.delorie.com/gnu/docs/tar/tar_114.html>
+
+=item GNU tar intends to switch to POSIX compatibility
+
+GNU Tar authors have expressed their intention to become completely
+POSIX-compatible; C<http://www.gnu.org/software/tar/manual/html_node/Formats.html>
+
+=item A Comparison between various tar implementations
+
+Lists known issues and incompatibilities; C<http://gd.tuwien.ac.at/utils/archivers/star/README.otherbugs>
+
=back
=head1 AUTHOR
-This module by
-Jos Boumans E<lt>kane@cpan.orgE<gt>.
+This module by Jos Boumans E<lt>kane@cpan.orgE<gt>.
+
+Please reports bugs to E<lt>bug-archive-tar@rt.cpan.orgE<gt>.
=head1 ACKNOWLEDGEMENTS
=head1 COPYRIGHT
-This module is
-copyright (c) 2002 Jos Boumans E<lt>kane@cpan.orgE<gt>.
-All rights reserved.
+This module is copyright (c) 2002 - 2007 Jos Boumans
+E<lt>kane@cpan.orgE<gt>. All rights reserved.
-This library is free software;
-you may redistribute and/or modify it under the same
-terms as Perl itself.
+This library is free software; you may redistribute and/or modify
+it under the same terms as Perl itself.
=cut