2 package IO::Uncompress::Gunzip ;
11 use IO::Uncompress::RawInflate ;
13 use Compress::Zlib qw( crc32 ) ;
14 use Compress::Zlib::Common qw(createSelfTiedObject);
15 use Compress::Gzip::Constants;
19 our ($VERSION, @ISA, @EXPORT_OK, %EXPORT_TAGS, $GunzipError);
21 @ISA = qw( Exporter IO::Uncompress::RawInflate );
22 @EXPORT_OK = qw( $GunzipError gunzip );
23 %EXPORT_TAGS = %IO::Uncompress::RawInflate::DEFLATE_CONSTANTS ;
24 push @{ $EXPORT_TAGS{all} }, @EXPORT_OK ;
25 Exporter::export_ok_tags('all');
29 $VERSION = '2.000_07';
35 my $obj = createSelfTiedObject($class, \$GunzipError);
37 $obj->_create(undef, 0, @_);
42 my $obj = createSelfTiedObject(undef, \$GunzipError);
43 return $obj->_inf(@_) ;
48 use Compress::Zlib::ParseParameters ;
49 return ( 'ParseExtra' => [1, 1, Parse_boolean, 0] ) ;
57 # gunzip always needs crc32
58 $got->value('CRC32' => 1);
68 $self->smartReadExact(\$magic, GZIP_ID_SIZE);
70 *$self->{HeaderPending} = $magic ;
72 return $self->HeaderError("Minimum header size is " .
73 GZIP_MIN_HEADER_SIZE . " bytes")
74 if length $magic != GZIP_ID_SIZE ;
76 return $self->HeaderError("Bad Magic")
77 if ! isGzipMagic($magic) ;
79 *$self->{Type} = 'rfc1952';
89 return $self->_readGzipHeader($magic);
98 my ($CRC32, $ISIZE) = unpack("V V", $trailer) ;
99 *$self->{Info}{CRC32} = $CRC32;
100 *$self->{Info}{ISIZE} = $ISIZE;
102 if (*$self->{Strict}) {
103 return $self->TrailerError("CRC mismatch")
104 if $CRC32 != *$self->{Uncomp}->crc32() ;
106 my $exp_isize = *$self->{Uncomp}->uncompressedBytes();
107 return $self->TrailerError("ISIZE mismatch. Got $ISIZE"
108 . ", expected $exp_isize")
109 if $ISIZE != $exp_isize ;
118 return 0 if length $buffer < GZIP_ID_SIZE ;
119 my ($id1, $id2) = unpack("C C", $buffer) ;
120 return $id1 == GZIP_ID1 && $id2 == GZIP_ID2 ;
123 sub _readFullGzipHeader($)
128 $self->smartReadExact(\$magic, GZIP_ID_SIZE);
130 *$self->{HeaderPending} = $magic ;
132 return $self->HeaderError("Minimum header size is " .
133 GZIP_MIN_HEADER_SIZE . " bytes")
134 if length $magic != GZIP_ID_SIZE ;
137 return $self->HeaderError("Bad Magic")
138 if ! isGzipMagic($magic) ;
140 my $status = $self->_readGzipHeader($magic);
141 delete *$self->{Transparent} if ! defined $status ;
145 sub _readGzipHeader($)
147 my ($self, $magic) = @_ ;
151 $self->smartReadExact(\$buffer, GZIP_MIN_HEADER_SIZE - GZIP_ID_SIZE)
152 or return $self->HeaderError("Minimum header size is " .
153 GZIP_MIN_HEADER_SIZE . " bytes") ;
155 my $keep = $magic . $buffer ;
156 *$self->{HeaderPending} = $keep ;
158 # now split out the various parts
159 my ($cm, $flag, $mtime, $xfl, $os) = unpack("C C V C C", $buffer) ;
161 $cm == GZIP_CM_DEFLATED
162 or return $self->HeaderError("Not Deflate (CM is $cm)") ;
164 # check for use of reserved bits
165 return $self->HeaderError("Use of Reserved Bits in FLG field.")
166 if $flag & GZIP_FLG_RESERVED ;
170 if ($flag & GZIP_FLG_FEXTRA) {
172 $self->smartReadExact(\$buffer, GZIP_FEXTRA_HEADER_SIZE)
173 or return $self->TruncatedHeader("FEXTRA Length") ;
175 my ($XLEN) = unpack("v", $buffer) ;
176 $self->smartReadExact(\$EXTRA, $XLEN)
177 or return $self->TruncatedHeader("FEXTRA Body");
178 $keep .= $buffer . $EXTRA ;
180 if ($XLEN && *$self->{'ParseExtra'}) {
182 while ($offset < $XLEN) {
184 return $self->TruncatedHeader("FEXTRA Body")
185 if $offset + GZIP_FEXTRA_SUBFIELD_HEADER_SIZE > $XLEN ;
187 my $id = substr($EXTRA, $offset, GZIP_FEXTRA_SUBFIELD_ID_SIZE);
188 $offset += GZIP_FEXTRA_SUBFIELD_ID_SIZE ;
190 return $self->HeaderError("SubField ID 2nd byte is 0x00")
191 if *$self->{Strict} && substr($id, 1, 1) eq "\x00" ;
193 my ($subLen) = unpack("v", substr($EXTRA, $offset,
194 GZIP_FEXTRA_SUBFIELD_LEN_SIZE)) ;
195 $offset += GZIP_FEXTRA_SUBFIELD_LEN_SIZE ;
197 return $self->TruncatedHeader("FEXTRA Body")
198 if $offset + $subLen > $XLEN ;
200 push @EXTRA, [$id => substr($EXTRA, $offset, $subLen)];
207 if ($flag & GZIP_FLG_FNAME) {
210 $self->smartReadExact(\$buffer, 1)
211 or return $self->TruncatedHeader("FNAME");
212 last if $buffer eq GZIP_NULL_BYTE ;
215 $keep .= $origname . GZIP_NULL_BYTE ;
217 return $self->HeaderError("Non ISO 8859-1 Character found in Name")
218 if *$self->{Strict} && $origname =~ /$GZIP_FNAME_INVALID_CHAR_RE/o ;
222 if ($flag & GZIP_FLG_FCOMMENT) {
225 $self->smartReadExact(\$buffer, 1)
226 or return $self->TruncatedHeader("FCOMMENT");
227 last if $buffer eq GZIP_NULL_BYTE ;
230 $keep .= $comment . GZIP_NULL_BYTE ;
232 return $self->HeaderError("Non ISO 8859-1 Character found in Comment")
233 if *$self->{Strict} && $comment =~ /$GZIP_FCOMMENT_INVALID_CHAR_RE/o ;
236 if ($flag & GZIP_FLG_FHCRC) {
237 $self->smartReadExact(\$buffer, GZIP_FHCRC_SIZE)
238 or return $self->TruncatedHeader("FHCRC");
240 $HeaderCRC = unpack("v", $buffer) ; # 16-bit little-endian value stored in the header
241 my $crc16 = crc32($keep) & 0xFFFF ; # RFC 1952: CRC16 is the two least-significant bytes of the CRC32 of the header read so far
243 return $self->HeaderError("CRC16 mismatch.")
244 if *$self->{Strict} && $crc16 != $HeaderCRC;
249 # Assume compression method is deflated for xfl tests
253 *$self->{Type} = 'rfc1952';
257 'FingerprintLength' => 2,
258 'HeaderLength' => length $keep,
259 'TrailerLength' => GZIP_TRAILER_SIZE,
261 'isMinimalHeader' => $keep eq GZIP_MINIMUM_HEADER ? 1 : 0,
264 'MethodName' => $cm == GZIP_CM_DEFLATED ? "Deflated" : "Unknown" ,
265 'TextFlag' => $flag & GZIP_FLG_FTEXT ? 1 : 0,
266 'HeaderCRCFlag' => $flag & GZIP_FLG_FHCRC ? 1 : 0,
267 'NameFlag' => $flag & GZIP_FLG_FNAME ? 1 : 0,
268 'CommentFlag' => $flag & GZIP_FLG_FCOMMENT ? 1 : 0,
269 'ExtraFlag' => $flag & GZIP_FLG_FEXTRA ? 1 : 0,
271 'Comment' => $comment,
274 'OsName' => defined $GZIP_OS_Names{$os}
275 ? $GZIP_OS_Names{$os} : "Unknown",
276 'HeaderCRC' => $HeaderCRC,
278 'ExtraFlags' => $xfl,
279 'ExtraFieldRaw' => $EXTRA,
280 'ExtraField' => [ @EXTRA ],
283 #'CompSize'=> $compsize,
285 #'OrigSize'=> $ISIZE,
297 IO::Uncompress::Gunzip - Perl interface to read RFC 1952 files/buffers
301 use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ;
303 my $status = gunzip $input => $output [,OPTS]
304 or die "gunzip failed: $GunzipError\n";
306 my $z = new IO::Uncompress::Gunzip $input [OPTS]
307 or die "gunzip failed: $GunzipError\n";
309 $status = $z->read($buffer)
310 $status = $z->read($buffer, $length)
311 $status = $z->read($buffer, $length, $offset)
312 $line = $z->getline()
315 $status = $z->inflateSync()
317 $data = $z->getHeaderInfo()
319 $z->seek($position, $whence)
331 read($z, $buffer, $length);
332 read($z, $buffer, $length, $offset);
334 seek($z, $position, $whence)
345 B<WARNING -- This is a Beta release>.
349 =item * DO NOT use in production code.
351 =item * The documentation is incomplete in places.
353 =item * Parts of the interface defined here are tentative.
355 =item * Please report any problems you find.
363 This module provides a Perl interface that allows the reading of
364 files/buffers that conform to RFC 1952.
366 For writing RFC 1952 files/buffers, see the companion module IO::Compress::Gzip.
370 =head1 Functional Interface
372 A top-level function, C<gunzip>, is provided to carry out
373 "one-shot" uncompression between buffers and/or files. For finer
374 control over the uncompression process, see the L</"OO Interface">
377 use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ;
379 gunzip $input => $output [,OPTS]
380 or die "gunzip failed: $GunzipError\n";
384 The functional interface needs Perl5.005 or better.
387 =head2 gunzip $input => $output [, OPTS]
390 C<gunzip> expects at least two parameters, C<$input> and C<$output>.
392 =head3 The C<$input> parameter
394 The parameter, C<$input>, is used to define the source of
397 It can take one of the following forms:
403 If the C<$input> parameter is a simple scalar, it is assumed to be a
404 filename. This file will be opened for reading and the input data
405 will be read from it.
409 If the C<$input> parameter is a filehandle, the input data will be
411 The string '-' can be used as an alias for standard input.
413 =item A scalar reference
415 If C<$input> is a scalar reference, the input data will be read
418 =item An array reference
420 If C<$input> is an array reference, each element in the array must be a
423 The input data will be read from each file in turn.
425 The complete array will be walked to ensure that it only
426 contains valid filenames before any data is uncompressed.
430 =item An Input FileGlob string
432 If C<$input> is a string that is delimited by the characters "<" and ">"
433 C<gunzip> will assume that it is an I<input fileglob string>. The
434 input is the list of files that match the fileglob.
436 If the fileglob does not match any files ...
438 See L<File::GlobMapper|File::GlobMapper> for more details.
443 If the C<$input> parameter is any other type, C<undef> will be returned.
447 =head3 The C<$output> parameter
449 The parameter C<$output> is used to control the destination of the
450 uncompressed data. This parameter can take one of these forms.
456 If the C<$output> parameter is a simple scalar, it is assumed to be a
457 filename. This file will be opened for writing and the uncompressed
458 data will be written to it.
462 If the C<$output> parameter is a filehandle, the uncompressed data
463 will be written to it.
464 The string '-' can be used as an alias for standard output.
467 =item A scalar reference
469 If C<$output> is a scalar reference, the uncompressed data will be
470 stored in C<$$output>.
474 =item An Array Reference
476 If C<$output> is an array reference, the uncompressed data will be
477 pushed onto the array.
479 =item An Output FileGlob
481 If C<$output> is a string that is delimited by the characters "<" and ">"
482 C<gunzip> will assume that it is an I<output fileglob string>. The
483 output is the list of files that match the fileglob.
485 When C<$output> is a fileglob string, C<$input> must also be a fileglob
486 string. Anything else is an error.
490 If the C<$output> parameter is any other type, C<undef> will be returned.
496 When C<$input> maps to multiple files/buffers and C<$output> is a single
497 file/buffer the uncompressed input files/buffers will all be stored
498 in C<$output> as a single uncompressed stream.
502 =head2 Optional Parameters
504 Unless specified below, the optional parameters for C<gunzip>,
505 C<OPTS>, are the same as those used with the OO interface defined in the
506 L</"Constructor Options"> section below.
510 =item AutoClose =E<gt> 0|1
512 This option applies to any input or output data streams to
513 C<gunzip> that are filehandles.
515 If C<AutoClose> is specified, and the value is true, it will result in all
516 input and/or output filehandles being closed once C<gunzip> has
519 This parameter defaults to 0.
523 =item BinModeOut =E<gt> 0|1
525 When writing to a file or filehandle, set C<binmode> before writing to the
534 =item -Append =E<gt> 0|1
538 =item -MultiStream =E<gt> 0|1
540 Creates a new stream after each file.
553 To read the contents of the file C<file1.txt.gz> and write the
554 uncompressed data to the file C<file1.txt>.
558 use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ;
560 my $input = "file1.txt.gz";
561 my $output = "file1.txt";
562 gunzip $input => $output
563 or die "gunzip failed: $GunzipError\n";
566 To read from an existing Perl filehandle, C<$input>, and write the
567 uncompressed data to a buffer, C<$buffer>.
571 use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ;
574 my $input = new IO::File "<file1.txt.gz"
575 or die "Cannot open 'file1.txt.gz': $!\n" ;
577 gunzip $input => \$buffer
578 or die "gunzip failed: $GunzipError\n";
580 To uncompress all files in the directory "/my/home" that match "*.txt.gz" and store the uncompressed data in the same directory
584 use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ;
586 gunzip '</my/home/*.txt.gz>' => '</my/home/#1.txt>'
587 or die "gunzip failed: $GunzipError\n";
589 and if you want to uncompress each file one at a time, this will do the trick
593 use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ;
595 for my $input ( glob "/my/home/*.txt.gz" )
599 gunzip $input => $output
600 or die "Error uncompressing '$input': $GunzipError\n";
607 The format of the constructor for IO::Uncompress::Gunzip is shown below
610 my $z = new IO::Uncompress::Gunzip $input [OPTS]
611 or die "IO::Uncompress::Gunzip failed: $GunzipError\n";
613 Returns an C<IO::Uncompress::Gunzip> object on success and undef on failure.
614 The variable C<$GunzipError> will contain an error message on failure.
616 If you are running Perl 5.005 or better the object, C<$z>, returned from
617 IO::Uncompress::Gunzip can be used exactly like an L<IO::File|IO::File> filehandle.
618 This means that all normal input file operations can be carried out with
619 C<$z>. For example, to read a line from a compressed file/buffer you can
620 use either of these forms
622 $line = $z->getline();
625 The mandatory parameter C<$input> is used to determine the source of the
626 compressed data. This parameter can take one of three forms.
632 If the C<$input> parameter is a scalar, it is assumed to be a filename. This
633 file will be opened for reading and the compressed data will be read from it.
637 If the C<$input> parameter is a filehandle, the compressed data will be
639 The string '-' can be used as an alias for standard input.
642 =item A scalar reference
644 If C<$input> is a scalar reference, the compressed data will be read from
649 =head2 Constructor Options
652 The option names defined below are case insensitive and can be optionally
653 prefixed by a '-'. So all of the following are valid
660 OPTS is a combination of the following options:
664 =item -AutoClose =E<gt> 0|1
666 This option is only valid when the C<$input> parameter is a filehandle. If
667 specified, and the value is true, it will result in the file being closed once
668 either the C<close> method is called or the IO::Uncompress::Gunzip object is
671 This parameter defaults to 0.
673 =item -MultiStream =E<gt> 0|1
677 Allows multiple concatenated compressed streams to be treated as a single
678 compressed stream. Decompression will stop once either the end of the
679 file/buffer is reached, an error is encountered (premature eof, corrupt
680 compressed data) or the end of a stream is not immediately followed by the
681 start of another stream.
683 This parameter defaults to 0.
687 =item -Prime =E<gt> $string
689 This option will uncompress the contents of C<$string> before processing the
692 This option can be useful when the compressed data is embedded in another
693 file/data structure and it is not possible to work out where the compressed
694 data begins without having to read the first few bytes. If this is the
695 case, the uncompression can be I<primed> with these bytes using this
698 =item -Transparent =E<gt> 0|1
700 If this option is set and the input file or buffer is not compressed data,
701 the module will allow reading of it anyway.
703 This option defaults to 1.
705 =item -BlockSize =E<gt> $num
707 When reading the compressed input data, IO::Uncompress::Gunzip will read it in
708 blocks of C<$num> bytes.
710 This option defaults to 4096.
712 =item -InputLength =E<gt> $size
714 When present this option will limit the number of compressed bytes read
715 from the input file/buffer to C<$size>. This option can be used in the
716 situation where there is useful data directly after the compressed data
717 stream and you know beforehand the exact length of the compressed data
720 This option is mostly used when reading from a filehandle, in which case
721 the file pointer will be left pointing to the first byte directly after the
722 compressed data stream.
726 This option defaults to off.
728 =item -Append =E<gt> 0|1
730 This option controls what the C<read> method does with uncompressed data.
732 If set to 1, all uncompressed data will be appended to the output parameter
733 of the C<read> method.
735 If set to 0, the contents of the output parameter of the C<read> method
736 will be overwritten by the uncompressed data.
740 =item -Strict =E<gt> 0|1
744 This option controls whether the extra checks defined below are used when
745 carrying out the decompression. When Strict is on, the extra tests are
746 carried out, when Strict is off they are not.
748 The default for this option is off.
762 If the FHCRC bit is set in the gzip FLG header byte, the CRC16 bytes in the
763 header must match the crc16 value of the gzip header actually read.
767 If the gzip header contains a name field (FNAME) it consists solely of ISO
772 If the gzip header contains a comment field (FCOMMENT) it consists solely
773 of ISO 8859-1 characters plus line-feed.
777 If the gzip FEXTRA header field is present it must conform to the sub-field
778 structure as defined in RFC1952.
782 The CRC32 and ISIZE trailer fields must be present.
786 The value of the CRC32 field read must match the crc32 value of the
787 uncompressed data actually contained in the gzip file.
791 The value of the ISIZE field read must match the length of the
792 uncompressed data actually read from the file.
801 =item -ParseExtra =E<gt> 0|1
803 If the gzip FEXTRA header field is present and this option is set, it will
804 force the module to check that it conforms to the sub-field structure as
807 If the C<Strict> option is on, it will automatically enable this option.
825 $status = $z->read($buffer)
827 Reads a block of compressed data (the size of the compressed block is
828 determined by the C<BlockSize> option in the constructor), uncompresses it and
829 writes any uncompressed data into C<$buffer>. If the C<Append> parameter is
830 set in the constructor, the uncompressed data will be appended to the
831 C<$buffer> parameter. Otherwise C<$buffer> will be overwritten.
833 Returns the number of uncompressed bytes written to C<$buffer>, zero if eof
834 or a negative number on error.
840 $status = $z->read($buffer, $length)
841 $status = $z->read($buffer, $length, $offset)
843 $status = read($z, $buffer, $length)
844 $status = read($z, $buffer, $length, $offset)
846 Attempt to read C<$length> bytes of uncompressed data into C<$buffer>.
848 The main difference between this form of the C<read> method and the
849 previous one, is that this one will attempt to return I<exactly> C<$length>
850 bytes. The only circumstances in which this function will not do so are when
851 end-of-file or an IO error is encountered.
853 Returns the number of uncompressed bytes written to C<$buffer>, zero if eof
854 or a negative number on error.
861 $line = $z->getline()
866 This method fully supports the use of the variable C<$/>
867 (or C<$INPUT_RECORD_SEPARATOR> or C<$RS> when C<English> is in use) to
868 determine what constitutes an end of line. Both paragraph mode and file
869 slurp mode are supported.
878 Read a single character.
884 $char = $z->ungetc($string)
891 $status = $z->inflateSync()
899 $hdr = $z->getHeaderInfo();
900 @hdrs = $z->getHeaderInfo();
902 This method returns either a hash reference (in scalar context) or a list
903 of hash references (in array context) that contains information about each
904 of the header fields in the compressed data stream(s).
912 The contents of the Name header field, if present. If no name is
913 present, the value will be undef. Note this is different from a zero length
914 name, which will return an empty string.
918 The contents of the Comment header field, if present. If no comment is
919 present, the value will be undef. Note this is different from a zero length
920 comment, which will return an empty string.
934 Returns the uncompressed file offset.
945 Returns true if the end of the compressed input stream has been reached.
951 $z->seek($position, $whence);
952 seek($z, $position, $whence);
957 Provides a sub-set of the C<seek> functionality, with the restriction
958 that it is only legal to seek forward in the input file/buffer.
959 It is a fatal error to attempt to seek backward.
963 The C<$whence> parameter takes one of the usual values, namely SEEK_SET,
964 SEEK_CUR or SEEK_END.
966 Returns 1 on success, 0 on failure.
975 This is a noop provided for completeness.
982 If the C<$z> object is associated with a file, this method will return
983 the underlying filehandle.
985 If the C<$z> object is associated with a buffer, this method will
995 Closes the input file/buffer.
999 For most versions of Perl this method will be automatically invoked if
1000 the IO::Uncompress::Gunzip object is destroyed (either explicitly or by the
1001 variable with the reference to the object going out of scope). The
1002 exceptions are Perl versions 5.005 through 5.00504 and 5.8.0. In
1003 these cases, the C<close> method will be called automatically, but
1004 not until global destruction of all live objects when the program is
1007 Therefore, if you want your scripts to be able to run on all versions
1008 of Perl, you should call C<close> explicitly and not rely on automatic
1011 Returns true on success, otherwise 0.
1013 If the C<AutoClose> option has been enabled when the IO::Uncompress::Gunzip
1014 object was created, and the object is associated with a file, the
1015 underlying file will also be closed.
1022 No symbolic constants are required by IO::Uncompress::Gunzip at present.
1028 Imports C<gunzip> and C<$GunzipError>.
1031 use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ;
1042 L<Compress::Zlib>, L<IO::Compress::Gzip>, L<IO::Compress::Deflate>, L<IO::Uncompress::Inflate>, L<IO::Compress::RawDeflate>, L<IO::Uncompress::RawInflate>, L<IO::Uncompress::AnyInflate>
1044 L<Compress::Zlib::FAQ|Compress::Zlib::FAQ>
1046 L<File::GlobMapper|File::GlobMapper>, L<Archive::Zip|Archive::Zip>, L<Archive::Tar|Archive::Tar>,
1047 L<IO::Zlib|IO::Zlib>
1049 For RFC 1950, 1951 and 1952 see
1050 F<http://www.faqs.org/rfcs/rfc1950.html>,
1051 F<http://www.faqs.org/rfcs/rfc1951.html> and
1052 F<http://www.faqs.org/rfcs/rfc1952.html>
1054 The primary site for the gzip program is F<http://www.gzip.org>.
1058 The I<IO::Uncompress::Gunzip> module was written by Paul Marquess,
1059 F<pmqs@cpan.org>. The latest copy of the module can be
1060 found on CPAN in F<modules/by-module/Compress/Compress-Zlib-x.x.tar.gz>.
1062 The I<zlib> compression library was written by Jean-loup Gailly
1063 F<gzip@prep.ai.mit.edu> and Mark Adler F<madler@alumni.caltech.edu>.
1065 The primary site for the I<zlib> compression library is
1066 F<http://www.zlib.org>.
1068 =head1 MODIFICATION HISTORY
1070 See the Changes file.
1072 =head1 COPYRIGHT AND LICENSE
1075 Copyright (c) 2005-2006 Paul Marquess. All rights reserved.
1076 This program is free software; you can redistribute it and/or
1077 modify it under the same terms as Perl itself.