2 package IO::Uncompress::Gunzip ;
12 use IO::Uncompress::RawInflate ;
14 use Compress::Raw::Zlib qw( crc32 ) ;
15 use IO::Compress::Base::Common qw(:Status createSelfTiedObject);
16 use IO::Compress::Gzip::Constants;
17 use IO::Compress::Zlib::Extra;
21 our ($VERSION, @ISA, @EXPORT_OK, %EXPORT_TAGS, $GunzipError);
23 @ISA = qw( Exporter IO::Uncompress::RawInflate );
24 @EXPORT_OK = qw( $GunzipError gunzip );
25 %EXPORT_TAGS = %IO::Uncompress::RawInflate::DEFLATE_CONSTANTS ;
26 push @{ $EXPORT_TAGS{all} }, @EXPORT_OK ;
27 Exporter::export_ok_tags('all');
31 $VERSION = '2.000_12';
37 my $obj = createSelfTiedObject($class, \$GunzipError);
39 $obj->_create(undef, 0, @_);
44 my $obj = createSelfTiedObject(undef, \$GunzipError);
45 return $obj->_inf(@_) ;
50 use IO::Compress::Base::Common qw(:Parse);
51 return ( 'ParseExtra' => [1, 1, Parse_boolean, 0] ) ;
59 # gunzip always needs crc32
60 $got->value('CRC32' => 1);
70 $self->smartReadExact(\$magic, GZIP_ID_SIZE);
72 *$self->{HeaderPending} = $magic ;
74 return $self->HeaderError("Minimum header size is " .
75 GZIP_MIN_HEADER_SIZE . " bytes")
76 if length $magic != GZIP_ID_SIZE ;
78 return $self->HeaderError("Bad Magic")
79 if ! isGzipMagic($magic) ;
81 *$self->{Type} = 'rfc1952';
91 return $self->_readGzipHeader($magic);
100 my ($CRC32, $ISIZE) = unpack("V V", $trailer) ;
101 *$self->{Info}{CRC32} = $CRC32;
102 *$self->{Info}{ISIZE} = $ISIZE;
104 if (*$self->{Strict}) {
105 return $self->TrailerError("CRC mismatch")
106 if $CRC32 != *$self->{Uncomp}->crc32() ;
108 my $exp_isize = *$self->{Uncomp}->uncompressedBytes();
109 return $self->TrailerError("ISIZE mismatch. Got $ISIZE"
110 . ", expected $exp_isize")
111 if $ISIZE != $exp_isize ;
120 return 0 if length $buffer < GZIP_ID_SIZE ;
121 my ($id1, $id2) = unpack("C C", $buffer) ;
122 return $id1 == GZIP_ID1 && $id2 == GZIP_ID2 ;
# Read and validate a complete gzip header, starting with the magic bytes.
#
# Reads GZIP_ID_SIZE bytes, records them as the pending header, rejects
# input that is too short or does not pass isGzipMagic(), then hands the
# remaining parsing to _readGzipHeader().
# NOTE(review): several lines of this sub (including its final return) are
# not visible in this excerpt -- confirm the return value against the full file.
125 sub _readFullGzipHeader($)
130 $self->smartReadExact(\$magic, GZIP_ID_SIZE);
# Remember what has been consumed so far in the object's HeaderPending slot.
132 *$self->{HeaderPending} = $magic ;
# Fewer than GZIP_ID_SIZE bytes available: the input cannot hold a gzip header.
134 return $self->HeaderError("Minimum header size is " .
135 GZIP_MIN_HEADER_SIZE . " bytes")
136 if length $magic != GZIP_ID_SIZE ;
139 return $self->HeaderError("Bad Magic")
140 if ! isGzipMagic($magic) ;
142 my $status = $self->_readGzipHeader($magic);
# An undefined status from _readGzipHeader() removes the Transparent entry
# from the object (presumably so a malformed gzip header is not passed
# through as plain data -- TODO confirm).
143 delete *$self->{Transparent} if ! defined $status ;
# Parse the remainder of a gzip header once the magic bytes have been read.
#
# Parameters:
#   $self  - the uncompression object
#   $magic - the magic bytes already consumed by the caller
#
# Each failure path returns the result of HeaderError() or TruncatedHeader().
# NOTE(review): this excerpt omits several lines of the sub (variable
# declarations, loop headers, closing braces); comments below describe only
# the statements that are visible.
147 sub _readGzipHeader($)
149 my ($self, $magic) = @_ ;
# Read the rest of the fixed-size part of the header (minimum header size
# minus the ID bytes the caller already consumed).
153 $self->smartReadExact(\$buffer, GZIP_MIN_HEADER_SIZE - GZIP_ID_SIZE)
154 or return $self->HeaderError("Minimum header size is " .
155 GZIP_MIN_HEADER_SIZE . " bytes") ;
# $keep accumulates the raw header bytes read so far; it is also stored as
# the pending header on the object.
157 my $keep = $magic . $buffer ;
158 *$self->{HeaderPending} = $keep ;
160 # now split out the various parts
# Template "C C V C C": four single unsigned bytes (cm, flag, xfl, os) with a
# 32-bit little-endian value (mtime) in the third position.
161 my ($cm, $flag, $mtime, $xfl, $os) = unpack("C C V C C", $buffer) ;
# Any compression method other than GZIP_CM_DEFLATED is rejected.
163 $cm == GZIP_CM_DEFLATED
164 or return $self->HeaderError("Not Deflate (CM is $cm)") ;
166 # check for use of reserved bits
167 return $self->HeaderError("Use of Reserved Bits in FLG field.")
168 if $flag & GZIP_FLG_RESERVED ;
# Optional FEXTRA field: a length prefix (unpacked as a 16-bit little-endian
# integer, XLEN) followed by XLEN bytes of extra data.
172 if ($flag & GZIP_FLG_FEXTRA) {
174 $self->smartReadExact(\$buffer, GZIP_FEXTRA_HEADER_SIZE)
175 or return $self->TruncatedHeader("FEXTRA Length") ;
177 my ($XLEN) = unpack("v", $buffer) ;
178 $self->smartReadExact(\$EXTRA, $XLEN)
179 or return $self->TruncatedHeader("FEXTRA Body");
180 $keep .= $buffer . $EXTRA ;
# The sub-field structure is only parsed when the extra field is non-empty
# and the ParseExtra setting is on; a true result from parseRawExtra is
# reported as a header error.
182 if ($XLEN && *$self->{'ParseExtra'}) {
183 my $bad = IO::Compress::Zlib::Extra::parseRawExtra($EXTRA,
185 return $self->HeaderError($bad)
# Optional FNAME field: bytes are read one at a time until GZIP_NULL_BYTE is
# seen (the loop header and the append to $origname are not visible here).
191 if ($flag & GZIP_FLG_FNAME) {
194 $self->smartReadExact(\$buffer, 1)
195 or return $self->TruncatedHeader("FNAME");
196 last if $buffer eq GZIP_NULL_BYTE ;
199 $keep .= $origname . GZIP_NULL_BYTE ;
# In Strict mode the name must not match the invalid-character pattern.
201 return $self->HeaderError("Non ISO 8859-1 Character found in Name")
202 if *$self->{Strict} && $origname =~ /$GZIP_FNAME_INVALID_CHAR_RE/o ;
# Optional FCOMMENT field: handled the same way as FNAME, terminated by
# GZIP_NULL_BYTE and character-checked only in Strict mode.
206 if ($flag & GZIP_FLG_FCOMMENT) {
209 $self->smartReadExact(\$buffer, 1)
210 or return $self->TruncatedHeader("FCOMMENT");
211 last if $buffer eq GZIP_NULL_BYTE ;
214 $keep .= $comment . GZIP_NULL_BYTE ;
216 return $self->HeaderError("Non ISO 8859-1 Character found in Comment")
217 if *$self->{Strict} && $comment =~ /$GZIP_FCOMMENT_INVALID_CHAR_RE/o ;
# Optional header CRC (FHCRC). RFC 1952 defines CRC16 as the two least
# significant bytes of the CRC32 of all header bytes that precede the field;
# $keep holds exactly those bytes at this point.
220 if ($flag & GZIP_FLG_FHCRC) {
221 $self->smartReadExact(\$buffer, GZIP_FHCRC_SIZE)
222 or return $self->TruncatedHeader("FHCRC");
# The stored value is unpacked as a 16-bit little-endian integer.
224 $HeaderCRC = unpack("v", $buffer) ;
# Keep the low 16 bits of the CRC32 so the comparison below is like-for-like
# with the 16-bit stored value; an 8-bit mask would reject valid headers.
225 my $crc16 = crc32($keep) & 0xFFFF ;
# A mismatch is only treated as an error in Strict mode.
227 return $self->HeaderError("CRC16 mismatch.")
228 if *$self->{Strict} && $crc16 != $HeaderCRC;
233 # Assume compression method is deflated for xfl tests
# Record the stream type on the object.
237 *$self->{Type} = 'rfc1952';
# Key/value pairs describing the header that was just parsed.
# NOTE(review): the enclosing expression is not visible in this excerpt --
# presumably this is the header-info hash handed back to the caller; confirm
# against the full file.
241 'FingerprintLength' => 2,
# Total number of raw header bytes accumulated in $keep.
242 'HeaderLength' => length $keep,
243 'TrailerLength' => GZIP_TRAILER_SIZE,
# True (1) only when the raw header bytes equal GZIP_MINIMUM_HEADER.
245 'isMinimalHeader' => $keep eq GZIP_MINIMUM_HEADER ? 1 : 0,
248 'MethodName' => $cm == GZIP_CM_DEFLATED ? "Deflated" : "Unknown" ,
# Each *Flag entry is the corresponding FLG bit normalised to 1 or 0.
249 'TextFlag' => $flag & GZIP_FLG_FTEXT ? 1 : 0,
250 'HeaderCRCFlag' => $flag & GZIP_FLG_FHCRC ? 1 : 0,
251 'NameFlag' => $flag & GZIP_FLG_FNAME ? 1 : 0,
252 'CommentFlag' => $flag & GZIP_FLG_FCOMMENT ? 1 : 0,
253 'ExtraFlag' => $flag & GZIP_FLG_FEXTRA ? 1 : 0,
255 'Comment' => $comment,
# OS codes without an entry in %GZIP_OS_Names are reported as "Unknown".
258 'OsName' => defined $GZIP_OS_Names{$os}
259 ? $GZIP_OS_Names{$os} : "Unknown",
260 'HeaderCRC' => $HeaderCRC,
262 'ExtraFlags' => $xfl,
263 'ExtraFieldRaw' => $EXTRA,
264 'ExtraField' => [ @EXTRA ],
267 #'CompSize'=> $compsize,
269 #'OrigSize'=> $ISIZE,
283 IO::Uncompress::Gunzip - Read RFC 1952 files/buffers
289 use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ;
291 my $status = gunzip $input => $output [,OPTS]
292 or die "gunzip failed: $GunzipError\n";
294 my $z = new IO::Uncompress::Gunzip $input [OPTS]
295 or die "gunzip failed: $GunzipError\n";
297 $status = $z->read($buffer)
298 $status = $z->read($buffer, $length)
299 $status = $z->read($buffer, $length, $offset)
300 $line = $z->getline()
305 $status = $z->inflateSync()
308 $data = $z->getHeaderInfo()
310 $z->seek($position, $whence)
322 read($z, $buffer, $length);
323 read($z, $buffer, $length, $offset);
325 seek($z, $position, $whence)
336 B<WARNING -- This is a Beta release>.
340 =item * DO NOT use in production code.
342 =item * The documentation is incomplete in places.
344 =item * Parts of the interface defined here are tentative.
346 =item * Please report any problems you find.
354 This module provides a Perl interface that allows the reading of
355 files/buffers that conform to RFC 1952.
357 For writing RFC 1952 files/buffers, see the companion module IO::Compress::Gzip.
364 =head1 Functional Interface
366 A top-level function, C<gunzip>, is provided to carry out
367 "one-shot" uncompression between buffers and/or files. For finer
368 control over the uncompression process, see the L</"OO Interface">
371 use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ;
373 gunzip $input => $output [,OPTS]
374 or die "gunzip failed: $GunzipError\n";
378 The functional interface needs Perl5.005 or better.
381 =head2 gunzip $input => $output [, OPTS]
384 C<gunzip> expects at least two parameters, C<$input> and C<$output>.
386 =head3 The C<$input> parameter
388 The parameter, C<$input>, is used to define the source of
391 It can take one of the following forms:
397 If the C<$input> parameter is a simple scalar, it is assumed to be a
398 filename. This file will be opened for reading and the input data
399 will be read from it.
403 If the C<$input> parameter is a filehandle, the input data will be
405 The string '-' can be used as an alias for standard input.
407 =item A scalar reference
409 If C<$input> is a scalar reference, the input data will be read
412 =item An array reference
414 If C<$input> is an array reference, each element in the array must be a
417 The input data will be read from each file in turn.
419 The complete array will be walked to ensure that it only
420 contains valid filenames before any data is uncompressed.
424 =item An Input FileGlob string
426 If C<$input> is a string that is delimited by the characters "<" and ">"
427 C<gunzip> will assume that it is an I<input fileglob string>. The
428 input is the list of files that match the fileglob.
430 If the fileglob does not match any files ...
432 See L<File::GlobMapper|File::GlobMapper> for more details.
437 If the C<$input> parameter is any other type, C<undef> will be returned.
441 =head3 The C<$output> parameter
443 The parameter C<$output> is used to control the destination of the
444 uncompressed data. This parameter can take one of these forms.
450 If the C<$output> parameter is a simple scalar, it is assumed to be a
451 filename. This file will be opened for writing and the uncompressed
452 data will be written to it.
456 If the C<$output> parameter is a filehandle, the uncompressed data
457 will be written to it.
458 The string '-' can be used as an alias for standard output.
461 =item A scalar reference
463 If C<$output> is a scalar reference, the uncompressed data will be
464 stored in C<$$output>.
468 =item An Array Reference
470 If C<$output> is an array reference, the uncompressed data will be
471 pushed onto the array.
473 =item An Output FileGlob
475 If C<$output> is a string that is delimited by the characters "<" and ">"
476 C<gunzip> will assume that it is an I<output fileglob string>. The
477 output is the list of files that match the fileglob.
479 When C<$output> is a fileglob string, C<$input> must also be a fileglob
480 string. Anything else is an error.
484 If the C<$output> parameter is any other type, C<undef> will be returned.
491 When C<$input> maps to multiple compressed files/buffers and C<$output> is
492 a single file/buffer, after uncompression C<$output> will contain a
493 concatenation of all the uncompressed data from each of the input
500 =head2 Optional Parameters
502 Unless specified below, the optional parameters for C<gunzip>,
503 C<OPTS>, are the same as those used with the OO interface defined in the
504 L</"Constructor Options"> section below.
508 =item AutoClose =E<gt> 0|1
510 This option applies to any input or output data streams to
511 C<gunzip> that are filehandles.
513 If C<AutoClose> is specified, and the value is true, it will result in all
514 input and/or output filehandles being closed once C<gunzip> has
517 This parameter defaults to 0.
521 =item BinModeOut =E<gt> 0|1
523 When writing to a file or filehandle, set C<binmode> before writing to the
532 =item -Append =E<gt> 0|1
536 =item -MultiStream =E<gt> 0|1
538 Creates a new stream after each file.
551 To read the contents of the file C<file1.txt.gz> and write the
552 uncompressed data to the file C<file1.txt>.
556 use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ;
558 my $input = "file1.txt.gz";
559 my $output = "file1.txt";
560 gunzip $input => $output
561 or die "gunzip failed: $GunzipError\n";
564 To read from an existing Perl filehandle, C<$input>, and write the
565 uncompressed data to a buffer, C<$buffer>.
569 use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ;
572 my $input = new IO::File "<file1.txt.gz"
573 or die "Cannot open 'file1.txt.gz': $!\n" ;
575 gunzip $input => \$buffer
576 or die "gunzip failed: $GunzipError\n";
578 To uncompress all files in the directory "/my/home" that match "*.txt.gz" and store the uncompressed data in the same directory
582 use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ;
584 gunzip '</my/home/*.txt.gz>' => '</my/home/#1.txt>'
585 or die "gunzip failed: $GunzipError\n";
587 and if you want to uncompress each file one at a time, this will do the trick
591 use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ;
593 for my $input ( glob "/my/home/*.txt.gz" )
597 gunzip $input => $output
598 or die "Error uncompressing '$input': $GunzipError\n";
605 The format of the constructor for IO::Uncompress::Gunzip is shown below
608 my $z = new IO::Uncompress::Gunzip $input [OPTS]
609 or die "IO::Uncompress::Gunzip failed: $GunzipError\n";
611 Returns an C<IO::Uncompress::Gunzip> object on success and undef on failure.
612 The variable C<$GunzipError> will contain an error message on failure.
614 If you are running Perl 5.005 or better the object, C<$z>, returned from
615 IO::Uncompress::Gunzip can be used exactly like an L<IO::File|IO::File> filehandle.
616 This means that all normal input file operations can be carried out with
617 C<$z>. For example, to read a line from a compressed file/buffer you can
618 use either of these forms
620 $line = $z->getline();
623 The mandatory parameter C<$input> is used to determine the source of the
624 compressed data. This parameter can take one of three forms.
630 If the C<$input> parameter is a scalar, it is assumed to be a filename. This
631 file will be opened for reading and the compressed data will be read from it.
635 If the C<$input> parameter is a filehandle, the compressed data will be
637 The string '-' can be used as an alias for standard input.
640 =item A scalar reference
642 If C<$input> is a scalar reference, the compressed data will be read from
647 =head2 Constructor Options
650 The option names defined below are case insensitive and can be optionally
651 prefixed by a '-'. So all of the following are valid
658 OPTS is a combination of the following options:
662 =item -AutoClose =E<gt> 0|1
664 This option is only valid when the C<$input> parameter is a filehandle. If
665 specified, and the value is true, it will result in the file being closed once
666 either the C<close> method is called or the IO::Uncompress::Gunzip object is
669 This parameter defaults to 0.
671 =item -MultiStream =E<gt> 0|1
675 Allows multiple concatenated compressed streams to be treated as a single
676 compressed stream. Decompression will stop once either the end of the
677 file/buffer is reached, an error is encountered (premature eof, corrupt
678 compressed data) or the end of a stream is not immediately followed by the
679 start of another stream.
681 This parameter defaults to 0.
685 =item -Prime =E<gt> $string
687 This option will uncompress the contents of C<$string> before processing the
690 This option can be useful when the compressed data is embedded in another
691 file/data structure and it is not possible to work out where the compressed
692 data begins without having to read the first few bytes. If this is the
693 case, the uncompression can be I<primed> with these bytes using this
696 =item -Transparent =E<gt> 0|1
698 If this option is set and the input file or buffer is not compressed data,
699 the module will allow reading of it anyway.
701 This option defaults to 1.
703 =item -BlockSize =E<gt> $num
705 When reading the compressed input data, IO::Uncompress::Gunzip will read it in
706 blocks of C<$num> bytes.
708 This option defaults to 4096.
710 =item -InputLength =E<gt> $size
712 When present this option will limit the number of compressed bytes read
713 from the input file/buffer to C<$size>. This option can be used in the
714 situation where there is useful data directly after the compressed data
715 stream and you know beforehand the exact length of the compressed data
718 This option is mostly used when reading from a filehandle, in which case
719 the file pointer will be left pointing to the first byte directly after the
720 compressed data stream.
724 This option defaults to off.
726 =item -Append =E<gt> 0|1
728 This option controls what the C<read> method does with uncompressed data.
730 If set to 1, all uncompressed data will be appended to the output parameter
731 of the C<read> method.
733 If set to 0, the contents of the output parameter of the C<read> method
734 will be overwritten by the uncompressed data.
738 =item -Strict =E<gt> 0|1
742 This option controls whether the extra checks defined below are used when
743 carrying out the decompression. When Strict is on, the extra tests are
744 carried out, when Strict is off they are not.
746 The default for this option is off.
760 If the FHCRC bit is set in the gzip FLG header byte, the CRC16 bytes in the
761 header must match the crc16 value of the gzip header actually read.
765 If the gzip header contains a name field (FNAME) it consists solely of ISO
770 If the gzip header contains a comment field (FCOMMENT) it consists solely
771 of ISO 8859-1 characters plus line-feed.
775 If the gzip FEXTRA header field is present it must conform to the sub-field
776 structure as defined in RFC 1952.
780 The CRC32 and ISIZE trailer fields must be present.
784 The value of the CRC32 field read must match the crc32 value of the
785 uncompressed data actually contained in the gzip file.
789 The value of the ISIZE field read must match the length of the
790 uncompressed data actually read from the file.
799 =item -ParseExtra =E<gt> 0|1
801 If the gzip FEXTRA header field is present and this option is set, it will
802 force the module to check that it conforms to the sub-field structure as
805 If C<Strict> is on, this option is enabled automatically.
825 $status = $z->read($buffer)
827 Reads a block of compressed data (the size of the compressed block is
828 determined by the C<BlockSize> option in the constructor), uncompresses it and
829 writes any uncompressed data into C<$buffer>. If the C<Append> parameter is
830 set in the constructor, the uncompressed data will be appended to the
831 C<$buffer> parameter. Otherwise C<$buffer> will be overwritten.
833 Returns the number of uncompressed bytes written to C<$buffer>, zero if eof
834 or a negative number on error.
840 $status = $z->read($buffer, $length)
841 $status = $z->read($buffer, $length, $offset)
843 $status = read($z, $buffer, $length)
844 $status = read($z, $buffer, $length, $offset)
846 Attempt to read C<$length> bytes of uncompressed data into C<$buffer>.
848 The main difference between this form of the C<read> method and the
849 previous one, is that this one will attempt to return I<exactly> C<$length>
850 bytes. The only circumstances in which it will not do so are when end-of-file
851 or an IO error is encountered.
853 Returns the number of uncompressed bytes written to C<$buffer>, zero if eof
854 or a negative number on error.
861 $line = $z->getline()
866 This method fully supports the use of the variable C<$/>
867 (or C<$INPUT_RECORD_SEPARATOR> or C<$RS> when C<English> is in use) to
868 determine what constitutes an end of line. Both paragraph mode and file
869 slurp mode are supported.
878 Read a single character.
884 $char = $z->ungetc($string)
892 $status = $z->inflateSync()
901 $hdr = $z->getHeaderInfo();
902 @hdrs = $z->getHeaderInfo();
904 This method returns either a hash reference (in scalar context) or a list
905 of hash references (in array context) that contains information about each
906 of the header fields in the compressed data stream(s).
914 The contents of the Name header field, if present. If no name is
915 present, the value will be undef. Note this is different from a zero length
916 name, which will return an empty string.
920 The contents of the Comment header field, if present. If no comment is
921 present, the value will be undef. Note this is different from a zero length
922 comment, which will return an empty string.
936 Returns the uncompressed file offset.
947 Returns true if the end of the compressed input stream has been reached.
953 $z->seek($position, $whence);
954 seek($z, $position, $whence);
959 Provides a sub-set of the C<seek> functionality, with the restriction
960 that it is only legal to seek forward in the input file/buffer.
961 It is a fatal error to attempt to seek backward.
965 The C<$whence> parameter takes one of the usual values, namely SEEK_SET,
966 SEEK_CUR or SEEK_END.
968 Returns 1 on success, 0 on failure.
977 This is a noop provided for completeness.
983 Returns true if the object currently refers to an opened file/buffer.
987 my $prev = $z->autoflush()
988 my $prev = $z->autoflush(EXPR)
990 If the C<$z> object is associated with a file or a filehandle, this method
991 returns the current autoflush setting for the underlying filehandle. If
992 C<EXPR> is present, and is non-zero, it will enable flushing after every
993 write/print operation.
995 If C<$z> is associated with a buffer, this method has no effect and always
998 B<Note> that the special variable C<$|> B<cannot> be used to set or
999 retrieve the autoflush setting.
1001 =head2 input_line_number
1003 $z->input_line_number()
1004 $z->input_line_number(EXPR)
1008 Returns the current uncompressed line number. If C<EXPR> is present it has
1009 the effect of setting the line number. Note that setting the line number
1010 does not change the current position within the file/buffer being read.
1012 The contents of C<$/> are used to determine what constitutes a line
1022 If the C<$z> object is associated with a file or a filehandle, this method
1023 will return the underlying file descriptor.
1025 If the C<$z> object is associated with a buffer, this method will
1035 Closes the input file/buffer.
1039 For most versions of Perl this method will be automatically invoked if
1040 the IO::Uncompress::Gunzip object is destroyed (either explicitly or by the
1041 variable with the reference to the object going out of scope). The
1042 exceptions are Perl versions 5.005 through 5.00504 and 5.8.0. In
1043 these cases, the C<close> method will be called automatically, but
1044 not until global destruction of all live objects when the program is
1047 Therefore, if you want your scripts to be able to run on all versions
1048 of Perl, you should call C<close> explicitly and not rely on automatic
1051 Returns true on success, otherwise 0.
1053 If the C<AutoClose> option has been enabled when the IO::Uncompress::Gunzip
1054 object was created, and the object is associated with a file, the
1055 underlying file will also be closed.
1062 No symbolic constants are required by IO::Uncompress::Gunzip at present.
1068 Imports C<gunzip> and C<$GunzipError>.
1071 use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ;
1082 L<Compress::Zlib>, L<IO::Compress::Gzip>, L<IO::Compress::Deflate>, L<IO::Uncompress::Inflate>, L<IO::Compress::RawDeflate>, L<IO::Uncompress::RawInflate>, L<IO::Compress::Bzip2>, L<IO::Uncompress::Bunzip2>, L<IO::Compress::Lzop>, L<IO::Uncompress::UnLzop>, L<IO::Uncompress::AnyInflate>, L<IO::Uncompress::AnyUncompress>
1084 L<Compress::Zlib::FAQ|Compress::Zlib::FAQ>
1086 L<File::GlobMapper|File::GlobMapper>, L<Archive::Zip|Archive::Zip>,
1087 L<Archive::Tar|Archive::Tar>,
1088 L<IO::Zlib|IO::Zlib>
1091 For RFC 1950, 1951 and 1952 see
1092 F<http://www.faqs.org/rfcs/rfc1950.html>,
1093 F<http://www.faqs.org/rfcs/rfc1951.html> and
1094 F<http://www.faqs.org/rfcs/rfc1952.html>
1096 The I<zlib> compression library was written by Jean-loup Gailly
1097 F<gzip@prep.ai.mit.edu> and Mark Adler F<madler@alumni.caltech.edu>.
1099 The primary site for the I<zlib> compression library is
1100 F<http://www.zlib.org>.
1102 The primary site for gzip is F<http://www.gzip.org>.
1109 This module was written by Paul Marquess, F<pmqs@cpan.org>.
1113 =head1 MODIFICATION HISTORY
1115 See the Changes file.
1117 =head1 COPYRIGHT AND LICENSE
1119 Copyright (c) 2005-2006 Paul Marquess. All rights reserved.
1121 This program is free software; you can redistribute it and/or
1122 modify it under the same terms as Perl itself.