cleaned up bench_spew_list to be more consistant in names and options
[urisagit/File-Slurp.git] / Slurp.pm
CommitLineData
635c7876 1package File::Slurp;
2
3use strict;
4
5use Carp ;
6use Fcntl qw( :DEFAULT ) ;
7use POSIX qw( :fcntl_h ) ;
8use Symbol ;
9
10use base 'Exporter' ;
11use vars qw( %EXPORT_TAGS @EXPORT_OK $VERSION @EXPORT ) ;
12
13%EXPORT_TAGS = ( 'all' => [
14 qw( read_file write_file overwrite_file append_file read_dir ) ] ) ;
15
16@EXPORT = ( @{ $EXPORT_TAGS{'all'} } );
17@EXPORT_OK = qw( slurp ) ;
18
19$VERSION = '9999.13';
20
21my $is_win32 = $^O =~ /win32/i ;
22
23# Install subs for various constants that aren't set in older perls
24# (< 5.005). Fcntl on old perls uses Exporter to define subs without a
25# () prototype These can't be overridden with the constant pragma or
26# we get a prototype mismatch. Hence this less than aesthetically
27# appealing BEGIN block:
28
29BEGIN {
30 unless( eval { defined SEEK_SET() } ) {
31 *SEEK_SET = sub { 0 };
32 *SEEK_CUR = sub { 1 };
33 *SEEK_END = sub { 2 };
34 }
35
36 unless( eval { defined O_BINARY() } ) {
37 *O_BINARY = sub { 0 };
38 *O_RDONLY = sub { 0 };
39 *O_WRONLY = sub { 1 };
40 }
41
42 unless ( eval { defined O_APPEND() } ) {
43
44 if ( $^O =~ /olaris/ ) {
45 *O_APPEND = sub { 8 };
46 *O_CREAT = sub { 256 };
47 *O_EXCL = sub { 1024 };
48 }
49 elsif ( $^O =~ /inux/ ) {
50 *O_APPEND = sub { 1024 };
51 *O_CREAT = sub { 64 };
52 *O_EXCL = sub { 128 };
53 }
54 elsif ( $^O =~ /BSD/i ) {
55 *O_APPEND = sub { 8 };
56 *O_CREAT = sub { 512 };
57 *O_EXCL = sub { 2048 };
58 }
59 }
60}
61
62# print "OS [$^O]\n" ;
63
64# print "O_BINARY = ", O_BINARY(), "\n" ;
65# print "O_RDONLY = ", O_RDONLY(), "\n" ;
66# print "O_WRONLY = ", O_WRONLY(), "\n" ;
67# print "O_APPEND = ", O_APPEND(), "\n" ;
68# print "O_CREAT ", O_CREAT(), "\n" ;
69# print "O_EXCL ", O_EXCL(), "\n" ;
70
71
72*slurp = \&read_file ;
73
74sub read_file {
75
76 my( $file_name, %args ) = @_ ;
77
78# set the buffer to either the passed in one or ours and init it to the null
79# string
80
81 my $buf ;
82 my $buf_ref = $args{'buf_ref'} || \$buf ;
83 ${$buf_ref} = '' ;
84
85 my( $read_fh, $size_left, $blk_size ) ;
86
87# check if we are reading from a handle (glob ref or IO:: object)
88
89 if ( ref $file_name ) {
90
91# slurping a handle so use it and don't open anything.
92# set the block size so we know it is a handle and read that amount
93
94 $read_fh = $file_name ;
95 $blk_size = $args{'blk_size'} || 1024 * 1024 ;
96 $size_left = $blk_size ;
97
98# DEEP DARK MAGIC. this checks the UNTAINT IO flag of a
99# glob/handle. only the DATA handle is untainted (since it is from
100# trusted data in the source file). this allows us to test if this is
101# the DATA handle and then to do a sysseek to make sure it gets
102# slurped correctly. on some systems, the buffered i/o pointer is not
103# left at the same place as the fd pointer. this sysseek makes them
104# the same so slurping with sysread will work.
105
106 eval{ require B } ;
107
108 if ( $@ ) {
109
110 @_ = ( \%args, <<ERR ) ;
111Can't find B.pm with this Perl: $!.
112That module is needed to slurp the DATA handle.
113ERR
114 goto &_error ;
115 }
116
117 if ( B::svref_2object( $read_fh )->IO->IoFLAGS & 16 ) {
118
119# set the seek position to the current tell.
120
121 sysseek( $read_fh, tell( $read_fh ), SEEK_SET ) ||
122 croak "sysseek $!" ;
123 }
124 }
125 else {
126
127# a regular file. set the sysopen mode
128
129 my $mode = O_RDONLY ;
130 $mode |= O_BINARY if $args{'binmode'} ;
131
132#printf "RD: BINARY %x MODE %x\n", O_BINARY, $mode ;
133
134# open the file and handle any error
135
136 $read_fh = gensym ;
137 unless ( sysopen( $read_fh, $file_name, $mode ) ) {
138 @_ = ( \%args, "read_file '$file_name' - sysopen: $!");
139 goto &_error ;
140 }
141
142# get the size of the file for use in the read loop
143
144 $size_left = -s $read_fh ;
145
146 unless( $size_left ) {
147
148 $blk_size = $args{'blk_size'} || 1024 * 1024 ;
149 $size_left = $blk_size ;
150 }
151 }
152
153# infinite read loop. we exit when we are done slurping
154
155 while( 1 ) {
156
157# do the read and see how much we got
158
159 my $read_cnt = sysread( $read_fh, ${$buf_ref},
160 $size_left, length ${$buf_ref} ) ;
161
162 if ( defined $read_cnt ) {
163
164# good read. see if we hit EOF (nothing left to read)
165
166 last if $read_cnt == 0 ;
167
168# loop if we are slurping a handle. we don't track $size_left then.
169
170 next if $blk_size ;
171
172# count down how much we read and loop if we have more to read.
173 $size_left -= $read_cnt ;
174 last if $size_left <= 0 ;
175 next ;
176 }
177
178# handle the read error
179
180 @_ = ( \%args, "read_file '$file_name' - sysread: $!");
181 goto &_error ;
182 }
183
184# fix up cr/lf to be a newline if this is a windows text file
185
186 ${$buf_ref} =~ s/\015\012/\n/g if $is_win32 && !$args{'binmode'} ;
187
188# this is the 5 returns in a row. each handles one possible
189# combination of caller context and requested return type
190
191 my $sep = $/ ;
192 $sep = '\n\n+' if defined $sep && $sep eq '' ;
193
194# caller wants to get an array ref of lines
195
196# this split doesn't work since it tries to use variable length lookbehind
197# the m// line works.
198# return [ split( m|(?<=$sep)|, ${$buf_ref} ) ] if $args{'array_ref'} ;
199 return [ length(${$buf_ref}) ? ${$buf_ref} =~ /(.*?$sep|.+)/sg : () ]
200 if $args{'array_ref'} ;
201
202# caller wants a list of lines (normal list context)
203
204# same problem with this split as before.
205# return split( m|(?<=$sep)|, ${$buf_ref} ) if wantarray ;
206 return length(${$buf_ref}) ? ${$buf_ref} =~ /(.*?$sep|.+)/sg : ()
207 if wantarray ;
208
209# caller wants a scalar ref to the slurped text
210
211 return $buf_ref if $args{'scalar_ref'} ;
212
213# caller wants a scalar with the slurped text (normal scalar context)
214
215 return ${$buf_ref} if defined wantarray ;
216
217# caller passed in an i/o buffer by reference (normal void context)
218
219 return ;
220}
221
222sub write_file {
223
224 my $file_name = shift ;
225
226# get the optional argument hash ref from @_ or an empty hash ref.
227
228 my $args = ( ref $_[0] eq 'HASH' ) ? shift : {} ;
229
230 my( $buf_ref, $write_fh, $no_truncate, $orig_file_name, $data_is_ref ) ;
231
232# get the buffer ref - it depends on how the data is passed into write_file
233# after this if/else $buf_ref will have a scalar ref to the data.
234
235 if ( ref $args->{'buf_ref'} eq 'SCALAR' ) {
236
237# a scalar ref passed in %args has the data
238# note that the data was passed by ref
239
240 $buf_ref = $args->{'buf_ref'} ;
241 $data_is_ref = 1 ;
242 }
243 elsif ( ref $_[0] eq 'SCALAR' ) {
244
245# the first value in @_ is the scalar ref to the data
246# note that the data was passed by ref
247
248 $buf_ref = shift ;
249 $data_is_ref = 1 ;
250 }
251 elsif ( ref $_[0] eq 'ARRAY' ) {
252
253# the first value in @_ is the array ref to the data so join it.
254
255 ${$buf_ref} = join '', @{$_[0]} ;
256 }
257 else {
258
259# good old @_ has all the data so join it.
260
261 ${$buf_ref} = join '', @_ ;
262 }
263
264# see if we were passed a open handle to spew to.
265
266 if ( ref $file_name ) {
267
268# we have a handle. make sure we don't call truncate on it.
269
270 $write_fh = $file_name ;
271 $no_truncate = 1 ;
272 }
273 else {
274
275# spew to regular file.
276
277 if ( $args->{'atomic'} ) {
278
279# in atomic mode, we spew to a temp file so make one and save the original
280# file name.
281 $orig_file_name = $file_name ;
282 $file_name .= ".$$" ;
283 }
284
285# set the mode for the sysopen
286
287 my $mode = O_WRONLY | O_CREAT ;
288 $mode |= O_BINARY if $args->{'binmode'} ;
289 $mode |= O_APPEND if $args->{'append'} ;
290 $mode |= O_EXCL if $args->{'no_clobber'} ;
291
292#printf "WR: BINARY %x MODE %x\n", O_BINARY, $mode ;
293
294# open the file and handle any error.
295
296 $write_fh = gensym ;
297 unless ( sysopen( $write_fh, $file_name, $mode ) ) {
298 @_ = ( $args, "write_file '$file_name' - sysopen: $!");
299 goto &_error ;
300 }
301 }
302
303 sysseek( $write_fh, 0, SEEK_END ) if $args->{'append'} ;
304
305
306#print 'WR before data ', unpack( 'H*', ${$buf_ref}), "\n" ;
307
308# fix up newline to write cr/lf if this is a windows text file
309
310 if ( $is_win32 && !$args->{'binmode'} ) {
311
312# copy the write data if it was passed by ref so we don't clobber the
313# caller's data
314 $buf_ref = \do{ my $copy = ${$buf_ref}; } if $data_is_ref ;
315 ${$buf_ref} =~ s/\n/\015\012/g ;
316 }
317
318#print 'after data ', unpack( 'H*', ${$buf_ref}), "\n" ;
319
320# get the size of how much we are writing and init the offset into that buffer
321
322 my $size_left = length( ${$buf_ref} ) ;
323 my $offset = 0 ;
324
325# loop until we have no more data left to write
326
327 do {
328
329# do the write and track how much we just wrote
330
331 my $write_cnt = syswrite( $write_fh, ${$buf_ref},
332 $size_left, $offset ) ;
333
334 unless ( defined $write_cnt ) {
335
336# the write failed
337 @_ = ( $args, "write_file '$file_name' - syswrite: $!");
338 goto &_error ;
339 }
340
341# track much left to write and where to write from in the buffer
342
343 $size_left -= $write_cnt ;
344 $offset += $write_cnt ;
345
346 } while( $size_left > 0 ) ;
347
348# we truncate regular files in case we overwrite a long file with a shorter file
349# so seek to the current position to get it (same as tell()).
350
351 truncate( $write_fh,
352 sysseek( $write_fh, 0, SEEK_CUR ) ) unless $no_truncate ;
353
354 close( $write_fh ) ;
355
356# handle the atomic mode - move the temp file to the original filename.
357
358 rename( $file_name, $orig_file_name ) if $args->{'atomic'} ;
359
360 return 1 ;
361}
362
363# this is for backwards compatibility with the previous File::Slurp module.
364# write_file always overwrites an existing file
365
366*overwrite_file = \&write_file ;
367
368# the current write_file has an append mode so we use that. this
369# supports the same API with an optional second argument which is a
370# hash ref of options.
371
372sub append_file {
373
374# get the optional args hash ref
375 my $args = $_[1] ;
376 if ( ref $args eq 'HASH' ) {
377
378# we were passed an args ref so just mark the append mode
379
380 $args->{append} = 1 ;
381 }
382 else {
383
384# no args hash so insert one with the append mode
385
386 splice( @_, 1, 0, { append => 1 } ) ;
387 }
388
389# magic goto the main write_file sub. this overlays the sub without touching
390# the stack or @_
391
392 goto &write_file
393}
394
395# basic wrapper around opendir/readdir
396
397sub read_dir {
398
399 my ($dir, %args ) = @_;
400
401# this handle will be destroyed upon return
402
403 local(*DIRH);
404
405# open the dir and handle any errors
406
407 unless ( opendir( DIRH, $dir ) ) {
408
409 @_ = ( \%args, "read_dir '$dir' - opendir: $!" ) ;
410 goto &_error ;
411 }
412
413 my @dir_entries = readdir(DIRH) ;
414
415 @dir_entries = grep( $_ ne "." && $_ ne "..", @dir_entries )
416 unless $args{'keep_dot_dot'} ;
417
418 return @dir_entries if wantarray ;
419 return \@dir_entries ;
420}
421
422# error handling section
423#
424# all the error handling uses magic goto so the caller will get the
425# error message as if from their code and not this module. if we just
426# did a call on the error code, the carp/croak would report it from
427# this module since the error sub is one level down on the call stack
428# from read_file/write_file/read_dir.
429
430
431my %err_func = (
432 'carp' => \&carp,
433 'croak' => \&croak,
434) ;
435
436sub _error {
437
438 my( $args, $err_msg ) = @_ ;
439
440# get the error function to use
441
442 my $func = $err_func{ $args->{'err_mode'} || 'croak' } ;
443
444# if we didn't find it in our error function hash, they must have set
445# it to quiet and we don't do anything.
446
447 return unless $func ;
448
449# call the carp/croak function
450
451 $func->($err_msg) ;
452
453# return a hard undef (in list context this will be a single value of
454# undef which is not a legal in-band value)
455
456 return undef ;
457}
458
4591;
460__END__
461
462=head1 NAME
463
464File::Slurp - Efficient Reading/Writing of Complete Files
465
466=head1 SYNOPSIS
467
468 use File::Slurp;
469
470 my $text = read_file( 'filename' ) ;
471 my @lines = read_file( 'filename' ) ;
472
473 write_file( 'filename', @lines ) ;
474
475 use File::Slurp qw( slurp ) ;
476
477 my $text = slurp( 'filename' ) ;
478
479
480=head1 DESCRIPTION
481
482This module provides subs that allow you to read or write entire files
483with one simple call. They are designed to be simple to use, have
484flexible ways to pass in or get the file contents and to be very
485efficient. There is also a sub to read in all the files in a
486directory other than C<.> and C<..>
487
488These slurp/spew subs work for files, pipes and
489sockets, and stdio, pseudo-files, and DATA.
490
491=head2 B<read_file>
492
493This sub reads in an entire file and returns its contents to the
494caller. In list context it will return a list of lines (using the
495current value of $/ as the separator including support for paragraph
496mode when it is set to ''). In scalar context it returns the entire
497file as a single scalar.
498
499 my $text = read_file( 'filename' ) ;
500 my @lines = read_file( 'filename' ) ;
501
502The first argument to C<read_file> is the filename and the rest of the
503arguments are key/value pairs which are optional and which modify the
504behavior of the call. Other than binmode the options all control how
505the slurped file is returned to the caller.
506
507If the first argument is a file handle reference or I/O object (if ref
508is true), then that handle is slurped in. This mode is supported so
509you slurp handles such as C<DATA>, C<STDIN>. See the test handle.t
510for an example that does C<open( '-|' )> and child process spews data
511to the parant which slurps it in. All of the options that control how
512the data is returned to the caller still work in this case.
513
514NOTE: as of version 9999.06, read_file works correctly on the C<DATA>
515handle. It used to need a sysseek workaround but that is now handled
516when needed by the module itself.
517
518You can optionally request that C<slurp()> is exported to your code. This
519is an alias for read_file and is meant to be forward compatible with
520Perl 6 (which will have slurp() built-in).
521
522The options are:
523
524=head3 binmode
525
526If you set the binmode option, then the file will be slurped in binary
527mode.
528
529 my $bin_data = read_file( $bin_file, binmode => ':raw' ) ;
530
531NOTE: this actually sets the O_BINARY mode flag for sysopen. It
532probably should call binmode and pass its argument to support other
533file modes.
534
535=head3 array_ref
536
537If this boolean option is set, the return value (only in scalar
538context) will be an array reference which contains the lines of the
539slurped file. The following two calls are equivalent:
540
541 my $lines_ref = read_file( $bin_file, array_ref => 1 ) ;
542 my $lines_ref = [ read_file( $bin_file ) ] ;
543
544=head3 scalar_ref
545
546If this boolean option is set, the return value (only in scalar
547context) will be an scalar reference to a string which is the contents
548of the slurped file. This will usually be faster than returning the
549plain scalar.
550
551 my $text_ref = read_file( $bin_file, scalar_ref => 1 ) ;
552
553=head3 buf_ref
554
555You can use this option to pass in a scalar reference and the slurped
556file contents will be stored in the scalar. This can be used in
557conjunction with any of the other options.
558
559 my $text_ref = read_file( $bin_file, buf_ref => \$buffer,
560 array_ref => 1 ) ;
561 my @lines = read_file( $bin_file, buf_ref => \$buffer ) ;
562
563=head3 blk_size
564
565You can use this option to set the block size used when slurping from an already open handle (like \*STDIN). It defaults to 1MB.
566
567 my $text_ref = read_file( $bin_file, blk_size => 10_000_000,
568 array_ref => 1 ) ;
569
570=head3 err_mode
571
572You can use this option to control how read_file behaves when an error
573occurs. This option defaults to 'croak'. You can set it to 'carp' or
574to 'quiet to have no error handling. This code wants to carp and then
575read abother file if it fails.
576
577 my $text_ref = read_file( $file, err_mode => 'carp' ) ;
578 unless ( $text_ref ) {
579
580 # read a different file but croak if not found
581 $text_ref = read_file( $another_file ) ;
582 }
583
584 # process ${$text_ref}
585
586=head2 B<write_file>
587
588This sub writes out an entire file in one call.
589
590 write_file( 'filename', @data ) ;
591
592The first argument to C<write_file> is the filename. The next argument
593is an optional hash reference and it contains key/values that can
594modify the behavior of C<write_file>. The rest of the argument list is
595the data to be written to the file.
596
597 write_file( 'filename', {append => 1 }, @data ) ;
598 write_file( 'filename', {binmode => ':raw' }, $buffer ) ;
599
600As a shortcut if the first data argument is a scalar or array
601reference, it is used as the only data to be written to the file. Any
602following arguments in @_ are ignored. This is a faster way to pass in
603the output to be written to the file and is equivilent to the
604C<buf_ref> option. These following pairs are equivilent but the pass
605by reference call will be faster in most cases (especially with larger
606files).
607
608 write_file( 'filename', \$buffer ) ;
609 write_file( 'filename', $buffer ) ;
610
611 write_file( 'filename', \@lines ) ;
612 write_file( 'filename', @lines ) ;
613
614If the first argument is a file handle reference or I/O object (if ref
615is true), then that handle is slurped in. This mode is supported so
616you spew to handles such as \*STDOUT. See the test handle.t for an
617example that does C<open( '-|' )> and child process spews data to the
618parant which slurps it in. All of the options that control how the
619data is passes into C<write_file> still work in this case.
620
621C<write_file> returns 1 upon successfully writing the file or undef if
622it encountered an error.
623
624The options are:
625
626=head3 binmode
627
628If you set the binmode option, then the file will be written in binary
629mode.
630
631 write_file( $bin_file, {binmode => ':raw'}, @data ) ;
632
633NOTE: this actually sets the O_BINARY mode flag for sysopen. It
634probably should call binmode and pass its argument to support other
635file modes.
636
637=head3 buf_ref
638
639You can use this option to pass in a scalar reference which has the
640data to be written. If this is set then any data arguments (including
641the scalar reference shortcut) in @_ will be ignored. These are
642equivilent:
643
644 write_file( $bin_file, { buf_ref => \$buffer } ) ;
645 write_file( $bin_file, \$buffer ) ;
646 write_file( $bin_file, $buffer ) ;
647
648=head3 atomic
649
650If you set this boolean option, the file will be written to in an
651atomic fashion. A temporary file name is created by appending the pid
652($$) to the file name argument and that file is spewed to. After the
653file is closed it is renamed to the original file name (and rename is
654an atomic operation on most OS's). If the program using this were to
655crash in the middle of this, then the file with the pid suffix could
656be left behind.
657
658=head3 append
659
660If you set this boolean option, the data will be written at the end of
661the current file.
662
663 write_file( $file, {append => 1}, @data ) ;
664
665C<write_file> croaks if it cannot open the file. It returns true if it
666succeeded in writing out the file and undef if there was an
667error. (Yes, I know if it croaks it can't return anything but that is
668for when I add the options to select the error handling mode).
669
670=head3 no_clobber
671
672If you set this boolean option, an existing file will not be overwritten.
673
674 write_file( $file, {no_clobber => 1}, @data ) ;
675
676=head3 err_mode
677
678You can use this option to control how C<write_file> behaves when an
679error occurs. This option defaults to 'croak'. You can set it to
680'carp' or to 'quiet' to have no error handling other than the return
681value. If the first call to C<write_file> fails it will carp and then
682write to another file. If the second call to C<write_file> fails, it
683will croak.
684
685 unless ( write_file( $file, { err_mode => 'carp', \$data ) ;
686
687 # write a different file but croak if not found
688 write_file( $other_file, \$data ) ;
689 }
690
691=head2 overwrite_file
692
693This sub is just a typeglob alias to write_file since write_file
694always overwrites an existing file. This sub is supported for
695backwards compatibility with the original version of this module. See
696write_file for its API and behavior.
697
698=head2 append_file
699
700This sub will write its data to the end of the file. It is a wrapper
701around write_file and it has the same API so see that for the full
702documentation. These calls are equivilent:
703
704 append_file( $file, @data ) ;
705 write_file( $file, {append => 1}, @data ) ;
706
707=head2 read_dir
708
709This sub reads all the file names from directory and returns them to
710the caller but C<.> and C<..> are removed by default.
711
712 my @files = read_dir( '/path/to/dir' ) ;
713
714It croaks if it cannot open the directory.
715
716In a list context C<read_dir> returns a list of the entries in the
717directory. In a scalar context it returns an array reference which has
718the entries.
719
720=head3 keep_dot_dot
721
722If this boolean option is set, C<.> and C<..> are not removed from the
723list of files.
724
725 my @all_files = read_dir( '/path/to/dir', keep_dot_dot => 1 ) ;
726
727=head2 EXPORT
728
729 read_file write_file overwrite_file append_file read_dir
730
731=head2 SEE ALSO
732
733An article on file slurping in extras/slurp_article.pod. There is
734also a benchmarking script in extras/slurp_bench.pl.
735
736=head2 BUGS
737
738If run under Perl 5.004, slurping from the DATA handle will fail as
739that requires B.pm which didn't get into core until 5.005.
740
741=head1 AUTHOR
742
743Uri Guttman, E<lt>uri@stemsystems.comE<gt>
744
745=cut