5 use Fcntl 'O_CREAT', 'O_RDWR', 'LOCK_EX';
10 # Idea: The object will always contain an array of byte offsets
11 # this will be filled in as is necessary and convenient.
12 # fetch will do seek-read.
13 # There will be a cache parameter that controls the amount of cached *data*
14 # Also an LRU queue of cached records
15 # store will read the relevant record into the cache
16 # If it's the same length as what is being written, it will overwrite it in
17 # place; if not, it will do a from-to copying write.
18 # The record separator string is also a parameter
20 # Record numbers start at ZERO.
22 my $DEFAULT_CACHE_SIZE = 1<<21; # 2 megabytes
26 croak "usage: tie \@array, $_[0], filename, [option => value]...";
28 my ($pack, $file, %opts) = @_;
30 # transform '-foo' keys into 'foo' keys
31 for my $key (keys %opts) {
33 if ($key =~ s/^-+//) {
34 $opts{$key} = delete $opts{$okey};
38 $opts{cachesize} ||= $DEFAULT_CACHE_SIZE;
40 # the cache is a hash instead of an array because it is likely to be
43 $opts{cached} = 0; # total size of cached data
44 $opts{lru} = []; # replace with heap in later version
47 $opts{filename} = $file;
48 $opts{recsep} = $/ unless defined $opts{recsep};
49 $opts{recseplen} = length($opts{recsep});
50 if ($opts{recseplen} == 0) {
51 croak "Empty record separator not supported by $pack";
54 my $mode = defined($opts{mode}) ? $opts{mode} : O_CREAT|O_RDWR;
57 if (UNIVERSAL::isa($file, 'GLOB')) {
58 unless (seek $file, 0, SEEK_SET) {
59 croak "$pack: your filehandle does not appear to be seekable";
63 croak "usage: tie \@array, $pack, filename, [option => value]...";
65 $fh = \do { local *FH }; # only works in 5.005 and later
66 sysopen $fh, $file, $mode, 0666 or return;
69 { my $ofh = select $fh; $| = 1; select $ofh } # autoflush on write
72 bless \%opts => $pack;
78 # check the record cache
79 { my $cached = $self->_check_cache($n);
80 return $cached if defined $cached;
83 unless ($#{$self->{offsets}} >= $n) {
84 my $o = $self->_fill_offsets_to($n);
85 # If it's still undefined, there is no such record, so return 'undef'
86 return unless defined $o;
90 $self->_seek($n); # we can do this now that offsets is populated
91 my $rec = $self->_read_record;
92 $self->_cache_insert($n, $rec) if defined $rec;
97 my ($self, $n, $rec) = @_;
99 $self->_fixrecs($rec);
101 # TODO: what should we do about the cache? Install the new record
102 # in the cache only if the old version of the same record was
105 # We need this to decide whether the new record will fit
106 # It incidentally populates the offsets table
107 # Note we have to do this before we alter the cache
108 my $oldrec = $self->FETCH($n);
110 # _check_cache promotes record $n to MRU. Is this correct behavior?
111 if (my $cached = $self->_check_cache($n)) {
112 $self->{cache}{$n} = $rec;
113 $self->{cached} += length($rec) - length($cached);
116 if (not defined $oldrec) {
117 # We're storing a record beyond the end of the file
118 $self->_extend_file_to($n+1);
119 $oldrec = $self->{recsep};
121 my $len_diff = length($rec) - length($oldrec);
123 $self->_twrite($rec, $self->{offsets}[$n], length($oldrec));
125 # now update the offsets
126 # array slice goes from element $n+1 (the first one to move)
128 for (@{$self->{offsets}}[$n+1 .. $#{$self->{offsets}}]) {
135 my $n = $#{$self->{offsets}};
136 while (defined ($self->_fill_offsets_to($n+1))) {
143 my ($self, $len) = @_;
144 my $olen = $self->FETCHSIZE;
145 return if $len == $olen; # Woo-hoo!
149 $self->_extend_file_to($len);
156 $#{$self->{offsets}} = $len-1;
157 my @cached = grep $_ > $len, keys %{$self->{cache}};
158 delete @{$self->{cache}}{@cached} if @cached;
163 $self->SPLICE($self->FETCHSIZE, scalar(@_), @_);
169 my $size = $self->FETCHSIZE;
170 return if $size == 0;
171 # print STDERR "# POPPITY POP POP POP\n";
172 scalar $self->SPLICE($size-1, 1);
177 scalar $self->SPLICE(0, 1);
182 $self->SPLICE(0, 0, @_);
187 # And enable auto-defer mode, since it's likely that they just
192 %{$self->{cache}} = ();
194 @{$self->{lru}} = ();
195 @{$self->{offsets}} = (0);
200 $self->_fill_offsets_to($n);
201 $self->_extend_file_to($n);
206 my $lastrec = $self->FETCHSIZE-1;
207 if ($n == $lastrec) {
210 $#{$self->{offsets}}--;
211 delete $self->{cached}{$n};
212 @{$self->{lru}} = grep $_ != $n, @{$self->{lru}};
213 # perhaps in this case I should also remove trailing null records?
215 $self->STORE($n, "");
221 $self->_fill_offsets_to($n);
222 0 <= $n && $n < $self->FETCHSIZE;
226 my ($self, $pos, $nrecs, @data) = @_;
229 $pos = 0 unless defined $pos;
231 # Deal with negative and other out-of-range positions
232 # Also set default for $nrecs
234 my $oldsize = $self->FETCHSIZE;
235 $nrecs = $oldsize unless defined $nrecs;
241 croak "Modification of non-creatable array value attempted, subscript $oldpos";
245 if ($pos > $oldsize) {
247 $pos = $oldsize; # This is what perl does for normal arrays
251 $self->_fixrecs(@data);
252 my $data = join '', @data;
253 my $datalen = length $data;
256 # compute length of data being removed
257 # Incidentally fills offsets table
258 for ($pos .. $pos+$nrecs-1) {
259 my $rec = $self->FETCH($_);
260 last unless defined $rec;
262 $oldlen += length($rec);
266 $self->_twrite($data, $self->{offsets}[$pos], $oldlen);
268 # update the offsets table part 1
269 # compute the offsets of the new records:
272 push @new_offsets, $self->{offsets}[$pos];
273 for (0 .. $#data-1) {
274 push @new_offsets, $new_offsets[-1] + length($data[$_]);
277 splice(@{$self->{offsets}}, $pos, $nrecs, @new_offsets);
279 # update the offsets table part 2
280 # adjust the offsets of the following old records
281 for ($pos+@data .. $#{$self->{offsets}}) {
282 $self->{offsets}[$_] += $datalen - $oldlen;
284 # If we scrubbed out all known offsets, regenerate the trivial table
285 # that knows that the file does indeed start at 0.
286 $self->{offsets}[0] = 0 unless @{$self->{offsets}};
288 # Perhaps the following cache foolery could be factored out
289 # into a bunch of mor opaque cache functions. For example,
290 # it's odd to delete a record from the cache and then remove
291 # it from the LRU queue later on; there should be a function to
294 # update the read cache, part 1
296 # Consider this carefully for correctness
297 for ($pos .. $pos+$nrecs-1) {
298 my $cached = $self->{cache}{$_};
299 next unless defined $cached;
300 my $new = $data[$_-$pos];
302 $self->{cached} += length($new) - length($cached);
303 $self->{cache}{$_} = $new;
305 delete $self->{cache}{$_};
306 $self->{cached} -= length($cached);
309 # update the read cache, part 2
310 # moved records - records past the site of the change
311 # need to be renumbered
312 # Maybe merge this with the previous block?
313 for (keys %{$self->{cache}}) {
314 next unless $_ >= $pos + $nrecs;
315 $self->{cache}{$_-$nrecs+@data} = delete $self->{cache}{$_};
320 for (@{$self->{lru}}) {
321 if ($_ >= $pos + $nrecs) {
322 push @new, $_ + @data - $nrecs;
323 } elsif ($_ >= $pos) {
324 push @changed, $_ if $_ < $pos + @data;
329 @{$self->{lru}} = (@new, @changed);
331 # Yes, the return value of 'splice' *is* actually this complicated
332 wantarray ? @result : @result ? $result[-1] : undef;
335 # write data into the file
336 # $data is the data to be written.
337 # it should be written at position $pos, and should overwrite
338 # exactly $len of the following bytes.
339 # Note that if length($data) > $len, the subsequent bytes will have to
340 # be moved up, and if length($data) < $len, they will have to
343 my ($self, $data, $pos, $len) = @_;
345 unless (defined $pos) {
346 die "\$pos was undefined in _twrite";
349 my $len_diff = length($data) - $len;
351 if ($len_diff == 0) { # Woo-hoo!
352 my $fh = $self->{fh};
354 $self->_write_record($data);
355 return; # well, that was easy.
358 # the two records are of different lengths
359 # our strategy here: rewrite the tail of the file,
360 # reading ahead one buffer at a time
361 # $bufsize is required to be at least as large as the data we're overwriting
362 my $bufsize = _bufsize($len_diff);
363 my ($writepos, $readpos) = ($pos, $pos+$len);
366 # Seems like there ought to be a way to avoid the repeated code
367 # and the special case here. The read(1) is also a little weird.
370 $self->_seekb($readpos);
371 my $br = read $self->{fh}, $next_block, $bufsize;
372 my $more_data = read $self->{fh}, my($dummy), 1;
373 $self->_seekb($writepos);
374 $self->_write_record($data);
376 $writepos += length $data;
379 $self->_seekb($writepos);
380 $self->_write_record($next_block);
382 # There might be leftover data at the end of the file
383 $self->_chop_file if $len_diff < 0;
386 # If a record does not already end with the appropriate terminator
387 # string, append one.
391 $_ .= $self->{recsep}
392 unless substr($_, - $self->{recseplen}) eq $self->{recsep};
396 # seek to the beginning of record #$n
397 # Assumes that the offsets table is already correctly populated
399 # Note that $n=-1 has a special meaning here: It means the start of
400 # the last known record; this may or may not be the very last record
401 # in the file, depending on whether the offsets table is fully populated.
405 my $o = $self->{offsets}[$n];
407 or confess("logic error: undefined offset for record $n");
408 seek $self->{fh}, $o, SEEK_SET
409 or die "Couldn't seek filehandle: $!"; # "Should never happen."
414 seek $self->{fh}, $b, SEEK_SET
415 or die "Couldn't seek filehandle: $!"; # "Should never happen."
418 # populate the offsets table up to the beginning of record $n
419 # return the offset of record $n
420 sub _fill_offsets_to {
422 my $fh = $self->{fh};
423 local *OFF = $self->{offsets};
426 until ($#OFF >= $n) {
428 $self->_seek(-1); # tricky -- see comment at _seek
429 $rec = $self->_read_record;
433 return; # It turns out there is no such record
437 # we have now read all the records up to record n-1,
438 # so we can return the offset of record n
442 # assumes that $rec is already suitably terminated
444 my ($self, $rec) = @_;
445 my $fh = $self->{fh};
447 or die "Couldn't write record: $!"; # "Should never happen."
454 { local $/ = $self->{recsep};
455 my $fh = $self->{fh};
462 my ($self, $n, $rec) = @_;
464 # Do not cache records that are too big to fit in the cache.
465 return unless length $rec <= $self->{cachesize};
467 $self->{cache}{$n} = $rec;
468 $self->{cached} += length $rec;
469 push @{$self->{lru}}, $n; # most-recently-used is at the END
471 $self->_cache_flush if $self->{cached} > $self->{cachesize};
477 return unless defined($rec = $self->{cache}{$n});
479 # cache hit; update LRU queue and return $rec
480 # replace this with a heap in a later version
481 @{$self->{lru}} = ((grep $_ ne $n, @{$self->{lru}}), $n);
487 while ($self->{cached} > $self->{cachesize}) {
488 my $lru = shift @{$self->{lru}};
489 $self->{cached} -= length $lru;
490 delete $self->{cache}{$lru};
494 # We have read to the end of the file and have the offsets table
495 # entirely populated. Now we need to write a new record beyond
496 # the end of the file. We prepare for this by writing
497 # empty records into the file up to the position we want
499 # assumes that the offsets table already contains the offset of record $n,
500 # if it exists, and extends to the end of the file if not.
501 sub _extend_file_to {
503 $self->_seek(-1); # position after the end of the last record
504 my $pos = $self->{offsets}[-1];
506 # the offsets table has one entry more than the total number of records
507 $extras = $n - $#{$self->{offsets}};
509 # Todo : just use $self->{recsep} x $extras here?
510 while ($extras-- > 0) {
511 $self->_write_record($self->{recsep});
512 push @{$self->{offsets}}, tell $self->{fh};
516 # Truncate the file at the current position
519 truncate $self->{fh}, tell($self->{fh});
522 # compute the size of a buffer suitable for moving
523 # all the data in a file forward $n bytes
524 # ($n may be negative)
525 # The result should be at least $n.
528 return 8192 if $n < 0;
530 $b += 8192 if $n & 8191;
536 my ($self, $op) = @_;
538 my $pack = ref $self;
539 croak "Usage: $pack\->flock([OPERATION])";
541 my $fh = $self->{fh};
542 $op = LOCK_EX unless defined $op;
546 # Given a file, make sure the cache is consistent with the
548 sub _check_integrity {
549 my ($self, $file, $warn) = @_;
552 unless ($self->{offsets}[0] == 0) {
553 $warn && print STDERR "# rec 0: offset <$self->{offsets}[0]> s/b 0!\n";
557 local *F = $self->{fh};
559 local $/ = $self->{recsep};
564 my $cached = $self->{cache}{$n};
565 my $offset = $self->{offsets}[$.];
567 if (defined $offset && $offset != $ao) {
568 $warn && print STDERR "# rec $n: offset <$offset> actual <$ao>\n";
571 if (defined $cached && $_ ne $cached) {
575 $warn && print STDERR "# rec $n: cached <$cached> actual <$_>\n";
580 while (my ($n, $r) = each %{$self->{cache}}) {
581 $cachesize += length($r);
582 next if $n+1 <= $.; # checked this already
583 $warn && print STDERR "# spurious caching of record $n\n";
586 if ($cachesize != $self->{cached}) {
587 $warn && print STDERR "# cache size is $self->{cached}, should be $cachesize\n";
591 my (%seen, @duplicate);
592 for (@{$self->{lru}}) {
594 if (not exists $self->{cache}{$_}) {
595 print "# $_ is mentioned in the LRU queue, but not in the cache\n";
599 @duplicate = grep $seen{$_}>1, keys %seen;
601 my $records = @duplicate == 1 ? 'Record' : 'Records';
602 my $appear = @duplicate == 1 ? 'appears' : 'appear';
603 print "# $records @duplicate $appear multiple times in LRU queue: @{$self->{lru}}\n";
606 for (keys %{$self->{cache}}) {
607 unless (exists $seen{$_}) {
608 print "# $record $_ is in the cache but not the LRU queue\n";
616 "Cogito, ergo sum."; # don't forget to return a true value from the file
620 Tie::File - Access the lines of a disk file via a Perl array
624 # This file documents Tie::File version 0.16
626 tie @array, 'Tie::File', filename or die ...;
628 $array[13] = 'blah'; # line 13 of the file is now 'blah'
629 print $array[42]; # display line 42 of the file
631 $n_recs = @array; # how many records are in the file?
632 $#array = $n_recs - 2; # chop records off the end
634 # As you would expect:
636 push @array, new recs...;
638 unshift @array, new recs...;
639 my $r1 = shift @array;
640 @old_recs = splice @array, 3, 7, new recs...;
642 untie @array; # all finished
646 C<Tie::File> represents a regular text file as a Perl array. Each
647 element in the array corresponds to a record in the file. The first
648 line of the file is element 0 of the array; the second line is element
651 The file is I<not> loaded into memory, so this will work even for
654 Changes to the array are reflected in the file immediately.
658 What is a 'record'? By default, the meaning is the same as for the
659 C<E<lt>...E<gt>> operator: It's a string terminated by C<$/>, which is
660 probably C<"\n"> or C<"\r\n">. You may change the definition of
661 "record" by supplying the C<recsep> option in the C<tie> call:
663 tie @array, 'Tie::File', $file, recsep => 'es';
665 This says that records are delimited by the string C<es>. If the file contained the following data:
667 Curse these pesky flies!\n
669 then the C<@array> would appear to have four elements:
676 An undefined value is not permitted as a record separator. Perl's
677 special "paragraph mode" semantics (E<agrave> la C<$/ = "">) are not
680 Records read from the tied array will have the record separator string
681 on the end, just as if they were read from the C<E<lt>...E<gt>>
682 operator. Records stored into the array will have the record
683 separator string appended before they are written to the file, if they
684 don't have one already. For example, if the record separator string
685 is C<"\n">, then the following two lines do exactly the same thing:
687 $array[17] = "Cherry pie";
688 $array[17] = "Cherry pie\n";
690 The result is that the contents of line 17 of the file will be
691 replaced with "Cherry pie"; a newline character will separate line 17
692 from line 18. This means that in particular, this will do nothing:
696 Because the C<chomp>ed value will have the separator reattached when
697 it is written back to the file. There is no way to create a file
698 whose trailing record separator string is missing.
700 Inserting records that I<contain> the record separator string will
701 produce a reasonable result, but if you can't foresee what this result
702 will be, you'd better avoid doing this.
706 Normally, the specified file will be opened for read and write access,
707 and will be created if it does not exist. (That is, the flags
708 C<O_RDWR | O_CREAT> are supplied in the C<open> call.) If you want to
709 change this, you may supply alternative flags in the C<mode> option.
710 See L<Fcntl> for a listing of available flags.
713 # open the file if it exists, but fail if it does not exist
715 tie @array, 'Tie::File', $file, mode => O_RDWR;
717 # create the file if it does not exist
718 use Fcntl 'O_RDWR', 'O_CREAT';
719 tie @array, 'Tie::File', $file, mode => O_RDWR | O_CREAT;
721 # open an existing file in read-only mode
722 use Fcntl 'O_RDONLY';
723 tie @array, 'Tie::File', $file, mode => O_RDONLY;
725 Opening the data file in write-only or append mode is not supported.
729 Records read in from the file are cached, to avoid having to re-read
730 them repeatedly. If you read the same record twice, the first time it
731 will be stored in memory, and the second time it will be fetched from
734 The cache has a bounded size; when it exceeds this size, the
735 least-recently visited records will be purged from the cache. The
736 default size is 2Mib. You can adjust the amount of space used for the
737 cache by supplying the C<cachesize> option. The argument is the desired cache size, in bytes.
739 # I have a lot of memory, so use a large cache to speed up access
740 tie @array, 'Tie::File', $file, cachesize => 20_000_000;
742 Setting the cache size to 0 will inhibit caching; records will be
743 fetched from disk every time you examine them.
747 C<-mode> is a synonym for C<mode>. C<-recsep> is a synonym for
748 C<recsep>. C<-cachesize> is a synonym for C<cachesize>. You get the
751 =head1 Public Methods
753 The C<tie> call returns an object, say C<$o>. You may call
755 $rec = $o->FETCH($n);
758 to fetch or store the record at line C<$n>, respectively. The only other public method in this package is:
764 will lock the tied file. C<MODE> has the same meaning as the second
765 argument to the Perl built-in C<flock> function; for example
766 C<LOCK_SH> or C<LOCK_EX | LOCK_NB>. (These constants are provided by
767 the C<use Fcntl ':flock'> declaration.)
769 C<MODE> is optional; C<$o-E<gt>flock> simply locks the file with
772 The best way to unlock a file is to discard the object and untie the
773 array. It is probably unsafe to unlock the file without also untying
774 it, because if you do, changes may remain unwritten inside the object.
775 That is why there is no shortcut for unlocking. If you really want to
776 unlock the file prematurely, you know what to do; if you don't know
777 what to do, then don't do it.
779 All the usual warnings about file locking apply here. In particular,
780 note that file locking in Perl is B<advisory>, which means that
781 holding a lock will not prevent anyone else from reading, writing, or
782 erasing the file; it only prevents them from getting another lock at
783 the same time. Locks are analogous to green traffic lights: If you
784 have a green light, that does not prevent the idiot coming the other
785 way from plowing into you sideways; it merely guarantees to you that
786 the idiot does not also have a green light at the same time.
788 =head2 Tying to an already-opened filehandle
790 If C<$fh> is a filehandle, such as is returned by C<IO::File> or one
791 of the other C<IO> modules, you may use:
793 tie @array, 'Tie::File', $fh, ...;
795 Similarly if you opened that handle C<FH> with regular C<open> or
796 C<sysopen>, you may use:
798 tie @array, 'Tie::File', \*FH, ...;
800 Handles that were opened write-only won't work. Handles that were
801 opened read-only will work as long as you don't try to write to them.
802 Handles must be attached to seekable sources of data---that means no
803 pipes or sockets. If you try to supply a non-seekable handle, the
804 C<tie> call will abort your program.
808 (That's Latin for 'warnings'.)
810 =head2 Efficiency Note
812 Every effort was made to make this module efficient. Nevertheless,
813 changing the size of a record in the middle of a large file will
814 always be slow, because everything after the new record must be moved.
816 In particular, note that:
823 is likely to be very slow, because the first iteration must relocate
824 lines 1 through 999,999; the second iteration must relocate lines 2
825 through 999,999, and so on. The relocation is done using block
826 writes, however, so it's not as slow as it might be.
828 A future version of this module will provide a mechanism for getting
829 better performance in such cases, by deferring the writing until it
830 can be done all at once.
832 =head2 Efficiency Note 2
834 Not every effort was made to make this module as efficient as
835 possible. C<FETCHSIZE> should use binary search instead of linear
836 search. The cache's LRU queue should be a heap instead of a list.
837 These defects are probably minor; in any event, they will be fixed in
838 a later version of the module.
840 =head2 Efficiency Note 3
842 The author has supposed that since this module is concerned with file
843 I/O, almost all normal use of it will be heavily I/O bound, and that
844 the time to maintain complicated data structures inside the module
845 will be dominated by the time to actually perform the I/O. This
846 suggests, for example, that an LRU read-cache is a good tradeoff,
847 even if it requires substantial adjustment following a C<splice>
852 (That's Latin for 'warnings'.)
854 The behavior of tied arrays is not precisely the same as for regular
857 undef $a[10]; print "How unusual!\n" if $a[10];
859 C<undef>-ing a C<Tie::File> array element just blanks out the
860 corresponding record in the file. When you read it back again, you'll
861 see the record separator (typically, $a[10] will appear to contain
862 "\n") so the supposedly-C<undef>'ed value will be true.
864 There are other minor differences, but in general, the correspondence
871 To contact the author, send email to: C<mjd-perl-tiefile+@plover.com>
873 To receive an announcement whenever a new version of this module is
874 released, send a blank email message to
875 C<mjd-perl-tiefile-subscribe@plover.com>.
879 C<Tie::File> version 0.16 is copyright (C) 2002 Mark Jason Dominus.
881 This library is free software; you may redistribute it and/or modify
882 it under the same terms as Perl itself.
884 These terms include your choice of (1) the Perl Artistic Licence, or
885 (2) version 2 of the GNU General Public License as published by the
886 Free Software Foundation, or (3) any later version of the GNU General
889 This library is distributed in the hope that it will be useful,
890 but WITHOUT ANY WARRANTY; without even the implied warranty of
891 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
892 GNU General Public License for more details.
894 You should have received a copy of the GNU General Public License
895 along with this library program; it should be in the file C<COPYING>.
896 If not, write to the Free Software Foundation, Inc., 59 Temple Place,
897 Suite 330, Boston, MA 02111 USA
899 For licensing inquiries, contact the author at:
903 Philadelphia, PA 19107
907 C<Tie::File> version 0.16 comes with ABSOLUTELY NO WARRANTY.
908 For details, see the license.
912 Gigantic thanks to Jarkko Hietaniemi, for agreeing to put this in the
913 core when I hadn't written it yet, and for generally being helpful,
914 supportive, and competent. (Usually the rule is "choose any one.")
915 Also big thanks to Abhijit Menon-Sen for all of the same things.
917 Special thanks to Craig Berry (for VMS portability help), Randy Kobes
918 (for Win32 portability help), the rest of the CPAN testers (for
923 Tassilo von Parseval /
930 Test DELETE machinery more carefully.
932 More tests. (Configuration options, cache flushery. _twrite should
933 be tested separately, because there are a lot of weird special cases
936 More tests. (Stuff I didn't think of yet.)
938 Deferred writing. (!!!)
946 Maybe an autolocking mode?