5 use Fcntl 'O_CREAT', 'O_RDWR', 'LOCK_EX';
10 # Idea: The object will always contain an array of byte offsets
11 # this will be filled in as is necessary and convenient.
12 # fetch will do seek-read.
13 # There will be a cache parameter that controls the amount of cached *data*
14 # Also an LRU queue of cached records
15 # store will read the relevant record into the cache
16 # If it's the same length as what is being written, it will overwrite it in
17 # place; if not, it will do a from-to copying write.
18 # The record separator string is also a parameter
20 # Record numbers start at ZERO.
22 my $DEFAULT_CACHE_SIZE = 1<<21; # 2 megabytes
26 croak "usage: tie \@array, $_[0], filename, [option => value]...";
28 my ($pack, $file, %opts) = @_;
30 # transform '-foo' keys into 'foo' keys
31 for my $key (keys %opts) {
33 if ($key =~ s/^-+//) {
34 $opts{$key} = delete $opts{$okey};
38 $opts{cachesize} ||= $DEFAULT_CACHE_SIZE;
40 # the cache is a hash instead of an array because it is likely to be
43 $opts{cached} = 0; # total size of cached data
44 $opts{lru} = []; # replace with heap in later version
47 $opts{filename} = $file;
48 $opts{recsep} = $/ unless defined $opts{recsep};
49 $opts{recseplen} = length($opts{recsep});
50 if ($opts{recseplen} == 0) {
51 croak "Empty record separator not supported by $pack";
54 my $mode = defined($opts{mode}) ? $opts{mode} : O_CREAT|O_RDWR;
57 if (UNIVERSAL::isa($file, 'GLOB')) {
58 unless (seek $file, 0, SEEK_SET) {
59 croak "$pack: your filehandle does not appear to be seekable";
63 croak "usage: tie \@array, $pack, filename, [option => value]...";
65 $fh = \do { local *FH }; # only works in 5.005 and later
66 sysopen $fh, $file, $mode, 0666 or return;
69 { my $ofh = select $fh; $| = 1; select $ofh } # autoflush on write
72 bless \%opts => $pack;
78 # check the record cache
79 { my $cached = $self->_check_cache($n);
80 return $cached if defined $cached;
83 unless ($#{$self->{offsets}} >= $n) {
84 my $o = $self->_fill_offsets_to($n);
85 # If it's still undefined, there is no such record, so return 'undef'
86 return unless defined $o;
90 $self->_seek($n); # we can do this now that offsets is populated
91 my $rec = $self->_read_record;
92 $self->_cache_insert($n, $rec) if defined $rec;
97 my ($self, $n, $rec) = @_;
99 $self->_fixrecs($rec);
101 # TODO: what should we do about the cache? Install the new record
102 # in the cache only if the old version of the same record was
105 # We need this to decide whether the new record will fit
106 # It incidentally populates the offsets table
107 # Note we have to do this before we alter the cache
108 my $oldrec = $self->FETCH($n);
110 # _check_cache promotes record $n to MRU. Is this correct behavior?
111 if (my $cached = $self->_check_cache($n)) {
112 $self->{cache}{$n} = $rec;
113 $self->{cached} += length($rec) - length($cached);
116 if (not defined $oldrec) {
117 # We're storing a record beyond the end of the file
118 $self->_extend_file_to($n+1);
119 $oldrec = $self->{recsep};
121 my $len_diff = length($rec) - length($oldrec);
123 $self->_twrite($rec, $self->{offsets}[$n], length($oldrec));
125 # now update the offsets
126 # array slice goes from element $n+1 (the first one to move)
128 for (@{$self->{offsets}}[$n+1 .. $#{$self->{offsets}}]) {
135 my $n = $#{$self->{offsets}};
136 while (defined ($self->_fill_offsets_to($n+1))) {
143 my ($self, $len) = @_;
144 my $olen = $self->FETCHSIZE;
145 return if $len == $olen; # Woo-hoo!
149 $self->_extend_file_to($len);
156 $#{$self->{offsets}} = $len;
157 my @cached = grep $_ >= $len, keys %{$self->{cache}};
158 $self->_uncache(@cached);
163 $self->SPLICE($self->FETCHSIZE, scalar(@_), @_);
169 my $size = $self->FETCHSIZE;
170 return if $size == 0;
171 # print STDERR "# POPPITY POP POP POP\n";
172 scalar $self->SPLICE($size-1, 1);
177 scalar $self->SPLICE(0, 1);
182 $self->SPLICE(0, 0, @_);
187 # And enable auto-defer mode, since it's likely that they just
192 %{$self->{cache}} = ();
194 @{$self->{lru}} = ();
195 @{$self->{offsets}} = (0);
200 $self->_fill_offsets_to($n);
201 $self->_extend_file_to($n);
206 my $lastrec = $self->FETCHSIZE-1;
207 if ($n == $lastrec) {
210 $#{$self->{offsets}}--;
212 # perhaps in this case I should also remove trailing null records?
214 $self->STORE($n, "");
220 $self->_fill_offsets_to($n);
221 0 <= $n && $n < $self->FETCHSIZE;
225 my ($self, $pos, $nrecs, @data) = @_;
228 $pos = 0 unless defined $pos;
230 # Deal with negative and other out-of-range positions
231 # Also set default for $nrecs
233 my $oldsize = $self->FETCHSIZE;
234 $nrecs = $oldsize unless defined $nrecs;
240 croak "Modification of non-creatable array value attempted, subscript $oldpos";
244 if ($pos > $oldsize) {
246 $pos = $oldsize; # This is what perl does for normal arrays
250 $self->_fixrecs(@data);
251 my $data = join '', @data;
252 my $datalen = length $data;
255 # compute length of data being removed
256 # Incidentally fills offsets table
257 for ($pos .. $pos+$nrecs-1) {
258 my $rec = $self->FETCH($_);
259 last unless defined $rec;
261 $oldlen += length($rec);
265 $self->_twrite($data, $self->{offsets}[$pos], $oldlen);
267 # update the offsets table part 1
268 # compute the offsets of the new records:
271 push @new_offsets, $self->{offsets}[$pos];
272 for (0 .. $#data-1) {
273 push @new_offsets, $new_offsets[-1] + length($data[$_]);
276 splice(@{$self->{offsets}}, $pos, $nrecs, @new_offsets);
278 # update the offsets table part 2
279 # adjust the offsets of the following old records
280 for ($pos+@data .. $#{$self->{offsets}}) {
281 $self->{offsets}[$_] += $datalen - $oldlen;
283 # If we scrubbed out all known offsets, regenerate the trivial table
284 # that knows that the file does indeed start at 0.
285 $self->{offsets}[0] = 0 unless @{$self->{offsets}};
287 # Perhaps the following cache foolery could be factored out
288 # into a bunch of mor opaque cache functions. For example,
289 # it's odd to delete a record from the cache and then remove
290 # it from the LRU queue later on; there should be a function to
293 # update the read cache, part 1
295 # Consider this carefully for correctness
296 for ($pos .. $pos+$nrecs-1) {
297 my $cached = $self->{cache}{$_};
298 next unless defined $cached;
299 my $new = $data[$_-$pos];
301 $self->{cached} += length($new) - length($cached);
302 $self->{cache}{$_} = $new;
307 # update the read cache, part 2
308 # moved records - records past the site of the change
309 # need to be renumbered
310 # Maybe merge this with the previous block?
311 for (keys %{$self->{cache}}) {
312 next unless $_ >= $pos + $nrecs;
313 $self->{cache}{$_-$nrecs+@data} = delete $self->{cache}{$_};
318 for (@{$self->{lru}}) {
319 if ($_ >= $pos + $nrecs) {
320 push @new, $_ + @data - $nrecs;
321 } elsif ($_ >= $pos) {
322 push @changed, $_ if $_ < $pos + @data;
327 @{$self->{lru}} = (@new, @changed);
329 # Yes, the return value of 'splice' *is* actually this complicated
330 wantarray ? @result : @result ? $result[-1] : undef;
333 # write data into the file
334 # $data is the data to be written.
335 # it should be written at position $pos, and should overwrite
336 # exactly $len of the following bytes.
337 # Note that if length($data) > $len, the subsequent bytes will have to
338 # be moved up, and if length($data) < $len, they will have to
341 my ($self, $data, $pos, $len) = @_;
343 unless (defined $pos) {
344 die "\$pos was undefined in _twrite";
347 my $len_diff = length($data) - $len;
349 if ($len_diff == 0) { # Woo-hoo!
350 my $fh = $self->{fh};
352 $self->_write_record($data);
353 return; # well, that was easy.
356 # the two records are of different lengths
357 # our strategy here: rewrite the tail of the file,
358 # reading ahead one buffer at a time
359 # $bufsize is required to be at least as large as the data we're overwriting
360 my $bufsize = _bufsize($len_diff);
361 my ($writepos, $readpos) = ($pos, $pos+$len);
364 # Seems like there ought to be a way to avoid the repeated code
365 # and the special case here. The read(1) is also a little weird.
368 $self->_seekb($readpos);
369 my $br = read $self->{fh}, $next_block, $bufsize;
370 my $more_data = read $self->{fh}, my($dummy), 1;
371 $self->_seekb($writepos);
372 $self->_write_record($data);
374 $writepos += length $data;
377 $self->_seekb($writepos);
378 $self->_write_record($next_block);
380 # There might be leftover data at the end of the file
381 $self->_chop_file if $len_diff < 0;
384 # If a record does not already end with the appropriate terminator
385 # string, append one.
389 $_ .= $self->{recsep}
390 unless substr($_, - $self->{recseplen}) eq $self->{recsep};
394 # seek to the beginning of record #$n
395 # Assumes that the offsets table is already correctly populated
397 # Note that $n=-1 has a special meaning here: It means the start of
398 # the last known record; this may or may not be the very last record
399 # in the file, depending on whether the offsets table is fully populated.
403 my $o = $self->{offsets}[$n];
405 or confess("logic error: undefined offset for record $n");
406 seek $self->{fh}, $o, SEEK_SET
407 or die "Couldn't seek filehandle: $!"; # "Should never happen."
412 seek $self->{fh}, $b, SEEK_SET
413 or die "Couldn't seek filehandle: $!"; # "Should never happen."
416 # populate the offsets table up to the beginning of record $n
417 # return the offset of record $n
418 sub _fill_offsets_to {
420 my $fh = $self->{fh};
421 local *OFF = $self->{offsets};
424 until ($#OFF >= $n) {
426 $self->_seek(-1); # tricky -- see comment at _seek
427 $rec = $self->_read_record;
431 return; # It turns out there is no such record
435 # we have now read all the records up to record n-1,
436 # so we can return the offset of record n
440 # assumes that $rec is already suitably terminated
442 my ($self, $rec) = @_;
443 my $fh = $self->{fh};
445 or die "Couldn't write record: $!"; # "Should never happen."
452 { local $/ = $self->{recsep};
453 my $fh = $self->{fh};
460 my ($self, $n, $rec) = @_;
462 # Do not cache records that are too big to fit in the cache.
463 return unless length $rec <= $self->{cachesize};
465 $self->{cache}{$n} = $rec;
466 $self->{cached} += length $rec;
467 push @{$self->{lru}}, $n; # most-recently-used is at the END
469 $self->_cache_flush if $self->{cached} > $self->{cachesize};
475 my $cached = delete $self->{cache}{$n};
476 next unless defined $cached;
477 @{$self->{lru}} = grep $_ != $n, @{$self->{lru}};
478 $self->{cached} -= length($cached);
485 return unless defined($rec = $self->{cache}{$n});
487 # cache hit; update LRU queue and return $rec
488 # replace this with a heap in a later version
489 @{$self->{lru}} = ((grep $_ ne $n, @{$self->{lru}}), $n);
495 while ($self->{cached} > $self->{cachesize}) {
496 my $lru = shift @{$self->{lru}};
497 $self->{cached} -= length $lru;
498 delete $self->{cache}{$lru};
502 # We have read to the end of the file and have the offsets table
503 # entirely populated. Now we need to write a new record beyond
504 # the end of the file. We prepare for this by writing
505 # empty records into the file up to the position we want
507 # assumes that the offsets table already contains the offset of record $n,
508 # if it exists, and extends to the end of the file if not.
509 sub _extend_file_to {
511 $self->_seek(-1); # position after the end of the last record
512 my $pos = $self->{offsets}[-1];
514 # the offsets table has one entry more than the total number of records
515 $extras = $n - $#{$self->{offsets}};
517 # Todo : just use $self->{recsep} x $extras here?
518 while ($extras-- > 0) {
519 $self->_write_record($self->{recsep});
520 push @{$self->{offsets}}, tell $self->{fh};
524 # Truncate the file at the current position
527 truncate $self->{fh}, tell($self->{fh});
530 # compute the size of a buffer suitable for moving
531 # all the data in a file forward $n bytes
532 # ($n may be negative)
533 # The result should be at least $n.
536 return 8192 if $n < 0;
538 $b += 8192 if $n & 8191;
544 my ($self, $op) = @_;
546 my $pack = ref $self;
547 croak "Usage: $pack\->flock([OPERATION])";
549 my $fh = $self->{fh};
550 $op = LOCK_EX unless defined $op;
554 # Given a file, make sure the cache is consistent with the
556 sub _check_integrity {
557 my ($self, $file, $warn) = @_;
561 if (not defined $self->{offsets}[0]) {
562 $warn && print STDERR "# offset 0 is missing!\n";
564 } elsif ($self->{offsets}[0] != 0) {
565 $warn && print STDERR "# offset 0 is missing!\n";
566 $warn && print STDERR "# rec 0: offset <$self->{offsets}[0]> s/b 0!\n";
570 local *F = $self->{fh};
572 local $/ = $self->{recsep};
577 my $cached = $self->{cache}{$n};
578 my $offset = $self->{offsets}[$.];
580 if (defined $offset && $offset != $ao) {
581 $warn && print STDERR "# rec $n: offset <$offset> actual <$ao>\n";
584 if (defined $cached && $_ ne $cached) {
588 $warn && print STDERR "# rec $n: cached <$cached> actual <$_>\n";
593 while (my ($n, $r) = each %{$self->{cache}}) {
594 $cachesize += length($r);
595 next if $n+1 <= $.; # checked this already
596 $warn && print STDERR "# spurious caching of record $n\n";
599 if ($cachesize != $self->{cached}) {
600 $warn && print STDERR "# cache size is $self->{cached}, should be $cachesize\n";
604 my (%seen, @duplicate);
605 for (@{$self->{lru}}) {
607 if (not exists $self->{cache}{$_}) {
608 print "# $_ is mentioned in the LRU queue, but not in the cache\n";
612 @duplicate = grep $seen{$_}>1, keys %seen;
614 my $records = @duplicate == 1 ? 'Record' : 'Records';
615 my $appear = @duplicate == 1 ? 'appears' : 'appear';
616 print "# $records @duplicate $appear multiple times in LRU queue: @{$self->{lru}}\n";
619 for (keys %{$self->{cache}}) {
620 unless (exists $seen{$_}) {
621 print "# record $_ is in the cache but not the LRU queue\n";
629 "Cogito, ergo sum."; # don't forget to return a true value from the file
633 Tie::File - Access the lines of a disk file via a Perl array
637 # This file documents Tie::File version 0.17
639 tie @array, 'Tie::File', filename or die ...;
641 $array[13] = 'blah'; # line 13 of the file is now 'blah'
642 print $array[42]; # display line 42 of the file
644 $n_recs = @array; # how many records are in the file?
645 $#array = $n_recs - 2; # chop records off the end
647 # As you would expect:
649 push @array, new recs...;
651 unshift @array, new recs...;
652 my $r1 = shift @array;
653 @old_recs = splice @array, 3, 7, new recs...;
655 untie @array; # all finished
659 C<Tie::File> represents a regular text file as a Perl array. Each
660 element in the array corresponds to a record in the file. The first
661 line of the file is element 0 of the array; the second line is element
664 The file is I<not> loaded into memory, so this will work even for
667 Changes to the array are reflected in the file immediately.
671 What is a 'record'? By default, the meaning is the same as for the
672 C<E<lt>...E<gt>> operator: It's a string terminated by C<$/>, which is
673 probably C<"\n"> or C<"\r\n">. You may change the definition of
674 "record" by supplying the C<recsep> option in the C<tie> call:
676 tie @array, 'Tie::File', $file, recsep => 'es';
678 This says that records are delimited by the string C<es>. If the file contained the following data:
680 Curse these pesky flies!\n
682 then the C<@array> would appear to have four elements:
689 An undefined value is not permitted as a record separator. Perl's
690 special "paragraph mode" semantics (E<agrave> la C<$/ = "">) are not
693 Records read from the tied array will have the record separator string
694 on the end, just as if they were read from the C<E<lt>...E<gt>>
695 operator. Records stored into the array will have the record
696 separator string appended before they are written to the file, if they
697 don't have one already. For example, if the record separator string
698 is C<"\n">, then the following two lines do exactly the same thing:
700 $array[17] = "Cherry pie";
701 $array[17] = "Cherry pie\n";
703 The result is that the contents of line 17 of the file will be
704 replaced with "Cherry pie"; a newline character will separate line 17
705 from line 18. This means that in particular, this will do nothing:
709 Because the C<chomp>ed value will have the separator reattached when
710 it is written back to the file. There is no way to create a file
711 whose trailing record separator string is missing.
713 Inserting records that I<contain> the record separator string will
714 produce a reasonable result, but if you can't foresee what this result
715 will be, you'd better avoid doing this.
719 Normally, the specified file will be opened for read and write access,
720 and will be created if it does not exist. (That is, the flags
721 C<O_RDWR | O_CREAT> are supplied in the C<open> call.) If you want to
722 change this, you may supply alternative flags in the C<mode> option.
723 See L<Fcntl> for a listing of available flags.
726 # open the file if it exists, but fail if it does not exist
728 tie @array, 'Tie::File', $file, mode => O_RDWR;
730 # create the file if it does not exist
731 use Fcntl 'O_RDWR', 'O_CREAT';
732 tie @array, 'Tie::File', $file, mode => O_RDWR | O_CREAT;
734 # open an existing file in read-only mode
735 use Fcntl 'O_RDONLY';
736 tie @array, 'Tie::File', $file, mode => O_RDONLY;
738 Opening the data file in write-only or append mode is not supported.
742 Records read in from the file are cached, to avoid having to re-read
743 them repeatedly. If you read the same record twice, the first time it
744 will be stored in memory, and the second time it will be fetched from
747 The cache has a bounded size; when it exceeds this size, the
748 least-recently visited records will be purged from the cache. The
749 default size is 2Mib. You can adjust the amount of space used for the
750 cache by supplying the C<cachesize> option. The argument is the desired cache size, in bytes.
752 # I have a lot of memory, so use a large cache to speed up access
753 tie @array, 'Tie::File', $file, cachesize => 20_000_000;
755 Setting the cache size to 0 will inhibit caching; records will be
756 fetched from disk every time you examine them.
760 C<-mode> is a synonym for C<mode>. C<-recsep> is a synonym for
761 C<recsep>. C<-cachesize> is a synonym for C<cachesize>. You get the
764 =head1 Public Methods
766 The C<tie> call returns an object, say C<$o>. You may call
768 $rec = $o->FETCH($n);
771 to fetch or store the record at line C<$n>, respectively. The only other public method in this package is:
777 will lock the tied file. C<MODE> has the same meaning as the second
778 argument to the Perl built-in C<flock> function; for example
779 C<LOCK_SH> or C<LOCK_EX | LOCK_NB>. (These constants are provided by
780 the C<use Fcntl ':flock'> declaration.)
782 C<MODE> is optional; C<$o-E<gt>flock> simply locks the file with
785 The best way to unlock a file is to discard the object and untie the
786 array. It is probably unsafe to unlock the file without also untying
787 it, because if you do, changes may remain unwritten inside the object.
788 That is why there is no shortcut for unlocking. If you really want to
789 unlock the file prematurely, you know what to do; if you don't know
790 what to do, then don't do it.
792 All the usual warnings about file locking apply here. In particular,
793 note that file locking in Perl is B<advisory>, which means that
794 holding a lock will not prevent anyone else from reading, writing, or
795 erasing the file; it only prevents them from getting another lock at
796 the same time. Locks are analogous to green traffic lights: If you
797 have a green light, that does not prevent the idiot coming the other
798 way from plowing into you sideways; it merely guarantees to you that
799 the idiot does not also have a green light at the same time.
801 =head2 Tying to an already-opened filehandle
803 If C<$fh> is a filehandle, such as is returned by C<IO::File> or one
804 of the other C<IO> modules, you may use:
806 tie @array, 'Tie::File', $fh, ...;
808 Similarly if you opened that handle C<FH> with regular C<open> or
809 C<sysopen>, you may use:
811 tie @array, 'Tie::File', \*FH, ...;
813 Handles that were opened write-only won't work. Handles that were
814 opened read-only will work as long as you don't try to write to them.
815 Handles must be attached to seekable sources of data---that means no
816 pipes or sockets. If you try to supply a non-seekable handle, the
817 C<tie> call will try to abort your program. This feature is not yet
822 (That's Latin for 'warnings'.)
824 =head2 Efficiency Note
826 Every effort was made to make this module efficient. Nevertheless,
827 changing the size of a record in the middle of a large file will
828 always be slow, because everything after the new record must be moved.
830 In particular, note that:
837 is likely to be very slow, because the first iteration must relocate
838 lines 1 through 999,999; the second iteration must relocate lines 2
839 through 999,999, and so on. The relocation is done using block
840 writes, however, so it's not as slow as it might be.
842 A soon-to-be-released version of this module will provide a mechanism
843 for getting better performance in such cases, by deferring the writing
844 until it can be done all at once.
846 =head2 Efficiency Note 2
848 Not every effort was made to make this module as efficient as
849 possible. C<FETCHSIZE> should use binary search instead of linear
850 search. The cache's LRU queue should be a heap instead of a list.
851 These defects are probably minor; in any event, they will be fixed in
852 a later version of the module.
854 =head2 Efficiency Note 3
856 The author has supposed that since this module is concerned with file
857 I/O, almost all normal use of it will be heavily I/O bound, and that
858 the time to maintain complicated data structures inside the module
859 will be dominated by the time to actually perform the I/O. This
860 suggests, for example, that an LRU read-cache is a good tradeoff,
861 even if it requires substantial adjustment following a C<splice>
866 (That's Latin for 'warnings'.)
868 The behavior of tied arrays is not precisely the same as for regular
871 undef $a[10]; print "How unusual!\n" if $a[10];
873 C<undef>-ing a C<Tie::File> array element just blanks out the
874 corresponding record in the file. When you read it back again, you'll
875 see the record separator (typically, $a[10] will appear to contain
876 "\n") so the supposedly-C<undef>'ed value will be true.
878 There are other minor differences, but in general, the correspondence
885 To contact the author, send email to: C<mjd-perl-tiefile+@plover.com>
887 To receive an announcement whenever a new version of this module is
888 released, send a blank email message to
889 C<mjd-perl-tiefile-subscribe@plover.com>.
893 C<Tie::File> version 0.17 is copyright (C) 2002 Mark Jason Dominus.
895 This library is free software; you may redistribute it and/or modify
896 it under the same terms as Perl itself.
898 These terms include your choice of (1) the Perl Artistic Licence, or
899 (2) version 2 of the GNU General Public License as published by the
900 Free Software Foundation, or (3) any later version of the GNU General
903 This library is distributed in the hope that it will be useful,
904 but WITHOUT ANY WARRANTY; without even the implied warranty of
905 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
906 GNU General Public License for more details.
908 You should have received a copy of the GNU General Public License
909 along with this library program; it should be in the file C<COPYING>.
910 If not, write to the Free Software Foundation, Inc., 59 Temple Place,
911 Suite 330, Boston, MA 02111 USA
913 For licensing inquiries, contact the author at:
917 Philadelphia, PA 19107
921 C<Tie::File> version 0.17 comes with ABSOLUTELY NO WARRANTY.
922 For details, see the license.
926 Gigantic thanks to Jarkko Hietaniemi, for agreeing to put this in the
927 core when I hadn't written it yet, and for generally being helpful,
928 supportive, and competent. (Usually the rule is "choose any one.")
929 Also big thanks to Abhijit Menon-Sen for all of the same things.
931 Special thanks to Craig Berry (for VMS portability help), Randy Kobes
932 (for Win32 portability help), the rest of the CPAN testers (for
938 Tassilo von Parseval /
945 Test DELETE machinery more carefully.
947 More tests. (Configuration options, cache flushery. _twrite should
948 be tested separately, because there are a lot of weird special cases
951 More tests. (Stuff I didn't think of yet.)
953 Deferred writing. (!!!)
961 Maybe an autolocking mode?