5 use Fcntl 'O_CREAT', 'O_RDWR', 'LOCK_EX';
10 # Idea: The object will always contain an array of byte offsets
11 # this will be filled in as is necessary and convenient.
12 # fetch will do seek-read.
13 # There will be a cache parameter that controls the amount of cached *data*
14 # Also an LRU queue of cached records
15 # store will read the relevant record into the cache
16 # If it's the same length as what is being written, it will overwrite it in
17 # place; if not, it will do a from-to copying write.
18 # The record separator string is also a parameter
20 # Record numbers start at ZERO.
22 my $DEFAULT_CACHE_SIZE = 1<<21; # 2 megabytes
26 croak "usage: tie \@array, $_[0], filename, [option => value]...";
28 my ($pack, $file, %opts) = @_;
30 # transform '-foo' keys into 'foo' keys
31 for my $key (keys %opts) {
33 if ($key =~ s/^-+//) {
34 $opts{$key} = delete $opts{$okey};
38 $opts{cachesize} ||= $DEFAULT_CACHE_SIZE;
40 # the cache is a hash instead of an array because it is likely to be
43 $opts{cached} = 0; # total size of cached data
44 $opts{lru} = []; # replace with heap in later version
47 $opts{filename} = $file;
48 $opts{recsep} = $/ unless defined $opts{recsep};
49 $opts{recseplen} = length($opts{recsep});
50 if ($opts{recseplen} == 0) {
51 croak "Empty record separator not supported by $pack";
54 my $mode = defined($opts{mode}) ? $opts{mode} : O_CREAT|O_RDWR;
56 my $fh = \do { local *FH }; # only works in 5.005 and later
57 sysopen $fh, $file, $mode, 0666 or return;
59 { my $ofh = select $fh; $| = 1; select $ofh } # autoflush on write
62 bless \%opts => $pack;
68 # check the record cache
69 { my $cached = $self->_check_cache($n);
70 return $cached if defined $cached;
73 unless ($#{$self->{offsets}} >= $n) {
74 my $o = $self->_fill_offsets_to($n);
75 # If it's still undefined, there is no such record, so return 'undef'
76 return unless defined $o;
80 $self->_seek($n); # we can do this now that offsets is populated
81 my $rec = $self->_read_record;
82 $self->_cache_insert($n, $rec) if defined $rec;
87 my ($self, $n, $rec) = @_;
89 $self->_fixrecs($rec);
91 # TODO: what should we do about the cache? Install the new record
92 # in the cache only if the old version of the same record was
95 # We need this to decide whether the new record will fit
96 # It incidentally populates the offsets table
97 # Note we have to do this before we alter the cache
98 my $oldrec = $self->FETCH($n);
100 # _check_cache promotes record $n to MRU. Is this correct behavior?
101 $self->{cache}{$n} = $rec if $self->_check_cache($n);
103 if (not defined $oldrec) {
104 # We're storing a record beyond the end of the file
105 $self->_extend_file_to($n+1);
106 $oldrec = $self->{recsep};
108 my $len_diff = length($rec) - length($oldrec);
110 $self->_twrite($rec, $self->{offsets}[$n], length($oldrec));
112 # now update the offsets
113 # array slice goes from element $n+1 (the first one to move)
115 for (@{$self->{offsets}}[$n+1 .. $#{$self->{offsets}}]) {
122 my $n = $#{$self->{offsets}};
123 while (defined ($self->_fill_offsets_to($n+1))) {
130 my ($self, $len) = @_;
131 my $olen = $self->FETCHSIZE;
132 return if $len == $olen; # Woo-hoo!
136 $self->_extend_file_to($len);
143 $#{$self->{offsets}} = $len-1;
144 my @cached = grep $_ > $len, keys %{$self->{cache}};
145 delete @{$self->{cache}}{@cached} if @cached;
150 $self->SPLICE($self->FETCHSIZE, scalar(@_), @_);
156 scalar $self->SPLICE(-1, 1);
161 scalar $self->SPLICE(0, 1);
166 $self->SPLICE(0, 0, @_);
171 # And enable auto-defer mode, since it's likely that they just
176 %{$self->{cache}} = ();
178 @{$self->{lru}} = ();
179 @{$self->{offsets}} = (0);
184 $self->_fill_offsets_to($n);
185 $self->_extend_file_to($n);
190 my $lastrec = $self->FETCHSIZE-1;
191 if ($n == $lastrec) {
194 # perhaps in this case I should also remove trailing null records?
196 $self->STORE($n, "");
202 $self->_fill_offsets_to($n);
203 0 <= $n && $n < $self->FETCHSIZE;
207 my ($self, $pos, $nrecs, @data) = @_;
211 my $oldsize = $self->FETCHSIZE;
217 croak "Modification of non-creatable array value attempted, subscript $oldpos";
221 if ($pos > $oldsize) {
223 $pos = $oldsize; # This is what perl does for normal arrays
227 $self->_fixrecs(@data);
228 my $data = join '', @data;
229 my $datalen = length $data;
232 # compute length of data being removed
233 # Incidentally fills offsets table
234 for ($pos .. $pos+$nrecs-1) {
235 my $rec = $self->FETCH($_);
236 last unless defined $rec;
238 $oldlen += length($rec);
242 $self->_twrite($data, $self->{offsets}[$pos], $oldlen);
244 # update the offsets table part 1
245 # compute the offsets of the new records:
248 push @new_offsets, $self->{offsets}[$pos];
249 for (0 .. $#data-1) {
250 push @new_offsets, $new_offsets[-1] + length($data[$_]);
253 splice(@{$self->{offsets}}, $pos, $nrecs, @new_offsets);
255 # update the offsets table part 2
256 # adjust the offsets of the following old records
257 for ($pos+@data .. $#{$self->{offsets}}) {
258 $self->{offsets}[$_] += $datalen - $oldlen;
260 # If we scrubbed out all known offsets, regenerate the trivial table
261 # that knows that the file does indeed start at 0.
262 $self->{offsets}[0] = 0 unless @{$self->{offsets}};
264 # Perhaps the following cache foolery could be factored out
265 # into a bunch of mor opaque cache functions. For example,
266 # it's odd to delete a record from the cache and then remove
267 # it from the LRU queue later on; there should be a function to
270 # update the read cache, part 1
272 # Consider this carefully for correctness
273 for ($pos .. $pos+$nrecs-1) {
274 my $cached = $self->{cache}{$_};
275 next unless defined $cached;
276 my $new = $data[$_-$pos];
278 $self->{cached} += length($new) - length($cached);
279 $self->{cache}{$_} = $new;
281 delete $self->{cache}{$_};
282 $self->{cached} -= length($cached);
285 # update the read cache, part 2
286 # moved records - records past the site of the change
287 # need to be renumbered
288 # Maybe merge this with the previous block?
289 for (keys %{$self->{cache}}) {
290 next unless $_ >= $pos + $nrecs;
291 $self->{cache}{$_-$nrecs+@data} = delete $self->{cache}{$_};
296 for (@{$self->{lru}}) {
297 if ($_ >= $pos + $nrecs) {
298 push @new, $_ + @data - $nrecs;
299 } elsif ($_ >= $pos) {
300 push @changed, $_ if $_ < $pos + @data;
305 @{$self->{lru}} = (@new, @changed);
307 # Yes, the return value of 'splice' *is* actually this complicated
308 wantarray ? @result : @result ? $result[-1] : undef;
311 # write data into the file
312 # $data is the data to be written.
313 # it should be written at position $pos, and should overwrite
314 # exactly $len of the following bytes.
315 # Note that if length($data) > $len, the subsequent bytes will have to
316 # be moved up, and if length($data) < $len, they will have to
319 my ($self, $data, $pos, $len) = @_;
321 unless (defined $pos) {
322 die "\$pos was undefined in _twrite";
325 my $len_diff = length($data) - $len;
327 if ($len_diff == 0) { # Woo-hoo!
328 my $fh = $self->{fh};
330 $self->_write_record($data);
331 return; # well, that was easy.
334 # the two records are of different lengths
335 # our strategy here: rewrite the tail of the file,
336 # reading ahead one buffer at a time
337 # $bufsize is required to be at least as large as the data we're overwriting
338 my $bufsize = _bufsize($len_diff);
339 my ($writepos, $readpos) = ($pos, $pos+$len);
342 # Seems like there ought to be a way to avoid the repeated code
343 # and the special case here. The read(1) is also a little weird.
346 $self->_seekb($readpos);
347 my $br = read $self->{fh}, $next_block, $bufsize;
348 my $more_data = read $self->{fh}, my($dummy), 1;
349 $self->_seekb($writepos);
350 $self->_write_record($data);
352 $writepos += length $data;
355 $self->_seekb($writepos);
356 $self->_write_record($next_block);
358 # There might be leftover data at the end of the file
359 $self->_chop_file if $len_diff < 0;
362 # If a record does not already end with the appropriate terminator
363 # string, append one.
367 $_ .= $self->{recsep}
368 unless substr($_, - $self->{recseplen}) eq $self->{recsep};
372 # seek to the beginning of record #$n
373 # Assumes that the offsets table is already correctly populated
375 # Note that $n=-1 has a special meaning here: It means the start of
376 # the last known record; this may or may not be the very last record
377 # in the file, depending on whether the offsets table is fully populated.
381 my $o = $self->{offsets}[$n];
383 or confess("logic error: undefined offset for record $n");
384 seek $self->{fh}, $o, SEEK_SET
385 or die "Couldn't seek filehandle: $!"; # "Should never happen."
390 seek $self->{fh}, $b, SEEK_SET
391 or die "Couldn't seek filehandle: $!"; # "Should never happen."
394 # populate the offsets table up to the beginning of record $n
395 # return the offset of record $n
396 sub _fill_offsets_to {
398 my $fh = $self->{fh};
399 local *OFF = $self->{offsets};
402 until ($#OFF >= $n) {
404 $self->_seek(-1); # tricky -- see comment at _seek
405 $rec = $self->_read_record;
409 return; # It turns out there is no such record
413 # we have now read all the records up to record n-1,
414 # so we can return the offset of record n
418 # assumes that $rec is already suitably terminated
420 my ($self, $rec) = @_;
421 my $fh = $self->{fh};
423 or die "Couldn't write record: $!"; # "Should never happen."
430 { local $/ = $self->{recsep};
431 my $fh = $self->{fh};
438 my ($self, $n, $rec) = @_;
440 # Do not cache records that are too big to fit in the cache.
441 return unless length $rec <= $self->{cachesize};
443 $self->{cache}{$n} = $rec;
444 $self->{cached} += length $rec;
445 push @{$self->{lru}}, $n; # most-recently-used is at the END
447 $self->_cache_flush if $self->{cached} > $self->{cachesize};
453 return unless defined($rec = $self->{cache}{$n});
455 # cache hit; update LRU queue and return $rec
456 # replace this with a heap in a later version
457 @{$self->{lru}} = ((grep $_ ne $n, @{$self->{lru}}), $n);
463 while ($self->{cached} > $self->{cachesize}) {
464 my $lru = shift @{$self->{lru}};
465 $self->{cached} -= length $lru;
466 delete $self->{cache}{$lru};
470 # We have read to the end of the file and have the offsets table
471 # entirely populated. Now we need to write a new record beyond
472 # the end of the file. We prepare for this by writing
473 # empty records into the file up to the position we want
475 # assumes that the offsets table already contains the offset of record $n,
476 # if it exists, and extends to the end of the file if not.
477 sub _extend_file_to {
479 $self->_seek(-1); # position after the end of the last record
480 my $pos = $self->{offsets}[-1];
482 # the offsets table has one entry more than the total number of records
483 $extras = $n - $#{$self->{offsets}};
485 # Todo : just use $self->{recsep} x $extras here?
486 while ($extras-- > 0) {
487 $self->_write_record($self->{recsep});
488 $pos += $self->{recseplen};
489 push @{$self->{offsets}}, $pos;
493 # Truncate the file at the current position
496 truncate $self->{fh}, tell($self->{fh});
499 # compute the size of a buffer suitable for moving
500 # all the data in a file forward $n bytes
501 # ($n may be negative)
502 # The result should be at least $n.
505 return 8192 if $n < 0;
507 $b += 8192 if $n & 8191;
513 my ($self, $op) = @_;
515 my $pack = ref $self;
516 croak "Usage: $pack\->flock([OPERATION])";
518 my $fh = $self->{fh};
519 $op = LOCK_EX unless defined $op;
523 # Given a file, make sure the cache is consistent with the
525 sub _check_integrity {
526 my ($self, $file, $warn) = @_;
529 open F, $file or die "Couldn't open file $file: $!";
531 local $/ = $self->{recsep};
532 unless ($self->{offsets}[0] == 0) {
533 $warn && print STDERR "# rec 0: offset <$self->{offsets}[0]> s/b 0!\n";
538 my $cached = $self->{cache}{$n};
539 my $offset = $self->{offsets}[$.];
541 if (defined $offset && $offset != $ao) {
542 $warn && print STDERR "# rec $n: offset <$offset> actual <$ao>\n";
544 if (defined $cached && $_ ne $cached) {
548 $warn && print STDERR "# rec $n: cached <$cached> actual <$_>\n";
553 while (my ($n, $r) = each %{$self->{cache}}) {
554 $cachesize += length($r);
555 next if $n+1 <= $.; # checked this already
556 $warn && print STDERR "# spurious caching of record $n\n";
559 if ($cachesize != $self->{cached}) {
560 $warn && print STDERR "# cache size is $self->{cached}, should be $cachesize\n";
564 my (%seen, @duplicate);
565 for (@{$self->{lru}}) {
567 if (not exists $self->{cache}{$_}) {
568 print "# $_ is mentioned in the LRU queue, but not in the cache\n";
572 @duplicate = grep $seen{$_}>1, keys %seen;
574 my $records = @duplicate == 1 ? 'Record' : 'Records';
575 my $appear = @duplicate == 1 ? 'appears' : 'appear';
576 print "# $records @duplicate $appear multiple times in LRU queue: @{$self->{lru}}\n";
579 for (keys %{$self->{cache}}) {
580 unless (exists $seen{$_}) {
581 print "# $record $_ is in the cache but not the LRU queue\n";
591 Tie::File - Access the lines of a disk file via a Perl array
595 # This file documents Tie::File version 0.14
597 tie @array, 'Tie::File', filename or die ...;
599 $array[13] = 'blah'; # line 13 of the file is now 'blah'
600 print $array[42]; # display line 42 of the file
602 $n_recs = @array; # how many records are in the file?
603 $#array = $n_recs - 2; # chop records off the end
605 # As you would expect:
607 push @array, new recs...;
609 unshift @array, new recs...;
610 my $r1 = shift @array;
611 @old_recs = splice @array, 3, 7, new recs...;
613 untie @array; # all finished
617 C<Tie::File> represents a regular text file as a Perl array. Each
618 element in the array corresponds to a record in the file. The first
619 line of the file is element 0 of the array; the second line is element
622 The file is I<not> loaded into memory, so this will work even for
625 Changes to the array are reflected in the file immediately.
629 What is a 'record'? By default, the meaning is the same as for the
630 C<E<lt>...E<gt>> operator: It's a string terminated by C<$/>, which is
631 probably C<"\n"> or C<"\r\n">. You may change the definition of
632 "record" by supplying the C<recsep> option in the C<tie> call:
634 tie @array, 'Tie::File', $file, recsep => 'es';
636 This says that records are delimited by the string C<es>. If the file contained the following data:
638 Curse these pesky flies!\n
640 then the C<@array> would appear to have four elements:
647 An undefined value is not permitted as a record separator. Perl's
648 special "paragraph mode" semantics (E<agrave> la C<$/ = "">) are not
651 Records read from the tied array will have the record separator string
652 on the end, just as if they were read from the C<E<lt>...E<gt>>
653 operator. Records stored into the array will have the record
654 separator string appended before they are written to the file, if they
655 don't have one already. For example, if the record separator string
656 is C<"\n">, then the following two lines do exactly the same thing:
658 $array[17] = "Cherry pie";
659 $array[17] = "Cherry pie\n";
661 The result is that the contents of line 17 of the file will be
662 replaced with "Cherry pie"; a newline character will separate line 17
663 from line 18. This means that inparticular, this will do nothing:
667 Because the C<chomp>ed value will have the separator reattached when
668 it is written back to the file. There is no way to create a file
669 whose trailing record separator string is missing.
671 Inserting records that I<contain> the record separator string will
672 produce a reasonable result, but if you can't foresee what this result
673 will be, you'd better avoid doing this.
677 Normally, the specified file will be opened for read and write access,
678 and will be created if it does not exist. (That is, the flags
679 C<O_RDWR | O_CREAT> are supplied in the C<open> call.) If you want to
680 change this, you may supply alternative flags in the C<mode> option.
681 See L<Fcntl> for a listing of available flags.
684 # open the file if it exists, but fail if it does not exist
686 tie @array, 'Tie::File', $file, mode => O_RDWR;
688 # create the file if it does not exist
689 use Fcntl 'O_RDWR', 'O_CREAT';
690 tie @array, 'Tie::File', $file, mode => O_RDWR | O_CREAT;
692 # open an existing file in read-only mode
693 use Fcntl 'O_RDONLY';
694 tie @array, 'Tie::File', $file, mode => O_RDONLY;
696 Opening the data file in write-only or append mode is not supported.
700 Records read in from the file are cached, to avoid having to re-read
701 them repeatedly. If you read the same record twice, the first time it
702 will be stored in memory, and the second time it will be fetched from
705 The cache has a bounded size; when it exceeds this size, the
706 least-recently visited records will be purged from the cache. The
707 default size is 2Mib. You can adjust the amount of space used for the
708 cache by supplying the C<cachesize> option. The argument is the desired cache size, in bytes.
710 # I have a lot of memory, so use a large cache to speed up access
711 tie @array, 'Tie::File', $file, cachesize => 20_000_000;
713 Setting the cache size to 0 will inhibit caching; records will be
714 fetched from disk every time you examine them.
718 C<-mode> is a synonym for C<mode>. C<-recsep> is a synonym for
719 C<recsep>. C<-cachesize> is a synonym for C<cachesize>. You get the
722 =head1 Public Methods
724 The C<tie> call returns an object, say C<$o>. You may call
726 $rec = $o->FETCH($n);
729 to fetch or store the record at line C<$n>, respectively. The only other public method in this package is:
735 will lock the tied file. C<MODE> has the same meaning as the second
736 argument to the Perl built-in C<flock> function; for example
737 C<LOCK_SH> or C<LOCK_EX | LOCK_NB>. (These constants are provided by
738 the C<use Fcntl ':flock'> declaration.)
740 C<MODE> is optional; C<< $o->flock >> simply locks the file with
743 The best way to unlock a file is to discard the object and untie the
744 array. It is probably unsafe to unlock the file without also untying
745 it, because if you do, changes may remain unwritten inside the object.
746 That is why there is no shortcut for unlocking. If you really want to
747 unlock the file prematurely, you know what to do; if you don't know
748 what to do, then don't do it.
750 All the usual warnings about file locking apply here. In particular,
751 note that file locking in Perl is B<advisory>, which means that
752 holding a lock will not prevent anyone else from reading, writing, or
753 erasing the file; it only prevents them from getting another lock at
754 the same time. Locks are analogous to green traffic lights: If you
755 have a green light, that does not prevent the idiot coming the other
756 way from plowing into you sideways; it merely guarantees to you that
757 the idiot does not also have a green light at the same time.
761 (That's Latin for 'warnings'.)
763 =head2 Efficiency Note
765 Every effort was made to make this module efficient. Nevertheless,
766 changing the size of a record in the middle of a large file will
767 always be slow, because everything after the new record must be move.
769 In particular, note that:
776 is likely to be very slow, because the first iteration must relocate
777 lines 1 through 999,999; the second iteration must relocate lines 2
778 through 999,999, and so on. The relocation is done using block
779 writes, however, so it's not as slow as it might be.
781 A future version of this module will provide some mechanism for
782 getting better performance in such cases, by deferring the writing
783 until it can be done all at once.
785 =head2 Efficiency Note 2
787 Not every effort was made to make this module as efficient as
788 possible. C<FETCHSIZE> should use binary search instead of linear
789 search. The cache's LRU queue should be a heap instead of a list.
790 These defects are probably minor; in any event, they will be fixed in
791 a later version of the module.
793 =head2 Efficiency Note 3
795 The author has supposed that since this module is concerned with file
796 I/O, almost all normal use of it will be heavily I/O bound, and that
797 the time to maintain complicated data structures inside the module
798 will be dominated by the time to actually perform the I/O. This
799 suggests, for example, that and LRU read-cache is a good tradeoff,
800 even if it requires substantial adjustment following a C<splice>
805 (That's Latin for 'warnings'.)
807 The behavior of tied arrays is not precisely the same as for regular
810 undef $a[10]; print "How unusual!\n" if $a[10];
812 C<undef>-ing a C<Tie::File> array element just blanks out the
813 corresponding record in the file. When you read it back again, you'll
814 see the record separator (typically, $a[10] will appear to contain
815 "\n") so the supposedly-C<undef>'ed value will be true.
817 There are other minor differences, but in general, the correspondence
824 To contact the author, send email to: C<mjd-perl-tiefile+@plover.com>
826 To receive an announcement whenever a new version of this module is
827 released, send a blank email message to
828 C<mjd-perl-tiefile-subscribe@plover.com>.
832 C<Tie::File> version 0.14 is copyright (C) 2002 Mark Jason Dominus.
834 This program is free software; you can redistribute it and/or modify
835 it under the terms of the GNU General Public License as published by
836 the Free Software Foundation; either version 2 of the License, or (at
837 your option) any later version.
839 This program is distributed in the hope that it will be useful,
840 but WITHOUT ANY WARRANTY; without even the implied warranty of
841 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
842 GNU General Public License for more details.
844 You should have received a copy of the GNU General Public License
845 along with this program; it should be in the file C<COPYING>. If not,
846 write to the Free Software Foundation, Inc., 59 Temple Place, Suite
847 330, Boston, MA 02111 USA
849 For licensing inquiries, contact the author at:
853 Philadelphia, PA 19107
857 C<Tie::File> version 0.14 comes with ABSOLUTELY NO WARRANTY.
858 For details, see the license.
862 Tests for default arguments to SPLICE. Tests for CLEAR/EXTEND.
863 Tests for DELETE/EXISTS.
865 More tests. (Configuration options, cache flushery, locking. _twrite
866 should be tested separately, because there are a lot of weird special
867 cases lurking in there.)
869 More tests. (Stuff I didn't think of yet.)
871 Deferred writing. (!!!)