6 use Fcntl 'O_CREAT', 'O_RDWR', 'LOCK_EX';
9 my $DEFAULT_MEMORY_SIZE = 1<<21; # 2 megabytes
10 my $DEFAULT_AUTODEFER_THRESHHOLD = 3; # 3 records
11 my $DEFAULT_AUTODEFER_FILELEN_THRESHHOLD = 65536; # 16 disk blocksful
13 my %good_opt = map {$_ => 1, "-$_" => 1}
14 qw(memory dw_size mode recsep discipline autodefer autochomp);
18 croak "usage: tie \@array, $_[0], filename, [option => value]...";
20 my ($pack, $file, %opts) = @_;
22 # transform '-foo' keys into 'foo' keys
23 for my $key (keys %opts) {
24 unless ($good_opt{$key}) {
25 croak("$pack: Unrecognized option '$key'\n");
28 if ($key =~ s/^-+//) {
29 $opts{$key} = delete $opts{$okey};
33 unless (defined $opts{memory}) {
34 # default is the larger of the default cache size and the
35 # deferred-write buffer size (if specified)
36 $opts{memory} = $DEFAULT_MEMORY_SIZE;
37 $opts{memory} = $opts{dw_size}
38 if defined $opts{dw_size} && $opts{dw_size} > $DEFAULT_MEMORY_SIZE;
41 $opts{dw_size} = $opts{memory} unless defined $opts{dw_size};
42 if ($opts{dw_size} > $opts{memory}) {
43 croak("$pack: dw_size may not be larger than total memory allocation\n");
45 # are we in deferred-write mode?
46 $opts{defer} = 0 unless defined $opts{defer};
47 $opts{deferred} = {}; # no records are presently deferred
48 $opts{deferred_s} = 0; # count of total bytes in ->{deferred}
49 $opts{deferred_max} = -1; # empty
51 # the cache is a hash instead of an array because it is likely to be
53 $opts{cache} = Tie::File::Cache->new($opts{memory});
55 # autodeferment is enabled by default
56 $opts{autodefer} = 1 unless defined $opts{autodefer};
57 $opts{autodeferring} = 0; # but is not initially active
58 $opts{ad_history} = [];
59 $opts{autodefer_threshhold} = $DEFAULT_AUTODEFER_THRESHHOLD
60 unless defined $opts{autodefer_threshhold};
61 $opts{autodefer_filelen_threshhold} = $DEFAULT_AUTODEFER_FILELEN_THRESHHOLD
62 unless defined $opts{autodefer_filelen_threshhold};
65 $opts{filename} = $file;
66 unless (defined $opts{recsep}) {
67 $opts{recsep} = _default_recsep();
69 $opts{recseplen} = length($opts{recsep});
70 if ($opts{recseplen} == 0) {
71 croak "Empty record separator not supported by $pack";
74 $opts{autochomp} = 1 unless defined $opts{autochomp};
76 my $mode = defined($opts{mode}) ? $opts{mode} : O_CREAT|O_RDWR;
79 if (UNIVERSAL::isa($file, 'GLOB')) {
80 # We use 1 here on the theory that some systems
81 # may not indicate failure if we use 0.
82 # MSWin32 does not indicate failure with 0, but I don't know if
83 # it will indicate failure with 1 or not.
84 unless (seek $file, 1, SEEK_SET) {
85 croak "$pack: your filehandle does not appear to be seekable";
87 seek $file, 0, SEEK_SET # put it back
88 $fh = $file; # setting binmode is the user's problem
90 croak "usage: tie \@array, $pack, filename, [option => value]...";
92 $fh = \do { local *FH }; # only works in 5.005 and later
93 sysopen $fh, $file, $mode, 0666 or return;
96 { my $ofh = select $fh; $| = 1; select $ofh } # autoflush on write
97 if (defined $opts{discipline} && $] >= 5.006) {
98 # This avoids a compile-time warning under 5.005
99 eval 'binmode($fh, $opts{discipline})';
100 croak $@ if $@ =~ /unknown discipline/i;
105 bless \%opts => $pack;
112 # check the defer buffer
113 if ($self->_is_deferring && exists $self->{deferred}{$n}) {
114 $rec = $self->{deferred}{$n};
116 $rec = $self->_fetch($n);
119 $self->_chomp1($rec);
122 # Chomp many records in-place; return nothing useful
125 return unless $self->{autochomp};
126 if ($self->{autochomp}) {
129 substr($_, - $self->{recseplen}) = "";
134 # Chomp one record in-place; return modified record
136 my ($self, $rec) = @_;
137 return $rec unless $self->{autochomp};
138 return unless defined $rec;
139 substr($rec, - $self->{recseplen}) = "";
146 # check the record cache
147 { my $cached = $self->{cache}->lookup($n);
148 return $cached if defined $cached;
151 unless ($#{$self->{offsets}} >= $n) {
152 my $o = $self->_fill_offsets_to($n);
153 # If it's still undefined, there is no such record, so return 'undef'
154 return unless defined $o;
157 my $fh = $self->{FH};
158 $self->_seek($n); # we can do this now that offsets is populated
159 my $rec = $self->_read_record;
161 # If we happen to have just read the first record, check to see if
162 # the length of the record matches what 'tell' says. If not, Tie::File
163 # won't work, and should drop dead.
165 # if ($n == 0 && defined($rec) && tell($self->{fh}) != length($rec)) {
166 # if (defined $self->{discipline}) {
167 # croak "I/O discipline $self->{discipline} not supported";
169 # croak "File encoding not supported";
173 $self->{cache}->insert($n, $rec) if defined $rec && not $self->{flushing};
178 my ($self, $n, $rec) = @_;
179 die "STORE called from _check_integrity!" if $DIAGNOSTIC;
181 $self->_fixrecs($rec);
183 if ($self->{autodefer}) {
184 $self->_annotate_ad_history($n);
187 return $self->_store_deferred($n, $rec) if $self->_is_deferring;
190 # We need this to decide whether the new record will fit
191 # It incidentally populates the offsets table
192 # Note we have to do this before we alter the cache
193 # 20020324 Wait, but this DOES alter the cache. TODO BUG?
194 my $oldrec = $self->_fetch($n);
196 if (defined($self->{cache}->lookup($n))) {
197 $self->{cache}->update($n, $rec);
200 if (not defined $oldrec) {
201 # We're storing a record beyond the end of the file
202 $self->_extend_file_to($n+1);
203 $oldrec = $self->{recsep};
205 my $len_diff = length($rec) - length($oldrec);
207 # length($oldrec) here is not consistent with text mode TODO XXX BUG
208 $self->_twrite($rec, $self->{offsets}[$n], length($oldrec));
210 # now update the offsets
211 # array slice goes from element $n+1 (the first one to move)
213 for (@{$self->{offsets}}[$n+1 .. $#{$self->{offsets}}]) {
218 sub _store_deferred {
219 my ($self, $n, $rec) = @_;
220 $self->{cache}->remove($n);
221 my $old_deferred = $self->{deferred}{$n};
223 if (defined $self->{deferred_max} && $n > $self->{deferred_max}) {
224 $self->{deferred_max} = $n;
226 $self->{deferred}{$n} = $rec;
228 my $len_diff = length($rec);
229 $len_diff -= length($old_deferred) if defined $old_deferred;
230 $self->{deferred_s} += $len_diff;
231 $self->{cache}->adj_limit(-$len_diff);
232 if ($self->{deferred_s} > $self->{dw_size}) {
234 } elsif ($self->_cache_too_full) {
239 # Remove a single record from the deferred-write buffer without writing it
240 # The record need not be present
241 sub _delete_deferred {
243 my $rec = delete $self->{deferred}{$n};
244 return unless defined $rec;
246 if (defined $self->{deferred_max}
247 && $n == $self->{deferred_max}) {
248 undef $self->{deferred_max};
251 $self->{deferred_s} -= length $rec;
252 $self->{cache}->adj_limit(length $rec);
257 my $n = $#{$self->{offsets}};
258 # 20020317 Change this to binary search
259 while (defined ($self->_fill_offsets_to($n+1))) {
262 my $top_deferred = $self->_defer_max;
263 $n = $top_deferred+1 if defined $top_deferred && $n < $top_deferred+1;
268 my ($self, $len) = @_;
270 if ($self->{autodefer}) {
271 $self->_annotate_ad_history('STORESIZE');
274 my $olen = $self->FETCHSIZE;
275 return if $len == $olen; # Woo-hoo!
279 if ($self->_is_deferring) {
280 for ($olen .. $len-1) {
281 $self->_store_deferred($_, $self->{recsep});
284 $self->_extend_file_to($len);
290 if ($self->_is_deferring) {
291 # TODO maybe replace this with map-plus-assignment?
292 for (grep $_ >= $len, keys %{$self->{deferred}}) {
293 $self->_delete_deferred($_);
295 $self->{deferred_max} = $len-1;
300 $#{$self->{offsets}} = $len;
301 # $self->{offsets}[0] = 0; # in case we just chopped this
303 $self->{cache}->remove(grep $_ >= $len, $self->{cache}->keys);
308 $self->SPLICE($self->FETCHSIZE, scalar(@_), @_);
309 # $self->FETCHSIZE; # av.c takes care of this for me
314 my $size = $self->FETCHSIZE;
315 return if $size == 0;
316 # print STDERR "# POPPITY POP POP POP\n";
317 scalar $self->SPLICE($size-1, 1);
322 scalar $self->SPLICE(0, 1);
327 $self->SPLICE(0, 0, @_);
328 # $self->FETCHSIZE; # av.c takes care of this for me
334 if ($self->{autodefer}) {
335 $self->_annotate_ad_history('CLEAR');
340 $self->{cache}->set_limit($self->{memory});
341 $self->{cache}->empty;
342 @{$self->{offsets}} = (0);
343 %{$self->{deferred}}= ();
344 $self->{deferred_s} = 0;
345 $self->{deferred_max} = -1;
351 # No need to pre-extend anything in this case
352 return if $self->_is_deferring;
354 $self->_fill_offsets_to($n);
355 $self->_extend_file_to($n);
361 if ($self->{autodefer}) {
362 $self->_annotate_ad_history('DELETE');
365 my $lastrec = $self->FETCHSIZE-1;
366 my $rec = $self->FETCH($n);
367 $self->_delete_deferred($n) if $self->_is_deferring;
368 if ($n == $lastrec) {
371 $#{$self->{offsets}}--;
372 $self->{cache}->remove($n);
373 # perhaps in this case I should also remove trailing null records?
375 # Note that delete @a[-3..-1] deletes the records in the wrong order,
376 # so we only chop the very last one out of the file. We could repair this
377 # by tracking deleted records inside the object.
378 } elsif ($n < $lastrec) {
379 $self->STORE($n, "");
386 return 1 if exists $self->{deferred}{$n};
387 $self->_fill_offsets_to($n); # I think this is unnecessary
388 $n < $self->FETCHSIZE;
394 if ($self->{autodefer}) {
395 $self->_annotate_ad_history('SPLICE');
398 $self->_flush if $self->_is_deferring; # move this up?
400 $self->_chomp(my @a = $self->_splice(@_));
403 $self->_chomp1(scalar $self->_splice(@_));
409 $self->flush if $self->_is_deferring;
410 $self->{cache}->delink if defined $self->{cache}; # break circular link
414 my ($self, $pos, $nrecs, @data) = @_;
417 $pos = 0 unless defined $pos;
419 # Deal with negative and other out-of-range positions
420 # Also set default for $nrecs
422 my $oldsize = $self->FETCHSIZE;
423 $nrecs = $oldsize unless defined $nrecs;
429 croak "Modification of non-creatable array value attempted, subscript $oldpos";
433 if ($pos > $oldsize) {
435 $pos = $oldsize; # This is what perl does for normal arrays
439 $self->_fixrecs(@data);
440 my $data = join '', @data;
441 my $datalen = length $data;
444 # compute length of data being removed
445 for ($pos .. $pos+$nrecs-1) {
446 $self->_fill_offsets_to($_);
447 my $rec = $self->_fetch($_);
448 last unless defined $rec;
451 # Why don't we just use length($rec) here?
452 # Because that record might have come from the cache. _splice
453 # might have been called to flush out the deferred-write records,
454 # and in this case length($rec) is the length of the record to be *written*,
455 # not the length of the actual record in the file. But the offsets are
456 # still true. 20020322
457 $oldlen += $self->{offsets}[$_+1] - $self->{offsets}[$_]
458 if defined $self->{offsets}[$_+1];
462 $self->_twrite($data, $self->{offsets}[$pos], $oldlen);
464 # update the offsets table part 1
465 # compute the offsets of the new records:
468 push @new_offsets, $self->{offsets}[$pos];
469 for (0 .. $#data-1) {
470 push @new_offsets, $new_offsets[-1] + length($data[$_]);
473 splice(@{$self->{offsets}}, $pos, $nrecs, @new_offsets);
475 # update the offsets table part 2
476 # adjust the offsets of the following old records
477 for ($pos+@data .. $#{$self->{offsets}}) {
478 $self->{offsets}[$_] += $datalen - $oldlen;
480 # If we scrubbed out all known offsets, regenerate the trivial table
481 # that knows that the file does indeed start at 0.
482 $self->{offsets}[0] = 0 unless @{$self->{offsets}};
484 # Perhaps the following cache foolery could be factored out
485 # into a bunch of mor opaque cache functions. For example,
486 # it's odd to delete a record from the cache and then remove
487 # it from the LRU queue later on; there should be a function to
490 # update the read cache, part 1
492 for ($pos .. $pos+$nrecs-1) {
493 my $new = $data[$_-$pos];
495 $self->{cache}->update($_, $new);
497 $self->{cache}->remove($_);
501 # update the read cache, part 2
502 # moved records - records past the site of the change
503 # need to be renumbered
504 # Maybe merge this with the previous block?
506 my @oldkeys = grep $_ >= $pos + $nrecs, $self->{cache}->keys;
507 my @newkeys = map $_-$nrecs+@data, @oldkeys;
508 $self->{cache}->rekey(\@oldkeys, \@newkeys);
511 # Now there might be too much data in the cache, if we spliced out
512 # some short records and spliced in some long ones. If so, flush
516 # Yes, the return value of 'splice' *is* actually this complicated
517 wantarray ? @result : @result ? $result[-1] : undef;
520 # write data into the file
521 # $data is the data to be written.
522 # it should be written at position $pos, and should overwrite
523 # exactly $len of the following bytes.
524 # Note that if length($data) > $len, the subsequent bytes will have to
525 # be moved up, and if length($data) < $len, they will have to
528 my ($self, $data, $pos, $len) = @_;
530 unless (defined $pos) {
531 die "\$pos was undefined in _twrite";
534 my $len_diff = length($data) - $len;
536 if ($len_diff == 0) { # Woo-hoo!
537 my $fh = $self->{fh};
539 $self->_write_record($data);
540 return; # well, that was easy.
543 # the two records are of different lengths
544 # our strategy here: rewrite the tail of the file,
545 # reading ahead one buffer at a time
546 # $bufsize is required to be at least as large as the data we're overwriting
547 my $bufsize = _bufsize($len_diff);
548 my ($writepos, $readpos) = ($pos, $pos+$len);
552 # Seems like there ought to be a way to avoid the repeated code
553 # and the special case here. The read(1) is also a little weird.
556 $self->_seekb($readpos);
557 my $br = read $self->{fh}, $next_block, $bufsize;
558 $more_data = read $self->{fh}, my($dummy), 1;
559 $self->_seekb($writepos);
560 $self->_write_record($data);
562 $writepos += length $data;
564 } while $more_data; # BUG XXX TODO how could this have worked?
565 $self->_seekb($writepos);
566 $self->_write_record($next_block);
568 # There might be leftover data at the end of the file
569 $self->_chop_file if $len_diff < 0;
572 # If a record does not already end with the appropriate terminator
573 # string, append one.
577 $_ .= $self->{recsep}
578 unless substr($_, - $self->{recseplen}) eq $self->{recsep};
583 ################################################################
585 # Basic read, write, and seek
588 # seek to the beginning of record #$n
589 # Assumes that the offsets table is already correctly populated
591 # Note that $n=-1 has a special meaning here: It means the start of
592 # the last known record; this may or may not be the very last record
593 # in the file, depending on whether the offsets table is fully populated.
597 my $o = $self->{offsets}[$n];
599 or confess("logic error: undefined offset for record $n");
600 seek $self->{fh}, $o, SEEK_SET
601 or die "Couldn't seek filehandle: $!"; # "Should never happen."
606 seek $self->{fh}, $b, SEEK_SET
607 or die "Couldn't seek filehandle: $!"; # "Should never happen."
610 # populate the offsets table up to the beginning of record $n
611 # return the offset of record $n
612 sub _fill_offsets_to {
614 my $fh = $self->{fh};
615 local *OFF = $self->{offsets};
618 until ($#OFF >= $n) {
620 $self->_seek(-1); # tricky -- see comment at _seek
621 $rec = $self->_read_record;
625 return; # It turns out there is no such record
629 # we have now read all the records up to record n-1,
630 # so we can return the offset of record n
634 # assumes that $rec is already suitably terminated
636 my ($self, $rec) = @_;
637 my $fh = $self->{fh};
639 or die "Couldn't write record: $!"; # "Should never happen."
640 $self->{_written} += length($rec);
646 { local $/ = $self->{recsep};
647 my $fh = $self->{fh};
650 $self->{_read} += length($rec) if defined $rec;
655 @{$self}{'_read', '_written'};
658 ################################################################
660 # Read cache management
664 $self->{cache}->reduce_size_to($self->{memory} - $self->{deferred_s});
667 sub _cache_too_full {
669 $self->{cache}->bytes + $self->{deferred_s} >= $self->{memory};
672 ################################################################
674 # File custodial services
678 # We have read to the end of the file and have the offsets table
679 # entirely populated. Now we need to write a new record beyond
680 # the end of the file. We prepare for this by writing
681 # empty records into the file up to the position we want
683 # assumes that the offsets table already contains the offset of record $n,
684 # if it exists, and extends to the end of the file if not.
685 sub _extend_file_to {
687 $self->_seek(-1); # position after the end of the last record
688 my $pos = $self->{offsets}[-1];
690 # the offsets table has one entry more than the total number of records
691 my $extras = $n - $#{$self->{offsets}};
693 # Todo : just use $self->{recsep} x $extras here?
694 while ($extras-- > 0) {
695 $self->_write_record($self->{recsep});
696 push @{$self->{offsets}}, tell $self->{fh};
700 # Truncate the file at the current position
703 truncate $self->{fh}, tell($self->{fh});
707 # compute the size of a buffer suitable for moving
708 # all the data in a file forward $n bytes
709 # ($n may be negative)
710 # The result should be at least $n.
713 return 8192 if $n < 0;
715 $b += 8192 if $n & 8191;
719 ################################################################
721 # Miscellaneous public methods
726 my ($self, $op) = @_;
728 my $pack = ref $self;
729 croak "Usage: $pack\->flock([OPERATION])";
731 my $fh = $self->{fh};
732 $op = LOCK_EX unless defined $op;
736 # Get/set autochomp option
740 my $old = $self->{autochomp};
741 $self->{autochomp} = shift;
748 ################################################################
750 # Matters related to deferred writing
756 $self->_stop_autodeferring;
757 @{$self->{ad_history}} = ();
761 # Flush deferred writes
763 # This could be better optimized to write the file in one pass, instead
764 # of one pass per block of records. But that will require modifications
765 # to _twrite, so I should have a good _twite test suite first.
775 my @writable = sort {$a<=>$b} (keys %{$self->{deferred}});
778 # gather all consecutive records from the front of @writable
779 my $first_rec = shift @writable;
780 my $last_rec = $first_rec+1;
781 ++$last_rec, shift @writable while @writable && $last_rec == $writable[0];
783 $self->_fill_offsets_to($last_rec);
784 $self->_extend_file_to($last_rec);
785 $self->_splice($first_rec, $last_rec-$first_rec+1,
786 @{$self->{deferred}}{$first_rec .. $last_rec});
789 $self->_discard; # clear out defered-write-cache
792 # Discard deferred writes and disable future deferred writes
799 # Discard deferred writes, but retain old deferred writing mode
802 %{$self->{deferred}} = ();
803 $self->{deferred_s} = 0;
804 $self->{deferred_max} = -1;
805 $self->{cache}->set_limit($self->{memory});
808 # Deferred writing is enabled, either explicitly ($self->{defer})
809 # or automatically ($self->{autodeferring})
812 $self->{defer} || $self->{autodeferring};
815 # The largest record number of any deferred record
818 return $self->{deferred_max} if defined $self->{deferred_max};
820 for my $key (keys %{$self->{deferred}}) {
821 $max = $key if $key > $max;
823 $self->{deferred_max} = $max;
827 ################################################################
829 # Matters related to autodeferment
832 # Get/set autodefer option
836 my $old = $self->{autodefer};
837 $self->{autodefer} = shift;
839 $self->_stop_autodeferring;
840 @{$self->{ad_history}} = ();
848 # The user is trying to store record #$n Record that in the history,
849 # and then enable (or disable) autodeferment if that seems useful.
850 # Note that it's OK for $n to be a non-number, as long as the function
851 # is prepared to deal with that. Nobody else looks at the ad_history.
853 # Now, what does the ad_history mean, and what is this function doing?
854 # Essentially, the idea is to enable autodeferring when we see that the
855 # user has made three consecutive STORE calls to three consecutive records.
856 # ("Three" is actually ->{autodefer_threshhold}.)
857 # A STORE call for record #$n inserts $n into the autodefer history,
858 # and if the history contains three consecutive records, we enable
859 # autodeferment. An ad_history of [X, Y] means that the most recent
860 # STOREs were for records X, X+1, ..., Y, in that order.
862 # Inserting a nonconsecutive number erases the history and starts over.
864 # Performing a special operation like SPLICE erases the history.
866 # There's one special case: CLEAR means that CLEAR was just called.
867 # In this case, we prime the history with [-2, -1] so that if the next
868 # write is for record 0, autodeferring goes on immediately. This is for
869 # the common special case of "@a = (...)".
871 sub _annotate_ad_history {
873 return unless $self->{autodefer}; # feature is disabled
874 return if $self->{defer}; # already in explicit defer mode
875 return unless $self->{offsets}[-1] >= $self->{autodefer_filelen_threshhold};
877 local *H = $self->{ad_history};
879 @H = (-2, -1); # prime the history with fake records
880 $self->_stop_autodeferring;
881 } elsif ($n =~ /^\d+$/) {
885 if ($H[1] == $n-1) { # another consecutive record
887 if ($H[1] - $H[0] + 1 >= $self->{autodefer_threshhold}) {
888 $self->{autodeferring} = 1;
890 } else { # nonconsecutive- erase and start over
892 $self->_stop_autodeferring;
895 } else { # SPLICE or STORESIZE or some such
897 $self->_stop_autodeferring;
901 # If autodferring was enabled, cut it out and discard the history
902 sub _stop_autodeferring {
904 if ($self->{autodeferring}) {
907 $self->{autodeferring} = 0;
910 ################################################################
913 # This is NOT a method. It is here for two reasons:
914 # 1. To factor a fairly complicated block out of the constructor
915 # 2. To provide access for the test suite, which need to be sure
916 # files are being written properly.
917 sub _default_recsep {
919 if ($^O eq 'MSWin32') { # Dos too?
920 # Windows users expect files to be terminated with \r\n
921 # But $/ is set to \n instead
922 # Note that this also transforms \n\n into \r\n\r\n.
924 $recsep =~ s/\n/\r\n/g;
929 # Utility function for _check_integrity
937 # Given a file, make sure the cache is consistent with the
938 # file contents and the internal data structures are consistent with
939 # each other. Returns true if everything checks out, false if not
941 # The $file argument is no longer used. It is retained for compatibility
942 # with the existing test suite.
943 sub _check_integrity {
944 my ($self, $file, $warn) = @_;
945 my $rsl = $self->{recseplen};
946 my $rs = $self->{recsep};
948 local *_; # local $_ does not work here
949 local $DIAGNOSTIC = 1;
951 if (not defined $rs) {
952 _ci_warn("recsep is undef!");
954 } elsif ($rs eq "") {
955 _ci_warn("recsep is empty!");
957 } elsif ($rsl != length $rs) {
959 _ci_warn("recsep <$rs> has length $ln, should be $rsl");
963 if (not defined $self->{offsets}[0]) {
964 _ci_warn("offset 0 is missing!");
966 } elsif ($self->{offsets}[0] != 0) {
967 _ci_warn("rec 0: offset <$self->{offsets}[0]> s/b 0!");
973 local *F = $self->{fh};
980 my $cached = $self->{cache}->_produce($n);
981 my $offset = $self->{offsets}[$.];
983 if (defined $offset && $offset != $ao) {
984 _ci_warn("rec $n: offset <$offset> actual <$ao>");
987 if (defined $cached && $_ ne $cached && ! $self->{deferred}{$n}) {
989 _ci_warn("rec $n: cached <$cached> actual <$_>");
991 if (defined $cached && substr($cached, -$rsl) ne $rs) {
992 _ci_warn("rec $n in the cache is missing the record separator");
996 my $deferring = $self->_is_deferring;
997 for my $n ($self->{cache}->keys) {
998 my $r = $self->{cache}->_produce($n);
999 $cached += length($r);
1000 next if $n+1 <= $.; # checked this already
1001 _ci_warn("spurious caching of record $n");
1004 my $b = $self->{cache}->bytes;
1005 if ($cached != $b) {
1006 _ci_warn("cache size is $b, should be $cached");
1011 $good = 0 unless $self->{cache}->_check_integrity;
1013 # Now let's check the deferbuffer
1014 # Unless deferred writing is enabled, it should be empty
1015 if (! $self->_is_deferring && %{$self->{deferred}}) {
1016 _ci_warn("deferred writing disabled, but deferbuffer nonempty");
1020 # Any record in the deferbuffer should *not* be present in the readcache
1022 while (my ($n, $r) = each %{$self->{deferred}}) {
1023 $deferred_s += length($r);
1024 if (defined $self->{cache}->_produce($n)) {
1025 _ci_warn("record $n is in the deferbuffer *and* the readcache");
1028 if (substr($r, -$rsl) ne $rs) {
1029 _ci_warn("rec $n in the deferbuffer is missing the record separator");
1034 # Total size of deferbuffer should match internal total
1035 if ($deferred_s != $self->{deferred_s}) {
1036 _ci_warn("buffer size is $self->{deferred_s}, should be $deferred_s");
1040 # Total size of deferbuffer should not exceed the specified limit
1041 if ($deferred_s > $self->{dw_size}) {
1042 _ci_warn("buffer size is $self->{deferred_s} which exceeds the limit of $self->{dw_size}");
1046 # Total size of cached data should not exceed the specified limit
1047 if ($deferred_s + $cached > $self->{memory}) {
1048 my $total = $deferred_s + $cached;
1049 _ci_warn("total stored data size is $total which exceeds the limit of $self->{memory}");
1053 # Stuff related to autodeferment
1054 if (!$self->{autodefer} && @{$self->{ad_history}}) {
1055 _ci_warn("autodefer is disabled, but ad_history is nonempty");
1058 if ($self->{autodeferring} && $self->{defer}) {
1059 _ci_warn("both autodeferring and explicit deferring are active");
1062 if (@{$self->{ad_history}} == 0) {
1063 # That's OK, no additional tests required
1064 } elsif (@{$self->{ad_history}} == 2) {
1065 my @non_number = grep !/^-?\d+$/, @{$self->{ad_history}};
1069 $msg = "ad_history contains non-numbers (@{$self->{ad_history}})";
1073 } elsif ($self->{ad_history}[1] < $self->{ad_history}[0]) {
1074 _ci_warn("ad_history has nonsensical values @{$self->{ad_history}}");
1078 _ci_warn("ad_history has bad length <@{$self->{ad_history}}>");
1085 ################################################################
1091 package Tie::File::Cache;
1092 $Tie::File::Cache::VERSION = $Tie::File::VERSION;
1093 use Carp ':DEFAULT', 'confess';
1102 my ($pack, $max) = @_;
1104 croak "missing argument to ->new" unless defined $max;
1106 bless $self => $pack;
1107 @$self = (Tie::File::Heap->new($self), {}, $max, 0);
1112 my ($self, $n) = @_;
1117 my ($self, $n) = @_;
1121 # For internal use only
1122 # Will be called by the heap structure to notify us that a certain
1123 # piece of data has moved from one heap element to another.
1124 # $k is the hash key of the item
1125 # $n is the new index into the heap at which it is stored
1126 # If $n is undefined, the item has been removed from the heap.
1128 my ($self, $k, $n) = @_;
1130 $self->[HASH]{$k} = $n;
1132 delete $self->[HASH]{$k};
1137 my ($self, $key, $val) = @_;
1139 croak "missing argument to ->insert" unless defined $key;
1140 unless (defined $self->[MAX]) {
1141 confess "undefined max" ;
1143 confess "undefined val" unless defined $val;
1144 return if length($val) > $self->[MAX];
1145 my $oldnode = $self->[HASH]{$key};
1146 if (defined $oldnode) {
1147 my $oldval = $self->[HEAP]->set_val($oldnode, $val);
1148 $self->[BYTES] -= length($oldval);
1150 $self->[HEAP]->insert($key, $val);
1152 $self->[BYTES] += length($val);
1158 my $old_data = $self->[HEAP]->popheap;
1159 return unless defined $old_data;
1160 $self->[BYTES] -= length $old_data;
1165 my ($self, @keys) = @_;
1167 for my $key (@keys) {
1168 next unless exists $self->[HASH]{$key};
1169 my $old_data = $self->[HEAP]->remove($self->[HASH]{$key});
1170 $self->[BYTES] -= length $old_data;
1171 push @result, $old_data;
1177 my ($self, $key) = @_;
1179 croak "missing argument to ->lookup" unless defined $key;
1180 if (exists $self->[HASH]{$key}) {
1181 $self->[HEAP]->lookup($self->[HASH]{$key});
1187 # For internal use only
1189 my ($self, $key) = @_;
1190 my $loc = $self->[HASH]{$key};
1191 return unless defined $loc;
1192 $self->[HEAP][$loc][2];
1195 # For internal use only
1197 my ($self, $key) = @_;
1198 $self->[HEAP]->promote($self->[HASH]{$key});
1203 %{$self->[HASH]} = ();
1205 $self->[HEAP]->empty;
1210 keys %{$self->[HASH]} == 0;
1214 my ($self, $key, $val) = @_;
1216 croak "missing argument to ->update" unless defined $key;
1217 if (length($val) > $self->[MAX]) {
1218 my $oldval = $self->remove($key);
1219 $self->[BYTES] -= length($oldval) if defined $oldval;
1220 } elsif (exists $self->[HASH]{$key}) {
1221 my $oldval = $self->[HEAP]->set_val($self->[HASH]{$key}, $val);
1222 $self->[BYTES] += length($val);
1223 $self->[BYTES] -= length($oldval) if defined $oldval;
1225 $self->[HEAP]->insert($key, $val);
1226 $self->[BYTES] += length($val);
1232 my ($self, $okeys, $nkeys) = @_;
1235 @map{@$okeys} = @$nkeys;
1236 croak "missing argument to ->rekey" unless defined $nkeys;
1237 croak "length mismatch in ->rekey arguments" unless @$nkeys == @$okeys;
1238 my %adjusted; # map new keys to heap indices
1239 # You should be able to cut this to one loop TODO XXX
1240 for (0 .. $#$okeys) {
1241 $adjusted{$nkeys->[$_]} = delete $self->[HASH]{$okeys->[$_]};
1243 while (my ($nk, $ix) = each %adjusted) {
1244 # @{$self->[HASH]}{keys %adjusted} = values %adjusted;
1245 $self->[HEAP]->rekey($ix, $nk);
1246 $self->[HASH]{$nk} = $ix;
1252 my @a = keys %{$self->[HASH]};
1261 sub reduce_size_to {
1262 my ($self, $max) = @_;
1263 until ($self->is_empty || $self->[BYTES] <= $max) {
1270 until ($self->is_empty || $self->[BYTES] <= $self->[MAX]) {
1275 # For internal use only
1278 $self->[HEAP]->expire_order;
1281 sub _check_integrity {
1283 $self->[HEAP]->_check_integrity;
1288 $self->[HEAP] = undef; # Bye bye heap
1291 ################################################################
1295 # Heap data structure for use by cache LRU routines
1297 package Tie::File::Heap;
1298 use Carp ':DEFAULT', 'confess';
1299 $Tie::File::Heap::VERSION = $Tie::File::Cache::VERSION;
1305 my ($pack, $cache) = @_;
1306 die "$pack: Parent cache object $cache does not support _heap_move method"
1307 unless eval { $cache->can('_heap_move') };
1308 my $self = [[0,$cache,0]];
1309 bless $self => $pack;
1312 # Allocate a new sequence number, larger than all previously allocated numbers
1347 $self->[0][0] = 0; # might as well reset the sequence numbers
1350 # notify the parent cache objec tthat we moved something
1353 $self->_cache->_heap_move(@_);
1356 # Insert a piece of data into the heap with the indicated sequence number.
1357 # The item with the smallest sequence number is always at the top.
1358 # If no sequence number is specified, allocate a new one and insert the
1359 # item at the bottom.
1361 my ($self, $key, $data, $seq) = @_;
1362 $seq = $self->_nseq unless defined $seq;
1363 $self->_insert_new([$seq, $key, $data]);
1366 # Insert a new, fresh item at the bottom of the heap
1368 my ($self, $item) = @_;
1370 $i = int($i/2) until defined $self->[$i/2];
1371 $self->[$i] = $item;
1372 $self->_heap_move($self->[$i][KEY], $i);
1376 # Insert [$data, $seq] pair at or below item $i in the heap.
1377 # If $i is omitted, default to 1 (the top element.)
1379 my ($self, $item, $i) = @_;
1380 $self->_check_loc($i) if defined $i;
1381 $i = 1 unless defined $i;
1382 until (! defined $self->[$i]) {
1383 if ($self->[$i][SEQ] > $item->[SEQ]) { # inserted item is older
1384 ($self->[$i], $item) = ($item, $self->[$i]);
1385 $self->_heap_move($self->[$i][KEY], $i);
1387 # If either is undefined, go that way. Otherwise, choose at random
1389 $dir = 0 if !defined $self->[2*$i];
1390 $dir = 1 if !defined $self->[2*$i+1];
1391 $dir = int(rand(2)) unless defined $dir;
1394 $self->[$i] = $item;
1395 $self->_heap_move($self->[$i][KEY], $i);
1399 # Remove the item at node $i from the heap, moving child items upwards.
1400 # The item with the smallest sequence number is always at the top.
1401 # Moving items upwards maintains this condition.
1402 # Return the removed item.
1404 my ($self, $i) = @_;
1405 $i = 1 unless defined $i;
1406 my $top = $self->[$i];
1407 return unless defined $top;
1410 my ($L, $R) = (2*$i, 2*$i+1);
1412 # If either is undefined, go the other way.
1413 # Otherwise, go towards the smallest.
1414 last unless defined $self->[$L] || defined $self->[$R];
1415 $ii = $R if not defined $self->[$L];
1416 $ii = $L if not defined $self->[$R];
1417 unless (defined $ii) {
1418 $ii = $self->[$L][SEQ] < $self->[$R][SEQ] ? $L : $R;
1421 $self->[$i] = $self->[$ii]; # Promote child to fill vacated spot
1422 $self->_heap_move($self->[$i][KEY], $i);
1423 $i = $ii; # Fill new vacated spot
1425 $self->_heap_move($top->[KEY], undef);
1436 # set the sequence number of the indicated item to a higher number
1437 # than any other item in the heap, and bubble the item down to the
1440 my ($self, $n) = @_;
1441 $self->_check_loc($n);
1442 $self->[$n][SEQ] = $self->_nseq;
1445 my ($L, $R) = (2*$i, 2*$i+1);
1447 last unless defined $self->[$L] || defined $self->[$R];
1448 $dir = $R unless defined $self->[$L];
1449 $dir = $L unless defined $self->[$R];
1450 unless (defined $dir) {
1451 $dir = $self->[$L][SEQ] < $self->[$R][SEQ] ? $L : $R;
1453 @{$self}[$i, $dir] = @{$self}[$dir, $i];
1455 $self->_heap_move($self->[$_][KEY], $_) if defined $self->[$_];
1461 # Return item $n from the heap, promoting its LRU status
1463 my ($self, $n) = @_;
1464 $self->_check_loc($n);
1465 my $val = $self->[$n];
1471 # Assign a new value for node $n, promoting it to the bottom of the heap
1473 my ($self, $n, $val) = @_;
1474 $self->_check_loc($n);
1475 my $oval = $self->[$n][DAT];
1476 $self->[$n][DAT] = $val;
1481 # The hask key has changed for an item;
1482 # alter the heap's record of the hash key
1484 my ($self, $n, $new_key) = @_;
1485 $self->_check_loc($n);
1486 $self->[$n][KEY] = $new_key;
1490 my ($self, $n) = @_;
1491 unless (defined $self->[$n]) {
1492 confess "_check_loc($n) failed";
1496 sub _check_integrity {
1499 unless (eval {$self->[0][1]->isa("Tie::File::Cache")}) {
1500 print "# Element 0 of heap corrupt\n";
1503 $good = 0 unless $self->_satisfies_heap_condition(1);
1504 for my $i (2 .. $#{$self}) {
1505 my $p = int($i/2); # index of parent node
1506 if (defined $self->[$i] && ! defined $self->[$p]) {
1507 print "# Element $i of heap defined, but parent $p isn't\n";
1514 sub _satisfies_heap_condition {
1520 next unless defined $self->[$c];
1521 if ($self->[$n][SEQ] >= $self->[$c]) {
1522 print "# Node $n of heap does not predate node $c\n";
1525 $good = 0 unless $self->_satisfies_heap_condition($c);
1530 # Return a list of all the values, sorted by expiration order
1533 my @nodes = sort {$a->[SEQ] <=> $b->[SEQ]} $self->_nodes;
1534 map { $_->[KEY] } @nodes;
1540 return unless defined $self->[$i];
1541 ($self->[$i], $self->_nodes($i*2), $self->_nodes($i*2+1));
1548 "Cogito, ergo sum."; # don't forget to return a true value from the file
1552 Tie::File - Access the lines of a disk file via a Perl array
1556 # This file documents Tie::File version 0.90
1558 tie @array, 'Tie::File', filename or die ...;
1560 $array[13] = 'blah'; # line 13 of the file is now 'blah'
1561 print $array[42]; # display line 42 of the file
1563 $n_recs = @array; # how many records are in the file?
1564 $#array -= 2; # chop two records off the end
1568 s/PERL/Perl/g; # Replace PERL with Perl everywhere in the file
1571 # These are just like regular push, pop, unshift, shift, and splice
1572 # Except that they modify the file in the way you would expect
1574 push @array, new recs...;
1575 my $r1 = pop @array;
1576 unshift @array, new recs...;
1577 my $r1 = shift @array;
1578 @old_recs = splice @array, 3, 7, new recs...;
1580 untie @array; # all finished
1585 C<Tie::File> represents a regular text file as a Perl array. Each
1586 element in the array corresponds to a record in the file. The first
1587 line of the file is element 0 of the array; the second line is element
1590 The file is I<not> loaded into memory, so this will work even for
1593 Changes to the array are reflected in the file immediately.
1595 Lazy people and beginners may now stop reading the manual.
1599 What is a 'record'? By default, the meaning is the same as for the
1600 C<E<lt>...E<gt>> operator: It's a string terminated by C<$/>, which is
1601 probably C<"\n">. (Minor exception: on dos and Win32 systems, a
1602 'record' is a string terminated by C<"\r\n">.) You may change the
1603 definition of "record" by supplying the C<recsep> option in the C<tie>
1606 tie @array, 'Tie::File', $file, recsep => 'es';
1608 This says that records are delimited by the string C<es>. If the file
1609 contained the following data:
1611 Curse these pesky flies!\n
1613 then the C<@array> would appear to have four elements:
1620 An undefined value is not permitted as a record separator. Perl's
1621 special "paragraph mode" semantics (E<agrave> la C<$/ = "">) are not
1624 Records read from the tied array do not have the record separator
1625 string on the end; this is to allow
1627 $array[17] .= "extra";
1629 to work as expected.
1631 (See L<"autochomp">, below.) Records stored into the array will have
1632 the record separator string appended before they are written to the
1633 file, if they don't have one already. For example, if the record
1634 separator string is C<"\n">, then the following two lines do exactly
1637 $array[17] = "Cherry pie";
1638 $array[17] = "Cherry pie\n";
1640 The result is that the contents of line 17 of the file will be
1641 replaced with "Cherry pie"; a newline character will separate line 17
1642 from line 18. This means that in particular, this will do nothing:
1646 Because the C<chomp>ed value will have the separator reattached when
1647 it is written back to the file. There is no way to create a file
1648 whose trailing record separator string is missing.
1650 Inserting records that I<contain> the record separator string will
1651 produce a reasonable result, but if you can't foresee what this result
1652 will be, you'd better avoid doing this.
1656 Normally, array elements have the record separator removed, so that if
1657 the file contains the text
1663 the tied array will appear to contain C<("Gold", "Frankincense",
1664 "Myrrh")>. If you set C<autochomp> to a false value, the record
1665 separator will not be removed. If the file above was tied with
1667 tie @gifts, "Tie::File", $gifts, autochomp => 0;
1669 then the array C<@gifts> would appear to contain C<("Gold\n",
1670 "Frankincense\n", "Myrrh\n")>, or (on Win32 systems) C<("Gold\r\n",
1671 "Frankincense\r\n", "Myrrh\r\n")>.
1675 Normally, the specified file will be opened for read and write access,
1676 and will be created if it does not exist. (That is, the flags
1677 C<O_RDWR | O_CREAT> are supplied in the C<open> call.) If you want to
1678 change this, you may supply alternative flags in the C<mode> option.
1679 See L<Fcntl> for a listing of available flags.
1682 # open the file if it exists, but fail if it does not exist
1684 tie @array, 'Tie::File', $file, mode => O_RDWR;
1686 # create the file if it does not exist
1687 use Fcntl 'O_RDWR', 'O_CREAT';
1688 tie @array, 'Tie::File', $file, mode => O_RDWR | O_CREAT;
1690 # open an existing file in read-only mode
1691 use Fcntl 'O_RDONLY';
1692 tie @array, 'Tie::File', $file, mode => O_RDONLY;
1694 Opening the data file in write-only or append mode is not supported.
1698 This is an upper limit on the amount of memory that C<Tie::File> will
1699 consume at any time while managing the file. This is used for two
1700 things: managing the I<read cache> and managing the I<deferred write
1703 Records read in from the file are cached, to avoid having to re-read
1704 them repeatedly. If you read the same record twice, the first time it
1705 will be stored in memory, and the second time it will be fetched from
1706 the I<read cache>. The amount of data in the read cache will not
1707 exceed the value you specified for C<memory>. If C<Tie::File> wants
1708 to cache a new record, but the read cache is full, it will make room
1709 by expiring the least-recently visited records from the read cache.
1711 The default memory limit is 2Mib. You can adjust the maximum read
1712 cache size by supplying the C<memory> option. The argument is the
1713 desired cache size, in bytes.
1715 # I have a lot of memory, so use a large cache to speed up access
1716 tie @array, 'Tie::File', $file, memory => 20_000_000;
1718 Setting the memory limit to 0 will inhibit caching; records will be
1719 fetched from disk every time you examine them.
1723 (This is an advanced feature. Skip this section on first reading.)
1725 If you use deferred writing (See L<"Deferred Writing">, below) then
1726 data you write into the array will not be written directly to the
1727 file; instead, it will be saved in the I<deferred write buffer> to be
1728 written out later. Data in the deferred write buffer is also charged
1729 against the memory limit you set with the C<memory> option.
1731 You may set the C<dw_size> option to limit the amount of data that can
1732 be saved in the deferred write buffer. This limit may not exceed the
1733 total memory limit. For example, if you set C<dw_size> to 1000 and
1734 C<memory> to 2500, that means that no more than 1000 bytes of deferred
1735 writes will be saved up. The space available for the read cache will
1736 vary, but it will always be at least 1500 bytes (if the deferred write
1737 buffer is full) and it could grow as large as 2500 bytes (if the
1738 deferred write buffer is empty.)
1740 If you don't specify a C<dw_size>, it defaults to the entire memory
1743 =head2 Option Format
1745 C<-mode> is a synonym for C<mode>. C<-recsep> is a synonym for
1746 C<recsep>. C<-memory> is a synonym for C<memory>. You get the
1749 =head1 Public Methods
1751 The C<tie> call returns an object, say C<$o>. You may call
1753 $rec = $o->FETCH($n);
1754 $o->STORE($n, $rec);
1756 to fetch or store the record at line C<$n>, respectively; similarly
1757 the other tied array methods. (See L<perltie> for details.) You may
1758 also call the following methods on this object:
1764 will lock the tied file. C<MODE> has the same meaning as the second
1765 argument to the Perl built-in C<flock> function; for example
1766 C<LOCK_SH> or C<LOCK_EX | LOCK_NB>. (These constants are provided by
1767 the C<use Fcntl ':flock'> declaration.)
1769 C<MODE> is optional; the default is C<LOCK_EX>.
1771 C<Tie::File> promises that the following sequence of operations will
1774 my $o = tie @array, "Tie::File", $filename;
1777 In particular, C<Tie::File> will I<not> read or write the file during
1778 the C<tie> call. (Exception: Using C<mode =E<gt> O_TRUNC> will, of
1779 course, erase the file during the C<tie> call. If you want to do this
1780 safely, then open the file without C<O_TRUNC>, lock the file, and use
1783 The best way to unlock a file is to discard the object and untie the
1784 array. It is probably unsafe to unlock the file without also untying
1785 it, because if you do, changes may remain unwritten inside the object.
1786 That is why there is no shortcut for unlocking. If you really want to
1787 unlock the file prematurely, you know what to do; if you don't know
1788 what to do, then don't do it.
1790 All the usual warnings about file locking apply here. In particular,
1791 note that file locking in Perl is B<advisory>, which means that
1792 holding a lock will not prevent anyone else from reading, writing, or
1793 erasing the file; it only prevents them from getting another lock at
1794 the same time. Locks are analogous to green traffic lights: If you
1795 have a green light, that does not prevent the idiot coming the other
1796 way from plowing into you sideways; it merely guarantees to you that
1797 the idiot does not also have a green light at the same time.
1801 my $old_value = $o->autochomp(0); # disable autochomp option
1802 my $old_value = $o->autochomp(1); # enable autochomp option
1804 my $ac = $o->autochomp(); # recover current value
1806 See L<"autochomp">, above.
1808 =head2 C<defer>, C<flush>, C<discard>, and C<autodefer>
1810 See L<"Deferred Writing">, below.
1812 =head1 Tying to an already-opened filehandle
1814 If C<$fh> is a filehandle, such as is returned by C<IO::File> or one
1815 of the other C<IO> modules, you may use:
1817 tie @array, 'Tie::File', $fh, ...;
1819 Similarly if you opened that handle C<FH> with regular C<open> or
1820 C<sysopen>, you may use:
1822 tie @array, 'Tie::File', \*FH, ...;
1824 Handles that were opened write-only won't work. Handles that were
1825 opened read-only will work as long as you don't try to modify the
1826 array. Handles must be attached to seekable sources of data---that
1827 means no pipes or sockets. If you supply a non-seekable handle, the
1828 C<tie> call will try to throw an exception. (On Unix systems, it
1829 B<will> throw an exception.)
1831 =head1 Deferred Writing
1833 (This is an advanced feature. Skip this section on first reading.)
1835 Normally, modifying a C<Tie::File> array writes to the underlying file
1836 immediately. Every assignment like C<$a[3] = ...> rewrites as much of
1837 the file as is necessary; typically, everything from line 3 through
1838 the end will need to be rewritten. This is the simplest and most
1839 transparent behavior. Performance even for large files is reasonably
1842 However, under some circumstances, this behavior may be excessively
1843 slow. For example, suppose you have a million-record file, and you
1850 The first time through the loop, you will rewrite the entire file,
1851 from line 0 through the end. The second time through the loop, you
1852 will rewrite the entire file from line 1 through the end. The third
1853 time through the loop, you will rewrite the entire file from line 2 to
1856 If the performance in such cases is unacceptable, you may defer the
1857 actual writing, and then have it done all at once. The following loop
1858 will perform much better for large files:
1866 If C<Tie::File>'s memory limit is large enough, all the writing will
1867 done in memory. Then, when you call C<-E<gt>flush>, the entire file
1868 will be rewritten in a single pass.
1870 (Actually, the preceding discussion is something of a fib. You don't
1871 need to enable deferred writing to get good performance for this
1872 common case, because C<Tie::File> will do it for you automatically
1873 unless you specifically tell it not to. See L<"autodeferring">,
1876 Calling C<-E<gt>flush> returns the array to immediate-write mode. If
1877 you wish to discard the deferred writes, you may call C<-E<gt>discard>
1878 instead of C<-E<gt>flush>. Note that in some cases, some of the data
1879 will have been written already, and it will be too late for
1880 C<-E<gt>discard> to discard all the changes. Support for
1881 C<-E<gt>discard> may be withdrawn in a future version of C<Tie::File>.
1883 Deferred writes are cached in memory up to the limit specified by the
1884 C<dw_size> option (see above). If the deferred-write buffer is full
1885 and you try to write still more deferred data, the buffer will be
1886 flushed. All buffered data will be written immediately, the buffer
1887 will be emptied, and the now-empty space will be used for future
1890 If the deferred-write buffer isn't yet full, but the total size of the
1891 buffer and the read cache would exceed the C<memory> limit, the oldest
1892 records will be flushed out of the read cache until total usage is
1895 C<push>, C<pop>, C<shift>, C<unshift>, and C<splice> cannot be
1896 deferred. When you perform one of these operations, any deferred data
1897 is written to the file and the operation is performed immediately.
1898 This may change in a future version.
1900 If you resize the array with deferred writing enabled, the file will
1901 be resized immediately, but deferred records will not be written.
1903 =head2 Autodeferring
1905 C<Tie::File> tries to guess when deferred writing might be helpful,
1906 and to turn it on and off automatically. In the example above, only
1907 the first two assignments will be done immediately; after this, all
1908 the changes to the file will be deferred up to the user-specified
1911 You should usually be able to ignore this and just use the module
1912 without thinking about deferring. However, special applications may
1913 require fine control over which writes are deferred, or may require
1914 that all writes be immediate. To disable the autodeferment feature,
1917 (tied @o)->autodefer(0);
1921 tie @array, 'Tie::File', $file, autodefer => 0;
1926 (That's Latin for 'warnings'.)
1932 This is BETA RELEASE SOFTWARE. It may have bugs. See the discussion
1933 below about the (lack of any) warranty.
1935 In particular, this means that the interface may change in
1936 incompatible ways from one version to the next, without warning. That
1937 has happened at least once already. The interface will freeze before
1938 Perl 5.8 is released, probably sometime in April 2002.
1942 Reasonable effort was made to make this module efficient. Nevertheless,
1943 changing the size of a record in the middle of a large file will
1944 always be fairly slow, because everything after the new record must be
1949 The behavior of tied arrays is not precisely the same as for regular
1950 arrays. For example:
1952 # This DOES print "How unusual!"
1953 undef $a[10]; print "How unusual!\n" if defined $a[10];
1955 C<undef>-ing a C<Tie::File> array element just blanks out the
1956 corresponding record in the file. When you read it back again, you'll
1957 get the empty string, so the supposedly-C<undef>'ed value will be
1958 defined. Similarly, if you have C<autochomp> disabled, then
1960 # This DOES print "How unusual!" if 'autochomp' is disabled
1962 print "How unusual!\n" if $a[10];
1964 Because when C<autochomp> is disabled, C<$a[10]> will read back as
1965 C<"\n"> (or whatever the record separator string is.)
1967 There are other minor differences, particularly regarding C<exists>
1968 and C<delete>, but in general, the correspondence is extremely close.
1972 Not quite every effort was made to make this module as efficient as
1973 possible. C<FETCHSIZE> should use binary search instead of linear
1974 search. The cache's LRU queue should be a heap instead of a list.
1976 The performance of the C<flush> method could be improved. At present,
1977 it still rewrites the tail of the file once for each block of
1978 contiguous lines to be changed. In the typical case, this will result
1979 in only one rewrite, but in peculiar cases it might be bad. It should
1980 be possible to perform I<all> deferred writing with a single rewrite.
1982 These defects are probably minor; in any event, they will be fixed in
1983 a future version of the module.
1987 The author has supposed that since this module is concerned with file
1988 I/O, almost all normal use of it will be heavily I/O bound, and that
1989 the time to maintain complicated data structures inside the module
1990 will be dominated by the time to actually perform the I/O. This
1991 suggests, for example, that an LRU read-cache is a good tradeoff, even
1992 if it requires substantial bookkeeping following a C<splice>
1997 You might be tempted to think that deferred writing is like
1998 transactions, with C<flush> as C<commit> and C<discard> as
1999 C<rollback>, but it isn't, so don't.
2005 This version promises absolutely nothing about the internals, which
2006 may change without notice. A future version of the module will have a
2007 well-defined and stable subclassing API.
2009 =head1 WHAT ABOUT C<DB_File>?
2011 C<DB_File>'s C<DB_RECNO> feature does something similar to
2012 C<Tie::File>, but there are a number of reasons that you might prefer
2013 C<Tie::File>. C<DB_File> is a great piece of software, but the
2014 C<DB_RECNO> part is less great than the rest of it.
2020 C<DB_File> reads your entire file into memory, modifies it in memory,
2021 and the writes out the entire file again when you untie the file.
2022 This is completely impractical for large files.
2024 C<Tie::File> does not do any of those things. It doesn't try to read
2025 the entire file into memory; instead it uses a lazy approach and
2026 caches recently-used records. The cache size is strictly bounded by
2027 the C<memory> option. DB_File's C<-E<gt>{cachesize}> doesn't prevent
2028 your process from blowing up when reading a big file.
2032 C<DB_File> has an extremely poor writing strategy. If you have a
2033 ten-megabyte file and tie it with C<DB_File>, and then use
2035 $a[0] =~ s/PERL/Perl/;
2037 C<DB_file> will then read the entire ten-megabyte file into memory, do
2038 the change, and write the entire file back to disk, reading ten
2039 megabytes and writing ten megabytes. C<Tie::File> will read and write
2040 only the first record.
2042 If you have a million-record file and tie it with C<DB_File>, and then
2045 $a[999998] =~ s/Larry/Larry Wall/;
2047 C<DB_File> will read the entire million-record file into memory, do
2048 the change, and write the entire file back to disk. C<Tie::File> will
2049 only rewrite records 999998 and 999999. During the writing process,
2050 it will never have more than a few kilobytes of data in memory at any
2051 time, even if the two records are very large.
2055 Since changes to C<DB_File> files only appear when you do C<untie>, it
2056 can be inconvenient to arrange for concurrent access to the same file
2057 by two or more processes. Each process needs to call C<$db-E<gt>sync>
2058 after every write. When you change a C<Tie::File> array, the changes
2059 are reflected in the file immediately; no explicit C<-E<gt>sync> call
2060 is required. (Or you can enable deferred writing mode to require that
2061 changes be explicitly sync'ed.)
2065 C<DB_File> is only installed by default if you already have the C<db>
2066 library on your system; C<Tie::File> is pure Perl and is installed by
2067 default no matter what. Starting with Perl 5.7.3 you can be
2068 absolutely sure it will be everywhere. You will never have that
2069 surety with C<DB_File>. If you don't have C<DB_File> yet, it requires
2070 a C compiler. You can install C<Tie::File> from CPAN in five minutes
2075 C<DB_File> is written in C, so if you aren't allowed to install
2076 modules on your system, it is useless. C<Tie::File> is written in Perl,
2077 so even if you aren't allowed to install modules, you can look into
2078 the source code, see how it works, and copy the subroutines or the
2079 ideas from the subroutines directly into your own Perl program.
2083 Except in very old, unsupported versions, C<DB_File>'s free license
2084 requires that you distribute the source code for your entire
2085 application. If you are not able to distribute the source code for
2086 your application, you must negotiate an alternative license from
2087 Sleepycat, possibly for a fee. Tie::File is under the Perl Artistic
2088 license and can be distributed free under the same terms as Perl
2097 To contact the author, send email to: C<mjd-perl-tiefile+@plover.com>
2099 To receive an announcement whenever a new version of this module is
2100 released, send a blank email message to
2101 C<mjd-perl-tiefile-subscribe@plover.com>.
2103 The most recent version of this module, including documentation and
2104 any news of importance, will be available at
2106 http://perl.plover.com/TieFile/
2111 C<Tie::File> version 0.90 is copyright (C) 2002 Mark Jason Dominus.
2113 This library is free software; you may redistribute it and/or modify
2114 it under the same terms as Perl itself.
2116 These terms are your choice of any of (1) the Perl Artistic Licence,
2117 or (2) version 2 of the GNU General Public License as published by the
2118 Free Software Foundation, or (3) any later version of the GNU General
2121 This library is distributed in the hope that it will be useful,
2122 but WITHOUT ANY WARRANTY; without even the implied warranty of
2123 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2124 GNU General Public License for more details.
2126 You should have received a copy of the GNU General Public License
2127 along with this library program; it should be in the file C<COPYING>.
2128 If not, write to the Free Software Foundation, Inc., 59 Temple Place,
2129 Suite 330, Boston, MA 02111 USA
2131 For licensing inquiries, contact the author at:
2135 Philadelphia, PA 19107
2139 C<Tie::File> version 0.90 comes with ABSOLUTELY NO WARRANTY.
2140 For details, see the license.
2144 Gigantic thanks to Jarkko Hietaniemi, for agreeing to put this in the
2145 core when I hadn't written it yet, and for generally being helpful,
2146 supportive, and competent. (Usually the rule is "choose any one.")
2147 Also big thanks to Abhijit Menon-Sen for all of the same things.
2149 Special thanks to Craig Berry and Peter Prymmer (for VMS portability
2150 help), Randy Kobes (for Win32 portability help), Clinton Pierce and
2151 Autrijus Tang (for heroic eleventh-hour Win32 testing above and beyond
2152 the call of duty), Michael G Schwern (for testing advice), and the
2153 rest of the CPAN testers (for testing generally).
2155 Additional thanks to:
2160 Tassilo von Parseval /
2165 Autrijus Tang (again) /
2170 More tests. (_twrite should be tested separately, because there are a
2171 lot of weird special cases lurking in there.)
2173 Improve SPLICE algorithm to use deferred writing machinery.
2175 More tests. (Stuff I didn't think of yet.)
2179 Fixed-length mode. Leave-blanks mode.
2181 Maybe an autolocking mode?
2183 Record locking with fcntl()? Then the module might support an undo
2184 log and get real transactions. What a tour de force that would be.
2186 Cleverer strategy for flushing deferred writes.