5 use Fcntl 'O_CREAT', 'O_RDWR';
10 # Idea: The object will always contain an array of byte offsets
11 # this will be filled in as is necessary and convenient.
12 # fetch will do seek-read.
13 # There will be a cache parameter that controls the amount of cached *data*
14 # Also an LRU queue of cached records
15 # store will read the relevant record into the cache
16 # If it's the same length as what is being written, it will overwrite it in
17 # place; if not, it will do a from-to copying write.
18 # The record separator string is also a parameter
20 # Record numbers start at ZERO.
22 my $DEFAULT_CACHE_SIZE = 1<<21; # 2 megabytes
26 croak "usage: tie \@array, $_[0], filename, [option => value]...";
28 my ($pack, $file, %opts) = @_;
30 # transform '-foo' keys into 'foo' keys
31 for my $key (keys %opts) {
33 if ($key =~ s/^-+//) {
34 $opts{$key} = delete $opts{$okey};
38 $opts{cachesize} ||= $DEFAULT_CACHE_SIZE;
40 # the cache is a hash instead of an array because it is likely to be
43 $opts{cached} = 0; # total size of cached data
44 $opts{lru} = []; # replace with heap in later version
47 $opts{filename} = $file;
48 $opts{recsep} = $/ unless defined $opts{recsep};
49 $opts{recseplen} = length($opts{recsep});
50 if ($opts{recseplen} == 0) {
51 croak "Empty record separator not supported by $pack";
54 my $mode = defined($opts{mode}) ? $opts{mode} : O_CREAT|O_RDWR;
56 my $fh = \do { local *FH }; # only works in 5.005 and later
57 sysopen $fh, $file, $mode, 0666 or return;
59 { my $ofh = select $fh; $| = 1; select $ofh } # autoflush on write
62 bless \%opts => $pack;
68 # check the record cache
69 { my $cached = $self->_check_cache($n);
70 return $cached if defined $cached;
73 unless ($#{$self->{offsets}} >= $n) {
74 my $o = $self->_fill_offsets_to($n);
75 # If it's still undefined, there is no such record, so return 'undef'
76 return unless defined $o;
80 $self->_seek($n); # we can do this now that offsets is populated
81 my $rec = $self->_read_record;
82 $self->_cache_insert($n, $rec) if defined $rec;
87 my ($self, $n, $rec) = @_;
89 $self->_fixrecs($rec);
91 # TODO: what should we do about the cache? Install the new record
92 # in the cache only if the old version of the same record was
95 # We need this to decide whether the new record will fit
96 # It incidentally populates the offsets table
97 # Note we have to do this before we alter the cache
98 my $oldrec = $self->FETCH($n);
100 # _check_cache promotes record $n to MRU. Is this correct behavior?
101 $self->{cache}{$n} = $rec if $self->_check_cache($n);
103 if (not defined $oldrec) {
104 # We're storing a record beyond the end of the file
105 $self->_extend_file_to($n);
106 $oldrec = $self->{recsep};
108 my $len_diff = length($rec) - length($oldrec);
110 $self->_twrite($rec, $self->{offsets}[$n], length($oldrec));
112 # now update the offsets
113 # array slice goes from element $n+1 (the first one to move)
115 for (@{$self->{offsets}}[$n+1 .. $#{$self->{offsets}}]) {
122 my $n = $#{$self->{offsets}};
123 while (defined ($self->_fill_offsets_to($n+1))) {
130 my ($self, $len) = @_;
131 my $olen = $self->FETCHSIZE;
132 return if $len == $olen; # Woo-hoo!
136 $self->_extend_file_to($len-1); # record numbers from 0 .. $len-1
143 $#{$self->{offsets}} = $len-1;
144 my @cached = grep $_ > $len, keys %{$self->{cache}};
145 delete @{$self->{cache}}{@cached} if @cached;
149 my ($self, $pos, $nrecs, @data) = @_;
152 $pos += $self->FETCHSIZE if $pos < 0;
154 $self->_fixrecs(@data);
155 my $data = join '', @data;
156 my $datalen = length $data;
159 # compute length of data being removed
160 for ($pos .. $pos+$nrecs-1) {
161 my $rec = $self->FETCH($_);
162 last unless defined $rec;
164 $oldlen += length($rec);
167 $self->_fill_offsets_to($pos);
168 $self->_twrite($data, $self->{offsets}[$pos], $oldlen);
170 # update the offsets table part 1
171 # compute the offsets of the new records:
174 push @new_offsets, $self->{offsets}[$pos];
175 for (0 .. $#data-1) {
176 push @new_offsets, $new_offsets[-1] + length($data[$_]);
179 splice(@{$self->{offsets}}, $pos, $nrecs, @new_offsets);
181 # update the offsets table part 2
182 # adjust the offsets of the following old records
183 for ($pos+@data .. $#{$self->{offsets}}) {
184 $self->{offsets}[$_] += $datalen - $oldlen;
186 # If we scrubbed out all known offsets, regenerate the trivial table
187 # that knows that the file does indeed start at 0.
188 $self->{offsets}[0] = 0 unless @{$self->{offsets}};
190 # update the read cache, part 1
192 # Consider this carefully for correctness
193 for ($pos .. $pos+$nrecs-1) {
194 my $cached = $self->{cache}{$_};
195 next unless defined $cached;
196 my $new = $data[$_-$pos];
198 $self->{cached} += length($new) - length($cached);
199 $self->{cache}{$_} = $new;
201 delete $self->{cache}{$_};
202 $self->{cached} -= length($cached);
205 # update the read cache, part 2
206 # moved records - records past the site of the change
207 # need to be renumbered
208 # Maybe merge this with the previous block?
209 for (keys %{$self->{cache}}) {
210 next unless $_ >= $pos + $nrecs;
211 $self->{cache}{$_-$nrecs+@data} = delete $self->{cache}{$_};
216 for (@{$self->{lru}}) {
217 if ($_ >= $pos + $nrecs) {
218 push @new, $_ + @data - $nrecs;
219 } elsif ($_ >= $pos) {
220 push @changed, $_ if $_ < $pos + @data;
225 @{$self->{lru}} = (@new, @changed);
230 # write data into the file
231 # $data is the data to be written.
232 # it should be written at position $pos, and should overwrite
233 # exactly $len of the following bytes.
234 # Note that if length($data) > $len, the subsequent bytes will have to
235 # be moved up, and if length($data) < $len, they will have to
238 my ($self, $data, $pos, $len) = @_;
240 unless (defined $pos) {
241 die "\$pos was undefined in _twrite";
244 my $len_diff = length($data) - $len;
246 if ($len_diff == 0) { # Woo-hoo!
247 my $fh = $self->{fh};
249 $self->_write_record($data);
250 return; # well, that was easy.
253 # the two records are of different lengths
254 # our strategy here: rewrite the tail of the file,
255 # reading ahead one buffer at a time
256 # $bufsize is required to be at least as large as the data we're overwriting
257 my $bufsize = _bufsize($len_diff);
258 my ($writepos, $readpos) = ($pos, $pos+$len);
260 # Seems like there ought to be a way to avoid the repeated code
261 # and the special case here. The read(1) is also a little weird.
264 $self->_seekb($readpos);
265 my $br = read $self->{fh}, my($next_block), $bufsize;
266 my $more_data = read $self->{fh}, my($dummy), 1;
267 $self->_seekb($writepos);
268 $self->_write_record($data);
270 $writepos += length $data;
272 unless ($more_data) {
273 $self->_seekb($writepos);
274 $self->_write_record($next_block);
278 # There might be leftover data at the end of the file
279 $self->_chop_file if $len_diff < 0;
282 # If a record does not already end with the appropriate terminator
283 # string, append one.
287 $_ .= $self->{recsep}
288 unless substr($_, - $self->{recseplen}) eq $self->{recsep};
292 # seek to the beginning of record #$n
293 # Assumes that the offsets table is already correctly populated
295 # Note that $n=-1 has a special meaning here: It means the start of
296 # the last known record; this may or may not be the very last record
297 # in the file, depending on whether the offsets table is fully populated.
301 my $o = $self->{offsets}[$n];
303 or confess("logic error: undefined offset for record $n");
304 seek $self->{fh}, $o, SEEK_SET
305 or die "Couldn't seek filehandle: $!"; # "Should never happen."
310 seek $self->{fh}, $b, SEEK_SET
311 or die "Couldn't seek filehandle: $!"; # "Should never happen."
314 # populate the offsets table up to the beginning of record $n
315 # return the offset of record $n
316 sub _fill_offsets_to {
318 my $fh = $self->{fh};
319 local *OFF = $self->{offsets};
322 until ($#OFF >= $n) {
324 $self->_seek(-1); # tricky -- see comment at _seek
325 $rec = $self->_read_record;
327 push @OFF, $o+length($rec);
329 return; # It turns out there is no such record
333 # we have now read all the records up to record n-1,
334 # so we can return the offset of record n
338 # assumes that $rec is already suitably terminated
340 my ($self, $rec) = @_;
341 my $fh = $self->{fh};
343 or die "Couldn't write record: $!"; # "Should never happen."
350 { local $/ = $self->{recsep};
351 my $fh = $self->{fh};
358 my ($self, $n, $rec) = @_;
360 # Do not cache records that are too big to fit in the cache.
361 return unless length $rec <= $self->{cachesize};
363 $self->{cache}{$n} = $rec;
364 $self->{cached} += length $rec;
365 push @{$self->{lru}}, $n; # most-recently-used is at the END
367 $self->_cache_flush if $self->{cached} > $self->{cachesize};
373 return unless defined($rec = $self->{cache}{$n});
375 # cache hit; update LRU queue and return $rec
376 # replace this with a heap in a later version
377 @{$self->{lru}} = ((grep $_ ne $n, @{$self->{lru}}), $n);
383 while ($self->{cached} > $self->{cachesize}) {
384 my $lru = shift @{$self->{lru}};
385 $self->{cached} -= length $lru;
386 delete $self->{cache}{$lru};
390 # We have read to the end of the file and have the offsets table
391 # entirely populated. Now we need to write a new record beyond
392 # the end of the file. We prepare for this by writing
393 # empty records into the file up to the position we want
394 # $n here is the record number of the last record we're going to write
395 sub _extend_file_to {
397 $self->_seek(-1); # position after the end of the last record
398 my $pos = $self->{offsets}[-1];
400 # the offsets table has one entry more than the total number of records
401 $extras = $n - ($#{$self->{offsets}} - 1);
403 # Todo : just use $self->{recsep} x $extras here?
404 while ($extras-- > 0) {
405 $self->_write_record($self->{recsep});
406 $pos += $self->{recseplen};
407 push @{$self->{offsets}}, $pos;
411 # Truncate the file at the current position
414 truncate $self->{fh}, tell($self->{fh});
417 # compute the size of a buffer suitable for moving
418 # all the data in a file forward $n bytes
419 # ($n may be negative)
420 # The result should be at least $n.
423 return 8192 if $n < 0;
425 $b += 8192 if $n & 8191;
430 # Given a file, make sure the cache is consistent with the
432 sub _check_integrity {
433 my ($self, $file, $warn) = @_;
436 open F, $file or die "Couldn't open file $file: $!";
438 local $/ = $self->{recsep};
439 unless ($self->{offsets}[0] == 0) {
440 $warn && print STDERR "# rec 0: offset <$self->{offsets}[0]> s/b 0!\n";
445 my $cached = $self->{cache}{$n};
446 my $offset = $self->{offsets}[$.];
448 if (defined $offset && $offset != $ao) {
449 $warn && print STDERR "# rec $n: offset <$offset> actual <$ao>\n";
451 if (defined $cached && $_ ne $cached) {
455 $warn && print STDERR "# rec $n: cached <$cached> actual <$_>\n";
460 while (my ($n, $r) = each %{$self->{cache}}) {
461 $cachesize += length($r);
462 next if $n+1 <= $.; # checked this already
463 $warn && print STDERR "# spurious caching of record $n\n";
466 if ($cachesize != $self->{cached}) {
467 $warn && print STDERR "# cache size is $self->{cached}, should be $cachesize\n";
471 my (%seen, @duplicate);
472 for (@{$self->{lru}}) {
474 if (not exists $self->{cache}{$_}) {
475 print "# $_ is mentioned in the LRU queue, but not in the cache\n";
479 @duplicate = grep $seen{$_}>1, keys %seen;
481 my $records = @duplicate == 1 ? 'Record' : 'Records';
482 my $appear = @duplicate == 1 ? 'appears' : 'appear';
483 print "# $records @duplicate $appear multiple times in LRU queue: @{$self->{lru}}\n";
486 for (keys %{$self->{cache}}) {
487 unless (exists $seen{$_}) {
488 print "# $record $_ is in the cache but not the LRU queue\n";
498 Tie::File - Access the lines of a disk file via a Perl array
502 # This file documents Tie::File version 0.13
504 tie @array, 'Tie::File', filename or die ...;
506 $array[13] = 'blah'; # line 13 of the file is now 'blah'
507 print $array[42]; # display line 42 of the file
509 $n_recs = @array; # how many records are in the file?
510 $#array = $n_recs - 2; # chop records off the end
512 # As you would expect
513 @old_recs = splice @array, 3, 7, new recs...;
515 untie @array; # all finished
519 C<Tie::File> represents a regular text file as a Perl array. Each
520 element in the array corresponds to a record in the file. The first
521 line of the file is element 0 of the array; the second line is element
524 The file is I<not> loaded into memory, so this will work even for
527 Changes to the array are reflected in the file immediately.
531 What is a 'record'? By default, the meaning is the same as for the
532 C<E<lt>...E<gt>> operator: It's a string terminated by C<$/>, which is
533 probably C<"\n"> or C<"\r\n">. You may change the definition of
534 "record" by supplying the C<recsep> option in the C<tie> call:
536 tie @array, 'Tie::File', $file, recsep => 'es';
538 This says that records are delimited by the string C<es>. If the file contained the following data:
540 Curse these pesky flies!\n
542 then the C<@array> would appear to have four elements:
549 An undefined value is not permitted as a record separator. Perl's
550 special "paragraph mode" semantics (E<agrave> la C<$/ = "">) are not
553 Records read from the tied array will have the record separator string
554 on the end, just as if they were read from the C<E<lt>...E<gt>>
555 operator. Records stored into the array will have the record
556 separator string appended before they are written to the file, if they
557 don't have one already. For example, if the record separator string
558 is C<"\n">, then the following two lines do exactly the same thing:
560 $array[17] = "Cherry pie";
561 $array[17] = "Cherry pie\n";
563 The result is that the contents of line 17 of the file will be
564 replaced with "Cherry pie"; a newline character will separate line 17
565 from line 18. This means that inparticular, this will do nothing:
569 Because the C<chomp>ed value will have the separator reattached when
570 it is written back to the file. There is no way to create a file
571 whose trailing record separator string is missing.
573 Inserting records that I<contain> the record separator string will
574 produce a reasonable result, but if you can't foresee what this result
575 will be, you'd better avoid doing this.
579 Normally, the specified file will be opened for read and write access,
580 and will be created if it does not exist. (That is, the flags
581 C<O_RDWR | O_CREAT> are supplied in the C<open> call.) If you want to
582 change this, you may supply alternative flags in the C<mode> option.
583 See L<Fcntl> for a listing of available flags.
586 # open the file if it exists, but fail if it does not exist
588 tie @array, 'Tie::File', $file, mode => O_RDWR;
590 # create the file if it does not exist
591 use Fcntl 'O_RDWR', 'O_CREAT';
592 tie @array, 'Tie::File', $file, mode => O_RDWR | O_CREAT;
594 # open an existing file in read-only mode
595 use Fcntl 'O_RDONLY';
596 tie @array, 'Tie::File', $file, mode => O_RDONLY;
598 Opening the data file in write-only or append mode is not supported.
602 Records read in from the file are cached, to avoid having to re-read
603 them repeatedly. If you read the same record twice, the first time it
604 will be stored in memory, and the second time it will be fetched from
607 The cache has a bounded size; when it exceeds this size, the
608 least-recently visited records will be purged from the cache. The
609 default size is 2Mib. You can adjust the amount of space used for the
610 cache by supplying the C<cachesize> option. The argument is the desired cache size, in bytes.
612 # I have a lot of memory, so use a large cache to speed up access
613 tie @array, 'Tie::File', $file, cachesize => 20_000_000;
615 Setting the cache size to 0 will inhibit caching; records will be
616 fetched from disk every time you examine them.
620 C<-mode> is a synonym for C<mode>. C<-recsep> is a synonym for
621 C<recsep>. C<-cachesize> is a synonym for C<cachesize>. You get the
624 =head1 Public Methods
626 The C<tie> call returns an object, say C<$o>. You may call
628 $rec = $o->FETCH($n);
631 to fetch or store the record at line C<$n>, respectively. There are
632 no other public methods in this package.
636 (That's Latin for 'warnings'.)
638 =head2 Efficiency Note
640 Every effort was made to make this module efficient. Nevertheless,
641 changing the size of a record in the middle of a large file will
642 always be slow, because everything after the new record must be move.
644 In particular, note that:
651 is likely to be very slow, because the first iteration must relocate
652 lines 1 through 999,999; the second iteration must relocate lines 2
653 through 999,999, and so on. The relocation is done using block
654 writes, however, so it's not as slow as it might be.
656 A future version of this module will provide some mechanism for
657 getting better performance in such cases, by deferring the writing
658 until it can be done all at once.
660 =head2 Efficiency Note 2
662 Not every effort was made to make this module as efficient as
663 possible. C<FETCHSIZE> should use binary search instead of linear
664 search. The cache's LRU queue should be a heap instead of a list.
665 These defects are probably minor; in any event, they will be fixed in
666 a later version of the module.
668 =head2 Efficiency Note 3
670 The author has supposed that since this module is concerned with file
671 I/O, almost all normal use of it will be heavily I/O bound, and that
672 the time to maintain complicated data structures inside the module
673 will be dominated by the time to actually perform the I/O. This
674 suggests, for example, that and LRU read-cache is a good tradeoff,
675 even if it requires substantial adjustment following a C<splice>
678 =head2 Missing Methods
680 The tied array does not yet support C<push>, C<pop>, C<shift>,
681 C<unshift>, C<splice>, or size-setting via C<$#array = $n>. I will
688 To contact the author, send email to: C<mjd-perl-tiefile+@plover.com>
690 To receive an announcement whenever a new version of this module is
691 released, send a blank email message to
692 C<mjd-perl-tiefile-subscribe@plover.com>.
696 C<Tie::File> version 0.13 is copyright (C) 2002 Mark Jason Dominus.
698 This program is free software; you can redistribute it and/or modify
699 it under the terms of the GNU General Public License as published by
700 the Free Software Foundation; either version 2 of the License, or (at
701 your option) any later version.
703 This program is distributed in the hope that it will be useful,
704 but WITHOUT ANY WARRANTY; without even the implied warranty of
705 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
706 GNU General Public License for more details.
708 You should have received a copy of the GNU General Public License
709 along with this program; it should be in the file C<COPYING>. If not,
710 write to the Free Software Foundation, Inc., 59 Temple Place, Suite
711 330, Boston, MA 02111 USA
713 For licensing inquiries, contact the author at:
717 Philadelphia, PA 19107
721 C<Tie::File> version 0.13 comes with ABSOLUTELY NO WARRANTY.
722 For details, see the license.
726 C<push>, C<pop>, C<shift>, C<unshift>.
728 More tests. (Configuration options, cache flushery. _twrite shoule
729 be tested separately, because there are a lot of weird special cases
732 More tests. (Stuff I didn't think of yet.)
736 Deferred writing. (!!!)