lib/Tie/File.pm

   1
   2 package Tie::File;
   3 use Carp;
   4 use POSIX 'SEEK_SET';
   5 use Fcntl 'O_CREAT', 'O_RDWR', 'LOCK_EX';
   6 require 5.005;
   7
   8 $VERSION = "0.19";
   9
  10 # Idea: The object will always contain an array of byte offsets
  11 # this will be filled in as is necessary and convenient.
  12 # fetch will do seek-read.
  13 # There will be a cache parameter that controls the amount of cached *data*
  14 # Also an LRU queue of cached records
  15 # store will read the relevant record into the cache
  16 # If it's the same length as what is being written, it will overwrite it in
  17 #   place; if not, it will do a from-to copying write.
  18 # The record separator string is also a parameter
  19
  20 # Record numbers start at ZERO.
  21
  22 my $DEFAULT_MEMORY_SIZE = 1<<21;    # 2 megabytes
  23
  24 my %good_opt = map {$_ => 1, "-$_" => 1}
  25                qw(memory dw_size mode recsep discipline);
  26
  27 sub TIEARRAY {
  28   if (@_ % 2 != 0) {
  29     croak "usage: tie \@array, $_[0], filename, [option => value]...";
  30   }
  31   my ($pack, $file, %opts) = @_;
  32
  33   # transform '-foo' keys into 'foo' keys
  34   for my $key (keys %opts) {
  35     unless ($good_opt{$key}) {
  36       croak("$pack: Unrecognized option '$key'\n");
  37     }
  38     my $okey = $key;
  39     if ($key =~ s/^-+//) {
  40       $opts{$key} = delete $opts{$okey};
  41     }
  42   }
  43
  44   unless (defined $opts{memory}) {
  45     # default is the larger of the default cache size and the
  46     # deferred-write buffer size (if specified)
  47     $opts{memory} = $DEFAULT_MEMORY_SIZE;
  48     $opts{memory} = $opts{dw_size}
  49       if defined $opts{dw_size} && $opts{dw_size} > $DEFAULT_MEMORY_SIZE;
  50   }
  51   $opts{dw_size} = $opts{memory} unless defined $opts{dw_size};
  52   if ($opts{dw_size} > $opts{memory}) {
  53       croak("$pack: dw_size may not be larger than total memory allocation\n");
  54   }
  55   $opts{deferred} = {};         # no records presently deferred
  56   $opts{deferred_s} = 0;        # count of total bytes in ->{deferred}
  57
  58   # the cache is a hash instead of an array because it is likely to be
  59   # sparsely populated
  60   $opts{cache} = {};
  61   $opts{cached} = 0;   # total size of cached data
  62   $opts{lru} = [];     # replace with heap in later version
  63
  64   $opts{offsets} = [0];
  65   $opts{filename} = $file;
  66   unless (defined $opts{recsep}) {
  67     $opts{recsep} = _default_recsep();
  68   }
  69   $opts{recseplen} = length($opts{recsep});
  70   if ($opts{recseplen} == 0) {
  71     croak "Empty record separator not supported by $pack";
  72   }
  73
  74   my $mode = defined($opts{mode}) ? $opts{mode} : O_CREAT|O_RDWR;
  75   my $fh;
  76
  77   if (UNIVERSAL::isa($file, 'GLOB')) {
  78     unless (seek $file, 0, SEEK_SET) {
  79       croak "$pack: your filehandle does not appear to be seekable";
  80     }
  81     $fh = $file;
  82   } elsif (ref $file) {
  83     croak "usage: tie \@array, $pack, filename, [option => value]...";
  84   } else {
  85     $fh = \do { local *FH };   # only works in 5.005 and later
  86     sysopen $fh, $file, $mode, 0666 or return;
  87     binmode $fh;
  88   }
  89   { my $ofh = select $fh; $| = 1; select $ofh } # autoflush on write
  90   if (defined $opts{discipline} && $] >= 5.006) {
  91     # This avoids a compile-time warning under 5.005
  92     eval 'binmode($fh, $opts{discipline})';
  93     croak $@ if $@ =~ /unknown discipline/i;
  94     die if $@;
  95   }
  96   $opts{fh} = $fh;
  97
  98   bless \%opts => $pack;
  99 }
 100
 101 sub FETCH {
 102   my ($self, $n) = @_;
 103
 104   # check the record cache
 105   { my $cached = $self->_check_cache($n);
 106     return $cached if defined $cached;
 107   }
 108
 109   unless ($#{$self->{offsets}} >= $n) {
 110     my $o = $self->_fill_offsets_to($n);
 111     # If it's still undefined, there is no such record, so return 'undef'
 112     return unless defined $o;
 113   }
 114
 115   my $fh = $self->{FH};
 116   $self->_seek($n);             # we can do this now that offsets is populated
 117   my $rec = $self->_read_record;
 118
 119 # If we happen to have just read the first record, check to see if
 120 # the length of the record matches what 'tell' says.  If not, Tie::File
 121 # won't work, and should drop dead.
 122 #
 123 #  if ($n == 0 && defined($rec) && tell($self->{fh}) != length($rec)) {
 124 #    if (defined $self->{discipline}) {
 125 #      croak "I/O discipline $self->{discipline} not supported";
 126 #    } else {
 127 #      croak "File encoding not supported";
 128 #    }
 129 #  }
 130
 131   $self->_cache_insert($n, $rec) if defined $rec;
 132   $rec;
 133 }
 134
 135 sub STORE {
 136   my ($self, $n, $rec) = @_;
 137
 138   $self->_fixrecs($rec);
 139
 140   return $self->_store_deferred($n, $rec) if $self->{defer};
 141
 142   # We need this to decide whether the new record will fit
 143   # It incidentally populates the offsets table
 144   # Note we have to do this before we alter the cache
 145   my $oldrec = $self->FETCH($n);
 146
 147   # _check_cache promotes record $n to MRU.  Is this correct behavior?
 148   if (my $cached = $self->_check_cache($n)) {
 149     my $len_diff = length($rec) - length($cached);
 150     $self->{cache}{$n} = $rec;
 151     $self->{cached} += $len_diff;
 152     $self->_cache_flush
 153       if $len_diff > 0
 154         && $self->{deferred_s} + $self->{cached} > $self->{memory};
 155   }
 156
 157   if (not defined $oldrec) {
 158     # We're storing a record beyond the end of the file
 159     $self->_extend_file_to($n+1);
 160     $oldrec = $self->{recsep};
 161   }
 162   my $len_diff = length($rec) - length($oldrec);
 163
 164   # length($oldrec) here is not consistent with text mode  TODO XXX BUG
 165   $self->_twrite($rec, $self->{offsets}[$n], length($oldrec));
 166
 167   # now update the offsets
 168   # array slice goes from element $n+1 (the first one to move)
 169   # to the end
 170   for (@{$self->{offsets}}[$n+1 .. $#{$self->{offsets}}]) {
 171     $_ += $len_diff;
 172   }
 173 }
 174
 175 sub _store_deferred {
 176   my ($self, $n, $rec) = @_;
 177   $self->_uncache($n);
 178   my $old_deferred = $self->{deferred}{$n};
 179   $self->{deferred}{$n} = $rec;
 180   $self->{deferred_s} += length($rec);
 181   $self->{deferred_s} -= length($old_deferred) if defined $old_deferred;
 182   if ($self->{deferred_s} > $self->{dw_size}) {
 183     $self->flush;
 184     $self->defer;               # flush clears the 'defer' flag
 185   } elsif ($self->{deferred_s} + $self->{cached} > $self->{memory}) {
 186     $self->_cache_flush;
 187   }
 188 }
 189
 190 sub FETCHSIZE {
 191   my $self = shift;
 192   my $n = $#{$self->{offsets}};
 193   while (defined ($self->_fill_offsets_to($n+1))) {
 194     ++$n;
 195   }
 196   $n;
 197 }
 198
 199 sub STORESIZE {
 200   my ($self, $len) = @_;
 201   my $olen = $self->FETCHSIZE;
 202   return if $len == $olen;      # Woo-hoo!
 203
 204   # file gets longer
 205   if ($len > $olen) {
 206     $self->_extend_file_to($len);
 207     return;
 208   }
 209
 210   # file gets shorter
 211   $self->_seek($len);
 212   $self->_chop_file;
 213   $#{$self->{offsets}} = $len;
 214 #  $self->{offsets}[0] = 0;      # in case we just chopped this
 215   my @cached = grep $_ >= $len, keys %{$self->{cache}};
 216   $self->_uncache(@cached);
 217 }
 218
 219 sub PUSH {
 220   my $self = shift;
 221   $self->SPLICE($self->FETCHSIZE, scalar(@_), @_);
 222   $self->FETCHSIZE;
 223 }
 224
 225 sub POP {
 226   my $self = shift;
 227   my $size = $self->FETCHSIZE;
 228   return if $size == 0;
 229 #  print STDERR "# POPPITY POP POP POP\n";
 230   scalar $self->SPLICE($size-1, 1);
 231 }
 232
 233 sub SHIFT {
 234   my $self = shift;
 235   scalar $self->SPLICE(0, 1);
 236 }
 237
 238 sub UNSHIFT {
 239   my $self = shift;
 240   $self->SPLICE(0, 0, @_);
 241   $self->FETCHSIZE;
 242 }
 243
 244 sub CLEAR {
 245   # And enable auto-defer mode, since it's likely that they just
 246   # did @a = (...);
 247   my $self = shift;
 248   $self->_seekb(0);
 249   $self->_chop_file;
 250   %{$self->{cache}}   = ();
 251     $self->{cached}   = 0;
 252   @{$self->{lru}}     = ();
 253   @{$self->{offsets}} = (0);
 254 }
 255
 256 sub EXTEND {
 257   my ($self, $n) = @_;
 258   $self->_fill_offsets_to($n);
 259   $self->_extend_file_to($n);
 260 }
 261
 262 sub DELETE {
 263   my ($self, $n) = @_;
 264   my $lastrec = $self->FETCHSIZE-1;
 265   if ($n == $lastrec) {
 266     $self->_seek($n);
 267     $self->_chop_file;
 268     $#{$self->{offsets}}--;
 269     $self->_uncache($n);
 270     # perhaps in this case I should also remove trailing null records?
 271   } else {
 272     $self->STORE($n, "");
 273   }
 274 }
 275
 276 sub EXISTS {
 277   my ($self, $n) = @_;
 278   $self->_fill_offsets_to($n);
 279   0 <= $n && $n < $self->FETCHSIZE;
 280 }
 281
 282 sub SPLICE {
 283   my $self = shift;
 284   $self->_flush if $self->{defer};
 285   $self->_splice(@_);
 286 }
 287
 288 sub DESTROY {
 289   $self->flush if $self->{defer};
 290 }
 291
 292 sub _splice {
 293   my ($self, $pos, $nrecs, @data) = @_;
 294   my @result;
 295
 296   $pos = 0 unless defined $pos;
 297
 298   # Deal with negative and other out-of-range positions
 299   # Also set default for $nrecs
 300   {
 301     my $oldsize = $self->FETCHSIZE;
 302     $nrecs = $oldsize unless defined $nrecs;
 303     my $oldpos = $pos;
 304
 305     if ($pos < 0) {
 306       $pos += $oldsize;
 307       if ($pos < 0) {
 308         croak "Modification of non-creatable array value attempted, subscript $oldpos";
 309       }
 310     }
 311
 312     if ($pos > $oldsize) {
 313       return unless @data;
 314       $pos = $oldsize;          # This is what perl does for normal arrays
 315     }
 316   }
 317
 318   $self->_fixrecs(@data);
 319   my $data = join '', @data;
 320   my $datalen = length $data;
 321   my $oldlen = 0;
 322
 323   # compute length of data being removed
 324   # Incidentally fills offsets table
 325   for ($pos .. $pos+$nrecs-1) {
 326     my $rec = $self->FETCH($_);
 327     last unless defined $rec;
 328     push @result, $rec;
 329     $oldlen += length($rec);
 330   }
 331
 332   # Modify the file
 333   $self->_twrite($data, $self->{offsets}[$pos], $oldlen);
 334
 335   # update the offsets table part 1
 336   # compute the offsets of the new records:
 337   my @new_offsets;
 338   if (@data) {
 339     push @new_offsets, $self->{offsets}[$pos];
 340     for (0 .. $#data-1) {
 341       push @new_offsets, $new_offsets[-1] + length($data[$_]);
 342     }
 343   }
 344   splice(@{$self->{offsets}}, $pos, $nrecs, @new_offsets);
 345
 346   # update the offsets table part 2
 347   # adjust the offsets of the following old records
 348   for ($pos+@data .. $#{$self->{offsets}}) {
 349     $self->{offsets}[$_] += $datalen - $oldlen;
 350   }
 351   # If we scrubbed out all known offsets, regenerate the trivial table
 352   # that knows that the file does indeed start at 0.
 353   $self->{offsets}[0] = 0 unless @{$self->{offsets}};
 354
 355   # Perhaps the following cache foolery could be factored out
 356   # into a bunch of mor opaque cache functions.  For example,
 357   # it's odd to delete a record from the cache and then remove
 358   # it from the LRU queue later on; there should be a function to
 359   # do both at once.
 360
 361   # update the read cache, part 1
 362   # modified records
 363   # Consider this carefully for correctness
 364   for ($pos .. $pos+$nrecs-1) {
 365     my $cached = $self->{cache}{$_};
 366     next unless defined $cached;
 367     my $new = $data[$_-$pos];
 368     if (defined $new) {
 369       $self->{cached} += length($new) - length($cached);
 370       $self->{cache}{$_} = $new;
 371     } else {
 372       $self->_uncache($_);
 373     }
 374   }
 375   # update the read cache, part 2
 376   # moved records - records past the site of the change
 377   # need to be renumbered
 378   # Maybe merge this with the previous block?
 379   {
 380     my %adjusted;
 381     for (keys %{$self->{cache}}) {
 382       next unless $_ >= $pos + $nrecs;
 383       $adjusted{$_-$nrecs+@data} = delete $self->{cache}{$_};
 384     }
 385     @{$self->{cache}}{keys %adjusted} = values %adjusted;
 386 #    for (keys %{$self->{cache}}) {
 387 #      next unless $_ >= $pos + $nrecs;
 388 #      $self->{cache}{$_-$nrecs+@data} = delete $self->{cache}{$_};
 389 #    }
 390   }
 391
 392   # fix the LRU queue
 393   my(@new, @changed);
 394   for (@{$self->{lru}}) {
 395     if ($_ >= $pos + $nrecs) {
 396       push @new, $_ + @data - $nrecs;
 397     } elsif ($_ >= $pos) {
 398       push @changed, $_ if $_ < $pos + @data;
 399     } else {
 400       push @new, $_;
 401     }
 402   }
 403   @{$self->{lru}} = (@new, @changed);
 404
 405   # Now there might be too much data in the cache, if we spliced out
 406   # some short records and spliced in some long ones.  If so, flush
 407   # the cache.
 408   $self->_cache_flush;
 409
 410   # Yes, the return value of 'splice' *is* actually this complicated
 411   wantarray ? @result : @result ? $result[-1] : undef;
 412 }
 413
 414 # write data into the file
 415 # $data is the data to be written.
 416 # it should be written at position $pos, and should overwrite
 417 # exactly $len of the following bytes.
 418 # Note that if length($data) > $len, the subsequent bytes will have to
 419 # be moved up, and if length($data) < $len, they will have to
 420 # be moved down
 421 sub _twrite {
 422   my ($self, $data, $pos, $len) = @_;
 423
 424   unless (defined $pos) {
 425     die "\$pos was undefined in _twrite";
 426   }
 427
 428   my $len_diff = length($data) - $len;
 429
 430   if ($len_diff == 0) {          # Woo-hoo!
 431     my $fh = $self->{fh};
 432     $self->_seekb($pos);
 433     $self->_write_record($data);
 434     return;                     # well, that was easy.
 435   }
 436
 437   # the two records are of different lengths
 438   # our strategy here: rewrite the tail of the file,
 439   # reading ahead one buffer at a time
 440   # $bufsize is required to be at least as large as the data we're overwriting
 441   my $bufsize = _bufsize($len_diff);
 442   my ($writepos, $readpos) = ($pos, $pos+$len);
 443   my $next_block;
 444
 445   # Seems like there ought to be a way to avoid the repeated code
 446   # and the special case here.  The read(1) is also a little weird.
 447   # Think about this.
 448   do {
 449     $self->_seekb($readpos);
 450     my $br = read $self->{fh}, $next_block, $bufsize;
 451     my $more_data = read $self->{fh}, my($dummy), 1;
 452     $self->_seekb($writepos);
 453     $self->_write_record($data);
 454     $readpos += $br;
 455     $writepos += length $data;
 456     $data = $next_block;
 457   } while $more_data;
 458   $self->_seekb($writepos);
 459   $self->_write_record($next_block);
 460
 461   # There might be leftover data at the end of the file
 462   $self->_chop_file if $len_diff < 0;
 463 }
 464
 465 # If a record does not already end with the appropriate terminator
 466 # string, append one.
 467 sub _fixrecs {
 468   my $self = shift;
 469   for (@_) {
 470     $_ .= $self->{recsep}
 471       unless substr($_, - $self->{recseplen}) eq $self->{recsep};
 472   }
 473 }
 474
 475 # seek to the beginning of record #$n
 476 # Assumes that the offsets table is already correctly populated
 477 #
 478 # Note that $n=-1 has a special meaning here: It means the start of
 479 # the last known record; this may or may not be the very last record
 480 # in the file, depending on whether the offsets table is fully populated.
 481 #
 482 sub _seek {
 483   my ($self, $n) = @_;
 484   my $o = $self->{offsets}[$n];
 485   defined($o)
 486     or confess("logic error: undefined offset for record $n");
 487   seek $self->{fh}, $o, SEEK_SET
 488     or die "Couldn't seek filehandle: $!";  # "Should never happen."
 489 }
 490
 491 sub _seekb {
 492   my ($self, $b) = @_;
 493   seek $self->{fh}, $b, SEEK_SET
 494     or die "Couldn't seek filehandle: $!";  # "Should never happen."
 495 }
 496
 497 # populate the offsets table up to the beginning of record $n
 498 # return the offset of record $n
 499 sub _fill_offsets_to {
 500   my ($self, $n) = @_;
 501   my $fh = $self->{fh};
 502   local *OFF = $self->{offsets};
 503   my $rec;
 504
 505   until ($#OFF >= $n) {
 506     my $o = $OFF[-1];
 507     $self->_seek(-1);           # tricky -- see comment at _seek
 508     $rec = $self->_read_record;
 509     if (defined $rec) {
 510       push @OFF, tell $fh;
 511     } else {
 512       return;                   # It turns out there is no such record
 513     }
 514   }
 515
 516   # we have now read all the records up to record n-1,
 517   # so we can return the offset of record n
 518   return $OFF[$n];
 519 }
 520
 521 # assumes that $rec is already suitably terminated
 522 sub _write_record {
 523   my ($self, $rec) = @_;
 524   my $fh = $self->{fh};
 525   print $fh $rec
 526     or die "Couldn't write record: $!";  # "Should never happen."
 527
 528 }
 529
 530 sub _read_record {
 531   my $self = shift;
 532   my $rec;
 533   { local $/ = $self->{recsep};
 534     my $fh = $self->{fh};
 535     $rec = <$fh>;
 536   }
 537   $rec;
 538 }
 539
 540 sub _cache_insert {
 541   my ($self, $n, $rec) = @_;
 542
 543   # Do not cache records that are too big to fit in the cache.
 544   return unless length $rec <= $self->{memory};
 545
 546   $self->{cache}{$n} = $rec;
 547   $self->{cached} += length $rec;
 548   push @{$self->{lru}}, $n;     # most-recently-used is at the END
 549
 550   $self->_cache_flush if $self->{cached} > $self->{memory};
 551 }
 552
 553 sub _uncache {
 554   my $self = shift;
 555   for my $n (@_) {
 556     my $cached = delete $self->{cache}{$n};
 557     next unless defined $cached;
 558     @{$self->{lru}} = grep $_ != $n, @{$self->{lru}};
 559     $self->{cached} -= length($cached);
 560   }
 561 }
 562
 563 sub _check_cache {
 564   my ($self, $n) = @_;
 565   my $rec;
 566   return unless defined($rec = $self->{cache}{$n});
 567
 568   # cache hit; update LRU queue and return $rec
 569   # replace this with a heap in a later version
 570   @{$self->{lru}} = ((grep $_ ne $n, @{$self->{lru}}), $n);
 571   $rec;
 572 }
 573
 574 sub _cache_flush {
 575   my ($self) = @_;
 576   while ($self->{cached} + $self->{deferred_s} > $self->{memory}) {
 577     my $lru = shift @{$self->{lru}};
 578     my $rec = delete $self->{cache}{$lru};
 579     $self->{cached} -= length $rec;
 580   }
 581 }
 582
 583 # We have read to the end of the file and have the offsets table
 584 # entirely populated.  Now we need to write a new record beyond
 585 # the end of the file.  We prepare for this by writing
 586 # empty records into the file up to the position we want
 587 #
 588 # assumes that the offsets table already contains the offset of record $n,
 589 # if it exists, and extends to the end of the file if not.
 590 sub _extend_file_to {
 591   my ($self, $n) = @_;
 592   $self->_seek(-1);             # position after the end of the last record
 593   my $pos = $self->{offsets}[-1];
 594
 595   # the offsets table has one entry more than the total number of records
 596   $extras = $n - $#{$self->{offsets}};
 597
 598   # Todo : just use $self->{recsep} x $extras here?
 599   while ($extras-- > 0) {
 600     $self->_write_record($self->{recsep});
 601     push @{$self->{offsets}}, tell $self->{fh};
 602   }
 603 }
 604
 605 # Truncate the file at the current position
 606 sub _chop_file {
 607   my $self = shift;
 608   truncate $self->{fh}, tell($self->{fh});
 609 }
 610
 611 # compute the size of a buffer suitable for moving
 612 # all the data in a file forward $n bytes
 613 # ($n may be negative)
 614 # The result should be at least $n.
 615 sub _bufsize {
 616   my $n = shift;
 617   return 8192 if $n < 0;
 618   my $b = $n & ~8191;
 619   $b += 8192 if $n & 8191;
 620   $b;
 621 }
 622
 623 # Lock the file
 624 sub flock {
 625   my ($self, $op) = @_;
 626   unless (@_ <= 3) {
 627     my $pack = ref $self;
 628     croak "Usage: $pack\->flock([OPERATION])";
 629   }
 630   my $fh = $self->{fh};
 631   $op = LOCK_EX unless defined $op;
 632   flock $fh, $op;
 633 }
 634
 635 # Defer writes
 636 sub defer {
 637   my $self = shift;
 638   $self->{defer} = 1;
 639 }
 640
 641 # Flush deferred writes
 642 #
 643 # This could be better optimized to write the file in one pass, instead
 644 # of one pass per block of records.  But that will require modifications
 645 # to _twrite, so I should have a good _twite test suite first.
 646 sub flush {
 647   my $self = shift;
 648
 649   $self->_flush;
 650   $self->{defer} = 0;
 651 }
 652
 653 sub _flush {
 654   my $self = shift;
 655   my @writable = sort {$a<=>$b} (keys %{$self->{deferred}});
 656
 657   while (@writable) {
 658     # gather all consecutive records from the front of @writable
 659     my $first_rec = shift @writable;
 660     my $last_rec = $first_rec+1;
 661     ++$last_rec, shift @writable while @writable && $last_rec == $writable[0];
 662     --$last_rec;
 663     $self->_fill_offsets_to($last_rec);
 664     $self->_extend_file_to($last_rec);
 665     $self->_splice($first_rec, $last_rec-$first_rec+1,
 666                    @{$self->{deferred}}{$first_rec .. $last_rec});
 667   }
 668
 669   $self->discard;               # clear out defered-write-cache
 670 }
 671
 672 # Discard deferred writes
 673 sub discard {
 674   my $self = shift;
 675   undef $self->{deferred};
 676   $self->{deferred_s} = 0;
 677   $self->{defer} = 0;
 678 }
 679
 680 # Not yet implemented
 681 sub autodefer { }
 682
 683 sub _default_recsep {
 684   my $recsep = $/;
 685   if ($^O eq 'MSWin32') {
 686     # Windows users expect files to be terminated with \r\n
 687     # But $/ is set to \n instead
 688     # Note that this also transforms \n\n into \r\n\r\n.
 689     # That is a feature.
 690     $recsep =~ s/\n/\r\n/g;
 691   }
 692   $recsep;
 693 }
 694
 695 # Given a file, make sure the cache is consistent with the
 696 # file contents
 697 sub _check_integrity {
 698   my ($self, $file, $warn) = @_;
 699   my $good = 1;
 700
 701   if (not defined $self->{offsets}[0]) {
 702     $warn && print STDERR "# offset 0 is missing!\n";
 703     $good = 0;
 704   } elsif ($self->{offsets}[0] != 0) {
 705     $warn && print STDERR "# rec 0: offset <$self->{offsets}[0]> s/b 0!\n";
 706     $good = 0;
 707   }
 708
 709   local *F = $self->{fh};
 710   seek F, 0, SEEK_SET;
 711   local $/ = $self->{recsep};
 712   $. = 0;
 713
 714   while (<F>) {
 715     my $n = $. - 1;
 716     my $cached = $self->{cache}{$n};
 717     my $offset = $self->{offsets}[$.];
 718     my $ao = tell F;
 719     if (defined $offset && $offset != $ao) {
 720       $warn && print STDERR "# rec $n: offset <$offset> actual <$ao>\n";
 721       $good = 0;
 722     }
 723     if (defined $cached && $_ ne $cached) {
 724       $good = 0;
 725       chomp $cached;
 726       chomp;
 727       $warn && print STDERR "# rec $n: cached <$cached> actual <$_>\n";
 728     }
 729   }
 730
 731   my $memory = 0;
 732   while (my ($n, $r) = each %{$self->{cache}}) {
 733     $memory += length($r);
 734     next if $n+1 <= $.;         # checked this already
 735     $warn && print STDERR "# spurious caching of record $n\n";
 736     $good = 0;
 737   }
 738   if ($memory != $self->{cached}) {
 739     $warn && print STDERR "# cache size is $self->{cached}, should be $memory\n";
 740     $good = 0;
 741   }
 742
 743   my (%seen, @duplicate);
 744   for (@{$self->{lru}}) {
 745     $seen{$_}++;
 746     if (not exists $self->{cache}{$_}) {
 747       $warn && print "# $_ is mentioned in the LRU queue, but not in the cache\n";
 748       $good = 0;
 749     }
 750   }
 751   @duplicate = grep $seen{$_}>1, keys %seen;
 752   if (@duplicate) {
 753     my $records = @duplicate == 1 ? 'Record' : 'Records';
 754     my $appear  = @duplicate == 1 ? 'appears' : 'appear';
 755     $warn && print "# $records @duplicate $appear multiple times in LRU queue: @{$self->{lru}}\n";
 756     $good = 0;
 757   }
 758   for (keys %{$self->{cache}}) {
 759     unless (exists $seen{$_}) {
 760       print "# record $_ is in the cache but not the LRU queue\n";
 761       $good = 0;
 762     }
 763   }
 764
 765   $good;
 766 }
 767
 768 "Cogito, ergo sum.";  # don't forget to return a true value from the file
 769
 770 =head1 NAME
 771
 772 Tie::File - Access the lines of a disk file via a Perl array
 773
 774 =head1 SYNOPSIS
 775
 776         # This file documents Tie::File version 0.19
 777
 778         tie @array, 'Tie::File', filename or die ...;
 779
 780         $array[13] = 'blah';     # line 13 of the file is now 'blah'
 781         print $array[42];        # display line 42 of the file
 782
 783         $n_recs = @array;        # how many records are in the file?
 784         $#array = $n_recs - 2;   # chop records off the end
 785
 786         # As you would expect:
 787
 788         push @array, new recs...;
 789         my $r1 = pop @array;
 790         unshift @array, new recs...;
 791         my $r1 = shift @array;
 792         @old_recs = splice @array, 3, 7, new recs...;
 793
 794         untie @array;            # all finished
 795
 796 =head1 DESCRIPTION
 797
 798 C<Tie::File> represents a regular text file as a Perl array.  Each
 799 element in the array corresponds to a record in the file.  The first
 800 line of the file is element 0 of the array; the second line is element
 801 1, and so on.
 802
 803 The file is I<not> loaded into memory, so this will work even for
 804 gigantic files.
 805
 806 Changes to the array are reflected in the file immediately.
 807
 808 Lazy people may now stop reading the manual.
 809
 810 =head2 C<recsep>
 811
 812 What is a 'record'?  By default, the meaning is the same as for the
 813 C<E<lt>...E<gt>> operator: It's a string terminated by C<$/>, which is
 814 probably C<"\n">.  (Minor exception: on dos and Win32 systems, a
 815 'record' is a string terminated by C<"\r\n">.)  You may change the
 816 definition of "record" by supplying the C<recsep> option in the C<tie>
 817 call:
 818
 819         tie @array, 'Tie::File', $file, recsep => 'es';
 820
 821 This says that records are delimited by the string C<es>.  If the file
 822 contained the following data:
 823
 824         Curse these pesky flies!\n
 825
 826 then the C<@array> would appear to have four elements:
 827
 828         "Curse thes"
 829         "e pes"
 830         "ky flies"
 831         "!\n"
 832
 833 An undefined value is not permitted as a record separator.  Perl's
 834 special "paragraph mode" semantics (E<agrave> la C<$/ = "">) are not
 835 emulated.
 836
 837 Records read from the tied array will have the record separator string
 838 on the end, just as if they were read from the C<E<lt>...E<gt>>
 839 operator.  Records stored into the array will have the record
 840 separator string appended before they are written to the file, if they
 841 don't have one already.  For example, if the record separator string
 842 is C<"\n">, then the following two lines do exactly the same thing:
 843
 844         $array[17] = "Cherry pie";
 845         $array[17] = "Cherry pie\n";
 846
 847 The result is that the contents of line 17 of the file will be
 848 replaced with "Cherry pie"; a newline character will separate line 17
 849 from line 18.  This means that in particular, this will do nothing:
 850
 851         chomp $array[17];
 852
 853 Because the C<chomp>ed value will have the separator reattached when
 854 it is written back to the file.  There is no way to create a file
 855 whose trailing record separator string is missing.
 856
 857 Inserting records that I<contain> the record separator string will
 858 produce a reasonable result, but if you can't foresee what this result
 859 will be, you'd better avoid doing this.
 860
 861 =head2 C<mode>
 862
 863 Normally, the specified file will be opened for read and write access,
 864 and will be created if it does not exist.  (That is, the flags
 865 C<O_RDWR | O_CREAT> are supplied in the C<open> call.)  If you want to
 866 change this, you may supply alternative flags in the C<mode> option.
 867 See L<Fcntl> for a listing of available flags.
 868 For example:
 869
 870         # open the file if it exists, but fail if it does not exist
 871         use Fcntl 'O_RDWR';
 872         tie @array, 'Tie::File', $file, mode => O_RDWR;
 873
 874         # create the file if it does not exist
 875         use Fcntl 'O_RDWR', 'O_CREAT';
 876         tie @array, 'Tie::File', $file, mode => O_RDWR | O_CREAT;
 877
 878         # open an existing file in read-only mode
 879         use Fcntl 'O_RDONLY';
 880         tie @array, 'Tie::File', $file, mode => O_RDONLY;
 881
 882 Opening the data file in write-only or append mode is not supported.
 883
 884 =head2 C<memory>
 885
 886 This is an (inexact) upper limit on the amount of memory that
 887 C<Tie::File> will consume at any time while managing the file.
 888 At present, this is used as a bound on the size of the read cache.
 889
 890 Records read in from the file are cached, to avoid having to re-read
 891 them repeatedly.  If you read the same record twice, the first time it
 892 will be stored in memory, and the second time it will be fetched from
 893 the I<read cache>.  The amount of data in the read cache will not
 894 exceed the value you specified for C<memory>.  If C<Tie::File> wants
 895 to cache a new record, but the read cache is full, it will make room
 896 by expiring the least-recently visited records from the read cache.
 897
 898 The default memory limit is 2Mib.  You can adjust the maximum read
 899 cache size by supplying the C<memory> option.  The argument is the
 900 desired cache size, in bytes.
 901
 902         # I have a lot of memory, so use a large cache to speed up access
 903         tie @array, 'Tie::File', $file, memory => 20_000_000;
 904
 905 Setting the memory limit to 0 will inhibit caching; records will be
 906 fetched from disk every time you examine them.
 907
 908 =head2 Option Format
 909
 910 C<-mode> is a synonym for C<mode>.  C<-recsep> is a synonym for
 911 C<recsep>.  C<-memory> is a synonym for C<memory>.  You get the
 912 idea.
 913
 914 =head1 Public Methods
 915
 916 The C<tie> call returns an object, say C<$o>.  You may call
 917
 918         $rec = $o->FETCH($n);
 919         $o->STORE($n, $rec);
 920
 921 to fetch or store the record at line C<$n>, respectively; similarly
 922 the other tied array methods.  (See L<perltie> for details.)  You may
 923 also call the following methods on this object:
 924
 925 =head2 C<flock>
 926
 927         $o->flock(MODE)
 928
 929 will lock the tied file.  C<MODE> has the same meaning as the second
 930 argument to the Perl built-in C<flock> function; for example
 931 C<LOCK_SH> or C<LOCK_EX | LOCK_NB>.  (These constants are provided by
 932 the C<use Fcntl ':flock'> declaration.)
 933
 934 C<MODE> is optional; C<$o-E<gt>flock> simply locks the file with
 935 C<LOCK_EX>.
 936
 937 The best way to unlock a file is to discard the object and untie the
 938 array.  It is probably unsafe to unlock the file without also untying
 939 it, because if you do, changes may remain unwritten inside the object.
 940 That is why there is no shortcut for unlocking.  If you really want to
 941 unlock the file prematurely, you know what to do; if you don't know
 942 what to do, then don't do it.
 943
 944 All the usual warnings about file locking apply here.  In particular,
 945 note that file locking in Perl is B<advisory>, which means that
 946 holding a lock will not prevent anyone else from reading, writing, or
 947 erasing the file; it only prevents them from getting another lock at
 948 the same time.  Locks are analogous to green traffic lights: If you
 949 have a green light, that does not prevent the idiot coming the other
 950 way from plowing into you sideways; it merely guarantees to you that
 951 the idiot does not also have a green light at the same time.
 952
 953 =head2 Tying to an already-opened filehandle
 954
 955 If C<$fh> is a filehandle, such as is returned by C<IO::File> or one
 956 of the other C<IO> modules, you may use:
 957
 958         tie @array, 'Tie::File', $fh, ...;
 959
 960 Similarly if you opened that handle C<FH> with regular C<open> or
 961 C<sysopen>, you may use:
 962
 963         tie @array, 'Tie::File', \*FH, ...;
 964
 965 Handles that were opened write-only won't work.  Handles that were
 966 opened read-only will work as long as you don't try to write to them.
 967 Handles must be attached to seekable sources of data---that means no
 968 pipes or sockets.  If you supply a non-seekable handle, the C<tie>
 969 call will try to abort your program.
 970
 971 =head1 CAVEATS
 972
 973 (That's Latin for 'warnings'.)
 974
 975 =over 4
 976
 977 =item *
 978
 979 This is BETA RELEASE SOFTWARE.  It may have bugs.  See the discussion
 980 below about the (lack of any) warranty.
 981
 982 =item *
 983
 984 Every effort was made to make this module efficient.  Nevertheless,
 985 changing the size of a record in the middle of a large file will
 986 always be fairly slow, because everything after the new record must be
 987 moved.
 988
 989 In particular, note that the following innocent-looking loop has very
 990 bad behavior:
 991
 992         # million-line file
 993         for (@file_array) {
 994           $_ .= 'x';
 995         }
 996
 997 This is likely to be very slow, because the first iteration must
 998 relocate lines 1 through 999,999; the second iteration must relocate
 999 lines 2 through 999,999, and so on.  The relocation is done using
1000 block writes, however, so it's not as slow as it might be.
1001
1002 A soon-to-be-released version of this module will provide a mechanism
1003 for getting better performance in such cases, by deferring the writing
1004 until it can be done all at once.  This deferred writing feature might
1005 be enabled automagically if C<Tie::File> guesses that you are about to write many consecutive records.  To disable this feature, use
1006
1007         (tied @o)->autodefer(0);
1008
1009 (At present, this call does nothing.)
1010
1011 =item *
1012
1013 The behavior of tied arrays is not precisely the same as for regular
1014 arrays.  For example:
1015
1016         undef $a[10];  print "How unusual!\n" if $a[10];
1017
1018 C<undef>-ing a C<Tie::File> array element just blanks out the
1019 corresponding record in the file.  When you read it back again, you'll
1020 see the record separator (typically, $a[10] will appear to contain
1021 "\n") so the supposedly-C<undef>'ed value will be true.
1022
1023 There are other minor differences, but in general, the correspondence
1024 is extremely close.
1025
1026 =item *
1027
1028 Not quite every effort was made to make this module as efficient as
1029 possible.  C<FETCHSIZE> should use binary search instead of linear
1030 search.  The cache's LRU queue should be a heap instead of a list.
1031 These defects are probably minor; in any event, they will be fixed in
1032 a later version of the module.
1033
1034 =item *
1035
1036 The author has supposed that since this module is concerned with file
1037 I/O, almost all normal use of it will be heavily I/O bound, and that
1038 the time to maintain complicated data structures inside the module
1039 will be dominated by the time to actually perform the I/O.  This
1040 suggests, for example, that an LRU read-cache is a good tradeoff,
1041 even if it requires substantial adjustment following a C<splice>
1042 operation.
1043
1044 =back
1045
1046 =head1 WHAT ABOUT C<DB_File>?
1047
1048 C<DB_File>'s C<DB_RECNO> feature does something similar to
1049 C<Tie::File>, but there are a number of reasons that you might prefer
1050 C<Tie::File>.  C<DB_File> is a great piece of software, but the
1051 C<DB_RECNO> part is less great than the rest of it.
1052
1053 =over 4
1054
1055 =item *
1056
1057 C<DB_File> reads your entire file into memory, modifies it in memory,
1058 and the writes out the entire file again when you untie the file.
1059 This is completely impractical for large files.
1060
1061 C<Tie::File> does not do any of those things.  It doesn't try to read
1062 the entire file into memory; instead it uses a lazy approach and
1063 caches recently-used records.  The cache size is strictly bounded by
1064 the C<memory> option.  DB_File's C<-E<gt>{cachesize}> doesn't prevent
1065 your process from blowing up when reading a big file.
1066
1067 =item *
1068
1069 C<DB_File> has an extremely poor writing strategy.  If you have a
1070 ten-megabyte file and tie it with C<DB_File>, and then use
1071
1072         $a[0] =~ s/PERL/Perl/;
1073
1074 C<DB_file> will then read the entire ten-megabyte file into memory, do
1075 the change, and write the entire file back to disk, reading ten
1076 megabytes and writing ten megabytes.  C<Tie::File> will read and write
1077 only the first record.
1078
1079 If you have a million-record file and tie it with C<DB_File>, and then
1080 use
1081
1082         $a[999998] =~ s/Larry/Larry Wall/;
1083
1084 C<DB_File> will read the entire million-record file into memory, do
1085 the change, and write the entire file back to disk.  C<Tie::File> will
1086 only rewrite records 999998 and 999999.  During the writing process,
1087 it will never have more than a few kilobytes of data in memory at any
1088 time, even if the two records are very large.
1089
1090 =item *
1091
1092 Since changes to C<DB_File> files only appear when you do C<untie>, it
1093 can be inconvenient to arrange for concurrent access to the same file
1094 by two or more processes.  Each process needs to call C<$db-E<gt>sync>
1095 after every write.  When you change a C<Tie::File> array, the changes
1096 are reflected in the file immediately; no explicit C<-E<gt>sync> call
1097 is required.  (The forthcoming "deferred writing" mode will allow you
1098 to request that writes be held in memory until explicitly C<sync>'ed.)
1099
1100 =item *
1101
1102 C<DB_File> is only installed by default if you already have the C<db>
1103 library on your system; C<Tie::File> is pure Perl and is installed by
1104 default no matter what.  Starting with Perl 5.7.3 you can be
1105 absolutely sure it will be everywhere.  You will never have that
1106 surety with C<DB_File>.  If you don't have C<DB_File> yet, it requires
1107 a C compiler.  You can install C<Tie::File> from CPAN in five minutes
1108 with no compiler.
1109
1110 =item *
1111
1112 C<DB_File> is written in C, so if you aren't allowed to install
1113 modules on your system, it is useless.  C<Tie::File> is written in Perl,
1114 so even if you aren't allowed to install modules, you can look into
1115 the source code, see how it works, and copy the subroutines or the
1116 ideas from the subroutines directly into your own Perl program.
1117
1118 =item *
1119
1120 Except in very old, unsupported versions, C<DB_File>'s free license
1121 requires that you distribute the source code for your entire
1122 application.  If you are not able to distribute the source code for
1123 your application, you must negotiate an alternative license from
1124 Sleepycat, possibly for a fee.  Tie::File is under the Perl Artistic
1125 license and can be distributed free under the same terms as Perl
1126 itself.
1127
1128 =back
1129
1130 =head1 AUTHOR
1131
1132 Mark Jason Dominus
1133
1134 To contact the author, send email to: C<mjd-perl-tiefile+@plover.com>
1135
1136 To receive an announcement whenever a new version of this module is
1137 released, send a blank email message to
1138 C<mjd-perl-tiefile-subscribe@plover.com>.
1139
1140 =head1 LICENSE
1141
1142 C<Tie::File> version 0.19 is copyright (C) 2002 Mark Jason Dominus.
1143
1144 This library is free software; you may redistribute it and/or modify
1145 it under the same terms as Perl itself.
1146
1147 These terms include your choice of (1) the Perl Artistic Licence, or
1148 (2) version 2 of the GNU General Public License as published by the
1149 Free Software Foundation, or (3) any later version of the GNU General
1150 Public License.
1151
1152 This library is distributed in the hope that it will be useful,
1153 but WITHOUT ANY WARRANTY; without even the implied warranty of
1154 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1155 GNU General Public License for more details.
1156
1157 You should have received a copy of the GNU General Public License
1158 along with this library program; it should be in the file C<COPYING>.
1159 If not, write to the Free Software Foundation, Inc., 59 Temple Place,
1160 Suite 330, Boston, MA 02111 USA
1161
1162 For licensing inquiries, contact the author at:
1163
1164         Mark Jason Dominus
1165         255 S. Warnock St.
1166         Philadelphia, PA 19107
1167
1168 =head1 WARRANTY
1169
1170 C<Tie::File> version 0.19 comes with ABSOLUTELY NO WARRANTY.
1171 For details, see the license.
1172
1173 =head1 THANKS
1174
1175 Gigantic thanks to Jarkko Hietaniemi, for agreeing to put this in the
1176 core when I hadn't written it yet, and for generally being helpful,
1177 supportive, and competent.  (Usually the rule is "choose any one.")
1178 Also big thanks to Abhijit Menon-Sen for all of the same things.
1179
1180 Special thanks to Craig Berry (for VMS portability help), Randy Kobes
1181 (for Win32 portability help), Clinton Pierce and Autrijus Tang (for
1182 heroic eleventh-hour Win32 testing above and beyond the call of duty),
1183 and the rest of the CPAN testers (for testing generally).
1184
1185 More thanks to:
1186 Edward Avis /
1187 Gerrit Haase /
1188 Nikola Knezevic /
1189 Nick Ing-Simmons /
1190 Tassilo von Parseval /
1191 H. Dieter Pearcey /
1192 Slaven Rezic /
1193 Peter Somu /
1194 Tels
1195
1196 =head1 TODO
1197
1198 Test DELETE machinery more carefully.
1199
1200 More tests.  (C<mode> option.  _twrite should be tested separately,
1201 because there are a lot of weird special cases lurking in there.)
1202
1203 More tests.  (Stuff I didn't think of yet.)
1204
1205 Paragraph mode?
1206
1207 More tests.
1208
1209 Fixed-length mode.
1210
1211 Maybe an autolocking mode?
1212
1213 Finish deferred writing.
1214
1215 Autodeferment.
1216
1217 Record locking with fcntl()?  Then you might support an undo log and
1218 get real transactions.  What a coup that would be.
1219
1220 Leave-blanks mode
1221
1222 =cut
1223