lib/Tie/File.pm

   1
   2 package Tie::File;
   3 use Carp;
   4 use POSIX 'SEEK_SET';
   5 use Fcntl 'O_CREAT', 'O_RDWR', 'LOCK_EX';
   6 require 5.005;
   7
   8 $VERSION = "0.20";
   9
  10 # Idea: The object will always contain an array of byte offsets
  11 # this will be filled in as is necessary and convenient.
  12 # fetch will do seek-read.
  13 # There will be a cache parameter that controls the amount of cached *data*
  14 # Also an LRU queue of cached records
  15 # store will read the relevant record into the cache
  16 # If it's the same length as what is being written, it will overwrite it in
  17 #   place; if not, it will do a from-to copying write.
  18 # The record separator string is also a parameter
  19
  20 # Record numbers start at ZERO.
  21
  22 my $DEFAULT_MEMORY_SIZE = 1<<21;    # 2 megabytes
  23
  24 my %good_opt = map {$_ => 1, "-$_" => 1}
  25                qw(memory dw_size mode recsep discipline autochomp);
  26
  27 sub TIEARRAY {
  28   if (@_ % 2 != 0) {
  29     croak "usage: tie \@array, $_[0], filename, [option => value]...";
  30   }
  31   my ($pack, $file, %opts) = @_;
  32
  33   # transform '-foo' keys into 'foo' keys
  34   for my $key (keys %opts) {
  35     unless ($good_opt{$key}) {
  36       croak("$pack: Unrecognized option '$key'\n");
  37     }
  38     my $okey = $key;
  39     if ($key =~ s/^-+//) {
  40       $opts{$key} = delete $opts{$okey};
  41     }
  42   }
  43
  44   unless (defined $opts{memory}) {
  45     # default is the larger of the default cache size and the
  46     # deferred-write buffer size (if specified)
  47     $opts{memory} = $DEFAULT_MEMORY_SIZE;
  48     $opts{memory} = $opts{dw_size}
  49       if defined $opts{dw_size} && $opts{dw_size} > $DEFAULT_MEMORY_SIZE;
  50   }
  51   $opts{dw_size} = $opts{memory} unless defined $opts{dw_size};
  52   if ($opts{dw_size} > $opts{memory}) {
  53       croak("$pack: dw_size may not be larger than total memory allocation\n");
  54   }
  55   $opts{deferred} = {};         # no records presently deferred
  56   $opts{deferred_s} = 0;        # count of total bytes in ->{deferred}
  57
  58   # the cache is a hash instead of an array because it is likely to be
  59   # sparsely populated
  60   $opts{cache} = {};
  61   $opts{cached} = 0;   # total size of cached data
  62   $opts{lru} = [];     # replace with heap in later version
  63
  64   $opts{offsets} = [0];
  65   $opts{filename} = $file;
  66   unless (defined $opts{recsep}) {
  67     $opts{recsep} = _default_recsep();
  68   }
  69   $opts{recseplen} = length($opts{recsep});
  70   if ($opts{recseplen} == 0) {
  71     croak "Empty record separator not supported by $pack";
  72   }
  73
  74   $opts{autochomp} = 1 unless defined $opts{autochomp};
  75
  76   my $mode = defined($opts{mode}) ? $opts{mode} : O_CREAT|O_RDWR;
  77   my $fh;
  78
  79   if (UNIVERSAL::isa($file, 'GLOB')) {
  80     unless (seek $file, 0, SEEK_SET) {
  81       croak "$pack: your filehandle does not appear to be seekable";
  82     }
  83     $fh = $file;
  84   } elsif (ref $file) {
  85     croak "usage: tie \@array, $pack, filename, [option => value]...";
  86   } else {
  87     $fh = \do { local *FH };   # only works in 5.005 and later
  88     sysopen $fh, $file, $mode, 0666 or return;
  89     binmode $fh;
  90   }
  91   { my $ofh = select $fh; $| = 1; select $ofh } # autoflush on write
  92   if (defined $opts{discipline} && $] >= 5.006) {
  93     # This avoids a compile-time warning under 5.005
  94     eval 'binmode($fh, $opts{discipline})';
  95     croak $@ if $@ =~ /unknown discipline/i;
  96     die if $@;
  97   }
  98   $opts{fh} = $fh;
  99
 100   bless \%opts => $pack;
 101 }
 102
 103 sub FETCH {
 104   my ($self, $n) = @_;
 105   $self->_chomp1($self->_fetch($n));
 106 }
 107
 108 # Chomp many records in-place; return nothing useful
 109 sub _chomp {
 110   my $self = shift;
 111   return unless $self->{autochomp};
 112   if ($self->{autochomp}) {
 113     for (@_) {
 114       next unless defined;
 115       substr($_, - $self->{recseplen}) = "";
 116     }
 117   }
 118 }
 119
 120 # Chomp one record in-place; return modified record
 121 sub _chomp1 {
 122   my ($self, $rec) = @_;
 123   return $rec unless $self->{autochomp};
 124   return unless defined $rec;
 125   substr($rec, - $self->{recseplen}) = "";
 126   $rec;
 127 }
 128
 129 sub _fetch {
 130   my ($self, $n) = @_;
 131
 132   # check the record cache
 133   { my $cached = $self->_check_cache($n);
 134     return $cached if defined $cached;
 135   }
 136
 137   unless ($#{$self->{offsets}} >= $n) {
 138     my $o = $self->_fill_offsets_to($n);
 139     # If it's still undefined, there is no such record, so return 'undef'
 140     return unless defined $o;
 141   }
 142
 143   my $fh = $self->{FH};
 144   $self->_seek($n);             # we can do this now that offsets is populated
 145   my $rec = $self->_read_record;
 146
 147 # If we happen to have just read the first record, check to see if
 148 # the length of the record matches what 'tell' says.  If not, Tie::File
 149 # won't work, and should drop dead.
 150 #
 151 #  if ($n == 0 && defined($rec) && tell($self->{fh}) != length($rec)) {
 152 #    if (defined $self->{discipline}) {
 153 #      croak "I/O discipline $self->{discipline} not supported";
 154 #    } else {
 155 #      croak "File encoding not supported";
 156 #    }
 157 #  }
 158
 159   $self->_cache_insert($n, $rec) if defined $rec;
 160   $rec;
 161 }
 162
 163 sub STORE {
 164   my ($self, $n, $rec) = @_;
 165
 166   $self->_fixrecs($rec);
 167
 168   return $self->_store_deferred($n, $rec) if $self->{defer};
 169
 170   # We need this to decide whether the new record will fit
 171   # It incidentally populates the offsets table
 172   # Note we have to do this before we alter the cache
 173   my $oldrec = $self->_fetch($n);
 174
 175   # _check_cache promotes record $n to MRU.  Is this correct behavior?
 176   if (my $cached = $self->_check_cache($n)) {
 177     my $len_diff = length($rec) - length($cached);
 178     $self->{cache}{$n} = $rec;
 179     $self->{cached} += $len_diff;
 180     $self->_cache_flush
 181       if $len_diff > 0
 182         && $self->{deferred_s} + $self->{cached} > $self->{memory};
 183   }
 184
 185   if (not defined $oldrec) {
 186     # We're storing a record beyond the end of the file
 187     $self->_extend_file_to($n+1);
 188     $oldrec = $self->{recsep};
 189   }
 190   my $len_diff = length($rec) - length($oldrec);
 191
 192   # length($oldrec) here is not consistent with text mode  TODO XXX BUG
 193   $self->_twrite($rec, $self->{offsets}[$n], length($oldrec));
 194
 195   # now update the offsets
 196   # array slice goes from element $n+1 (the first one to move)
 197   # to the end
 198   for (@{$self->{offsets}}[$n+1 .. $#{$self->{offsets}}]) {
 199     $_ += $len_diff;
 200   }
 201 }
 202
 203 sub _store_deferred {
 204   my ($self, $n, $rec) = @_;
 205   $self->_uncache($n);
 206   my $old_deferred = $self->{deferred}{$n};
 207   $self->{deferred}{$n} = $rec;
 208   $self->{deferred_s} += length($rec);
 209   $self->{deferred_s} -= length($old_deferred) if defined $old_deferred;
 210   if ($self->{deferred_s} > $self->{dw_size}) {
 211     $self->flush;
 212     $self->defer;               # flush clears the 'defer' flag
 213   } elsif ($self->{deferred_s} + $self->{cached} > $self->{memory}) {
 214     $self->_cache_flush;
 215   }
 216 }
 217
 218 sub FETCHSIZE {
 219   my $self = shift;
 220   my $n = $#{$self->{offsets}};
 221   while (defined ($self->_fill_offsets_to($n+1))) {
 222     ++$n;
 223   }
 224   $n;
 225 }
 226
 227 sub STORESIZE {
 228   my ($self, $len) = @_;
 229   my $olen = $self->FETCHSIZE;
 230   return if $len == $olen;      # Woo-hoo!
 231
 232   # file gets longer
 233   if ($len > $olen) {
 234     $self->_extend_file_to($len);
 235     return;
 236   }
 237
 238   # file gets shorter
 239   $self->_seek($len);
 240   $self->_chop_file;
 241   $#{$self->{offsets}} = $len;
 242 #  $self->{offsets}[0] = 0;      # in case we just chopped this
 243   my @cached = grep $_ >= $len, keys %{$self->{cache}};
 244   $self->_uncache(@cached);
 245 }
 246
 247 sub PUSH {
 248   my $self = shift;
 249   $self->SPLICE($self->FETCHSIZE, scalar(@_), @_);
 250   $self->FETCHSIZE;
 251 }
 252
 253 sub POP {
 254   my $self = shift;
 255   my $size = $self->FETCHSIZE;
 256   return if $size == 0;
 257 #  print STDERR "# POPPITY POP POP POP\n";
 258   scalar $self->SPLICE($size-1, 1);
 259 }
 260
 261 sub SHIFT {
 262   my $self = shift;
 263   scalar $self->SPLICE(0, 1);
 264 }
 265
 266 sub UNSHIFT {
 267   my $self = shift;
 268   $self->SPLICE(0, 0, @_);
 269   $self->FETCHSIZE;
 270 }
 271
 272 sub CLEAR {
 273   # And enable auto-defer mode, since it's likely that they just
 274   # did @a = (...);
 275   my $self = shift;
 276   $self->_seekb(0);
 277   $self->_chop_file;
 278   %{$self->{cache}}   = ();
 279     $self->{cached}   = 0;
 280   @{$self->{lru}}     = ();
 281   @{$self->{offsets}} = (0);
 282 }
 283
 284 sub EXTEND {
 285   my ($self, $n) = @_;
 286   $self->_fill_offsets_to($n);
 287   $self->_extend_file_to($n);
 288 }
 289
 290 sub DELETE {
 291   my ($self, $n) = @_;
 292   my $lastrec = $self->FETCHSIZE-1;
 293   if ($n == $lastrec) {
 294     $self->_seek($n);
 295     $self->_chop_file;
 296     $#{$self->{offsets}}--;
 297     $self->_uncache($n);
 298     # perhaps in this case I should also remove trailing null records?
 299   } else {
 300     $self->STORE($n, "");
 301   }
 302 }
 303
 304 sub EXISTS {
 305   my ($self, $n) = @_;
 306   $self->_fill_offsets_to($n);
 307   0 <= $n && $n < $self->FETCHSIZE;
 308 }
 309
 310 sub SPLICE {
 311   my $self = shift;
 312   $self->_flush if $self->{defer};
 313   if (wantarray) {
 314     $self->_chomp(my @a = $self->_splice(@_));
 315     @a;
 316   } else {
 317     $self->_chomp1(scalar $self->_splice(@_));
 318   }
 319 }
 320
 321 sub DESTROY {
 322   $self->flush if $self->{defer};
 323 }
 324
 325 sub _splice {
 326   my ($self, $pos, $nrecs, @data) = @_;
 327   my @result;
 328
 329   $pos = 0 unless defined $pos;
 330
 331   # Deal with negative and other out-of-range positions
 332   # Also set default for $nrecs
 333   {
 334     my $oldsize = $self->FETCHSIZE;
 335     $nrecs = $oldsize unless defined $nrecs;
 336     my $oldpos = $pos;
 337
 338     if ($pos < 0) {
 339       $pos += $oldsize;
 340       if ($pos < 0) {
 341         croak "Modification of non-creatable array value attempted, subscript $oldpos";
 342       }
 343     }
 344
 345     if ($pos > $oldsize) {
 346       return unless @data;
 347       $pos = $oldsize;          # This is what perl does for normal arrays
 348     }
 349   }
 350
 351   $self->_fixrecs(@data);
 352   my $data = join '', @data;
 353   my $datalen = length $data;
 354   my $oldlen = 0;
 355
 356   # compute length of data being removed
 357   # Incidentally fills offsets table
 358   for ($pos .. $pos+$nrecs-1) {
 359     my $rec = $self->_fetch($_);
 360     last unless defined $rec;
 361     push @result, $rec;
 362     $oldlen += length($rec);
 363   }
 364
 365   # Modify the file
 366   $self->_twrite($data, $self->{offsets}[$pos], $oldlen);
 367
 368   # update the offsets table part 1
 369   # compute the offsets of the new records:
 370   my @new_offsets;
 371   if (@data) {
 372     push @new_offsets, $self->{offsets}[$pos];
 373     for (0 .. $#data-1) {
 374       push @new_offsets, $new_offsets[-1] + length($data[$_]);
 375     }
 376   }
 377   splice(@{$self->{offsets}}, $pos, $nrecs, @new_offsets);
 378
 379   # update the offsets table part 2
 380   # adjust the offsets of the following old records
 381   for ($pos+@data .. $#{$self->{offsets}}) {
 382     $self->{offsets}[$_] += $datalen - $oldlen;
 383   }
 384   # If we scrubbed out all known offsets, regenerate the trivial table
 385   # that knows that the file does indeed start at 0.
 386   $self->{offsets}[0] = 0 unless @{$self->{offsets}};
 387
 388   # Perhaps the following cache foolery could be factored out
 389   # into a bunch of mor opaque cache functions.  For example,
 390   # it's odd to delete a record from the cache and then remove
 391   # it from the LRU queue later on; there should be a function to
 392   # do both at once.
 393
 394   # update the read cache, part 1
 395   # modified records
 396   # Consider this carefully for correctness
 397   for ($pos .. $pos+$nrecs-1) {
 398     my $cached = $self->{cache}{$_};
 399     next unless defined $cached;
 400     my $new = $data[$_-$pos];
 401     if (defined $new) {
 402       $self->{cached} += length($new) - length($cached);
 403       $self->{cache}{$_} = $new;
 404     } else {
 405       $self->_uncache($_);
 406     }
 407   }
 408   # update the read cache, part 2
 409   # moved records - records past the site of the change
 410   # need to be renumbered
 411   # Maybe merge this with the previous block?
 412   {
 413     my %adjusted;
 414     for (keys %{$self->{cache}}) {
 415       next unless $_ >= $pos + $nrecs;
 416       $adjusted{$_-$nrecs+@data} = delete $self->{cache}{$_};
 417     }
 418     @{$self->{cache}}{keys %adjusted} = values %adjusted;
 419 #    for (keys %{$self->{cache}}) {
 420 #      next unless $_ >= $pos + $nrecs;
 421 #      $self->{cache}{$_-$nrecs+@data} = delete $self->{cache}{$_};
 422 #    }
 423   }
 424
 425   # fix the LRU queue
 426   my(@new, @changed);
 427   for (@{$self->{lru}}) {
 428     if ($_ >= $pos + $nrecs) {
 429       push @new, $_ + @data - $nrecs;
 430     } elsif ($_ >= $pos) {
 431       push @changed, $_ if $_ < $pos + @data;
 432     } else {
 433       push @new, $_;
 434     }
 435   }
 436   @{$self->{lru}} = (@new, @changed);
 437
 438   # Now there might be too much data in the cache, if we spliced out
 439   # some short records and spliced in some long ones.  If so, flush
 440   # the cache.
 441   $self->_cache_flush;
 442
 443   # Yes, the return value of 'splice' *is* actually this complicated
 444   wantarray ? @result : @result ? $result[-1] : undef;
 445 }
 446
 447 # write data into the file
 448 # $data is the data to be written.
 449 # it should be written at position $pos, and should overwrite
 450 # exactly $len of the following bytes.
 451 # Note that if length($data) > $len, the subsequent bytes will have to
 452 # be moved up, and if length($data) < $len, they will have to
 453 # be moved down
 454 sub _twrite {
 455   my ($self, $data, $pos, $len) = @_;
 456
 457   unless (defined $pos) {
 458     die "\$pos was undefined in _twrite";
 459   }
 460
 461   my $len_diff = length($data) - $len;
 462
 463   if ($len_diff == 0) {          # Woo-hoo!
 464     my $fh = $self->{fh};
 465     $self->_seekb($pos);
 466     $self->_write_record($data);
 467     return;                     # well, that was easy.
 468   }
 469
 470   # the two records are of different lengths
 471   # our strategy here: rewrite the tail of the file,
 472   # reading ahead one buffer at a time
 473   # $bufsize is required to be at least as large as the data we're overwriting
 474   my $bufsize = _bufsize($len_diff);
 475   my ($writepos, $readpos) = ($pos, $pos+$len);
 476   my $next_block;
 477
 478   # Seems like there ought to be a way to avoid the repeated code
 479   # and the special case here.  The read(1) is also a little weird.
 480   # Think about this.
 481   do {
 482     $self->_seekb($readpos);
 483     my $br = read $self->{fh}, $next_block, $bufsize;
 484     my $more_data = read $self->{fh}, my($dummy), 1;
 485     $self->_seekb($writepos);
 486     $self->_write_record($data);
 487     $readpos += $br;
 488     $writepos += length $data;
 489     $data = $next_block;
 490   } while $more_data;
 491   $self->_seekb($writepos);
 492   $self->_write_record($next_block);
 493
 494   # There might be leftover data at the end of the file
 495   $self->_chop_file if $len_diff < 0;
 496 }
 497
 498 # If a record does not already end with the appropriate terminator
 499 # string, append one.
 500 sub _fixrecs {
 501   my $self = shift;
 502   for (@_) {
 503     $_ .= $self->{recsep}
 504       unless substr($_, - $self->{recseplen}) eq $self->{recsep};
 505   }
 506 }
 507
 508 # seek to the beginning of record #$n
 509 # Assumes that the offsets table is already correctly populated
 510 #
 511 # Note that $n=-1 has a special meaning here: It means the start of
 512 # the last known record; this may or may not be the very last record
 513 # in the file, depending on whether the offsets table is fully populated.
 514 #
 515 sub _seek {
 516   my ($self, $n) = @_;
 517   my $o = $self->{offsets}[$n];
 518   defined($o)
 519     or confess("logic error: undefined offset for record $n");
 520   seek $self->{fh}, $o, SEEK_SET
 521     or die "Couldn't seek filehandle: $!";  # "Should never happen."
 522 }
 523
 524 sub _seekb {
 525   my ($self, $b) = @_;
 526   seek $self->{fh}, $b, SEEK_SET
 527     or die "Couldn't seek filehandle: $!";  # "Should never happen."
 528 }
 529
 530 # populate the offsets table up to the beginning of record $n
 531 # return the offset of record $n
 532 sub _fill_offsets_to {
 533   my ($self, $n) = @_;
 534   my $fh = $self->{fh};
 535   local *OFF = $self->{offsets};
 536   my $rec;
 537
 538   until ($#OFF >= $n) {
 539     my $o = $OFF[-1];
 540     $self->_seek(-1);           # tricky -- see comment at _seek
 541     $rec = $self->_read_record;
 542     if (defined $rec) {
 543       push @OFF, tell $fh;
 544     } else {
 545       return;                   # It turns out there is no such record
 546     }
 547   }
 548
 549   # we have now read all the records up to record n-1,
 550   # so we can return the offset of record n
 551   return $OFF[$n];
 552 }
 553
 554 # assumes that $rec is already suitably terminated
 555 sub _write_record {
 556   my ($self, $rec) = @_;
 557   my $fh = $self->{fh};
 558   print $fh $rec
 559     or die "Couldn't write record: $!";  # "Should never happen."
 560
 561 }
 562
 563 sub _read_record {
 564   my $self = shift;
 565   my $rec;
 566   { local $/ = $self->{recsep};
 567     my $fh = $self->{fh};
 568     $rec = <$fh>;
 569   }
 570   $rec;
 571 }
 572
 573 sub _cache_insert {
 574   my ($self, $n, $rec) = @_;
 575
 576   # Do not cache records that are too big to fit in the cache.
 577   return unless length $rec <= $self->{memory};
 578
 579   $self->{cache}{$n} = $rec;
 580   $self->{cached} += length $rec;
 581   push @{$self->{lru}}, $n;     # most-recently-used is at the END
 582
 583   $self->_cache_flush if $self->{cached} > $self->{memory};
 584 }
 585
 586 sub _uncache {
 587   my $self = shift;
 588   for my $n (@_) {
 589     my $cached = delete $self->{cache}{$n};
 590     next unless defined $cached;
 591     @{$self->{lru}} = grep $_ != $n, @{$self->{lru}};
 592     $self->{cached} -= length($cached);
 593   }
 594 }
 595
 596 sub _check_cache {
 597   my ($self, $n) = @_;
 598   my $rec;
 599   return unless defined($rec = $self->{cache}{$n});
 600
 601   # cache hit; update LRU queue and return $rec
 602   # replace this with a heap in a later version
 603   @{$self->{lru}} = ((grep $_ ne $n, @{$self->{lru}}), $n);
 604   $rec;
 605 }
 606
 607 sub _cache_flush {
 608   my ($self) = @_;
 609   while ($self->{cached} + $self->{deferred_s} > $self->{memory}) {
 610     my $lru = shift @{$self->{lru}};
 611     my $rec = delete $self->{cache}{$lru};
 612     $self->{cached} -= length $rec;
 613   }
 614 }
 615
 616 # We have read to the end of the file and have the offsets table
 617 # entirely populated.  Now we need to write a new record beyond
 618 # the end of the file.  We prepare for this by writing
 619 # empty records into the file up to the position we want
 620 #
 621 # assumes that the offsets table already contains the offset of record $n,
 622 # if it exists, and extends to the end of the file if not.
 623 sub _extend_file_to {
 624   my ($self, $n) = @_;
 625   $self->_seek(-1);             # position after the end of the last record
 626   my $pos = $self->{offsets}[-1];
 627
 628   # the offsets table has one entry more than the total number of records
 629   $extras = $n - $#{$self->{offsets}};
 630
 631   # Todo : just use $self->{recsep} x $extras here?
 632   while ($extras-- > 0) {
 633     $self->_write_record($self->{recsep});
 634     push @{$self->{offsets}}, tell $self->{fh};
 635   }
 636 }
 637
 638 # Truncate the file at the current position
 639 sub _chop_file {
 640   my $self = shift;
 641   truncate $self->{fh}, tell($self->{fh});
 642 }
 643
 644 # compute the size of a buffer suitable for moving
 645 # all the data in a file forward $n bytes
 646 # ($n may be negative)
 647 # The result should be at least $n.
 648 sub _bufsize {
 649   my $n = shift;
 650   return 8192 if $n < 0;
 651   my $b = $n & ~8191;
 652   $b += 8192 if $n & 8191;
 653   $b;
 654 }
 655
 656 # Lock the file
 657 sub flock {
 658   my ($self, $op) = @_;
 659   unless (@_ <= 3) {
 660     my $pack = ref $self;
 661     croak "Usage: $pack\->flock([OPERATION])";
 662   }
 663   my $fh = $self->{fh};
 664   $op = LOCK_EX unless defined $op;
 665   flock $fh, $op;
 666 }
 667
 668 # Defer writes
 669 sub defer {
 670   my $self = shift;
 671   $self->{defer} = 1;
 672 }
 673
 674 # Get/set autochomp option
 675 sub autochomp {
 676   my $self = shift;
 677   if (@_) {
 678     my $old = $self->{autochomp};
 679     $self->{autochomp} = shift;
 680     $old;
 681   } else {
 682     $self->{autochomp};
 683   }
 684 }
 685
 686 # Flush deferred writes
 687 #
 688 # This could be better optimized to write the file in one pass, instead
 689 # of one pass per block of records.  But that will require modifications
 690 # to _twrite, so I should have a good _twite test suite first.
 691 sub flush {
 692   my $self = shift;
 693
 694   $self->_flush;
 695   $self->{defer} = 0;
 696 }
 697
 698 sub _flush {
 699   my $self = shift;
 700   my @writable = sort {$a<=>$b} (keys %{$self->{deferred}});
 701
 702   while (@writable) {
 703     # gather all consecutive records from the front of @writable
 704     my $first_rec = shift @writable;
 705     my $last_rec = $first_rec+1;
 706     ++$last_rec, shift @writable while @writable && $last_rec == $writable[0];
 707     --$last_rec;
 708     $self->_fill_offsets_to($last_rec);
 709     $self->_extend_file_to($last_rec);
 710     $self->_splice($first_rec, $last_rec-$first_rec+1,
 711                    @{$self->{deferred}}{$first_rec .. $last_rec});
 712   }
 713
 714   $self->discard;               # clear out defered-write-cache
 715 }
 716
 717 # Discard deferred writes
 718 sub discard {
 719   my $self = shift;
 720   undef $self->{deferred};
 721   $self->{deferred_s} = 0;
 722   $self->{defer} = 0;
 723 }
 724
 725 # Not yet implemented
 726 sub autodefer { }
 727
 728 sub _default_recsep {
 729   my $recsep = $/;
 730   if ($^O eq 'MSWin32') {
 731     # Windows users expect files to be terminated with \r\n
 732     # But $/ is set to \n instead
 733     # Note that this also transforms \n\n into \r\n\r\n.
 734     # That is a feature.
 735     $recsep =~ s/\n/\r\n/g;
 736   }
 737   $recsep;
 738 }
 739
 740 # Given a file, make sure the cache is consistent with the
 741 # file contents
 742 sub _check_integrity {
 743   my ($self, $file, $warn) = @_;
 744   my $good = 1;
 745
 746   if (not defined $self->{offsets}[0]) {
 747     $warn && print STDERR "# offset 0 is missing!\n";
 748     $good = 0;
 749   } elsif ($self->{offsets}[0] != 0) {
 750     $warn && print STDERR "# rec 0: offset <$self->{offsets}[0]> s/b 0!\n";
 751     $good = 0;
 752   }
 753
 754   local *F = $self->{fh};
 755   seek F, 0, SEEK_SET;
 756   local $/ = $self->{recsep};
 757   $. = 0;
 758
 759   while (<F>) {
 760     my $n = $. - 1;
 761     my $cached = $self->{cache}{$n};
 762     my $offset = $self->{offsets}[$.];
 763     my $ao = tell F;
 764     if (defined $offset && $offset != $ao) {
 765       $warn && print STDERR "# rec $n: offset <$offset> actual <$ao>\n";
 766       $good = 0;
 767     }
 768     if (defined $cached && $_ ne $cached) {
 769       $good = 0;
 770       chomp $cached;
 771       chomp;
 772       $warn && print STDERR "# rec $n: cached <$cached> actual <$_>\n";
 773     }
 774   }
 775
 776   my $memory = 0;
 777   while (my ($n, $r) = each %{$self->{cache}}) {
 778     $memory += length($r);
 779     next if $n+1 <= $.;         # checked this already
 780     $warn && print STDERR "# spurious caching of record $n\n";
 781     $good = 0;
 782   }
 783   if ($memory != $self->{cached}) {
 784     $warn && print STDERR "# cache size is $self->{cached}, should be $memory\n";
 785     $good = 0;
 786   }
 787
 788   my (%seen, @duplicate);
 789   for (@{$self->{lru}}) {
 790     $seen{$_}++;
 791     if (not exists $self->{cache}{$_}) {
 792       $warn && print "# $_ is mentioned in the LRU queue, but not in the cache\n";
 793       $good = 0;
 794     }
 795   }
 796   @duplicate = grep $seen{$_}>1, keys %seen;
 797   if (@duplicate) {
 798     my $records = @duplicate == 1 ? 'Record' : 'Records';
 799     my $appear  = @duplicate == 1 ? 'appears' : 'appear';
 800     $warn && print "# $records @duplicate $appear multiple times in LRU queue: @{$self->{lru}}\n";
 801     $good = 0;
 802   }
 803   for (keys %{$self->{cache}}) {
 804     unless (exists $seen{$_}) {
 805       print "# record $_ is in the cache but not the LRU queue\n";
 806       $good = 0;
 807     }
 808   }
 809
 810   $good;
 811 }
 812
 813 "Cogito, ergo sum.";  # don't forget to return a true value from the file
 814
 815 =head1 NAME
 816
 817 Tie::File - Access the lines of a disk file via a Perl array
 818
 819 =head1 SYNOPSIS
 820
 821         # This file documents Tie::File version 0.20
 822
 823         tie @array, 'Tie::File', filename or die ...;
 824
 825         $array[13] = 'blah';     # line 13 of the file is now 'blah'
 826         print $array[42];        # display line 42 of the file
 827
 828         $n_recs = @array;        # how many records are in the file?
 829         $#array = $n_recs - 2;   # chop records off the end
 830
 831         # As you would expect:
 832
 833         push @array, new recs...;
 834         my $r1 = pop @array;
 835         unshift @array, new recs...;
 836         my $r1 = shift @array;
 837         @old_recs = splice @array, 3, 7, new recs...;
 838
 839         untie @array;            # all finished
 840
 841 =head1 DESCRIPTION
 842
 843 C<Tie::File> represents a regular text file as a Perl array.  Each
 844 element in the array corresponds to a record in the file.  The first
 845 line of the file is element 0 of the array; the second line is element
 846 1, and so on.
 847
 848 The file is I<not> loaded into memory, so this will work even for
 849 gigantic files.
 850
 851 Changes to the array are reflected in the file immediately.
 852
 853 Lazy people may now stop reading the manual.
 854
 855 =head2 C<recsep>
 856
 857 What is a 'record'?  By default, the meaning is the same as for the
 858 C<E<lt>...E<gt>> operator: It's a string terminated by C<$/>, which is
 859 probably C<"\n">.  (Minor exception: on dos and Win32 systems, a
 860 'record' is a string terminated by C<"\r\n">.)  You may change the
 861 definition of "record" by supplying the C<recsep> option in the C<tie>
 862 call:
 863
 864         tie @array, 'Tie::File', $file, recsep => 'es';
 865
 866 This says that records are delimited by the string C<es>.  If the file
 867 contained the following data:
 868
 869         Curse these pesky flies!\n
 870
 871 then the C<@array> would appear to have four elements:
 872
 873         "Curse th"
 874         "e p"
 875         "ky fli"
 876         "!\n"
 877
 878 An undefined value is not permitted as a record separator.  Perl's
 879 special "paragraph mode" semantics (E<agrave> la C<$/ = "">) are not
 880 emulated.
 881
 882 Records read from the tied array do not have the record separator
 883 string on the end; this is to allow
 884
 885         $array[17] .= "extra";
 886
 887 to work as expected.
 888
 889 (See L<"autochomp">, below.)  Records stored into the array will have
 890 the record separator string appended before they are written to the
 891 file, if they don't have one already.  For example, if the record
 892 separator string is C<"\n">, then the following two lines do exactly
 893 the same thing:
 894
 895         $array[17] = "Cherry pie";
 896         $array[17] = "Cherry pie\n";
 897
 898 The result is that the contents of line 17 of the file will be
 899 replaced with "Cherry pie"; a newline character will separate line 17
 900 from line 18.  This means that in particular, this will do nothing:
 901
 902         chomp $array[17];
 903
 904 Because the C<chomp>ed value will have the separator reattached when
 905 it is written back to the file.  There is no way to create a file
 906 whose trailing record separator string is missing.
 907
 908 Inserting records that I<contain> the record separator string will
 909 produce a reasonable result, but if you can't foresee what this result
 910 will be, you'd better avoid doing this.
 911
 912 =head2 C<autochomp>
 913
 914 Normally, array elements have the record separator removed, so that if
 915 the file contains the text
 916
 917         Gold
 918         Frankincense
 919         Myrrh
 920
 921 the tied array will appear to contain C<("Gold", "Frankincense", "Myrrh")>.
 922 If you set C<autochomp> to a false value, the record separator will not be removed.  If the file above was tied with
 923
 924         tie @gifts, "Tie::File", $gifts, autochomp => 0;
 925
 926 then the array C<@gifts> would appear to contain C<("Gold\n",
 927 "Frankincense\n", "Myrrh\n")>, or (on Win32 systems) C<("Gold\r\n",
 928 "Frankincense\r\n", "Myrrh\r\n")>.
 929
 930 =head2 C<mode>
 931
 932 Normally, the specified file will be opened for read and write access,
 933 and will be created if it does not exist.  (That is, the flags
 934 C<O_RDWR | O_CREAT> are supplied in the C<open> call.)  If you want to
 935 change this, you may supply alternative flags in the C<mode> option.
 936 See L<Fcntl> for a listing of available flags.
 937 For example:
 938
 939         # open the file if it exists, but fail if it does not exist
 940         use Fcntl 'O_RDWR';
 941         tie @array, 'Tie::File', $file, mode => O_RDWR;
 942
 943         # create the file if it does not exist
 944         use Fcntl 'O_RDWR', 'O_CREAT';
 945         tie @array, 'Tie::File', $file, mode => O_RDWR | O_CREAT;
 946
 947         # open an existing file in read-only mode
 948         use Fcntl 'O_RDONLY';
 949         tie @array, 'Tie::File', $file, mode => O_RDONLY;
 950
 951 Opening the data file in write-only or append mode is not supported.
 952
 953 =head2 C<memory>
 954
 955 This is an (inexact) upper limit on the amount of memory that
 956 C<Tie::File> will consume at any time while managing the file.
 957 At present, this is used as a bound on the size of the read cache.
 958
 959 Records read in from the file are cached, to avoid having to re-read
 960 them repeatedly.  If you read the same record twice, the first time it
 961 will be stored in memory, and the second time it will be fetched from
 962 the I<read cache>.  The amount of data in the read cache will not
 963 exceed the value you specified for C<memory>.  If C<Tie::File> wants
 964 to cache a new record, but the read cache is full, it will make room
 965 by expiring the least-recently visited records from the read cache.
 966
 967 The default memory limit is 2Mib.  You can adjust the maximum read
 968 cache size by supplying the C<memory> option.  The argument is the
 969 desired cache size, in bytes.
 970
 971         # I have a lot of memory, so use a large cache to speed up access
 972         tie @array, 'Tie::File', $file, memory => 20_000_000;
 973
 974 Setting the memory limit to 0 will inhibit caching; records will be
 975 fetched from disk every time you examine them.
 976
 977 =head2 Option Format
 978
 979 C<-mode> is a synonym for C<mode>.  C<-recsep> is a synonym for
 980 C<recsep>.  C<-memory> is a synonym for C<memory>.  You get the
 981 idea.
 982
 983 =head1 Public Methods
 984
 985 The C<tie> call returns an object, say C<$o>.  You may call
 986
 987         $rec = $o->FETCH($n);
 988         $o->STORE($n, $rec);
 989
 990 to fetch or store the record at line C<$n>, respectively; similarly
 991 the other tied array methods.  (See L<perltie> for details.)  You may
 992 also call the following methods on this object:
 993
 994 =head2 C<flock>
 995
 996         $o->flock(MODE)
 997
 998 will lock the tied file.  C<MODE> has the same meaning as the second
 999 argument to the Perl built-in C<flock> function; for example
1000 C<LOCK_SH> or C<LOCK_EX | LOCK_NB>.  (These constants are provided by
1001 the C<use Fcntl ':flock'> declaration.)
1002
1003 C<MODE> is optional; C<$o-E<gt>flock> simply locks the file with
1004 C<LOCK_EX>.
1005
1006 The best way to unlock a file is to discard the object and untie the
1007 array.  It is probably unsafe to unlock the file without also untying
1008 it, because if you do, changes may remain unwritten inside the object.
1009 That is why there is no shortcut for unlocking.  If you really want to
1010 unlock the file prematurely, you know what to do; if you don't know
1011 what to do, then don't do it.
1012
1013 All the usual warnings about file locking apply here.  In particular,
1014 note that file locking in Perl is B<advisory>, which means that
1015 holding a lock will not prevent anyone else from reading, writing, or
1016 erasing the file; it only prevents them from getting another lock at
1017 the same time.  Locks are analogous to green traffic lights: If you
1018 have a green light, that does not prevent the idiot coming the other
1019 way from plowing into you sideways; it merely guarantees to you that
1020 the idiot does not also have a green light at the same time.
1021
1022 =head2 C<autochomp>
1023
1024         my $old_value = $o->autochomp(0);    # disable autochomp option
1025         my $old_value = $o->autochomp(1);    #  enable autochomp option
1026
1027         my $ac = $o->autochomp();   # recover current value
1028
1029 See L<"autochomp">, above.
1030
1031 =head1 Tying to an already-opened filehandle
1032
1033 If C<$fh> is a filehandle, such as is returned by C<IO::File> or one
1034 of the other C<IO> modules, you may use:
1035
1036         tie @array, 'Tie::File', $fh, ...;
1037
1038 Similarly if you opened that handle C<FH> with regular C<open> or
1039 C<sysopen>, you may use:
1040
1041         tie @array, 'Tie::File', \*FH, ...;
1042
1043 Handles that were opened write-only won't work.  Handles that were
1044 opened read-only will work as long as you don't try to write to them.
1045 Handles must be attached to seekable sources of data---that means no
1046 pipes or sockets.  If you supply a non-seekable handle, the C<tie>
1047 call will try to abort your program.
1048
1049 =head1 CAVEATS
1050
1051 (That's Latin for 'warnings'.)
1052
1053 =over 4
1054
1055 =item *
1056
1057 This is BETA RELEASE SOFTWARE.  It may have bugs.  See the discussion
1058 below about the (lack of any) warranty.
1059
1060 =item *
1061
1062 Every effort was made to make this module efficient.  Nevertheless,
1063 changing the size of a record in the middle of a large file will
1064 always be fairly slow, because everything after the new record must be
1065 moved.
1066
1067 In particular, note that the following innocent-looking loop has very
1068 bad behavior:
1069
1070         # million-line file
1071         for (@file_array) {
1072           $_ .= 'x';
1073         }
1074
1075 This is likely to be very slow, because the first iteration must
1076 relocate lines 1 through 999,999; the second iteration must relocate
1077 lines 2 through 999,999, and so on.  The relocation is done using
1078 block writes, however, so it's not as slow as it might be.
1079
1080 A soon-to-be-released version of this module will provide a mechanism
1081 for getting better performance in such cases, by deferring the writing
1082 until it can be done all at once.  This deferred writing feature might
1083 be enabled automagically if C<Tie::File> guesses that you are about to write many consecutive records.  To disable this feature, use
1084
1085         (tied @o)->autodefer(0);
1086
1087 (At present, this call does nothing.)
1088
1089 =item *
1090
1091 The behavior of tied arrays is not precisely the same as for regular
1092 arrays.  For example:
1093
1094         undef $a[10];  print "How unusual!\n" if $a[10];
1095
1096 C<undef>-ing a C<Tie::File> array element just blanks out the
1097 corresponding record in the file.  When you read it back again, you'll
1098 see the record separator (typically, $a[10] will appear to contain
1099 "\n") so the supposedly-C<undef>'ed value will be true.
1100
1101 There are other minor differences, but in general, the correspondence
1102 is extremely close.
1103
1104 =item *
1105
1106 Not quite every effort was made to make this module as efficient as
1107 possible.  C<FETCHSIZE> should use binary search instead of linear
1108 search.  The cache's LRU queue should be a heap instead of a list.
1109 These defects are probably minor; in any event, they will be fixed in
1110 a later version of the module.
1111
1112 =item *
1113
1114 The author has supposed that since this module is concerned with file
1115 I/O, almost all normal use of it will be heavily I/O bound, and that
1116 the time to maintain complicated data structures inside the module
1117 will be dominated by the time to actually perform the I/O.  This
1118 suggests, for example, that an LRU read-cache is a good tradeoff,
1119 even if it requires substantial adjustment following a C<splice>
1120 operation.
1121
1122 =back
1123
1124 =head1 WHAT ABOUT C<DB_File>?
1125
1126 C<DB_File>'s C<DB_RECNO> feature does something similar to
1127 C<Tie::File>, but there are a number of reasons that you might prefer
1128 C<Tie::File>.  C<DB_File> is a great piece of software, but the
1129 C<DB_RECNO> part is less great than the rest of it.
1130
1131 =over 4
1132
1133 =item *
1134
1135 C<DB_File> reads your entire file into memory, modifies it in memory,
1136 and the writes out the entire file again when you untie the file.
1137 This is completely impractical for large files.
1138
1139 C<Tie::File> does not do any of those things.  It doesn't try to read
1140 the entire file into memory; instead it uses a lazy approach and
1141 caches recently-used records.  The cache size is strictly bounded by
1142 the C<memory> option.  DB_File's C<-E<gt>{cachesize}> doesn't prevent
1143 your process from blowing up when reading a big file.
1144
1145 =item *
1146
1147 C<DB_File> has an extremely poor writing strategy.  If you have a
1148 ten-megabyte file and tie it with C<DB_File>, and then use
1149
1150         $a[0] =~ s/PERL/Perl/;
1151
1152 C<DB_file> will then read the entire ten-megabyte file into memory, do
1153 the change, and write the entire file back to disk, reading ten
1154 megabytes and writing ten megabytes.  C<Tie::File> will read and write
1155 only the first record.
1156
1157 If you have a million-record file and tie it with C<DB_File>, and then
1158 use
1159
1160         $a[999998] =~ s/Larry/Larry Wall/;
1161
1162 C<DB_File> will read the entire million-record file into memory, do
1163 the change, and write the entire file back to disk.  C<Tie::File> will
1164 only rewrite records 999998 and 999999.  During the writing process,
1165 it will never have more than a few kilobytes of data in memory at any
1166 time, even if the two records are very large.
1167
1168 =item *
1169
1170 Since changes to C<DB_File> files only appear when you do C<untie>, it
1171 can be inconvenient to arrange for concurrent access to the same file
1172 by two or more processes.  Each process needs to call C<$db-E<gt>sync>
1173 after every write.  When you change a C<Tie::File> array, the changes
1174 are reflected in the file immediately; no explicit C<-E<gt>sync> call
1175 is required.  (The forthcoming "deferred writing" mode will allow you
1176 to request that writes be held in memory until explicitly C<sync>'ed.)
1177
1178 =item *
1179
1180 C<DB_File> is only installed by default if you already have the C<db>
1181 library on your system; C<Tie::File> is pure Perl and is installed by
1182 default no matter what.  Starting with Perl 5.7.3 you can be
1183 absolutely sure it will be everywhere.  You will never have that
1184 surety with C<DB_File>.  If you don't have C<DB_File> yet, it requires
1185 a C compiler.  You can install C<Tie::File> from CPAN in five minutes
1186 with no compiler.
1187
1188 =item *
1189
1190 C<DB_File> is written in C, so if you aren't allowed to install
1191 modules on your system, it is useless.  C<Tie::File> is written in Perl,
1192 so even if you aren't allowed to install modules, you can look into
1193 the source code, see how it works, and copy the subroutines or the
1194 ideas from the subroutines directly into your own Perl program.
1195
1196 =item *
1197
1198 Except in very old, unsupported versions, C<DB_File>'s free license
1199 requires that you distribute the source code for your entire
1200 application.  If you are not able to distribute the source code for
1201 your application, you must negotiate an alternative license from
1202 Sleepycat, possibly for a fee.  Tie::File is under the Perl Artistic
1203 license and can be distributed free under the same terms as Perl
1204 itself.
1205
1206 =back
1207
1208 =head1 AUTHOR
1209
1210 Mark Jason Dominus
1211
1212 To contact the author, send email to: C<mjd-perl-tiefile+@plover.com>
1213
1214 To receive an announcement whenever a new version of this module is
1215 released, send a blank email message to
1216 C<mjd-perl-tiefile-subscribe@plover.com>.
1217
1218 =head1 LICENSE
1219
1220 C<Tie::File> version 0.20 is copyright (C) 2002 Mark Jason Dominus.
1221
1222 This library is free software; you may redistribute it and/or modify
1223 it under the same terms as Perl itself.
1224
1225 These terms include your choice of (1) the Perl Artistic Licence, or
1226 (2) version 2 of the GNU General Public License as published by the
1227 Free Software Foundation, or (3) any later version of the GNU General
1228 Public License.
1229
1230 This library is distributed in the hope that it will be useful,
1231 but WITHOUT ANY WARRANTY; without even the implied warranty of
1232 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1233 GNU General Public License for more details.
1234
1235 You should have received a copy of the GNU General Public License
1236 along with this library program; it should be in the file C<COPYING>.
1237 If not, write to the Free Software Foundation, Inc., 59 Temple Place,
1238 Suite 330, Boston, MA 02111 USA
1239
1240 For licensing inquiries, contact the author at:
1241
1242         Mark Jason Dominus
1243         255 S. Warnock St.
1244         Philadelphia, PA 19107
1245
1246 =head1 WARRANTY
1247
1248 C<Tie::File> version 0.20 comes with ABSOLUTELY NO WARRANTY.
1249 For details, see the license.
1250
1251 =head1 THANKS
1252
1253 Gigantic thanks to Jarkko Hietaniemi, for agreeing to put this in the
1254 core when I hadn't written it yet, and for generally being helpful,
1255 supportive, and competent.  (Usually the rule is "choose any one.")
1256 Also big thanks to Abhijit Menon-Sen for all of the same things.
1257
1258 Special thanks to Craig Berry (for VMS portability help), Randy Kobes
1259 (for Win32 portability help), Clinton Pierce and Autrijus Tang (for
1260 heroic eleventh-hour Win32 testing above and beyond the call of duty),
1261 and the rest of the CPAN testers (for testing generally).
1262
1263 More thanks to:
1264 Edward Avis /
1265 Gerrit Haase /
1266 Nikola Knezevic /
1267 Nick Ing-Simmons /
1268 Tassilo von Parseval /
1269 H. Dieter Pearcey /
1270 Slaven Rezic /
1271 Peter Somu /
1272 Tels
1273
1274 =head1 TODO
1275
1276 Test DELETE machinery more carefully.
1277
1278 More tests.  (C<mode> option.  _twrite should be tested separately,
1279 because there are a lot of weird special cases lurking in there.)
1280
1281 More tests.  (Stuff I didn't think of yet.)
1282
1283 Paragraph mode?
1284
1285 More tests.
1286
1287 Fixed-length mode.
1288
1289 Maybe an autolocking mode?
1290
1291 Finish deferred writing.
1292
1293 Autodeferment.
1294
1295 Record locking with fcntl()?  Then you might support an undo log and
1296 get real transactions.  What a coup that would be.
1297
1298 Leave-blanks mode
1299
1300 =cut
1301