1 package DBM::Deep::Engine;
8 use Fcntl qw( :DEFAULT :flock :seek );
11 # Setup file and tag signatures. These should never change.
13 sub SIG_FILE () { 'DPDB' }
14 sub SIG_HEADER () { 'h' }
15 sub SIG_INTERNAL () { 'i' }
16 sub SIG_HASH () { 'H' }
17 sub SIG_ARRAY () { 'A' }
18 sub SIG_NULL () { 'N' }
19 sub SIG_DATA () { 'D' }
20 sub SIG_INDEX () { 'I' }
21 sub SIG_BLIST () { 'B' }
22 sub SIG_FREE () { 'F' }
35 digest => \&Digest::MD5::md5,
39 # Maximum number of buckets per list before another level of indexing is
40 # done. Increase this value for slightly greater speed, but larger database
41 # files. DO NOT decrease this value below 16, due to risk of recursive
49 if ( defined $args->{pack_size} ) {
50 if ( lc $args->{pack_size} eq 'small' ) {
51 $args->{long_size} = 2;
52 $args->{long_pack} = 'S';
54 elsif ( lc $args->{pack_size} eq 'medium' ) {
55 $args->{long_size} = 4;
56 $args->{long_pack} = 'N';
58 elsif ( lc $args->{pack_size} eq 'large' ) {
59 $args->{long_size} = 8;
60 $args->{long_pack} = 'Q';
63 die "Unknown pack_size value: '$args->{pack_size}'\n";
67 # Grab the parameters we want to use
68 foreach my $param ( keys %$self ) {
69 next unless exists $args->{$param};
70 $self->{$param} = $args->{$param};
73 if ( $self->{max_buckets} < 16 ) {
74 warn "Floor of max_buckets is 16. Setting it to 16 from '$self->{max_buckets}'\n";
75 $self->{max_buckets} = 16;
81 sub _fileobj { return $_[0]{fileobj} }
82 sub _fh { return $_[0]->_fileobj->{fh} }
87 $self->{index_size} = (2**8) * $self->{long_size};
88 $self->{bucket_size} = $self->{hash_size} + $self->{long_size} * 2;
89 $self->{bucket_list_size} = $self->{max_buckets} * $self->{bucket_size};
94 sub write_file_header {
99 my $loc = $self->_request_space( length( SIG_FILE ) + 21 );
100 seek($fh, $loc + $self->_fileobj->{file_offset}, SEEK_SET);
104 pack('N', 1), # header version
105 pack('N', 12), # header size
106 pack('N', 0), # file version
107 pack('S', $self->{long_size}),
108 pack('A', $self->{long_pack}),
109 pack('S', $self->{data_size}),
110 pack('A', $self->{data_pack}),
111 pack('S', $self->{max_buckets}),
117 sub read_file_header {
122 seek($fh, 0 + $self->_fileobj->{file_offset}, SEEK_SET);
124 my $bytes_read = read( $fh, $buffer, length(SIG_FILE) + 9 );
126 return unless $bytes_read;
128 my ($file_signature, $sig_header, $header_version, $size) = unpack(
132 unless ( $file_signature eq SIG_FILE ) {
133 $self->{fileobj}->close;
134 $self->_throw_error( "Signature not found -- file is not a Deep DB" );
137 unless ( $sig_header eq SIG_HEADER ) {
138 $self->{fileobj}->close;
139 $self->_throw_error( "Old file version found." );
143 $bytes_read += read( $fh, $buffer2, $size );
144 my ($file_version, @values) = unpack( 'N S A S A S', $buffer2 );
145 if ( @values < 5 || grep { !defined } @values ) {
146 $self->{fileobj}->close;
147 $self->_throw_error("Corrupted file - bad header");
150 #XXX Add warnings if values weren't set right
151 @{$self}{qw(long_size long_pack data_size data_pack max_buckets)} = @values;
163 #XXX The duplication of calculate_sizes needs to go away
164 unless ( $obj->{base_offset} ) {
165 my $bytes_read = $self->read_file_header;
167 $self->calculate_sizes;
170 # File is empty -- write header and master index
173 $self->write_file_header;
175 $obj->{base_offset} = $self->_request_space( $self->tag_size( $self->{index_size} ) );
178 $obj->_base_offset, $obj->_type,
179 chr(0)x$self->{index_size},
182 # Flush the filehandle
183 my $old_fh = select $fh;
184 my $old_af = $|; $| = 1; $| = $old_af;
188 $obj->{base_offset} = $bytes_read;
191 # Get our type from master index header
193 my $tag = $self->load_tag($obj->_base_offset)
194 or $self->_throw_error("Corrupted file, no master index record");
196 unless ($obj->_type eq $tag->{signature}) {
197 $self->_throw_error("File type mismatch");
202 $self->calculate_sizes;
205 #XXX We have to make sure we don't mess up when autoflush isn't turned on
206 unless ( $self->_fileobj->{inode} ) {
207 my @stats = stat($fh);
208 $self->_fileobj->{inode} = $stats[1];
209 $self->_fileobj->{end} = $stats[7];
220 return SIG_SIZE + $self->{data_size} + $size;
225 # Given offset, signature and content, create tag and write to disk
228 my ($offset, $sig, $content) = @_;
229 my $size = length( $content );
233 if ( defined $offset ) {
234 seek($fh, $offset + $self->_fileobj->{file_offset}, SEEK_SET);
237 print( $fh $sig . pack($self->{data_pack}, $size) . $content );
239 return unless defined $offset;
244 offset => $offset + SIG_SIZE + $self->{data_size},
251 # Given offset, load single tag and return signature, size and data
256 # print join(':',map{$_||''}caller(1)), $/;
260 seek($fh, $offset + $self->_fileobj->{file_offset}, SEEK_SET);
262 #XXX I'm not sure this check will work if autoflush isn't enabled ...
266 read( $fh, $b, SIG_SIZE + $self->{data_size} );
267 my ($sig, $size) = unpack( "A $self->{data_pack}", $b );
270 read( $fh, $buffer, $size);
275 offset => $offset + SIG_SIZE + $self->{data_size},
280 sub _get_dbm_object {
285 if ($item->isa( 'DBM::Deep' )) {
292 my $r = Scalar::Util::reftype( $item ) || '';
293 if ( $r eq 'HASH' ) {
296 my $obj = tied(%$item);
297 if ($obj->isa( 'DBM::Deep' )) {
304 elsif ( $r eq 'ARRAY' ) {
307 my $obj = tied(@$item);
308 if ($obj->isa( 'DBM::Deep' )) {
321 my ($value, $key) = @_;
323 my $is_dbm_deep = eval {
324 local $SIG{'__DIE__'};
325 $value->isa( 'DBM::Deep' );
328 my $len = SIG_SIZE + $self->{data_size}
329 + $self->{data_size} + length( $key );
331 if ( $is_dbm_deep && $value->_fileobj eq $self->_fileobj ) {
332 return $len + $self->{long_size};
335 my $r = Scalar::Util::reftype( $value ) || '';
336 if ( $self->_fileobj->{autobless} ) {
337 # This is for the bit saying whether or not this thing is blessed.
341 unless ( $r eq 'HASH' || $r eq 'ARRAY' ) {
342 if ( defined $value ) {
343 $len += length( $value );
348 $len += $self->{index_size};
350 # if autobless is enabled, must also take into consideration
351 # the class name as it is stored after the key.
352 if ( $self->_fileobj->{autobless} ) {
353 my $c = Scalar::Util::blessed($value);
354 if ( defined $c && !$is_dbm_deep ) {
355 $len += $self->{data_size} + length($c);
364 # Adds one key/value pair to bucket list, given offset, MD5 digest of key,
365 # plain (undigested) key and value.
368 my ($tag, $md5, $plain_key, $value) = @_;
370 # This verifies that only supported values will be stored.
372 my $r = Scalar::Util::reftype( $value );
375 last if $r eq 'HASH';
376 last if $r eq 'ARRAY';
379 "Storage of variables of type '$r' is not supported."
386 my $root = $self->_fileobj;
389 my $actual_length = $self->_length_needed( $value, $plain_key );
391 my ($subloc, $offset, $size) = $self->_find_in_buckets( $tag, $md5 );
393 # $self->_release_space( $size, $subloc );
394 # Updating a known md5
395 #XXX This needs updating to use _release_space
399 if ($actual_length <= $size) {
403 $location = $self->_request_space( $actual_length );
406 $tag->{offset} + $offset
407 + $self->{hash_size} + $root->{file_offset},
410 print( $fh pack($self->{long_pack}, $location ) );
411 print( $fh pack($self->{long_pack}, $actual_length ) );
415 elsif ( defined $offset ) {
416 $location = $self->_request_space( $actual_length );
418 seek( $fh, $tag->{offset} + $offset + $root->{file_offset}, SEEK_SET );
419 print( $fh $md5 . pack($self->{long_pack}, $location ) );
420 print( $fh pack($self->{long_pack}, $actual_length ) );
422 # If bucket didn't fit into list, split into a new index level
423 # split_index() will do the _request_space() call
425 $location = $self->split_index( $md5, $tag );
428 $self->write_value( $location, $plain_key, $value );
435 my ($location, $key, $value) = @_;
438 my $root = $self->_fileobj;
440 my $dbm_deep_obj = _get_dbm_object( $value );
441 if ( $dbm_deep_obj && $dbm_deep_obj->_fileobj ne $self->_fileobj ) {
442 $self->_throw_error( "Cannot cross-reference. Use export() instead" );
445 seek($fh, $location + $root->{file_offset}, SEEK_SET);
448 # Write signature based on content type, set content length and write
451 my $r = Scalar::Util::reftype( $value ) || '';
452 if ( $dbm_deep_obj ) {
453 $self->write_tag( undef, SIG_INTERNAL,pack($self->{long_pack}, $dbm_deep_obj->_base_offset) );
455 elsif ($r eq 'HASH') {
456 if ( !$dbm_deep_obj && tied %{$value} ) {
457 $self->_throw_error( "Cannot store something that is tied" );
459 $self->write_tag( undef, SIG_HASH, chr(0)x$self->{index_size} );
461 elsif ($r eq 'ARRAY') {
462 if ( !$dbm_deep_obj && tied @{$value} ) {
463 $self->_throw_error( "Cannot store something that is tied" );
465 $self->write_tag( undef, SIG_ARRAY, chr(0)x$self->{index_size} );
467 elsif (!defined($value)) {
468 $self->write_tag( undef, SIG_NULL, '' );
471 $self->write_tag( undef, SIG_DATA, $value );
475 # Plain key is stored AFTER value, as keys are typically fetched less often.
477 print( $fh pack($self->{data_pack}, length($key)) . $key );
479 # Internal references don't care about autobless
480 return 1 if $dbm_deep_obj;
483 # If value is blessed, preserve class name
485 if ( $root->{autobless} ) {
486 my $c = Scalar::Util::blessed($value);
487 if ( defined $c && !$dbm_deep_obj ) {
489 print( $fh pack($self->{data_pack}, length($c)) . $c );
497 # Tie the passed in reference so that changes to it are reflected in the
498 # datafile. The use of $location as the base_offset will act as the
499 # the linkage between parent and child.
501 # The overall assignment is a hack around the fact that just tying doesn't
502 # store the values. This may not be the wrong thing to do.
506 tie %$value, 'DBM::Deep', {
507 base_offset => $location,
512 elsif ($r eq 'ARRAY') {
514 tie @$value, 'DBM::Deep', {
515 base_offset => $location,
526 my ($md5, $tag) = @_;
529 my $root = $self->_fileobj;
531 my $loc = $self->_request_space(
532 $self->tag_size( $self->{index_size} ),
535 seek($fh, $tag->{ref_loc} + $root->{file_offset}, SEEK_SET);
536 print( $fh pack($self->{long_pack}, $loc) );
538 my $index_tag = $self->write_tag(
540 chr(0)x$self->{index_size},
543 my $newtag_loc = $self->_request_space(
544 $self->tag_size( $self->{bucket_list_size} ),
547 my $keys = $tag->{content}
548 . $md5 . pack($self->{long_pack}, $newtag_loc)
549 . pack($self->{long_pack}, 0);
553 for (my $i = 0; $i <= $self->{max_buckets}; $i++) {
554 my ($key, $old_subloc, $size) = $self->_get_key_subloc( $keys, $i );
556 die "[INTERNAL ERROR]: No key in split_index()\n" unless $key;
557 die "[INTERNAL ERROR]: No subloc in split_index()\n" unless $old_subloc;
559 my $num = ord(substr($key, $tag->{ch} + 1, 1));
562 seek($fh, $newloc[$num] + $root->{file_offset}, SEEK_SET);
564 read( $fh, $subkeys, $self->{bucket_list_size});
566 # This is looking for the first empty spot
567 my ($subloc, $offset, $size) = $self->_find_in_buckets(
568 { content => $subkeys }, '',
571 seek($fh, $newloc[$num] + $offset + $root->{file_offset}, SEEK_SET);
572 print( $fh $key . pack($self->{long_pack}, $old_subloc) );
577 seek($fh, $index_tag->{offset} + ($num * $self->{long_size}) + $root->{file_offset}, SEEK_SET);
579 my $loc = $self->_request_space(
580 $self->tag_size( $self->{bucket_list_size} ),
583 print( $fh pack($self->{long_pack}, $loc) );
585 my $blist_tag = $self->write_tag(
587 chr(0)x$self->{bucket_list_size},
590 seek($fh, $blist_tag->{offset} + $root->{file_offset}, SEEK_SET);
591 print( $fh $key . pack($self->{long_pack}, $old_subloc) );
593 $newloc[$num] = $blist_tag->{offset};
596 $self->_release_space(
597 $self->tag_size( $self->{bucket_list_size} ),
598 $tag->{offset} - SIG_SIZE - $self->{data_size},
611 # Found match -- seek to offset and read signature
614 seek($fh, $subloc + $self->_fileobj->{file_offset}, SEEK_SET);
615 read( $fh, $signature, SIG_SIZE);
618 # If value is a hash or array, return new DBM::Deep object with correct offset
620 if (($signature eq SIG_HASH) || ($signature eq SIG_ARRAY)) {
621 my $new_obj = DBM::Deep->new({
623 base_offset => $subloc,
624 fileobj => $self->_fileobj,
627 if ($new_obj->_fileobj->{autobless}) {
629 # Skip over value and plain key to see if object needs
632 seek($fh, $self->{data_size} + $self->{index_size}, SEEK_CUR);
635 read( $fh, $size, $self->{data_size});
636 $size = unpack($self->{data_pack}, $size);
637 if ($size) { seek($fh, $size, SEEK_CUR); }
640 read( $fh, $bless_bit, 1);
641 if (ord($bless_bit)) {
643 # Yes, object needs to be re-blessed
646 read( $fh, $size, $self->{data_size});
647 $size = unpack($self->{data_pack}, $size);
648 if ($size) { read( $fh, $class_name, $size); }
649 if ($class_name) { $new_obj = bless( $new_obj, $class_name ); }
655 elsif ( $signature eq SIG_INTERNAL ) {
657 read( $fh, $size, $self->{data_size});
658 $size = unpack($self->{data_pack}, $size);
662 read( $fh, $new_loc, $size );
663 $new_loc = unpack( $self->{long_pack}, $new_loc );
665 return $self->read_from_loc( $new_loc );
672 # Otherwise return actual value
674 elsif ( $signature eq SIG_DATA ) {
676 read( $fh, $size, $self->{data_size});
677 $size = unpack($self->{data_pack}, $size);
680 if ($size) { read( $fh, $value, $size); }
685 # Key exists, but content is null
690 sub get_bucket_value {
692 # Fetch single value given tag and MD5 digested key.
695 my ($tag, $md5) = @_;
697 my ($subloc, $offset, $size) = $self->_find_in_buckets( $tag, $md5 );
699 return $self->read_from_loc( $subloc );
706 # Delete single key/value pair given tag and MD5 digested key.
709 my ($tag, $md5) = @_;
711 my ($subloc, $offset, $size) = $self->_find_in_buckets( $tag, $md5 );
712 #XXX This needs _release_space()
715 seek($fh, $tag->{offset} + $offset + $self->_fileobj->{file_offset}, SEEK_SET);
716 print( $fh substr($tag->{content}, $offset + $self->{bucket_size} ) );
717 print( $fh chr(0) x $self->{bucket_size} );
726 # Check existence of single key given tag and MD5 digested key.
729 my ($tag, $md5) = @_;
731 my ($subloc, $offset, $size) = $self->_find_in_buckets( $tag, $md5 );
735 sub find_bucket_list {
737 # Locate offset for bucket list, given digested key
740 my ($offset, $md5, $args) = @_;
741 $args = {} unless $args;
744 # Locate offset for bucket list using digest index system
746 my $tag = $self->load_tag( $offset )
747 or $self->_throw_error( "INTERNAL ERROR - Cannot find tag" );
750 while ($tag->{signature} ne SIG_BLIST) {
751 my $num = ord substr($md5, $ch, 1);
753 my $ref_loc = $tag->{offset} + ($num * $self->{long_size});
754 $tag = $self->index_lookup( $tag, $num );
757 return if !$args->{create};
759 my $loc = $self->_request_space(
760 $self->tag_size( $self->{bucket_list_size} ),
764 seek($fh, $ref_loc + $self->_fileobj->{file_offset}, SEEK_SET);
765 print( $fh pack($self->{long_pack}, $loc) );
767 $tag = $self->write_tag(
769 chr(0)x$self->{bucket_list_size},
772 $tag->{ref_loc} = $ref_loc;
779 $tag->{ref_loc} = $ref_loc;
787 # Given index tag, lookup single entry in index and return .
790 my ($tag, $index) = @_;
792 my $location = unpack(
796 $index * $self->{long_size},
801 if (!$location) { return; }
803 return $self->load_tag( $location );
808 # Scan index and recursively step into deeper levels, looking for next key.
811 my ($obj, $offset, $ch, $force_return_next) = @_;
813 my $tag = $self->load_tag( $offset );
817 if ($tag->{signature} ne SIG_BLIST) {
818 my $content = $tag->{content};
819 my $start = $obj->{return_next} ? 0 : ord(substr($obj->{prev_md5}, $ch, 1));
821 for (my $idx = $start; $idx < (2**8); $idx++) {
826 $idx * $self->{long_size},
832 my $result = $self->traverse_index(
833 $obj, $subloc, $ch + 1, $force_return_next,
836 if (defined($result)) { return $result; }
840 $obj->{return_next} = 1;
844 my $keys = $tag->{content};
845 if ($force_return_next) { $obj->{return_next} = 1; }
848 # Iterate through buckets, looking for a key match
850 for (my $i = 0; $i < $self->{max_buckets}; $i++) {
851 my ($key, $subloc) = $self->_get_key_subloc( $keys, $i );
853 # End of bucket list -- return to outer loop
855 $obj->{return_next} = 1;
858 # Located previous key -- return next one found
859 elsif ($key eq $obj->{prev_md5}) {
860 $obj->{return_next} = 1;
863 # Seek to bucket location and skip over signature
864 elsif ($obj->{return_next}) {
865 seek($fh, $subloc + $self->_fileobj->{file_offset}, SEEK_SET);
867 # Skip over value to get to plain key
869 read( $fh, $sig, SIG_SIZE );
872 read( $fh, $size, $self->{data_size});
873 $size = unpack($self->{data_pack}, $size);
874 if ($size) { seek($fh, $size, SEEK_CUR); }
876 # Read in plain key and return as scalar
878 read( $fh, $size, $self->{data_size});
879 $size = unpack($self->{data_pack}, $size);
880 if ($size) { read( $fh, $plain_key, $size); }
886 $obj->{return_next} = 1;
887 } # tag is a bucket list
894 # Locate next key, given digested previous one
899 $obj->{prev_md5} = $_[1] ? $_[1] : undef;
900 $obj->{return_next} = 0;
903 # If the previous key was not specifed, start at the top and
904 # return the first one found.
906 if (!$obj->{prev_md5}) {
907 $obj->{prev_md5} = chr(0) x $self->{hash_size};
908 $obj->{return_next} = 1;
911 return $self->traverse_index( $obj, $obj->_base_offset, 0 );
916 sub _get_key_subloc {
918 my ($keys, $idx) = @_;
920 my ($key, $subloc, $size) = unpack(
921 "a$self->{hash_size} $self->{long_pack} $self->{long_pack}",
924 ($idx * $self->{bucket_size}),
925 $self->{bucket_size},
929 return ($key, $subloc, $size);
932 sub _find_in_buckets {
934 my ($tag, $md5) = @_;
937 for ( my $i = 0; $i < $self->{max_buckets}; $i++ ) {
938 my ($key, $subloc, $size) = $self->_get_key_subloc(
942 return ($subloc, $i * $self->{bucket_size}, $size) unless $subloc;
944 next BUCKET if $key ne $md5;
946 return ($subloc, $i * $self->{bucket_size}, $size);
956 my $loc = $self->_fileobj->{end};
957 $self->_fileobj->{end} += $size;
964 my ($size, $loc) = @_;
969 seek( $fh, $loc + $self->_fileobj->{file_offset}, SEEK_SET );
971 . pack($self->{long_pack}, $size )
972 . pack($self->{long_pack}, $next_loc )
979 die "DBM::Deep: $_[1]\n";
985 # This will be added in later, after more refactoring is done. This is an early
986 # attempt at refactoring on the physical level instead of the virtual level.
989 my ($spot, $amount, $unpack) = @_;
992 seek( $fh, $spot + $self->_fileobj->{file_offset}, SEEK_SET );
995 my $bytes_read = read( $fh, $buffer, $amount );
998 $buffer = unpack( $unpack, $buffer );
1002 return ($buffer, $bytes_read);
1011 my ($spot, $data) = @_;
1013 my $fh = $self->_fh;
1014 seek( $fh, $spot, SEEK_SET );
1020 sub get_file_version {
1023 my $fh = $self->_fh;
1025 seek( $fh, 13 + $self->_fileobj->{file_offset}, SEEK_SET );
1027 my $bytes_read = read( $fh, $buffer, 4 );
1028 unless ( $bytes_read == 4 ) {
1029 $self->_throw_error( "Cannot read file version" );
1032 return unpack( 'N', $buffer );
1035 sub write_file_version {
1037 my ($new_version) = @_;
1039 my $fh = $self->_fh;
1041 seek( $fh, 13 + $self->_fileobj->{file_offset}, SEEK_SET );
1042 print( $fh pack( 'N', $new_version ) );