1 package DBM::Deep::Engine;
8 use Fcntl qw( :DEFAULT :flock :seek );
11 # Setup file and tag signatures. These should never change.
13 sub SIG_FILE () { 'DPDB' }
14 sub SIG_HEADER () { 'h' }
15 sub SIG_INTERNAL () { 'i' }
16 sub SIG_HASH () { 'H' }
17 sub SIG_ARRAY () { 'A' }
18 sub SIG_NULL () { 'N' }
19 sub SIG_DATA () { 'D' }
20 sub SIG_INDEX () { 'I' }
21 sub SIG_BLIST () { 'B' }
22 sub SIG_FREE () { 'F' }
35 digest => \&Digest::MD5::md5,
39 # Maximum number of buckets per list before another level of indexing is
40 # done. Increase this value for slightly greater speed, but larger database
41 # files. DO NOT decrease this value below 16, due to risk of recursive
49 if ( defined $args->{pack_size} ) {
50 if ( lc $args->{pack_size} eq 'small' ) {
51 $args->{long_size} = 2;
52 $args->{long_pack} = 'S';
54 elsif ( lc $args->{pack_size} eq 'medium' ) {
55 $args->{long_size} = 4;
56 $args->{long_pack} = 'N';
58 elsif ( lc $args->{pack_size} eq 'large' ) {
59 $args->{long_size} = 8;
60 $args->{long_pack} = 'Q';
63 die "Unknown pack_size value: '$args->{pack_size}'\n";
67 # Grab the parameters we want to use
68 foreach my $param ( keys %$self ) {
69 next unless exists $args->{$param};
70 $self->{$param} = $args->{$param};
73 if ( $self->{max_buckets} < 16 ) {
74 warn "Floor of max_buckets is 16. Setting it to 16 from '$self->{max_buckets}'\n";
75 $self->{max_buckets} = 16;
81 sub _fileobj { return $_[0]{fileobj} }
82 sub _fh { return $_[0]->_fileobj->{fh} }
87 #XXX Does this need to be updated with different hashing algorithms?
88 $self->{index_size} = (2**8) * $self->{long_size};
89 #ACID This needs modified - DONE
90 $self->{bucket_size} = $self->{hash_size} + $self->{long_size} * 3;
91 $self->{bucket_list_size} = $self->{max_buckets} * $self->{bucket_size};
96 sub write_file_header {
101 my $loc = $self->_request_space( length( SIG_FILE ) + 21 );
102 seek($fh, $loc + $self->_fileobj->{file_offset}, SEEK_SET);
106 pack('N', 1), # header version
107 pack('N', 12), # header size
108 pack('N', 0), # file version
109 pack('S', $self->{long_size}),
110 pack('A', $self->{long_pack}),
111 pack('S', $self->{data_size}),
112 pack('A', $self->{data_pack}),
113 pack('S', $self->{max_buckets}),
119 sub read_file_header {
124 seek($fh, 0 + $self->_fileobj->{file_offset}, SEEK_SET);
126 my $bytes_read = read( $fh, $buffer, length(SIG_FILE) + 9 );
128 return unless $bytes_read;
130 my ($file_signature, $sig_header, $header_version, $size) = unpack(
134 unless ( $file_signature eq SIG_FILE ) {
135 $self->{fileobj}->close;
136 $self->_throw_error( "Signature not found -- file is not a Deep DB" );
139 unless ( $sig_header eq SIG_HEADER ) {
140 $self->{fileobj}->close;
141 $self->_throw_error( "Old file version found." );
145 $bytes_read += read( $fh, $buffer2, $size );
146 my ($file_version, @values) = unpack( 'N S A S A S', $buffer2 );
147 if ( @values < 5 || grep { !defined } @values ) {
148 $self->{fileobj}->close;
149 $self->_throw_error("Corrupted file - bad header");
152 #XXX Add warnings if values weren't set right
153 @{$self}{qw(long_size long_pack data_size data_pack max_buckets)} = @values;
165 #XXX The duplication of calculate_sizes needs to go away
166 unless ( $obj->{base_offset} ) {
167 my $bytes_read = $self->read_file_header;
169 $self->calculate_sizes;
172 # File is empty -- write header and master index
175 $self->write_file_header;
177 $obj->{base_offset} = $self->_request_space( $self->tag_size( $self->{index_size} ) );
180 $obj->_base_offset, $obj->_type,
181 chr(0)x$self->{index_size},
184 # Flush the filehandle
185 my $old_fh = select $fh;
186 my $old_af = $|; $| = 1; $| = $old_af;
190 $obj->{base_offset} = $bytes_read;
193 # Get our type from master index header
195 my $tag = $self->load_tag($obj->_base_offset)
196 or $self->_throw_error("Corrupted file, no master index record");
198 unless ($obj->_type eq $tag->{signature}) {
199 $self->_throw_error("File type mismatch");
204 $self->calculate_sizes;
207 #XXX We have to make sure we don't mess up when autoflush isn't turned on
208 unless ( $self->_fileobj->{inode} ) {
209 my @stats = stat($fh);
210 $self->_fileobj->{inode} = $stats[1];
211 $self->_fileobj->{end} = $stats[7];
222 return SIG_SIZE + $self->{data_size} + $size;
227 # Given offset, signature and content, create tag and write to disk
230 my ($offset, $sig, $content) = @_;
231 my $size = length( $content );
235 if ( defined $offset ) {
236 seek($fh, $offset + $self->_fileobj->{file_offset}, SEEK_SET);
239 print( $fh $sig . pack($self->{data_pack}, $size) . $content );
241 return unless defined $offset;
246 offset => $offset + SIG_SIZE + $self->{data_size},
253 # Given offset, load single tag and return signature, size and data
258 # print join(':',map{$_||''}caller(1)), $/;
262 seek($fh, $offset + $self->_fileobj->{file_offset}, SEEK_SET);
264 #XXX I'm not sure this check will work if autoflush isn't enabled ...
268 read( $fh, $b, SIG_SIZE + $self->{data_size} );
269 my ($sig, $size) = unpack( "A $self->{data_pack}", $b );
272 read( $fh, $buffer, $size);
277 offset => $offset + SIG_SIZE + $self->{data_size},
282 sub _get_dbm_object {
287 if ($item->isa( 'DBM::Deep' )) {
294 my $r = Scalar::Util::reftype( $item ) || '';
295 if ( $r eq 'HASH' ) {
298 my $obj = tied(%$item);
299 if ($obj->isa( 'DBM::Deep' )) {
306 elsif ( $r eq 'ARRAY' ) {
309 my $obj = tied(@$item);
310 if ($obj->isa( 'DBM::Deep' )) {
323 my ($value, $key) = @_;
325 my $is_dbm_deep = eval {
326 local $SIG{'__DIE__'};
327 $value->isa( 'DBM::Deep' );
330 my $len = SIG_SIZE + $self->{data_size}
331 + $self->{data_size} + length( $key );
333 if ( $is_dbm_deep && $value->_fileobj eq $self->_fileobj ) {
334 return $len + $self->{long_size};
337 my $r = Scalar::Util::reftype( $value ) || '';
338 if ( $self->_fileobj->{autobless} ) {
339 # This is for the bit saying whether or not this thing is blessed.
343 unless ( $r eq 'HASH' || $r eq 'ARRAY' ) {
344 if ( defined $value ) {
345 $len += length( $value );
350 $len += $self->{index_size};
352 # if autobless is enabled, must also take into consideration
353 # the class name as it is stored after the key.
354 if ( $self->_fileobj->{autobless} ) {
355 my $c = Scalar::Util::blessed($value);
356 if ( defined $c && !$is_dbm_deep ) {
357 $len += $self->{data_size} + length($c);
366 # Adds one key/value pair to bucket list, given offset, MD5 digest of key,
367 # plain (undigested) key and value.
370 my ($tag, $md5, $plain_key, $value) = @_;
372 # This verifies that only supported values will be stored.
374 my $r = Scalar::Util::reftype( $value );
377 last if $r eq 'HASH';
378 last if $r eq 'ARRAY';
381 "Storage of variables of type '$r' is not supported."
388 my $root = $self->_fileobj;
391 my $actual_length = $self->_length_needed( $value, $plain_key );
393 my ($subloc, $offset, $size) = $self->_find_in_buckets( $tag, $md5 );
395 print "$subloc - $offset - $size\n";
396 # $self->_release_space( $size, $subloc );
397 # Updating a known md5
398 #XXX This needs updating to use _release_space
402 if ($actual_length <= $size) {
406 $location = $self->_request_space( $actual_length );
409 $tag->{offset} + $offset
410 + $self->{hash_size} + $root->{file_offset},
413 print( $fh pack($self->{long_pack}, $location ) );
414 print( $fh pack($self->{long_pack}, $actual_length ) );
415 print( $fh pack($self->{long_pack}, $root->transaction_id ) );
419 elsif ( defined $offset ) {
420 $location = $self->_request_space( $actual_length );
422 seek( $fh, $tag->{offset} + $offset + $root->{file_offset}, SEEK_SET );
423 print( $fh $md5 . pack($self->{long_pack}, $location ) );
424 print( $fh pack($self->{long_pack}, $actual_length ) );
425 print( $fh pack($self->{long_pack}, $root->transaction_id ) );
427 # If bucket didn't fit into list, split into a new index level
428 # split_index() will do the _request_space() call
430 $location = $self->split_index( $md5, $tag );
433 $self->write_value( $location, $plain_key, $value );
440 my ($location, $key, $value) = @_;
443 my $root = $self->_fileobj;
445 my $dbm_deep_obj = _get_dbm_object( $value );
446 if ( $dbm_deep_obj && $dbm_deep_obj->_fileobj ne $self->_fileobj ) {
447 $self->_throw_error( "Cannot cross-reference. Use export() instead" );
450 seek($fh, $location + $root->{file_offset}, SEEK_SET);
453 # Write signature based on content type, set content length and write
456 my $r = Scalar::Util::reftype( $value ) || '';
457 if ( $dbm_deep_obj ) {
458 $self->write_tag( undef, SIG_INTERNAL,pack($self->{long_pack}, $dbm_deep_obj->_base_offset) );
460 elsif ($r eq 'HASH') {
461 if ( !$dbm_deep_obj && tied %{$value} ) {
462 $self->_throw_error( "Cannot store something that is tied" );
464 $self->write_tag( undef, SIG_HASH, chr(0)x$self->{index_size} );
466 elsif ($r eq 'ARRAY') {
467 if ( !$dbm_deep_obj && tied @{$value} ) {
468 $self->_throw_error( "Cannot store something that is tied" );
470 $self->write_tag( undef, SIG_ARRAY, chr(0)x$self->{index_size} );
472 elsif (!defined($value)) {
473 $self->write_tag( undef, SIG_NULL, '' );
476 $self->write_tag( undef, SIG_DATA, $value );
480 # Plain key is stored AFTER value, as keys are typically fetched less often.
482 print( $fh pack($self->{data_pack}, length($key)) . $key );
484 # Internal references don't care about autobless
485 return 1 if $dbm_deep_obj;
488 # If value is blessed, preserve class name
490 if ( $root->{autobless} ) {
491 my $c = Scalar::Util::blessed($value);
492 if ( defined $c && !$dbm_deep_obj ) {
494 print( $fh pack($self->{data_pack}, length($c)) . $c );
502 # Tie the passed in reference so that changes to it are reflected in the
503 # datafile. The use of $location as the base_offset will act as the
504 # the linkage between parent and child.
506 # The overall assignment is a hack around the fact that just tying doesn't
507 # store the values. This may not be the wrong thing to do.
511 tie %$value, 'DBM::Deep', {
512 base_offset => $location,
517 elsif ($r eq 'ARRAY') {
519 tie @$value, 'DBM::Deep', {
520 base_offset => $location,
531 my ($md5, $tag) = @_;
534 my $root = $self->_fileobj;
536 my $loc = $self->_request_space(
537 $self->tag_size( $self->{index_size} ),
540 seek($fh, $tag->{ref_loc} + $root->{file_offset}, SEEK_SET);
541 print( $fh pack($self->{long_pack}, $loc) );
543 my $index_tag = $self->write_tag(
545 chr(0)x$self->{index_size},
548 my $newtag_loc = $self->_request_space(
549 $self->tag_size( $self->{bucket_list_size} ),
552 my $keys = $tag->{content}
553 . $md5 . pack($self->{long_pack}, $newtag_loc)
554 . pack($self->{long_pack}, 0) # size
555 . pack($self->{long_pack}, 0); # transaction #
559 for (my $i = 0; $i <= $self->{max_buckets}; $i++) {
560 my ($key, $old_subloc, $size) = $self->_get_key_subloc( $keys, $i );
562 die "[INTERNAL ERROR]: No key in split_index()\n" unless $key;
563 die "[INTERNAL ERROR]: No subloc in split_index()\n" unless $old_subloc;
565 my $num = ord(substr($key, $tag->{ch} + 1, 1));
568 seek($fh, $newloc[$num] + $root->{file_offset}, SEEK_SET);
570 read( $fh, $subkeys, $self->{bucket_list_size});
572 # This is looking for the first empty spot
573 my ($subloc, $offset, $size) = $self->_find_in_buckets(
574 { content => $subkeys }, '',
577 seek($fh, $newloc[$num] + $offset + $root->{file_offset}, SEEK_SET);
578 print( $fh $key . pack($self->{long_pack}, $old_subloc) );
583 seek($fh, $index_tag->{offset} + ($num * $self->{long_size}) + $root->{file_offset}, SEEK_SET);
585 my $loc = $self->_request_space(
586 $self->tag_size( $self->{bucket_list_size} ),
589 print( $fh pack($self->{long_pack}, $loc) );
591 my $blist_tag = $self->write_tag(
593 chr(0)x$self->{bucket_list_size},
596 seek($fh, $blist_tag->{offset} + $root->{file_offset}, SEEK_SET);
597 print( $fh $key . pack($self->{long_pack}, $old_subloc) );
599 $newloc[$num] = $blist_tag->{offset};
602 $self->_release_space(
603 $self->tag_size( $self->{bucket_list_size} ),
604 $tag->{offset} - SIG_SIZE - $self->{data_size},
617 # Found match -- seek to offset and read signature
620 seek($fh, $subloc + $self->_fileobj->{file_offset}, SEEK_SET);
621 read( $fh, $signature, SIG_SIZE);
624 # If value is a hash or array, return new DBM::Deep object with correct offset
626 if (($signature eq SIG_HASH) || ($signature eq SIG_ARRAY)) {
627 my $new_obj = DBM::Deep->new({
629 base_offset => $subloc,
630 fileobj => $self->_fileobj,
633 if ($new_obj->_fileobj->{autobless}) {
635 # Skip over value and plain key to see if object needs
638 seek($fh, $self->{data_size} + $self->{index_size}, SEEK_CUR);
641 read( $fh, $size, $self->{data_size});
642 $size = unpack($self->{data_pack}, $size);
643 if ($size) { seek($fh, $size, SEEK_CUR); }
646 read( $fh, $bless_bit, 1);
647 if (ord($bless_bit)) {
649 # Yes, object needs to be re-blessed
652 read( $fh, $size, $self->{data_size});
653 $size = unpack($self->{data_pack}, $size);
654 if ($size) { read( $fh, $class_name, $size); }
655 if ($class_name) { $new_obj = bless( $new_obj, $class_name ); }
661 elsif ( $signature eq SIG_INTERNAL ) {
663 read( $fh, $size, $self->{data_size});
664 $size = unpack($self->{data_pack}, $size);
668 read( $fh, $new_loc, $size );
669 $new_loc = unpack( $self->{long_pack}, $new_loc );
671 return $self->read_from_loc( $new_loc );
678 # Otherwise return actual value
680 elsif ( $signature eq SIG_DATA ) {
682 read( $fh, $size, $self->{data_size});
683 $size = unpack($self->{data_pack}, $size);
686 if ($size) { read( $fh, $value, $size); }
691 # Key exists, but content is null
696 sub get_bucket_value {
698 # Fetch single value given tag and MD5 digested key.
701 my ($tag, $md5) = @_;
703 my ($subloc, $offset, $size) = $self->_find_in_buckets( $tag, $md5 );
705 return $self->read_from_loc( $subloc );
712 # Delete single key/value pair given tag and MD5 digested key.
715 my ($tag, $md5) = @_;
717 my ($subloc, $offset, $size) = $self->_find_in_buckets( $tag, $md5 );
718 #XXX This needs _release_space()
721 seek($fh, $tag->{offset} + $offset + $self->_fileobj->{file_offset}, SEEK_SET);
722 print( $fh substr($tag->{content}, $offset + $self->{bucket_size} ) );
723 print( $fh chr(0) x $self->{bucket_size} );
732 # Check existence of single key given tag and MD5 digested key.
735 my ($tag, $md5) = @_;
737 my ($subloc, $offset, $size) = $self->_find_in_buckets( $tag, $md5 );
741 sub find_bucket_list {
743 # Locate offset for bucket list, given digested key
746 my ($offset, $md5, $args) = @_;
747 $args = {} unless $args;
750 # Locate offset for bucket list using digest index system
752 my $tag = $self->load_tag( $offset )
753 or $self->_throw_error( "INTERNAL ERROR - Cannot find tag" );
756 while ($tag->{signature} ne SIG_BLIST) {
757 my $num = ord substr($md5, $ch, 1);
759 my $ref_loc = $tag->{offset} + ($num * $self->{long_size});
760 $tag = $self->index_lookup( $tag, $num );
763 return if !$args->{create};
765 my $loc = $self->_request_space(
766 $self->tag_size( $self->{bucket_list_size} ),
770 seek($fh, $ref_loc + $self->_fileobj->{file_offset}, SEEK_SET);
771 print( $fh pack($self->{long_pack}, $loc) );
773 $tag = $self->write_tag(
775 chr(0)x$self->{bucket_list_size},
778 $tag->{ref_loc} = $ref_loc;
785 $tag->{ref_loc} = $ref_loc;
793 # Given index tag, lookup single entry in index and return .
796 my ($tag, $index) = @_;
798 my $location = unpack(
802 $index * $self->{long_size},
807 if (!$location) { return; }
809 return $self->load_tag( $location );
814 # Scan index and recursively step into deeper levels, looking for next key.
817 my ($obj, $offset, $ch, $force_return_next) = @_;
819 my $tag = $self->load_tag( $offset );
823 if ($tag->{signature} ne SIG_BLIST) {
824 my $content = $tag->{content};
825 my $start = $obj->{return_next} ? 0 : ord(substr($obj->{prev_md5}, $ch, 1));
827 for (my $idx = $start; $idx < (2**8); $idx++) {
832 $idx * $self->{long_size},
838 my $result = $self->traverse_index(
839 $obj, $subloc, $ch + 1, $force_return_next,
842 if (defined($result)) { return $result; }
846 $obj->{return_next} = 1;
850 my $keys = $tag->{content};
851 if ($force_return_next) { $obj->{return_next} = 1; }
854 # Iterate through buckets, looking for a key match
856 for (my $i = 0; $i < $self->{max_buckets}; $i++) {
857 my ($key, $subloc) = $self->_get_key_subloc( $keys, $i );
859 # End of bucket list -- return to outer loop
861 $obj->{return_next} = 1;
864 # Located previous key -- return next one found
865 elsif ($key eq $obj->{prev_md5}) {
866 $obj->{return_next} = 1;
869 # Seek to bucket location and skip over signature
870 elsif ($obj->{return_next}) {
871 seek($fh, $subloc + $self->_fileobj->{file_offset}, SEEK_SET);
873 # Skip over value to get to plain key
875 read( $fh, $sig, SIG_SIZE );
878 read( $fh, $size, $self->{data_size});
879 $size = unpack($self->{data_pack}, $size);
880 if ($size) { seek($fh, $size, SEEK_CUR); }
882 # Read in plain key and return as scalar
884 read( $fh, $size, $self->{data_size});
885 $size = unpack($self->{data_pack}, $size);
886 if ($size) { read( $fh, $plain_key, $size); }
892 $obj->{return_next} = 1;
893 } # tag is a bucket list
900 # Locate next key, given digested previous one
905 $obj->{prev_md5} = $_[1] ? $_[1] : undef;
906 $obj->{return_next} = 0;
909 # If the previous key was not specifed, start at the top and
910 # return the first one found.
912 if (!$obj->{prev_md5}) {
913 $obj->{prev_md5} = chr(0) x $self->{hash_size};
914 $obj->{return_next} = 1;
917 return $self->traverse_index( $obj, $obj->_base_offset, 0 );
922 #ACID This needs modified - DONE
923 sub _get_key_subloc {
925 my ($keys, $idx) = @_;
927 my ($key, $subloc, $size, $transaction) = unpack(
928 # This is 'a', not 'A'. Please read the pack() documentation for the
929 # difference between the two and why it's important.
930 "a$self->{hash_size} $self->{long_pack} $self->{long_pack} $self->{long_pack}",
933 ($idx * $self->{bucket_size}),
934 $self->{bucket_size},
938 return ($key, $subloc, $size, $transaction);
941 sub _find_in_buckets {
943 my ($tag, $md5) = @_;
945 my $trans_id = $self->_fileobj->transaction_id;
948 for ( my $i = 0; $i < $self->{max_buckets}; $i++ ) {
949 my ($key, $subloc, $size, $transaction_id) = $self->_get_key_subloc(
953 return ($subloc, $i * $self->{bucket_size}, $size) unless $subloc;
955 next BUCKET if $key ne $md5 || $transaction_id != $trans_id;
957 return ($subloc, $i * $self->{bucket_size}, $size);
967 my $loc = $self->_fileobj->{end};
968 $self->_fileobj->{end} += $size;
975 my ($size, $loc) = @_;
980 seek( $fh, $loc + $self->_fileobj->{file_offset}, SEEK_SET );
982 . pack($self->{long_pack}, $size )
983 . pack($self->{long_pack}, $next_loc )
990 die "DBM::Deep: $_[1]\n";
996 # This will be added in later, after more refactoring is done. This is an early
997 # attempt at refactoring on the physical level instead of the virtual level.
1000 my ($spot, $amount, $unpack) = @_;
1002 my $fh = $self->_fh;
1003 seek( $fh, $spot + $self->_fileobj->{file_offset}, SEEK_SET );
1006 my $bytes_read = read( $fh, $buffer, $amount );
1009 $buffer = unpack( $unpack, $buffer );
1013 return ($buffer, $bytes_read);
1022 my ($spot, $data) = @_;
1024 my $fh = $self->_fh;
1025 seek( $fh, $spot, SEEK_SET );
1031 sub get_file_version {
1034 my $fh = $self->_fh;
1036 seek( $fh, 13 + $self->_fileobj->{file_offset}, SEEK_SET );
1038 my $bytes_read = read( $fh, $buffer, 4 );
1039 unless ( $bytes_read == 4 ) {
1040 $self->_throw_error( "Cannot read file version" );
1043 return unpack( 'N', $buffer );
1046 sub write_file_version {
1048 my ($new_version) = @_;
1050 my $fh = $self->_fh;
1052 seek( $fh, 13 + $self->_fileobj->{file_offset}, SEEK_SET );
1053 print( $fh pack( 'N', $new_version ) );