X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FDBM%2FDeep%2FEngine.pm;h=7c8d13949939c6bbe922b98c0816b99cf9c71e35;hb=a21f2d90935286a81dbaa6299707e140060e52d3;hp=016bd16f1d80785d00827d39a9a96631dd9da613;hpb=251dfd0e8f354bcd692c222d0c10178bd4be95a5;p=dbsrgits%2FDBM-Deep.git diff --git a/lib/DBM/Deep/Engine.pm b/lib/DBM/Deep/Engine.pm index 016bd16..7c8d139 100644 --- a/lib/DBM/Deep/Engine.pm +++ b/lib/DBM/Deep/Engine.pm @@ -4,29 +4,10 @@ use strict; use Fcntl qw( :DEFAULT :flock :seek ); -## -# Set to 4 and 'N' for 32-bit offset tags (default). Theoretical limit of 4 GB per file. -# (Perl must be compiled with largefile support for files > 2 GB) -# -# Set to 8 and 'Q' for 64-bit offsets. Theoretical limit of 16 XB per file. -# (Perl must be compiled with largefile and 64-bit long support) -## -## -# Set to 4 and 'N' for 32-bit data length prefixes. Limit of 4 GB for each key/value. -# Upgrading this is possible (see above) but probably not necessary. If you need -# more than 4 GB for a single key or value, this module is really not for you :-) -## - -## -# Maximum number of buckets per list before another level of indexing is done. -# Increase this value for slightly greater speed, but larger database files. -# DO NOT decrease this value below 16, due to risk of recursive reindex overrun. -## - sub precalc_sizes { - ## - # Precalculate index, bucket and bucket list sizes - ## + ## + # Precalculate index, bucket and bucket list sizes + ## my $self = shift; $self->{index_size} = (2**8) * $self->{long_size}; @@ -37,32 +18,44 @@ sub precalc_sizes { } sub set_pack { - ## - # Set pack/unpack modes (see file header for more) - ## + ## + # Set pack/unpack modes (see file header for more) + ## my $self = shift; my ($long_s, $long_p, $data_s, $data_p) = @_; + ## + # Set to 4 and 'N' for 32-bit offset tags (default). Theoretical limit of 4 GB per file. + # (Perl must be compiled with largefile support for files > 2 GB) + # + # Set to 8 and 'Q' for 64-bit offsets. Theoretical limit of 16 XB per file. + # (Perl must be compiled with largefile and 64-bit long support) + ## $self->{long_size} = $long_s ? $long_s : 4; $self->{long_pack} = $long_p ? $long_p : 'N'; + ## + # Set to 4 and 'N' for 32-bit data length prefixes. Limit of 4 GB for each key/value. + # Upgrading this is possible (see above) but probably not necessary. If you need + # more than 4 GB for a single key or value, this module is really not for you :-) + ## $self->{data_size} = $data_s ? $data_s : 4; $self->{data_pack} = $data_p ? $data_p : 'N'; - return $self->precalc_sizes(); + return $self->precalc_sizes(); } sub set_digest { - ## - # Set key digest function (default is MD5) - ## + ## + # Set key digest function (default is MD5) + ## my $self = shift; my ($digest_func, $hash_size) = @_; - $self->{digest} = $digest_func ? $digest_func : \&Digest::MD5::md5; + $self->{digest} = $digest_func ? $digest_func : \&Digest::MD5::md5; $self->{hash_size} = $hash_size ? $hash_size : 16; - return $self->precalc_sizes(); + return $self->precalc_sizes(); } sub new { @@ -78,6 +71,11 @@ sub new { digest => \&Digest::MD5::md5, hash_size => 16, + ## + # Maximum number of buckets per list before another level of indexing is done. + # Increase this value for slightly greater speed, but larger database files. + # DO NOT decrease this value below 16, due to risk of recursive reindex overrun. + ## max_buckets => 16, }, $class; @@ -110,7 +108,7 @@ sub open { my $self = shift; my ($obj) = @_; - if (defined($obj->_fh)) { $self->close( $obj ); } + if (defined($obj->_fh)) { $self->close_fh( $obj ); } eval { local $SIG{'__DIE__'}; @@ -150,6 +148,7 @@ sub open { if (!$bytes_read) { seek($fh, 0 + $obj->_root->{file_offset}, SEEK_SET); print( $fh DBM::Deep->SIG_FILE); + $self->create_tag($obj, $obj->_base_offset, $obj->_type, chr(0) x $self->{index_size}); my $plain_key = "[base]"; @@ -167,7 +166,7 @@ sub open { # Check signature was valid ## unless ($signature eq DBM::Deep->SIG_FILE) { - $self->close( $obj ); + $self->close_fh( $obj ); return $obj->_throw_error("Signature not found -- file is not a Deep DB"); } @@ -175,23 +174,23 @@ sub open { # Get our type from master index signature ## my $tag = $self->load_tag($obj, $obj->_base_offset); - -#XXX We probably also want to store the hash algorithm name and not assume anything -#XXX The cool thing would be to allow a different hashing algorithm at every level - if (!$tag) { return $obj->_throw_error("Corrupted file, no master index record"); } + if ($obj->{type} ne $tag->{signature}) { return $obj->_throw_error("File type mismatch"); } +#XXX We probably also want to store the hash algorithm name and not assume anything +#XXX The cool thing would be to allow a different hashing algorithm at every level + return 1; } -sub close { +sub close_fh { my $self = shift; - my $obj = shift; + my ($obj) = @_; if ( my $fh = $obj->_root->{fh} ) { close $fh; @@ -253,19 +252,6 @@ sub load_tag { }; } -sub index_lookup { - ## - # Given index tag, lookup single entry in index and return . - ## - my $self = shift; - my ($obj, $tag, $index) = @_; - - my $location = unpack($self->{long_pack}, substr($tag->{content}, $index * $self->{long_size}, $self->{long_size}) ); - if (!$location) { return; } - - return $self->load_tag( $obj, $location ); -} - sub add_bucket { ## # Adds one key/value pair to bucket list, given offset, MD5 digest of key, @@ -287,7 +273,7 @@ sub add_bucket { ## # Iterate through buckets, seeing if this is a new entry or a replace. ## - for (my $i=0; $i<$self->{max_buckets}; $i++) { + for (my $i = 0; $i < $self->{max_buckets}; $i++) { my $subloc = unpack($self->{long_pack}, substr($keys, ($i * $self->{bucket_size}) + $self->{hash_size}, $self->{long_size})); if (!$subloc) { ## @@ -515,29 +501,29 @@ sub add_bucket { } sub get_bucket_value { - ## - # Fetch single value given tag and MD5 digested key. - ## - my $self = shift; - my ($obj, $tag, $md5) = @_; - my $keys = $tag->{content}; + ## + # Fetch single value given tag and MD5 digested key. + ## + my $self = shift; + my ($obj, $tag, $md5) = @_; + my $keys = $tag->{content}; my $fh = $obj->_fh; - ## - # Iterate through buckets, looking for a key match - ## + ## + # Iterate through buckets, looking for a key match + ## BUCKET: - for (my $i=0; $i<$self->{max_buckets}; $i++) { - my $key = substr($keys, $i * $self->{bucket_size}, $self->{hash_size}); - my $subloc = unpack($self->{long_pack}, substr($keys, ($i * $self->{bucket_size}) + $self->{hash_size}, $self->{long_size})); + for (my $i=0; $i<$self->{max_buckets}; $i++) { + my $key = substr($keys, $i * $self->{bucket_size}, $self->{hash_size}); + my $subloc = unpack($self->{long_pack}, substr($keys, ($i * $self->{bucket_size}) + $self->{hash_size}, $self->{long_size})); - if (!$subloc) { - ## - # Hit end of list, no match - ## - return; - } + if (!$subloc) { + ## + # Hit end of list, no match + ## + return; + } if ( $md5 ne $key ) { next BUCKET; @@ -549,7 +535,7 @@ sub get_bucket_value { my $signature; seek($fh, $subloc + $obj->_root->{file_offset}, SEEK_SET); read( $fh, $signature, DBM::Deep->SIG_SIZE); - + ## # If value is a hash or array, return new DBM::Deep object with correct offset ## @@ -559,18 +545,18 @@ sub get_bucket_value { base_offset => $subloc, root => $obj->_root, ); - + if ($obj->_root->{autobless}) { ## # Skip over value and plain key to see if object needs # to be re-blessed ## seek($fh, $self->{data_size} + $self->{index_size}, SEEK_CUR); - + my $size; read( $fh, $size, $self->{data_size}); $size = unpack($self->{data_pack}, $size); if ($size) { seek($fh, $size, SEEK_CUR); } - + my $bless_bit; read( $fh, $bless_bit, 1); if (ord($bless_bit)) { @@ -583,10 +569,10 @@ sub get_bucket_value { if ($class_name) { $obj = bless( $obj, $class_name ); } } } - + return $obj; } - + ## # Otherwise return actual value ## @@ -597,40 +583,40 @@ sub get_bucket_value { if ($size) { read( $fh, $value, $size); } return $value; } - + ## # Key exists, but content is null ## else { return; } - } # i loop + } # i loop - return; + return; } sub delete_bucket { - ## - # Delete single key/value pair given tag and MD5 digested key. - ## - my $self = shift; - my ($obj, $tag, $md5) = @_; - my $keys = $tag->{content}; + ## + # Delete single key/value pair given tag and MD5 digested key. + ## + my $self = shift; + my ($obj, $tag, $md5) = @_; + my $keys = $tag->{content}; my $fh = $obj->_fh; - - ## - # Iterate through buckets, looking for a key match - ## + + ## + # Iterate through buckets, looking for a key match + ## BUCKET: - for (my $i=0; $i<$self->{max_buckets}; $i++) { - my $key = substr($keys, $i * $self->{bucket_size}, $self->{hash_size}); - my $subloc = unpack($self->{long_pack}, substr($keys, ($i * $self->{bucket_size}) + $self->{hash_size}, $self->{long_size})); + for (my $i=0; $i<$self->{max_buckets}; $i++) { + my $key = substr($keys, $i * $self->{bucket_size}, $self->{hash_size}); + my $subloc = unpack($self->{long_pack}, substr($keys, ($i * $self->{bucket_size}) + $self->{hash_size}, $self->{long_size})); - if (!$subloc) { - ## - # Hit end of list, no match - ## - return; - } + if (!$subloc) { + ## + # Hit end of list, no match + ## + return; + } if ( $md5 ne $key ) { next BUCKET; @@ -642,35 +628,35 @@ sub delete_bucket { seek($fh, $tag->{offset} + ($i * $self->{bucket_size}) + $obj->_root->{file_offset}, SEEK_SET); print( $fh substr($keys, ($i+1) * $self->{bucket_size} ) ); print( $fh chr(0) x $self->{bucket_size} ); - + return 1; - } # i loop + } # i loop - return; + return; } sub bucket_exists { - ## - # Check existence of single key given tag and MD5 digested key. - ## - my $self = shift; - my ($obj, $tag, $md5) = @_; - my $keys = $tag->{content}; - - ## - # Iterate through buckets, looking for a key match - ## + ## + # Check existence of single key given tag and MD5 digested key. + ## + my $self = shift; + my ($obj, $tag, $md5) = @_; + my $keys = $tag->{content}; + + ## + # Iterate through buckets, looking for a key match + ## BUCKET: - for (my $i=0; $i<$self->{max_buckets}; $i++) { - my $key = substr($keys, $i * $self->{bucket_size}, $self->{hash_size}); - my $subloc = unpack($self->{long_pack}, substr($keys, ($i * $self->{bucket_size}) + $self->{hash_size}, $self->{long_size})); + for (my $i=0; $i<$self->{max_buckets}; $i++) { + my $key = substr($keys, $i * $self->{bucket_size}, $self->{hash_size}); + my $subloc = unpack($self->{long_pack}, substr($keys, ($i * $self->{bucket_size}) + $self->{hash_size}, $self->{long_size})); - if (!$subloc) { - ## - # Hit end of list, no match - ## - return; - } + if (!$subloc) { + ## + # Hit end of list, no match + ## + return; + } if ( $md5 ne $key ) { next BUCKET; @@ -680,138 +666,190 @@ sub bucket_exists { # Matched key -- return true ## return 1; - } # i loop + } # i loop - return; + return; } sub find_bucket_list { - ## - # Locate offset for bucket list, given digested key - ## - my $self = shift; - my ($obj, $md5) = @_; - - ## - # Locate offset for bucket list using digest index system - ## - my $ch = 0; - my $tag = $self->load_tag($obj, $obj->_base_offset); - if (!$tag) { return; } - - while ($tag->{signature} ne DBM::Deep->SIG_BLIST) { - $tag = $self->index_lookup($obj, $tag, ord(substr($md5, $ch, 1))); - if (!$tag) { return; } - $ch++; - } - - return $tag; + ## + # Locate offset for bucket list, given digested key + ## + my $self = shift; + my ($obj, $md5, $args) = @_; + $args = {} unless $args; + + ## + # Locate offset for bucket list using digest index system + ## + my $ch = 0; + my $tag = $self->load_tag($obj, $obj->_base_offset); + if (!$tag) { + return $self->_throw_error( "INTERNAL ERROR - Cannot find tag" ); + } + + while ($tag->{signature} ne DBM::Deep->SIG_BLIST) { + my $num = ord substr($md5, $ch, 1); + + my $ref_loc = $tag->{offset} + ($num * $self->{long_size}); + $tag = $self->index_lookup( $obj, $tag, $num ); + + if (!$tag) { + if ( $args->{create} ) { + my $fh = $obj->_fh; + seek($fh, $ref_loc + $obj->_root->{file_offset}, SEEK_SET); + print( $fh pack($self->{long_pack}, $obj->_root->{end}) ); + + $tag = $self->create_tag( + $obj, $obj->_root->{end}, + DBM::Deep->SIG_BLIST, + chr(0) x $self->{bucket_list_size}, + ); + + $tag->{ref_loc} = $ref_loc; + $tag->{ch} = $ch; + + last; + } + else { + return; + } + } + + $tag->{ch} = $ch; + $tag->{ref_loc} = $ref_loc; + + $ch++; + } + + return $tag; +} + +sub index_lookup { + ## + # Given index tag, lookup single entry in index and return . + ## + my $self = shift; + my ($obj, $tag, $index) = @_; + + my $location = unpack( + $self->{long_pack}, + substr( + $tag->{content}, + $index * $self->{long_size}, + $self->{long_size}, + ), + ); + + if (!$location) { return; } + + return $self->load_tag( $obj, $location ); } sub traverse_index { - ## - # Scan index and recursively step into deeper levels, looking for next key. - ## + ## + # Scan index and recursively step into deeper levels, looking for next key. + ## my $self = shift; my ($obj, $offset, $ch, $force_return_next) = @_; $force_return_next = undef unless $force_return_next; - - my $tag = $self->load_tag($obj, $offset ); + + my $tag = $self->load_tag($obj, $offset ); my $fh = $obj->_fh; - - if ($tag->{signature} ne DBM::Deep->SIG_BLIST) { - my $content = $tag->{content}; - my $start; - if ($obj->{return_next}) { $start = 0; } - else { $start = ord(substr($obj->{prev_md5}, $ch, 1)); } - - for (my $index = $start; $index < 256; $index++) { - my $subloc = unpack($self->{long_pack}, substr($content, $index * $self->{long_size}, $self->{long_size}) ); - if ($subloc) { - my $result = $self->traverse_index( $obj, $subloc, $ch + 1, $force_return_next ); - if (defined($result)) { return $result; } - } - } # index loop - - $obj->{return_next} = 1; - } # tag is an index - - elsif ($tag->{signature} eq DBM::Deep->SIG_BLIST) { - my $keys = $tag->{content}; - if ($force_return_next) { $obj->{return_next} = 1; } - - ## - # Iterate through buckets, looking for a key match - ## - for (my $i=0; $i<$self->{max_buckets}; $i++) { - my $key = substr($keys, $i * $self->{bucket_size}, $self->{hash_size}); - my $subloc = unpack($self->{long_pack}, substr($keys, ($i * $self->{bucket_size}) + $self->{hash_size}, $self->{long_size})); - - if (!$subloc) { - ## - # End of bucket list -- return to outer loop - ## - $obj->{return_next} = 1; - last; - } - elsif ($key eq $obj->{prev_md5}) { - ## - # Located previous key -- return next one found - ## - $obj->{return_next} = 1; - next; - } - elsif ($obj->{return_next}) { - ## - # Seek to bucket location and skip over signature - ## - seek($fh, $subloc + DBM::Deep->SIG_SIZE + $obj->_root->{file_offset}, SEEK_SET); - - ## - # Skip over value to get to plain key - ## - my $size; - read( $fh, $size, $self->{data_size}); $size = unpack($self->{data_pack}, $size); - if ($size) { seek($fh, $size, SEEK_CUR); } - - ## - # Read in plain key and return as scalar - ## - my $plain_key; - read( $fh, $size, $self->{data_size}); $size = unpack($self->{data_pack}, $size); - if ($size) { read( $fh, $plain_key, $size); } - - return $plain_key; - } - } # bucket loop - - $obj->{return_next} = 1; - } # tag is a bucket list - - return; + + if ($tag->{signature} ne DBM::Deep->SIG_BLIST) { + my $content = $tag->{content}; + my $start; + if ($obj->{return_next}) { $start = 0; } + else { $start = ord(substr($obj->{prev_md5}, $ch, 1)); } + + for (my $index = $start; $index < 256; $index++) { + my $subloc = unpack($self->{long_pack}, substr($content, $index * $self->{long_size}, $self->{long_size}) ); + if ($subloc) { + my $result = $self->traverse_index( $obj, $subloc, $ch + 1, $force_return_next ); + if (defined($result)) { return $result; } + } + } # index loop + + $obj->{return_next} = 1; + } # tag is an index + + elsif ($tag->{signature} eq DBM::Deep->SIG_BLIST) { + my $keys = $tag->{content}; + if ($force_return_next) { $obj->{return_next} = 1; } + + ## + # Iterate through buckets, looking for a key match + ## + for (my $i=0; $i<$self->{max_buckets}; $i++) { + my $key = substr($keys, $i * $self->{bucket_size}, $self->{hash_size}); + my $subloc = unpack($self->{long_pack}, substr($keys, ($i * $self->{bucket_size}) + $self->{hash_size}, $self->{long_size})); + + if (!$subloc) { + ## + # End of bucket list -- return to outer loop + ## + $obj->{return_next} = 1; + last; + } + elsif ($key eq $obj->{prev_md5}) { + ## + # Located previous key -- return next one found + ## + $obj->{return_next} = 1; + next; + } + elsif ($obj->{return_next}) { + ## + # Seek to bucket location and skip over signature + ## + seek($fh, $subloc + DBM::Deep->SIG_SIZE + $obj->_root->{file_offset}, SEEK_SET); + + ## + # Skip over value to get to plain key + ## + my $size; + read( $fh, $size, $self->{data_size}); $size = unpack($self->{data_pack}, $size); + if ($size) { seek($fh, $size, SEEK_CUR); } + + ## + # Read in plain key and return as scalar + ## + my $plain_key; + read( $fh, $size, $self->{data_size}); $size = unpack($self->{data_pack}, $size); + if ($size) { read( $fh, $plain_key, $size); } + + return $plain_key; + } + } # bucket loop + + $obj->{return_next} = 1; + } # tag is a bucket list + + return; } sub get_next_key { - ## - # Locate next key, given digested previous one - ## + ## + # Locate next key, given digested previous one + ## my $self = shift; my ($obj) = @_; - - $obj->{prev_md5} = $_[1] ? $_[1] : undef; - $obj->{return_next} = 0; - - ## - # If the previous key was not specifed, start at the top and - # return the first one found. - ## - if (!$obj->{prev_md5}) { - $obj->{prev_md5} = chr(0) x $self->{hash_size}; - $obj->{return_next} = 1; - } - - return $self->traverse_index( $obj, $obj->_base_offset, 0 ); + + $obj->{prev_md5} = $_[1] ? $_[1] : undef; + $obj->{return_next} = 0; + + ## + # If the previous key was not specifed, start at the top and + # return the first one found. + ## + if (!$obj->{prev_md5}) { + $obj->{prev_md5} = chr(0) x $self->{hash_size}; + $obj->{return_next} = 1; + } + + return $self->traverse_index( $obj, $obj->_base_offset, 0 ); } 1;