Rusty Russell dd42962878 ntdb: remove hash table trees.
TDB2 started with a top-level hash of 1024 entries, divided into 128
groups of 8 buckets.  When a bucket filled, the 8 bucket group
expanded into pointers into 8 new 64-entry hash tables.  When these
filled, they expanded in turn, etc.

It's a nice idea to automatically expand the hash tables, but it
doesn't pay off.  Remove it for NTDB.

1) It only beats TDB performance when the database is huge and the
   TDB hashsize is small.  We are about 20% slower on medium-size
   databases (1000 to 10000 records), worse on really small ones.
2) Since our offsets are 64 bit, our hash tables are already twice as
   expensive as TDB's.
3) Since our hash function is good, all groups tend to fill at about
   the same time, so the hash enlarges by a factor of 128 all at once,
   leading to a very large database at that point.
4) Our efficiency would improve if we enlarged the top level, but
   that makes our minimum db size even worse: it's already over 8k,
   and jumps to 1M after about 1000 entries!
5) Making the sub group size larger gives a shallower tree, which
   performs better, but makes the "hash explosion" problem worse.
6) The code is complicated: it has to handle deletion, reshuffling of
   hash bucket groups, and expansion of buckets.
7) We have to handle the case where all the records somehow end up with
   the same hash value, which requires special code to chain records for
   that case.

On the other hand, it would be nice if we didn't degrade as badly as
TDB does when the hash chains get long.

This patch removes the hash-growing code, but instead of chaining like
TDB does when a bucket fills, we point the bucket to an array of
record pointers.  Since each on-disk NTDB pointer contains some hash
bits from the record (we steal the upper 8 bits of the offset), 99.5%
of the time we don't need to load the record to determine if it
matches.  This makes an array of offsets much more cache-friendly than
a linked list.
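
As a minimal sketch of the idea (illustrative names and layout only;
the real definitions live in ntdb's private.h, and the real mask also
clears a chain-marker bit):

	#include <stdint.h>

	#define UPPER_STEAL 8	/* cf. NTDB_OFF_UPPER_STEAL */

	/* The stolen hash bits: compare these first, so most
	 * non-matching records are rejected without being loaded. */
	static uint64_t entry_extra_hash(uint64_t entry)
	{
		return entry >> (64 - UPPER_STEAL);
	}

	/* The actual file offset of the record. */
	static uint64_t entry_offset(uint64_t entry)
	{
		return entry & ((1ULL << (64 - UPPER_STEAL)) - 1);
	}

With 8 stolen bits, a non-matching record slips past this filter only
about 1 time in 256, which is where the ~99.5% figure comes from.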

Here are the times (in ns) for tdb_store of N records, tdb_store of N
records the second time, and a fetch of all N records.  I've also
included the final database size and the smbtorture local.[n]tdb_speed
results.

Benchmark details:
1) Compiled with -O2.
2) assert() was disabled in TDB2 and NTDB.
3) The "optimize fetch" patch was applied to NTDB.

10 runs, using tmpfs (otherwise there is massive swapping as the db
hits ~30M, despite plenty of RAM).

				Insert	Re-ins	Fetch	Size	dbspeed
				(nsec)	(nsec)	(nsec)	(Kb)	(ops/sec)
TDB (10000 hashsize):	
	100 records:		 3882	 3320	1609	   53	203204
	1000 records:		 3651	 3281	1571	  115	218021
	10000 records:		 3404	 3326	1595	  880	202874
	100000 records:		 4317	 3825	2097	 8262	126811
	1000000 records:	11568	11578	9320	77005	 25046

TDB2 (1024 hashsize, expandable):
	100 records:		 3867	 3329	1699	   17	187100	
	1000 records:		 4040	 3249	1639	  154	186255
	10000 records:		 4143	 3300	1695	 1226	185110
	100000 records:		 4481	 3425	1800	17848	163483
	1000000 records:	 4055	 3534	1878	106386	160774

NTDB (8192 hashsize):
	100 records:		 4259	 3376	1692	   82	190852
	1000 records:		 3640	 3275	1566	  130	195106
	10000 records:		 4337	 3438	1614	  773	188362
	100000 records:		 4750	 5165	1746	 9001	169197
	1000000 records:	 4897	 5180	2341	83838	121901

Analysis:
	1) TDB wins on small databases, beating TDB2 by ~15%, NTDB by ~10%.
	2) TDB starts to lose when hash chains get 10 long (fetch 10% slower
	   than TDB2/NTDB).
	3) TDB does horribly when hash chains get 100 long (fetch 4x slower
	   than NTDB, 5x slower than TDB2, insert about 2-3x slower).
	4) TDB2 databases are 40% larger than TDB1's.  NTDB is about 15%
	   larger than TDB1's.
2012-06-19 05:38:07 +02:00

/*
   Trivial Database 2: free list/block handling
   Copyright (C) Rusty Russell 2010

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 3 of the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/

#include "private.h"
#include <ccan/likely/likely.h>
#include <ccan/asearch/asearch.h>

/* We keep an ordered array of offsets. */
static bool append(struct ntdb_context *ntdb,
		   ntdb_off_t **arr, size_t *num, ntdb_off_t off)
{
	ntdb_off_t *new;

	if (*num == 0) {
		new = ntdb->alloc_fn(ntdb, sizeof(ntdb_off_t),
				     ntdb->alloc_data);
	} else {
		new = ntdb->expand_fn(*arr, (*num + 1) * sizeof(ntdb_off_t),
				      ntdb->alloc_data);
	}
	if (!new)
		return false;
	new[(*num)++] = off;
	*arr = new;
	return true;
}
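
/* Verify the header: magic food string, hash test, feature bits,
 * recovery offset and the capability list. */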
static enum NTDB_ERROR check_header(struct ntdb_context *ntdb,
				    ntdb_off_t *recovery,
				    uint64_t *features,
				    size_t *num_capabilities)
{
	uint64_t hash_test;
	struct ntdb_header hdr;
	enum NTDB_ERROR ecode;
	ntdb_off_t off, next;

	ecode = ntdb_read_convert(ntdb, 0, &hdr, sizeof(hdr));
	if (ecode != NTDB_SUCCESS) {
		return ecode;
	}
	/* magic food should not be converted, so convert back. */
	ntdb_convert(ntdb, hdr.magic_food, sizeof(hdr.magic_food));

	hash_test = NTDB_HASH_MAGIC;
	hash_test = ntdb_hash(ntdb, &hash_test, sizeof(hash_test));
	if (hdr.hash_test != hash_test) {
		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				   "check: hash test %llu should be %llu",
				   (long long)hdr.hash_test,
				   (long long)hash_test);
	}

	if (strcmp(hdr.magic_food, NTDB_MAGIC_FOOD) != 0) {
		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				   "check: bad magic '%.*s'",
				   (unsigned)sizeof(hdr.magic_food),
				   hdr.magic_food);
	}

	/* Features which are used must be a subset of features offered. */
	if (hdr.features_used & ~hdr.features_offered) {
		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				   "check: features used (0x%llx) which"
				   " are not offered (0x%llx)",
				   (long long)hdr.features_used,
				   (long long)hdr.features_offered);
	}

	*features = hdr.features_offered;
	*recovery = hdr.recovery;
	if (*recovery) {
		if (*recovery < sizeof(hdr)
		    || *recovery > ntdb->file->map_size) {
			return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
					   NTDB_LOG_ERROR,
					   "ntdb_check:"
					   " invalid recovery offset %zu",
					   (size_t)*recovery);
		}
	}

	for (off = hdr.capabilities; off && ecode == NTDB_SUCCESS; off = next) {
		const struct ntdb_capability *cap;
		enum NTDB_ERROR e;

		cap = ntdb_access_read(ntdb, off, sizeof(*cap), true);
		if (NTDB_PTR_IS_ERR(cap)) {
			return NTDB_PTR_ERR(cap);
		}

		/* All capabilities are unknown. */
		e = unknown_capability(ntdb, "ntdb_check", cap->type);
		next = cap->next;
		ntdb_access_release(ntdb, cap);
		if (e)
			return e;
		(*num_capabilities)++;
	}

	/* Don't check reserved: they *can* be used later. */
	return NTDB_SUCCESS;
}

static int off_cmp(const ntdb_off_t *a, const ntdb_off_t *b)
{
	/* Subtracting the offsets could overflow an int, so compare
	 * explicitly. */
	return *a > *b ? 1
		: *a < *b ? -1
		: 0;
}

static enum NTDB_ERROR check_entry(struct ntdb_context *ntdb,
				   ntdb_off_t off_and_hash,
				   ntdb_len_t bucket,
				   ntdb_off_t used[],
				   size_t num_used,
				   size_t *num_found,
				   enum NTDB_ERROR (*check)(NTDB_DATA,
							    NTDB_DATA,
							    void *),
				   void *data)
{
	enum NTDB_ERROR ecode;
	const struct ntdb_used_record *r;
	const unsigned char *kptr;
	ntdb_len_t klen, dlen;
	uint32_t hash;
	ntdb_off_t off = off_and_hash & NTDB_OFF_MASK;
	ntdb_off_t *p;

	/* Empty bucket is fine. */
	if (!off_and_hash) {
		return NTDB_SUCCESS;
	}

	/* This can't point to a chain, we handled those at toplevel. */
	if (off_and_hash & (1ULL << NTDB_OFF_CHAIN_BIT)) {
		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				   "ntdb_check: Invalid chain bit in offset"
				   " %llu", (long long)off_and_hash);
	}
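
	/* used[] was filled in file order by check_linear, so it is
	 * sorted; asearch() is ccan's typesafe binary search. */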
	p = asearch(&off, used, num_used, off_cmp);
	if (!p) {
		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				   "ntdb_check: Invalid offset"
				   " %llu in hash", (long long)off);
	}
	/* Mark it invalid: flip the low bit so a second reference to
	 * the same record cannot match again. */
	*p ^= 1;
	(*num_found)++;

	r = ntdb_access_read(ntdb, off, sizeof(*r), true);
	if (NTDB_PTR_IS_ERR(r)) {
		return NTDB_PTR_ERR(r);
	}
	klen = rec_key_length(r);
	dlen = rec_data_length(r);
	ntdb_access_release(ntdb, r);

	kptr = ntdb_access_read(ntdb, off + sizeof(*r), klen + dlen, false);
	if (NTDB_PTR_IS_ERR(kptr)) {
		return NTDB_PTR_ERR(kptr);
	}

	hash = ntdb_hash(ntdb, kptr, klen);

	/* Are we in the right chain? */
	if (bits_from(hash, 0, ntdb->hash_bits) != bucket) {
		ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
				    NTDB_LOG_ERROR,
				    "ntdb_check: Bad bucket %u vs %llu",
				    bits_from(hash, 0, ntdb->hash_bits),
				    (long long)bucket);
	/* The next 8 hash bits should match the bits stolen from the
	 * top of the offset. */
	} else if (bits_from(hash, ntdb->hash_bits, NTDB_OFF_UPPER_STEAL)
		   != bits_from(off_and_hash, 64-NTDB_OFF_UPPER_STEAL,
				NTDB_OFF_UPPER_STEAL)) {
		ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
				    NTDB_LOG_ERROR,
				    "ntdb_check: Bad hash bits %llu vs %llu",
				    (long long)off_and_hash,
				    (long long)hash);
	} else if (check) {
		NTDB_DATA k, d;

		k = ntdb_mkdata(kptr, klen);
		d = ntdb_mkdata(kptr + klen, dlen);
		ecode = check(k, d, data);
	} else {
		ecode = NTDB_SUCCESS;
	}
	ntdb_access_release(ntdb, kptr);
	return ecode;
}
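
/* A chain record holds an array of bucket entries for hash values that
 * collided; each entry is checked as if it sat in a top-level bucket. */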
static enum NTDB_ERROR check_hash_chain(struct ntdb_context *ntdb,
					ntdb_off_t off,
					ntdb_len_t bucket,
					ntdb_off_t used[],
					size_t num_used,
					size_t *num_found,
					enum NTDB_ERROR (*check)(NTDB_DATA,
								 NTDB_DATA,
								 void *),
					void *data)
{
	struct ntdb_used_record rec;
	enum NTDB_ERROR ecode;
	const ntdb_off_t *entries;
	ntdb_len_t i, num;

	/* This is a used entry. */
	(*num_found)++;

	ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec));
	if (ecode != NTDB_SUCCESS) {
		return ecode;
	}

	if (rec_magic(&rec) != NTDB_CHAIN_MAGIC) {
		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				   "ntdb_check: Bad hash chain magic %llu",
				   (long long)rec_magic(&rec));
	}

	if (rec_data_length(&rec) % sizeof(ntdb_off_t)) {
		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				   "ntdb_check: Bad hash chain data length %llu",
				   (long long)rec_data_length(&rec));
	}

	if (rec_key_length(&rec) != 0) {
		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				   "ntdb_check: Bad hash chain key length %llu",
				   (long long)rec_key_length(&rec));
	}

	off += sizeof(rec);
	num = rec_data_length(&rec) / sizeof(ntdb_off_t);
	entries = ntdb_access_read(ntdb, off, rec_data_length(&rec), true);
	if (NTDB_PTR_IS_ERR(entries)) {
		return NTDB_PTR_ERR(entries);
	}

	/* Check each non-deleted entry in chain. */
	for (i = 0; i < num; i++) {
		ecode = check_entry(ntdb, entries[i], bucket,
				    used, num_used, num_found, check, data);
		if (ecode) {
			break;
		}
	}

	ntdb_access_release(ntdb, entries);
	return ecode;
}
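
/* Check the top-level hash table: every entry must refer to a used
 * record found by the linear scan, either directly or via a chain. */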
static enum NTDB_ERROR check_hash(struct ntdb_context *ntdb,
				  ntdb_off_t used[],
				  size_t num_used,
				  size_t num_other_used,
				  enum NTDB_ERROR (*check)(NTDB_DATA,
							   NTDB_DATA,
							   void *),
				  void *data)
{
	enum NTDB_ERROR ecode;
	struct ntdb_used_record rec;
	const ntdb_off_t *entries;
	ntdb_len_t i;
	/* Free tables and capabilities also show up as used, as do we. */
	size_t num_found = num_other_used + 1;

	ecode = ntdb_read_convert(ntdb, NTDB_HASH_OFFSET, &rec, sizeof(rec));
	if (ecode != NTDB_SUCCESS) {
		return ecode;
	}

	if (rec_magic(&rec) != NTDB_HTABLE_MAGIC) {
		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				   "ntdb_check: Bad hash table magic %llu",
				   (long long)rec_magic(&rec));
	}
	if (rec_data_length(&rec) != (sizeof(ntdb_off_t) << ntdb->hash_bits)) {
		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				   "ntdb_check: Bad hash table data length %llu",
				   (long long)rec_data_length(&rec));
	}
	if (rec_key_length(&rec) != 0) {
		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				   "ntdb_check: Bad hash table key length %llu",
				   (long long)rec_key_length(&rec));
	}

	entries = ntdb_access_read(ntdb, NTDB_HASH_OFFSET + sizeof(rec),
				   rec_data_length(&rec), true);
	if (NTDB_PTR_IS_ERR(entries)) {
		return NTDB_PTR_ERR(entries);
	}

	for (i = 0; i < (1 << ntdb->hash_bits); i++) {
		ntdb_off_t off = entries[i] & NTDB_OFF_MASK;
		if (entries[i] & (1ULL << NTDB_OFF_CHAIN_BIT)) {
			ecode = check_hash_chain(ntdb, off, i,
						 used, num_used, &num_found,
						 check, data);
		} else {
			ecode = check_entry(ntdb, entries[i], i,
					    used, num_used, &num_found,
					    check, data);
		}
		if (ecode) {
			break;
		}
	}
	ntdb_access_release(ntdb, entries);

	if (ecode == NTDB_SUCCESS && num_found != num_used) {
		ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				    "ntdb_check: Not all entries are in hash");
	}
	return ecode;
}
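
/* Validate a single free-list record: magic, owning free table, length
 * within the file, correct size bucket, and back pointer. */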
static enum NTDB_ERROR check_free(struct ntdb_context *ntdb,
				  ntdb_off_t off,
				  const struct ntdb_free_record *frec,
				  ntdb_off_t prev, unsigned int ftable,
				  unsigned int bucket)
{
	enum NTDB_ERROR ecode;

	if (frec_magic(frec) != NTDB_FREE_MAGIC) {
		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				   "ntdb_check: offset %llu bad magic 0x%llx",
				   (long long)off,
				   (long long)frec->magic_and_prev);
	}
	if (frec_ftable(frec) != ftable) {
		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				   "ntdb_check: offset %llu bad freetable %u",
				   (long long)off, frec_ftable(frec));
	}

	ecode = ntdb_oob(ntdb, off,
			 frec_len(frec) + sizeof(struct ntdb_used_record),
			 false);
	if (ecode != NTDB_SUCCESS) {
		return ecode;
	}
	if (size_to_bucket(frec_len(frec)) != bucket) {
		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				   "ntdb_check: offset %llu in wrong bucket"
				   " (%u vs %u)",
				   (long long)off,
				   bucket, size_to_bucket(frec_len(frec)));
	}
	if (prev && prev != frec_prev(frec)) {
		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				   "ntdb_check: offset %llu bad prev"
				   " (%llu vs %llu)",
				   (long long)off,
				   (long long)prev,
				   (long long)frec_prev(frec));
	}
	return NTDB_SUCCESS;
}
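
/* Walk each bucket of one free table, checking every record on its
 * doubly-linked list and marking it off against the fr[] array. */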
static enum NTDB_ERROR check_free_table(struct ntdb_context *ntdb,
					ntdb_off_t ftable_off,
					unsigned ftable_num,
					ntdb_off_t fr[],
					size_t num_free,
					size_t *num_found)
{
	struct ntdb_freetable ft;
	ntdb_off_t h;
	unsigned int i;
	enum NTDB_ERROR ecode;

	ecode = ntdb_read_convert(ntdb, ftable_off, &ft, sizeof(ft));
	if (ecode != NTDB_SUCCESS) {
		return ecode;
	}

	if (rec_magic(&ft.hdr) != NTDB_FTABLE_MAGIC
	    || rec_key_length(&ft.hdr) != 0
	    || rec_data_length(&ft.hdr) != sizeof(ft) - sizeof(ft.hdr)) {
		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				   "ntdb_check: Invalid header on free table");
	}

	for (i = 0; i < NTDB_FREE_BUCKETS; i++) {
		ntdb_off_t off, prev = 0, *p, first = 0;
		struct ntdb_free_record f;

		h = bucket_off(ftable_off, i);
		for (off = ntdb_read_off(ntdb, h); off; off = f.next) {
			if (NTDB_OFF_IS_ERR(off)) {
				return NTDB_OFF_TO_ERR(off);
			}
			if (!first) {
				off &= NTDB_OFF_MASK;
				first = off;
			}
			ecode = ntdb_read_convert(ntdb, off, &f, sizeof(f));
			if (ecode != NTDB_SUCCESS) {
				return ecode;
			}
			ecode = check_free(ntdb, off, &f, prev, ftable_num, i);
			if (ecode != NTDB_SUCCESS) {
				return ecode;
			}

			/* FIXME: Check hash bits */
			p = asearch(&off, fr, num_free, off_cmp);
			if (!p) {
				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
						   NTDB_LOG_ERROR,
						   "ntdb_check: Invalid offset"
						   " %llu in free table",
						   (long long)off);
			}
			/* Mark it invalid. */
			*p ^= 1;
			(*num_found)++;
			prev = off;
		}

		if (first) {
			/* Now we can check first back pointer. */
			ecode = ntdb_read_convert(ntdb, first, &f, sizeof(f));
			if (ecode != NTDB_SUCCESS) {
				return ecode;
			}
			ecode = check_free(ntdb, first, &f, prev, ftable_num,
					   i);
			if (ecode != NTDB_SUCCESS) {
				return ecode;
			}
		}
	}
	return NTDB_SUCCESS;
}

/* Slow, but should be very rare: scan forward over dead bytes (zeroes,
 * or the 0x43 fill pattern checked in check_linear). */
ntdb_off_t dead_space(struct ntdb_context *ntdb, ntdb_off_t off)
{
	size_t len;
	enum NTDB_ERROR ecode;

	for (len = 0; off + len < ntdb->file->map_size; len++) {
		char c;

		ecode = ntdb->io->tread(ntdb, off + len, &c, 1);
		if (ecode != NTDB_SUCCESS) {
			return NTDB_ERR_TO_OFF(ecode);
		}
		if (c != 0 && c != 0x43)
			break;
	}
	return len;
}
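
/* Linear scan of the whole file: classify each record by its magic and
 * collect used/free offsets for cross-checking against the tables. */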
static enum NTDB_ERROR check_linear(struct ntdb_context *ntdb,
				    ntdb_off_t **used, size_t *num_used,
				    ntdb_off_t **fr, size_t *num_free,
				    uint64_t features, ntdb_off_t recovery)
{
	ntdb_off_t off;
	ntdb_len_t len;
	enum NTDB_ERROR ecode;
	bool found_recovery = false;

	for (off = sizeof(struct ntdb_header);
	     off < ntdb->file->map_size;
	     off += len) {
		union {
			struct ntdb_used_record u;
			struct ntdb_free_record f;
			struct ntdb_recovery_record r;
		} rec;

		/* r is larger: only get that if we need to. */
		ecode = ntdb_read_convert(ntdb, off, &rec, sizeof(rec.f));
		if (ecode != NTDB_SUCCESS) {
			return ecode;
		}

		/* If we crash after ftruncate, we can get zeroes or fill. */
		if (rec.r.magic == NTDB_RECOVERY_INVALID_MAGIC
		    || rec.r.magic == 0x4343434343434343ULL) {
			ecode = ntdb_read_convert(ntdb, off, &rec,
						  sizeof(rec.r));
			if (ecode != NTDB_SUCCESS) {
				return ecode;
			}
			if (recovery == off) {
				found_recovery = true;
				len = sizeof(rec.r) + rec.r.max_len;
			} else {
				len = dead_space(ntdb, off);
				if (NTDB_OFF_IS_ERR(len)) {
					return NTDB_OFF_TO_ERR(len);
				}
				if (len < sizeof(rec.r)) {
					return ntdb_logerr(ntdb,
							   NTDB_ERR_CORRUPT,
							   NTDB_LOG_ERROR,
							   "ntdb_check: invalid"
							   " dead space at %zu",
							   (size_t)off);
				}

				ntdb_logerr(ntdb, NTDB_SUCCESS,
					    NTDB_LOG_WARNING,
					    "Dead space at %zu-%zu (of %zu)",
					    (size_t)off, (size_t)(off + len),
					    (size_t)ntdb->file->map_size);
			}
		} else if (rec.r.magic == NTDB_RECOVERY_MAGIC) {
			ecode = ntdb_read_convert(ntdb, off, &rec,
						  sizeof(rec.r));
			if (ecode != NTDB_SUCCESS) {
				return ecode;
			}
			if (recovery != off) {
				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
						   NTDB_LOG_ERROR,
						   "ntdb_check: unexpected"
						   " recovery record at offset"
						   " %zu",
						   (size_t)off);
			}
			if (rec.r.len > rec.r.max_len) {
				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
						   NTDB_LOG_ERROR,
						   "ntdb_check: invalid recovery"
						   " length %zu",
						   (size_t)rec.r.len);
			}
			if (rec.r.eof > ntdb->file->map_size) {
				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
						   NTDB_LOG_ERROR,
						   "ntdb_check: invalid old EOF"
						   " %zu", (size_t)rec.r.eof);
			}
			found_recovery = true;
			len = sizeof(rec.r) + rec.r.max_len;
		} else if (frec_magic(&rec.f) == NTDB_FREE_MAGIC) {
			len = sizeof(rec.u) + frec_len(&rec.f);
			if (off + len > ntdb->file->map_size) {
				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
						   NTDB_LOG_ERROR,
						   "ntdb_check: free overlength"
						   " %llu at offset %llu",
						   (long long)len,
						   (long long)off);
			}
			/* This record should be in free lists. */
			if (frec_ftable(&rec.f) != NTDB_FTABLE_NONE
			    && !append(ntdb, fr, num_free, off)) {
				return ntdb_logerr(ntdb, NTDB_ERR_OOM,
						   NTDB_LOG_ERROR,
						   "ntdb_check: tracking %zu'th"
						   " free record.", *num_free);
			}
		} else if (rec_magic(&rec.u) == NTDB_USED_MAGIC
			   || rec_magic(&rec.u) == NTDB_CHAIN_MAGIC
			   || rec_magic(&rec.u) == NTDB_HTABLE_MAGIC
			   || rec_magic(&rec.u) == NTDB_FTABLE_MAGIC
			   || rec_magic(&rec.u) == NTDB_CAP_MAGIC) {
			uint64_t klen, dlen, extra;

			/* This record is used! */
			if (!append(ntdb, used, num_used, off)) {
				return ntdb_logerr(ntdb, NTDB_ERR_OOM,
						   NTDB_LOG_ERROR,
						   "ntdb_check: tracking %zu'th"
						   " used record.", *num_used);
			}

			klen = rec_key_length(&rec.u);
			dlen = rec_data_length(&rec.u);
			extra = rec_extra_padding(&rec.u);

			len = sizeof(rec.u) + klen + dlen + extra;
			if (off + len > ntdb->file->map_size) {
				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
						   NTDB_LOG_ERROR,
						   "ntdb_check: used overlength"
						   " %llu at offset %llu",
						   (long long)len,
						   (long long)off);
			}

			if (len < sizeof(rec.f)) {
				return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
						   NTDB_LOG_ERROR,
						   "ntdb_check: too short record"
						   " %llu at %llu",
						   (long long)len,
						   (long long)off);
			}

			/* Records should have zero padding at the end
			 * (future features may use it, hence the
			 * !features test). */
			if (extra && !features
			    && rec_magic(&rec.u) != NTDB_CAP_MAGIC) {
				const char *p;
				char c;

				p = ntdb_access_read(ntdb, off + sizeof(rec.u)
						     + klen + dlen, 1, false);
				if (NTDB_PTR_IS_ERR(p))
					return NTDB_PTR_ERR(p);
				c = *p;
				ntdb_access_release(ntdb, p);

				if (c != '\0') {
					return ntdb_logerr(ntdb,
							   NTDB_ERR_CORRUPT,
							   NTDB_LOG_ERROR,
							   "ntdb_check:"
							   " non-zero extra"
							   " at %llu",
							   (long long)off);
				}
			}
		} else {
			return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT,
					   NTDB_LOG_ERROR,
					   "ntdb_check: Bad magic 0x%llx"
					   " at offset %zu",
					   (long long)rec_magic(&rec.u),
					   (size_t)off);
		}
	}

	/* We must have found recovery area if there was one. */
	if (recovery != 0 && !found_recovery) {
		return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				   "ntdb_check: expected a recovery area at %zu",
				   (size_t)recovery);
	}

	return NTDB_SUCCESS;
}
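
/* Full database check: under read locks, verify the header, linearly
 * scan all records, then cross-check free tables and the hash table
 * against what the scan found. */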
_PUBLIC_ enum NTDB_ERROR ntdb_check_(struct ntdb_context *ntdb,
				     enum NTDB_ERROR (*check)(NTDB_DATA,
							      NTDB_DATA,
							      void *),
				     void *data)
{
	ntdb_off_t *fr = NULL, *used = NULL, ft, recovery;
	size_t num_free = 0, num_used = 0, num_found = 0, num_ftables = 0,
		num_capabilities = 0;
	uint64_t features;
	enum NTDB_ERROR ecode;

	if (ntdb->flags & NTDB_CANT_CHECK) {
		return ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
				   "ntdb_check: database has unknown capability,"
				   " cannot check.");
	}

	ecode = ntdb_allrecord_lock(ntdb, F_RDLCK, NTDB_LOCK_WAIT, false);
	if (ecode != NTDB_SUCCESS) {
		return ecode;
	}

	ecode = ntdb_lock_expand(ntdb, F_RDLCK);
	if (ecode != NTDB_SUCCESS) {
		ntdb_allrecord_unlock(ntdb, F_RDLCK);
		return ecode;
	}

	ecode = check_header(ntdb, &recovery, &features, &num_capabilities);
	if (ecode != NTDB_SUCCESS)
		goto out;

	/* First we do a linear scan, checking all records. */
	ecode = check_linear(ntdb, &used, &num_used, &fr, &num_free, features,
			     recovery);
	if (ecode != NTDB_SUCCESS)
		goto out;

	for (ft = first_ftable(ntdb); ft; ft = next_ftable(ntdb, ft)) {
		if (NTDB_OFF_IS_ERR(ft)) {
			ecode = NTDB_OFF_TO_ERR(ft);
			goto out;
		}
		ecode = check_free_table(ntdb, ft, num_ftables, fr, num_free,
					 &num_found);
		if (ecode != NTDB_SUCCESS)
			goto out;
		num_ftables++;
	}

	/* FIXME: Check key uniqueness? */
	ecode = check_hash(ntdb, used, num_used,
			   num_ftables + num_capabilities, check, data);
	if (ecode != NTDB_SUCCESS)
		goto out;

	if (num_found != num_free) {
		ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
				    "ntdb_check: Not all entries are in"
				    " free table");
	}

out:
	ntdb_allrecord_unlock(ntdb, F_RDLCK);
	ntdb_unlock_expand(ntdb, F_RDLCK);
	ntdb->free_fn(fr, ntdb->alloc_data);
	ntdb->free_fn(used, ntdb->alloc_data);
	return ecode;
}