1
0
mirror of https://github.com/samba-team/samba.git synced 2024-12-23 17:34:34 +03:00

ntdb: update documentation.

Update the design.lyx file with the latest status and the change in hashing.
Also, refresh and add examples to the TDB_porting.txt file.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
This commit is contained in:
Rusty Russell 2012-06-19 12:43:09 +09:30
parent b888bc4316
commit 5ff92d8f7d
6 changed files with 506 additions and 5822 deletions

View File

@ -6,39 +6,415 @@ Interface differences between TDB and NTDB.
otherwise you'll get a compile error when tdb.h re-defined struct
TDB_DATA.
Example:
#include <tdb.h>
#include <ntdb.h>
- ntdb functions return NTDB_SUCCESS (ie 0) on success, and a negative
error on failure, whereas tdb functions returned 0 on success, and
-1 on failure. tdb then used tdb_error() to determine the error;
this API is nasty if we ever want to support threads, so is not supported.
Example:
#include <tdb.h>
#include <ntdb.h>
/* tdb style: -1 signals failure and the reason is looked up on the context. */
void tdb_example(struct tdb_context *tdb, TDB_DATA key, TDB_DATA d)
{
	int ret = tdb_store(tdb, key, d);

	if (ret != -1)
		return;

	printf("store failed: %s\n", tdb_errorstr(tdb));
}
/* ntdb style: the return value is itself the error code (0 on success). */
void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA d)
{
	enum NTDB_ERROR err = ntdb_store(ntdb, key, d);

	if (err != 0) {
		printf("store failed: %s\n", ntdb_errorstr(err));
	}
}
- ntdb's ntdb_fetch() returns an error, tdb's returned the data directly
(or tdb_null, and you were supposed to check tdb_error() to find out why).
Example:
#include <tdb.h>
#include <ntdb.h>
/* tdb style: the record is the return value; a NULL dptr means failure. */
void tdb_example(struct tdb_context *tdb, TDB_DATA key)
{
	TDB_DATA result = tdb_fetch(tdb, key);

	if (result.dptr == NULL) {
		printf("fetch failed: %s\n", tdb_errorstr(tdb));
	}
}
/* ntdb style: the record comes back through a pointer, the error as the
 * return value. */
void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key)
{
	NTDB_DATA result;
	enum NTDB_ERROR err = ntdb_fetch(ntdb, key, &result);

	if (err != 0) {
		printf("fetch failed: %s\n", ntdb_errorstr(err));
	}
}
- ntdb's ntdb_nextkey() frees the old key's dptr, in tdb you needed to do
this manually.
- tdb's tdb_open/tdb_open_ex took an explicit hash size. ntdb's hash table
resizes as required.
Example:
#include <tdb.h>
#include <ntdb.h>
/* Walk every key the tdb way: the caller frees each key's dptr itself. */
void tdb_example(struct tdb_context *tdb)
{
	TDB_DATA key, next;

	for (key = tdb_firstkey(tdb); key.dptr; key = next) {
		printf("Got key!\n");
		/* Look up the successor before releasing the current key. */
		next = tdb_nextkey(tdb, key);
		free(key.dptr);
	}
}
/* Walk every key the ntdb way: iteration stops on the first non-zero error. */
void ntdb_example(struct ntdb_context *ntdb)
{
	NTDB_DATA k;
	enum NTDB_ERROR e;

	/* ntdb_nextkey() frees the old key's dptr, so no free() is needed here. */
	for (e = ntdb_firstkey(ntdb, &k); !e; e = ntdb_nextkey(ntdb, &k)) {
		printf("Got key!\n");
	}
}
- Unlike tdb_open/tdb_open_ex, ntdb_open does not allow NULL names,
even for NTDB_INTERNAL dbs, and thus ntdb_name() never returns NULL.
Example:
#include <tdb.h>
#include <ntdb.h>
/* tdb allowed a NULL name when opening a TDB_INTERNAL database. */
struct tdb_context *tdb_example(void)
{
return tdb_open(NULL, 0, TDB_INTERNAL, O_RDWR, 0);
}
/* ntdb needs a real name, even for an NTDB_INTERNAL database. */
struct ntdb_context *ntdb_example(void)
{
	/* Arguments: name, ntdb flags, open flags, mode, attributes (none). */
	return ntdb_open("example", NTDB_INTERNAL, O_RDWR, 0, NULL);
}
- ntdb uses a linked list of attribute structures to implement logging and
alternate hashes. tdb used tdb_open_ex, which was not extensible.
Example:
#include <tdb.h>
#include <ntdb.h>
/* Custom hash function: returns the key's size (dsize) as the hash value. */
static unsigned int my_tdb_hash_func(TDB_DATA *key)
{
return key->dsize;
}
struct tdb_context *tdb_example(void)
{
return tdb_open_ex("example.tdb", 0, TDB_DEFAULT,
O_CREAT|O_RDWR, 0600, NULL, my_hash_func);
}
/* Custom hash function: the hash is simply the key length; the key bytes,
 * seed and private data pointer are not consulted. */
static unsigned int my_ntdb_hash_func(const void *key, size_t len,
				      uint32_t seed, void *data)
{
	unsigned int hash = len;

	return hash;
}
struct ntdb_context *ntdb_example(void)
{
union ntdb_attribute hash;
hash.base.attr = NTDB_ATTRIBUTE_HASH;
hash.base.next = NULL;
hash.hash.fn = my_ntdb_hash_func;
return ntdb_open("example.ntdb", NTDB_DEFAULT,
O_CREAT|O_RDWR, 0600, &hash);
}
- tdb's tdb_open/tdb_open_ex took an explicit hash size, defaulting to
131. ntdb uses an attribute for this, defaulting to 8192.
Example:
#include <tdb.h>
#include <ntdb.h>
/* tdb takes the hash size (10007 here) as the second tdb_open() argument. */
struct tdb_context *tdb_example(void)
{
return tdb_open("example.tdb", 10007, TDB_DEFAULT,
O_CREAT|O_RDWR, 0600);
}
struct ntdb_context *ntdb_example(void)
{
union ntdb_attribute hashsize;
hashsize.base.attr = NTDB_ATTRIBUTE_HASHSIZE;
hashsize.base.next = NULL;
hashsize.hashsize.size = 16384;
return ntdb_open("example.ntdb", NTDB_DEFAULT,
O_CREAT|O_RDWR, 0600, &hashsize);
}
- ntdb does locking on read-only databases (ie. O_RDONLY passed to ntdb_open).
tdb did not: use the NTDB_NOLOCK flag if you want to suppress locking.
- ntdb's log function is simpler than tdb's log function. The string is
already formatted, and it takes an enum ntdb_log_level not a tdb_debug_level,
and which has only three values: NTDB_LOG_ERROR, NTDB_LOG_USE_ERROR and
NTDB_LOG_WARNING.
Example:
#include <tdb.h>
#include <ntdb.h>
/* Read-only open; tdb did no locking on read-only databases. */
struct tdb_context *tdb_example(void)
{
return tdb_open("example.tdb", 0, TDB_DEFAULT, O_RDONLY, 0);
}
/* Read-only open; NTDB_NOLOCK suppresses the locking ntdb would otherwise
 * do on a read-only database. */
struct ntdb_context *ntdb_example(void)
{
	/* Arguments: name, ntdb flags, open flags, mode, attributes (none). */
	return ntdb_open("example.ntdb", NTDB_NOLOCK, O_RDONLY, 0, NULL);
}
- ntdb's log function is simpler than tdb's log function. The string
is already formatted, is not terminated by a '\n', and it takes an
enum ntdb_log_level, not a tdb_debug_level; it has only three
values: NTDB_LOG_ERROR, NTDB_LOG_USE_ERROR and NTDB_LOG_WARNING.
#include <tdb.h>
#include <ntdb.h>
/* tdb log callback: receives a printf-style format plus arguments and must
 * do the formatting itself.  Output goes to stderr with a severity prefix
 * and the database name. */
static void tdb_log(struct tdb_context *tdb,
		    enum tdb_debug_level level, const char *fmt, ...)
{
	const char *db_name;
	va_list args;

	if (level == TDB_DEBUG_TRACE) {
		/* Don't print out tracing. */
		return;
	}

	if (level == TDB_DEBUG_FATAL) {
		fprintf(stderr, "FATAL: ");
	} else if (level == TDB_DEBUG_ERROR) {
		fprintf(stderr, "ERROR: ");
	} else if (level == TDB_DEBUG_WARNING) {
		fprintf(stderr, "WARNING: ");
	}

	/* tdb_name() may return NULL, so substitute a placeholder. */
	db_name = tdb_name(tdb);
	if (db_name == NULL) {
		db_name = "unnamed";
	}
	fprintf(stderr, "tdb(%s):", db_name);

	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);
}
struct tdb_context *tdb_example(void)
{
struct tdb_logging_context lctx;
lctx.log_fn = tdb_log;
return tdb_open_ex("example.tdb", 0, TDB_DEFAULT,
O_CREAT|O_RDWR, 0600, &lctx, NULL);
}
/* ntdb log callback: the message arrives already formatted and without a
 * trailing newline, so one is appended here. */
static void ntdb_log(struct ntdb_context *ntdb,
		     enum ntdb_log_level level,
		     enum NTDB_ERROR ecode,
		     const char *message,
		     void *data)
{
	if (level == NTDB_LOG_USE_ERROR) {
		/* We made a mistake, so abort. */
		abort();
	}

	if (level == NTDB_LOG_ERROR) {
		fprintf(stderr, "ERROR: ");
	} else if (level == NTDB_LOG_WARNING) {
		fprintf(stderr, "WARNING: ");
	}

	fprintf(stderr, "ntdb(%s):%s:%s\n",
		ntdb_name(ntdb), ntdb_errorstr(ecode), message);
}
struct ntdb_context *ntdb_example(void)
{
union ntdb_attribute log;
log.base.attr = NTDB_ATTRIBUTE_LOG;
log.base.next = NULL;
log.log.fn = ntdb_log;
return ntdb_open("example.ntdb", NTDB_DEFAULT,
O_CREAT|O_RDWR, 0600, &log);
}
- ntdb provides ntdb_deq() for comparing two NTDB_DATA, and ntdb_mkdata() for
creating an NTDB_DATA.
- ntdb's ntdb_name() returns a copy of the name even for NTDB_INTERNAL dbs.
#include <tdb.h>
#include <ntdb.h>
/* Fetch the record stored under "hello" and compare it with the key by
 * hand, using the size fields and memcmp(). */
void tdb_example(struct tdb_context *tdb)
{
	TDB_DATA data, key;

	key.dsize = strlen("hello");
	key.dptr = "hello";
	data = tdb_fetch(tdb, key);
	/* The size check comes first, so memcmp() only runs on equal lengths. */
	if (data.dsize == key.dsize
	    && !memcmp(data.dptr, key.dptr, key.dsize)) {
		printf("key is same as data\n");
	}
	free(data.dptr);
}
/* Same comparison with the ntdb helpers: ntdb_mkdata() builds the key and
 * ntdb_deq() compares the two NTDB_DATA values. */
void ntdb_example(struct ntdb_context *ntdb)
{
	NTDB_DATA key, data;

	key = ntdb_mkdata("hello", strlen("hello"));
	if (ntdb_fetch(ntdb, key, &data) != NTDB_SUCCESS)
		return;

	if (ntdb_deq(key, data))
		printf("key is same as data\n");

	/* Only reached after a successful fetch. */
	free(data.dptr);
}
- Failure inside a transaction (such as a lock function failing) does
not implicitly cancel the transaction; you still need to call
ntdb_transaction_cancel().
#include <tdb.h>
#include <ntdb.h>
/* Store one record inside a tdb transaction.  The three steps run in
 * order and the first one to return -1 is reported; later steps are
 * skipped. */
void tdb_example(struct tdb_context *tdb, TDB_DATA key, TDB_DATA d)
{
	if (tdb_transaction_start(tdb) == -1) {
		printf("transaction failed: %s\n", tdb_errorstr(tdb));
	} else if (tdb_store(tdb, key, d) == -1) {
		printf("store failed: %s\n", tdb_errorstr(tdb));
	} else if (tdb_transaction_commit(tdb) == -1) {
		printf("commit failed: %s\n", tdb_errorstr(tdb));
	}
}
/* Store one record inside an ntdb transaction, cancelling the transaction
 * explicitly if the store fails. */
void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA d)
{
	enum NTDB_ERROR e;

	e = ntdb_transaction_start(ntdb);
	if (e) {
		printf("transaction failed: %s\n", ntdb_errorstr(e));
		return;
	}

	e = ntdb_store(ntdb, key, d);
	if (e) {
		printf("store failed: %s\n", ntdb_errorstr(e));
		/* A failure does not cancel the transaction implicitly. */
		ntdb_transaction_cancel(ntdb);
		/* Nothing is left to commit once the transaction is cancelled. */
		return;
	}

	e = ntdb_transaction_commit(ntdb);
	if (e) {
		printf("commit failed: %s\n", ntdb_errorstr(e));
	}
}
- There is no NTDB_CLEAR_IF_FIRST flag; it has severe scalability and
API problems. If necessary, you can emulate this by using the open
hook and placing a 1-byte lock at offset 4. If your program forks
and exits, you will need to place this lock again in the child before
the parent exits.
Example:
#include <tdb.h>
#include <ntdb.h>
/* tdb: TDB_CLEAR_IF_FIRST is passed directly in the tdb flags. */
struct tdb_context *tdb_example(void)
{
return tdb_open("example.tdb", 0, TDB_CLEAR_IF_FIRST,
O_CREAT|O_RDWR, 0600);
}
/* Open hook emulating TDB_CLEAR_IF_FIRST.  Returns NTDB_ERR_IO if the
 * truncate or the final lock fails, NTDB_SUCCESS otherwise. */
static enum NTDB_ERROR clear_if_first(int fd, void *unused)
{
	struct flock lock;

	/* We hold a lock offset 4 always, so we can tell if
	 * anyone else is. */
	lock.l_whence = SEEK_SET;
	lock.l_start = 4; /* ACTIVE_LOCK */
	lock.l_len = 1;
	lock.l_type = F_WRLCK;

	/* Getting the write lock without waiting means we must be the
	 * first ones to open it, so clear it. */
	if (fcntl(fd, F_SETLK, &lock) == 0 && ftruncate(fd, 0) != 0) {
		return NTDB_ERR_IO;
	}

	/* Now take a read lock on the same byte, waiting if necessary. */
	lock.l_type = F_RDLCK;
	return fcntl(fd, F_SETLKW, &lock) != 0 ? NTDB_ERR_IO : NTDB_SUCCESS;
}
struct ntdb_context *ntdb_example(void)
{
union ntdb_attribute open_attr;
open_attr.openhook.base.attr = NTDB_ATTRIBUTE_OPENHOOK;
open_attr.openhook.base.next = NULL;
open_attr.openhook.fn = clear_if_first;
return ntdb_open("example.ntdb", NTDB_DEFAULT,
O_CREAT|O_RDWR, 0600, &open_attr);
}
- ntdb traversals are not reliable if the database is changed during
the traversal, ie your traversal may not cover all elements, or may
cover elements multiple times. As a special exception, deleting the
current record within ntdb_traverse() is reliable.
- There is no ntdb_traverse_read, since ntdb_traverse does not hold
a lock across the entire traversal anyway. If you want to make sure
that your traversal function does not write to the database, you can
set and clear the NTDB_RDONLY flag around the traversal.
- ntdb does not need tdb_reopen() or tdb_reopen_all(). If you call
fork() during certain operations the child should close the
tdb, or complete the operations before continuing to use the tdb:
ntdb, or complete the operations before continuing to use the tdb:
ntdb_transaction_start(): child must ntdb_transaction_cancel()
ntdb_lockall(): child must call ntdb_unlockall()
@ -46,19 +422,5 @@ Interface differences between TDB and NTDB.
ntdb_chainlock(): child must call ntdb_chainunlock()
ntdb_parse() callback: child must return from ntdb_parse()
- ntdb will not open a non-tdb file, even if O_CREAT is specified.
- There is no ntdb_traverse_read. For operating on TDB files, you can
simulate it by ntdb_add_flag(tdb, NTDB_RDONLY); ntdb_traverse();
ntdb_remove_flag(tdb, NTDB_RDONLY). This may be desirable because
traverse on TDB files use a write lock on the entire database
unless it's read-only.
- Failure inside a transaction (such as a lock function failing) does
not implicitly cancel the transaction; you still need to call
ntdb_transaction_cancel().
- There is no NTDB_CLEAR_IF_FIRST flag; it has severe scalability and
API problems. If necessary, you can emulate this by using the open
hook and placing a 1-byte lock at offset 4. If your program forks,
you will need to place this lock again in the child.
- ntdb will not open a non-ntdb file, even if O_CREAT is specified. tdb
will overwrite an unknown file in that case.

File diff suppressed because it is too large Load Diff

View File

@ -1,48 +1,66 @@
#LyX 1.6.7 created this file. For more info see http://www.lyx.org/
\lyxformat 345
#LyX 2.0 created this file. For more info see http://www.lyx.org/
\lyxformat 413
\begin_document
\begin_header
\textclass article
\use_default_options true
\maintain_unincluded_children false
\language english
\language_package default
\inputencoding auto
\fontencoding global
\font_roman default
\font_sans default
\font_typewriter default
\font_default_family default
\use_non_tex_fonts false
\font_sc false
\font_osf false
\font_sf_scale 100
\font_tt_scale 100
\graphics default
\default_output_format default
\output_sync 0
\bibtex_command default
\index_command default
\paperfontsize default
\use_hyperref false
\papersize default
\use_geometry false
\use_amsmath 1
\use_esint 1
\use_mhchem 1
\use_mathdots 1
\cite_engine basic
\use_bibtopic false
\use_indices false
\paperorientation portrait
\suppress_date false
\use_refstyle 0
\index Index
\shortcut idx
\color #008000
\end_index
\secnumdepth 3
\tocdepth 3
\paragraph_separation indent
\defskip medskip
\paragraph_indentation default
\quotes_language english
\papercolumns 1
\papersides 1
\paperpagestyle default
\tracking_changes true
\output_changes true
\author ""
\author ""
\html_math_output 0
\html_css_as_file 0
\html_be_strict false
\end_header
\begin_body
\begin_layout Title
TDB2: A Redesigning The Trivial DataBase
NTDB: Redesigning The Trivial DataBase
\end_layout
\begin_layout Author
@ -50,7 +68,7 @@ Rusty Russell, IBM Corporation
\end_layout
\begin_layout Date
17-March-2011
19 June 2012
\end_layout
\begin_layout Abstract
@ -87,7 +105,7 @@ The wider variety and greater demands of TDB-using code has lead to some
\begin_layout Standard
\begin_inset Tabular
<lyxtabular version="3" rows="12" columns="3">
<features>
<features tabularvalignment="middle">
<column alignment="center" valignment="top" width="0">
<column alignment="center" valignment="top" width="0">
<column alignment="center" valignment="top" width="0">
@ -453,6 +471,20 @@ This review is an attempt to catalog and address all the known issues with
second system syndrome in rewriting a successful project like this.
\end_layout
\begin_layout Standard
Note: the final decision was to make ntdb a separate library, with a separate
'ntdb' namespace so both can potentially be linked together.
This document still refers to
\begin_inset Quotes eld
\end_inset
tdb
\begin_inset Quotes erd
\end_inset
everywhere, for simplicity.
\end_layout
\begin_layout Section
API Issues
\end_layout
@ -960,7 +992,6 @@ There are several issues with this approach.
have under some circumstances.
I don't believe this is currently the case, but it constrains the implementatio
n.
\end_layout
\begin_layout Subsubsection
@ -1025,7 +1056,7 @@ Status
\end_layout
\begin_layout Standard
Incomplete.
Complete.
\end_layout
\begin_layout Subsection
@ -1114,7 +1145,7 @@ Status
\end_layout
\begin_layout Standard
Incomplete.
Complete.
\end_layout
\begin_layout Subsection
@ -1291,6 +1322,7 @@ Status
\begin_layout Standard
Complete.
An open hook is provided to replicate this functionality if required.
\end_layout
\begin_layout Subsection
@ -1433,7 +1465,7 @@ Status
\end_layout
\begin_layout Standard
Deferred.
Complete, using the NTDB_ATTRIBUTE_ALLOCATOR attribute.
\end_layout
\begin_layout Section
@ -1661,7 +1693,12 @@ Status
\end_layout
\begin_layout Standard
Complete.
Ignore.
Scaling the hash automatically proved inefficient at small hash sizes;
we default to an 8192-element hash (changeable via NTDB_ATTRIBUTE_HASHSIZE),
and when buckets clash we expand to an array of hash entries.
This scales slightly better than the tdb chain (due to the 8 top bits containin
g extra hash).
\end_layout
\begin_layout Subsection
@ -1738,7 +1775,6 @@ If it's more than max_dead, bulk free all the dead ones (similar to steps
\begin_layout Enumerate
Simply mark this record as dead and return.
\end_layout
\end_deeper
@ -1920,7 +1956,6 @@ reference "sub:Records-Incur-A"
\end_inset
.
\end_layout
\begin_layout Standard
@ -2357,7 +2392,11 @@ TDB Does Not Have Snapshot Support
\end_layout
\begin_layout Subsubsection
Proposed SolutionNone.
Proposed Solution
\end_layout
\begin_layout Standard
None.
At some point you say
\begin_inset Quotes eld
\end_inset
@ -2666,7 +2705,6 @@ name "replay-attribute"
\begin_layout Standard
Tridge points out that an attribute can be later added to tdb_open (see
\begin_inset CommandInset ref
LatexCommand ref
reference "attributes"

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -1,8 +1,8 @@
TDB2: A Redesigning The Trivial DataBase
NTDB: Redesigning The Trivial DataBase
Rusty Russell, IBM Corporation
1-December-2010
19 June 2012
Abstract
@ -65,6 +65,11 @@ without significantly increasing complexity; all involved are far
too aware of the dangers of second system syndrome in rewriting a
successful project like this.
Note: the final decision was to make ntdb a separate library,
with a separate 'ntdb' namespace so both can potentially be
linked together. This document still refers to “tdb” everywhere,
for simplicity.
2 API Issues
2.1 tdb_open_ex Is Not Expandable
@ -182,7 +187,7 @@ This flag can also be changed at runtime.
2.3.1 Proposed Solution
Given the usage patterns, it seems that the “least-surprise”
Given the usage patterns, it seems that the “least-surprise”
behavior of disallowing nested transactions should become the
default. Additionally, it seems the outer transaction is the only
code which knows whether inner transactions should be allowed, so
@ -193,7 +198,7 @@ expanded for this relatively-obscure case.
2.3.2 Status
Incomplete; nesting flag is still defined as per tdb1.
Complete; the nesting flag has been removed.
2.4 Incorrect Hash Function is Not Detected
@ -217,7 +222,7 @@ Complete.
In response to scalability issues with the free list ([TDB-Freelist-Is]
) two API workarounds have been incorporated in TDB:
tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The
latter actually calls the former with an argument of “5”.
latter actually calls the former with an argument of “5”.
This code allows deleted records to accumulate without putting
them in the free list. On delete we iterate through each chain
@ -235,8 +240,8 @@ will become a no-op.
2.5.2 Status
Incomplete. TDB_VOLATILE still defined, but implementation should
fail on unknown flags to be future-proof.
Complete. Unknown flags cause tdb_open() to fail as well, so they
can be detected at runtime.
2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times
In The Same Process
@ -275,7 +280,7 @@ to allow other to create such an API.
2.6.2 Status
Incomplete.
Complete.
2.7 TDB API Is Not POSIX Thread-safe
@ -283,19 +288,19 @@ The TDB API uses an error code which can be queried after an
operation to determine what went wrong. This programming model
does not work with threads, unless specific additional guarantees
are given by the implementation. In addition, even
otherwise-independent threads cannot open the same TDB (as in [TDB-Files-Cannot]
otherwise-independent threads cannot open the same TDB (as in [TDB-Files-Cannot]
).
2.7.1 Proposed Solution
Rearchitecting the API to include a tdb_errcode pointer would be a
great deal of churn; we are better to guarantee that the
tdb_errcode is per-thread so the current programming model can be
maintained.
This requires dynamic per-thread allocations, which is awkward
with POSIX threads (pthread_key_create space is limited and we
cannot simply allocate a key for every TDB).
great deal of churn, but fortunately most functions return 0 on
success and -1 on error: we can change these to return 0 on
success and a negative error code on error, and the API remains
similar to previous. The tdb_fetch, tdb_firstkey and tdb_nextkey
functions need to take a TDB_DATA pointer and return an error
code. It is also simpler to have tdb_nextkey replace its key
argument in place, freeing up any old .dptr.
Internal locking is required to make sure that fcntl locks do not
overlap between threads, and also that the global list of tdbs is
@ -304,12 +309,13 @@ maintained.
The aim is that building tdb with -DTDB_PTHREAD will result in a
pthread-safe version of the library, and otherwise no overhead
will exist. Alternatively, a hooking mechanism similar to that
proposed for [Proposed-Solution-locking-hook] could be used to
proposed for [Proposed-Solution-locking-hook] could be used to
enable pthread locking at runtime.
2.7.2 Status
Incomplete.
Incomplete; API has been changed but thread safety has not been
implemented.
2.8 *_nonblock Functions And *_mark Functions Expose
Implementation
@ -375,7 +381,7 @@ it is needed.
2.8.2 Status
Incomplete.
Complete.
2.9 tdb_chainlock Functions Expose Implementation
@ -427,7 +433,7 @@ otherwise EAGAIN.
2.10.2 Status
Incomplete.
Complete.
2.11 The API Uses Gratuitous Typedefs, Capitals
@ -477,7 +483,7 @@ Complete.
2.13 Various Callback Functions Are Not Typesafe
The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take]
The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take]
is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read
and tdb_check all take void * and must internally convert it to
the argument type they were expecting.
@ -499,7 +505,7 @@ http://ccan.ozlabs.org/info/typesafe_cb.html
2.13.2 Status
Incomplete.
Complete.
2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens,
tdb_reopen_all Problematic
@ -519,12 +525,12 @@ it alone has opened the TDB and will erase it.
2.14.1 Proposed Solution
Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but
see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
2.14.2 Status
Incomplete, TDB_CLEAR_IF_FIRST still defined, but not
implemented.
Complete. An open hook is provided to replicate this
functionality if required.
2.15 Extending The Header Is Difficult
@ -537,7 +543,7 @@ not.
2.15.1 Proposed Solution
The header should contain a “format variant” value (64-bit). This
The header should contain a “format variant” value (64-bit). This
is divided into two 32-bit parts:
1. The lower part reflects the format variant understood by code
@ -558,7 +564,7 @@ writes to the database.
2.15.2 Status
Incomplete.
Complete.
2.16 Record Headers Are Not Expandible
@ -576,7 +582,7 @@ would know the extension is not present on that record.
2.16.2 Status
Incomplete.
Complete.
2.17 TDB Does Not Use Talloc
@ -589,10 +595,10 @@ conveniently.
The allocation within TDB is not complicated enough to justify
the use of talloc, and I am reluctant to force another
(excellent) library on TDB users. Nonetheless a compromise is
possible. An attribute (see [attributes]) can be added later to
possible. An attribute (see [attributes]) can be added later to
tdb_open() to provide an alternate allocation mechanism,
specifically for talloc but usable by any other allocator (which
would ignore the “context” argument).
would ignore the “context” argument).
This would form a talloc hierarchy as expected, but the caller
would still have to attach a destructor to the tdb context
@ -602,7 +608,7 @@ manage them (using talloc_free() or talloc_steal()).
2.17.2 Status
Deferred.
Complete, using the NTDB_ATTRIBUTE_ALLOCATOR attribute.
3 Performance And Scalability Issues
@ -635,11 +641,11 @@ can simply unlink the old tdb at that point.
3.1.2 Status
Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
Complete.
3.2 TDB Files Have a 4G Limit
This seems to be becoming an issue (so much for “trivial”!),
This seems to be becoming an issue (so much for “trivial”!),
particularly for ldb.
3.2.1 Proposed Solution
@ -679,7 +685,7 @@ Record sizes will be 64 bit, with an error returned on 32 bit
platforms which try to access such records (the current
implementation would return TDB_ERR_OOM in a similar case). It
seems unlikely that 32 bit keys will be a limitation, so the
implementation may not support this (see [sub:Records-Incur-A]).
implementation may not support this (see [sub:Records-Incur-A]).
3.3.2 Status
@ -728,7 +734,11 @@ invalid.
3.4.2 Status
Complete.
Ignore. Scaling the hash automatically proved inefficient at
small hash sizes; we default to an 8192-element hash (changeable
via NTDB_ATTRIBUTE_HASHSIZE), and when buckets clash we expand to
an array of hash entries. This scales slightly better than the
tdb chain (due to the 8 top bits containing extra hash).
3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
@ -783,7 +793,7 @@ Deleting a record occurs as follows:
7. Otherwise, prepend ourselves to the free list.
Disabling right-merging (step [right-merging]) causes
Disabling right-merging (step [right-merging]) causes
fragmentation; the other heuristics proved insufficient to
address this, so the final answer to this was that when we expand
the TDB file inside a transaction commit, we repack the entire
@ -812,7 +822,7 @@ zone) which produces too many clashes for our hash table to
handle well, and for coalescing we search by address. Thus an
array of doubly-linked free lists seems preferable.
There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fragmented]
There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fragmented]
) but it's not clear this would reduce contention in the common
case where all processes are allocating/freeing the same size.
Thus we almost certainly need to divide in other ways: the most
@ -822,7 +832,7 @@ ordering.
Unfortunately it is difficult to know what heuristics should be
used to determine zone sizes, and our transaction code relies on
being able to create a “recovery area” by simply appending to the
being able to create a “recovery area” by simply appending to the
file (difficult if it would need to create a new zone header).
Thus we use a linked-list of free tables; currently we only ever
create one, but if there is more than one we choose one at random
@ -862,9 +872,9 @@ coalescing at this point:
This optimizes rapid insert/delete of free list entries by not
coalescing them all the time. First-fit address ordering
seems to be fairly good for keeping fragmentation low
(see [sub:TDB-Becomes-Fragmented]). Note that address ordering
(see [sub:TDB-Becomes-Fragmented]). Note that address ordering
does not need a tailer to coalesce, though if we needed one we
could have one cheaply: see [sub:Records-Incur-A].
could have one cheaply: see [sub:Records-Incur-A].
Each free entry has the free table number in the header: less
than 255. It also contains a doubly-linked list for easy
@ -884,7 +894,7 @@ db when a transaction commit needs to enlarge the file.
The 25% overhead on allocation works in practice for ldb because
indexes tend to expand by one record at a time. This internal
fragmentation can be resolved by having an “expanded” bit in the
fragmentation can be resolved by having an “expanded” bit in the
header to note entries that have previously expanded, and
allocating more space for them.
@ -970,13 +980,13 @@ block:
scale as fast as data, so I'm assuming a maximum key size of 32
bits.
4. 'full_hash' is used to avoid a memcmp on the “miss” case, but
4. 'full_hash' is used to avoid a memcmp on the “miss” case, but
this is diminishing returns after a handful of bits (at 10
bits, it reduces 99.9% of false memcmp). As an aside, as the
lower bits are already incorporated in the hash table
resolution, the upper bits should be used here. Note that it's
not clear that these bits will be a win, given the extra bits
in the hash table itself (see [sub:Hash-Size-Solution]).
in the hash table itself (see [sub:Hash-Size-Solution]).
5. 'magic' does not need to be enlarged: it currently reflects
one of 5 values (used, free, dead, recovery, and
@ -1094,8 +1104,10 @@ Deferred.
3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support
3.9.1 Proposed SolutionNone. At some point you say “use a real
database” (but see [replay-attribute]).
3.9.1 Proposed Solution
None. At some point you say “use a real database” (but see [replay-attribute]
).
But as a thought experiment, if we implemented transactions to
only overwrite free entries (this is tricky: there must not be a
@ -1128,7 +1140,7 @@ failed.
3.10.1 Proposed Solution
None (but see [replay-attribute]). We could solve a small part of
None (but see [replay-attribute]). We could solve a small part of
the problem by providing read-only transactions. These would
allow one write transaction to begin, but it could not commit
until all r/o transactions are done. This would require a new
@ -1175,7 +1187,7 @@ indefinitely.
3.12.1 Proposed Solution
Remove reliability guarantees; see [traverse-Proposed-Solution].
Remove reliability guarantees; see [traverse-Proposed-Solution].
3.12.2 Status
@ -1214,7 +1226,7 @@ normal (fast) usage, and occasionally empties the results into a
transactional TDB. This kind of usage prioritizes performance
over durability: as long as we are consistent, data can be lost.
This would be more neatly implemented inside tdb: a “soft”
This would be more neatly implemented inside tdb: a “soft”
transaction commit (ie. syncless) which meant that data may be
reverted on a crash.
@ -1226,12 +1238,12 @@ Unfortunately any transaction scheme which overwrites old data
requires a sync before that overwrite to avoid the possibility of
corruption.
It seems possible to use a scheme similar to that described in [sub:TDB-Does-Not]
It seems possible to use a scheme similar to that described in [sub:TDB-Does-Not]
,where transactions are committed without overwriting existing
data, and an array of top-level pointers were available in the
header. If the transaction is “soft” then we would not need a
sync at all: existing processes would pick up the new hash table
and free list and work with that.
header. If the transaction is “soft” then we would not need a sync
at all: existing processes would pick up the new hash table and
free list and work with that.
At some later point, a sync would allow recovery of the old data
into the free lists (perhaps when the array of top-level pointers
@ -1249,7 +1261,7 @@ so it can coordinate cluster-wide transactions.
3.15.1 Proposed Solution<replay-attribute>
Tridge points out that an attribute can be later added to
tdb_open (see [attributes]) to provide replay/trace hooks, which
tdb_open (see [attributes]) to provide replay/trace hooks, which
could become the basis for this and future parallel transactions
and snapshot support.