mirror of
https://github.com/samba-team/samba.git
synced 2024-12-23 17:34:34 +03:00
ntdb: update documentation.
Update the design.lyx file with the latest status and the change in hashing. Also, refresh and add examples to the TDB_porting.txt file. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
This commit is contained in:
parent
b888bc4316
commit
5ff92d8f7d
@ -6,39 +6,415 @@ Interface differences between TDB and NTDB.
|
||||
otherwise you'll get a compile error when tdb.h re-defined struct
|
||||
TDB_DATA.
|
||||
|
||||
Example:
|
||||
#include <tdb.h>
|
||||
#include <ntdb.h>
|
||||
|
||||
- ntdb functions return NTDB_SUCCESS (ie 0) on success, and a negative
|
||||
error on failure, whereas tdb functions returned 0 on success, and
|
||||
-1 on failure. tdb then used tdb_error() to determine the error;
|
||||
this API is nasty if we ever want to support threads, so is not supported.
|
||||
|
||||
Example:
|
||||
#include <tdb.h>
|
||||
#include <ntdb.h>
|
||||
|
||||
void tdb_example(struct tdb_context *tdb, TDB_DATA key, TDB_DATA d)
|
||||
{
|
||||
if (tdb_store(tdb, key, d) == -1) {
|
||||
printf("store failed: %s\n", tdb_errorstr(tdb));
|
||||
}
|
||||
}
|
||||
|
||||
void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA d)
|
||||
{
|
||||
enum NTDB_ERROR e;
|
||||
|
||||
e = ntdb_store(ntdb, key, d);
|
||||
if (e) {
|
||||
printf("store failed: %s\n", ntdb_errorstr(e));
|
||||
}
|
||||
}
|
||||
|
||||
- ntdb's ntdb_fetch() returns an error, tdb's returned the data directly
|
||||
(or tdb_null, and you were supposed to check tdb_error() to find out why).
|
||||
|
||||
Example:
|
||||
#include <tdb.h>
|
||||
#include <ntdb.h>
|
||||
|
||||
void tdb_example(struct tdb_context *tdb, TDB_DATA key)
|
||||
{
|
||||
TDB_DATA data;
|
||||
|
||||
data = tdb_fetch(tdb, key);
|
||||
if (!data.dptr) {
|
||||
printf("fetch failed: %s\n", tdb_errorstr(tdb));
|
||||
}
|
||||
}
|
||||
|
||||
void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key)
|
||||
{
|
||||
NTDB_DATA data;
|
||||
enum NTDB_ERROR e;
|
||||
|
||||
e = ntdb_fetch(ntdb, key, &data);
|
||||
if (e) {
|
||||
printf("fetch failed: %s\n", ntdb_errorstr(e));
|
||||
}
|
||||
}
|
||||
|
||||
- ntdb's ntdb_nextkey() frees the old key's dptr, in tdb you needed to do
|
||||
this manually.
|
||||
|
||||
- tdb's tdb_open/tdb_open_ex took an explicit hash size. ntdb's hash table
|
||||
resizes as required.
|
||||
Example:
|
||||
#include <tdb.h>
|
||||
#include <ntdb.h>
|
||||
|
||||
void tdb_example(struct tdb_context *tdb)
|
||||
{
|
||||
TDB_DATA key, next, data;
|
||||
|
||||
for (key = tdb_firstkey(tdb); key.dptr; key = next) {
|
||||
printf("Got key!\n");
|
||||
next = tdb_nextkey(tdb, key);
|
||||
free(key.dptr);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void ntdb_example(struct ntdb_context *ntdb)
|
||||
{
|
||||
NTDB_DATA k, data;
|
||||
enum NTDB_ERROR e;
|
||||
|
||||
for (e = ntdb_firstkey(ntdb,&k); !e; e = ntdb_nextkey(ntdb,&k))
|
||||
printf("Got key!\n");
|
||||
}
|
||||
|
||||
- Unlike tdb_open/tdb_open_ex, ntdb_open does not allow NULL names,
|
||||
even for NTDB_INTERNAL dbs, and thus ntdb_name() never returns NULL.
|
||||
|
||||
Example:
|
||||
#include <tdb.h>
|
||||
#include <ntdb.h>
|
||||
|
||||
struct tdb_context *tdb_example(void)
|
||||
{
|
||||
return tdb_open(NULL, 0, TDB_INTERNAL, O_RDWR, 0);
|
||||
}
|
||||
|
||||
struct ntdb_context *ntdb_example(void)
|
||||
{
|
||||
return ntdb_open("example", NTDB_INTERNAL, O_RDWR, 0, NULL);
|
||||
}
|
||||
|
||||
- ntdb uses a linked list of attribute structures to implement logging and
|
||||
alternate hashes. tdb used tdb_open_ex, which was not extensible.
|
||||
|
||||
Example:
|
||||
#include <tdb.h>
|
||||
#include <ntdb.h>
|
||||
|
||||
/* Custom hash function */
|
||||
static unsigned int my_tdb_hash_func(TDB_DATA *key)
|
||||
{
|
||||
return key->dsize;
|
||||
}
|
||||
|
||||
struct tdb_context *tdb_example(void)
|
||||
{
|
||||
return tdb_open_ex("example.tdb", 0, TDB_DEFAULT,
|
||||
O_CREAT|O_RDWR, 0600, NULL, my_tdb_hash_func);
|
||||
}
|
||||
|
||||
/* Custom hash function */
|
||||
static unsigned int my_ntdb_hash_func(const void *key, size_t len,
|
||||
uint32_t seed, void *data)
|
||||
{
|
||||
return len;
|
||||
}
|
||||
|
||||
struct ntdb_context *ntdb_example(void)
|
||||
{
|
||||
union ntdb_attribute hash;
|
||||
|
||||
hash.base.attr = NTDB_ATTRIBUTE_HASH;
|
||||
hash.base.next = NULL;
|
||||
hash.hash.fn = my_ntdb_hash_func;
|
||||
return ntdb_open("example.ntdb", NTDB_DEFAULT,
|
||||
O_CREAT|O_RDWR, 0600, &hash);
|
||||
}
|
||||
|
||||
- tdb's tdb_open/tdb_open_ex took an explicit hash size, defaulting to
|
||||
131. ntdb uses an attribute for this, defaulting to 8192.
|
||||
|
||||
Example:
|
||||
#include <tdb.h>
|
||||
#include <ntdb.h>
|
||||
|
||||
struct tdb_context *tdb_example(void)
|
||||
{
|
||||
return tdb_open("example.tdb", 10007, TDB_DEFAULT,
|
||||
O_CREAT|O_RDWR, 0600);
|
||||
}
|
||||
|
||||
struct ntdb_context *ntdb_example(void)
|
||||
{
|
||||
union ntdb_attribute hashsize;
|
||||
|
||||
hashsize.base.attr = NTDB_ATTRIBUTE_HASHSIZE;
|
||||
hashsize.base.next = NULL;
|
||||
hashsize.hashsize.size = 16384;
|
||||
return ntdb_open("example.ntdb", NTDB_DEFAULT,
|
||||
O_CREAT|O_RDWR, 0600, &hashsize);
|
||||
}
|
||||
|
||||
- ntdb does locking on read-only databases (ie. O_RDONLY passed to ntdb_open).
|
||||
tdb did not: use the NTDB_NOLOCK flag if you want to suppress locking.
|
||||
|
||||
- ntdb's log function is simpler than tdb's log function. The string is
|
||||
already formatted, and it takes an enum ntdb_log_level not a tdb_debug_level,
|
||||
and which has only three values: NTDB_LOG_ERROR, NTDB_LOG_USE_ERROR and
|
||||
NTDB_LOG_WARNING.
|
||||
Example:
|
||||
#include <tdb.h>
|
||||
#include <ntdb.h>
|
||||
|
||||
struct tdb_context *tdb_example(void)
|
||||
{
|
||||
return tdb_open("example.tdb", 0, TDB_DEFAULT, O_RDONLY, 0);
|
||||
}
|
||||
|
||||
struct ntdb_context *ntdb_example(void)
|
||||
{
|
||||
return ntdb_open("example.ntdb", NTDB_NOLOCK, O_RDONLY, 0, NULL);
|
||||
}
|
||||
|
||||
- ntdb's log function is simpler than tdb's log function. The string
|
||||
is already formatted, is not terminated by a '\n', and it takes an
|
||||
enum ntdb_log_level, not a tdb_debug_level, which has only three
|
||||
values: NTDB_LOG_ERROR, NTDB_LOG_USE_ERROR and NTDB_LOG_WARNING.
|
||||
|
||||
#include <tdb.h>
|
||||
#include <ntdb.h>
|
||||
|
||||
static void tdb_log(struct tdb_context *tdb,
|
||||
enum tdb_debug_level level, const char *fmt, ...)
|
||||
{
|
||||
va_list ap;
|
||||
const char *name;
|
||||
|
||||
switch (level) {
|
||||
case TDB_DEBUG_FATAL:
|
||||
fprintf(stderr, "FATAL: ");
|
||||
break;
|
||||
case TDB_DEBUG_ERROR:
|
||||
fprintf(stderr, "ERROR: ");
|
||||
break;
|
||||
case TDB_DEBUG_WARNING:
|
||||
fprintf(stderr, "WARNING: ");
|
||||
break;
|
||||
case TDB_DEBUG_TRACE:
|
||||
/* Don't print out tracing. */
|
||||
return;
|
||||
}
|
||||
|
||||
name = tdb_name(tdb);
|
||||
if (!name) {
|
||||
name = "unnamed";
|
||||
}
|
||||
|
||||
fprintf(stderr, "tdb(%s):", name);
|
||||
|
||||
va_start(ap, fmt);
|
||||
vfprintf(stderr, fmt, ap);
|
||||
va_end(ap);
|
||||
}
|
||||
|
||||
struct tdb_context *tdb_example(void)
|
||||
{
|
||||
struct tdb_logging_context lctx;
|
||||
|
||||
lctx.log_fn = tdb_log;
|
||||
return tdb_open_ex("example.tdb", 0, TDB_DEFAULT,
|
||||
O_CREAT|O_RDWR, 0600, &lctx, NULL);
|
||||
}
|
||||
|
||||
static void ntdb_log(struct ntdb_context *ntdb,
|
||||
enum ntdb_log_level level,
|
||||
enum NTDB_ERROR ecode,
|
||||
const char *message,
|
||||
void *data)
|
||||
{
|
||||
switch (level) {
|
||||
case NTDB_LOG_ERROR:
|
||||
fprintf(stderr, "ERROR: ");
|
||||
break;
|
||||
case NTDB_LOG_USE_ERROR:
|
||||
/* We made a mistake, so abort. */
|
||||
abort();
|
||||
break;
|
||||
case NTDB_LOG_WARNING:
|
||||
fprintf(stderr, "WARNING: ");
|
||||
break;
|
||||
}
|
||||
|
||||
fprintf(stderr, "ntdb(%s):%s:%s\n",
|
||||
ntdb_name(ntdb), ntdb_errorstr(ecode), message);
|
||||
}
|
||||
|
||||
struct ntdb_context *ntdb_example(void)
|
||||
{
|
||||
union ntdb_attribute log;
|
||||
|
||||
log.base.attr = NTDB_ATTRIBUTE_LOG;
|
||||
log.base.next = NULL;
|
||||
log.log.fn = ntdb_log;
|
||||
return ntdb_open("example.ntdb", NTDB_DEFAULT,
|
||||
O_CREAT|O_RDWR, 0600, &log);
|
||||
}
|
||||
|
||||
- ntdb provides ntdb_deq() for comparing two NTDB_DATA, and ntdb_mkdata() for
|
||||
creating an NTDB_DATA.
|
||||
|
||||
- ntdb's ntdb_name() returns a copy of the name even for NTDB_INTERNAL dbs.
|
||||
#include <tdb.h>
|
||||
#include <ntdb.h>
|
||||
|
||||
void tdb_example(struct tdb_context *tdb)
|
||||
{
|
||||
TDB_DATA data, key;
|
||||
|
||||
key.dsize = strlen("hello");
|
||||
key.dptr = "hello";
|
||||
data = tdb_fetch(tdb, key);
|
||||
if (data.dsize == key.dsize
|
||||
&& !memcmp(data.dptr, key.dptr, key.dsize)) {
|
||||
printf("key is same as data\n");
|
||||
}
|
||||
free(data.dptr);
|
||||
}
|
||||
|
||||
void ntdb_example(struct ntdb_context *ntdb)
|
||||
{
|
||||
NTDB_DATA data, key;
|
||||
|
||||
key = ntdb_mkdata("hello", strlen("hello"));
|
||||
if (ntdb_fetch(ntdb, key, &data) == NTDB_SUCCESS) {
|
||||
if (ntdb_deq(key, data)) {
|
||||
printf("key is same as data\n");
|
||||
}
|
||||
free(data.dptr);
|
||||
}
|
||||
}
|
||||
|
||||
- Failure inside a transaction (such as a lock function failing) does
|
||||
not implicitly cancel the transaction; you still need to call
|
||||
ntdb_transaction_cancel().
|
||||
|
||||
#include <tdb.h>
|
||||
#include <ntdb.h>
|
||||
|
||||
void tdb_example(struct tdb_context *tdb, TDB_DATA key, TDB_DATA d)
|
||||
{
|
||||
if (tdb_transaction_start(tdb) == -1) {
|
||||
printf("transaction failed: %s\n", tdb_errorstr(tdb));
|
||||
return;
|
||||
}
|
||||
|
||||
if (tdb_store(tdb, key, d) == -1) {
|
||||
printf("store failed: %s\n", tdb_errorstr(tdb));
|
||||
return;
|
||||
}
|
||||
if (tdb_transaction_commit(tdb) == -1) {
|
||||
printf("commit failed: %s\n", tdb_errorstr(tdb));
|
||||
}
|
||||
}
|
||||
|
||||
void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA d)
|
||||
{
|
||||
enum NTDB_ERROR e;
|
||||
|
||||
e = ntdb_transaction_start(ntdb);
|
||||
if (e) {
|
||||
printf("transaction failed: %s\n", ntdb_errorstr(e));
|
||||
return;
|
||||
}
|
||||
|
||||
e = ntdb_store(ntdb, key, d);
|
||||
if (e) {
|
||||
printf("store failed: %s\n", ntdb_errorstr(e));
|
||||
ntdb_transaction_cancel(ntdb);
return;
|
||||
}
|
||||
|
||||
e = ntdb_transaction_commit(ntdb);
|
||||
if (e) {
|
||||
printf("commit failed: %s\n", ntdb_errorstr(e));
|
||||
}
|
||||
}
|
||||
|
||||
- There is no NTDB_CLEAR_IF_FIRST flag; it has severe scalability and
|
||||
API problems. If necessary, you can emulate this by using the open
|
||||
hook and placing a 1-byte lock at offset 4. If your program forks
|
||||
and exits, you will need to place this lock again in the child before
|
||||
the parent exits.
|
||||
|
||||
Example:
|
||||
|
||||
#include <tdb.h>
|
||||
#include <ntdb.h>
|
||||
|
||||
struct tdb_context *tdb_example(void)
|
||||
{
|
||||
return tdb_open("example.tdb", 0, TDB_CLEAR_IF_FIRST,
|
||||
O_CREAT|O_RDWR, 0600);
|
||||
}
|
||||
|
||||
static enum NTDB_ERROR clear_if_first(int fd, void *unused)
|
||||
{
|
||||
/* We hold a lock offset 4 always, so we can tell if
|
||||
* anyone else is. */
|
||||
struct flock fl;
|
||||
|
||||
fl.l_type = F_WRLCK;
|
||||
fl.l_whence = SEEK_SET;
|
||||
fl.l_start = 4; /* ACTIVE_LOCK */
|
||||
fl.l_len = 1;
|
||||
|
||||
if (fcntl(fd, F_SETLK, &fl) == 0) {
|
||||
/* We must be first ones to open it! Clear it. */
|
||||
if (ftruncate(fd, 0) != 0) {
|
||||
return NTDB_ERR_IO;
|
||||
}
|
||||
}
|
||||
fl.l_type = F_RDLCK;
|
||||
if (fcntl(fd, F_SETLKW, &fl) != 0) {
|
||||
return NTDB_ERR_IO;
|
||||
}
|
||||
return NTDB_SUCCESS;
|
||||
}
|
||||
|
||||
struct ntdb_context *ntdb_example(void)
|
||||
{
|
||||
union ntdb_attribute open_attr;
|
||||
|
||||
open_attr.openhook.base.attr = NTDB_ATTRIBUTE_OPENHOOK;
|
||||
open_attr.openhook.base.next = NULL;
|
||||
open_attr.openhook.fn = clear_if_first;
|
||||
|
||||
return ntdb_open("example.ntdb", NTDB_DEFAULT,
|
||||
O_CREAT|O_RDWR, 0600, &open_attr);
|
||||
}
|
||||
|
||||
- ntdb traversals are not reliable if the database is changed during
|
||||
the traversal, ie your traversal may not cover all elements, or may
|
||||
cover elements multiple times. As a special exception, deleting the
|
||||
current record within ntdb_traverse() is reliable.
|
||||
|
||||
- There is no ntdb_traverse_read, since ntdb_traverse does not hold
|
||||
a lock across the entire traversal anyway. If you want to make sure
|
||||
that your traversal function does not write to the database, you can
|
||||
set and clear the NTDB_RDONLY flag around the traversal.
|
||||
|
||||
- ntdb does not need tdb_reopen() or tdb_reopen_all(). If you call
|
||||
fork() during certain operations, the child should close the
|
||||
tdb, or complete the operations before continuing to use the tdb:
|
||||
ntdb, or complete the operations before continuing to use the tdb:
|
||||
|
||||
ntdb_transaction_start(): child must ntdb_transaction_cancel()
|
||||
ntdb_lockall(): child must call ntdb_unlockall()
|
||||
@ -46,19 +422,5 @@ Interface differences between TDB and NTDB.
|
||||
ntdb_chainlock(): child must call ntdb_chainunlock()
|
||||
ntdb_parse() callback: child must return from ntdb_parse()
|
||||
|
||||
- ntdb will not open a non-tdb file, even if O_CREAT is specified.
|
||||
|
||||
- There is no ntdb_traverse_read. For operating on TDB files, you can
|
||||
simulate it by ntdb_add_flag(tdb, NTDB_RDONLY); ntdb_traverse();
|
||||
ntdb_remove_flag(tdb, NTDB_RDONLY). This may be desirable because
|
||||
traverse on TDB files use a write lock on the entire database
|
||||
unless it's read-only.
|
||||
|
||||
- Failure inside a transaction (such as a lock function failing) does
|
||||
not implicitly cancel the transaction; you still need to call
|
||||
ntdb_transaction_cancel().
|
||||
|
||||
- There is no NTDB_CLEAR_IF_FIRST flag; it has severe scalability and
|
||||
API problems. If necessary, you can emulate this by using the open
|
||||
hook and placing a 1-byte lock at offset 4. If your program forks,
|
||||
you will need to place this lock again in the child.
|
||||
- ntdb will not open a non-ntdb file, even if O_CREAT is specified. tdb
|
||||
will overwrite an unknown file in that case.
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,48 +1,66 @@
|
||||
#LyX 1.6.7 created this file. For more info see http://www.lyx.org/
|
||||
\lyxformat 345
|
||||
#LyX 2.0 created this file. For more info see http://www.lyx.org/
|
||||
\lyxformat 413
|
||||
\begin_document
|
||||
\begin_header
|
||||
\textclass article
|
||||
\use_default_options true
|
||||
\maintain_unincluded_children false
|
||||
\language english
|
||||
\language_package default
|
||||
\inputencoding auto
|
||||
\fontencoding global
|
||||
\font_roman default
|
||||
\font_sans default
|
||||
\font_typewriter default
|
||||
\font_default_family default
|
||||
\use_non_tex_fonts false
|
||||
\font_sc false
|
||||
\font_osf false
|
||||
\font_sf_scale 100
|
||||
\font_tt_scale 100
|
||||
|
||||
\graphics default
|
||||
\default_output_format default
|
||||
\output_sync 0
|
||||
\bibtex_command default
|
||||
\index_command default
|
||||
\paperfontsize default
|
||||
\use_hyperref false
|
||||
\papersize default
|
||||
\use_geometry false
|
||||
\use_amsmath 1
|
||||
\use_esint 1
|
||||
\use_mhchem 1
|
||||
\use_mathdots 1
|
||||
\cite_engine basic
|
||||
\use_bibtopic false
|
||||
\use_indices false
|
||||
\paperorientation portrait
|
||||
\suppress_date false
|
||||
\use_refstyle 0
|
||||
\index Index
|
||||
\shortcut idx
|
||||
\color #008000
|
||||
\end_index
|
||||
\secnumdepth 3
|
||||
\tocdepth 3
|
||||
\paragraph_separation indent
|
||||
\defskip medskip
|
||||
\paragraph_indentation default
|
||||
\quotes_language english
|
||||
\papercolumns 1
|
||||
\papersides 1
|
||||
\paperpagestyle default
|
||||
\tracking_changes true
|
||||
\output_changes true
|
||||
\author ""
|
||||
\author ""
|
||||
\html_math_output 0
|
||||
\html_css_as_file 0
|
||||
\html_be_strict false
|
||||
\end_header
|
||||
|
||||
\begin_body
|
||||
|
||||
\begin_layout Title
|
||||
TDB2: A Redesigning The Trivial DataBase
|
||||
NTDB: Redesigning The Trivial DataBase
|
||||
\end_layout
|
||||
|
||||
\begin_layout Author
|
||||
@ -50,7 +68,7 @@ Rusty Russell, IBM Corporation
|
||||
\end_layout
|
||||
|
||||
\begin_layout Date
|
||||
17-March-2011
|
||||
19 June 2012
|
||||
\end_layout
|
||||
|
||||
\begin_layout Abstract
|
||||
@ -87,7 +105,7 @@ The wider variety and greater demands of TDB-using code has lead to some
|
||||
\begin_layout Standard
|
||||
\begin_inset Tabular
|
||||
<lyxtabular version="3" rows="12" columns="3">
|
||||
<features>
|
||||
<features tabularvalignment="middle">
|
||||
<column alignment="center" valignment="top" width="0">
|
||||
<column alignment="center" valignment="top" width="0">
|
||||
<column alignment="center" valignment="top" width="0">
|
||||
@ -453,6 +471,20 @@ This review is an attempt to catalog and address all the known issues with
|
||||
second system syndrome in rewriting a successful project like this.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
Note: the final decision was to make ntdb a separate library, with a separate
|
||||
'ntdb' namespace so both can potentially be linked together.
|
||||
This document still refers to
|
||||
\begin_inset Quotes eld
|
||||
\end_inset
|
||||
|
||||
tdb
|
||||
\begin_inset Quotes erd
|
||||
\end_inset
|
||||
|
||||
everywhere, for simplicity.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Section
|
||||
API Issues
|
||||
\end_layout
|
||||
@ -960,7 +992,6 @@ There are several issues with this approach.
|
||||
have under some circumstances.
|
||||
I don't believe this is currently the case, but it constrains the implementatio
|
||||
n.
|
||||
|
||||
\end_layout
|
||||
|
||||
\begin_layout Subsubsection
|
||||
@ -1025,7 +1056,7 @@ Status
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
Incomplete.
|
||||
Complete.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Subsection
|
||||
@ -1114,7 +1145,7 @@ Status
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
Incomplete.
|
||||
Complete.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Subsection
|
||||
@ -1291,6 +1322,7 @@ Status
|
||||
|
||||
\begin_layout Standard
|
||||
Complete.
|
||||
An open hook is provided to replicate this functionality if required.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Subsection
|
||||
@ -1433,7 +1465,7 @@ Status
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
Deferred.
|
||||
Complete, using the NTDB_ATTRIBUTE_ALLOCATOR attribute.
|
||||
\end_layout
|
||||
|
||||
\begin_layout Section
|
||||
@ -1661,7 +1693,12 @@ Status
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
Complete.
|
||||
Ignore.
|
||||
Scaling the hash automatically proved inefficient at small hash sizes;
|
||||
we default to an 8192-element hash (changeable via NTDB_ATTRIBUTE_HASHSIZE),
|
||||
and when buckets clash we expand to an array of hash entries.
|
||||
This scales slightly better than the tdb chain (due to the 8 top bits containin
|
||||
g extra hash).
|
||||
\end_layout
|
||||
|
||||
\begin_layout Subsection
|
||||
@ -1738,7 +1775,6 @@ If it's more than max_dead, bulk free all the dead ones (similar to steps
|
||||
|
||||
\begin_layout Enumerate
|
||||
Simply mark this record as dead and return.
|
||||
|
||||
\end_layout
|
||||
|
||||
\end_deeper
|
||||
@ -1920,7 +1956,6 @@ reference "sub:Records-Incur-A"
|
||||
\end_inset
|
||||
|
||||
.
|
||||
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
@ -2357,7 +2392,11 @@ TDB Does Not Have Snapshot Support
|
||||
\end_layout
|
||||
|
||||
\begin_layout Subsubsection
|
||||
Proposed SolutionNone.
|
||||
Proposed Solution
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
None.
|
||||
At some point you say
|
||||
\begin_inset Quotes eld
|
||||
\end_inset
|
||||
@ -2666,7 +2705,6 @@ name "replay-attribute"
|
||||
|
||||
\begin_layout Standard
|
||||
Tridge points out that an attribute can be later added to tdb_open (see
|
||||
|
||||
\begin_inset CommandInset ref
|
||||
LatexCommand ref
|
||||
reference "attributes"
|
||||
|
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@ -1,8 +1,8 @@
|
||||
TDB2: A Redesigning The Trivial DataBase
|
||||
NTDB: Redesigning The Trivial DataBase
|
||||
|
||||
Rusty Russell, IBM Corporation
|
||||
|
||||
1-December-2010
|
||||
19 June 2012
|
||||
|
||||
Abstract
|
||||
|
||||
@ -65,6 +65,11 @@ without significantly increasing complexity; all involved are far
|
||||
too aware of the dangers of second system syndrome in rewriting a
|
||||
successful project like this.
|
||||
|
||||
Note: the final decision was to make ntdb a separate library,
|
||||
with a separate 'ntdb' namespace so both can potentially be
|
||||
linked together. This document still refers to “tdb” everywhere,
|
||||
for simplicity.
|
||||
|
||||
2 API Issues
|
||||
|
||||
2.1 tdb_open_ex Is Not Expandable
|
||||
@ -182,7 +187,7 @@ This flag can also be changed at runtime.
|
||||
|
||||
2.3.1 Proposed Solution
|
||||
|
||||
Given the usage patterns, it seems that the “least-surprise”
|
||||
Given the usage patterns, it seems that the “least-surprise”
|
||||
behavior of disallowing nested transactions should become the
|
||||
default. Additionally, it seems the outer transaction is the only
|
||||
code which knows whether inner transactions should be allowed, so
|
||||
@ -193,7 +198,7 @@ expanded for this relatively-obscure case.
|
||||
|
||||
2.3.2 Status
|
||||
|
||||
Incomplete; nesting flag is still defined as per tdb1.
|
||||
Complete; the nesting flag has been removed.
|
||||
|
||||
2.4 Incorrect Hash Function is Not Detected
|
||||
|
||||
@ -217,7 +222,7 @@ Complete.
|
||||
In response to scalability issues with the free list ([TDB-Freelist-Is]
|
||||
) two API workarounds have been incorporated in TDB:
|
||||
tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The
|
||||
latter actually calls the former with an argument of “5”.
|
||||
latter actually calls the former with an argument of “5”.
|
||||
|
||||
This code allows deleted records to accumulate without putting
|
||||
them in the free list. On delete we iterate through each chain
|
||||
@ -235,8 +240,8 @@ will become a no-op.
|
||||
|
||||
2.5.2 Status
|
||||
|
||||
Incomplete. TDB_VOLATILE still defined, but implementation should
|
||||
fail on unknown flags to be future-proof.
|
||||
Complete. Unknown flags cause tdb_open() to fail as well, so they
|
||||
can be detected at runtime.
|
||||
|
||||
2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times
|
||||
In The Same Process
|
||||
@ -275,7 +280,7 @@ to allow other to create such an API.
|
||||
|
||||
2.6.2 Status
|
||||
|
||||
Incomplete.
|
||||
Complete.
|
||||
|
||||
2.7 TDB API Is Not POSIX Thread-safe
|
||||
|
||||
@ -283,19 +288,19 @@ The TDB API uses an error code which can be queried after an
|
||||
operation to determine what went wrong. This programming model
|
||||
does not work with threads, unless specific additional guarantees
|
||||
are given by the implementation. In addition, even
|
||||
otherwise-independent threads cannot open the same TDB (as in [TDB-Files-Cannot]
|
||||
otherwise-independent threads cannot open the same TDB (as in [TDB-Files-Cannot]
|
||||
).
|
||||
|
||||
2.7.1 Proposed Solution
|
||||
|
||||
Rearchitecting the API to include a tdb_errcode pointer would be a
|
||||
great deal of churn; we are better to guarantee that the
|
||||
tdb_errcode is per-thread so the current programming model can be
|
||||
maintained.
|
||||
|
||||
This requires dynamic per-thread allocations, which is awkward
|
||||
with POSIX threads (pthread_key_create space is limited and we
|
||||
cannot simply allocate a key for every TDB).
|
||||
great deal of churn, but fortunately most functions return 0 on
|
||||
success and -1 on error: we can change these to return 0 on
|
||||
success and a negative error code on error, and the API remains
|
||||
similar to previous. The tdb_fetch, tdb_firstkey and tdb_nextkey
|
||||
functions need to take a TDB_DATA pointer and return an error
|
||||
code. It is also simpler to have tdb_nextkey replace its key
|
||||
argument in place, freeing up any old .dptr.
|
||||
|
||||
Internal locking is required to make sure that fcntl locks do not
|
||||
overlap between threads, and also that the global list of tdbs is
|
||||
@ -304,12 +309,13 @@ maintained.
|
||||
The aim is that building tdb with -DTDB_PTHREAD will result in a
|
||||
pthread-safe version of the library, and otherwise no overhead
|
||||
will exist. Alternatively, a hooking mechanism similar to that
|
||||
proposed for [Proposed-Solution-locking-hook] could be used to
|
||||
proposed for [Proposed-Solution-locking-hook] could be used to
|
||||
enable pthread locking at runtime.
|
||||
|
||||
2.7.2 Status
|
||||
|
||||
Incomplete.
|
||||
Incomplete; API has been changed but thread safety has not been
|
||||
implemented.
|
||||
|
||||
2.8 *_nonblock Functions And *_mark Functions Expose
|
||||
Implementation
|
||||
@ -375,7 +381,7 @@ it is needed.
|
||||
|
||||
2.8.2 Status
|
||||
|
||||
Incomplete.
|
||||
Complete.
|
||||
|
||||
2.9 tdb_chainlock Functions Expose Implementation
|
||||
|
||||
@ -427,7 +433,7 @@ otherwise EAGAIN.
|
||||
|
||||
2.10.2 Status
|
||||
|
||||
Incomplete.
|
||||
Complete.
|
||||
|
||||
2.11 The API Uses Gratuitous Typedefs, Capitals
|
||||
|
||||
@ -477,7 +483,7 @@ Complete.
|
||||
|
||||
2.13 Various Callback Functions Are Not Typesafe
|
||||
|
||||
The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take]
|
||||
The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take]
|
||||
is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read
|
||||
and tdb_check all take void * and must internally convert it to
|
||||
the argument type they were expecting.
|
||||
@ -499,7 +505,7 @@ http://ccan.ozlabs.org/info/typesafe_cb.html
|
||||
|
||||
2.13.2 Status
|
||||
|
||||
Incomplete.
|
||||
Complete.
|
||||
|
||||
2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens,
|
||||
tdb_reopen_all Problematic
|
||||
@ -519,12 +525,12 @@ it alone has opened the TDB and will erase it.
|
||||
2.14.1 Proposed Solution
|
||||
|
||||
Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but
|
||||
see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
|
||||
see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
|
||||
|
||||
2.14.2 Status
|
||||
|
||||
Incomplete, TDB_CLEAR_IF_FIRST still defined, but not
|
||||
implemented.
|
||||
Complete. An open hook is provided to replicate this
|
||||
functionality if required.
|
||||
|
||||
2.15 Extending The Header Is Difficult
|
||||
|
||||
@ -537,7 +543,7 @@ not.
|
||||
|
||||
2.15.1 Proposed Solution
|
||||
|
||||
The header should contain a “format variant” value (64-bit). This
|
||||
The header should contain a “format variant” value (64-bit). This
|
||||
is divided into two 32-bit parts:
|
||||
|
||||
1. The lower part reflects the format variant understood by code
|
||||
@ -558,7 +564,7 @@ writes to the database.
|
||||
|
||||
2.15.2 Status
|
||||
|
||||
Incomplete.
|
||||
Complete.
|
||||
|
||||
2.16 Record Headers Are Not Expandible
|
||||
|
||||
@ -576,7 +582,7 @@ would know the extension is not present on that record.
|
||||
|
||||
2.16.2 Status
|
||||
|
||||
Incomplete.
|
||||
Complete.
|
||||
|
||||
2.17 TDB Does Not Use Talloc
|
||||
|
||||
@ -589,10 +595,10 @@ conveniently.
|
||||
The allocation within TDB is not complicated enough to justify
|
||||
the use of talloc, and I am reluctant to force another
|
||||
(excellent) library on TDB users. Nonetheless a compromise is
|
||||
possible. An attribute (see [attributes]) can be added later to
|
||||
possible. An attribute (see [attributes]) can be added later to
|
||||
tdb_open() to provide an alternate allocation mechanism,
|
||||
specifically for talloc but usable by any other allocator (which
|
||||
would ignore the “context” argument).
|
||||
would ignore the “context” argument).
|
||||
|
||||
This would form a talloc hierarchy as expected, but the caller
|
||||
would still have to attach a destructor to the tdb context
|
||||
@ -602,7 +608,7 @@ manage them (using talloc_free() or talloc_steal()).
|
||||
|
||||
2.17.2 Status
|
||||
|
||||
Deferred.
|
||||
Complete, using the NTDB_ATTRIBUTE_ALLOCATOR attribute.
|
||||
|
||||
3 Performance And Scalability Issues
|
||||
|
||||
@ -635,11 +641,11 @@ can simply unlink the old tdb at that point.
|
||||
|
||||
3.1.2 Status
|
||||
|
||||
Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
|
||||
Complete.
|
||||
|
||||
3.2 TDB Files Have a 4G Limit
|
||||
|
||||
This seems to be becoming an issue (so much for “trivial”!),
|
||||
This seems to be becoming an issue (so much for “trivial”!),
|
||||
particularly for ldb.
|
||||
|
||||
3.2.1 Proposed Solution
|
||||
@ -679,7 +685,7 @@ Record sizes will be 64 bit, with an error returned on 32 bit
|
||||
platforms which try to access such records (the current
|
||||
implementation would return TDB_ERR_OOM in a similar case). It
|
||||
seems unlikely that 32 bit keys will be a limitation, so the
|
||||
implementation may not support this (see [sub:Records-Incur-A]).
|
||||
implementation may not support this (see [sub:Records-Incur-A]).
|
||||
|
||||
3.3.2 Status
|
||||
|
||||
@ -728,7 +734,11 @@ invalid.
|
||||
|
||||
3.4.2 Status
|
||||
|
||||
Complete.
|
||||
Ignore. Scaling the hash automatically proved inefficient at
|
||||
small hash sizes; we default to an 8192-element hash (changeable
|
||||
via NTDB_ATTRIBUTE_HASHSIZE), and when buckets clash we expand to
|
||||
an array of hash entries. This scales slightly better than the
|
||||
tdb chain (due to the 8 top bits containing extra hash).
|
||||
|
||||
3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended
|
||||
|
||||
@ -783,7 +793,7 @@ Deleting a record occurs as follows:
|
||||
|
||||
7. Otherwise, prepend ourselves to the free list.
|
||||
|
||||
Disabling right-merging (step [right-merging]) causes
|
||||
Disabling right-merging (step [right-merging]) causes
|
||||
fragmentation; the other heuristics proved insufficient to
|
||||
address this, so the final answer to this was that when we expand
|
||||
the TDB file inside a transaction commit, we repack the entire
|
||||
@ -812,7 +822,7 @@ zone) which produces too many clashes for our hash table to
|
||||
handle well, and for coalescing we search by address. Thus an
|
||||
array of doubly-linked free lists seems preferable.
|
||||
|
||||
There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fragmented]
|
||||
There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fragmented]
|
||||
) but it's not clear this would reduce contention in the common
|
||||
case where all processes are allocating/freeing the same size.
|
||||
Thus we almost certainly need to divide in other ways: the most
|
||||
@ -822,7 +832,7 @@ ordering.
|
||||
|
||||
Unfortunately it is difficult to know what heuristics should be
|
||||
used to determine zone sizes, and our transaction code relies on
|
||||
being able to create a “recovery area” by simply appending to the
|
||||
being able to create a “recovery area” by simply appending to the
|
||||
file (difficult if it would need to create a new zone header).
|
||||
Thus we use a linked-list of free tables; currently we only ever
|
||||
create one, but if there is more than one we choose one at random
|
||||
@ -862,9 +872,9 @@ coalescing at this point:
|
||||
This optimizes rapid insert/delete of free list entries by not
|
||||
coalescing them all the time. First-fit address
|
||||
ordering seems to be fairly good for keeping fragmentation low
|
||||
(see [sub:TDB-Becomes-Fragmented]). Note that address ordering
|
||||
(see[sub:TDB-Becomes-Fragmented]). Note that address ordering
|
||||
does not need a tailer to coalesce, though if we needed one we
|
||||
could have one cheaply: see [sub:Records-Incur-A].
|
||||
could have one cheaply: see[sub:Records-Incur-A].
|
||||
|
||||
Each free entry has the free table number in the header: less
|
||||
than 255. It also contains a doubly-linked list for easy
|
||||
@ -884,7 +894,7 @@ db when a transaction commit needs to enlarge the file.
|
||||
|
||||
The 25% overhead on allocation works in practice for ldb because
|
||||
indexes tend to expand by one record at a time. This internal
|
||||
fragmentation can be resolved by having an “expanded” bit in the
|
||||
fragmentation can be resolved by having an“expanded” bit in the
|
||||
header to note entries that have previously expanded, and
|
||||
allocating more space for them.
|
||||
|
||||
@ -970,13 +980,13 @@ block:
|
||||
scale as fast as data, so I'm assuming a maximum key size of 32
|
||||
bits.
|
||||
|
||||
4. 'full_hash' is used to avoid a memcmp on the “miss” case, but
|
||||
4. 'full_hash' is used to avoid a memcmp on the“miss” case, but
|
||||
this is diminishing returns after a handful of bits (at 10
|
||||
bits, it reduces 99.9% of false memcmp). As an aside, as the
|
||||
lower bits are already incorporated in the hash table
|
||||
resolution, the upper bits should be used here. Note that it's
|
||||
not clear that these bits will be a win, given the extra bits
|
||||
in the hash table itself (see [sub:Hash-Size-Solution]).
|
||||
in the hash table itself (see[sub:Hash-Size-Solution]).
|
||||
|
||||
5. 'magic' does not need to be enlarged: it currently reflects
|
||||
one of 5 values (used, free, dead, recovery, and
|
||||
@ -1094,8 +1104,10 @@ Deferred.
|
||||
|
||||
3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support
|
||||
|
||||
3.9.1 Proposed SolutionNone. At some point you say “use a real
|
||||
database” (but see [replay-attribute]).
|
||||
3.9.1 Proposed Solution
|
||||
|
||||
None. At some point you say“use a real database” (but see[replay-attribute]
|
||||
).
|
||||
|
||||
But as a thought experiment, if we implemented transactions to
|
||||
only overwrite free entries (this is tricky: there must not be a
|
||||
@ -1128,7 +1140,7 @@ failed.
|
||||
|
||||
3.10.1 Proposed Solution
|
||||
|
||||
None (but see [replay-attribute]). We could solve a small part of
|
||||
None (but see[replay-attribute]). We could solve a small part of
|
||||
the problem by providing read-only transactions. These would
|
||||
allow one write transaction to begin, but it could not commit
|
||||
until all r/o transactions are done. This would require a new
|
||||
@ -1175,7 +1187,7 @@ indefinitely.
|
||||
|
||||
3.12.1 Proposed Solution
|
||||
|
||||
Remove reliability guarantees; see [traverse-Proposed-Solution].
|
||||
Remove reliability guarantees; see[traverse-Proposed-Solution].
|
||||
|
||||
3.12.2 Status
|
||||
|
||||
@ -1214,7 +1226,7 @@ normal (fast) usage, and occasionally empties the results into a
|
||||
transactional TDB. This kind of usage prioritizes performance
|
||||
over durability: as long as we are consistent, data can be lost.
|
||||
|
||||
This would be more neatly implemented inside tdb: a “soft”
|
||||
This would be more neatly implemented inside tdb: a“soft”
|
||||
transaction commit (ie. syncless) which meant that data may be
|
||||
reverted on a crash.
|
||||
|
||||
@ -1226,12 +1238,12 @@ Unfortunately any transaction scheme which overwrites old data
|
||||
requires a sync before that overwrite to avoid the possibility of
|
||||
corruption.
|
||||
|
||||
It seems possible to use a scheme similar to that described in [sub:TDB-Does-Not]
|
||||
It seems possible to use a scheme similar to that described in[sub:TDB-Does-Not]
|
||||
, where transactions are committed without overwriting existing
|
||||
data, and an array of top-level pointers were available in the
|
||||
header. If the transaction is “soft” then we would not need a
|
||||
sync at all: existing processes would pick up the new hash table
|
||||
and free list and work with that.
|
||||
header. If the transaction is“soft” then we would not need a sync
|
||||
at all: existing processes would pick up the new hash table and
|
||||
free list and work with that.
|
||||
|
||||
At some later point, a sync would allow recovery of the old data
|
||||
into the free lists (perhaps when the array of top-level pointers
|
||||
@ -1249,7 +1261,7 @@ so it can coordinate cluster-wide transactions.
|
||||
3.15.1 Proposed Solution<replay-attribute>
|
||||
|
||||
Tridge points out that an attribute can be later added to
|
||||
tdb_open (see [attributes]) to provide replay/trace hooks, which
|
||||
tdb_open (see[attributes]) to provide replay/trace hooks, which
|
||||
could become the basis for this and future parallel transactions
|
||||
and snapshot support.
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user