2024-12-23 17:34:34 +03:00 · 2012-06-19 12:43:09 +09:30 · 2012-06-19 12:43:09 +09:30 · 5ff92d8f7d
commit 5ff92d8f7d
parent b888bc4316
6 changed files with 506 additions and 5822 deletions
--- a/lib/ntdb/doc/TDB_porting.txt
+++ b/lib/ntdb/doc/TDB_porting.txt
@ -6,39 +6,415 @@ Interface differences between TDB and NTDB.
  otherwise you'll get a compile error when tdb.h re-defined struct
  TDB_DATA.

+  Example:
+	#include <tdb.h>
+	#include <ntdb.h>
+
 - ntdb functions return NTDB_SUCCESS (ie 0) on success, and a negative
  error on failure, whereas tdb functions returned 0 on success, and
  -1 on failure.  tdb then used tdb_error() to determine the error;
  this API is nasty if we ever want to support threads, so is not supported.

+  Example:
+	#include <tdb.h>
+	#include <ntdb.h>
+
+	void tdb_example(struct tdb_context *tdb, TDB_DATA key, TDB_DATA d)
+	{
+		if (tdb_store(tdb, key, d) == -1) {
+			printf("store failed: %s\n", tdb_errorstr(tdb));
+		}
+	}
+
+	void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA d)
+	{
+		enum NTDB_ERROR e;
+
+		e = ntdb_store(ntdb, key, d);
+		if (e) {
+			printf("store failed: %s\n", ntdb_errorstr(e));
+		}
+	}
+
 - ntdb's ntdb_fetch() returns an error, tdb's returned the data directly
  (or tdb_null, and you were supposed to check tdb_error() to find out why).

+  Example:
+	#include <tdb.h>
+	#include <ntdb.h>
+
+	void tdb_example(struct tdb_context *tdb, TDB_DATA key)
+	{
+		TDB_DATA data;
+
+		data = tdb_fetch(tdb, key);
+		if (!data.dptr) {
+			printf("fetch failed: %s\n", tdb_errorstr(tdb));
+		}
+	}
+
+	void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key)
+	{
+		NTDB_DATA data;
+		enum NTDB_ERROR e;
+
+		e = ntdb_fetch(ntdb, key, &data);
+		if (e) {
+			printf("fetch failed: %s\n", ntdb_errorstr(e));
+		}
+	}
+
 - ntdb's ntdb_nextkey() frees the old key's dptr, in tdb you needed to do
  this manually.

- tdb's tdb_open/tdb_open_ex took an explicit hash size.  ntdb's hash table
-  resizes as required.
+  Example:
+	#include <tdb.h>
+	#include <ntdb.h>
+
+	void tdb_example(struct tdb_context *tdb)
+	{
+		TDB_DATA key, next, data;
+
+		for (key = tdb_firstkey(tdb); key.dptr; key = next) {
+			printf("Got key!\n");
+			next = tdb_nextkey(tdb, key);
+			free(key.dptr);
+		}
+	}
+
+
+	void ntdb_example(struct ntdb_context *ntdb)
+	{
+		NTDB_DATA k, data;
+		enum NTDB_ERROR e;
+
+		for (e = ntdb_firstkey(ntdb,&k); !e; e = ntdb_nextkey(ntdb,&k))
+			printf("Got key!\n");
+	}
+
+- Unlike tdb_open/tdb_open_ex, ntdb_open does not allow NULL names,
+  even for NTDB_INTERNAL dbs, and thus ntdb_name() never returns NULL.
+
+  Example:
+	#include <tdb.h>
+	#include <ntdb.h>
+
+	struct tdb_context *tdb_example(void)
+	{
+		return tdb_open(NULL, 0, TDB_INTERNAL, O_RDWR, 0);
+	}
+
+	struct ntdb_context *ntdb_example(void)
+	{
+		return ntdb_open("example", NTDB_INTERNAL, O_RDWR, 0, NULL);
+	}

 - ntdb uses a linked list of attribute structures to implement logging and
  alternate hashes.  tdb used tdb_open_ex, which was not extensible.

+  Example:
+	#include <tdb.h>
+	#include <ntdb.h>
+
+	/* Custom hash function */
+	static unsigned int my_tdb_hash_func(TDB_DATA *key)
+	{
+		return key->dsize;
+	}
+
+	struct tdb_context *tdb_example(void)
+	{
+		return tdb_open_ex("example.tdb", 0, TDB_DEFAULT,
+		                   O_CREAT|O_RDWR, 0600, NULL, my_tdb_hash_func);
+	}
+
+	/* Custom hash function */
+	static unsigned int my_ntdb_hash_func(const void *key, size_t len,
+					      uint32_t seed, void *data)
+	{
+		return len;
+	}
+
+	struct ntdb_context *ntdb_example(void)
+	{
+		union ntdb_attribute hash;
+
+		hash.base.attr = NTDB_ATTRIBUTE_HASH;
+		hash.base.next = NULL;
+		hash.hash.fn = my_ntdb_hash_func;
+		return ntdb_open("example.ntdb", NTDB_DEFAULT,
+		                   O_CREAT|O_RDWR, 0600, &hash);
+	}
+
+- tdb's tdb_open/tdb_open_ex took an explicit hash size, defaulting to
+  131.  ntdb uses an attribute for this, defaulting to 8192.
+
+  Example:
+	#include <tdb.h>
+	#include <ntdb.h>
+
+	struct tdb_context *tdb_example(void)
+	{
+		return tdb_open("example.tdb", 10007, TDB_DEFAULT,
+		                O_CREAT|O_RDWR, 0600);
+	}
+
+	struct ntdb_context *ntdb_example(void)
+	{
+		union ntdb_attribute hashsize;
+
+		hashsize.base.attr = NTDB_ATTRIBUTE_HASHSIZE;
+		hashsize.base.next = NULL;
+		hashsize.hashsize.size = 16384;
+		return ntdb_open("example.ntdb", NTDB_DEFAULT,
+		                   O_CREAT|O_RDWR, 0600, &hashsize);
+	}
+
 - ntdb does locking on read-only databases (ie. O_RDONLY passed to ntdb_open).
  tdb did not: use the NTDB_NOLOCK flag if you want to suppress locking.

- ntdb's log function is simpler than tdb's log function.  The string is
-  already formatted, and it takes an enum ntdb_log_level not a tdb_debug_level,
-  and which has only three values: NTDB_LOG_ERROR, NTDB_LOG_USE_ERROR and
-  NTDB_LOG_WARNING.
+  Example:
+	#include <tdb.h>
+	#include <ntdb.h>
+
+	struct tdb_context *tdb_example(void)
+	{
+		return tdb_open("example.tdb", 0, TDB_DEFAULT, O_RDONLY, 0);
+	}
+
+	struct ntdb_context *ntdb_example(void)
+	{
+		return ntdb_open("example.ntdb", NTDB_NOLOCK, O_RDONLY, 0, NULL);
+	}
+
+- ntdb's log function is simpler than tdb's log function.  The string
+  is already formatted and is not terminated by a '\n'.  It takes an
+  enum ntdb_log_level rather than a tdb_debug_level; this has only three
+  values: NTDB_LOG_ERROR, NTDB_LOG_USE_ERROR and NTDB_LOG_WARNING.
+
+	#include <tdb.h>
+	#include <ntdb.h>
+
+	static void tdb_log(struct tdb_context *tdb,
+	                    enum tdb_debug_level level, const char *fmt, ...)
+	{
+		va_list ap;
+		const char *name;
+
+		switch (level) {
+		case TDB_DEBUG_FATAL:
+			fprintf(stderr, "FATAL: ");
+			break;
+		case TDB_DEBUG_ERROR:
+			fprintf(stderr, "ERROR: ");
+			break;
+		case TDB_DEBUG_WARNING:
+			fprintf(stderr, "WARNING: ");
+			break;
+		case TDB_DEBUG_TRACE:
+			/* Don't print out tracing. */
+			return;
+		}
+
+		name = tdb_name(tdb);
+		if (!name) {
+			name = "unnamed";
+		}
+
+		fprintf(stderr, "tdb(%s):", name);
+
+		va_start(ap, fmt);
+		vfprintf(stderr, fmt, ap);
+		va_end(ap);
+	}
+
+	struct tdb_context *tdb_example(void)
+	{
+		struct tdb_logging_context lctx;
+
+		lctx.log_fn = tdb_log;
+		return tdb_open_ex("example.tdb", 0, TDB_DEFAULT,
+		                   O_CREAT|O_RDWR, 0600, &lctx, NULL);
+	}
+
+	static void ntdb_log(struct ntdb_context *ntdb,
+			     enum ntdb_log_level level,
+			     enum NTDB_ERROR ecode,
+			     const char *message,
+			     void *data)
+	{
+		switch (level) {
+		case NTDB_LOG_ERROR:
+			fprintf(stderr, "ERROR: ");
+			break;
+		case NTDB_LOG_USE_ERROR:
+			/* We made a mistake, so abort. */
+			abort();
+			break;
+		case NTDB_LOG_WARNING:
+			fprintf(stderr, "WARNING: ");
+			break;
+		}
+
+		fprintf(stderr, "ntdb(%s):%s:%s\n",
+			ntdb_name(ntdb), ntdb_errorstr(ecode), message);
+	}
+
+	struct ntdb_context *ntdb_example(void)
+	{
+		union ntdb_attribute log;
+
+		log.base.attr = NTDB_ATTRIBUTE_LOG;
+		log.base.next = NULL;
+		log.log.fn = ntdb_log;
+		return ntdb_open("example.ntdb", NTDB_DEFAULT,
+		                 O_CREAT|O_RDWR, 0600, &log);
+	}

 - ntdb provides ntdb_deq() for comparing two NTDB_DATA, and ntdb_mkdata() for
  creating an NTDB_DATA.

- ntdb's ntdb_name() returns a copy of the name even for NTDB_INTERNAL dbs.
+	#include <tdb.h>
+	#include <ntdb.h>
+
+	void tdb_example(struct tdb_context *tdb)
+	{
+		TDB_DATA data, key;
+
+		key.dsize = strlen("hello");
+		key.dptr = "hello";
+		data = tdb_fetch(tdb, key);
+		if (data.dsize == key.dsize
+		    && !memcmp(data.dptr, key.dptr, key.dsize)) {
+			printf("key is same as data\n");
+		}
+		free(data.dptr);
+	}
+
+	void ntdb_example(struct ntdb_context *ntdb)
+	{
+		NTDB_DATA data, key;
+
+		key = ntdb_mkdata("hello", strlen("hello"));
+		if (ntdb_fetch(ntdb, key, &data) == NTDB_SUCCESS) {
+			if (ntdb_deq(key, data)) {
+				printf("key is same as data\n");
+			}
+			free(data.dptr);
+		}
+	}
+
+- Failure inside a transaction (such as a lock function failing) does
+  not implicitly cancel the transaction; you still need to call
+  ntdb_transaction_cancel().
+
+	#include <tdb.h>
+	#include <ntdb.h>
+
+	void tdb_example(struct tdb_context *tdb, TDB_DATA key, TDB_DATA d)
+	{
+		if (tdb_transaction_start(tdb) == -1) {
+			printf("transaction failed: %s\n", tdb_errorstr(tdb));
+			return;
+		}
+
+		if (tdb_store(tdb, key, d) == -1) {
+			printf("store failed: %s\n", tdb_errorstr(tdb));
+			return;
+		}
+		if (tdb_transaction_commit(tdb) == -1) {
+			printf("commit failed: %s\n", tdb_errorstr(tdb));
+		}
+	}
+
+	void ntdb_example(struct ntdb_context *ntdb, NTDB_DATA key, NTDB_DATA d)
+	{
+		enum NTDB_ERROR e;
+
+		e = ntdb_transaction_start(ntdb);
+		if (e) {
+			printf("transaction failed: %s\n", ntdb_errorstr(e));
+			return;
+		}
+
+		e = ntdb_store(ntdb, key, d);
+		if (e) {
+			printf("store failed: %s\n", ntdb_errorstr(e));
+			ntdb_transaction_cancel(ntdb);
+			return;
+		}
+
+		e = ntdb_transaction_commit(ntdb);
+		if (e) {
+			printf("commit failed: %s\n", ntdb_errorstr(e));
+		}
+	}
+
+- There is no NTDB_CLEAR_IF_FIRST flag; it has severe scalability and
+  API problems.  If necessary, you can emulate this by using the open
+  hook and placing a 1-byte lock at offset 4.  If your program forks
+  and exits, you will need to place this lock again in the child before
+  the parent exits.
+
+  Example:
+
+	#include <tdb.h>
+	#include <ntdb.h>
+
+	struct tdb_context *tdb_example(void)
+	{
+		return tdb_open("example.tdb", 0, TDB_CLEAR_IF_FIRST,
+		                   O_CREAT|O_RDWR, 0600);
+	}
+
+	static enum NTDB_ERROR clear_if_first(int fd, void *unused)
+	{
+		/* We always hold a lock at offset 4, so we can tell
+		 * whether anyone else holds one too. */
+		struct flock fl;
+
+		fl.l_type = F_WRLCK;
+		fl.l_whence = SEEK_SET;
+		fl.l_start = 4; /* ACTIVE_LOCK */
+		fl.l_len = 1;
+
+		if (fcntl(fd, F_SETLK, &fl) == 0) {
+			/* We must be first ones to open it!  Clear it. */
+			if (ftruncate(fd, 0) != 0) {
+				return NTDB_ERR_IO;
+			}
+		}
+		fl.l_type = F_RDLCK;
+		if (fcntl(fd, F_SETLKW, &fl) != 0) {
+			return NTDB_ERR_IO;
+		}
+		return NTDB_SUCCESS;
+	}
+
+	struct ntdb_context *ntdb_example(void)
+	{
+		union ntdb_attribute open_attr;
+
+		open_attr.openhook.base.attr = NTDB_ATTRIBUTE_OPENHOOK;
+		open_attr.openhook.base.next = NULL;
+		open_attr.openhook.fn = clear_if_first;
+
+		return ntdb_open("example.ntdb", NTDB_DEFAULT,
+		                 O_CREAT|O_RDWR, 0600, &open_attr);
+	}
+
+- ntdb traversals are not reliable if the database is changed during
+  the traversal, ie your traversal may not cover all elements, or may
+  cover elements multiple times.  As a special exception, deleting the
+  current record within ntdb_traverse() is reliable.
+
+- There is no ntdb_traverse_read, since ntdb_traverse does not hold
+  a lock across the entire traversal anyway.  If you want to make sure
+  that your traversal function does not write to the database, you can
+  set and clear the NTDB_RDONLY flag around the traversal.

 - ntdb does not need tdb_reopen() or tdb_reopen_all().  If you call
  fork() after during certain operations the child should close the
-  tdb, or complete the operations before continuing to use the tdb:
+  ntdb, or complete the operations before continuing to use the ntdb:

 	ntdb_transaction_start(): child must ntdb_transaction_cancel()
 	ntdb_lockall(): child must call ntdb_unlockall()
@ -46,19 +422,5 @@ Interface differences between TDB and NTDB.
 	ntdb_chainlock(): child must call ntdb_chainunlock()
 	ntdb_parse() callback: child must return from ntdb_parse()

- ntdb will not open a non-tdb file, even if O_CREAT is specified.
-
- There is no ntdb_traverse_read.  For operating on TDB files, you can
-  simulate it by ntdb_add_flag(tdb, NTDB_RDONLY); ntdb_traverse();
-  ntdb_remove_flag(tdb, NTDB_RDONLY).  This may be desirable because
-  traverse on TDB files use a write lock on the entire database
-  unless it's read-only.
-
- Failure inside a transaction (such as a lock function failing) does
-  not implicitly cancel the transaction; you still need to call
-  ntdb_transaction_cancel().
-
- There is no NTDB_CLEAR_IF_FIRST flag; it has severe scalability and
-  API problems.  If necessary, you can emulate this by using the open
-  hook and placing a 1-byte lock at offset 4.  If your program forks,
-  you will need to place this lock again in the child.
+- ntdb will not open a non-ntdb file, even if O_CREAT is specified.  tdb
+  will overwrite an unknown file in that case.
--- a/lib/ntdb/doc/design-1.3.txt
+++ b/lib/ntdb/doc/design-1.3.txt
--- a/lib/ntdb/doc/design.lyx
+++ b/lib/ntdb/doc/design.lyx
@ -1,48 +1,66 @@
-#LyX 1.6.7 created this file. For more info see http://www.lyx.org/
-\lyxformat 345
+#LyX 2.0 created this file. For more info see http://www.lyx.org/
+\lyxformat 413
 \begin_document
 \begin_header
 \textclass article
 \use_default_options true
+\maintain_unincluded_children false
 \language english
+\language_package default
 \inputencoding auto
+\fontencoding global
 \font_roman default
 \font_sans default
 \font_typewriter default
 \font_default_family default
+\use_non_tex_fonts false
 \font_sc false
 \font_osf false
 \font_sf_scale 100
 \font_tt_scale 100

 \graphics default
+\default_output_format default
+\output_sync 0
+\bibtex_command default
+\index_command default
 \paperfontsize default
 \use_hyperref false
 \papersize default
 \use_geometry false
 \use_amsmath 1
 \use_esint 1
+\use_mhchem 1
+\use_mathdots 1
 \cite_engine basic
 \use_bibtopic false
+\use_indices false
 \paperorientation portrait
+\suppress_date false
+\use_refstyle 0
+\index Index
+\shortcut idx
+\color #008000
+\end_index
 \secnumdepth 3
 \tocdepth 3
 \paragraph_separation indent
-\defskip medskip
+\paragraph_indentation default
 \quotes_language english
 \papercolumns 1
 \papersides 1
 \paperpagestyle default
 \tracking_changes true
 \output_changes true
-\author ""
-\author ""
+\html_math_output 0
+\html_css_as_file 0
+\html_be_strict false
 \end_header

 \begin_body

 \begin_layout Title
-TDB2: A Redesigning The Trivial DataBase
+NTDB: Redesigning The Trivial DataBase
 \end_layout

 \begin_layout Author
@ -50,7 +68,7 @@ Rusty Russell, IBM Corporation
 \end_layout

 \begin_layout Date
-17-March-2011
+19 June 2012
 \end_layout

 \begin_layout Abstract
@ -87,7 +105,7 @@ The wider variety and greater demands of TDB-using code has lead to some
 \begin_layout Standard
 \begin_inset Tabular
 <lyxtabular version="3" rows="12" columns="3">
-<features>
+<features tabularvalignment="middle">
 <column alignment="center" valignment="top" width="0">
 <column alignment="center" valignment="top" width="0">
 <column alignment="center" valignment="top" width="0">
@ -453,6 +471,20 @@ This review is an attempt to catalog and address all the known issues with
 second system syndrome in rewriting a successful project like this.
 \end_layout

+\begin_layout Standard
+Note: the final decision was to make ntdb a separate library, with a separate
+ 'ntdb' namespace so both can potentially be linked together.
+ This document still refers to
+\begin_inset Quotes eld
+\end_inset
+
+tdb
+\begin_inset Quotes erd
+\end_inset
+
+ everywhere, for simplicity.
+\end_layout
+
 \begin_layout Section
 API Issues
 \end_layout
@ -960,7 +992,6 @@ There are several issues with this approach.
 have under some circumstances.
 I don't believe this is currently the case, but it constrains the implementatio
 n.
-
 \end_layout

 \begin_layout Subsubsection
@ -1025,7 +1056,7 @@ Status
 \end_layout

 \begin_layout Standard
-Incomplete.
+Complete.
 \end_layout

 \begin_layout Subsection
@ -1114,7 +1145,7 @@ Status
 \end_layout

 \begin_layout Standard
-Incomplete.
+Complete.
 \end_layout

 \begin_layout Subsection
@ -1291,6 +1322,7 @@ Status

 \begin_layout Standard
 Complete.
+ An open hook is provided to replicate this functionality if required.
 \end_layout

 \begin_layout Subsection
@ -1433,7 +1465,7 @@ Status
 \end_layout

 \begin_layout Standard
-Deferred.
+Complete, using the NTDB_ATTRIBUTE_ALLOCATOR attribute.
 \end_layout

 \begin_layout Section
@ -1661,7 +1693,12 @@ Status
 \end_layout

 \begin_layout Standard
-Complete.
+Ignore.
+ Scaling the hash automatically proved inefficient at small hash sizes;
+ we default to an 8192-element hash (changeable via NTDB_ATTRIBUTE_HASHSIZE),
+ and when buckets clash we expand to an array of hash entries.
+ This scales slightly better than the tdb chain (due to the 8 top bits containin
+g extra hash).
 \end_layout

 \begin_layout Subsection
@ -1738,7 +1775,6 @@ If it's more than max_dead, bulk free all the dead ones (similar to steps

 \begin_layout Enumerate
 Simply mark this record as dead and return.
-
 \end_layout

 \end_deeper
@ -1920,7 +1956,6 @@ reference "sub:Records-Incur-A"
 \end_inset

 .
-
 \end_layout

 \begin_layout Standard
@ -2357,7 +2392,11 @@ TDB Does Not Have Snapshot Support
 \end_layout

 \begin_layout Subsubsection
-Proposed SolutionNone.
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+None.
 At some point you say
 \begin_inset Quotes eld
 \end_inset
@ -2666,7 +2705,6 @@ name "replay-attribute"

 \begin_layout Standard
 Tridge points out that an attribute can be later added to tdb_open (see
-
 \begin_inset CommandInset ref
 LatexCommand ref
 reference "attributes"
--- a/lib/ntdb/doc/design.lyx,v
+++ b/lib/ntdb/doc/design.lyx,v
--- a/lib/ntdb/doc/design.pdf
+++ b/lib/ntdb/doc/design.pdf
--- a/lib/ntdb/doc/design.txt
+++ b/lib/ntdb/doc/design.txt
@ -1,8 +1,8 @@
-TDB2: A Redesigning The Trivial DataBase
+NTDB: Redesigning The Trivial DataBase

 Rusty Russell, IBM Corporation

-1-December-2010
+19 June 2012

 Abstract

@ -65,6 +65,11 @@ without significantly increasing complexity; all involved are far
 too aware of the dangers of second system syndrome in rewriting a
 successful project like this.

+Note: the final decision was to make ntdb a separate library,
+with a separate 'ntdb' namespace so both can potentially be
+linked together. This document still refers to “tdb” everywhere,
+for simplicity.
+
 2 API Issues

 2.1 tdb_open_ex Is Not Expandable
@ -182,7 +187,7 @@ This flag can also be changed at runtime.

 2.3.1 Proposed Solution

-Given the usage patterns, it seems that the “least-surprise”
+Given the usage patterns, it seems that the “least-surprise”
 behavior of disallowing nested transactions should become the
 default. Additionally, it seems the outer transaction is the only
 code which knows whether inner transactions should be allowed, so
@ -193,7 +198,7 @@ expanded for this relatively-obscure case.

 2.3.2 Status

-Incomplete; nesting flag is still defined as per tdb1.
+Complete; the nesting flag has been removed.

 2.4 Incorrect Hash Function is Not Detected

@ -217,7 +222,7 @@ Complete.
 In response to scalability issues with the free list ([TDB-Freelist-Is]
 ) two API workarounds have been incorporated in TDB:
 tdb_set_max_dead() and the TDB_VOLATILE flag to tdb_open. The
-latter actually calls the former with an argument of “5”.
+latter actually calls the former with an argument of “5”.

 This code allows deleted records to accumulate without putting
 them in the free list. On delete we iterate through each chain
@ -235,8 +240,8 @@ will become a no-op.

 2.5.2 Status

-Incomplete. TDB_VOLATILE still defined, but implementation should
-fail on unknown flags to be future-proof.
+Complete. Unknown flags cause tdb_open() to fail as well, so they
+can be detected at runtime.

 2.6 <TDB-Files-Cannot>TDB Files Cannot Be Opened Multiple Times
  In The Same Process
@ -275,7 +280,7 @@ to allow other to create such an API.

 2.6.2 Status

-Incomplete.
+Complete.

 2.7 TDB API Is Not POSIX Thread-safe

@ -283,19 +288,19 @@ The TDB API uses an error code which can be queried after an
 operation to determine what went wrong. This programming model
 does not work with threads, unless specific additional guarantees
 are given by the implementation. In addition, even
-otherwise-independent threads cannot open the same TDB (as in [TDB-Files-Cannot]
+otherwise-independent threads cannot open the same TDB (as in [TDB-Files-Cannot]
 ).

 2.7.1 Proposed Solution

 Reachitecting the API to include a tdb_errcode pointer would be a
-great deal of churn; we are better to guarantee that the
-tdb_errcode is per-thread so the current programming model can be
-maintained.
-
-This requires dynamic per-thread allocations, which is awkward
-with POSIX threads (pthread_key_create space is limited and we
-cannot simply allocate a key for every TDB).
+great deal of churn, but fortunately most functions return 0 on
+success and -1 on error: we can change these to return 0 on
+success and a negative error code on error, and the API remains
+similar to previous. The tdb_fetch, tdb_firstkey and tdb_nextkey
+functions need to take a TDB_DATA pointer and return an error
+code. It is also simpler to have tdb_nextkey replace its key
+argument in place, freeing up any old .dptr.

 Internal locking is required to make sure that fcntl locks do not
 overlap between threads, and also that the global list of tdbs is
@ -304,12 +309,13 @@ maintained.
 The aim is that building tdb with -DTDB_PTHREAD will result in a
 pthread-safe version of the library, and otherwise no overhead
 will exist. Alternatively, a hooking mechanism similar to that
-proposed for [Proposed-Solution-locking-hook] could be used to
+proposed for [Proposed-Solution-locking-hook] could be used to
 enable pthread locking at runtime.

 2.7.2 Status

-Incomplete.
+Incomplete; API has been changed but thread safety has not been
+implemented.

 2.8 *_nonblock Functions And *_mark Functions Expose
  Implementation
@ -375,7 +381,7 @@ it is needed.

 2.8.2 Status

-Incomplete.
+Complete.

 2.9 tdb_chainlock Functions Expose Implementation

@ -427,7 +433,7 @@ otherwise EAGAIN.

 2.10.2 Status

-Incomplete.
+Complete.

 2.11 The API Uses Gratuitous Typedefs, Capitals

@ -477,7 +483,7 @@ Complete.

 2.13 Various Callback Functions Are Not Typesafe

-The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take]
+The callback functions in tdb_set_logging_function (after [tdb_log_func-Doesnt-Take]
 is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read
 and tdb_check all take void * and must internally convert it to
 the argument type they were expecting.
@ -499,7 +505,7 @@ http://ccan.ozlabs.org/info/typesafe_cb.html

 2.13.2 Status

-Incomplete.
+Complete.

 2.14 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens,
  tdb_reopen_all Problematic
@ -519,12 +525,12 @@ it alone has opened the TDB and will erase it.
 2.14.1 Proposed Solution

 Remove TDB_CLEAR_IF_FIRST. Other workarounds are possible, but
-see [TDB_CLEAR_IF_FIRST-Imposes-Performance].
+see [TDB_CLEAR_IF_FIRST-Imposes-Performance].

 2.14.2 Status

-Incomplete, TDB_CLEAR_IF_FIRST still defined, but not
-implemented.
+Complete. An open hook is provided to replicate this
+functionality if required.

 2.15 Extending The Header Is Difficult

@ -537,7 +543,7 @@ not.

 2.15.1 Proposed Solution

-The header should contain a “format variant” value (64-bit). This
+The header should contain a “format variant” value (64-bit). This
 is divided into two 32-bit parts:

 1. The lower part reflects the format variant understood by code
@ -558,7 +564,7 @@ writes to the database.

 2.15.2 Status

-Incomplete.
+Complete.

 2.16 Record Headers Are Not Expandible

@ -576,7 +582,7 @@ would know the extension is not present on that record.

 2.16.2 Status

-Incomplete.
+Complete.

 2.17 TDB Does Not Use Talloc

@ -589,10 +595,10 @@ conveniently.
 The allocation within TDB is not complicated enough to justify
 the use of talloc, and I am reluctant to force another
 (excellent) library on TDB users. Nonetheless a compromise is
+possible. An attribute (see [attributes]) can be added later to
+possible. An attribute (see[attributes]) can be added later to
 tdb_open() to provide an alternate allocation mechanism,
 specifically for talloc but usable by any other allocator (which
-would ignore the “context” argument).
+would ignore the “context” argument).

 This would form a talloc heirarchy as expected, but the caller
 would still have to attach a destructor to the tdb context
@ -602,7 +608,7 @@ manage them (using talloc_free() or talloc_steal()).

 2.17.2 Status

-Deferred.
+Complete, using the NTDB_ATTRIBUTE_ALLOCATOR attribute.

 3 Performance And Scalability Issues

@ -635,11 +641,11 @@ can simply unlink the old tdb at that point.

 3.1.2 Status

-Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
+Complete.

 3.2 TDB Files Have a 4G Limit

-This seems to be becoming an issue (so much for “trivial”!),
+This seems to be becoming an issue (so much for “trivial”!),
 particularly for ldb.

 3.2.1 Proposed Solution
@ -679,7 +685,7 @@ Record sizes will be 64 bit, with an error returned on 32 bit
 platforms which try to access such records (the current
 implementation would return TDB_ERR_OOM in a similar case). It
 seems unlikely that 32 bit keys will be a limitation, so the
-implementation may not support this (see [sub:Records-Incur-A]).
+implementation may not support this (see [sub:Records-Incur-A]).

 3.3.2 Status

@ -728,7 +734,11 @@ invalid.

 3.4.2 Status

-Complete.
+Ignore. Scaling the hash automatically proved inefficient at
+small hash sizes; we default to a 8192-element hash (changable
+via NTDB_ATTRIBUTE_HASHSIZE), and when buckets clash we expand to
+an array of hash entries. This scales slightly better than the
+tdb chain (due to the 8 top bits containing extra hash).

 3.5 <TDB-Freelist-Is>TDB Freelist Is Highly Contended

@ -783,7 +793,7 @@ Deleting a record occurs as follows:

 7. Otherwise, prepend ourselves to the free list.

-Disabling right-merging (step [right-merging]) causes
+Disabling right-merging (step [right-merging]) causes
 fragmentation; the other heuristics proved insufficient to
 address this, so the final answer to this was that when we expand
 the TDB file inside a transaction commit, we repack the entire
@ -812,7 +822,7 @@ zone) which produces too many clashes for our hash table to
 handle well, and for coalescing we search by address. Thus an
 array of doubly-linked free lists seems preferable.

-There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fragmented]
+There are various benefits in using per-size free lists (see [sub:TDB-Becomes-Fragmented]
 ) but it's not clear this would reduce contention in the common
 case where all processes are allocating/freeing the same size.
 Thus we almost certainly need to divide in other ways: the most
@ -822,7 +832,7 @@ ordering.

 Unfortunately it is difficult to know what heuristics should be
 used to determine zone sizes, and our transaction code relies on
-being able to create a “recovery area” by simply appending to the
+being able to create a “recovery area” by simply appending to the
 file (difficult if it would need to create a new zone header).
 Thus we use a linked-list of free tables; currently we only ever
 create one, but if there is more than one we choose one at random
@ -862,9 +872,9 @@ coalescing at this point:
 This optimizes rapid insert/delete of free list entries by not
 coalescing them all the time.. First-fit address ordering
 ordering seems to be fairly good for keeping fragmentation low
-(see [sub:TDB-Becomes-Fragmented]). Note that address ordering
+(see [sub:TDB-Becomes-Fragmented]). Note that address ordering
 does not need a tailer to coalesce, though if we needed one we
-could have one cheaply: see [sub:Records-Incur-A].
+could have one cheaply: see [sub:Records-Incur-A].

 Each free entry has the free table number in the header: less
 than 255. It also contains a doubly-linked list for easy
@ -884,7 +894,7 @@ db when a transaction commit needs to enlarge the file.

 The 25% overhead on allocation works in practice for ldb because
 indexes tend to expand by one record at a time. This internal
-fragmentation can be resolved by having an “expanded” bit in the
+fragmentation can be resolved by having an “expanded” bit in the
 header to note entries that have previously expanded, and
 allocating more space for them.

@ -970,13 +980,13 @@ block:
  scale as fast as data, so I'm assuming a maximum key size of 32
  bits.

-4. 'full_hash' is used to avoid a memcmp on the “miss” case, but
+4. 'full_hash' is used to avoid a memcmp on the “miss” case, but
  this is diminishing returns after a handful of bits (at 10
  bits, it reduces 99.9% of false memcmp). As an aside, as the
  lower bits are already incorporated in the hash table
  resolution, the upper bits should be used here. Note that it's
  not clear that these bits will be a win, given the extra bits
-  in the hash table itself (see [sub:Hash-Size-Solution]).
+  in the hash table itself (see [sub:Hash-Size-Solution]).

 5. 'magic' does not need to be enlarged: it currently reflects
  one of 5 values (used, free, dead, recovery, and
@ -1094,8 +1104,10 @@ Deferred.

 3.9 <sub:TDB-Does-Not>TDB Does Not Have Snapshot Support

-3.9.1 Proposed SolutionNone. At some point you say “use a real
-  database” (but see [replay-attribute]).
+3.9.1 Proposed Solution
+
+None. At some point you say “use a real database” (but see [replay-attribute]
+).

 But as a thought experiment, if we implemented transactions to
 only overwrite free entries (this is tricky: there must not be a
@ -1128,7 +1140,7 @@ failed.

 3.10.1 Proposed Solution

-None (but see [replay-attribute]). We could solve a small part of
+None (but see [replay-attribute]). We could solve a small part of
 the problem by providing read-only transactions. These would
 allow one write transaction to begin, but it could not commit
 until all r/o transactions are done. This would require a new
@ -1175,7 +1187,7 @@ indefinitely.

 3.12.1 Proposed Solution

-Remove reliability guarantees; see [traverse-Proposed-Solution].
+Remove reliability guarantees; see [traverse-Proposed-Solution].

 3.12.2 Status

@ -1214,7 +1226,7 @@ normal (fast) usage, and occasionally empties the results into a
 transactional TDB. This kind of usage prioritizes performance
 over durability: as long as we are consistent, data can be lost.

-This would be more neatly implemented inside tdb: a “soft”
+This would be more neatly implemented inside tdb: a “soft”
 transaction commit (ie. syncless) which meant that data may be
 reverted on a crash.

@ -1226,12 +1238,12 @@ Unfortunately any transaction scheme which overwrites old data
 requires a sync before that overwrite to avoid the possibility of
 corruption.

-It seems possible to use a scheme similar to that described in [sub:TDB-Does-Not]
+It seems possible to use a scheme similar to that described in [sub:TDB-Does-Not]
 ,where transactions are committed without overwriting existing
 data, and an array of top-level pointers were available in the
-header. If the transaction is “soft” then we would not need a
-sync at all: existing processes would pick up the new hash table
-and free list and work with that.
+header. If the transaction is “soft” then we would not need a sync
+at all: existing processes would pick up the new hash table and
+free list and work with that.

 At some later point, a sync would allow recovery of the old data
 into the free lists (perhaps when the array of top-level pointers
@ -1249,7 +1261,7 @@ so it can coordinate cluster-wide transactions.
 3.15.1 Proposed Solution<replay-attribute>

 Tridge points out that an attribute can be later added to
-tdb_open (see [attributes]) to provide replay/trace hooks, which
+tdb_open (see [attributes]) to provide replay/trace hooks, which
 could become the basis for this and future parallel transactions
 and snapshot support.