2011-09-14 01:52:13 +04:00
/*
Unix SMB / CIFS implementation .
trivial database library
Copyright ( C ) Andrew Tridgell 2005
* * NOTE ! The following LGPL license applies to the tdb
* * library . This does NOT imply that all of Samba is released
* * under the LGPL
This library is free software ; you can redistribute it and / or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation ; either
version 3 of the License , or ( at your option ) any later version .
This library is distributed in the hope that it will be useful ,
but WITHOUT ANY WARRANTY ; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
Lesser General Public License for more details .
You should have received a copy of the GNU Lesser General Public
License along with this library ; if not , see < http : //www.gnu.org/licenses/>.
*/
# include "tdb1_private.h"
/*
transaction design :
- only allow a single transaction at a time per database . This makes
using the transaction API simpler , as otherwise the caller would
have to cope with temporary failures in transactions that conflict
with other current transactions
- keep the transaction recovery information in the same file as the
database , using a special ' transaction recovery ' record pointed at
by the header . This removes the need for extra journal files as
used by some other databases
- dynamically allocate the transaction recovery record, re-using it
for subsequent transactions. If a larger record is needed then
tdb1_free() the old record to place it on the normal tdb freelist
before allocating the new record
- during transactions, keep a linked list of all writes that have
been performed by intercepting all tdb1_write() calls. The hooked
transaction versions of tdb1_read() and tdb1_write() check this
linked list and try to use the elements of the list in preference
to the real database.
- don ' t allow any locks to be held when a transaction starts ,
otherwise we can end up with deadlock ( plus lack of lock nesting
in posix locks would mean the lock is lost )
- if the caller gains a lock during the transaction but doesn ' t
release it then fail the commit
- allow for nested calls to tdb1_transaction_start ( ) , re - using the
existing transaction record . If the inner transaction is cancelled
then a subsequent commit will fail
- keep a mirrored copy of the tdb hash chain heads to allow for the
fast hash heads scan on traverse , updating the mirrored copy in
the transaction version of tdb1_write
- allow callers to mix transaction and non - transaction use of tdb ,
although once a transaction is started then an exclusive lock is
gained until the transaction is committed or cancelled
- the commit strategy involves first saving away all modified data
into a linearised buffer in the transaction recovery area, then
marking the transaction recovery area with a magic value to
indicate a valid recovery record. In total 4 fsync/msync calls are
needed per commit to prevent race conditions. It might be possible
to reduce this to 3 or even 2 with some more work.
- check for a valid recovery record on open of the tdb , while the
open lock is held . Automatically recover from the transaction
recovery area if needed , then continue with the open as
usual . This allows for smooth crash recovery with no administrator
intervention .
- if TDB_NOSYNC is passed to flags in tdb1_open then transactions are
still available, but no transaction recovery area is used and no
fsync/msync calls are made.
- if TDB_ALLOW_NESTING is passed to flags in tdb open, or added using
tdb1_add_flags(), transaction nesting is enabled.
The default is that transaction nesting is NOT allowed.
Beware: when transactions are nested, a transaction successfully
completed with tdb1_transaction_commit() can be silently rolled back later.
*/
/*
  Context of the transaction currently open on a tdb_context.

  One instance is allocated (zero-filled) when a transaction starts and
  hangs off tdb->tdb1.transaction until the transaction is cancelled.
*/
struct tdb1_transaction {
/* mirrored copy of the tdb hash heads, so that
   tdb1_next_hash_chain() can operate efficiently.  Allocated with
   hash_size + 1 entries; the entry for chain h lives at index h + 1
   because slot 0 accounts for the freelist */
uint32_t * hash_heads ;
/* the original io methods - used to do IOs to the real db.  Saved when
   the transaction methods are hooked in and restored on cancel */
const struct tdb1_methods * io_methods ;
/* the list of transaction blocks.  When a block is first written to,
   it gets created in this list; entries for blocks that have not been
   touched stay NULL */
uint8_t * * blocks ;
/* number of entries in the blocks array */
uint32_t num_blocks ;
uint32_t block_size ; /* bytes in each block */
uint32_t last_block_size ; /* number of valid bytes in the last block */
/* non-zero when an internal transaction error has occurred.  All
   write operations will then fail until the transaction is ended */
int transaction_error ;
/* when inside a transaction we need to keep track of any nested
   tdb1_transaction_start() calls, as these are allowed, but don't
   create a new transaction.  Incremented on a nested start and
   decremented on a nested cancel */
int nesting ;
/* set when a prepare has already occurred; transaction1_write()
   refuses further writes once this is set */
bool prepared ;
/* when non-zero, cancel overwrites the 4 bytes at this file offset
   with TDB1_RECOVERY_INVALID_MAGIC to remove the recovery marker */
tdb1_off_t magic_offset ;
/* old file size before transaction (reset after the recovery area
   has been appended to the file) */
tdb1_len_t old_map_size ;
/* did we expand in this transaction */
bool expanded ;
} ;
/*
read while in a transaction . We need to check first if the data is in our list
of transaction elements , then if not do a real read
*/
2011-09-14 02:13:13 +04:00
static int transaction1_read ( struct tdb_context * tdb , tdb1_off_t off , void * buf ,
2011-09-14 01:52:13 +04:00
tdb1_len_t len , int cv )
{
uint32_t blk ;
/* break it down into block sized ops */
2011-09-14 02:13:13 +04:00
while ( len + ( off % tdb - > tdb1 . transaction - > block_size ) > tdb - > tdb1 . transaction - > block_size ) {
tdb1_len_t len2 = tdb - > tdb1 . transaction - > block_size - ( off % tdb - > tdb1 . transaction - > block_size ) ;
2011-09-14 01:52:13 +04:00
if ( transaction1_read ( tdb , off , buf , len2 , cv ) ! = 0 ) {
return - 1 ;
}
len - = len2 ;
off + = len2 ;
buf = ( void * ) ( len2 + ( char * ) buf ) ;
}
if ( len = = 0 ) {
return 0 ;
}
2011-09-14 02:13:13 +04:00
blk = off / tdb - > tdb1 . transaction - > block_size ;
2011-09-14 01:52:13 +04:00
/* see if we have it in the block list */
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction - > num_blocks < = blk | |
tdb - > tdb1 . transaction - > blocks [ blk ] = = NULL ) {
2011-09-14 01:52:13 +04:00
/* nope, do a real read */
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction - > io_methods - > tdb1_read ( tdb , off , buf , len , cv ) ! = 0 ) {
2011-09-14 01:52:13 +04:00
goto fail ;
}
return 0 ;
}
/* it is in the block list. Now check for the last block */
2011-09-14 02:13:13 +04:00
if ( blk = = tdb - > tdb1 . transaction - > num_blocks - 1 ) {
if ( len > tdb - > tdb1 . transaction - > last_block_size ) {
2011-09-14 01:52:13 +04:00
goto fail ;
}
}
/* now copy it out of this block */
2011-09-14 02:13:13 +04:00
memcpy ( buf , tdb - > tdb1 . transaction - > blocks [ blk ] + ( off % tdb - > tdb1 . transaction - > block_size ) , len ) ;
2011-09-14 01:52:13 +04:00
if ( cv ) {
tdb1_convert ( buf , len ) ;
}
return 0 ;
fail :
2011-09-14 02:02:13 +04:00
tdb - > last_error = tdb_logerr ( tdb , TDB_ERR_IO , TDB_LOG_ERROR ,
" transaction_read: failed at off=%d len=%d " ,
off , len ) ;
2011-09-14 02:13:13 +04:00
tdb - > tdb1 . transaction - > transaction_error = 1 ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
/*
write while in a transaction
*/
2011-09-14 02:13:13 +04:00
static int transaction1_write ( struct tdb_context * tdb , tdb1_off_t off ,
2011-09-14 01:52:13 +04:00
const void * buf , tdb1_len_t len )
{
uint32_t blk ;
/* Only a commit is allowed on a prepared transaction */
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction - > prepared ) {
2011-09-14 02:02:13 +04:00
tdb - > last_error = tdb_logerr ( tdb , TDB_ERR_EINVAL , TDB_LOG_USE_ERROR ,
" transaction_write: transaction already "
" prepared, write not allowed " ) ;
2011-09-14 02:13:13 +04:00
tdb - > tdb1 . transaction - > transaction_error = 1 ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
/* if the write is to a hash head, then update the transaction
hash heads */
if ( len = = sizeof ( tdb1_off_t ) & & off > = TDB1_FREELIST_TOP & &
off < TDB1_FREELIST_TOP + TDB1_HASHTABLE_SIZE ( tdb ) ) {
uint32_t chain = ( off - TDB1_FREELIST_TOP ) / sizeof ( tdb1_off_t ) ;
2011-09-14 02:13:13 +04:00
memcpy ( & tdb - > tdb1 . transaction - > hash_heads [ chain ] , buf , len ) ;
2011-09-14 01:52:13 +04:00
}
/* break it up into block sized chunks */
2011-09-14 02:13:13 +04:00
while ( len + ( off % tdb - > tdb1 . transaction - > block_size ) > tdb - > tdb1 . transaction - > block_size ) {
tdb1_len_t len2 = tdb - > tdb1 . transaction - > block_size - ( off % tdb - > tdb1 . transaction - > block_size ) ;
2011-09-14 01:52:13 +04:00
if ( transaction1_write ( tdb , off , buf , len2 ) ! = 0 ) {
return - 1 ;
}
len - = len2 ;
off + = len2 ;
if ( buf ! = NULL ) {
buf = ( const void * ) ( len2 + ( const char * ) buf ) ;
}
}
if ( len = = 0 ) {
return 0 ;
}
2011-09-14 02:13:13 +04:00
blk = off / tdb - > tdb1 . transaction - > block_size ;
off = off % tdb - > tdb1 . transaction - > block_size ;
2011-09-14 01:52:13 +04:00
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction - > num_blocks < = blk ) {
2011-09-14 01:52:13 +04:00
uint8_t * * new_blocks ;
/* expand the blocks array */
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction - > blocks = = NULL ) {
2011-09-14 01:52:13 +04:00
new_blocks = ( uint8_t * * ) malloc (
( blk + 1 ) * sizeof ( uint8_t * ) ) ;
} else {
new_blocks = ( uint8_t * * ) realloc (
2011-09-14 02:13:13 +04:00
tdb - > tdb1 . transaction - > blocks ,
2011-09-14 01:52:13 +04:00
( blk + 1 ) * sizeof ( uint8_t * ) ) ;
}
if ( new_blocks = = NULL ) {
2011-09-14 02:02:13 +04:00
tdb - > last_error = TDB_ERR_OOM ;
2011-09-14 01:52:13 +04:00
goto fail ;
}
2011-09-14 02:13:13 +04:00
memset ( & new_blocks [ tdb - > tdb1 . transaction - > num_blocks ] , 0 ,
( 1 + ( blk - tdb - > tdb1 . transaction - > num_blocks ) ) * sizeof ( uint8_t * ) ) ;
tdb - > tdb1 . transaction - > blocks = new_blocks ;
tdb - > tdb1 . transaction - > num_blocks = blk + 1 ;
tdb - > tdb1 . transaction - > last_block_size = 0 ;
2011-09-14 01:52:13 +04:00
}
/* allocate and fill a block? */
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction - > blocks [ blk ] = = NULL ) {
tdb - > tdb1 . transaction - > blocks [ blk ] = ( uint8_t * ) calloc ( tdb - > tdb1 . transaction - > block_size , 1 ) ;
if ( tdb - > tdb1 . transaction - > blocks [ blk ] = = NULL ) {
2011-09-14 02:02:13 +04:00
tdb - > last_error = TDB_ERR_OOM ;
2011-09-14 02:13:13 +04:00
tdb - > tdb1 . transaction - > transaction_error = 1 ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction - > old_map_size > blk * tdb - > tdb1 . transaction - > block_size ) {
tdb1_len_t len2 = tdb - > tdb1 . transaction - > block_size ;
if ( len2 + ( blk * tdb - > tdb1 . transaction - > block_size ) > tdb - > tdb1 . transaction - > old_map_size ) {
len2 = tdb - > tdb1 . transaction - > old_map_size - ( blk * tdb - > tdb1 . transaction - > block_size ) ;
2011-09-14 01:52:13 +04:00
}
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction - > io_methods - > tdb1_read ( tdb , blk * tdb - > tdb1 . transaction - > block_size ,
tdb - > tdb1 . transaction - > blocks [ blk ] ,
2011-09-14 01:52:13 +04:00
len2 , 0 ) ! = 0 ) {
2011-09-14 02:13:13 +04:00
SAFE_FREE ( tdb - > tdb1 . transaction - > blocks [ blk ] ) ;
2011-09-14 02:02:13 +04:00
tdb - > last_error = TDB_ERR_IO ;
2011-09-14 01:52:13 +04:00
goto fail ;
}
2011-09-14 02:13:13 +04:00
if ( blk = = tdb - > tdb1 . transaction - > num_blocks - 1 ) {
tdb - > tdb1 . transaction - > last_block_size = len2 ;
2011-09-14 01:52:13 +04:00
}
}
}
/* overwrite part of an existing block */
if ( buf = = NULL ) {
2011-09-14 02:13:13 +04:00
memset ( tdb - > tdb1 . transaction - > blocks [ blk ] + off , 0 , len ) ;
2011-09-14 01:52:13 +04:00
} else {
2011-09-14 02:13:13 +04:00
memcpy ( tdb - > tdb1 . transaction - > blocks [ blk ] + off , buf , len ) ;
2011-09-14 01:52:13 +04:00
}
2011-09-14 02:13:13 +04:00
if ( blk = = tdb - > tdb1 . transaction - > num_blocks - 1 ) {
if ( len + off > tdb - > tdb1 . transaction - > last_block_size ) {
tdb - > tdb1 . transaction - > last_block_size = len + off ;
2011-09-14 01:52:13 +04:00
}
}
return 0 ;
fail :
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" transaction_write: failed at off=%d len=%d " ,
2011-09-14 02:13:13 +04:00
( blk * tdb - > tdb1 . transaction - > block_size ) + off , len ) ;
tdb - > tdb1 . transaction - > transaction_error = 1 ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
/*
write while in a transaction - this varient never expands the transaction blocks , it only
updates existing blocks . This means it cannot change the recovery size
*/
2011-09-14 02:13:13 +04:00
static int transaction1_write_existing ( struct tdb_context * tdb , tdb1_off_t off ,
2011-09-14 01:52:13 +04:00
const void * buf , tdb1_len_t len )
{
uint32_t blk ;
/* break it up into block sized chunks */
2011-09-14 02:13:13 +04:00
while ( len + ( off % tdb - > tdb1 . transaction - > block_size ) > tdb - > tdb1 . transaction - > block_size ) {
tdb1_len_t len2 = tdb - > tdb1 . transaction - > block_size - ( off % tdb - > tdb1 . transaction - > block_size ) ;
2011-09-14 01:52:13 +04:00
if ( transaction1_write_existing ( tdb , off , buf , len2 ) ! = 0 ) {
return - 1 ;
}
len - = len2 ;
off + = len2 ;
if ( buf ! = NULL ) {
buf = ( const void * ) ( len2 + ( const char * ) buf ) ;
}
}
if ( len = = 0 ) {
return 0 ;
}
2011-09-14 02:13:13 +04:00
blk = off / tdb - > tdb1 . transaction - > block_size ;
off = off % tdb - > tdb1 . transaction - > block_size ;
2011-09-14 01:52:13 +04:00
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction - > num_blocks < = blk | |
tdb - > tdb1 . transaction - > blocks [ blk ] = = NULL ) {
2011-09-14 01:52:13 +04:00
return 0 ;
}
2011-09-14 02:13:13 +04:00
if ( blk = = tdb - > tdb1 . transaction - > num_blocks - 1 & &
off + len > tdb - > tdb1 . transaction - > last_block_size ) {
if ( off > = tdb - > tdb1 . transaction - > last_block_size ) {
2011-09-14 01:52:13 +04:00
return 0 ;
}
2011-09-14 02:13:13 +04:00
len = tdb - > tdb1 . transaction - > last_block_size - off ;
2011-09-14 01:52:13 +04:00
}
/* overwrite part of an existing block */
2011-09-14 02:13:13 +04:00
memcpy ( tdb - > tdb1 . transaction - > blocks [ blk ] + off , buf , len ) ;
2011-09-14 01:52:13 +04:00
return 0 ;
}
/*
accelerated hash chain head search , using the cached hash heads
*/
2011-09-14 02:13:13 +04:00
static void transaction1_next_hash_chain ( struct tdb_context * tdb , uint32_t * chain )
2011-09-14 01:52:13 +04:00
{
uint32_t h = * chain ;
2011-09-14 02:13:13 +04:00
for ( ; h < tdb - > tdb1 . header . hash_size ; h + + ) {
2011-09-14 01:52:13 +04:00
/* the +1 takes account of the freelist */
2011-09-14 02:13:13 +04:00
if ( 0 ! = tdb - > tdb1 . transaction - > hash_heads [ h + 1 ] ) {
2011-09-14 01:52:13 +04:00
break ;
}
}
( * chain ) = h ;
}
/*
out of bounds check during a transaction
*/
2011-09-14 02:13:13 +04:00
static int transaction1_oob ( struct tdb_context * tdb , tdb1_off_t len , int probe )
2011-09-14 01:52:13 +04:00
{
2011-09-14 02:07:13 +04:00
if ( len < = tdb - > file - > map_size ) {
2011-09-14 01:52:13 +04:00
return 0 ;
}
2011-09-14 02:02:13 +04:00
tdb - > last_error = TDB_ERR_IO ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
/*
transaction version of tdb1_expand ( ) .
*/
2011-09-14 02:13:13 +04:00
static int transaction1_expand_file ( struct tdb_context * tdb , tdb1_off_t size ,
2011-09-14 01:52:13 +04:00
tdb1_off_t addition )
{
/* add a write to the transaction elements, so subsequent
reads see the zero data */
if ( transaction1_write ( tdb , size , NULL , addition ) ! = 0 ) {
return - 1 ;
}
2011-09-14 02:13:13 +04:00
tdb - > tdb1 . transaction - > expanded = true ;
2011-09-14 01:52:13 +04:00
return 0 ;
}
/*
  io method table used while a transaction is open;
  _tdb1_transaction_start() points tdb->tdb1.io at it.  This is a
  positional initialiser, so the entries must stay in the member order
  of struct tdb1_methods (declared outside this file).
*/
static const struct tdb1_methods transaction1_methods = {
transaction1_read ,
transaction1_write ,
transaction1_next_hash_chain ,
transaction1_oob ,
transaction1_expand_file ,
} ;
/*
start a tdb transaction . No token is returned , as only a single
2011-09-14 02:13:13 +04:00
transaction is allowed to be pending per tdb_context
2011-09-14 01:52:13 +04:00
*/
2011-09-14 02:13:13 +04:00
static int _tdb1_transaction_start ( struct tdb_context * tdb )
2011-09-14 01:52:13 +04:00
{
/* some sanity checks */
2011-09-14 02:41:13 +04:00
if ( tdb - > flags & TDB_INTERNAL ) {
tdb - > last_error = tdb_logerr ( tdb , TDB_ERR_EINVAL ,
TDB_LOG_USE_ERROR ,
" tdb1_transaction_start: "
" cannot start a "
" transaction on an "
" internal tdb " ) ;
return - 1 ;
}
if ( ( tdb - > flags & TDB_RDONLY ) | | tdb - > tdb1 . traverse_read ) {
tdb - > last_error = tdb_logerr ( tdb , TDB_ERR_RDONLY ,
TDB_LOG_USE_ERROR ,
" tdb_transaction_start: "
" cannot start a "
" transaction on a "
" read-only tdb " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
/* cope with nested tdb1_transaction_start() calls */
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction ! = NULL ) {
2011-09-14 02:05:13 +04:00
if ( ! ( tdb - > flags & TDB_ALLOW_NESTING ) ) {
2011-09-14 02:41:13 +04:00
tdb - > last_error
= tdb_logerr ( tdb , TDB_ERR_EINVAL ,
TDB_LOG_USE_ERROR ,
" tdb_transaction_start: "
" already inside transaction " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
2011-09-14 02:40:13 +04:00
tdb - > stats . transaction_nest + + ;
2011-09-14 02:13:13 +04:00
tdb - > tdb1 . transaction - > nesting + + ;
2011-09-14 01:52:13 +04:00
return 0 ;
}
if ( tdb1_have_extra_locks ( tdb ) ) {
/* the caller must not have any locks when starting a
transaction as otherwise we ' ll be screwed by lack
of nested locks in posix */
2011-09-14 02:02:13 +04:00
tdb - > last_error = tdb_logerr ( tdb , TDB_ERR_LOCK , TDB_LOG_USE_ERROR ,
" tdb1_transaction_start: cannot start a "
" transaction with locks held " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . travlocks . next ! = NULL ) {
2011-09-14 01:52:13 +04:00
/* you cannot use transactions inside a traverse (although you can use
traverse inside a transaction ) as otherwise you can end up with
deadlock */
2011-09-14 02:02:13 +04:00
tdb - > last_error = tdb_logerr ( tdb , TDB_ERR_LOCK , TDB_LOG_USE_ERROR ,
" tdb1_transaction_start: cannot start a "
" transaction within a traverse " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
2011-09-14 02:13:13 +04:00
tdb - > tdb1 . transaction = ( struct tdb1_transaction * )
2011-09-14 01:52:13 +04:00
calloc ( sizeof ( struct tdb1_transaction ) , 1 ) ;
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction = = NULL ) {
2011-09-14 02:02:13 +04:00
tdb - > last_error = TDB_ERR_OOM ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
/* a page at a time seems like a reasonable compromise between compactness and efficiency */
2011-09-14 02:13:13 +04:00
tdb - > tdb1 . transaction - > block_size = tdb - > tdb1 . page_size ;
2011-09-14 01:52:13 +04:00
/* get the transaction write lock. This is a blocking lock. As
discussed with Volker , there are a number of ways we could
make this async , which we will probably do in the future */
2011-09-14 02:03:13 +04:00
if ( tdb1_transaction_lock ( tdb , F_WRLCK , TDB_LOCK_WAIT ) = = - 1 ) {
2011-09-14 02:13:13 +04:00
SAFE_FREE ( tdb - > tdb1 . transaction - > blocks ) ;
SAFE_FREE ( tdb - > tdb1 . transaction ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
/* get a read lock from the freelist to the end of file. This
is upgraded to a write lock during the commit */
2011-09-14 02:03:13 +04:00
if ( tdb1_allrecord_lock ( tdb , F_RDLCK , TDB_LOCK_WAIT , true ) = = - 1 ) {
2011-09-14 02:09:13 +04:00
if ( errno ! = EAGAIN & & errno ! = EINTR ) {
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_transaction_start: "
" failed to get hash locks " ) ;
}
2011-09-14 01:52:13 +04:00
goto fail_allrecord_lock ;
}
/* setup a copy of the hash table heads so the hash scan in
traverse can be fast */
2011-09-14 02:13:13 +04:00
tdb - > tdb1 . transaction - > hash_heads = ( uint32_t * )
calloc ( tdb - > tdb1 . header . hash_size + 1 , sizeof ( uint32_t ) ) ;
if ( tdb - > tdb1 . transaction - > hash_heads = = NULL ) {
2011-09-14 02:02:13 +04:00
tdb - > last_error = TDB_ERR_OOM ;
2011-09-14 01:52:13 +04:00
goto fail ;
}
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . io - > tdb1_read ( tdb , TDB1_FREELIST_TOP , tdb - > tdb1 . transaction - > hash_heads ,
2011-09-14 01:52:13 +04:00
TDB1_HASHTABLE_SIZE ( tdb ) , 0 ) ! = 0 ) {
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_transaction_start: failed to read hash heads " ) ;
2011-09-14 01:52:13 +04:00
goto fail ;
}
/* make sure we know about any file expansions already done by
anyone else */
2011-09-14 02:13:13 +04:00
tdb - > tdb1 . io - > tdb1_oob ( tdb , tdb - > file - > map_size + 1 , 1 ) ;
tdb - > tdb1 . transaction - > old_map_size = tdb - > file - > map_size ;
2011-09-14 01:52:13 +04:00
/* finally hook the io methods, replacing them with
transaction specific methods */
2011-09-14 02:13:13 +04:00
tdb - > tdb1 . transaction - > io_methods = tdb - > tdb1 . io ;
tdb - > tdb1 . io = & transaction1_methods ;
2011-09-14 01:52:13 +04:00
2011-09-14 02:40:13 +04:00
tdb - > stats . transactions + + ;
2011-09-14 01:52:13 +04:00
return 0 ;
fail :
2011-09-14 01:57:13 +04:00
tdb1_allrecord_unlock ( tdb , F_RDLCK ) ;
2011-09-14 01:52:13 +04:00
fail_allrecord_lock :
tdb1_transaction_unlock ( tdb , F_WRLCK ) ;
2011-09-14 02:13:13 +04:00
SAFE_FREE ( tdb - > tdb1 . transaction - > blocks ) ;
SAFE_FREE ( tdb - > tdb1 . transaction - > hash_heads ) ;
SAFE_FREE ( tdb - > tdb1 . transaction ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
2011-09-14 02:13:13 +04:00
/*
  Public entry point for starting a transaction; forwards to
  _tdb1_transaction_start() and returns its result unchanged.
*/
int tdb1_transaction_start(struct tdb_context *tdb)
{
	int rc = _tdb1_transaction_start(tdb);

	return rc;
}
/*
sync to disk
*/
2011-09-14 02:13:13 +04:00
static int transaction1_sync ( struct tdb_context * tdb , tdb1_off_t offset , tdb1_len_t length )
2011-09-14 01:52:13 +04:00
{
2011-09-14 02:05:13 +04:00
if ( tdb - > flags & TDB_NOSYNC ) {
2011-09-14 01:52:13 +04:00
return 0 ;
}
# if HAVE_FDATASYNC
2011-09-14 02:07:13 +04:00
if ( fdatasync ( tdb - > file - > fd ) ! = 0 ) {
2011-09-14 01:52:13 +04:00
# else
2011-09-14 02:07:13 +04:00
if ( fsync ( tdb - > file - > fd ) ! = 0 ) {
2011-09-14 01:52:13 +04:00
# endif
2011-09-14 02:02:13 +04:00
tdb - > last_error = tdb_logerr ( tdb , TDB_ERR_IO , TDB_LOG_ERROR ,
" tdb1_transaction: fsync failed " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
# if HAVE_MMAP
2011-09-14 02:07:13 +04:00
if ( tdb - > file - > map_ptr ) {
2011-09-14 02:13:13 +04:00
tdb1_off_t moffset = offset & ~ ( tdb - > tdb1 . page_size - 1 ) ;
2011-09-14 02:07:13 +04:00
if ( msync ( moffset + ( char * ) tdb - > file - > map_ptr ,
2011-09-14 01:52:13 +04:00
length + ( offset - moffset ) , MS_SYNC ) ! = 0 ) {
2011-09-14 02:02:13 +04:00
tdb - > last_error = tdb_logerr ( tdb , TDB_ERR_IO , TDB_LOG_ERROR ,
" tdb1_transaction: "
" msync failed - %s " ,
strerror ( errno ) ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
}
# endif
return 0 ;
}
2011-09-14 02:13:13 +04:00
static int _tdb1_transaction_cancel ( struct tdb_context * tdb )
2011-09-14 01:52:13 +04:00
{
int i , ret = 0 ;
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction = = NULL ) {
2011-09-14 02:02:13 +04:00
tdb - > last_error = tdb_logerr ( tdb , TDB_ERR_EINVAL , TDB_LOG_USE_ERROR ,
" tdb1_transaction_cancel: "
" no transaction " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction - > nesting ! = 0 ) {
tdb - > tdb1 . transaction - > transaction_error = 1 ;
tdb - > tdb1 . transaction - > nesting - - ;
2011-09-14 01:52:13 +04:00
return 0 ;
}
2011-09-14 02:13:13 +04:00
tdb - > file - > map_size = tdb - > tdb1 . transaction - > old_map_size ;
2011-09-14 01:52:13 +04:00
/* free all the transaction blocks */
2011-09-14 02:13:13 +04:00
for ( i = 0 ; i < tdb - > tdb1 . transaction - > num_blocks ; i + + ) {
if ( tdb - > tdb1 . transaction - > blocks [ i ] ! = NULL ) {
free ( tdb - > tdb1 . transaction - > blocks [ i ] ) ;
2011-09-14 01:52:13 +04:00
}
}
2011-09-14 02:13:13 +04:00
SAFE_FREE ( tdb - > tdb1 . transaction - > blocks ) ;
2011-09-14 01:52:13 +04:00
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction - > magic_offset ) {
const struct tdb1_methods * methods = tdb - > tdb1 . transaction - > io_methods ;
2011-09-14 01:52:13 +04:00
const uint32_t invalid = TDB1_RECOVERY_INVALID_MAGIC ;
/* remove the recovery marker */
2011-09-14 02:13:13 +04:00
if ( methods - > tdb1_write ( tdb , tdb - > tdb1 . transaction - > magic_offset , & invalid , 4 ) = = - 1 | |
transaction1_sync ( tdb , tdb - > tdb1 . transaction - > magic_offset , 4 ) = = - 1 ) {
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_transaction_cancel: failed to "
" remove recovery magic " ) ;
2011-09-14 01:52:13 +04:00
ret = - 1 ;
}
}
/* This also removes the OPEN_LOCK, if we have it. */
tdb1_release_transaction_locks ( tdb ) ;
/* restore the normal io methods */
2011-09-14 02:13:13 +04:00
tdb - > tdb1 . io = tdb - > tdb1 . transaction - > io_methods ;
2011-09-14 01:52:13 +04:00
2011-09-14 02:13:13 +04:00
SAFE_FREE ( tdb - > tdb1 . transaction - > hash_heads ) ;
SAFE_FREE ( tdb - > tdb1 . transaction ) ;
2011-09-14 01:52:13 +04:00
return ret ;
}
/*
cancel the current transaction
*/
2011-09-14 02:13:13 +04:00
int tdb1_transaction_cancel ( struct tdb_context * tdb )
2011-09-14 01:52:13 +04:00
{
2011-09-14 02:40:13 +04:00
tdb - > stats . transaction_cancel + + ;
2011-09-14 01:52:13 +04:00
return _tdb1_transaction_cancel ( tdb ) ;
}
/*
work out how much space the linearised recovery data will consume
*/
2011-09-14 02:13:13 +04:00
static tdb1_len_t tdb1_recovery_size ( struct tdb_context * tdb )
2011-09-14 01:52:13 +04:00
{
tdb1_len_t recovery_size = 0 ;
int i ;
recovery_size = sizeof ( uint32_t ) ;
2011-09-14 02:13:13 +04:00
for ( i = 0 ; i < tdb - > tdb1 . transaction - > num_blocks ; i + + ) {
if ( i * tdb - > tdb1 . transaction - > block_size > = tdb - > tdb1 . transaction - > old_map_size ) {
2011-09-14 01:52:13 +04:00
break ;
}
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction - > blocks [ i ] = = NULL ) {
2011-09-14 01:52:13 +04:00
continue ;
}
recovery_size + = 2 * sizeof ( tdb1_off_t ) ;
2011-09-14 02:13:13 +04:00
if ( i = = tdb - > tdb1 . transaction - > num_blocks - 1 ) {
recovery_size + = tdb - > tdb1 . transaction - > last_block_size ;
2011-09-14 01:52:13 +04:00
} else {
2011-09-14 02:13:13 +04:00
recovery_size + = tdb - > tdb1 . transaction - > block_size ;
2011-09-14 01:52:13 +04:00
}
}
return recovery_size ;
}
2011-09-14 02:13:13 +04:00
/*
  Locate the recovery area.

  Reads the recovery head offset stored at TDB1_RECOVERY_HEAD into
  *recovery_offset.  If it is non-zero, the record header at that offset
  is read into *rec through the supplied io methods.  When no area is
  recorded, or the header carries neither TDB1_RECOVERY_MAGIC nor
  TDB1_RECOVERY_INVALID_MAGIC, rec->rec_len is set to 0 (and
  *recovery_offset to 0).

  Returns 0 on success, -1 if either read fails.
*/
int tdb1_recovery_area(struct tdb_context *tdb,
		       const struct tdb1_methods *methods,
		       tdb1_off_t *recovery_offset,
		       struct tdb1_record *rec)
{
	bool known_magic;

	if (tdb1_ofs_read(tdb, TDB1_RECOVERY_HEAD, recovery_offset) == -1) {
		return -1;
	}

	if (*recovery_offset == 0) {
		rec->rec_len = 0;
		return 0;
	}

	if (methods->tdb1_read(tdb, *recovery_offset, rec, sizeof(*rec),
			       TDB1_DOCONV()) == -1) {
		return -1;
	}

	/* ignore invalid recovery regions: can happen in crash */
	known_magic = (rec->magic == TDB1_RECOVERY_MAGIC ||
		       rec->magic == TDB1_RECOVERY_INVALID_MAGIC);
	if (!known_magic) {
		*recovery_offset = 0;
		rec->rec_len = 0;
	}

	return 0;
}
/*
allocate the recovery area , or use an existing recovery area if it is
large enough
*/
2011-09-14 02:13:13 +04:00
static int tdb1_recovery_allocate ( struct tdb_context * tdb ,
2011-09-14 01:52:13 +04:00
tdb1_len_t * recovery_size ,
tdb1_off_t * recovery_offset ,
tdb1_len_t * recovery_max_size )
{
struct tdb1_record rec ;
2011-09-14 02:13:13 +04:00
const struct tdb1_methods * methods = tdb - > tdb1 . transaction - > io_methods ;
2011-09-14 01:52:13 +04:00
tdb1_off_t recovery_head ;
if ( tdb1_recovery_area ( tdb , methods , & recovery_head , & rec ) = = - 1 ) {
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_recovery_allocate: "
" failed to read recovery head " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
* recovery_size = tdb1_recovery_size ( tdb ) ;
if ( recovery_head ! = 0 & & * recovery_size < = rec . rec_len ) {
/* it fits in the existing area */
* recovery_max_size = rec . rec_len ;
* recovery_offset = recovery_head ;
return 0 ;
}
/* we need to free up the old recovery area, then allocate a
new one at the end of the file . Note that we cannot use
tdb1_allocate ( ) to allocate the new one as that might return
us an area that is being currently used ( as of the start of
the transaction ) */
if ( recovery_head ! = 0 ) {
if ( tdb1_free ( tdb , recovery_head , & rec ) = = - 1 ) {
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_recovery_allocate: failed to free "
" previous recovery area " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
}
/* the tdb1_free() call might have increased the recovery size */
* recovery_size = tdb1_recovery_size ( tdb ) ;
/* round up to a multiple of page size */
2011-09-14 02:13:13 +04:00
* recovery_max_size = TDB1_ALIGN ( sizeof ( rec ) + * recovery_size ,
tdb - > tdb1 . page_size ) - sizeof ( rec ) ;
2011-09-14 02:07:13 +04:00
* recovery_offset = tdb - > file - > map_size ;
2011-09-14 01:52:13 +04:00
recovery_head = * recovery_offset ;
2011-09-14 02:13:13 +04:00
if ( methods - > tdb1_expand_file ( tdb , tdb - > tdb1 . transaction - > old_map_size ,
( tdb - > file - > map_size - tdb - > tdb1 . transaction - > old_map_size ) +
2011-09-14 01:52:13 +04:00
sizeof ( rec ) + * recovery_max_size ) = = - 1 ) {
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_recovery_allocate: "
" failed to create recovery area " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
2011-09-14 02:40:13 +04:00
tdb - > stats . transaction_expand_file + + ;
2011-09-14 01:52:13 +04:00
/* remap the file (if using mmap) */
2011-09-14 02:07:13 +04:00
methods - > tdb1_oob ( tdb , tdb - > file - > map_size + 1 , 1 ) ;
2011-09-14 01:52:13 +04:00
/* we have to reset the old map size so that we don't try to expand the file
again in the transaction commit , which would destroy the recovery area */
2011-09-14 02:13:13 +04:00
tdb - > tdb1 . transaction - > old_map_size = tdb - > file - > map_size ;
2011-09-14 01:52:13 +04:00
/* write the recovery header offset and sync - we can sync without a race here
as the magic ptr in the recovery record has not been set */
TDB1_CONV ( recovery_head ) ;
if ( methods - > tdb1_write ( tdb , TDB1_RECOVERY_HEAD ,
& recovery_head , sizeof ( tdb1_off_t ) ) = = - 1 ) {
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_recovery_allocate: "
" failed to write recovery head " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
if ( transaction1_write_existing ( tdb , TDB1_RECOVERY_HEAD , & recovery_head , sizeof ( tdb1_off_t ) ) = = - 1 ) {
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_recovery_allocate: "
" failed to write recovery head " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
return 0 ;
}
/*
setup the recovery data that will be used on a crash during commit
*/
2011-09-14 02:13:13 +04:00
static int transaction1_setup_recovery ( struct tdb_context * tdb ,
2011-09-14 01:52:13 +04:00
tdb1_off_t * magic_offset )
{
tdb1_len_t recovery_size ;
unsigned char * data , * p ;
2011-09-14 02:13:13 +04:00
const struct tdb1_methods * methods = tdb - > tdb1 . transaction - > io_methods ;
2011-09-14 01:52:13 +04:00
struct tdb1_record * rec ;
tdb1_off_t recovery_offset , recovery_max_size ;
2011-09-14 02:13:13 +04:00
tdb1_off_t old_map_size = tdb - > tdb1 . transaction - > old_map_size ;
2011-09-14 01:52:13 +04:00
uint32_t magic , tailer ;
int i ;
/*
check that the recovery area has enough space
*/
if ( tdb1_recovery_allocate ( tdb , & recovery_size ,
& recovery_offset , & recovery_max_size ) = = - 1 ) {
return - 1 ;
}
data = ( unsigned char * ) malloc ( recovery_size + sizeof ( * rec ) ) ;
if ( data = = NULL ) {
2011-09-14 02:02:13 +04:00
tdb - > last_error = TDB_ERR_OOM ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
rec = ( struct tdb1_record * ) data ;
memset ( rec , 0 , sizeof ( * rec ) ) ;
rec - > magic = TDB1_RECOVERY_INVALID_MAGIC ;
rec - > data_len = recovery_size ;
rec - > rec_len = recovery_max_size ;
rec - > key_len = old_map_size ;
TDB1_CONV ( * rec ) ;
/* build the recovery data into a single blob to allow us to do a single
large write , which should be more efficient */
p = data + sizeof ( * rec ) ;
2011-09-14 02:13:13 +04:00
for ( i = 0 ; i < tdb - > tdb1 . transaction - > num_blocks ; i + + ) {
2011-09-14 01:52:13 +04:00
tdb1_off_t offset ;
tdb1_len_t length ;
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction - > blocks [ i ] = = NULL ) {
2011-09-14 01:52:13 +04:00
continue ;
}
2011-09-14 02:13:13 +04:00
offset = i * tdb - > tdb1 . transaction - > block_size ;
length = tdb - > tdb1 . transaction - > block_size ;
if ( i = = tdb - > tdb1 . transaction - > num_blocks - 1 ) {
length = tdb - > tdb1 . transaction - > last_block_size ;
2011-09-14 01:52:13 +04:00
}
if ( offset > = old_map_size ) {
continue ;
}
2011-09-14 02:13:13 +04:00
if ( offset + length > tdb - > tdb1 . transaction - > old_map_size ) {
2011-09-14 02:02:13 +04:00
tdb - > last_error = tdb_logerr ( tdb , TDB_ERR_CORRUPT ,
TDB_LOG_ERROR ,
" tdb1_transaction_setup_recovery: transaction data over new region boundary " ) ;
2011-09-14 01:52:13 +04:00
free ( data ) ;
return - 1 ;
}
memcpy ( p , & offset , 4 ) ;
memcpy ( p + 4 , & length , 4 ) ;
if ( TDB1_DOCONV ( ) ) {
tdb1_convert ( p , 8 ) ;
}
/* the recovery area contains the old data, not the
new data , so we have to call the original tdb1_read
method to get it */
if ( methods - > tdb1_read ( tdb , offset , p + 8 , length , 0 ) ! = 0 ) {
free ( data ) ;
2011-09-14 02:02:13 +04:00
tdb - > last_error = TDB_ERR_IO ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
p + = 8 + length ;
}
/* and the tailer */
tailer = sizeof ( * rec ) + recovery_max_size ;
memcpy ( p , & tailer , 4 ) ;
if ( TDB1_DOCONV ( ) ) {
tdb1_convert ( p , 4 ) ;
}
/* write the recovery data to the recovery area */
if ( methods - > tdb1_write ( tdb , recovery_offset , data , sizeof ( * rec ) + recovery_size ) = = - 1 ) {
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_transaction_setup_recovery: "
" failed to write recovery data " ) ;
2011-09-14 01:52:13 +04:00
free ( data ) ;
return - 1 ;
}
if ( transaction1_write_existing ( tdb , recovery_offset , data , sizeof ( * rec ) + recovery_size ) = = - 1 ) {
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_transaction_setup_recovery: failed to write "
" secondary recovery data " ) ;
2011-09-14 01:52:13 +04:00
free ( data ) ;
return - 1 ;
}
/* as we don't have ordered writes, we have to sync the recovery
data before we update the magic to indicate that the recovery
data is present */
if ( transaction1_sync ( tdb , recovery_offset , sizeof ( * rec ) + recovery_size ) = = - 1 ) {
free ( data ) ;
return - 1 ;
}
free ( data ) ;
magic = TDB1_RECOVERY_MAGIC ;
TDB1_CONV ( magic ) ;
* magic_offset = recovery_offset + offsetof ( struct tdb1_record , magic ) ;
if ( methods - > tdb1_write ( tdb , * magic_offset , & magic , sizeof ( magic ) ) = = - 1 ) {
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_transaction_setup_recovery: "
" failed to write recovery magic " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
if ( transaction1_write_existing ( tdb , * magic_offset , & magic , sizeof ( magic ) ) = = - 1 ) {
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_transaction_setup_recovery: "
" failed to write secondary recovery magic " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
/* ensure the recovery magic marker is on disk */
if ( transaction1_sync ( tdb , * magic_offset , sizeof ( magic ) ) = = - 1 ) {
return - 1 ;
}
return 0 ;
}
2011-09-14 02:13:13 +04:00
static int _tdb1_transaction_prepare_commit ( struct tdb_context * tdb )
2011-09-14 01:52:13 +04:00
{
const struct tdb1_methods * methods ;
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction = = NULL ) {
2011-09-14 02:02:13 +04:00
tdb - > last_error = tdb_logerr ( tdb , TDB_ERR_EINVAL , TDB_LOG_USE_ERROR ,
" tdb1_transaction_prepare_commit: "
" no transaction " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction - > prepared ) {
2011-09-14 02:02:13 +04:00
tdb - > last_error = tdb_logerr ( tdb , TDB_ERR_EINVAL , TDB_LOG_USE_ERROR ,
" tdb1_transaction_prepare_commit: "
" transaction already prepared " ) ;
2011-09-14 01:52:13 +04:00
_tdb1_transaction_cancel ( tdb ) ;
return - 1 ;
}
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction - > transaction_error ) {
2011-09-14 02:02:13 +04:00
tdb - > last_error = tdb_logerr ( tdb , TDB_ERR_IO , TDB_LOG_ERROR ,
" tdb1_transaction_prepare_commit: "
" transaction error pending " ) ;
2011-09-14 01:52:13 +04:00
_tdb1_transaction_cancel ( tdb ) ;
return - 1 ;
}
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction - > nesting ! = 0 ) {
2011-09-14 01:52:13 +04:00
return 0 ;
}
/* check for a null transaction */
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction - > blocks = = NULL ) {
2011-09-14 01:52:13 +04:00
return 0 ;
}
2011-09-14 02:13:13 +04:00
methods = tdb - > tdb1 . transaction - > io_methods ;
2011-09-14 01:52:13 +04:00
/* if there are any locks pending then the caller has not
nested their locks properly , so fail the transaction */
if ( tdb1_have_extra_locks ( tdb ) ) {
2011-09-14 02:02:13 +04:00
tdb - > last_error = tdb_logerr ( tdb , TDB_ERR_LOCK , TDB_LOG_USE_ERROR ,
" tdb1_transaction_prepare_commit: "
" locks pending on commit " ) ;
2011-09-14 01:52:13 +04:00
_tdb1_transaction_cancel ( tdb ) ;
return - 1 ;
}
/* upgrade the main transaction lock region to a write lock */
if ( tdb1_allrecord_upgrade ( tdb ) = = - 1 ) {
2011-09-14 02:09:13 +04:00
if ( errno ! = EAGAIN & & errno ! = EINTR ) {
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_transaction_prepare_commit: "
" failed to upgrade hash locks " ) ;
}
2011-09-14 01:52:13 +04:00
return - 1 ;
}
/* get the open lock - this prevents new users attaching to the database
during the commit */
2011-09-14 02:03:13 +04:00
if ( tdb1_nest_lock ( tdb , TDB1_OPEN_LOCK , F_WRLCK , TDB_LOCK_WAIT ) = = - 1 ) {
2011-09-14 02:09:13 +04:00
if ( errno ! = EAGAIN & & errno ! = EINTR ) {
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_transaction_prepare_commit: "
" failed to get open lock " ) ;
}
2011-09-14 01:52:13 +04:00
return - 1 ;
}
2011-09-14 02:05:13 +04:00
if ( ! ( tdb - > flags & TDB_NOSYNC ) ) {
2011-09-14 01:52:13 +04:00
/* write the recovery data to the end of the file */
2011-09-14 02:13:13 +04:00
if ( transaction1_setup_recovery ( tdb , & tdb - > tdb1 . transaction - > magic_offset ) = = - 1 ) {
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_transaction_prepare_commit: "
" failed to setup recovery data " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
}
2011-09-14 02:13:13 +04:00
tdb - > tdb1 . transaction - > prepared = true ;
2011-09-14 01:52:13 +04:00
/* expand the file to the new size if needed */
2011-09-14 02:13:13 +04:00
if ( tdb - > file - > map_size ! = tdb - > tdb1 . transaction - > old_map_size ) {
if ( methods - > tdb1_expand_file ( tdb , tdb - > tdb1 . transaction - > old_map_size ,
2011-09-14 02:07:13 +04:00
tdb - > file - > map_size -
2011-09-14 02:13:13 +04:00
tdb - > tdb1 . transaction - > old_map_size ) = = - 1 ) {
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_transaction_prepare_commit: "
" expansion failed " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
2011-09-14 02:40:13 +04:00
tdb - > stats . transaction_expand_file + + ;
2011-09-14 02:13:13 +04:00
tdb - > file - > map_size = tdb - > tdb1 . transaction - > old_map_size ;
2011-09-14 02:07:13 +04:00
methods - > tdb1_oob ( tdb , tdb - > file - > map_size + 1 , 1 ) ;
2011-09-14 01:52:13 +04:00
}
/* Keep the open lock until the actual commit */
return 0 ;
}
/*
  Public entry point for preparing the current transaction for commit.
  Forwards to _tdb1_transaction_prepare_commit() and hands its result
  back unchanged.
*/
int tdb1_transaction_prepare_commit(struct tdb_context *tdb)
{
	const int rc = _tdb1_transaction_prepare_commit(tdb);

	return rc;
}
/* A repack is worthwhile if the largest is less than half total free. */
2011-09-14 02:13:13 +04:00
static bool repack_worthwhile ( struct tdb_context * tdb )
2011-09-14 01:52:13 +04:00
{
tdb1_off_t ptr ;
struct tdb1_record rec ;
tdb1_len_t total = 0 , largest = 0 ;
if ( tdb1_ofs_read ( tdb , TDB1_FREELIST_TOP , & ptr ) = = - 1 ) {
return false ;
}
while ( ptr ! = 0 & & tdb1_rec_free_read ( tdb , ptr , & rec ) = = 0 ) {
total + = rec . rec_len ;
if ( rec . rec_len > largest ) {
largest = rec . rec_len ;
}
ptr = rec . next ;
}
return total > largest * 2 ;
}
/*
commit the current transaction
*/
2011-09-14 02:13:13 +04:00
int tdb1_transaction_commit ( struct tdb_context * tdb )
2011-09-14 01:52:13 +04:00
{
const struct tdb1_methods * methods ;
int i ;
bool need_repack = false ;
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction = = NULL ) {
2011-09-14 02:02:13 +04:00
tdb - > last_error = tdb_logerr ( tdb , TDB_ERR_EINVAL , TDB_LOG_USE_ERROR ,
" tdb1_transaction_commit: "
" no transaction " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction - > transaction_error ) {
2011-09-14 02:02:13 +04:00
tdb - > last_error = tdb_logerr ( tdb , TDB_ERR_IO , TDB_LOG_ERROR ,
" tdb1_transaction_commit: "
" transaction error pending " ) ;
2011-09-14 01:52:13 +04:00
_tdb1_transaction_cancel ( tdb ) ;
return - 1 ;
}
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction - > nesting ! = 0 ) {
tdb - > tdb1 . transaction - > nesting - - ;
2011-09-14 01:52:13 +04:00
return 0 ;
}
/* check for a null transaction */
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction - > blocks = = NULL ) {
2011-09-14 01:52:13 +04:00
_tdb1_transaction_cancel ( tdb ) ;
return 0 ;
}
2011-09-14 02:13:13 +04:00
if ( ! tdb - > tdb1 . transaction - > prepared ) {
2011-09-14 01:52:13 +04:00
int ret = _tdb1_transaction_prepare_commit ( tdb ) ;
2011-09-14 02:10:13 +04:00
if ( ret ) {
_tdb1_transaction_cancel ( tdb ) ;
2011-09-14 01:52:13 +04:00
return ret ;
2011-09-14 02:10:13 +04:00
}
2011-09-14 01:52:13 +04:00
}
2011-09-14 02:13:13 +04:00
methods = tdb - > tdb1 . transaction - > io_methods ;
2011-09-14 01:52:13 +04:00
/* perform all the writes */
2011-09-14 02:13:13 +04:00
for ( i = 0 ; i < tdb - > tdb1 . transaction - > num_blocks ; i + + ) {
2011-09-14 01:52:13 +04:00
tdb1_off_t offset ;
tdb1_len_t length ;
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction - > blocks [ i ] = = NULL ) {
2011-09-14 01:52:13 +04:00
continue ;
}
2011-09-14 02:13:13 +04:00
offset = i * tdb - > tdb1 . transaction - > block_size ;
length = tdb - > tdb1 . transaction - > block_size ;
if ( i = = tdb - > tdb1 . transaction - > num_blocks - 1 ) {
length = tdb - > tdb1 . transaction - > last_block_size ;
2011-09-14 01:52:13 +04:00
}
2011-09-14 02:13:13 +04:00
if ( methods - > tdb1_write ( tdb , offset , tdb - > tdb1 . transaction - > blocks [ i ] , length ) = = - 1 ) {
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_transaction_commit: "
" write failed during commit " ) ;
2011-09-14 01:52:13 +04:00
/* we've overwritten part of the data and
possibly expanded the file , so we need to
run the crash recovery code */
2011-09-14 02:13:13 +04:00
tdb - > tdb1 . io = methods ;
2011-09-14 01:52:13 +04:00
tdb1_transaction_recover ( tdb ) ;
_tdb1_transaction_cancel ( tdb ) ;
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_transaction_commit: write failed " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
2011-09-14 02:13:13 +04:00
SAFE_FREE ( tdb - > tdb1 . transaction - > blocks [ i ] ) ;
2011-09-14 01:52:13 +04:00
}
/* Do this before we drop lock or blocks. */
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . transaction - > expanded ) {
2011-09-14 01:52:13 +04:00
need_repack = repack_worthwhile ( tdb ) ;
}
2011-09-14 02:13:13 +04:00
SAFE_FREE ( tdb - > tdb1 . transaction - > blocks ) ;
tdb - > tdb1 . transaction - > num_blocks = 0 ;
2011-09-14 01:52:13 +04:00
/* ensure the new data is on disk */
2011-09-14 02:07:13 +04:00
if ( transaction1_sync ( tdb , 0 , tdb - > file - > map_size ) = = - 1 ) {
2011-09-14 01:52:13 +04:00
return - 1 ;
}
/*
TODO : maybe write to some dummy hdr field , or write to magic
offset without mmap , before the last sync , instead of the
utime ( ) call
*/
/* on some systems (like Linux 2.6.x) changes via mmap/msync
don ' t change the mtime of the file , this means the file may
not be backed up ( as tdb rounding to block sizes means that
file size changes are quite rare too ) . The following forces
mtime changes when a transaction completes */
# if HAVE_UTIME
utime ( tdb - > name , NULL ) ;
# endif
/* use a transaction cancel to free memory and remove the
transaction locks */
_tdb1_transaction_cancel ( tdb ) ;
if ( need_repack ) {
2011-09-14 02:43:27 +04:00
if ( tdb_repack ( tdb ) ! = 0 )
return - 1 ;
2011-09-14 01:52:13 +04:00
}
return 0 ;
}
/*
recover from an aborted transaction . Must be called with exclusive
database write access already established ( including the open
lock to prevent new processes attaching )
*/
2011-09-14 02:13:13 +04:00
int tdb1_transaction_recover ( struct tdb_context * tdb )
2011-09-14 01:52:13 +04:00
{
tdb1_off_t recovery_head , recovery_eof ;
unsigned char * data , * p ;
uint32_t zero = 0 ;
struct tdb1_record rec ;
/* find the recovery area */
if ( tdb1_ofs_read ( tdb , TDB1_RECOVERY_HEAD , & recovery_head ) = = - 1 ) {
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_transaction_recover: "
" failed to read recovery head " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
if ( recovery_head = = 0 ) {
/* we have never allocated a recovery record */
return 0 ;
}
/* read the recovery record */
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . io - > tdb1_read ( tdb , recovery_head , & rec ,
2011-09-14 01:52:13 +04:00
sizeof ( rec ) , TDB1_DOCONV ( ) ) = = - 1 ) {
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_transaction_recover: "
" failed to read recovery record " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
if ( rec . magic ! = TDB1_RECOVERY_MAGIC ) {
/* there is no valid recovery data */
return 0 ;
}
2011-09-14 02:12:13 +04:00
if ( tdb - > flags & TDB_RDONLY ) {
2011-09-14 02:02:13 +04:00
tdb - > last_error = tdb_logerr ( tdb , TDB_ERR_CORRUPT , TDB_LOG_ERROR ,
" tdb1_transaction_recover: "
" attempt to recover read only "
" database " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
recovery_eof = rec . key_len ;
data = ( unsigned char * ) malloc ( rec . data_len ) ;
if ( data = = NULL ) {
2011-09-14 02:02:13 +04:00
tdb - > last_error = tdb_logerr ( tdb , TDB_ERR_OOM , TDB_LOG_ERROR ,
" tdb1_transaction_recover: "
" failed to allocate recovery data " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
/* read the full recovery data */
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . io - > tdb1_read ( tdb , recovery_head + sizeof ( rec ) , data ,
2011-09-14 01:52:13 +04:00
rec . data_len , 0 ) = = - 1 ) {
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_transaction_recover: "
" failed to read recovery data " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
/* recover the file data */
p = data ;
while ( p + 8 < data + rec . data_len ) {
uint32_t ofs , len ;
if ( TDB1_DOCONV ( ) ) {
tdb1_convert ( p , 8 ) ;
}
memcpy ( & ofs , p , 4 ) ;
memcpy ( & len , p + 4 , 4 ) ;
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . io - > tdb1_write ( tdb , ofs , p + 8 , len ) = = - 1 ) {
2011-09-14 01:52:13 +04:00
free ( data ) ;
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_transaction_recover: failed to recover "
" %d bytes at offset %d " , len , ofs ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
p + = 8 + len ;
}
free ( data ) ;
2011-09-14 02:07:13 +04:00
if ( transaction1_sync ( tdb , 0 , tdb - > file - > map_size ) = = - 1 ) {
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_transaction_recover: failed to sync recovery " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
/* if the recovery area is after the recovered eof then remove it */
if ( recovery_eof < = recovery_head ) {
if ( tdb1_ofs_write ( tdb , TDB1_RECOVERY_HEAD , & zero ) = = - 1 ) {
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_transaction_recover: failed to remove "
" recovery head " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
}
/* remove the recovery magic */
if ( tdb1_ofs_write ( tdb , recovery_head + offsetof ( struct tdb1_record , magic ) ,
& zero ) = = - 1 ) {
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_transaction_recover: failed to remove "
" recovery magic " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
if ( transaction1_sync ( tdb , 0 , recovery_eof ) = = - 1 ) {
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , tdb - > last_error , TDB_LOG_ERROR ,
" tdb1_transaction_recover: "
" failed to sync2 recovery " ) ;
2011-09-14 01:52:13 +04:00
return - 1 ;
}
2011-09-14 02:02:13 +04:00
tdb_logerr ( tdb , TDB_SUCCESS , TDB_LOG_WARNING ,
" tdb1_transaction_recover: recovered %d byte database " ,
recovery_eof ) ;
2011-09-14 01:52:13 +04:00
/* all done */
return 0 ;
}
/* Any I/O failures we say "needs recovery". */
2011-09-14 02:37:13 +04:00
tdb_bool_err tdb1_needs_recovery ( struct tdb_context * tdb )
2011-09-14 01:52:13 +04:00
{
tdb1_off_t recovery_head ;
struct tdb1_record rec ;
/* find the recovery area */
if ( tdb1_ofs_read ( tdb , TDB1_RECOVERY_HEAD , & recovery_head ) = = - 1 ) {
2011-09-14 02:43:27 +04:00
return TDB_ERR_TO_OFF ( tdb - > last_error ) ;
2011-09-14 01:52:13 +04:00
}
if ( recovery_head = = 0 ) {
/* we have never allocated a recovery record */
return false ;
}
/* read the recovery record */
2011-09-14 02:13:13 +04:00
if ( tdb - > tdb1 . io - > tdb1_read ( tdb , recovery_head , & rec ,
2011-09-14 01:52:13 +04:00
sizeof ( rec ) , TDB1_DOCONV ( ) ) = = - 1 ) {
2011-09-14 02:43:27 +04:00
return TDB_ERR_TO_OFF ( tdb - > last_error ) ;
2011-09-14 01:52:13 +04:00
}
return ( rec . magic = = TDB1_RECOVERY_MAGIC ) ;
}