2012-06-18 22:30:26 +09:30
/*
Unix SMB / CIFS implementation .
trivial database library
Copyright ( C ) Andrew Tridgell 2005
Copyright ( C ) Rusty Russell 2010
* * NOTE ! The following LGPL license applies to the ntdb
* * library . This does NOT imply that all of Samba is released
* * under the LGPL
This library is free software ; you can redistribute it and / or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation ; either
version 3 of the License , or ( at your option ) any later version .
This library is distributed in the hope that it will be useful ,
but WITHOUT ANY WARRANTY ; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
Lesser General Public License for more details .
You should have received a copy of the GNU Lesser General Public
License along with this library ; if not , see < http : //www.gnu.org/licenses/>.
*/
# include "private.h"
2012-06-18 22:30:29 +09:30
# include <assert.h>
2012-06-19 12:42:08 +09:30
# define SAFE_FREE(ntdb, x) do { if ((x) != NULL) {ntdb->free_fn((void *)x, ntdb->alloc_data); (x)=NULL;} } while(0)
2012-06-18 22:30:26 +09:30
/*
transaction design :
- only allow a single transaction at a time per database . This makes
using the transaction API simpler , as otherwise the caller would
have to cope with temporary failures in transactions that conflict
with other current transactions
- keep the transaction recovery information in the same file as the
database , using a special ' transaction recovery ' record pointed at
by the header . This removes the need for extra journal files as
used by some other databases
- dynamically allocated the transaction recover record , re - using it
for subsequent transactions . If a larger record is needed then
ntdb_free ( ) the old record to place it on the normal ntdb freelist
before allocating the new record
- during transactions , keep a linked list of writes all that have
been performed by intercepting all ntdb_write ( ) calls . The hooked
transaction versions of ntdb_read ( ) and ntdb_write ( ) check this
linked list and try to use the elements of the list in preference
to the real database .
- don ' t allow any locks to be held when a transaction starts ,
otherwise we can end up with deadlock ( plus lack of lock nesting
in POSIX locks would mean the lock is lost )
- if the caller gains a lock during the transaction but doesn ' t
release it then fail the commit
- allow for nested calls to ntdb_transaction_start ( ) , re - using the
existing transaction record . If the inner transaction is canceled
then a subsequent commit will fail
- keep a mirrored copy of the ntdb hash chain heads to allow for the
fast hash heads scan on traverse , updating the mirrored copy in
the transaction version of ntdb_write
- allow callers to mix transaction and non - transaction use of ntdb ,
although once a transaction is started then an exclusive lock is
gained until the transaction is committed or canceled
- the commit stategy involves first saving away all modified data
into a linearised buffer in the transaction recovery area , then
marking the transaction recovery area with a magic value to
indicate a valid recovery record . In total 4 fsync / msync calls are
needed per commit to prevent race conditions . It might be possible
to reduce this to 3 or even 2 with some more work .
- check for a valid recovery record on open of the ntdb , while the
open lock is held . Automatically recover from the transaction
recovery area if needed , then continue with the open as
usual . This allows for smooth crash recovery with no administrator
intervention .
- if NTDB_NOSYNC is passed to flags in ntdb_open then transactions are
2012-06-19 12:39:33 +09:30
still available , but fsync / msync calls are made . This means we
still are safe against unexpected death during transaction commit ,
but not against machine reboots .
2012-06-18 22:30:26 +09:30
*/
/*
hold the context of any current transaction
*/
struct ntdb_transaction {
/* the original io methods - used to do IOs to the real db */
const struct ntdb_methods * io_methods ;
/* the list of transaction blocks. When a block is first
written to , it gets created in this list */
uint8_t * * blocks ;
size_t num_blocks ;
/* non-zero when an internal transaction error has
occurred . All write operations will then fail until the
transaction is ended */
int transaction_error ;
/* when inside a transaction we need to keep track of any
nested ntdb_transaction_start ( ) calls , as these are allowed ,
but don ' t create a new transaction */
unsigned int nesting ;
/* set when a prepare has already occurred */
bool prepared ;
ntdb_off_t magic_offset ;
/* old file size before transaction */
ntdb_len_t old_map_size ;
} ;
/*
read while in a transaction . We need to check first if the data is in our list
of transaction elements , then if not do a real read
*/
static enum NTDB_ERROR transaction_read ( struct ntdb_context * ntdb , ntdb_off_t off ,
void * buf , ntdb_len_t len )
{
size_t blk ;
enum NTDB_ERROR ecode ;
/* break it down into block sized ops */
2012-06-18 22:30:29 +09:30
while ( len + ( off % NTDB_PGSIZE ) > NTDB_PGSIZE ) {
ntdb_len_t len2 = NTDB_PGSIZE - ( off % NTDB_PGSIZE ) ;
2012-06-18 22:30:26 +09:30
ecode = transaction_read ( ntdb , off , buf , len2 ) ;
if ( ecode ! = NTDB_SUCCESS ) {
return ecode ;
}
len - = len2 ;
off + = len2 ;
buf = ( void * ) ( len2 + ( char * ) buf ) ;
}
if ( len = = 0 ) {
return NTDB_SUCCESS ;
}
2012-06-18 22:30:29 +09:30
blk = off / NTDB_PGSIZE ;
2012-06-18 22:30:26 +09:30
/* see if we have it in the block list */
if ( ntdb - > transaction - > num_blocks < = blk | |
ntdb - > transaction - > blocks [ blk ] = = NULL ) {
/* nope, do a real read */
ecode = ntdb - > transaction - > io_methods - > tread ( ntdb , off , buf , len ) ;
if ( ecode ! = NTDB_SUCCESS ) {
goto fail ;
}
return 0 ;
}
/* now copy it out of this block */
2012-06-18 22:30:29 +09:30
memcpy ( buf , ntdb - > transaction - > blocks [ blk ] + ( off % NTDB_PGSIZE ) , len ) ;
2012-06-18 22:30:26 +09:30
return NTDB_SUCCESS ;
fail :
ntdb - > transaction - > transaction_error = 1 ;
return ntdb_logerr ( ntdb , ecode , NTDB_LOG_ERROR ,
" transaction_read: failed at off=%zu len=%zu " ,
( size_t ) off , ( size_t ) len ) ;
}
/*
write while in a transaction
*/
static enum NTDB_ERROR transaction_write ( struct ntdb_context * ntdb , ntdb_off_t off ,
const void * buf , ntdb_len_t len )
{
size_t blk ;
enum NTDB_ERROR ecode ;
/* Only a commit is allowed on a prepared transaction */
if ( ntdb - > transaction - > prepared ) {
ecode = ntdb_logerr ( ntdb , NTDB_ERR_EINVAL , NTDB_LOG_ERROR ,
" transaction_write: transaction already "
" prepared, write not allowed " ) ;
goto fail ;
}
/* break it up into block sized chunks */
2012-06-18 22:30:29 +09:30
while ( len + ( off % NTDB_PGSIZE ) > NTDB_PGSIZE ) {
ntdb_len_t len2 = NTDB_PGSIZE - ( off % NTDB_PGSIZE ) ;
2012-06-18 22:30:26 +09:30
ecode = transaction_write ( ntdb , off , buf , len2 ) ;
if ( ecode ! = NTDB_SUCCESS ) {
return ecode ;
}
len - = len2 ;
off + = len2 ;
if ( buf ! = NULL ) {
buf = ( const void * ) ( len2 + ( const char * ) buf ) ;
}
}
if ( len = = 0 ) {
return NTDB_SUCCESS ;
}
2012-06-18 22:30:29 +09:30
blk = off / NTDB_PGSIZE ;
off = off % NTDB_PGSIZE ;
2012-06-18 22:30:26 +09:30
if ( ntdb - > transaction - > num_blocks < = blk ) {
uint8_t * * new_blocks ;
/* expand the blocks array */
if ( ntdb - > transaction - > blocks = = NULL ) {
2012-06-19 12:42:08 +09:30
new_blocks = ( uint8_t * * ) ntdb - > alloc_fn ( ntdb ,
( blk + 1 ) * sizeof ( uint8_t * ) , ntdb - > alloc_data ) ;
2012-06-18 22:30:26 +09:30
} else {
2012-06-19 12:42:08 +09:30
new_blocks = ( uint8_t * * ) ntdb - > expand_fn (
2012-06-18 22:30:26 +09:30
ntdb - > transaction - > blocks ,
2012-06-19 12:42:08 +09:30
( blk + 1 ) * sizeof ( uint8_t * ) , ntdb - > alloc_data ) ;
2012-06-18 22:30:26 +09:30
}
if ( new_blocks = = NULL ) {
ecode = ntdb_logerr ( ntdb , NTDB_ERR_OOM , NTDB_LOG_ERROR ,
" transaction_write: "
" failed to allocate " ) ;
goto fail ;
}
memset ( & new_blocks [ ntdb - > transaction - > num_blocks ] , 0 ,
( 1 + ( blk - ntdb - > transaction - > num_blocks ) ) * sizeof ( uint8_t * ) ) ;
ntdb - > transaction - > blocks = new_blocks ;
ntdb - > transaction - > num_blocks = blk + 1 ;
}
/* allocate and fill a block? */
if ( ntdb - > transaction - > blocks [ blk ] = = NULL ) {
2012-06-19 12:42:08 +09:30
ntdb - > transaction - > blocks [ blk ] = ( uint8_t * )
ntdb - > alloc_fn ( ntdb - > transaction - > blocks , NTDB_PGSIZE ,
ntdb - > alloc_data ) ;
2012-06-18 22:30:26 +09:30
if ( ntdb - > transaction - > blocks [ blk ] = = NULL ) {
ecode = ntdb_logerr ( ntdb , NTDB_ERR_OOM , NTDB_LOG_ERROR ,
" transaction_write: "
" failed to allocate " ) ;
goto fail ;
}
2012-06-19 12:42:08 +09:30
memset ( ntdb - > transaction - > blocks [ blk ] , 0 , NTDB_PGSIZE ) ;
2012-06-18 22:30:29 +09:30
if ( ntdb - > transaction - > old_map_size > blk * NTDB_PGSIZE ) {
ntdb_len_t len2 = NTDB_PGSIZE ;
if ( len2 + ( blk * NTDB_PGSIZE ) > ntdb - > transaction - > old_map_size ) {
len2 = ntdb - > transaction - > old_map_size - ( blk * NTDB_PGSIZE ) ;
2012-06-18 22:30:26 +09:30
}
ecode = ntdb - > transaction - > io_methods - > tread ( ntdb ,
2012-06-18 22:30:29 +09:30
blk * NTDB_PGSIZE ,
2012-06-18 22:30:26 +09:30
ntdb - > transaction - > blocks [ blk ] ,
len2 ) ;
if ( ecode ! = NTDB_SUCCESS ) {
ecode = ntdb_logerr ( ntdb , ecode ,
NTDB_LOG_ERROR ,
" transaction_write: "
" failed to "
" read old block: %s " ,
strerror ( errno ) ) ;
2012-06-19 12:42:08 +09:30
SAFE_FREE ( ntdb , ntdb - > transaction - > blocks [ blk ] ) ;
2012-06-18 22:30:26 +09:30
goto fail ;
}
}
}
/* overwrite part of an existing block */
if ( buf = = NULL ) {
memset ( ntdb - > transaction - > blocks [ blk ] + off , 0 , len ) ;
} else {
memcpy ( ntdb - > transaction - > blocks [ blk ] + off , buf , len ) ;
}
return NTDB_SUCCESS ;
fail :
ntdb - > transaction - > transaction_error = 1 ;
return ecode ;
}
/*
write while in a transaction - this variant never expands the transaction blocks , it only
updates existing blocks . This means it cannot change the recovery size
*/
static void transaction_write_existing ( struct ntdb_context * ntdb , ntdb_off_t off ,
const void * buf , ntdb_len_t len )
{
size_t blk ;
/* break it up into block sized chunks */
2012-06-18 22:30:29 +09:30
while ( len + ( off % NTDB_PGSIZE ) > NTDB_PGSIZE ) {
ntdb_len_t len2 = NTDB_PGSIZE - ( off % NTDB_PGSIZE ) ;
2012-06-18 22:30:26 +09:30
transaction_write_existing ( ntdb , off , buf , len2 ) ;
len - = len2 ;
off + = len2 ;
if ( buf ! = NULL ) {
buf = ( const void * ) ( len2 + ( const char * ) buf ) ;
}
}
if ( len = = 0 ) {
return ;
}
2012-06-18 22:30:29 +09:30
blk = off / NTDB_PGSIZE ;
off = off % NTDB_PGSIZE ;
2012-06-18 22:30:26 +09:30
if ( ntdb - > transaction - > num_blocks < = blk | |
ntdb - > transaction - > blocks [ blk ] = = NULL ) {
return ;
}
/* overwrite part of an existing block */
memcpy ( ntdb - > transaction - > blocks [ blk ] + off , buf , len ) ;
}
/*
out of bounds check during a transaction
*/
static enum NTDB_ERROR transaction_oob ( struct ntdb_context * ntdb ,
ntdb_off_t off , ntdb_len_t len , bool probe )
{
if ( ( off + len > = off & & off + len < = ntdb - > file - > map_size ) | | probe ) {
return NTDB_SUCCESS ;
}
ntdb_logerr ( ntdb , NTDB_ERR_IO , NTDB_LOG_ERROR ,
" ntdb_oob len %lld beyond transaction size %lld " ,
( long long ) ( off + len ) ,
( long long ) ntdb - > file - > map_size ) ;
return NTDB_ERR_IO ;
}
/*
transaction version of ntdb_expand ( ) .
*/
static enum NTDB_ERROR transaction_expand_file ( struct ntdb_context * ntdb ,
ntdb_off_t addition )
{
enum NTDB_ERROR ecode ;
2012-06-18 22:30:29 +09:30
assert ( ( ntdb - > file - > map_size + addition ) % NTDB_PGSIZE = = 0 ) ;
2012-06-18 22:30:26 +09:30
/* add a write to the transaction elements, so subsequent
reads see the zero data */
ecode = transaction_write ( ntdb , ntdb - > file - > map_size , NULL , addition ) ;
if ( ecode = = NTDB_SUCCESS ) {
ntdb - > file - > map_size + = addition ;
}
return ecode ;
}
static void * transaction_direct ( struct ntdb_context * ntdb , ntdb_off_t off ,
size_t len , bool write_mode )
{
2012-06-18 22:30:29 +09:30
size_t blk = off / NTDB_PGSIZE , end_blk ;
2012-06-18 22:30:26 +09:30
/* This is wrong for zero-length blocks, but will fail gracefully */
2012-06-18 22:30:29 +09:30
end_blk = ( off + len - 1 ) / NTDB_PGSIZE ;
2012-06-18 22:30:26 +09:30
/* Can only do direct if in single block and we've already copied. */
if ( write_mode ) {
ntdb - > stats . transaction_write_direct + + ;
if ( blk ! = end_blk
| | blk > = ntdb - > transaction - > num_blocks
| | ntdb - > transaction - > blocks [ blk ] = = NULL ) {
ntdb - > stats . transaction_write_direct_fail + + ;
return NULL ;
}
2012-06-18 22:30:29 +09:30
return ntdb - > transaction - > blocks [ blk ] + off % NTDB_PGSIZE ;
2012-06-18 22:30:26 +09:30
}
ntdb - > stats . transaction_read_direct + + ;
/* Single which we have copied? */
if ( blk = = end_blk
& & blk < ntdb - > transaction - > num_blocks
& & ntdb - > transaction - > blocks [ blk ] )
2012-06-18 22:30:29 +09:30
return ntdb - > transaction - > blocks [ blk ] + off % NTDB_PGSIZE ;
2012-06-18 22:30:26 +09:30
/* Otherwise must be all not copied. */
while ( blk < = end_blk ) {
if ( blk > = ntdb - > transaction - > num_blocks )
break ;
if ( ntdb - > transaction - > blocks [ blk ] ) {
ntdb - > stats . transaction_read_direct_fail + + ;
return NULL ;
}
blk + + ;
}
return ntdb - > transaction - > io_methods - > direct ( ntdb , off , len , false ) ;
}
2012-06-19 12:42:13 +09:30
static ntdb_off_t transaction_read_off ( struct ntdb_context * ntdb ,
ntdb_off_t off )
{
ntdb_off_t ret ;
enum NTDB_ERROR ecode ;
ecode = transaction_read ( ntdb , off , & ret , sizeof ( ret ) ) ;
ntdb_convert ( ntdb , & ret , sizeof ( ret ) ) ;
if ( ecode ! = NTDB_SUCCESS ) {
return NTDB_ERR_TO_OFF ( ecode ) ;
}
return ret ;
}
static enum NTDB_ERROR transaction_write_off ( struct ntdb_context * ntdb ,
ntdb_off_t off , ntdb_off_t val )
{
ntdb_convert ( ntdb , & val , sizeof ( val ) ) ;
return transaction_write ( ntdb , off , & val , sizeof ( val ) ) ;
}
2012-06-18 22:30:26 +09:30
static const struct ntdb_methods transaction_methods = {
transaction_read ,
transaction_write ,
transaction_oob ,
transaction_expand_file ,
transaction_direct ,
2012-06-19 12:42:13 +09:30
transaction_read_off ,
transaction_write_off ,
2012-06-18 22:30:26 +09:30
} ;
/*
sync to disk
*/
static enum NTDB_ERROR transaction_sync ( struct ntdb_context * ntdb ,
ntdb_off_t offset , ntdb_len_t length )
{
if ( ntdb - > flags & NTDB_NOSYNC ) {
return NTDB_SUCCESS ;
}
if ( fsync ( ntdb - > file - > fd ) ! = 0 ) {
return ntdb_logerr ( ntdb , NTDB_ERR_IO , NTDB_LOG_ERROR ,
" ntdb_transaction: fsync failed: %s " ,
strerror ( errno ) ) ;
}
# ifdef MS_SYNC
if ( ntdb - > file - > map_ptr ) {
ntdb_off_t moffset = offset & ~ ( getpagesize ( ) - 1 ) ;
if ( msync ( moffset + ( char * ) ntdb - > file - > map_ptr ,
length + ( offset - moffset ) , MS_SYNC ) ! = 0 ) {
return ntdb_logerr ( ntdb , NTDB_ERR_IO , NTDB_LOG_ERROR ,
" ntdb_transaction: msync failed: %s " ,
strerror ( errno ) ) ;
}
}
# endif
return NTDB_SUCCESS ;
}
2013-02-17 20:56:34 +10:30
static void free_transaction_blocks ( struct ntdb_context * ntdb )
{
int i ;
/* free all the transaction blocks */
for ( i = 0 ; i < ntdb - > transaction - > num_blocks ; i + + ) {
if ( ntdb - > transaction - > blocks [ i ] ! = NULL ) {
ntdb - > free_fn ( ntdb - > transaction - > blocks [ i ] ,
ntdb - > alloc_data ) ;
}
}
SAFE_FREE ( ntdb , ntdb - > transaction - > blocks ) ;
ntdb - > transaction - > num_blocks = 0 ;
}
2012-06-18 22:30:26 +09:30
static void _ntdb_transaction_cancel ( struct ntdb_context * ntdb )
{
enum NTDB_ERROR ecode ;
if ( ntdb - > transaction = = NULL ) {
ntdb_logerr ( ntdb , NTDB_ERR_EINVAL , NTDB_LOG_USE_ERROR ,
" ntdb_transaction_cancel: no transaction " ) ;
return ;
}
if ( ntdb - > transaction - > nesting ! = 0 ) {
ntdb - > transaction - > transaction_error = 1 ;
ntdb - > transaction - > nesting - - ;
return ;
}
ntdb - > file - > map_size = ntdb - > transaction - > old_map_size ;
2013-02-17 20:56:34 +10:30
free_transaction_blocks ( ntdb ) ;
2012-06-18 22:30:26 +09:30
if ( ntdb - > transaction - > magic_offset ) {
const struct ntdb_methods * methods = ntdb - > transaction - > io_methods ;
uint64_t invalid = NTDB_RECOVERY_INVALID_MAGIC ;
/* remove the recovery marker */
ecode = methods - > twrite ( ntdb , ntdb - > transaction - > magic_offset ,
& invalid , sizeof ( invalid ) ) ;
if ( ecode = = NTDB_SUCCESS )
ecode = transaction_sync ( ntdb ,
ntdb - > transaction - > magic_offset ,
sizeof ( invalid ) ) ;
if ( ecode ! = NTDB_SUCCESS ) {
ntdb_logerr ( ntdb , ecode , NTDB_LOG_ERROR ,
" ntdb_transaction_cancel: failed to remove "
" recovery magic " ) ;
}
}
if ( ntdb - > file - > allrecord_lock . count )
ntdb_allrecord_unlock ( ntdb , ntdb - > file - > allrecord_lock . ltype ) ;
/* restore the normal io methods */
ntdb - > io = ntdb - > transaction - > io_methods ;
ntdb_transaction_unlock ( ntdb , F_WRLCK ) ;
if ( ntdb_has_open_lock ( ntdb ) )
ntdb_unlock_open ( ntdb , F_WRLCK ) ;
2012-06-19 12:42:08 +09:30
SAFE_FREE ( ntdb , ntdb - > transaction ) ;
2012-06-18 22:30:26 +09:30
}
/*
start a ntdb transaction . No token is returned , as only a single
transaction is allowed to be pending per ntdb_context
*/
_PUBLIC_ enum NTDB_ERROR ntdb_transaction_start ( struct ntdb_context * ntdb )
{
enum NTDB_ERROR ecode ;
ntdb - > stats . transactions + + ;
/* some sanity checks */
if ( ntdb - > flags & NTDB_INTERNAL ) {
2012-06-18 22:30:27 +09:30
return ntdb_logerr ( ntdb , NTDB_ERR_EINVAL , NTDB_LOG_USE_ERROR ,
" ntdb_transaction_start: "
" cannot start a transaction on an "
" internal ntdb " ) ;
2012-06-18 22:30:26 +09:30
}
if ( ntdb - > flags & NTDB_RDONLY ) {
2012-06-18 22:30:27 +09:30
return ntdb_logerr ( ntdb , NTDB_ERR_RDONLY , NTDB_LOG_USE_ERROR ,
" ntdb_transaction_start: "
" cannot start a transaction on a "
" read-only ntdb " ) ;
2012-06-18 22:30:26 +09:30
}
/* cope with nested ntdb_transaction_start() calls */
if ( ntdb - > transaction ! = NULL ) {
if ( ! ( ntdb - > flags & NTDB_ALLOW_NESTING ) ) {
2012-06-18 22:30:27 +09:30
return ntdb_logerr ( ntdb , NTDB_ERR_IO ,
NTDB_LOG_USE_ERROR ,
" ntdb_transaction_start: "
" already inside transaction " ) ;
2012-06-18 22:30:26 +09:30
}
ntdb - > transaction - > nesting + + ;
ntdb - > stats . transaction_nest + + ;
return 0 ;
}
if ( ntdb_has_hash_locks ( ntdb ) ) {
/* the caller must not have any locks when starting a
transaction as otherwise we ' ll be screwed by lack
of nested locks in POSIX */
2012-06-18 22:30:27 +09:30
return ntdb_logerr ( ntdb , NTDB_ERR_LOCK ,
NTDB_LOG_USE_ERROR ,
" ntdb_transaction_start: "
" cannot start a transaction with locks "
" held " ) ;
2012-06-18 22:30:26 +09:30
}
ntdb - > transaction = ( struct ntdb_transaction * )
2012-06-19 12:42:08 +09:30
ntdb - > alloc_fn ( ntdb , sizeof ( struct ntdb_transaction ) ,
ntdb - > alloc_data ) ;
2012-06-18 22:30:26 +09:30
if ( ntdb - > transaction = = NULL ) {
2012-06-18 22:30:27 +09:30
return ntdb_logerr ( ntdb , NTDB_ERR_OOM , NTDB_LOG_ERROR ,
" ntdb_transaction_start: "
" cannot allocate " ) ;
2012-06-18 22:30:26 +09:30
}
2012-06-19 12:42:08 +09:30
memset ( ntdb - > transaction , 0 , sizeof ( * ntdb - > transaction ) ) ;
2012-06-18 22:30:26 +09:30
/* get the transaction write lock. This is a blocking lock. As
discussed with Volker , there are a number of ways we could
make this async , which we will probably do in the future */
ecode = ntdb_transaction_lock ( ntdb , F_WRLCK ) ;
if ( ecode ! = NTDB_SUCCESS ) {
2012-06-19 12:42:08 +09:30
SAFE_FREE ( ntdb , ntdb - > transaction - > blocks ) ;
SAFE_FREE ( ntdb , ntdb - > transaction ) ;
2012-06-18 22:30:27 +09:30
return ecode ;
2012-06-18 22:30:26 +09:30
}
/* get a read lock over entire file. This is upgraded to a write
lock during the commit */
ecode = ntdb_allrecord_lock ( ntdb , F_RDLCK , NTDB_LOCK_WAIT , true ) ;
if ( ecode ! = NTDB_SUCCESS ) {
goto fail_allrecord_lock ;
}
/* make sure we know about any file expansions already done by
anyone else */
2012-06-19 12:42:12 +09:30
ntdb_oob ( ntdb , ntdb - > file - > map_size , 1 , true ) ;
2012-06-18 22:30:26 +09:30
ntdb - > transaction - > old_map_size = ntdb - > file - > map_size ;
/* finally hook the io methods, replacing them with
transaction specific methods */
ntdb - > transaction - > io_methods = ntdb - > io ;
ntdb - > io = & transaction_methods ;
2012-06-18 22:30:27 +09:30
return NTDB_SUCCESS ;
2012-06-18 22:30:26 +09:30
fail_allrecord_lock :
ntdb_transaction_unlock ( ntdb , F_WRLCK ) ;
2012-06-19 12:42:08 +09:30
SAFE_FREE ( ntdb , ntdb - > transaction - > blocks ) ;
SAFE_FREE ( ntdb , ntdb - > transaction ) ;
2012-06-18 22:30:27 +09:30
return ecode ;
2012-06-18 22:30:26 +09:30
}
/*
cancel the current transaction
*/
_PUBLIC_ void ntdb_transaction_cancel ( struct ntdb_context * ntdb )
{
ntdb - > stats . transaction_cancel + + ;
_ntdb_transaction_cancel ( ntdb ) ;
}
/*
work out how much space the linearised recovery data will consume ( worst case )
*/
static ntdb_len_t ntdb_recovery_size ( struct ntdb_context * ntdb )
{
ntdb_len_t recovery_size = 0 ;
int i ;
recovery_size = 0 ;
for ( i = 0 ; i < ntdb - > transaction - > num_blocks ; i + + ) {
2012-06-18 22:30:29 +09:30
if ( i * NTDB_PGSIZE > = ntdb - > transaction - > old_map_size ) {
2012-06-18 22:30:26 +09:30
break ;
}
if ( ntdb - > transaction - > blocks [ i ] = = NULL ) {
continue ;
}
2012-06-18 22:30:30 +09:30
recovery_size + = 2 * sizeof ( ntdb_off_t ) + NTDB_PGSIZE ;
2012-06-18 22:30:26 +09:30
}
return recovery_size ;
}
static enum NTDB_ERROR ntdb_recovery_area ( struct ntdb_context * ntdb ,
const struct ntdb_methods * methods ,
ntdb_off_t * recovery_offset ,
struct ntdb_recovery_record * rec )
{
enum NTDB_ERROR ecode ;
* recovery_offset = ntdb_read_off ( ntdb ,
offsetof ( struct ntdb_header , recovery ) ) ;
if ( NTDB_OFF_IS_ERR ( * recovery_offset ) ) {
return NTDB_OFF_TO_ERR ( * recovery_offset ) ;
}
if ( * recovery_offset = = 0 ) {
rec - > max_len = 0 ;
return NTDB_SUCCESS ;
}
ecode = methods - > tread ( ntdb , * recovery_offset , rec , sizeof ( * rec ) ) ;
if ( ecode ! = NTDB_SUCCESS )
return ecode ;
ntdb_convert ( ntdb , rec , sizeof ( * rec ) ) ;
/* ignore invalid recovery regions: can happen in crash */
if ( rec - > magic ! = NTDB_RECOVERY_MAGIC & &
rec - > magic ! = NTDB_RECOVERY_INVALID_MAGIC ) {
* recovery_offset = 0 ;
rec - > max_len = 0 ;
}
return NTDB_SUCCESS ;
}
static unsigned int same ( const unsigned char * new ,
const unsigned char * old ,
unsigned int length )
{
unsigned int i ;
for ( i = 0 ; i < length ; i + + ) {
if ( new [ i ] ! = old [ i ] )
break ;
}
return i ;
}
static unsigned int different ( const unsigned char * new ,
const unsigned char * old ,
unsigned int length ,
unsigned int min_same ,
unsigned int * samelen )
{
unsigned int i ;
* samelen = 0 ;
for ( i = 0 ; i < length ; i + + ) {
if ( new [ i ] = = old [ i ] ) {
( * samelen ) + + ;
} else {
if ( * samelen > = min_same ) {
return i - * samelen ;
}
* samelen = 0 ;
}
}
if ( * samelen < min_same )
* samelen = 0 ;
return length - * samelen ;
}
/* Allocates recovery blob, without ntdb_recovery_record at head set up. */
static struct ntdb_recovery_record * alloc_recovery ( struct ntdb_context * ntdb ,
ntdb_len_t * len )
{
struct ntdb_recovery_record * rec ;
size_t i ;
enum NTDB_ERROR ecode ;
unsigned char * p ;
const struct ntdb_methods * old_methods = ntdb - > io ;
2012-06-19 12:42:08 +09:30
rec = ntdb - > alloc_fn ( ntdb , sizeof ( * rec ) + ntdb_recovery_size ( ntdb ) ,
ntdb - > alloc_data ) ;
2012-06-18 22:30:26 +09:30
if ( ! rec ) {
ntdb_logerr ( ntdb , NTDB_ERR_OOM , NTDB_LOG_ERROR ,
" transaction_setup_recovery: "
" cannot allocate " ) ;
return NTDB_ERR_PTR ( NTDB_ERR_OOM ) ;
}
/* We temporarily revert to the old I/O methods, so we can use
* ntdb_access_read */
ntdb - > io = ntdb - > transaction - > io_methods ;
/* build the recovery data into a single blob to allow us to do a single
large write , which should be more efficient */
p = ( unsigned char * ) ( rec + 1 ) ;
for ( i = 0 ; i < ntdb - > transaction - > num_blocks ; i + + ) {
ntdb_off_t offset ;
ntdb_len_t length ;
unsigned int off ;
const unsigned char * buffer ;
if ( ntdb - > transaction - > blocks [ i ] = = NULL ) {
continue ;
}
2012-06-18 22:30:29 +09:30
offset = i * NTDB_PGSIZE ;
length = NTDB_PGSIZE ;
2012-06-18 22:30:26 +09:30
if ( offset > = ntdb - > transaction - > old_map_size ) {
continue ;
}
if ( offset + length > ntdb - > file - > map_size ) {
ecode = ntdb_logerr ( ntdb , NTDB_ERR_CORRUPT , NTDB_LOG_ERROR ,
" ntdb_transaction_setup_recovery: "
" transaction data over new region "
" boundary " ) ;
goto fail ;
}
buffer = ntdb_access_read ( ntdb , offset , length , false ) ;
if ( NTDB_PTR_IS_ERR ( buffer ) ) {
ecode = NTDB_PTR_ERR ( buffer ) ;
goto fail ;
}
/* Skip over anything the same at the start. */
off = same ( ntdb - > transaction - > blocks [ i ] , buffer , length ) ;
offset + = off ;
while ( off < length ) {
ntdb_len_t len1 ;
unsigned int samelen ;
len1 = different ( ntdb - > transaction - > blocks [ i ] + off ,
buffer + off , length - off ,
sizeof ( offset ) + sizeof ( len1 ) + 1 ,
& samelen ) ;
memcpy ( p , & offset , sizeof ( offset ) ) ;
memcpy ( p + sizeof ( offset ) , & len1 , sizeof ( len1 ) ) ;
ntdb_convert ( ntdb , p , sizeof ( offset ) + sizeof ( len1 ) ) ;
p + = sizeof ( offset ) + sizeof ( len1 ) ;
memcpy ( p , buffer + off , len1 ) ;
p + = len1 ;
off + = len1 + samelen ;
offset + = len1 + samelen ;
}
ntdb_access_release ( ntdb , buffer ) ;
}
* len = p - ( unsigned char * ) ( rec + 1 ) ;
ntdb - > io = old_methods ;
return rec ;
fail :
2012-06-19 12:42:08 +09:30
ntdb - > free_fn ( rec , ntdb - > alloc_data ) ;
2012-06-18 22:30:26 +09:30
ntdb - > io = old_methods ;
return NTDB_ERR_PTR ( ecode ) ;
}
static ntdb_off_t create_recovery_area ( struct ntdb_context * ntdb ,
ntdb_len_t rec_length ,
struct ntdb_recovery_record * rec )
{
ntdb_off_t off , recovery_off ;
ntdb_len_t addition ;
enum NTDB_ERROR ecode ;
const struct ntdb_methods * methods = ntdb - > transaction - > io_methods ;
/* round up to a multiple of page size. Overallocate, since each
* such allocation forces us to expand the file . */
rec - > max_len = ntdb_expand_adjust ( ntdb - > file - > map_size , rec_length ) ;
/* Round up to a page. */
2012-06-18 22:30:29 +09:30
rec - > max_len = ( ( sizeof ( * rec ) + rec - > max_len + NTDB_PGSIZE - 1 )
& ~ ( NTDB_PGSIZE - 1 ) )
2012-06-18 22:30:26 +09:30
- sizeof ( * rec ) ;
off = ntdb - > file - > map_size ;
/* Restore ->map_size before calling underlying expand_file.
Also so that we don ' t try to expand the file again in the
transaction commit , which would destroy the recovery
area */
addition = ( ntdb - > file - > map_size - ntdb - > transaction - > old_map_size ) +
sizeof ( * rec ) + rec - > max_len ;
ntdb - > file - > map_size = ntdb - > transaction - > old_map_size ;
ntdb - > stats . transaction_expand_file + + ;
ecode = methods - > expand_file ( ntdb , addition ) ;
if ( ecode ! = NTDB_SUCCESS ) {
ntdb_logerr ( ntdb , ecode , NTDB_LOG_ERROR ,
" ntdb_recovery_allocate: "
" failed to create recovery area " ) ;
return NTDB_ERR_TO_OFF ( ecode ) ;
}
/* we have to reset the old map size so that we don't try to
expand the file again in the transaction commit , which
would destroy the recovery area */
ntdb - > transaction - > old_map_size = ntdb - > file - > map_size ;
/* write the recovery header offset and sync - we can sync without a race here
as the magic ptr in the recovery record has not been set */
recovery_off = off ;
ntdb_convert ( ntdb , & recovery_off , sizeof ( recovery_off ) ) ;
ecode = methods - > twrite ( ntdb , offsetof ( struct ntdb_header , recovery ) ,
& recovery_off , sizeof ( ntdb_off_t ) ) ;
if ( ecode ! = NTDB_SUCCESS ) {
ntdb_logerr ( ntdb , ecode , NTDB_LOG_ERROR ,
" ntdb_recovery_allocate: "
" failed to write recovery head " ) ;
return NTDB_ERR_TO_OFF ( ecode ) ;
}
transaction_write_existing ( ntdb , offsetof ( struct ntdb_header , recovery ) ,
& recovery_off ,
sizeof ( ntdb_off_t ) ) ;
return off ;
}
/*
setup the recovery data that will be used on a crash during commit
*/
static enum NTDB_ERROR transaction_setup_recovery ( struct ntdb_context * ntdb )
{
ntdb_len_t recovery_size = 0 ;
ntdb_off_t recovery_off = 0 ;
ntdb_off_t old_map_size = ntdb - > transaction - > old_map_size ;
struct ntdb_recovery_record * recovery ;
const struct ntdb_methods * methods = ntdb - > transaction - > io_methods ;
uint64_t magic ;
enum NTDB_ERROR ecode ;
recovery = alloc_recovery ( ntdb , & recovery_size ) ;
if ( NTDB_PTR_IS_ERR ( recovery ) )
return NTDB_PTR_ERR ( recovery ) ;
2013-02-17 20:56:34 +10:30
/* If we didn't actually change anything we overwrote? */
if ( recovery_size = = 0 ) {
/* In theory, we could have just appended data. */
if ( ntdb - > transaction - > num_blocks * NTDB_PGSIZE
< ntdb - > transaction - > old_map_size ) {
free_transaction_blocks ( ntdb ) ;
}
ntdb - > free_fn ( recovery , ntdb - > alloc_data ) ;
return NTDB_SUCCESS ;
}
2012-06-18 22:30:26 +09:30
ecode = ntdb_recovery_area ( ntdb , methods , & recovery_off , recovery ) ;
if ( ecode ) {
2012-06-19 12:42:08 +09:30
ntdb - > free_fn ( recovery , ntdb - > alloc_data ) ;
2012-06-18 22:30:26 +09:30
return ecode ;
}
if ( recovery - > max_len < recovery_size ) {
/* Not large enough. Free up old recovery area. */
if ( recovery_off ) {
ntdb - > stats . frees + + ;
ecode = add_free_record ( ntdb , recovery_off ,
sizeof ( * recovery )
+ recovery - > max_len ,
NTDB_LOCK_WAIT , true ) ;
2012-06-19 12:42:08 +09:30
ntdb - > free_fn ( recovery , ntdb - > alloc_data ) ;
2012-06-18 22:30:26 +09:30
if ( ecode ! = NTDB_SUCCESS ) {
return ntdb_logerr ( ntdb , ecode , NTDB_LOG_ERROR ,
" ntdb_recovery_allocate: "
" failed to free previous "
" recovery area " ) ;
}
/* Refresh recovery after add_free_record above. */
recovery = alloc_recovery ( ntdb , & recovery_size ) ;
if ( NTDB_PTR_IS_ERR ( recovery ) )
return NTDB_PTR_ERR ( recovery ) ;
}
recovery_off = create_recovery_area ( ntdb , recovery_size ,
recovery ) ;
if ( NTDB_OFF_IS_ERR ( recovery_off ) ) {
2012-06-19 12:42:08 +09:30
ntdb - > free_fn ( recovery , ntdb - > alloc_data ) ;
2012-06-18 22:30:26 +09:30
return NTDB_OFF_TO_ERR ( recovery_off ) ;
}
}
/* Now we know size, convert rec header. */
recovery - > magic = NTDB_RECOVERY_INVALID_MAGIC ;
recovery - > len = recovery_size ;
recovery - > eof = old_map_size ;
ntdb_convert ( ntdb , recovery , sizeof ( * recovery ) ) ;
/* write the recovery data to the recovery area */
2012-06-18 22:30:29 +09:30
ecode = methods - > twrite ( ntdb , recovery_off , recovery ,
sizeof ( * recovery ) + recovery_size ) ;
2012-06-18 22:30:26 +09:30
if ( ecode ! = NTDB_SUCCESS ) {
2012-06-19 12:42:08 +09:30
ntdb - > free_fn ( recovery , ntdb - > alloc_data ) ;
2012-06-18 22:30:26 +09:30
return ntdb_logerr ( ntdb , ecode , NTDB_LOG_ERROR ,
" ntdb_transaction_setup_recovery: "
" failed to write recovery data " ) ;
}
transaction_write_existing ( ntdb , recovery_off , recovery , recovery_size ) ;
2012-06-19 12:42:08 +09:30
ntdb - > free_fn ( recovery , ntdb - > alloc_data ) ;
2012-06-18 22:30:26 +09:30
/* as we don't have ordered writes, we have to sync the recovery
data before we update the magic to indicate that the recovery
data is present */
ecode = transaction_sync ( ntdb , recovery_off , recovery_size ) ;
if ( ecode ! = NTDB_SUCCESS )
return ecode ;
magic = NTDB_RECOVERY_MAGIC ;
ntdb_convert ( ntdb , & magic , sizeof ( magic ) ) ;
ntdb - > transaction - > magic_offset
= recovery_off + offsetof ( struct ntdb_recovery_record , magic ) ;
ecode = methods - > twrite ( ntdb , ntdb - > transaction - > magic_offset ,
& magic , sizeof ( magic ) ) ;
if ( ecode ! = NTDB_SUCCESS ) {
return ntdb_logerr ( ntdb , ecode , NTDB_LOG_ERROR ,
" ntdb_transaction_setup_recovery: "
" failed to write recovery magic " ) ;
}
transaction_write_existing ( ntdb , ntdb - > transaction - > magic_offset ,
& magic , sizeof ( magic ) ) ;
/* ensure the recovery magic marker is on disk */
return transaction_sync ( ntdb , ntdb - > transaction - > magic_offset ,
sizeof ( magic ) ) ;
}
static enum NTDB_ERROR _ntdb_transaction_prepare_commit ( struct ntdb_context * ntdb )
{
const struct ntdb_methods * methods ;
enum NTDB_ERROR ecode ;
if ( ntdb - > transaction = = NULL ) {
return ntdb_logerr ( ntdb , NTDB_ERR_EINVAL , NTDB_LOG_USE_ERROR ,
" ntdb_transaction_prepare_commit: "
" no transaction " ) ;
}
if ( ntdb - > transaction - > prepared ) {
_ntdb_transaction_cancel ( ntdb ) ;
return ntdb_logerr ( ntdb , NTDB_ERR_EINVAL , NTDB_LOG_USE_ERROR ,
" ntdb_transaction_prepare_commit: "
" transaction already prepared " ) ;
}
if ( ntdb - > transaction - > transaction_error ) {
_ntdb_transaction_cancel ( ntdb ) ;
return ntdb_logerr ( ntdb , NTDB_ERR_EINVAL , NTDB_LOG_ERROR ,
" ntdb_transaction_prepare_commit: "
" transaction error pending " ) ;
}
if ( ntdb - > transaction - > nesting ! = 0 ) {
return NTDB_SUCCESS ;
}
/* check for a null transaction */
if ( ntdb - > transaction - > blocks = = NULL ) {
return NTDB_SUCCESS ;
}
methods = ntdb - > transaction - > io_methods ;
/* upgrade the main transaction lock region to a write lock */
ecode = ntdb_allrecord_upgrade ( ntdb , NTDB_HASH_LOCK_START ) ;
if ( ecode ! = NTDB_SUCCESS ) {
return ecode ;
}
/* get the open lock - this prevents new users attaching to the database
during the commit */
ecode = ntdb_lock_open ( ntdb , F_WRLCK , NTDB_LOCK_WAIT | NTDB_LOCK_NOCHECK ) ;
if ( ecode ! = NTDB_SUCCESS ) {
return ecode ;
}
2012-06-19 12:39:33 +09:30
/* Sets up ntdb->transaction->recovery and
* ntdb - > transaction - > magic_offset . */
ecode = transaction_setup_recovery ( ntdb ) ;
if ( ecode ! = NTDB_SUCCESS ) {
return ecode ;
2012-06-18 22:30:26 +09:30
}
ntdb - > transaction - > prepared = true ;
/* expand the file to the new size if needed */
if ( ntdb - > file - > map_size ! = ntdb - > transaction - > old_map_size ) {
ntdb_len_t add ;
add = ntdb - > file - > map_size - ntdb - > transaction - > old_map_size ;
/* Restore original map size for ntdb_expand_file */
ntdb - > file - > map_size = ntdb - > transaction - > old_map_size ;
ecode = methods - > expand_file ( ntdb , add ) ;
if ( ecode ! = NTDB_SUCCESS ) {
return ecode ;
}
}
/* Keep the open lock until the actual commit */
return NTDB_SUCCESS ;
}
/*
prepare to commit the current transaction
*/
_PUBLIC_ enum NTDB_ERROR ntdb_transaction_prepare_commit ( struct ntdb_context * ntdb )
{
2012-06-18 22:30:27 +09:30
return _ntdb_transaction_prepare_commit ( ntdb ) ;
2012-06-18 22:30:26 +09:30
}
/*
commit the current transaction
*/
_PUBLIC_ enum NTDB_ERROR ntdb_transaction_commit ( struct ntdb_context * ntdb )
{
const struct ntdb_methods * methods ;
int i ;
enum NTDB_ERROR ecode ;
if ( ntdb - > transaction = = NULL ) {
2012-06-18 22:30:27 +09:30
return ntdb_logerr ( ntdb , NTDB_ERR_EINVAL , NTDB_LOG_USE_ERROR ,
" ntdb_transaction_commit: "
" no transaction " ) ;
2012-06-18 22:30:26 +09:30
}
ntdb_trace ( ntdb , " ntdb_transaction_commit " ) ;
if ( ntdb - > transaction - > nesting ! = 0 ) {
ntdb - > transaction - > nesting - - ;
2012-06-18 22:30:27 +09:30
return NTDB_SUCCESS ;
2012-06-18 22:30:26 +09:30
}
if ( ! ntdb - > transaction - > prepared ) {
ecode = _ntdb_transaction_prepare_commit ( ntdb ) ;
if ( ecode ! = NTDB_SUCCESS ) {
_ntdb_transaction_cancel ( ntdb ) ;
2012-06-18 22:30:27 +09:30
return ecode ;
2012-06-18 22:30:26 +09:30
}
}
2013-02-17 20:56:34 +10:30
/* check for a null transaction (prepare_commit may do this!) */
if ( ntdb - > transaction - > blocks = = NULL ) {
_ntdb_transaction_cancel ( ntdb ) ;
return NTDB_SUCCESS ;
}
2012-06-18 22:30:26 +09:30
methods = ntdb - > transaction - > io_methods ;
/* perform all the writes */
for ( i = 0 ; i < ntdb - > transaction - > num_blocks ; i + + ) {
ntdb_off_t offset ;
ntdb_len_t length ;
if ( ntdb - > transaction - > blocks [ i ] = = NULL ) {
continue ;
}
2012-06-18 22:30:29 +09:30
offset = i * NTDB_PGSIZE ;
length = NTDB_PGSIZE ;
2012-06-18 22:30:26 +09:30
ecode = methods - > twrite ( ntdb , offset ,
ntdb - > transaction - > blocks [ i ] , length ) ;
if ( ecode ! = NTDB_SUCCESS ) {
/* we've overwritten part of the data and
possibly expanded the file , so we need to
run the crash recovery code */
ntdb - > io = methods ;
ntdb_transaction_recover ( ntdb ) ;
_ntdb_transaction_cancel ( ntdb ) ;
2012-06-18 22:30:27 +09:30
return ecode ;
2012-06-18 22:30:26 +09:30
}
2012-06-19 12:42:08 +09:30
SAFE_FREE ( ntdb , ntdb - > transaction - > blocks [ i ] ) ;
2012-06-18 22:30:26 +09:30
}
2012-06-19 12:42:08 +09:30
SAFE_FREE ( ntdb , ntdb - > transaction - > blocks ) ;
2012-06-18 22:30:26 +09:30
ntdb - > transaction - > num_blocks = 0 ;
/* ensure the new data is on disk */
ecode = transaction_sync ( ntdb , 0 , ntdb - > file - > map_size ) ;
if ( ecode ! = NTDB_SUCCESS ) {
2012-06-18 22:30:27 +09:30
return ecode ;
2012-06-18 22:30:26 +09:30
}
/*
TODO : maybe write to some dummy hdr field , or write to magic
offset without mmap , before the last sync , instead of the
utime ( ) call
*/
/* on some systems (like Linux 2.6.x) changes via mmap/msync
don ' t change the mtime of the file , this means the file may
not be backed up ( as ntdb rounding to block sizes means that
file size changes are quite rare too ) . The following forces
mtime changes when a transaction completes */
# if HAVE_UTIME
utime ( ntdb - > name , NULL ) ;
# endif
/* use a transaction cancel to free memory and remove the
transaction locks : it " restores " map_size , too . */
ntdb - > transaction - > old_map_size = ntdb - > file - > map_size ;
_ntdb_transaction_cancel ( ntdb ) ;
2012-06-18 22:30:27 +09:30
return NTDB_SUCCESS ;
2012-06-18 22:30:26 +09:30
}
/*
recover from an aborted transaction . Must be called with exclusive
database write access already established ( including the open
lock to prevent new processes attaching )
*/
enum NTDB_ERROR ntdb_transaction_recover ( struct ntdb_context * ntdb )
{
ntdb_off_t recovery_head , recovery_eof ;
unsigned char * data , * p ;
struct ntdb_recovery_record rec ;
enum NTDB_ERROR ecode ;
/* find the recovery area */
recovery_head = ntdb_read_off ( ntdb , offsetof ( struct ntdb_header , recovery ) ) ;
if ( NTDB_OFF_IS_ERR ( recovery_head ) ) {
ecode = NTDB_OFF_TO_ERR ( recovery_head ) ;
return ntdb_logerr ( ntdb , ecode , NTDB_LOG_ERROR ,
" ntdb_transaction_recover: "
" failed to read recovery head " ) ;
}
if ( recovery_head = = 0 ) {
/* we have never allocated a recovery record */
return NTDB_SUCCESS ;
}
/* read the recovery record */
ecode = ntdb_read_convert ( ntdb , recovery_head , & rec , sizeof ( rec ) ) ;
if ( ecode ! = NTDB_SUCCESS ) {
return ntdb_logerr ( ntdb , ecode , NTDB_LOG_ERROR ,
" ntdb_transaction_recover: "
" failed to read recovery record " ) ;
}
if ( rec . magic ! = NTDB_RECOVERY_MAGIC ) {
/* there is no valid recovery data */
return NTDB_SUCCESS ;
}
if ( ntdb - > flags & NTDB_RDONLY ) {
return ntdb_logerr ( ntdb , NTDB_ERR_CORRUPT , NTDB_LOG_ERROR ,
" ntdb_transaction_recover: "
" attempt to recover read only database " ) ;
}
recovery_eof = rec . eof ;
2012-06-19 12:42:08 +09:30
data = ( unsigned char * ) ntdb - > alloc_fn ( ntdb , rec . len , ntdb - > alloc_data ) ;
2012-06-18 22:30:26 +09:30
if ( data = = NULL ) {
return ntdb_logerr ( ntdb , NTDB_ERR_OOM , NTDB_LOG_ERROR ,
" ntdb_transaction_recover: "
" failed to allocate recovery data " ) ;
}
/* read the full recovery data */
ecode = ntdb - > io - > tread ( ntdb , recovery_head + sizeof ( rec ) , data ,
rec . len ) ;
if ( ecode ! = NTDB_SUCCESS ) {
return ntdb_logerr ( ntdb , ecode , NTDB_LOG_ERROR ,
" ntdb_transaction_recover: "
" failed to read recovery data " ) ;
}
/* recover the file data */
p = data ;
while ( p + sizeof ( ntdb_off_t ) + sizeof ( ntdb_len_t ) < data + rec . len ) {
ntdb_off_t ofs ;
ntdb_len_t len ;
ntdb_convert ( ntdb , p , sizeof ( ofs ) + sizeof ( len ) ) ;
memcpy ( & ofs , p , sizeof ( ofs ) ) ;
memcpy ( & len , p + sizeof ( ofs ) , sizeof ( len ) ) ;
p + = sizeof ( ofs ) + sizeof ( len ) ;
ecode = ntdb - > io - > twrite ( ntdb , ofs , p , len ) ;
if ( ecode ! = NTDB_SUCCESS ) {
2012-06-19 12:42:08 +09:30
ntdb - > free_fn ( data , ntdb - > alloc_data ) ;
2012-06-18 22:30:26 +09:30
return ntdb_logerr ( ntdb , ecode , NTDB_LOG_ERROR ,
" ntdb_transaction_recover: "
" failed to recover %zu bytes "
" at offset %zu " ,
( size_t ) len , ( size_t ) ofs ) ;
}
p + = len ;
}
2012-06-19 12:42:08 +09:30
ntdb - > free_fn ( data , ntdb - > alloc_data ) ;
2012-06-18 22:30:26 +09:30
ecode = transaction_sync ( ntdb , 0 , ntdb - > file - > map_size ) ;
if ( ecode ! = NTDB_SUCCESS ) {
return ntdb_logerr ( ntdb , ecode , NTDB_LOG_ERROR ,
" ntdb_transaction_recover: "
" failed to sync recovery " ) ;
}
/* if the recovery area is after the recovered eof then remove it */
if ( recovery_eof < = recovery_head ) {
ecode = ntdb_write_off ( ntdb , offsetof ( struct ntdb_header ,
recovery ) ,
0 ) ;
if ( ecode ! = NTDB_SUCCESS ) {
return ntdb_logerr ( ntdb , ecode , NTDB_LOG_ERROR ,
" ntdb_transaction_recover: "
" failed to remove recovery head " ) ;
}
}
/* remove the recovery magic */
ecode = ntdb_write_off ( ntdb ,
recovery_head
+ offsetof ( struct ntdb_recovery_record , magic ) ,
NTDB_RECOVERY_INVALID_MAGIC ) ;
if ( ecode ! = NTDB_SUCCESS ) {
return ntdb_logerr ( ntdb , ecode , NTDB_LOG_ERROR ,
" ntdb_transaction_recover: "
" failed to remove recovery magic " ) ;
}
ecode = transaction_sync ( ntdb , 0 , recovery_eof ) ;
if ( ecode ! = NTDB_SUCCESS ) {
return ntdb_logerr ( ntdb , ecode , NTDB_LOG_ERROR ,
" ntdb_transaction_recover: "
" failed to sync2 recovery " ) ;
}
ntdb_logerr ( ntdb , NTDB_SUCCESS , NTDB_LOG_WARNING ,
" ntdb_transaction_recover: recovered %zu byte database " ,
( size_t ) recovery_eof ) ;
/* all done */
return NTDB_SUCCESS ;
}
ntdb_bool_err ntdb_needs_recovery ( struct ntdb_context * ntdb )
{
ntdb_off_t recovery_head ;
struct ntdb_recovery_record rec ;
enum NTDB_ERROR ecode ;
/* find the recovery area */
recovery_head = ntdb_read_off ( ntdb , offsetof ( struct ntdb_header , recovery ) ) ;
if ( NTDB_OFF_IS_ERR ( recovery_head ) ) {
return recovery_head ;
}
if ( recovery_head = = 0 ) {
/* we have never allocated a recovery record */
return false ;
}
/* read the recovery record */
ecode = ntdb_read_convert ( ntdb , recovery_head , & rec , sizeof ( rec ) ) ;
if ( ecode ! = NTDB_SUCCESS ) {
return NTDB_ERR_TO_OFF ( ecode ) ;
}
return ( rec . magic = = NTDB_RECOVERY_MAGIC ) ;
}