2009-01-08 17:12:33 +00:00
# include <stdint.h>
# include <errno.h>
# include <string.h>
# include <sys/types.h>
# include <sys/stat.h>
# include <dirent.h>
# include <unistd.h>
# include <signal.h>
# include <ext2fs/ext2_fs.h>
# include <ext2fs/ext2fs.h>
# include <linux/kdev_t.h>
# define __USE_GNU /* for O_DIRECT */
# include <fcntl.h>
2009-04-21 19:16:22 +00:00
# include <time.h>
2009-01-08 17:12:33 +00:00
# include "linux/dm-clog-tfr.h"
# include "list.h"
# include "functions.h"
# include "common.h"
# include "cluster.h"
# include "logging.h"
/* log2 of the bits in a byte: (size_in_bytes << BYTE_SHIFT) gives a size in bits */
# define BYTE_SHIFT 3
/*
 * Magic for persistent mirrors: "MiRr" (0x4D 0x69 0x52 0x72 in ASCII).
 * The following on-disk header information is taken from
 * drivers/md/dm-log.c
 */
# define MIRROR_MAGIC 0x4D695272
/* Version number stored in the on-disk header by write_log() */
# define MIRROR_DISK_VERSION 2
/*
 * NOTE(review): not referenced in the visible part of this file. Presumably
 * the bitmap offset in 512-byte sectors (2 * 512 = 1024, the byte offset
 * read_log()/write_log() use) - TODO confirm.
 */
# define LOG_OFFSET 2
/* Number of slots in the resync_history ring buffer */
#define RESYNC_HISTORY 50

/*
 * Ring buffer holding the most recent resync debugging messages.
 * 'idx' is the slot filled by the latest LOG_SPRINT() call.
 */
static char resync_history[RESYNC_HISTORY][128];
static int idx = 0;

/*
 * LOG_SPRINT
 * @f: printf-style format string
 * @arg: format arguments (may be empty)
 *
 * Advances 'idx' to the next slot (wrapping at RESYNC_HISTORY) and formats
 * the message into it.  The write is bounded by the slot size, so an
 * over-long message is truncated instead of overrunning into the
 * neighbouring slot.
 */
#define LOG_SPRINT(f, arg...) do {					\
		idx++;							\
		idx = idx % RESYNC_HISTORY;				\
		snprintf(resync_history[idx],				\
			 sizeof(resync_history[idx]), f, ## arg);	\
	} while (0)
/*
 * Header kept at the start of the disk log buffer.  write_log() fills it in
 * and read_log() validates it; both copy it with a plain memcpy.
 */
struct log_header {
uint32_t magic ; /* MIRROR_MAGIC when the header is valid */
uint32_t version ; /* written as MIRROR_DISK_VERSION */
uint64_t nr_regions ; /* region count at the time the log was written */
} ;
/*
 * Per-log context.  One instance exists per uuid; it lives on either
 * log_list or log_pending_list.
 */
struct log_c {
struct list_head list ; /* linkage on log_list or log_pending_list */
char uuid [ DM_UUID_LEN ] ; /* key compared by get_log()/get_pending_log() */
uint32_t ref_count ; /* bumped when a constructor names an existing uuid; dropped by clog_dtr() */
2009-04-21 19:16:22 +00:00
time_t delay ; /* limits how fast a resume can happen after suspend */
2009-01-08 17:12:33 +00:00
int touched ; /* set by log_set_bit()/log_clear_bit(); cleared when flushed */
uint32_t region_size ;
uint32_t region_count ;
uint64_t sync_count ; /* number of bits set in sync_bits */
uint32_t bitset_uint32_count ; /* number of uint32_t words in each bitset */
uint32_t * clean_bits ; /* bitset that is read from / written to the disk log */
uint32_t * sync_bits ; /* bitset of in-sync regions */
uint32_t recoverer ; /* node id that was handed recovering_region */
uint64_t recovering_region ; /* -1 means not recovering */
2009-04-21 19:16:22 +00:00
uint64_t skip_bit_warning ; /* used to warn if region skipped */
2009-01-08 17:12:33 +00:00
int sync_search ; /* region index where the next search for resync work starts */
int resume_override ; /* 0 = nothing loaded, 3 = bits pre-loaded, 1000 = already resumed; see clog_resume() */
uint32_t block_on_error ; /* set when the constructor is given "block_on_error" */
enum sync {
DEFAULTSYNC , /* Synchronize if necessary */
NOSYNC , /* Devices known to be already in sync */
FORCESYNC , /* Force a sync to happen */
} sync ;
uint32_t state ; /* current operational state of the log */
struct list_head mark_list ; /* list of struct mark_entry */
uint32_t recovery_halted ; /* set by clog_presuspend(), cleared by clog_resume() */
struct recovery_request * recovery_request_list ; /* singly linked; consumed by clog_get_resync_work() */
int disk_fd ; /* -1 means no disk log */
int log_dev_failed ; /* set when write_log() fails */
uint64_t disk_nr_regions ; /* nr_regions taken from the on-disk header */
size_t disk_size ; /* size of disk_buffer in bytes */
void * disk_buffer ; /* aligned memory for O_DIRECT */
} ;
/*
 * One outstanding "mark" of a region by a node; kept on log_c.mark_list.
 * 'list' is the first member, and mark_region()/clear_region() cast a
 * list_head pointer straight to a mark_entry pointer - do not reorder.
 */
struct mark_entry {
struct list_head list ;
uint32_t nodeid ; /* node that marked the region */
uint64_t region ; /* region that was marked */
} ;
/*
 * Entry in the singly linked log_c.recovery_request_list.  Entries are
 * popped and freed by clog_get_resync_work(), which hands out the region
 * as priority resync work if it is not yet in sync.
 */
struct recovery_request {
uint64_t region ;
struct recovery_request * next ;
} ;
/* Logs that have joined the CPG (moved here by local_resume()); searched by get_log() */
static struct list_head log_list = LIST_HEAD_INIT ( log_list ) ;
/* Logs that were constructed or suspended but are not in the CPG; searched by get_pending_log() */
static struct list_head log_pending_list = LIST_HEAD_INIT ( log_pending_list ) ;
/*
 * log_test_bit
 * @bs: bitset to inspect
 * @bit: index of the bit to test
 *
 * Returns: 1 if the bit is set, 0 otherwise
 */
static int log_test_bit(uint32_t *bs, unsigned bit)
{
	if (ext2fs_test_bit(bit, (unsigned int *) bs))
		return 1;

	return 0;
}
static void log_set_bit ( struct log_c * lc , uint32_t * bs , unsigned bit )
{
ext2fs_set_bit ( bit , ( unsigned int * ) bs ) ;
lc - > touched = 1 ;
}
static void log_clear_bit ( struct log_c * lc , uint32_t * bs , unsigned bit )
{
ext2fs_clear_bit ( bit , ( unsigned int * ) bs ) ;
lc - > touched = 1 ;
}
/*
 * find_next_zero_bit
 * @bits: bitset to scan
 * @count: number of valid bits in the set
 * @start: index at which to begin scanning
 *
 * FIXME: Why aren't count and start the same type?
 *
 * Returns: index of the first clear bit at or after 'start', or the index
 * at which the scan stopped because it reached 'count'
 */
static uint64_t find_next_zero_bit(uint32_t *bits, uint32_t count, int start)
{
	while ((start < count) && log_test_bit(bits, start))
		start++;

	return start;
}
/*
 * count_bits32
 * @addr: array of 32-bit words
 * @count: number of words in the array
 *
 * Counts with unsigned arithmetic only; the previous '1 << j' form shifted
 * a signed int into its sign bit for j == 31, which is undefined behaviour.
 *
 * Returns: total number of bits set across all 'count' words
 */
static uint64_t count_bits32(uint32_t *addr, uint32_t count)
{
	uint64_t rtn = 0;
	uint32_t i;

	for (i = 0; i < count; i++) {
		uint32_t word = addr[i];

		/* Each pass clears the lowest set bit of the word */
		for (; word; word &= word - 1)
			rtn++;
	}

	return rtn;
}
/*
 * get_log
 * @uuid: uuid of the log to look up
 *
 * Searches log_list for a log whose uuid matches.
 *
 * Returns: log if found, NULL otherwise
 */
static struct log_c *get_log(const char *uuid)
{
	struct list_head *pos;
	struct log_c *candidate;

	/* FIXME: Need prefetch to do this right */
	__list_for_each(pos, &log_list) {
		candidate = list_entry(pos, struct log_c, list);
		if (strcmp(candidate->uuid, uuid) == 0)
			return candidate;
	}

	return NULL;
}
/*
 * get_pending_log
 * @uuid: uuid of the log to look up
 *
 * Pending logs are logs that have been 'clog_ctr'ed, but
 * have not joined the CPG (via clog_resume).
 *
 * Returns: log if found, NULL otherwise
 */
static struct log_c *get_pending_log(const char *uuid)
{
	struct list_head *pos;
	struct log_c *candidate;

	/* FIXME: Need prefetch to do this right */
	__list_for_each(pos, &log_pending_list) {
		candidate = list_entry(pos, struct log_c, list);
		if (strcmp(candidate->uuid, uuid) == 0)
			return candidate;
	}

	return NULL;
}
/*
 * header_to_disk
 * @mem: in-memory header (source)
 * @disk: header location inside the disk buffer (destination)
 *
 * Plain byte copy; no byte-order conversion is performed.
 */
static void header_to_disk(struct log_header *mem, struct log_header *disk)
{
	memcpy(disk, mem, sizeof(*disk));
}
/*
 * header_from_disk
 * @mem: in-memory header (destination)
 * @disk: header location inside the disk buffer (source)
 *
 * Plain byte copy; no byte-order conversion is performed.
 */
static void header_from_disk(struct log_header *mem, struct log_header *disk)
{
	memcpy(mem, disk, sizeof(*mem));
}
static int rw_log ( struct log_c * lc , int do_write )
{
int r ;
r = lseek ( lc - > disk_fd , 0 , SEEK_SET ) ;
if ( r < 0 ) {
LOG_ERROR ( " [%s] rw_log: lseek failure: %s " ,
SHORT_UUID ( lc - > uuid ) , strerror ( errno ) ) ;
return - errno ;
}
if ( do_write ) {
r = write ( lc - > disk_fd , lc - > disk_buffer , lc - > disk_size ) ;
if ( r < 0 ) {
LOG_ERROR ( " [%s] rw_log: write failure: %s " ,
SHORT_UUID ( lc - > uuid ) , strerror ( errno ) ) ;
return - EIO ; /* Failed disk write */
}
return 0 ;
}
/* Read */
r = read ( lc - > disk_fd , lc - > disk_buffer , lc - > disk_size ) ;
if ( r < 0 )
LOG_ERROR ( " [%s] rw_log: read failure: %s " ,
SHORT_UUID ( lc - > uuid ) , strerror ( errno ) ) ;
if ( r ! = lc - > disk_size )
return - EIO ; /* Failed disk read */
return 0 ;
}
/*
* read_log
* @ lc
*
* Valid return codes :
* - EINVAL : Invalid header , bits not copied
* - EIO : Unable to read disk log
* 0 : Valid header , disk bit - > lc - > clean_bits
*
* Returns : 0 on success , - EXXX on failure
*/
static int read_log ( struct log_c * lc )
{
struct log_header lh ;
size_t bitset_size ;
memset ( & lh , 0 , sizeof ( struct log_header ) ) ;
if ( rw_log ( lc , 0 ) )
return - EIO ; /* Failed disk read */
header_from_disk ( & lh , lc - > disk_buffer ) ;
if ( lh . magic ! = MIRROR_MAGIC )
return - EINVAL ;
lc - > disk_nr_regions = lh . nr_regions ;
/* Read disk bits into sync_bits */
bitset_size = lc - > region_count / 8 ;
bitset_size + = ( lc - > region_count % 8 ) ? 1 : 0 ;
memcpy ( lc - > clean_bits , lc - > disk_buffer + 1024 , bitset_size ) ;
return 0 ;
}
/*
* write_log
* @ lc
*
* Returns : 0 on success , - EIO on failure
*/
static int write_log ( struct log_c * lc )
{
struct log_header lh ;
size_t bitset_size ;
lh . magic = MIRROR_MAGIC ;
lh . version = MIRROR_DISK_VERSION ;
lh . nr_regions = lc - > region_count ;
header_to_disk ( & lh , lc - > disk_buffer ) ;
/* Write disk bits from clean_bits */
bitset_size = lc - > region_count / 8 ;
bitset_size + = ( lc - > region_count % 8 ) ? 1 : 0 ;
memcpy ( lc - > disk_buffer + 1024 , lc - > clean_bits , bitset_size ) ;
if ( rw_log ( lc , 1 ) ) {
lc - > log_dev_failed = 1 ;
return - EIO ; /* Failed disk write */
}
return 0 ;
}
/*
 * find_disk_path
 * @major_minor_str: device given as "<major>:<minor>"
 * @path_rtn: buffer that receives the path to the device node
 * @unlink_path: set to 1 when this function had to create the node
 *
 * Scans /dev/mapper for a block device with the requested major:minor.
 * If none is found, a node named /dev/mapper/<major>-<minor> is created
 * and *unlink_path is set so the caller can remove it after opening.
 *
 * Entries that cannot be stat'ed, or whose path would not fit in the
 * caller's buffer, are skipped rather than compared against stale data.
 *
 * NOTE(review): the size of path_rtn is not passed in.  The only caller
 * visible in this file (_clog_ctr) supplies a 128-byte buffer, so writes
 * are bounded to that size here - confirm if other callers exist.
 *
 * Returns: 0 on success, -EINVAL on a malformed device string, -ENOENT if
 * /dev/mapper cannot be opened, -errno if the node cannot be created
 */
static int find_disk_path(char *major_minor_str, char *path_rtn, int *unlink_path)
{
	enum { PATH_RTN_LEN = 128 };
	int r;
	DIR *dp;
	struct dirent *dep;
	struct stat statbuf;
	int dev_major, dev_minor;

	r = sscanf(major_minor_str, " %d:%d ", &dev_major, &dev_minor);
	if (r != 2)
		return -EINVAL;

	LOG_DBG(" Checking /dev/mapper for device %d:%d ", dev_major, dev_minor);

	/* Check /dev/mapper dir */
	dp = opendir("/dev/mapper");
	if (!dp)
		return -ENOENT;

	while ((dep = readdir(dp)) != NULL) {
		/*
		 * FIXME: This is racy.  By the time the path is used,
		 * it may point to something else.  'fstat' will be
		 * required upon opening to ensure we got what we
		 * wanted.
		 */
		r = snprintf(path_rtn, PATH_RTN_LEN, "/dev/mapper/%s",
			     dep->d_name);

		if ((r >= 0) && (r < PATH_RTN_LEN) &&
		    !stat(path_rtn, &statbuf) &&
		    S_ISBLK(statbuf.st_mode) &&
		    (major(statbuf.st_rdev) == dev_major) &&
		    (minor(statbuf.st_rdev) == dev_minor)) {
			LOG_DBG(" %s: YES ", dep->d_name);
			closedir(dp);
			return 0;
		}

		LOG_DBG(" %s: NO ", dep->d_name);
	}

	closedir(dp);

	LOG_DBG(" Path not found for %d/%d ", dev_major, dev_minor);
	LOG_DBG(" Creating /dev/mapper/%d-%d ", dev_major, dev_minor);
	snprintf(path_rtn, PATH_RTN_LEN, "/dev/mapper/%d-%d",
		 dev_major, dev_minor);

	if (mknod(path_rtn, S_IFBLK | S_IRUSR | S_IWUSR,
		  MKDEV(dev_major, dev_minor)))
		return -errno;

	/*
	 * If we have to make the path, we unlink it after we open it.
	 * Only request that when the node really was created by us.
	 */
	*unlink_path = 1;

	return 0;
}
static int _clog_ctr ( int argc , char * * argv , uint64_t device_size )
{
int i ;
int r = 0 ;
char * p ;
uint64_t region_size ;
uint64_t region_count ;
uint32_t bitset_size ;
struct log_c * lc = NULL ;
struct log_c * dup ;
enum sync sync = DEFAULTSYNC ;
uint32_t block_on_error = 0 ;
int disk_log = 0 ;
char disk_path [ 128 ] ;
int unlink_path = 0 ;
size_t page_size ;
int pages ;
/* If core log request, then argv[0] will be region_size */
if ( ! strtoll ( argv [ 0 ] , & p , 0 ) | | * p ) {
disk_log = 1 ;
if ( ( argc < 3 ) | | ( argc > 5 ) ) {
LOG_ERROR ( " Too %s arguments to clustered_disk log type " ,
( argc < 3 ) ? " few " : " many " ) ;
r = - EINVAL ;
goto fail ;
}
r = find_disk_path ( argv [ 0 ] , disk_path , & unlink_path ) ;
if ( r ) {
LOG_ERROR ( " Unable to find path to device %s " , argv [ 0 ] ) ;
goto fail ;
}
LOG_DBG ( " Clustered log disk is %s " , disk_path ) ;
} else {
disk_log = 0 ;
if ( ( argc < 2 ) | | ( argc > 4 ) ) {
LOG_ERROR ( " Too %s arguments to clustered_core log type " ,
( argc < 2 ) ? " few " : " many " ) ;
r = - EINVAL ;
goto fail ;
}
}
if ( ! ( region_size = strtoll ( argv [ disk_log ] , & p , 0 ) ) | | * p ) {
LOG_ERROR ( " Invalid region_size argument to clustered_%s log type " ,
( disk_log ) ? " disk " : " core " ) ;
r = - EINVAL ;
goto fail ;
}
region_count = device_size / region_size ;
if ( device_size % region_size ) {
/*
* I can ' t remember if device_size must be a multiple
* of region_size , so check it anyway .
*/
region_count + + ;
}
for ( i = 0 ; i < argc ; i + + ) {
if ( ! strcmp ( argv [ i ] , " sync " ) )
sync = FORCESYNC ;
else if ( ! strcmp ( argv [ i ] , " nosync " ) )
sync = NOSYNC ;
else if ( ! strcmp ( argv [ i ] , " block_on_error " ) )
block_on_error = 1 ;
}
lc = malloc ( sizeof ( * lc ) ) ;
if ( ! lc ) {
LOG_ERROR ( " Unable to allocate cluster log context " ) ;
r = - ENOMEM ;
goto fail ;
}
memset ( lc , 0 , sizeof ( * lc ) ) ;
lc - > region_size = region_size ;
lc - > region_count = region_count ;
lc - > sync = sync ;
lc - > block_on_error = block_on_error ;
lc - > sync_search = 0 ;
lc - > recovering_region = ( uint64_t ) - 1 ;
2009-04-21 19:16:22 +00:00
lc - > skip_bit_warning = region_count ;
2009-01-08 17:12:33 +00:00
lc - > disk_fd = - 1 ;
lc - > log_dev_failed = 0 ;
lc - > ref_count = 1 ;
strncpy ( lc - > uuid , argv [ 1 + disk_log ] , DM_UUID_LEN ) ;
if ( ( dup = get_log ( lc - > uuid ) ) | |
( dup = get_pending_log ( lc - > uuid ) ) ) {
LOG_DBG ( " [%s] Inc reference count on cluster log " ,
SHORT_UUID ( lc - > uuid ) ) ;
free ( lc ) ;
dup - > ref_count + + ;
return 0 ;
}
INIT_LIST_HEAD ( & lc - > mark_list ) ;
lc - > bitset_uint32_count = region_count /
( sizeof ( * lc - > clean_bits ) < < BYTE_SHIFT ) ;
if ( region_count % ( sizeof ( * lc - > clean_bits ) < < BYTE_SHIFT ) )
lc - > bitset_uint32_count + + ;
bitset_size = lc - > bitset_uint32_count * sizeof ( * lc - > clean_bits ) ;
lc - > clean_bits = malloc ( bitset_size ) ;
if ( ! lc - > clean_bits ) {
LOG_ERROR ( " Unable to allocate clean bitset " ) ;
r = - ENOMEM ;
goto fail ;
}
memset ( lc - > clean_bits , - 1 , bitset_size ) ;
lc - > sync_bits = malloc ( bitset_size ) ;
if ( ! lc - > sync_bits ) {
LOG_ERROR ( " Unable to allocate sync bitset " ) ;
r = - ENOMEM ;
goto fail ;
}
memset ( lc - > sync_bits , ( sync = = NOSYNC ) ? - 1 : 0 , bitset_size ) ;
lc - > sync_count = ( sync = = NOSYNC ) ? region_count : 0 ;
if ( disk_log ) {
page_size = sysconf ( _SC_PAGESIZE ) ;
pages = bitset_size / page_size ;
pages + = bitset_size % page_size ? 1 : 0 ;
pages + = 1 ; /* for header */
r = open ( disk_path , O_RDWR | O_DIRECT ) ;
if ( r < 0 ) {
LOG_ERROR ( " Unable to open log device, %s: %s " ,
disk_path , strerror ( errno ) ) ;
r = errno ;
goto fail ;
}
if ( unlink_path )
unlink ( disk_path ) ;
lc - > disk_fd = r ;
lc - > disk_size = pages * page_size ;
r = posix_memalign ( & ( lc - > disk_buffer ) , page_size ,
lc - > disk_size ) ;
if ( r ) {
LOG_ERROR ( " Unable to allocate memory for disk_buffer " ) ;
goto fail ;
}
memset ( lc - > disk_buffer , 0 , lc - > disk_size ) ;
LOG_DBG ( " Disk log ready " ) ;
}
list_add ( & lc - > list , & log_pending_list ) ;
return 0 ;
fail :
if ( lc ) {
if ( lc - > clean_bits )
free ( lc - > clean_bits ) ;
if ( lc - > sync_bits )
free ( lc - > sync_bits ) ;
if ( lc - > disk_buffer )
free ( lc - > disk_buffer ) ;
if ( lc - > disk_fd > = 0 )
close ( lc - > disk_fd ) ;
free ( lc ) ;
}
return r ;
}
/*
* clog_ctr
* @ tfr
*
* tfr - > data should contain constructor string as follows :
* [ disk ] < regiion_size > < uuid > [ [ no ] sync ] < device_len >
* The kernel is responsible for adding the < dev_len > argument
* to the end ; otherwise , we cannot compute the region_count .
*
* FIXME : Currently relies on caller to fill in tfr - > error
*/
static int clog_dtr ( struct clog_tfr * tfr ) ;
static int clog_ctr ( struct clog_tfr * tfr )
{
int argc , i , r = 0 ;
char * p , * * argv = NULL ;
uint64_t device_size ;
/* Sanity checks */
if ( ! tfr - > data_size ) {
LOG_ERROR ( " Received constructor request with no data " ) ;
return - EINVAL ;
}
if ( strlen ( tfr - > data ) ! = tfr - > data_size ) {
LOG_ERROR ( " Received constructor request with bad data " ) ;
LOG_ERROR ( " strlen(tfr->data)[%d] != tfr->data_size[%d] " ,
( int ) strlen ( tfr - > data ) , tfr - > data_size ) ;
LOG_ERROR ( " tfr->data = '%s' [%d] " ,
tfr - > data , ( int ) strlen ( tfr - > data ) ) ;
return - EINVAL ;
}
/* Split up args */
for ( argc = 1 , p = tfr - > data ; ( p = strstr ( p , " " ) ) ; p + + , argc + + )
* p = ' \0 ' ;
argv = malloc ( argc * sizeof ( char * ) ) ;
if ( ! argv )
return - ENOMEM ;
for ( i = 0 , p = tfr - > data ; i < argc ; i + + , p = p + strlen ( p ) + 1 )
argv [ i ] = p ;
if ( ! ( device_size = strtoll ( argv [ argc - 1 ] , & p , 0 ) ) | | * p ) {
LOG_ERROR ( " Invalid device size argument: %s " , argv [ argc - 1 ] ) ;
free ( argv ) ;
return - EINVAL ;
}
argc - - ; /* We pass in the device_size separate */
r = _clog_ctr ( argc , argv , device_size ) ;
/* We join the CPG when we resume */
/* No returning data */
tfr - > data_size = 0 ;
free ( argv ) ;
if ( r )
LOG_ERROR ( " Failed to create cluster log (%s) " , tfr - > uuid ) ;
else
LOG_DBG ( " [%s] Cluster log created " ,
SHORT_UUID ( tfr - > uuid ) ) ;
return r ;
}
/*
* clog_dtr
* @ tfr
*
*/
static int clog_dtr ( struct clog_tfr * tfr )
{
struct log_c * lc = get_log ( tfr - > uuid ) ;
if ( lc ) {
/*
* The log should not be on the official list . There
* should have been a suspend first .
*/
lc - > ref_count - - ;
if ( ! lc - > ref_count ) {
LOG_ERROR ( " [%s] DTR before SUS: leaving CPG " ,
SHORT_UUID ( tfr - > uuid ) ) ;
destroy_cluster_cpg ( tfr - > uuid ) ;
}
} else if ( ( lc = get_pending_log ( tfr - > uuid ) ) ) {
lc - > ref_count - - ;
} else {
LOG_ERROR ( " clog_dtr called on log that is not official or pending " ) ;
return - EINVAL ;
}
if ( lc - > ref_count ) {
LOG_DBG ( " [%s] Dec reference count on cluster log " ,
SHORT_UUID ( lc - > uuid ) ) ;
return 0 ;
}
LOG_DBG ( " [%s] Cluster log removed " , SHORT_UUID ( lc - > uuid ) ) ;
list_del_init ( & lc - > list ) ;
if ( lc - > disk_fd ! = - 1 )
close ( lc - > disk_fd ) ;
if ( lc - > disk_buffer )
free ( lc - > disk_buffer ) ;
free ( lc - > clean_bits ) ;
free ( lc - > sync_bits ) ;
free ( lc ) ;
return 0 ;
}
/*
* clog_presuspend
* @ tfr
*
*/
static int clog_presuspend ( struct clog_tfr * tfr )
{
struct log_c * lc = get_log ( tfr - > uuid ) ;
if ( ! lc )
return - EINVAL ;
if ( lc - > touched )
LOG_DBG ( " WARNING: log still marked as 'touched' during suspend " ) ;
lc - > recovery_halted = 1 ;
return 0 ;
}
/*
* clog_postsuspend
* @ tfr
*
*/
static int clog_postsuspend ( struct clog_tfr * tfr )
{
struct log_c * lc = get_log ( tfr - > uuid ) ;
if ( ! lc )
return - EINVAL ;
LOG_DBG ( " [%s] clog_postsuspend: leaving CPG " , SHORT_UUID ( lc - > uuid ) ) ;
destroy_cluster_cpg ( tfr - > uuid ) ;
2009-04-21 19:16:22 +00:00
lc - > state = LOG_SUSPENDED ;
2009-01-08 17:12:33 +00:00
lc - > recovering_region = ( uint64_t ) - 1 ;
lc - > recoverer = ( uint32_t ) - 1 ;
2009-04-21 19:16:22 +00:00
lc - > delay = time ( NULL ) ;
2009-01-08 17:12:33 +00:00
return 0 ;
}
/*
* cluster_postsuspend
* @ tfr
*
*/
int cluster_postsuspend ( char * uuid )
{
struct log_c * lc = get_log ( uuid ) ;
if ( ! lc )
return - EINVAL ;
LOG_DBG ( " [%s] clog_postsuspend: finalizing " , SHORT_UUID ( lc - > uuid ) ) ;
lc - > resume_override = 0 ;
/* move log to pending list */
list_del_init ( & lc - > list ) ;
list_add ( & lc - > list , & log_pending_list ) ;
return 0 ;
}
/*
* clog_resume
* @ tfr
*
* Does the main work of resuming .
*/
static int clog_resume ( struct clog_tfr * tfr )
{
uint32_t i ;
int commit_log = 0 ;
struct log_c * lc = get_log ( tfr - > uuid ) ;
size_t size = lc - > bitset_uint32_count * sizeof ( uint32_t ) ;
if ( ! lc )
return - EINVAL ;
switch ( lc - > resume_override ) {
case 1000 :
LOG_ERROR ( " [%s] Additional resume issued before suspend " ,
SHORT_UUID ( tfr - > uuid ) ) ;
2009-04-21 19:16:22 +00:00
# ifdef DEBUG
kill ( getpid ( ) , SIGUSR1 ) ;
# endif
2009-01-08 17:12:33 +00:00
return 0 ;
case 0 :
lc - > resume_override = 1000 ;
if ( lc - > disk_fd = = - 1 ) {
LOG_DBG ( " [%s] Master resume. " ,
SHORT_UUID ( lc - > uuid ) ) ;
goto no_disk ;
}
LOG_DBG ( " [%s] Master resume: reading disk log " ,
SHORT_UUID ( lc - > uuid ) ) ;
commit_log = 1 ;
break ;
case 1 :
LOG_ERROR ( " Error:: partial bit loading (just sync_bits) " ) ;
return - EINVAL ;
case 2 :
LOG_ERROR ( " Error:: partial bit loading (just clean_bits) " ) ;
return - EINVAL ;
case 3 :
LOG_DBG ( " [%s] Non-master resume: bits pre-loaded " ,
SHORT_UUID ( lc - > uuid ) ) ;
lc - > resume_override = 1000 ;
goto out ;
default :
LOG_ERROR ( " Error:: multiple loading of bits (%d) " , lc - > resume_override ) ;
return - EINVAL ;
}
if ( lc - > log_dev_failed ) {
LOG_ERROR ( " Log device has failed, unable to read bits " ) ;
tfr - > error = 0 ; /* We can handle this so far */
lc - > disk_nr_regions = 0 ;
} else
tfr - > error = read_log ( lc ) ;
switch ( tfr - > error ) {
case 0 :
if ( lc - > disk_nr_regions < lc - > region_count )
LOG_DBG ( " [%s] Mirror has grown, updating log bits " ,
SHORT_UUID ( lc - > uuid ) ) ;
else if ( lc - > disk_nr_regions > lc - > region_count )
LOG_DBG ( " [%s] Mirror has shrunk, updating log bits " ,
SHORT_UUID ( lc - > uuid ) ) ;
break ;
case - EINVAL :
LOG_PRINT ( " [%s] (Re)initializing mirror log - resync issued. " ,
SHORT_UUID ( lc - > uuid ) ) ;
lc - > disk_nr_regions = 0 ;
break ;
default :
LOG_ERROR ( " Failed to read disk log " ) ;
lc - > disk_nr_regions = 0 ;
break ;
}
no_disk :
/* If mirror has grown, set bits appropriately */
if ( lc - > sync = = NOSYNC )
for ( i = lc - > disk_nr_regions ; i < lc - > region_count ; i + + )
log_set_bit ( lc , lc - > clean_bits , i ) ;
else
for ( i = lc - > disk_nr_regions ; i < lc - > region_count ; i + + )
log_clear_bit ( lc , lc - > clean_bits , i ) ;
/* Clear any old bits if device has shrunk */
for ( i = lc - > region_count ; i % 32 ; i + + )
log_clear_bit ( lc , lc - > clean_bits , i ) ;
/* copy clean across to sync */
memcpy ( lc - > sync_bits , lc - > clean_bits , size ) ;
if ( commit_log & & ( lc - > disk_fd > = 0 ) ) {
tfr - > error = write_log ( lc ) ;
if ( tfr - > error )
LOG_ERROR ( " Failed initial disk log write " ) ;
else
LOG_DBG ( " Disk log initialized " ) ;
lc - > touched = 0 ;
}
out :
/*
* Clear any old bits if device has shrunk - necessary
* for non - master resume
*/
for ( i = lc - > region_count ; i % 32 ; i + + ) {
log_clear_bit ( lc , lc - > clean_bits , i ) ;
log_clear_bit ( lc , lc - > sync_bits , i ) ;
}
lc - > sync_count = count_bits32 ( lc - > sync_bits , lc - > bitset_uint32_count ) ;
2009-04-21 19:16:22 +00:00
LOG_SPRINT ( " [%s] Initial sync_count = %llu " ,
SHORT_UUID ( lc - > uuid ) , ( unsigned long long ) lc - > sync_count ) ;
2009-01-08 17:12:33 +00:00
lc - > sync_search = 0 ;
lc - > state = LOG_RESUMED ;
lc - > recovery_halted = 0 ;
return tfr - > error ;
}
/*
* local_resume
* @ tfr
*
* If the log is pending , we must first join the cpg and
* put the log in the official list .
*
*/
int local_resume ( struct clog_tfr * tfr )
{
int r ;
2009-04-21 19:16:22 +00:00
time_t t ;
2009-01-08 17:12:33 +00:00
struct log_c * lc = get_log ( tfr - > uuid ) ;
if ( ! lc ) {
/* Is the log in the pending list? */
lc = get_pending_log ( tfr - > uuid ) ;
if ( ! lc ) {
LOG_ERROR ( " clog_resume called on log that is not official or pending " ) ;
return - EINVAL ;
}
2009-04-21 19:16:22 +00:00
t = time ( NULL ) ;
t - = lc - > delay ;
/*
* This should be considered a temporary fix . It addresses
* a problem that exists when nodes suspend / resume in rapid
* succession . While the problem is very rare , it has been
* seen to happen in real - world - like testing .
*
* The problem :
* - Node A joins cluster
* - Node B joins cluster
* - Node A prepares checkpoint
* - Node A gets ready to write checkpoint
* - Node B leaves
* - Node B joins
* - Node A finishes write of checkpoint
* - Node B receives checkpoint meant for previous session
* - - Node B can now be non - coherent
*
* This timer will solve the problem for now , but could be
* replaced by a generation number sent with the resume
* command from the kernel . The generation number would
* be included in the name of the checkpoint to prevent
* reading stale data .
*/
if ( ( t < 3 ) & & ( t > = 0 ) )
sleep ( 3 - t ) ;
2009-01-08 17:12:33 +00:00
/* Join the CPG */
r = create_cluster_cpg ( tfr - > uuid ) ;
if ( r ) {
LOG_ERROR ( " clog_resume: Failed to create cluster CPG " ) ;
return r ;
}
/* move log to official list */
list_del_init ( & lc - > list ) ;
list_add ( & lc - > list , & log_list ) ;
}
return 0 ;
}
/*
* clog_get_region_size
* @ tfr
*
* Since this value doesn ' t change , the kernel
* should not need to talk to server to get this
* The function is here for completness
*
* Returns : 0 on success , - EXXX on failure
*/
static int clog_get_region_size ( struct clog_tfr * tfr )
{
uint64_t * rtn = ( uint64_t * ) tfr - > data ;
struct log_c * lc = get_log ( tfr - > uuid ) ;
LOG_PRINT ( " WARNING: kernel should not be calling clog_get_region_size " ) ;
if ( ! lc )
return - EINVAL ;
/* FIXME: region_size is 32-bit, while function requires 64-bit */
* rtn = lc - > region_size ;
tfr - > data_size = sizeof ( * rtn ) ;
return 0 ;
}
/*
* clog_is_clean
* @ tfr
*
* Returns : 1 if clean , 0 otherwise
*/
static int clog_is_clean ( struct clog_tfr * tfr )
{
int * rtn = ( int * ) tfr - > data ;
uint64_t region = * ( ( uint64_t * ) ( tfr - > data ) ) ;
struct log_c * lc = get_log ( tfr - > uuid ) ;
if ( ! lc )
return - EINVAL ;
* rtn = log_test_bit ( lc - > clean_bits , region ) ;
tfr - > data_size = sizeof ( * rtn ) ;
return 0 ;
}
/*
* clog_in_sync
* @ tfr
*
* We ignore any request for non - block . That
* should be handled elsewhere . ( If the request
* has come this far , it has already blocked . )
*
* Returns : 1 if in - sync , 0 otherwise
*/
static int clog_in_sync ( struct clog_tfr * tfr )
{
int * rtn = ( int * ) tfr - > data ;
uint64_t region = * ( ( uint64_t * ) ( tfr - > data ) ) ;
struct log_c * lc = get_log ( tfr - > uuid ) ;
if ( ! lc )
return - EINVAL ;
if ( region > lc - > region_count )
return - EINVAL ;
* rtn = log_test_bit ( lc - > sync_bits , region ) ;
if ( * rtn )
LOG_DBG ( " [%s] Region is in-sync: %llu " ,
SHORT_UUID ( lc - > uuid ) , ( unsigned long long ) region ) ;
else
LOG_DBG ( " [%s] Region is not in-sync: %llu " ,
SHORT_UUID ( lc - > uuid ) , ( unsigned long long ) region ) ;
tfr - > data_size = sizeof ( * rtn ) ;
return 0 ;
}
/*
* clog_flush
* @ tfr
*
*/
static int clog_flush ( struct clog_tfr * tfr , int server )
{
int r = 0 ;
struct log_c * lc = get_log ( tfr - > uuid ) ;
if ( ! lc )
return - EINVAL ;
if ( ! lc - > touched )
return 0 ;
/*
* Do the actual flushing of the log only
* if we are the server .
*/
if ( server & & ( lc - > disk_fd > = 0 ) ) {
r = tfr - > error = write_log ( lc ) ;
if ( r )
LOG_ERROR ( " [%s] Error writing to disk log " ,
SHORT_UUID ( lc - > uuid ) ) ;
else
LOG_DBG ( " [%s] Disk log written " , SHORT_UUID ( lc - > uuid ) ) ;
}
lc - > touched = 0 ;
return r ;
}
/*
* mark_region
* @ lc
* @ region
* @ who
*
* Put a mark region request in the tree for tracking .
*
* Returns : 0 on success , - EXXX on error
*/
static int mark_region ( struct log_c * lc , uint64_t region , uint32_t who )
{
int found = 0 ;
struct mark_entry * m ;
struct list_head * p , * n ;
list_for_each_safe ( p , n , & lc - > mark_list ) {
/* FIXME: Use proper macros */
m = ( struct mark_entry * ) p ;
if ( m - > region = = region ) {
found = 1 ;
if ( m - > nodeid = = who )
return 0 ;
}
}
if ( ! found )
log_clear_bit ( lc , lc - > clean_bits , region ) ;
/*
* Save allocation until here - if there is a failure ,
* at least we have cleared the bit .
*/
m = malloc ( sizeof ( * m ) ) ;
if ( ! m ) {
LOG_ERROR ( " Unable to allocate space for mark_entry: %llu/%u " ,
( unsigned long long ) region , who ) ;
return - ENOMEM ;
}
m - > nodeid = who ;
m - > region = region ;
list_add_tail ( & m - > list , & lc - > mark_list ) ;
return 0 ;
}
/*
* clog_mark_region
* @ tfr
*
* tfr may contain more than one mark request . We
* can determine the number from the ' data_size ' field .
*
* Returns : 0 on success , - EXXX on failure
*/
static int clog_mark_region ( struct clog_tfr * tfr )
{
int r ;
int count ;
uint64_t * region ;
struct log_c * lc = get_log ( tfr - > uuid ) ;
if ( ! lc )
return - EINVAL ;
if ( tfr - > data_size % sizeof ( uint64_t ) ) {
LOG_ERROR ( " Bad data size given for mark_region request " ) ;
return - EINVAL ;
}
count = tfr - > data_size / sizeof ( uint64_t ) ;
region = ( uint64_t * ) & tfr - > data ;
for ( ; count > 0 ; count - - , region + + ) {
r = mark_region ( lc , * region , tfr - > originator ) ;
if ( r )
return r ;
}
tfr - > data_size = 0 ;
return 0 ;
}
static int clear_region ( struct log_c * lc , uint64_t region , uint32_t who )
{
int other_matches = 0 ;
struct mark_entry * m ;
struct list_head * p , * n ;
list_for_each_safe ( p , n , & lc - > mark_list ) {
/* FIXME: Use proper macros */
m = ( struct mark_entry * ) p ;
if ( m - > region = = region ) {
if ( m - > nodeid = = who ) {
list_del_init ( & m - > list ) ;
free ( m ) ;
} else
other_matches = 1 ;
}
}
/*
* Clear region if :
* 1 ) It is in - sync
* 2 ) There are no other machines that have it marked
*/
if ( ! other_matches & & log_test_bit ( lc - > sync_bits , region ) )
log_set_bit ( lc , lc - > clean_bits , region ) ;
return 0 ;
}
/*
* clog_clear_region
* @ tfr
*
* tfr may contain more than one clear request . We
* can determine the number from the ' data_size ' field .
*
* Returns : 0 on success , - EXXX on failure
*/
static int clog_clear_region ( struct clog_tfr * tfr )
{
int r ;
int count ;
uint64_t * region ;
struct log_c * lc = get_log ( tfr - > uuid ) ;
if ( ! lc )
return - EINVAL ;
if ( tfr - > data_size % sizeof ( uint64_t ) ) {
LOG_ERROR ( " Bad data size given for clear_region request " ) ;
return - EINVAL ;
}
count = tfr - > data_size / sizeof ( uint64_t ) ;
region = ( uint64_t * ) & tfr - > data ;
for ( ; count > 0 ; count - - , region + + ) {
r = clear_region ( lc , * region , tfr - > originator ) ;
if ( r )
return r ;
}
tfr - > data_size = 0 ;
return 0 ;
}
/*
* clog_get_resync_work
* @ tfr
*
*/
static int clog_get_resync_work ( struct clog_tfr * tfr )
{
struct { int i ; uint64_t r ; } * pkg = ( void * ) tfr - > data ;
struct log_c * lc = get_log ( tfr - > uuid ) ;
if ( ! lc )
return - EINVAL ;
tfr - > data_size = sizeof ( * pkg ) ;
pkg - > i = 0 ;
if ( lc - > sync_search > = lc - > region_count ) {
/*
* FIXME : handle intermittent errors during recovery
* by resetting sync_search . . . but not to many times .
*/
LOG_SPRINT ( " GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
" Recovery finished " ,
tfr - > seq , SHORT_UUID ( lc - > uuid ) , tfr - > originator ) ;
return 0 ;
}
if ( lc - > recovering_region ! = ( uint64_t ) - 1 ) {
if ( lc - > recoverer = = tfr - > originator ) {
LOG_SPRINT ( " GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
" Re-requesting work (%llu) " ,
tfr - > seq , SHORT_UUID ( lc - > uuid ) , tfr - > originator ,
( unsigned long long ) lc - > recovering_region ) ;
pkg - > r = lc - > recovering_region ;
pkg - > i = 1 ;
2009-04-21 19:16:22 +00:00
LOG_COND ( log_resend_requests , " ***** RE-REQUEST ***** " ) ;
2009-01-08 17:12:33 +00:00
} else {
LOG_SPRINT ( " GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
" Someone already recovering (%llu) " ,
tfr - > seq , SHORT_UUID ( lc - > uuid ) , tfr - > originator ,
( unsigned long long ) lc - > recovering_region ) ;
}
return 0 ;
}
while ( lc - > recovery_request_list ) {
struct recovery_request * del ;
del = lc - > recovery_request_list ;
lc - > recovery_request_list = del - > next ;
pkg - > r = del - > region ;
free ( del ) ;
if ( ! log_test_bit ( lc - > sync_bits , pkg - > r ) ) {
LOG_SPRINT ( " GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
" Assigning priority resync work (%llu) " ,
tfr - > seq , SHORT_UUID ( lc - > uuid ) , tfr - > originator ,
( unsigned long long ) pkg - > r ) ;
pkg - > i = 1 ;
lc - > recovering_region = pkg - > r ;
lc - > recoverer = tfr - > originator ;
return 0 ;
}
}
pkg - > r = find_next_zero_bit ( lc - > sync_bits ,
lc - > region_count ,
lc - > sync_search ) ;
if ( pkg - > r > = lc - > region_count ) {
LOG_SPRINT ( " GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
" Resync work complete. " ,
tfr - > seq , SHORT_UUID ( lc - > uuid ) , tfr - > originator ) ;
return 0 ;
}
lc - > sync_search = pkg - > r + 1 ;
LOG_SPRINT ( " GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
" Assigning resync work (%llu) " ,
tfr - > seq , SHORT_UUID ( lc - > uuid ) , tfr - > originator ,
( unsigned long long ) pkg - > r ) ;
pkg - > i = 1 ;
lc - > recovering_region = pkg - > r ;
lc - > recoverer = tfr - > originator ;
return 0 ;
}
/*
 * clog_set_region_sync
 * @tfr
 *
 * Records the outcome of recovering one region.  tfr->data is read as
 * { uint64_t region; int in_sync; }.  The matching bit in sync_bits is set
 * or cleared, sync_count is adjusted, and sync_count is then cross-checked
 * against the number of bits actually set in sync_bits.
 *
 * NOTE(review): pkg->region is used as a bit index without a range check
 * against lc->region_count - confirm callers guarantee it is valid.
 *
 * Returns: 0 on success, -EINVAL if the log is not on the official list
 */
static int clog_set_region_sync ( struct clog_tfr * tfr )
{
struct { uint64_t region ; int in_sync ; } * pkg = ( void * ) tfr - > data ;
struct log_c * lc = get_log ( tfr - > uuid ) ;
if ( ! lc )
return - EINVAL ;
/* Whatever the outcome, no region is being recovered any more */
lc - > recovering_region = ( uint64_t ) - 1 ;
if ( pkg - > in_sync ) {
if ( log_test_bit ( lc - > sync_bits , pkg - > region ) ) {
LOG_SPRINT ( " SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
" Region already set (%llu) " ,
tfr - > seq , SHORT_UUID ( lc - > uuid ) , tfr - > originator ,
( unsigned long long ) pkg - > region ) ;
} else {
log_set_bit ( lc , lc - > sync_bits , pkg - > region ) ;
lc - > sync_count + + ;
2009-04-21 19:16:22 +00:00
/* The rest of this section is all for debugging */
2009-01-08 17:12:33 +00:00
LOG_SPRINT ( " SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
" Setting region (%llu) " ,
tfr - > seq , SHORT_UUID ( lc - > uuid ) , tfr - > originator ,
( unsigned long long ) pkg - > region ) ;
2009-04-21 19:16:22 +00:00
/* The region we were watching has now been set: disarm the warning */
if ( pkg - > region = = lc - > skip_bit_warning )
lc - > skip_bit_warning = lc - > region_count ;
/* More than 5 regions past the watched one without it being set */
if ( pkg - > region > ( lc - > skip_bit_warning + 5 ) ) {
LOG_ERROR ( " *** Region #%llu skipped during recovery *** " ,
( unsigned long long ) lc - > skip_bit_warning ) ;
lc - > skip_bit_warning = lc - > region_count ;
# ifdef DEBUG
kill ( getpid ( ) , SIGUSR1 ) ;
# endif
}
/* Arm the warning when the preceding region is still out of sync */
if ( ! log_test_bit ( lc - > sync_bits ,
( pkg - > region ) ? pkg - > region - 1 : 0 ) ) {
LOG_SPRINT ( " *** Previous bit not set *** " ) ;
lc - > skip_bit_warning = ( pkg - > region ) ?
pkg - > region - 1 : 0 ;
}
2009-01-08 17:12:33 +00:00
}
} else if ( log_test_bit ( lc - > sync_bits , pkg - > region ) ) {
/* Region reported out of sync while its bit was set: clear it */
lc - > sync_count - - ;
log_clear_bit ( lc , lc - > sync_bits , pkg - > region ) ;
LOG_SPRINT ( " SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
" Unsetting region (%llu) " ,
tfr - > seq , SHORT_UUID ( lc - > uuid ) , tfr - > originator ,
( unsigned long long ) pkg - > region ) ;
}
/* Cross-check the running counter against the bitmap and repair it */
if ( lc - > sync_count ! = count_bits32 ( lc - > sync_bits , lc - > bitset_uint32_count ) ) {
unsigned long long reset = count_bits32 ( lc - > sync_bits , lc - > bitset_uint32_count ) ;
LOG_SPRINT ( " SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
" sync_count(%llu) != bitmap count(%llu) " ,
tfr - > seq , SHORT_UUID ( lc - > uuid ) , tfr - > originator ,
( unsigned long long ) lc - > sync_count , reset ) ;
2009-04-21 19:16:22 +00:00
# ifdef DEBUG
kill ( getpid ( ) , SIGUSR1 ) ;
# endif
2009-01-08 17:12:33 +00:00
lc - > sync_count = reset ;
}
if ( lc - > sync_count > lc - > region_count )
LOG_SPRINT ( " SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
" (lc->sync_count > lc->region_count) - this is bad " ,
tfr - > seq , SHORT_UUID ( lc - > uuid ) , tfr - > originator ) ;
/* No data is returned to the caller */
tfr - > data_size = 0 ;
return 0 ;
}
/*
* clog_get_sync_count
* @ tfr
*/
static int clog_get_sync_count ( struct clog_tfr * tfr )
{
uint64_t * sync_count = ( uint64_t * ) tfr - > data ;
struct log_c * lc = get_log ( tfr - > uuid ) ;
/*
* FIXME : Mirror requires us to be able to ask for
* the sync count while pending . . . but I don ' t like
* it because other machines may not be suspended and
* the stored value may not be accurate .
*/
if ( ! lc )
lc = get_pending_log ( tfr - > uuid ) ;
if ( ! lc )
return - EINVAL ;
* sync_count = lc - > sync_count ;
tfr - > data_size = sizeof ( * sync_count ) ;
2009-04-21 19:16:22 +00:00
if ( lc - > sync_count ! = count_bits32 ( lc - > sync_bits , lc - > bitset_uint32_count ) ) {
unsigned long long reset = count_bits32 ( lc - > sync_bits , lc - > bitset_uint32_count ) ;
LOG_SPRINT ( " get_sync_count - SEQ#=%u, UUID=%s, nodeid = %u:: "
" sync_count(%llu) != bitmap count(%llu) " ,
tfr - > seq , SHORT_UUID ( lc - > uuid ) , tfr - > originator ,
( unsigned long long ) lc - > sync_count , reset ) ;
# ifdef DEBUG
kill ( getpid ( ) , SIGUSR1 ) ;
# endif
lc - > sync_count = reset ;
}
2009-01-08 17:12:33 +00:00
return 0 ;
}
static int core_status_info ( struct log_c * lc , struct clog_tfr * tfr )
{
char * data = ( char * ) tfr - > data ;
tfr - > data_size = sprintf ( data , " 1 clustered_core " ) ;
return 0 ;
}
static int disk_status_info ( struct log_c * lc , struct clog_tfr * tfr )
{
char * data = ( char * ) tfr - > data ;
struct stat statbuf ;
if ( fstat ( lc - > disk_fd , & statbuf ) ) {
tfr - > error = - errno ;
return - errno ;
}
tfr - > data_size = sprintf ( data , " 3 clustered_disk %d:%d %c " ,
major ( statbuf . st_rdev ) , minor ( statbuf . st_rdev ) ,
( lc - > log_dev_failed ) ? ' D ' : ' A ' ) ;
return 0 ;
}
/*
* clog_status_info
* @ tfr
*
*/
static int clog_status_info ( struct clog_tfr * tfr )
{
int r ;
struct log_c * lc = get_log ( tfr - > uuid ) ;
if ( ! lc )
lc = get_pending_log ( tfr - > uuid ) ;
if ( ! lc )
return - EINVAL ;
if ( lc - > disk_fd = = - 1 )
r = core_status_info ( lc , tfr ) ;
else
r = disk_status_info ( lc , tfr ) ;
return r ;
}
static int core_status_table ( struct log_c * lc , struct clog_tfr * tfr )
{
int params ;
char * data = ( char * ) tfr - > data ;
params = ( lc - > sync = = DEFAULTSYNC ) ? 3 : 4 ;
tfr - > data_size = sprintf ( data , " clustered_core %d %u %s %s%s " ,
params , lc - > region_size , lc - > uuid ,
( lc - > sync = = DEFAULTSYNC ) ? " " :
( lc - > sync = = NOSYNC ) ? " nosync " : " sync " ,
( lc - > block_on_error ) ? " block_on_error " : " " ) ;
return 0 ;
}
static int disk_status_table ( struct log_c * lc , struct clog_tfr * tfr )
{
int params ;
char * data = ( char * ) tfr - > data ;
struct stat statbuf ;
if ( fstat ( lc - > disk_fd , & statbuf ) ) {
tfr - > error = - errno ;
return - errno ;
}
params = ( lc - > sync = = DEFAULTSYNC ) ? 4 : 5 ;
tfr - > data_size = sprintf ( data , " clustered_disk %d %d:%d %u %s %s%s " ,
params , major ( statbuf . st_rdev ) , minor ( statbuf . st_rdev ) ,
lc - > region_size , lc - > uuid ,
( lc - > sync = = DEFAULTSYNC ) ? " " :
( lc - > sync = = NOSYNC ) ? " nosync " : " sync " ,
( lc - > block_on_error ) ? " block_on_error " : " " ) ;
return 0 ;
}
/*
* clog_status_table
* @ tfr
*
*/
static int clog_status_table ( struct clog_tfr * tfr )
{
int r ;
struct log_c * lc = get_log ( tfr - > uuid ) ;
if ( ! lc )
lc = get_pending_log ( tfr - > uuid ) ;
if ( ! lc )
return - EINVAL ;
if ( lc - > disk_fd = = - 1 )
r = core_status_table ( lc , tfr ) ;
else
r = disk_status_table ( lc , tfr ) ;
return r ;
}
/*
* clog_is_remote_recovering
* @ tfr
*
*/
static int clog_is_remote_recovering ( struct clog_tfr * tfr )
{
uint64_t region = * ( ( uint64_t * ) ( tfr - > data ) ) ;
struct { int is_recovering ; uint64_t in_sync_hint ; } * pkg = ( void * ) tfr - > data ;
struct log_c * lc = get_log ( tfr - > uuid ) ;
if ( ! lc )
return - EINVAL ;
if ( region > lc - > region_count )
return - EINVAL ;
if ( lc - > recovery_halted ) {
LOG_DBG ( " [%s] Recovery halted... [not remote recovering]: %llu " ,
SHORT_UUID ( lc - > uuid ) , ( unsigned long long ) region ) ;
pkg - > is_recovering = 0 ;
pkg - > in_sync_hint = lc - > region_count ; /* none are recovering */
} else {
pkg - > is_recovering = ! log_test_bit ( lc - > sync_bits , region ) ;
/*
* Remember , ' lc - > sync_search ' is 1 plus the region
* currently being recovered . So , we must take off 1
* to account for that .
*/
pkg - > in_sync_hint = ( lc - > sync_search - 1 ) ;
LOG_DBG ( " [%s] Region is %s: %llu " ,
SHORT_UUID ( lc - > uuid ) ,
( region = = lc - > recovering_region ) ?
" currently remote recovering " :
( pkg - > is_recovering ) ? " pending remote recovery " :
" not remote recovering " , ( unsigned long long ) region ) ;
}
if ( pkg - > is_recovering & &
( region ! = lc - > recovering_region ) ) {
struct recovery_request * rr ;
/* Already in the list? */
for ( rr = lc - > recovery_request_list ; rr ; rr = rr - > next )
if ( rr - > region = = region )
goto out ;
/* Failure to allocated simply means we can't prioritize it */
rr = malloc ( sizeof ( * rr ) ) ;
if ( ! rr )
goto out ;
LOG_DBG ( " [%s] Adding region to priority list: %llu " ,
SHORT_UUID ( lc - > uuid ) , ( unsigned long long ) region ) ;
rr - > region = region ;
rr - > next = lc - > recovery_request_list ;
lc - > recovery_request_list = rr ;
}
out :
tfr - > data_size = sizeof ( * pkg ) ;
return 0 ;
}
/*
* do_request
* @ tfr : the request
* @ server : is this request performed by the server
*
* An inability to perform this function will return an error
* from this function . However , an inability to successfully
* perform the request will fill in the ' tfr - > error ' field .
*
* Returns : 0 on success , - EXXX on error
*/
int do_request ( struct clog_tfr * tfr , int server )
{
int r ;
if ( ! tfr )
return 0 ;
if ( tfr - > error )
LOG_DBG ( " Programmer error: tfr struct has error set " ) ;
switch ( tfr - > request_type ) {
case DM_CLOG_CTR :
r = clog_ctr ( tfr ) ;
break ;
case DM_CLOG_DTR :
r = clog_dtr ( tfr ) ;
break ;
case DM_CLOG_PRESUSPEND :
r = clog_presuspend ( tfr ) ;
break ;
case DM_CLOG_POSTSUSPEND :
r = clog_postsuspend ( tfr ) ;
break ;
case DM_CLOG_RESUME :
r = clog_resume ( tfr ) ;
break ;
case DM_CLOG_GET_REGION_SIZE :
r = clog_get_region_size ( tfr ) ;
break ;
case DM_CLOG_IS_CLEAN :
r = clog_is_clean ( tfr ) ;
break ;
case DM_CLOG_IN_SYNC :
r = clog_in_sync ( tfr ) ;
break ;
case DM_CLOG_FLUSH :
r = clog_flush ( tfr , server ) ;
break ;
case DM_CLOG_MARK_REGION :
r = clog_mark_region ( tfr ) ;
break ;
case DM_CLOG_CLEAR_REGION :
r = clog_clear_region ( tfr ) ;
break ;
case DM_CLOG_GET_RESYNC_WORK :
r = clog_get_resync_work ( tfr ) ;
break ;
case DM_CLOG_SET_REGION_SYNC :
r = clog_set_region_sync ( tfr ) ;
break ;
case DM_CLOG_GET_SYNC_COUNT :
r = clog_get_sync_count ( tfr ) ;
break ;
case DM_CLOG_STATUS_INFO :
r = clog_status_info ( tfr ) ;
break ;
case DM_CLOG_STATUS_TABLE :
r = clog_status_table ( tfr ) ;
break ;
case DM_CLOG_IS_REMOTE_RECOVERING :
r = clog_is_remote_recovering ( tfr ) ;
break ;
default :
LOG_ERROR ( " Unknown request " ) ;
r = tfr - > error = - EINVAL ;
break ;
}
if ( r & & ! tfr - > error )
tfr - > error = r ;
else if ( r ! = tfr - > error )
LOG_DBG ( " Warning: error from function != tfr->error " ) ;
if ( tfr - > error & & tfr - > data_size ) {
/* Make sure I'm handling errors correctly above */
LOG_DBG ( " Programmer error: tfr->error && tfr->data_size " ) ;
tfr - > data_size = 0 ;
}
return 0 ;
}
/* Emit one finished dump line through LOG_PRINT or LOG_DBG */
static void print_bits_line(const char *line, int print)
{
	if (print)
		LOG_PRINT(" %s ", line);
	else
		LOG_DBG(" %s ", line);
}

/*
 * print_bits
 * @buf: bytes to dump
 * @size: number of bytes in 'buf'
 * @print: non-zero to use LOG_PRINT, zero to use LOG_DBG
 *
 * Hex-dumps 'buf', 16 bytes per output line.  Each line starts with
 * the index range it covers.
 */
static void print_bits(char *buf, int size, int print)
{
	char line[128] = { 0 };
	int used = 0;	/* characters currently held in 'line' */
	int pos;

	for (pos = 0; pos < size; pos++) {
		if ((pos % 16) == 0) {
			/* Starting a new row: flush the previous one, if any */
			if (used)
				print_bits_line(line, print);

			memset(line, 0, sizeof(line));
			used = sprintf(line, " [%3d - %3d] ", pos, pos + 15);
		}

		used += sprintf(line + used, " %.2X ", (unsigned char) buf[pos]);
	}

	/* Flush the final, possibly partial, row */
	if (used)
		print_bits_line(line, print);
}
/* int store_bits(const char *uuid, const char *which, char **buf)*/
2009-04-21 19:16:22 +00:00
int push_state ( const char * uuid , const char * which , char * * buf , uint32_t debug_who )
2009-01-08 17:12:33 +00:00
{
int bitset_size ;
struct log_c * lc ;
if ( * buf )
LOG_ERROR ( " store_bits: *buf != NULL " ) ;
lc = get_log ( uuid ) ;
if ( ! lc ) {
LOG_ERROR ( " store_bits: No log found for %s " , uuid ) ;
return - EINVAL ;
}
if ( ! strcmp ( which , " recovering_region " ) ) {
* buf = malloc ( 64 ) ; /* easily handles the 2 written numbers */
if ( ! * buf )
return - ENOMEM ;
sprintf ( * buf , " %llu %u " , ( unsigned long long ) lc - > recovering_region ,
lc - > recoverer ) ;
2009-04-21 19:16:22 +00:00
LOG_SPRINT ( " CKPT SEND - SEQ#=X, UUID=%s, nodeid = %u:: "
" recovering_region=%llu, recoverer=%u, sync_count=%llu " ,
SHORT_UUID ( lc - > uuid ) , debug_who ,
( unsigned long long ) lc - > recovering_region ,
lc - > recoverer ,
( unsigned long long ) count_bits32 ( lc - > sync_bits , lc - > bitset_uint32_count ) ) ;
2009-01-08 17:12:33 +00:00
return 64 ;
}
bitset_size = lc - > bitset_uint32_count * sizeof ( * lc - > clean_bits ) ;
* buf = malloc ( bitset_size ) ;
if ( ! * buf ) {
LOG_ERROR ( " store_bits: Unable to allocate memory " ) ;
return - ENOMEM ;
}
if ( ! strncmp ( which , " sync_bits " , 9 ) ) {
memcpy ( * buf , lc - > sync_bits , bitset_size ) ;
LOG_DBG ( " [%s] storing sync_bits (sync_count = %llu): " ,
SHORT_UUID ( uuid ) , ( unsigned long long )
count_bits32 ( lc - > sync_bits , lc - > bitset_uint32_count ) ) ;
print_bits ( * buf , bitset_size , 0 ) ;
} else if ( ! strncmp ( which , " clean_bits " , 9 ) ) {
memcpy ( * buf , lc - > clean_bits , bitset_size ) ;
LOG_DBG ( " [%s] storing clean_bits: " , SHORT_UUID ( lc - > uuid ) ) ;
print_bits ( * buf , bitset_size , 0 ) ;
}
return bitset_size ;
}
/*int load_bits(const char *uuid, const char *which, char *buf, int size)*/
int pull_state ( const char * uuid , const char * which , char * buf , int size )
{
int bitset_size ;
struct log_c * lc ;
if ( ! buf )
LOG_ERROR ( " pull_state: buf == NULL " ) ;
lc = get_log ( uuid ) ;
if ( ! lc ) {
LOG_ERROR ( " pull_state: No log found for %s " , uuid ) ;
return - EINVAL ;
}
if ( ! strncmp ( which , " recovering_region " , 17 ) ) {
sscanf ( buf , " %llu %u " , ( unsigned long long * ) & lc - > recovering_region ,
& lc - > recoverer ) ;
LOG_SPRINT ( " CKPT INIT - SEQ#=X, UUID=%s, nodeid = X:: "
" recovering_region=%llu, recoverer=%u " ,
SHORT_UUID ( lc - > uuid ) ,
( unsigned long long ) lc - > recovering_region , lc - > recoverer ) ;
return 0 ;
}
bitset_size = lc - > bitset_uint32_count * sizeof ( * lc - > clean_bits ) ;
if ( bitset_size ! = size ) {
LOG_ERROR ( " pull_state(%s): bad bitset_size (%d vs %d) " ,
which , size , bitset_size ) ;
return - EINVAL ;
}
if ( ! strncmp ( which , " sync_bits " , 9 ) ) {
lc - > resume_override + = 1 ;
memcpy ( lc - > sync_bits , buf , bitset_size ) ;
LOG_DBG ( " [%s] loading sync_bits (sync_count = %llu): " ,
SHORT_UUID ( lc - > uuid ) , ( unsigned long long )
count_bits32 ( lc - > sync_bits , lc - > bitset_uint32_count ) ) ;
print_bits ( ( char * ) lc - > sync_bits , bitset_size , 0 ) ;
} else if ( ! strncmp ( which , " clean_bits " , 9 ) ) {
lc - > resume_override + = 2 ;
memcpy ( lc - > clean_bits , buf , bitset_size ) ;
LOG_DBG ( " [%s] loading clean_bits: " , SHORT_UUID ( lc - > uuid ) ) ;
print_bits ( ( char * ) lc - > clean_bits , bitset_size , 0 ) ;
}
return 0 ;
}
int log_get_state ( struct clog_tfr * tfr )
{
struct log_c * lc ;
lc = get_log ( tfr - > uuid ) ;
if ( ! lc )
return - EINVAL ;
return lc - > state ;
}
/*
* log_status
*
* Returns : 1 if logs are still present , 0 otherwise
*/
int log_status ( void )
{
struct list_head * l ;
__list_for_each ( l , & log_list )
return 1 ;
__list_for_each ( l , & log_pending_list )
return 1 ;
return 0 ;
}
void log_debug ( void )
{
struct list_head * l ;
struct log_c * lc ;
uint64_t r ;
int i ;
LOG_ERROR ( " " ) ;
LOG_ERROR ( " LOG COMPONENT DEBUGGING:: " ) ;
LOG_ERROR ( " Official log list: " ) ;
__list_for_each ( l , & log_list ) {
lc = list_entry ( l , struct log_c , list ) ;
LOG_ERROR ( " %s " , lc - > uuid ) ;
LOG_ERROR ( " recoverer : %u " , lc - > recoverer ) ;
LOG_ERROR ( " recovering_region: %llu " ,
( unsigned long long ) lc - > recovering_region ) ;
LOG_ERROR ( " recovery_halted : %s " , ( lc - > recovery_halted ) ?
" YES " : " NO " ) ;
LOG_ERROR ( " sync_bits: " ) ;
print_bits ( ( char * ) lc - > sync_bits ,
lc - > bitset_uint32_count * sizeof ( * lc - > sync_bits ) , 1 ) ;
LOG_ERROR ( " clean_bits: " ) ;
print_bits ( ( char * ) lc - > clean_bits ,
lc - > bitset_uint32_count * sizeof ( * lc - > clean_bits ) , 1 ) ;
}
LOG_ERROR ( " Pending log list: " ) ;
__list_for_each ( l , & log_pending_list ) {
lc = list_entry ( l , struct log_c , list ) ;
LOG_ERROR ( " %s " , lc - > uuid ) ;
LOG_ERROR ( " sync_bits: " ) ;
print_bits ( ( char * ) lc - > sync_bits ,
lc - > bitset_uint32_count * sizeof ( * lc - > sync_bits ) , 1 ) ;
LOG_ERROR ( " clean_bits: " ) ;
print_bits ( ( char * ) lc - > clean_bits ,
lc - > bitset_uint32_count * sizeof ( * lc - > clean_bits ) , 1 ) ;
}
__list_for_each ( l , & log_list ) {
lc = list_entry ( l , struct log_c , list ) ;
LOG_ERROR ( " Validating %s:: " , SHORT_UUID ( lc - > uuid ) ) ;
r = find_next_zero_bit ( lc - > sync_bits , lc - > region_count , 0 ) ;
LOG_ERROR ( " lc->region_count = %llu " ,
( unsigned long long ) lc - > region_count ) ;
LOG_ERROR ( " lc->sync_count = %llu " ,
( unsigned long long ) lc - > sync_count ) ;
LOG_ERROR ( " next zero bit = %llu " ,
( unsigned long long ) r ) ;
if ( ( r > lc - > region_count ) | |
( ( r = = lc - > region_count ) & & ( lc - > sync_count > lc - > region_count ) ) ) {
LOG_ERROR ( " ADJUSTING SYNC_COUNT " ) ;
lc - > sync_count = lc - > region_count ;
}
}
LOG_ERROR ( " Resync request history: " ) ;
for ( i = 0 ; i < RESYNC_HISTORY ; i + + ) {
idx + + ;
idx = idx % RESYNC_HISTORY ;
if ( resync_history [ idx ] [ 0 ] = = ' \0 ' )
continue ;
LOG_ERROR ( " %d:%d) %s " , i , idx , resync_history [ idx ] ) ;
}
}