2007-05-04 02:30:18 +04:00
/*
   ctdb recovery daemon

   Copyright (C) Ronnie Sahlberg  2007

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
2015-10-26 08:50:46 +03:00
# include "replace.h"
2007-05-04 02:30:18 +04:00
# include "system/filesys.h"
2007-05-10 08:06:48 +04:00
# include "system/time.h"
2007-09-14 04:16:36 +04:00
# include "system/network.h"
2007-10-22 06:34:08 +04:00
# include "system/wait.h"
2015-10-26 08:50:46 +03:00
# include <popt.h>
# include <talloc.h>
# include <tevent.h>
# include <tdb.h>
2014-08-15 09:46:33 +04:00
# include "lib/tdb_wrap/tdb_wrap.h"
2014-08-15 10:18:05 +04:00
# include "lib/util/dlinklist.h"
2015-10-26 08:50:46 +03:00
# include "lib/util/debug.h"
# include "lib/util/samba_util.h"
2015-09-24 02:10:59 +03:00
# include "lib/util/util_process.h"
2015-10-26 08:50:46 +03:00
# include "ctdb_private.h"
# include "ctdb_client.h"
2015-10-23 06:11:53 +03:00
# include "common/system.h"
2015-10-26 08:50:46 +03:00
# include "common/cmdline.h"
2015-10-23 06:17:34 +03:00
# include "common/common.h"
2015-11-11 07:41:10 +03:00
# include "common/logging.h"
2007-05-04 02:30:18 +04:00
2016-02-17 12:20:03 +03:00
# include "ctdb_cluster_mutex.h"
2007-06-07 10:34:33 +04:00
2013-08-16 14:02:34 +04:00
/*
 * One queued SRVID request that still needs to be processed.
 * Entries are chained together through the next/prev pointers.
 */
struct srvid_list {
	struct srvid_list *next;
	struct srvid_list *prev;
	struct ctdb_srvid_message *request;
};
/* Container holding the head of a list of queued SRVID requests */
struct srvid_requests {
	struct srvid_list *requests;
};
2013-08-16 14:02:34 +04:00
static void srvid_request_reply ( struct ctdb_context * ctdb ,
2015-10-29 06:32:49 +03:00
struct ctdb_srvid_message * request ,
2013-08-16 14:02:34 +04:00
TDB_DATA result )
{
/* Someone that sent srvid==0 does not want a reply */
if ( request - > srvid = = 0 ) {
talloc_free ( request ) ;
return ;
}
if ( ctdb_client_send_message ( ctdb , request - > pnn , request - > srvid ,
result ) = = 0 ) {
DEBUG ( DEBUG_INFO , ( " Sent SRVID reply to %u:%llu \n " ,
( unsigned ) request - > pnn ,
( unsigned long long ) request - > srvid ) ) ;
} else {
DEBUG ( DEBUG_ERR , ( " Failed to send SRVID reply to %u:%llu \n " ,
( unsigned ) request - > pnn ,
( unsigned long long ) request - > srvid ) ) ;
}
talloc_free ( request ) ;
}
/*
 * Answer every queued request in *requests with the same "result" and
 * then dispose of the whole queue.  On return *requests is NULL.
 * A NULL queue is a no-op.
 */
static void srvid_requests_reply(struct ctdb_context *ctdb,
				 struct srvid_requests **requests,
				 TDB_DATA result)
{
	struct srvid_list *item;

	if (*requests == NULL) {
		return;
	}

	item = (*requests)->requests;
	while (item != NULL) {
		/* Frees item->request; the list node itself stays valid */
		srvid_request_reply(ctdb, item->request, result);
		item = item->next;
	}

	/* Free the list structure... */
	TALLOC_FREE(*requests);
}
static void srvid_request_add ( struct ctdb_context * ctdb ,
struct srvid_requests * * requests ,
2015-10-29 06:32:49 +03:00
struct ctdb_srvid_message * request )
2013-08-16 14:02:34 +04:00
{
struct srvid_list * t ;
int32_t ret ;
TDB_DATA result ;
if ( * requests = = NULL ) {
* requests = talloc_zero ( ctdb , struct srvid_requests ) ;
if ( * requests = = NULL ) {
goto nomem ;
}
}
t = talloc_zero ( * requests , struct srvid_list ) ;
if ( t = = NULL ) {
/* If *requests was just allocated above then free it */
if ( ( * requests ) - > requests = = NULL ) {
TALLOC_FREE ( * requests ) ;
}
goto nomem ;
}
2015-10-29 06:32:49 +03:00
t - > request = ( struct ctdb_srvid_message * ) talloc_steal ( t , request ) ;
2013-08-16 14:02:34 +04:00
DLIST_ADD ( ( * requests ) - > requests , t ) ;
return ;
nomem :
/* Failed to add the request to the list. Send a fail. */
DEBUG ( DEBUG_ERR , ( __location__
" Out of memory, failed to queue SRVID request \n " ) ) ;
ret = - ENOMEM ;
result . dsize = sizeof ( ret ) ;
result . dptr = ( uint8_t * ) & ret ;
srvid_request_reply ( ctdb , request , result ) ;
}
2015-02-08 12:50:38 +03:00
/*
 * An abstraction to allow an operation (takeover runs, recoveries, ...)
 * to be disabled for a given timeout.
 *
 * The operation counts as disabled while "timer" is non-NULL.
 */
struct ctdb_op_state {
	struct tevent_timer *timer;	/* pending re-enable timer, or NULL */
	bool in_progress;		/* set by ctdb_op_begin(), cleared by ctdb_op_end() */
	const char *name;		/* used in log messages */
};
/*
 * Allocate an operation state on mem_ctx.  The "name" pointer is stored
 * as-is (not copied).  Returns NULL if the allocation fails.
 */
static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
{
	struct ctdb_op_state *op;

	op = talloc_zero(mem_ctx, struct ctdb_op_state);
	if (op == NULL) {
		return NULL;
	}

	op->in_progress = false;
	op->name = name;
	return op;
}
static bool ctdb_op_is_disabled ( struct ctdb_op_state * state )
{
return state - > timer ! = NULL ;
}
static bool ctdb_op_begin ( struct ctdb_op_state * state )
{
if ( ctdb_op_is_disabled ( state ) ) {
DEBUG ( DEBUG_NOTICE ,
( " Unable to begin - %s are disabled \n " , state - > name ) ) ;
return false ;
}
state - > in_progress = true ;
return true ;
}
static bool ctdb_op_end ( struct ctdb_op_state * state )
{
return state - > in_progress = false ;
}
static bool ctdb_op_is_in_progress ( struct ctdb_op_state * state )
{
return state - > in_progress ;
}
static void ctdb_op_enable ( struct ctdb_op_state * state )
{
TALLOC_FREE ( state - > timer ) ;
}
2015-10-26 08:50:09 +03:00
static void ctdb_op_timeout_handler ( struct tevent_context * ev ,
struct tevent_timer * te ,
2015-02-08 12:50:38 +03:00
struct timeval yt , void * p )
{
struct ctdb_op_state * state =
talloc_get_type ( p , struct ctdb_op_state ) ;
DEBUG ( DEBUG_NOTICE , ( " Reenabling %s after timeout \n " , state - > name ) ) ;
ctdb_op_enable ( state ) ;
}
static int ctdb_op_disable ( struct ctdb_op_state * state ,
struct tevent_context * ev ,
uint32_t timeout )
{
if ( timeout = = 0 ) {
DEBUG ( DEBUG_NOTICE , ( " Reenabling %s \n " , state - > name ) ) ;
ctdb_op_enable ( state ) ;
return 0 ;
}
if ( state - > in_progress ) {
DEBUG ( DEBUG_ERR ,
( " Unable to disable %s - in progress \n " , state - > name ) ) ;
return - EAGAIN ;
}
DEBUG ( DEBUG_NOTICE , ( " Disabling %s for %u seconds \n " ,
state - > name , timeout ) ) ;
/* Clear any old timers */
talloc_free ( state - > timer ) ;
/* Arrange for the timeout to occur */
state - > timer = tevent_add_timer ( ev , state ,
timeval_current_ofs ( timeout , 0 ) ,
ctdb_op_timeout_handler , state ) ;
if ( state - > timer = = NULL ) {
DEBUG ( DEBUG_ERR , ( __location__ " Unable to setup timer \n " ) ) ;
return - ENOMEM ;
}
return 0 ;
}
2009-09-03 20:20:39 +04:00
/* Per-node record of accumulated culprit credits */
struct ctdb_banning_state {
	uint32_t count;				/* credits accumulated so far */
	struct timeval last_reported_time;	/* when credits were last added */
};
2007-06-07 09:18:55 +04:00
/*
 * private state of recovery daemon
 */
struct ctdb_recoverd {
	struct ctdb_context *ctdb;
	uint32_t recmaster;
	uint32_t last_culprit_node;	/* most recent node given culprit credits */
	struct ctdb_node_map_old *nodemap;
	struct timeval priority_time;
	bool need_takeover_run;
	bool need_recovery;
	uint32_t node_flags;		/* tested against NODE_FLAGS_INACTIVE */
	struct tevent_timer *send_election_te;
	struct tevent_timer *election_timeout;
	struct srvid_requests *reallocate_requests;
	struct ctdb_op_state *takeover_run;
	struct ctdb_op_state *recovery;
	struct ctdb_iface_list_old *ifaces;
	uint32_t *force_rebalance_nodes;
	struct ctdb_node_capabilities *caps;	/* replaced by update_capabilities() */
	bool frozen_on_inactive;
	struct ctdb_cluster_mutex_handle *recovery_lock_handle;
};
2007-05-04 02:30:18 +04:00
2007-06-04 14:22:44 +04:00
/*
 * Timeouts derived from tunables.  Both macros expect a variable named
 * "ctdb" (struct ctdb_context *) to be in scope at the point of use.
 */
#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
2007-05-24 07:49:27 +04:00
2015-10-26 08:50:09 +03:00
/* Forward declaration; the definition is not part of this section */
static void ctdb_restart_recd(struct tevent_context *ev,
			      struct tevent_timer *te,
			      struct timeval t,
			      void *private_data);
2008-01-05 01:35:43 +03:00
2007-06-07 10:34:33 +04:00
/*
ban a node for a period of time
*/
2007-09-04 04:33:10 +04:00
static void ctdb_ban_node ( struct ctdb_recoverd * rec , uint32_t pnn , uint32_t ban_time )
2007-06-07 10:34:33 +04:00
{
2009-09-03 20:20:39 +04:00
int ret ;
2007-06-07 10:34:33 +04:00
struct ctdb_context * ctdb = rec - > ctdb ;
2015-10-28 10:18:33 +03:00
struct ctdb_ban_state bantime ;
2007-09-04 04:33:10 +04:00
if ( ! ctdb_validate_pnn ( ctdb , pnn ) ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( " Bad pnn %u in ctdb_ban_node \n " , pnn ) ) ;
2007-06-07 10:48:31 +04:00
return ;
}
2013-06-24 08:18:58 +04:00
DEBUG ( DEBUG_NOTICE , ( " Banning node %u for %u seconds \n " , pnn , ban_time ) ) ;
2009-09-03 20:20:39 +04:00
bantime . pnn = pnn ;
bantime . time = ban_time ;
2007-11-23 04:36:14 +03:00
2009-09-03 20:20:39 +04:00
ret = ctdb_ctrl_set_ban ( ctdb , CONTROL_TIMEOUT ( ) , pnn , & bantime ) ;
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " Failed to ban node %d \n " , pnn ) ) ;
2007-12-03 07:45:53 +03:00
return ;
2007-06-07 12:37:27 +04:00
}
2007-06-07 10:34:33 +04:00
}
2007-08-27 04:31:22 +04:00
/* Result codes for monitoring steps (users are outside this section) */
enum monitor_result {
	MONITOR_OK,
	MONITOR_RECOVERY_NEEDED,
	MONITOR_ELECTION_NEEDED,
	MONITOR_FAILED
};
2008-06-12 10:53:36 +04:00
/*
remember the trouble maker
*/
2009-09-03 20:20:39 +04:00
static void ctdb_set_culprit_count ( struct ctdb_recoverd * rec , uint32_t culprit , uint32_t count )
2008-06-12 10:53:36 +04:00
{
2009-09-03 20:20:39 +04:00
struct ctdb_context * ctdb = talloc_get_type ( rec - > ctdb , struct ctdb_context ) ;
struct ctdb_banning_state * ban_state ;
if ( culprit > ctdb - > num_nodes ) {
DEBUG ( DEBUG_ERR , ( " Trying to set culprit %d but num_nodes is %d \n " , culprit , ctdb - > num_nodes ) ) ;
return ;
}
2013-06-28 08:10:47 +04:00
/* If we are banned or stopped, do not set other nodes as culprits */
if ( rec - > node_flags & NODE_FLAGS_INACTIVE ) {
DEBUG ( DEBUG_NOTICE , ( " This node is INACTIVE, cannot set culprit node %d \n " , culprit ) ) ;
return ;
}
2009-09-03 20:20:39 +04:00
if ( ctdb - > nodes [ culprit ] - > ban_state = = NULL ) {
ctdb - > nodes [ culprit ] - > ban_state = talloc_zero ( ctdb - > nodes [ culprit ] , struct ctdb_banning_state ) ;
CTDB_NO_MEMORY_VOID ( ctdb , ctdb - > nodes [ culprit ] - > ban_state ) ;
2008-06-12 10:53:36 +04:00
2009-09-03 20:20:39 +04:00
}
ban_state = ctdb - > nodes [ culprit ] - > ban_state ;
if ( timeval_elapsed ( & ban_state - > last_reported_time ) > ctdb - > tunable . recovery_grace_period ) {
/* this was the first time in a long while this node
misbehaved so we will forgive any old transgressions .
*/
ban_state - > count = 0 ;
2008-06-12 10:53:36 +04:00
}
2009-09-03 20:20:39 +04:00
ban_state - > count + = count ;
ban_state - > last_reported_time = timeval_current ( ) ;
rec - > last_culprit_node = culprit ;
2008-06-12 10:53:36 +04:00
}
2009-04-24 07:58:32 +04:00
/*
 * remember the trouble maker
 *
 * Shorthand for charging a single culprit credit to "culprit".
 */
static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
{
	const uint32_t one_credit = 1;

	ctdb_set_culprit_count(rec, culprit, one_credit);
}
2008-06-12 10:53:36 +04:00
2009-09-03 20:20:39 +04:00
2012-09-24 08:32:04 +04:00
/*
 * this callback is called for every node that failed to execute the
 * recovered event; the node is charged one culprit credit.
 * callback_data is the struct ctdb_recoverd.
 */
static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *recd;

	recd = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit \n ", node_pnn));

	ctdb_set_culprit(recd, node_pnn);
}
/*
run the " recovered " eventscript on all nodes
*/
2015-10-29 09:22:48 +03:00
static int run_recovered_eventscript ( struct ctdb_recoverd * rec , struct ctdb_node_map_old * nodemap , const char * caller )
2012-09-24 08:32:04 +04:00
{
TALLOC_CTX * tmp_ctx ;
uint32_t * nodes ;
struct ctdb_context * ctdb = rec - > ctdb ;
tmp_ctx = talloc_new ( ctdb ) ;
CTDB_NO_MEMORY ( ctdb , tmp_ctx ) ;
nodes = list_of_active_nodes ( ctdb , nodemap , tmp_ctx , true ) ;
if ( ctdb_client_async_control ( ctdb , CTDB_CONTROL_END_RECOVERY ,
nodes , 0 ,
CONTROL_TIMEOUT ( ) , false , tdb_null ,
NULL , recovered_fail_callback ,
rec ) ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " Unable to run the 'recovered' event when called from %s \n " , caller ) ) ;
talloc_free ( tmp_ctx ) ;
return - 1 ;
}
talloc_free ( tmp_ctx ) ;
return 0 ;
}
2008-06-12 10:53:36 +04:00
/*
 * this callback is called for every node that failed to execute the
 * start recovery event; the node is charged one culprit credit.
 * callback_data is the struct ctdb_recoverd.
 */
static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *recd;

	recd = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit \n ", node_pnn));

	ctdb_set_culprit(recd, node_pnn);
}
2008-01-29 05:59:28 +03:00
/*
run the " startrecovery " eventscript on all nodes
*/
2015-10-29 09:22:48 +03:00
static int run_startrecovery_eventscript ( struct ctdb_recoverd * rec , struct ctdb_node_map_old * nodemap )
2008-01-29 05:59:28 +03:00
{
TALLOC_CTX * tmp_ctx ;
2008-06-12 10:53:36 +04:00
uint32_t * nodes ;
struct ctdb_context * ctdb = rec - > ctdb ;
2007-06-07 10:34:33 +04:00
2008-01-29 05:59:28 +03:00
tmp_ctx = talloc_new ( ctdb ) ;
CTDB_NO_MEMORY ( ctdb , tmp_ctx ) ;
2008-06-12 10:53:36 +04:00
nodes = list_of_active_nodes ( ctdb , nodemap , tmp_ctx , true ) ;
2008-01-29 05:59:28 +03:00
if ( ctdb_client_async_control ( ctdb , CTDB_CONTROL_START_RECOVERY ,
2009-10-12 05:08:39 +04:00
nodes , 0 ,
2008-06-12 10:53:36 +04:00
CONTROL_TIMEOUT ( ) , false , tdb_null ,
NULL ,
startrecovery_fail_callback ,
rec ) ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to run the 'startrecovery' event. Recovery failed. \n " ) ) ;
2008-01-29 05:59:28 +03:00
talloc_free ( tmp_ctx ) ;
return - 1 ;
}
talloc_free ( tmp_ctx ) ;
return 0 ;
}
2008-01-06 04:38:01 +03:00
2008-05-06 09:42:59 +04:00
/*
2015-10-27 07:09:33 +03:00
Retrieve capabilities from all connected nodes
2008-05-06 09:42:59 +04:00
*/
2014-07-31 09:26:03 +04:00
static int update_capabilities ( struct ctdb_recoverd * rec ,
2015-10-29 09:22:48 +03:00
struct ctdb_node_map_old * nodemap )
2008-05-06 09:42:59 +04:00
{
2014-07-31 09:26:03 +04:00
uint32_t * capp ;
2008-05-06 09:42:59 +04:00
TALLOC_CTX * tmp_ctx ;
2014-07-31 09:26:03 +04:00
struct ctdb_node_capabilities * caps ;
struct ctdb_context * ctdb = rec - > ctdb ;
2008-05-06 09:42:59 +04:00
2014-07-31 09:26:03 +04:00
tmp_ctx = talloc_new ( rec ) ;
2008-05-06 09:42:59 +04:00
CTDB_NO_MEMORY ( ctdb , tmp_ctx ) ;
2014-07-31 09:26:03 +04:00
caps = ctdb_get_capabilities ( ctdb , tmp_ctx ,
CONTROL_TIMEOUT ( ) , nodemap ) ;
if ( caps = = NULL ) {
DEBUG ( DEBUG_ERR ,
( __location__ " Failed to get node capabilities \n " ) ) ;
2008-05-06 09:42:59 +04:00
talloc_free ( tmp_ctx ) ;
return - 1 ;
}
2014-07-31 09:26:03 +04:00
capp = ctdb_get_node_capabilities ( caps , ctdb_get_pnn ( ctdb ) ) ;
if ( capp = = NULL ) {
DEBUG ( DEBUG_ERR ,
( __location__
" Capabilities don't include current node. \n " ) ) ;
talloc_free ( tmp_ctx ) ;
return - 1 ;
}
ctdb - > capabilities = * capp ;
TALLOC_FREE ( rec - > caps ) ;
rec - > caps = talloc_steal ( rec , caps ) ;
2008-05-06 09:42:59 +04:00
talloc_free ( tmp_ctx ) ;
return 0 ;
}
2009-10-08 09:45:25 +04:00
/*
 * Called for every node that failed to freeze during recovery.  The node
 * is charged rec->nodemap->num culprit credits.
 * callback_data is the struct ctdb_recoverd.
 */
static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *recd;

	recd = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR, (" Failed to freeze node %u during recovery. Set it as ban culprit for %d credits \n ", node_pnn, recd->nodemap->num));

	ctdb_set_culprit_count(recd, node_pnn, recd->nodemap->num);
}
2009-10-12 09:48:05 +04:00
/*
 * Called for every node that failed to start the recovery transaction.
 * The node is charged rec->nodemap->num culprit credits.
 * callback_data is the struct ctdb_recoverd.
 */
static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *recd;

	recd = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR, (" Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits \n ", node_pnn, recd->nodemap->num));

	ctdb_set_culprit_count(recd, node_pnn, recd->nodemap->num);
}
2007-06-07 09:18:55 +04:00
/*
change recovery mode on all nodes
*/
2015-10-06 03:52:06 +03:00
static int set_recovery_mode ( struct ctdb_context * ctdb ,
struct ctdb_recoverd * rec ,
2015-10-29 09:22:48 +03:00
struct ctdb_node_map_old * nodemap ,
2015-10-06 03:52:06 +03:00
uint32_t rec_mode , bool freeze )
2007-05-06 03:53:12 +04:00
{
2008-01-06 04:38:01 +03:00
TDB_DATA data ;
2008-01-29 05:59:28 +03:00
uint32_t * nodes ;
TALLOC_CTX * tmp_ctx ;
tmp_ctx = talloc_new ( ctdb ) ;
CTDB_NO_MEMORY ( ctdb , tmp_ctx ) ;
2008-06-12 10:53:36 +04:00
nodes = list_of_active_nodes ( ctdb , nodemap , tmp_ctx , true ) ;
2014-05-06 08:24:52 +04:00
data . dsize = sizeof ( uint32_t ) ;
data . dptr = ( unsigned char * ) & rec_mode ;
if ( ctdb_client_async_control ( ctdb , CTDB_CONTROL_SET_RECMODE ,
nodes , 0 ,
CONTROL_TIMEOUT ( ) ,
false , data ,
NULL , NULL ,
NULL ) ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " Unable to set recovery mode. Recovery failed. \n " ) ) ;
talloc_free ( tmp_ctx ) ;
return - 1 ;
}
/* freeze all nodes */
2015-10-06 03:52:06 +03:00
if ( freeze & & rec_mode = = CTDB_RECOVERY_ACTIVE ) {
2009-10-12 05:08:39 +04:00
int i ;
for ( i = 1 ; i < = NUM_DB_PRIORITIES ; i + + ) {
if ( ctdb_client_async_control ( ctdb , CTDB_CONTROL_FREEZE ,
nodes , i ,
CONTROL_TIMEOUT ( ) ,
2008-06-12 10:53:36 +04:00
false , tdb_null ,
2009-10-08 09:45:25 +04:00
NULL ,
set_recmode_fail_callback ,
rec ) ! = 0 ) {
2009-10-12 05:08:39 +04:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to freeze nodes. Recovery failed. \n " ) ) ;
talloc_free ( tmp_ctx ) ;
return - 1 ;
}
2007-08-27 04:31:22 +04:00
}
}
2008-01-29 05:59:28 +03:00
talloc_free ( tmp_ctx ) ;
2007-05-06 03:53:12 +04:00
return 0 ;
}
2009-10-10 09:28:20 +04:00
/*
 * update all remote nodes to use the same db priority that we have
 * this can fail if the remote node has not yet been upgraded to
 * support this function, so we always return success and never fail
 * a recovery if this call fails.
 *
 * For each database in dbmap the priority is read from
 * CTDB_CURRENT_NODE and passed to ctdb_ctrl_set_db_priority().
 * nodemap, pnn and mem_ctx are not used by this implementation.
 */
static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
	struct ctdb_node_map_old *nodemap,
	uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
{
	int idx;

	/* step through all local databases */
	for (idx = 0; idx < dbmap->num; idx++) {
		struct ctdb_db_priority prio;
		uint32_t db_id = dbmap->dbs[idx].db_id;
		int status;

		prio.db_id = db_id;

		status = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[idx].db_id, &prio.priority);
		if (status != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Failed to read database priority from local node for db 0x%08x \n ", dbmap->dbs[idx].db_id));
			continue;
		}

		DEBUG(DEBUG_INFO, (" Update DB priority for db 0x%08x to %u \n ", dbmap->dbs[idx].db_id, prio.priority));

		status = ctdb_ctrl_set_db_priority(ctdb, CONTROL_TIMEOUT(),
						   CTDB_CURRENT_NODE, &prio);
		if (status != 0) {
			/* Deliberately best-effort: log and carry on */
			DEBUG(DEBUG_ERR, (__location__ " Failed to set DB priority for 0x%08x \n ",
					  prio.db_id));
		}
	}

	return 0;
}
2007-06-07 09:18:55 +04:00
/*
ensure all other nodes have attached to any databases that we have
*/
2015-10-29 09:22:48 +03:00
static int create_missing_remote_databases ( struct ctdb_context * ctdb , struct ctdb_node_map_old * nodemap ,
2015-10-29 09:46:05 +03:00
uint32_t pnn , struct ctdb_dbid_map_old * dbmap , TALLOC_CTX * mem_ctx )
2007-05-04 03:45:53 +04:00
{
2007-05-04 09:21:40 +04:00
int i , j , db , ret ;
2015-10-29 09:46:05 +03:00
struct ctdb_dbid_map_old * remote_dbmap ;
2007-05-04 09:21:40 +04:00
2007-05-06 00:58:01 +04:00
/* verify that all other nodes have all our databases */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
2015-07-27 00:02:57 +03:00
/* we don't need to ourself ourselves */
2007-09-04 04:33:10 +04:00
if ( nodemap - > nodes [ j ] . pnn = = pnn ) {
2007-05-06 00:58:01 +04:00
continue ;
}
2015-07-27 00:02:57 +03:00
/* don't check nodes that are unavailable */
2007-06-07 09:18:55 +04:00
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
2007-05-06 00:58:01 +04:00
continue ;
}
2007-09-04 03:50:07 +04:00
ret = ctdb_ctrl_getdbmap ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ j ] . pnn ,
2007-06-07 12:39:37 +04:00
mem_ctx , & remote_dbmap ) ;
2007-05-06 00:58:01 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to get dbids from node %u \n " , pnn ) ) ;
2007-05-06 00:58:01 +04:00
return - 1 ;
}
/* step through all local databases */
for ( db = 0 ; db < dbmap - > num ; db + + ) {
const char * name ;
for ( i = 0 ; i < remote_dbmap - > num ; i + + ) {
2015-10-29 09:46:05 +03:00
if ( dbmap - > dbs [ db ] . db_id = = remote_dbmap - > dbs [ i ] . db_id ) {
2007-05-06 00:58:01 +04:00
break ;
}
}
/* the remote node already have this database */
if ( i ! = remote_dbmap - > num ) {
continue ;
}
/* ok so we need to create this database */
2013-11-11 05:39:27 +04:00
ret = ctdb_ctrl_getdbname ( ctdb , CONTROL_TIMEOUT ( ) , pnn ,
2015-10-29 09:46:05 +03:00
dbmap - > dbs [ db ] . db_id , mem_ctx ,
2013-11-11 05:39:27 +04:00
& name ) ;
2007-05-06 00:58:01 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to get dbname from node %u \n " , pnn ) ) ;
2007-05-06 00:58:01 +04:00
return - 1 ;
}
2013-11-11 05:39:27 +04:00
ret = ctdb_ctrl_createdb ( ctdb , CONTROL_TIMEOUT ( ) ,
nodemap - > nodes [ j ] . pnn ,
mem_ctx , name ,
dbmap - > dbs [ db ] . flags & CTDB_DB_FLAGS_PERSISTENT ) ;
2007-05-06 00:58:01 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to create remote db:%s \n " , name ) ) ;
2007-05-06 00:58:01 +04:00
return - 1 ;
}
}
}
2007-05-04 09:21:40 +04:00
2007-05-06 04:04:37 +04:00
return 0 ;
}
2007-06-07 09:18:55 +04:00
/*
ensure we are attached to any databases that anyone else is attached to
*/
2015-10-29 09:22:48 +03:00
static int create_missing_local_databases ( struct ctdb_context * ctdb , struct ctdb_node_map_old * nodemap ,
2015-10-29 09:46:05 +03:00
uint32_t pnn , struct ctdb_dbid_map_old * * dbmap , TALLOC_CTX * mem_ctx )
2007-05-06 04:12:42 +04:00
{
int i , j , db , ret ;
2015-10-29 09:46:05 +03:00
struct ctdb_dbid_map_old * remote_dbmap ;
2007-05-06 04:12:42 +04:00
/* verify that we have all database any other node has */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
2015-07-27 00:02:57 +03:00
/* we don't need to ourself ourselves */
2007-09-04 04:33:10 +04:00
if ( nodemap - > nodes [ j ] . pnn = = pnn ) {
2007-05-06 04:12:42 +04:00
continue ;
}
2015-07-27 00:02:57 +03:00
/* don't check nodes that are unavailable */
2007-06-07 09:18:55 +04:00
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
2007-05-06 04:12:42 +04:00
continue ;
}
2007-09-04 03:50:07 +04:00
ret = ctdb_ctrl_getdbmap ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ j ] . pnn ,
2007-06-07 12:39:37 +04:00
mem_ctx , & remote_dbmap ) ;
2007-05-06 04:12:42 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to get dbids from node %u \n " , pnn ) ) ;
2007-05-06 04:12:42 +04:00
return - 1 ;
}
/* step through all databases on the remote node */
for ( db = 0 ; db < remote_dbmap - > num ; db + + ) {
const char * name ;
for ( i = 0 ; i < ( * dbmap ) - > num ; i + + ) {
2015-10-29 09:46:05 +03:00
if ( remote_dbmap - > dbs [ db ] . db_id = = ( * dbmap ) - > dbs [ i ] . db_id ) {
2007-05-06 04:12:42 +04:00
break ;
}
}
/* we already have this db locally */
if ( i ! = ( * dbmap ) - > num ) {
continue ;
}
/* ok so we need to create this database and
rebuild dbmap
*/
2007-09-04 03:50:07 +04:00
ctdb_ctrl_getdbname ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ j ] . pnn ,
2015-10-29 09:46:05 +03:00
remote_dbmap - > dbs [ db ] . db_id , mem_ctx , & name ) ;
2007-05-06 04:12:42 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to get dbname from node %u \n " ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn ) ) ;
2007-05-06 04:12:42 +04:00
return - 1 ;
}
2007-09-21 06:24:02 +04:00
ctdb_ctrl_createdb ( ctdb , CONTROL_TIMEOUT ( ) , pnn , mem_ctx , name ,
2013-08-13 07:55:47 +04:00
remote_dbmap - > dbs [ db ] . flags & CTDB_DB_FLAGS_PERSISTENT ) ;
2007-05-06 04:12:42 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to create local db:%s \n " , name ) ) ;
2007-05-06 04:12:42 +04:00
return - 1 ;
}
2007-09-04 04:33:10 +04:00
ret = ctdb_ctrl_getdbmap ( ctdb , CONTROL_TIMEOUT ( ) , pnn , mem_ctx , dbmap ) ;
2007-05-06 04:12:42 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to reread dbmap on node %u \n " , pnn ) ) ;
2007-05-06 04:12:42 +04:00
return - 1 ;
}
}
}
return 0 ;
}
2007-05-06 04:16:48 +04:00
2007-06-07 09:18:55 +04:00
/*
2008-01-06 04:38:01 +03:00
pull the remote database contents from one node into the recdb
2007-06-07 09:18:55 +04:00
*/
2015-03-31 06:03:43 +03:00
static int pull_one_remote_database ( struct ctdb_context * ctdb , uint32_t srcnode ,
2011-11-28 06:56:30 +04:00
struct tdb_wrap * recdb , uint32_t dbid )
2007-05-06 04:16:48 +04:00
{
2008-01-06 04:38:01 +03:00
int ret ;
TDB_DATA outdata ;
2008-07-30 13:59:18 +04:00
struct ctdb_marshall_buffer * reply ;
2015-10-29 09:30:30 +03:00
struct ctdb_rec_data_old * recdata ;
2008-01-06 04:38:01 +03:00
int i ;
TALLOC_CTX * tmp_ctx = talloc_new ( recdb ) ;
2007-05-06 04:16:48 +04:00
2008-01-06 04:38:01 +03:00
ret = ctdb_ctrl_pulldb ( ctdb , srcnode , dbid , CTDB_LMASTER_ANY , tmp_ctx ,
CONTROL_TIMEOUT ( ) , & outdata ) ;
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to copy db from node %u \n " , srcnode ) ) ;
2008-01-06 04:38:01 +03:00
talloc_free ( tmp_ctx ) ;
return - 1 ;
}
2008-07-30 08:24:56 +04:00
reply = ( struct ctdb_marshall_buffer * ) outdata . dptr ;
2008-01-06 04:38:01 +03:00
2008-07-30 08:24:56 +04:00
if ( outdata . dsize < offsetof ( struct ctdb_marshall_buffer , data ) ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " invalid data in pulldb reply \n " ) ) ;
2008-01-06 04:38:01 +03:00
talloc_free ( tmp_ctx ) ;
return - 1 ;
}
2015-03-29 11:20:55 +03:00
2015-10-29 09:30:30 +03:00
recdata = ( struct ctdb_rec_data_old * ) & reply - > data [ 0 ] ;
2015-03-29 11:20:55 +03:00
2008-01-06 04:38:01 +03:00
for ( i = 0 ;
i < reply - > count ;
2015-10-29 09:30:30 +03:00
recdata = ( struct ctdb_rec_data_old * ) ( recdata - > length + ( uint8_t * ) recdata ) , i + + ) {
2008-01-06 04:38:01 +03:00
TDB_DATA key , data ;
struct ctdb_ltdb_header * hdr ;
TDB_DATA existing ;
2015-03-29 11:20:55 +03:00
key . dptr = & recdata - > data [ 0 ] ;
key . dsize = recdata - > keylen ;
data . dptr = & recdata - > data [ key . dsize ] ;
data . dsize = recdata - > datalen ;
2008-01-06 04:38:01 +03:00
hdr = ( struct ctdb_ltdb_header * ) data . dptr ;
if ( data . dsize < sizeof ( struct ctdb_ltdb_header ) ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_CRIT , ( __location__ " bad ltdb record \n " ) ) ;
2008-01-06 04:38:01 +03:00
talloc_free ( tmp_ctx ) ;
return - 1 ;
}
/* fetch the existing record, if any */
existing = tdb_fetch ( recdb - > tdb , key ) ;
2015-03-31 06:03:43 +03:00
2008-01-06 04:38:01 +03:00
if ( existing . dptr ! = NULL ) {
struct ctdb_ltdb_header header ;
if ( existing . dsize < sizeof ( struct ctdb_ltdb_header ) ) {
2015-03-31 06:03:43 +03:00
DEBUG ( DEBUG_CRIT , ( __location__ " Bad record size %u from node %u \n " ,
2008-01-07 06:08:25 +03:00
( unsigned ) existing . dsize , srcnode ) ) ;
2008-01-06 04:38:01 +03:00
free ( existing . dptr ) ;
talloc_free ( tmp_ctx ) ;
return - 1 ;
2007-05-06 04:16:48 +04:00
}
2008-01-06 04:38:01 +03:00
header = * ( struct ctdb_ltdb_header * ) existing . dptr ;
free ( existing . dptr ) ;
2009-12-11 19:05:30 +03:00
if ( ! ( header . rsn < hdr - > rsn | |
2015-03-31 06:03:43 +03:00
( header . dmaster ! = ctdb_get_pnn ( ctdb ) & &
header . rsn = = hdr - > rsn ) ) ) {
2009-12-11 19:05:30 +03:00
continue ;
2009-12-04 13:21:29 +03:00
}
}
2015-03-31 06:03:43 +03:00
2008-01-06 04:38:01 +03:00
if ( tdb_store ( recdb - > tdb , key , data , TDB_REPLACE ) ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_CRIT , ( __location__ " Failed to store record \n " ) ) ;
2008-01-06 04:38:01 +03:00
talloc_free ( tmp_ctx ) ;
2015-03-31 06:03:43 +03:00
return - 1 ;
2007-05-06 04:16:48 +04:00
}
}
2008-01-06 04:38:01 +03:00
talloc_free ( tmp_ctx ) ;
2007-05-06 04:16:48 +04:00
return 0 ;
}
2011-11-28 06:56:30 +04:00
/* Shared state for the GET_DB_SEQNUM callbacks below */
struct pull_seqnum_cbdata {
	int failed;		/* non-zero once any node has failed */
	uint32_t pnn;		/* node with the best seqnum so far; -1 = none yet */
	uint64_t seqnum;	/* best seqnum seen so far */
};
/*
 * Per-node success callback for CTDB_CONTROL_GET_DB_SEQNUM.
 *
 * Tracks the node with the highest sequence number in callback_data
 * (struct pull_seqnum_cbdata).  A seqnum of 0 is accepted only while no
 * node has been recorded yet.  A non-zero result code or a reply of the
 * wrong size marks the whole operation as failed; after that further
 * replies are ignored.
 */
static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct pull_seqnum_cbdata *state;
	uint64_t seqnum;

	state = talloc_get_type(callback_data, struct pull_seqnum_cbdata);

	if (state->failed != 0) {
		DEBUG(DEBUG_ERR, (" Got seqnum from node %d but we have already failed the entire operation \n ", node_pnn));
	} else if (res != 0) {
		DEBUG(DEBUG_ERR, (" Error when pulling seqnum from node %d \n ", node_pnn));
		state->failed = 1;
	} else if (outdata.dsize != sizeof(uint64_t)) {
		DEBUG(DEBUG_ERR, (" Error when reading pull seqnum from node %d, got %d bytes but expected %d \n ", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
		state->failed = -1;
	} else {
		seqnum = *((uint64_t *)outdata.dptr);

		if (seqnum > state->seqnum ||
		    (state->pnn == (uint32_t)-1 && seqnum == 0)) {
			state->seqnum = seqnum;
			state->pnn = node_pnn;
		}
	}
}
/*
 * Per-node failure callback for CTDB_CONTROL_GET_DB_SEQNUM: marks the
 * whole operation as failed in callback_data (struct pull_seqnum_cbdata).
 */
static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct pull_seqnum_cbdata *state;

	state = talloc_get_type(callback_data, struct pull_seqnum_cbdata);

	DEBUG(DEBUG_ERR, (" Failed to pull db seqnum from node %d \n ", node_pnn));
	state->failed = 1;
}
static int pull_highest_seqnum_pdb ( struct ctdb_context * ctdb ,
struct ctdb_recoverd * rec ,
2015-10-29 09:22:48 +03:00
struct ctdb_node_map_old * nodemap ,
2011-11-28 06:56:30 +04:00
struct tdb_wrap * recdb , uint32_t dbid )
{
TALLOC_CTX * tmp_ctx = talloc_new ( NULL ) ;
uint32_t * nodes ;
TDB_DATA data ;
uint32_t outdata [ 2 ] ;
struct pull_seqnum_cbdata * cb_data ;
DEBUG ( DEBUG_NOTICE , ( " Scan for highest seqnum pdb for db:0x%08x \n " , dbid ) ) ;
outdata [ 0 ] = dbid ;
outdata [ 1 ] = 0 ;
data . dsize = sizeof ( outdata ) ;
data . dptr = ( uint8_t * ) & outdata [ 0 ] ;
cb_data = talloc ( tmp_ctx , struct pull_seqnum_cbdata ) ;
if ( cb_data = = NULL ) {
DEBUG ( DEBUG_ERR , ( " Failed to allocate pull highest seqnum cb_data structure \n " ) ) ;
talloc_free ( tmp_ctx ) ;
return - 1 ;
}
cb_data - > failed = 0 ;
cb_data - > pnn = - 1 ;
cb_data - > seqnum = 0 ;
nodes = list_of_active_nodes ( ctdb , nodemap , tmp_ctx , true ) ;
if ( ctdb_client_async_control ( ctdb , CTDB_CONTROL_GET_DB_SEQNUM ,
nodes , 0 ,
CONTROL_TIMEOUT ( ) , false , data ,
pull_seqnum_cb ,
pull_seqnum_fail_cb ,
cb_data ) ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " Failed to run async GET_DB_SEQNUM \n " ) ) ;
talloc_free ( tmp_ctx ) ;
return - 1 ;
}
if ( cb_data - > failed ! = 0 ) {
DEBUG ( DEBUG_NOTICE , ( " Failed to pull sequence numbers for DB 0x%08x \n " , dbid ) ) ;
talloc_free ( tmp_ctx ) ;
return - 1 ;
}
2013-11-15 08:20:40 +04:00
if ( cb_data - > pnn = = - 1 ) {
2011-11-28 06:56:30 +04:00
DEBUG ( DEBUG_NOTICE , ( " Failed to find a node with highest sequence numbers for DB 0x%08x \n " , dbid ) ) ;
talloc_free ( tmp_ctx ) ;
return - 1 ;
}
DEBUG ( DEBUG_NOTICE , ( " Pull persistent db:0x%08x from node %d with highest seqnum:%lld \n " , dbid , cb_data - > pnn , ( long long ) cb_data - > seqnum ) ) ;
if ( pull_one_remote_database ( ctdb , cb_data - > pnn , recdb , dbid ) ! = 0 ) {
DEBUG ( DEBUG_ERR , ( " Failed to pull higest seqnum database 0x%08x from node %d \n " , dbid , cb_data - > pnn ) ) ;
talloc_free ( tmp_ctx ) ;
return - 1 ;
}
talloc_free ( tmp_ctx ) ;
return 0 ;
}
2007-06-07 09:18:55 +04:00
/*
2008-01-06 04:38:01 +03:00
pull all the remote database contents into the recdb
2007-06-07 09:18:55 +04:00
*/
2009-04-24 07:58:32 +04:00
static int pull_remote_database ( struct ctdb_context * ctdb ,
struct ctdb_recoverd * rec ,
2015-10-29 09:22:48 +03:00
struct ctdb_node_map_old * nodemap ,
2009-11-29 13:14:31 +03:00
struct tdb_wrap * recdb , uint32_t dbid ,
bool persistent )
2007-05-06 04:22:13 +04:00
{
2008-01-06 04:38:01 +03:00
int j ;
2007-05-06 04:22:13 +04:00
2011-11-28 06:56:30 +04:00
if ( persistent & & ctdb - > tunable . recover_pdb_by_seqnum ! = 0 ) {
int ret ;
ret = pull_highest_seqnum_pdb ( ctdb , rec , nodemap , recdb , dbid ) ;
if ( ret = = 0 ) {
return 0 ;
}
}
2008-01-06 04:38:01 +03:00
/* pull all records from all other nodes across onto this node
( this merges based on rsn )
*/
for ( j = 0 ; j < nodemap - > num ; j + + ) {
2015-07-27 00:02:57 +03:00
/* don't merge from nodes that are unavailable */
2008-01-06 04:38:01 +03:00
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
continue ;
}
2011-11-28 06:56:30 +04:00
if ( pull_one_remote_database ( ctdb , nodemap - > nodes [ j ] . pnn , recdb , dbid ) ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Failed to pull remote database from node %u \n " ,
2008-01-06 04:38:01 +03:00
nodemap - > nodes [ j ] . pnn ) ) ;
2009-04-24 07:58:32 +04:00
ctdb_set_culprit_count ( rec , nodemap - > nodes [ j ] . pnn , nodemap - > num ) ;
2008-01-02 14:44:46 +03:00
return - 1 ;
2007-05-06 04:22:13 +04:00
}
}
2008-01-06 04:38:01 +03:00
2007-05-06 04:22:13 +04:00
return 0 ;
}
2007-06-07 09:18:55 +04:00
/*
update flags on all active nodes
*/
2015-10-29 09:22:48 +03:00
static int update_flags_on_all_nodes ( struct ctdb_context * ctdb , struct ctdb_node_map_old * nodemap , uint32_t pnn , uint32_t flags )
2008-06-26 05:08:09 +04:00
{
2008-11-19 06:43:46 +03:00
int ret ;
2008-06-26 05:08:09 +04:00
2008-12-05 08:32:30 +03:00
ret = ctdb_ctrl_modflags ( ctdb , CONTROL_TIMEOUT ( ) , pnn , flags , ~ flags ) ;
if ( ret ! = 0 ) {
2008-11-19 06:43:46 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to update nodeflags on remote nodes \n " ) ) ;
return - 1 ;
}
2008-06-26 05:08:09 +04:00
return 0 ;
}
2007-05-06 04:38:44 +04:00
2007-06-07 09:18:55 +04:00
/*
ensure all nodes have the same vnnmap we do
*/
2015-10-29 09:22:48 +03:00
static int update_vnnmap_on_all_nodes ( struct ctdb_context * ctdb , struct ctdb_node_map_old * nodemap ,
2007-09-04 04:33:10 +04:00
uint32_t pnn , struct ctdb_vnn_map * vnnmap , TALLOC_CTX * mem_ctx )
2007-05-06 04:42:18 +04:00
{
int j , ret ;
/* push the new vnn map out to all the nodes */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
2015-07-27 00:02:57 +03:00
/* don't push to nodes that are unavailable */
2007-06-07 09:18:55 +04:00
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
2007-05-06 04:42:18 +04:00
continue ;
}
2007-09-04 03:50:07 +04:00
ret = ctdb_ctrl_setvnnmap ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ j ] . pnn , mem_ctx , vnnmap ) ;
2007-05-06 04:42:18 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to set vnnmap for node %u \n " , pnn ) ) ;
2007-05-06 04:42:18 +04:00
return - 1 ;
}
}
return 0 ;
}
2007-06-07 09:18:55 +04:00
2008-01-08 09:23:27 +03:00
/*
  completion callback of a vacuum fetch call: the result is of no
  interest, so the call state is simply released
 */
static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
{
	TALLOC_FREE(state);
}
2015-06-02 22:39:00 +03:00
/**
* Process one elements of the vacuum fetch list :
* Migrate it over to us with the special flag
* CTDB_CALL_FLAG_VACUUM_MIGRATION .
*/
static bool vacuum_fetch_process_one ( struct ctdb_db_context * ctdb_db ,
uint32_t pnn ,
2015-10-29 09:30:30 +03:00
struct ctdb_rec_data_old * r )
2015-06-02 22:39:00 +03:00
{
struct ctdb_client_call_state * state ;
TDB_DATA data ;
struct ctdb_ltdb_header * hdr ;
struct ctdb_call call ;
ZERO_STRUCT ( call ) ;
call . call_id = CTDB_NULL_FUNC ;
call . flags = CTDB_IMMEDIATE_MIGRATION ;
call . flags | = CTDB_CALL_FLAG_VACUUM_MIGRATION ;
call . key . dptr = & r - > data [ 0 ] ;
call . key . dsize = r - > keylen ;
/* ensure we don't block this daemon - just skip a record if we can't get
the chainlock */
if ( tdb_chainlock_nonblock ( ctdb_db - > ltdb - > tdb , call . key ) ! = 0 ) {
return true ;
}
data = tdb_fetch ( ctdb_db - > ltdb - > tdb , call . key ) ;
if ( data . dptr = = NULL ) {
tdb_chainunlock ( ctdb_db - > ltdb - > tdb , call . key ) ;
return true ;
}
if ( data . dsize < sizeof ( struct ctdb_ltdb_header ) ) {
free ( data . dptr ) ;
tdb_chainunlock ( ctdb_db - > ltdb - > tdb , call . key ) ;
return true ;
}
hdr = ( struct ctdb_ltdb_header * ) data . dptr ;
if ( hdr - > dmaster = = pnn ) {
/* its already local */
free ( data . dptr ) ;
tdb_chainunlock ( ctdb_db - > ltdb - > tdb , call . key ) ;
return true ;
}
free ( data . dptr ) ;
state = ctdb_call_send ( ctdb_db , & call ) ;
tdb_chainunlock ( ctdb_db - > ltdb - > tdb , call . key ) ;
if ( state = = NULL ) {
DEBUG ( DEBUG_ERR , ( __location__ " Failed to setup vacuum fetch call \n " ) ) ;
return false ;
}
state - > async . fn = vacuum_fetch_callback ;
state - > async . private_data = NULL ;
return true ;
}
2008-01-08 13:28:42 +03:00
2008-01-08 09:23:27 +03:00
/*
handler for vacuum fetch
*/
2015-04-08 07:38:26 +03:00
static void vacuum_fetch_handler ( uint64_t srvid , TDB_DATA data ,
void * private_data )
2008-01-08 09:23:27 +03:00
{
2015-04-08 07:38:26 +03:00
struct ctdb_recoverd * rec = talloc_get_type (
private_data , struct ctdb_recoverd ) ;
struct ctdb_context * ctdb = rec - > ctdb ;
2008-07-30 08:24:56 +04:00
struct ctdb_marshall_buffer * recs ;
2008-01-08 09:23:27 +03:00
int ret , i ;
TALLOC_CTX * tmp_ctx = talloc_new ( ctdb ) ;
const char * name ;
2015-10-29 09:46:05 +03:00
struct ctdb_dbid_map_old * dbmap = NULL ;
2008-01-08 09:23:27 +03:00
bool persistent = false ;
struct ctdb_db_context * ctdb_db ;
2015-10-29 09:30:30 +03:00
struct ctdb_rec_data_old * r ;
2008-01-08 09:23:27 +03:00
2008-07-30 08:24:56 +04:00
recs = ( struct ctdb_marshall_buffer * ) data . dptr ;
2008-01-08 13:28:42 +03:00
if ( recs - > count = = 0 ) {
2015-06-02 22:57:54 +03:00
goto done ;
2008-01-08 13:28:42 +03:00
}
2008-01-08 09:23:27 +03:00
/* work out if the database is persistent */
ret = ctdb_ctrl_getdbmap ( ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE , tmp_ctx , & dbmap ) ;
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to get dbids from local node \n " ) ) ;
2015-06-02 22:57:54 +03:00
goto done ;
2008-01-08 09:23:27 +03:00
}
for ( i = 0 ; i < dbmap - > num ; i + + ) {
2015-10-29 09:46:05 +03:00
if ( dbmap - > dbs [ i ] . db_id = = recs - > db_id ) {
2011-09-01 04:21:55 +04:00
persistent = dbmap - > dbs [ i ] . flags & CTDB_DB_FLAGS_PERSISTENT ;
2008-01-08 09:23:27 +03:00
break ;
}
}
if ( i = = dbmap - > num ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to find db_id 0x%x on local node \n " , recs - > db_id ) ) ;
2015-06-02 22:57:54 +03:00
goto done ;
2008-01-08 09:23:27 +03:00
}
/* find the name of this database */
if ( ctdb_ctrl_getdbname ( ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE , recs - > db_id , tmp_ctx , & name ) ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Failed to get name of db 0x%x \n " , recs - > db_id ) ) ;
2015-06-02 22:57:54 +03:00
goto done ;
2008-01-08 09:23:27 +03:00
}
/* attach to it */
2011-08-08 18:35:56 +04:00
ctdb_db = ctdb_attach ( ctdb , CONTROL_TIMEOUT ( ) , name , persistent , 0 ) ;
2008-01-08 09:23:27 +03:00
if ( ctdb_db = = NULL ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Failed to attach to database '%s' \n " , name ) ) ;
2015-06-02 22:57:54 +03:00
goto done ;
2008-01-08 09:23:27 +03:00
}
2015-10-29 09:30:30 +03:00
r = ( struct ctdb_rec_data_old * ) & recs - > data [ 0 ] ;
2015-06-05 09:35:48 +03:00
while ( recs - > count ) {
2015-06-02 23:17:03 +03:00
bool ok ;
2015-06-05 09:35:48 +03:00
ok = vacuum_fetch_process_one ( ctdb_db , rec - > ctdb - > pnn , r ) ;
2015-06-02 23:17:03 +03:00
if ( ! ok ) {
break ;
}
2015-10-29 09:30:30 +03:00
r = ( struct ctdb_rec_data_old * ) ( r - > length + ( uint8_t * ) r ) ;
2015-06-05 09:35:48 +03:00
recs - > count - - ;
2015-06-02 23:17:03 +03:00
}
2015-06-02 22:57:54 +03:00
done :
2008-09-16 01:55:57 +04:00
talloc_free ( tmp_ctx ) ;
2008-01-08 09:23:27 +03:00
}
2007-06-07 10:34:33 +04:00
2014-04-22 09:24:49 +04:00
/*
* handler for database detach
*/
2015-04-08 07:38:26 +03:00
static void detach_database_handler ( uint64_t srvid , TDB_DATA data ,
void * private_data )
2014-04-22 09:24:49 +04:00
{
2015-04-08 07:38:26 +03:00
struct ctdb_recoverd * rec = talloc_get_type (
private_data , struct ctdb_recoverd ) ;
struct ctdb_context * ctdb = rec - > ctdb ;
2014-04-22 09:24:49 +04:00
uint32_t db_id ;
struct ctdb_db_context * ctdb_db ;
if ( data . dsize ! = sizeof ( db_id ) ) {
return ;
}
db_id = * ( uint32_t * ) data . dptr ;
ctdb_db = find_ctdb_db ( ctdb , db_id ) ;
if ( ctdb_db = = NULL ) {
/* database is not attached */
return ;
}
DLIST_REMOVE ( ctdb - > db_list , ctdb_db ) ;
DEBUG ( DEBUG_NOTICE , ( " Detached from database '%s' \n " ,
ctdb_db - > db_name ) ) ;
talloc_free ( ctdb_db ) ;
}
2007-07-04 02:36:59 +04:00
/*
  called when ctdb_wait_timeout should finish

  p points at the uint32_t flag the waiter is polling; it is set to 1.
 */
static void ctdb_wait_handler(struct tevent_context *ev,
			      struct tevent_timer *te,
			      struct timeval yt, void *p)
{
	uint32_t *expired = p;

	*expired = 1;
}
/*
wait for a given number of seconds
*/
2010-06-22 17:20:35 +04:00
static void ctdb_wait_timeout ( struct ctdb_context * ctdb , double secs )
2007-07-04 02:36:59 +04:00
{
uint32_t timed_out = 0 ;
2010-06-22 17:20:35 +04:00
time_t usecs = ( secs - ( time_t ) secs ) * 1000000 ;
2015-10-26 08:50:09 +03:00
tevent_add_timer ( ctdb - > ev , ctdb , timeval_current_ofs ( secs , usecs ) ,
ctdb_wait_handler , & timed_out ) ;
2007-07-04 02:36:59 +04:00
while ( ! timed_out ) {
2015-10-26 08:50:09 +03:00
tevent_loop_once ( ctdb - > ev ) ;
2007-07-04 02:36:59 +04:00
}
}
2007-11-13 02:27:44 +03:00
/*
called when an election times out ( ends )
*/
2015-10-26 08:50:09 +03:00
static void ctdb_election_timeout ( struct tevent_context * ev ,
struct tevent_timer * te ,
2007-11-13 02:27:44 +03:00
struct timeval t , void * p )
{
struct ctdb_recoverd * rec = talloc_get_type ( p , struct ctdb_recoverd ) ;
rec - > election_timeout = NULL ;
2010-06-22 17:25:20 +04:00
fast_start = false ;
2009-07-17 05:37:03 +04:00
2014-06-20 07:36:25 +04:00
DEBUG ( DEBUG_WARNING , ( " Election period ended \n " ) ) ;
2007-11-13 02:27:44 +03:00
}
/*
wait for an election to finish . It finished election_timeout seconds after
the last election packet is received
*/
static void ctdb_wait_election ( struct ctdb_recoverd * rec )
{
struct ctdb_context * ctdb = rec - > ctdb ;
while ( rec - > election_timeout ) {
2015-10-26 08:50:09 +03:00
tevent_loop_once ( ctdb - > ev ) ;
2007-11-13 02:27:44 +03:00
}
}
2007-10-15 08:28:51 +04:00
/*
2007-11-23 03:31:42 +03:00
Update our local flags from all remote connected nodes .
This is only run when we are or we belive we are the recovery master
2007-10-15 08:28:51 +04:00
*/
2015-10-29 09:22:48 +03:00
static int update_local_flags ( struct ctdb_recoverd * rec , struct ctdb_node_map_old * nodemap )
2007-10-15 08:28:51 +04:00
{
int j ;
2007-11-30 00:44:34 +03:00
struct ctdb_context * ctdb = rec - > ctdb ;
2007-10-15 08:28:51 +04:00
TALLOC_CTX * mem_ctx = talloc_new ( ctdb ) ;
/* get the nodemap for all active remote nodes and verify
they are the same as for this node
*/
for ( j = 0 ; j < nodemap - > num ; j + + ) {
2015-10-29 09:22:48 +03:00
struct ctdb_node_map_old * remote_nodemap = NULL ;
2007-10-15 08:28:51 +04:00
int ret ;
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_DISCONNECTED ) {
continue ;
}
if ( nodemap - > nodes [ j ] . pnn = = ctdb - > pnn ) {
continue ;
}
ret = ctdb_ctrl_getnodemap ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ j ] . pnn ,
mem_ctx , & remote_nodemap ) ;
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to get nodemap from remote node %u \n " ,
2007-10-15 08:28:51 +04:00
nodemap - > nodes [ j ] . pnn ) ) ;
2007-11-28 07:04:20 +03:00
ctdb_set_culprit ( rec , nodemap - > nodes [ j ] . pnn ) ;
2007-10-15 08:28:51 +04:00
talloc_free ( mem_ctx ) ;
2016-04-27 14:47:08 +03:00
return - 1 ;
2007-10-15 08:28:51 +04:00
}
if ( nodemap - > nodes [ j ] . flags ! = remote_nodemap - > nodes [ j ] . flags ) {
2007-11-23 03:53:06 +03:00
/* We should tell our daemon about this so it
2007-11-23 02:52:29 +03:00
updates its flags or else we will log the same
message again in the next iteration of recovery .
2007-11-23 03:31:42 +03:00
Since we are the recovery master we can just as
well update the flags on all nodes .
2007-11-23 02:52:29 +03:00
*/
2013-06-26 09:22:46 +04:00
ret = ctdb_ctrl_modflags ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ j ] . pnn , remote_nodemap - > nodes [ j ] . flags , ~ remote_nodemap - > nodes [ j ] . flags ) ;
2008-11-19 06:43:46 +03:00
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " Unable to update nodeflags on remote nodes \n " ) ) ;
return - 1 ;
}
2007-11-23 02:52:29 +03:00
2007-11-23 03:53:06 +03:00
/* Update our local copy of the flags in the recovery
daemon .
*/
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " Remote node %u had flags 0x%x, local had 0x%x - updating local \n " ,
2007-11-23 03:53:06 +03:00
nodemap - > nodes [ j ] . pnn , remote_nodemap - > nodes [ j ] . flags ,
nodemap - > nodes [ j ] . flags ) ) ;
2007-10-15 08:28:51 +04:00
nodemap - > nodes [ j ] . flags = remote_nodemap - > nodes [ j ] . flags ;
}
talloc_free ( remote_nodemap ) ;
}
talloc_free ( mem_ctx ) ;
2016-04-27 14:47:08 +03:00
return 0 ;
2007-10-15 08:28:51 +04:00
}
2015-10-12 17:52:49 +03:00
/* Create a new random generation id.
2007-08-22 06:38:31 +04:00
The generation id can not be the INVALID_GENERATION id
*/
static uint32_t new_generation ( void )
{
uint32_t generation ;
while ( 1 ) {
generation = random ( ) ;
if ( generation ! = INVALID_GENERATION ) {
break ;
}
}
return generation ;
}
2007-10-05 06:01:40 +04:00
2008-01-06 04:38:01 +03:00
/*
  create a temporary working database

  The tdb is created as "recdb.tdb.<pnn>" in ctdb->db_directory_state;
  a file left over under that name is unlinked first. The returned
  tdb_wrap is allocated on mem_ctx.

  returns NULL if the name could not be built or the tdb could not be
  created.
 */
static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
{
	unsigned tdb_flags = TDB_NOLOCK | TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING;
	struct tdb_wrap *recdb;
	char *path;

	/* open up the temporary recovery database */
	path = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
			       ctdb->db_directory_state,
			       ctdb->pnn);
	if (path == NULL) {
		return NULL;
	}
	unlink(path);

	if (ctdb->valgrinding) {
		tdb_flags |= TDB_NOMMAP;
	}

	recdb = tdb_wrap_open(mem_ctx, path, ctdb->tunable.database_hash_size,
			      tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
	if (recdb == NULL) {
		DEBUG(DEBUG_CRIT, (__location__ " Failed to create temp recovery database '%s'\n", path));
	}

	talloc_free(path);

	return recdb;
}
/*
  a traverse function for pulling all relevant records from recdb

  struct recdb_data is the state handed to that traverse function.
 */
struct recdb_data {
	/* supplies our pnn and the pulldb_preallocation_size tunable */
	struct ctdb_context *ctdb;
	/* marshall buffer the records are appended to */
	struct ctdb_marshall_buffer *recdata;
	/* number of bytes of recdata in use */
	uint32_t len;
	/* number of bytes recdata has been sized to */
	uint32_t allocated_len;
	/* set when marshalling or growing the buffer failed */
	bool failed;
	/* persistent databases keep empty records and their dmaster */
	bool persistent;
};
/*
  tdb traverse callback: append one record of recdb to the marshall
  buffer in the struct recdb_data passed as p.

  For non-persistent databases empty records are skipped and the header
  of every other record is rewritten to make this node the dmaster.

  returns 0 to continue the traverse; on failure params->failed is set
  and -1 is returned.
 */
static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
{
	struct recdb_data *params = (struct recdb_data *)p;
	struct ctdb_rec_data_old *recdata;
	struct ctdb_ltdb_header *hdr;

	/*
	 * skip empty records - but NOT for persistent databases:
	 *
	 * The record-by-record mode of recovery deletes empty records.
	 * For persistent databases, this can lead to data corruption
	 * by deleting records that should be there:
	 *
	 * - Assume the cluster has been running for a while.
	 *
	 * - A record R in a persistent database has been created and
	 *   deleted a couple of times, the last operation being deletion,
	 *   leaving an empty record with a high RSN, say 10.
	 *
	 * - Now a node N is turned off.
	 *
	 * - This leaves the local database copy of D on N with the empty
	 *   copy of R and RSN 10. On all other nodes, the recovery has deleted
	 *   the copy of record R.
	 *
	 * - Now the record is created again while node N is turned off.
	 *   This creates R with RSN = 1 on all nodes except for N.
	 *
	 * - Now node N is turned on again. The following recovery will chose
	 *   the older empty copy of R due to RSN 10 > RSN 1.
	 *
	 * ==> Hence the record is gone after the recovery.
	 *
	 * On databases like Samba's registry, this can damage the higher-level
	 * data structures built from the various tdb-level records.
	 */
	if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
		return 0;
	}

	/* update the dmaster field to point to us */
	hdr = (struct ctdb_ltdb_header *)data.dptr;
	if (!params->persistent) {
		hdr->dmaster = params->ctdb->pnn;
		hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
	}

	/* add the record to the blob ready to send to the nodes */
	recdata = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
	if (recdata == NULL) {
		params->failed = true;
		return -1;
	}

	if (params->len + recdata->length >= params->allocated_len) {
		uint32_t new_len = recdata->length + params->len +
			params->ctdb->tunable.pulldb_preallocation_size;
		struct ctdb_marshall_buffer *grown;

		/* grow into a temporary so that params->recdata stays valid,
		   and can still be freed by the caller, if this fails */
		grown = talloc_realloc_size(NULL, params->recdata, new_len);
		if (grown == NULL) {
			DEBUG(DEBUG_CRIT, (__location__ " Failed to expand recdata to %u\n",
					   recdata->length + params->len));
			talloc_free(recdata);
			params->failed = true;
			return -1;
		}
		params->recdata = grown;
		params->allocated_len = new_len;
	}

	params->recdata->count++;
	memcpy(params->len + (uint8_t *)params->recdata, recdata, recdata->length);
	params->len += recdata->length;
	talloc_free(recdata);

	return 0;
}
/*
push the recdb database out to all nodes
*/
static int push_recdb_database ( struct ctdb_context * ctdb , uint32_t dbid ,
2009-11-29 13:14:31 +03:00
bool persistent ,
2015-10-29 09:22:48 +03:00
struct tdb_wrap * recdb , struct ctdb_node_map_old * nodemap )
2008-01-06 04:38:01 +03:00
{
struct recdb_data params ;
2008-07-30 08:24:56 +04:00
struct ctdb_marshall_buffer * recdata ;
2008-01-06 04:38:01 +03:00
TDB_DATA outdata ;
2008-01-29 05:59:28 +03:00
TALLOC_CTX * tmp_ctx ;
2008-06-12 10:53:36 +04:00
uint32_t * nodes ;
2008-01-29 05:59:28 +03:00
tmp_ctx = talloc_new ( ctdb ) ;
CTDB_NO_MEMORY ( ctdb , tmp_ctx ) ;
2008-01-06 04:38:01 +03:00
2008-07-30 08:24:56 +04:00
recdata = talloc_zero ( recdb , struct ctdb_marshall_buffer ) ;
2008-01-06 04:38:01 +03:00
CTDB_NO_MEMORY ( ctdb , recdata ) ;
recdata - > db_id = dbid ;
params . ctdb = ctdb ;
params . recdata = recdata ;
2008-07-30 08:24:56 +04:00
params . len = offsetof ( struct ctdb_marshall_buffer , data ) ;
2012-05-25 06:27:59 +04:00
params . allocated_len = params . len ;
2008-01-07 06:08:25 +03:00
params . failed = false ;
2009-11-29 13:14:31 +03:00
params . persistent = persistent ;
2008-01-06 04:38:01 +03:00
if ( tdb_traverse_read ( recdb - > tdb , traverse_recdb , & params ) = = - 1 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Failed to traverse recdb database \n " ) ) ;
2008-01-07 06:08:25 +03:00
talloc_free ( params . recdata ) ;
2008-01-29 05:59:28 +03:00
talloc_free ( tmp_ctx ) ;
2008-01-06 04:38:01 +03:00
return - 1 ;
}
2008-01-07 06:08:25 +03:00
if ( params . failed ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Failed to traverse recdb database \n " ) ) ;
2008-01-07 06:08:25 +03:00
talloc_free ( params . recdata ) ;
2008-01-29 05:59:28 +03:00
talloc_free ( tmp_ctx ) ;
2008-01-07 06:08:25 +03:00
return - 1 ;
}
2008-01-06 04:38:01 +03:00
recdata = params . recdata ;
outdata . dptr = ( void * ) recdata ;
outdata . dsize = params . len ;
2008-06-12 10:53:36 +04:00
nodes = list_of_active_nodes ( ctdb , nodemap , tmp_ctx , true ) ;
2008-01-29 05:59:28 +03:00
if ( ctdb_client_async_control ( ctdb , CTDB_CONTROL_PUSH_DB ,
2009-10-12 05:08:39 +04:00
nodes , 0 ,
2008-06-12 10:53:36 +04:00
CONTROL_TIMEOUT ( ) , false , outdata ,
NULL , NULL ,
NULL ) ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Failed to push recdb records to nodes for db 0x%x \n " , dbid ) ) ;
2008-01-06 04:38:01 +03:00
talloc_free ( recdata ) ;
2008-01-29 05:59:28 +03:00
talloc_free ( tmp_ctx ) ;
2008-01-06 04:38:01 +03:00
return - 1 ;
}
2008-02-04 09:44:24 +03:00
DEBUG ( DEBUG_NOTICE , ( __location__ " Recovery - pushed remote database 0x%x of size %u \n " ,
2008-01-06 04:38:01 +03:00
dbid , recdata - > count ) ) ;
talloc_free ( recdata ) ;
2008-01-29 05:59:28 +03:00
talloc_free ( tmp_ctx ) ;
2008-01-06 04:38:01 +03:00
return 0 ;
}
/*
  go through a full recovery on one database

  The contents of the database are collected from the other nodes into
  a temporary recdb, the database is wiped on all active nodes inside
  the transaction identified by transaction_id, and the collected
  contents are pushed back out. The temporary recdb is released on
  every path once it has been created.

  returns 0 on success, -1 if any of the steps failed.
 */
static int recover_database(struct ctdb_recoverd *rec,
			    TALLOC_CTX *mem_ctx,
			    uint32_t dbid,
			    bool persistent,
			    uint32_t pnn,
			    struct ctdb_node_map_old *nodemap,
			    uint32_t transaction_id)
{
	struct tdb_wrap *recdb;
	int ret;
	struct ctdb_context *ctdb = rec->ctdb;
	TDB_DATA data;
	struct ctdb_transdb w;
	uint32_t *nodes;

	recdb = create_recdb(ctdb, mem_ctx);
	if (recdb == NULL) {
		return -1;
	}

	/* pull all remote databases onto the recdb */
	ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
		/* release the temporary database here as well, like the
		   other failure paths do */
		talloc_free(recdb);
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));

	/* wipe all the remote databases. This is safe as we are in a transaction */
	w.db_id = dbid;
	w.tid = transaction_id;

	data.dptr = (void *)&w;
	data.dsize = sizeof(w);

	/* the node list is allocated on recdb and goes away with it */
	nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
				      nodes, 0,
				      CONTROL_TIMEOUT(), false, data,
				      NULL, NULL,
				      NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
		talloc_free(recdb);
		return -1;
	}

	/* push out the correct database. This sets the dmaster and skips
	   the empty records */
	ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
	if (ret != 0) {
		talloc_free(recdb);
		return -1;
	}

	/* all done with this database */
	talloc_free(recdb);

	return 0;
}
2016-05-24 07:54:39 +03:00
static bool ctdb_recovery_have_lock ( struct ctdb_recoverd * rec )
2016-02-17 12:20:03 +03:00
{
2016-05-24 07:54:39 +03:00
return ( rec - > recovery_lock_handle ! = NULL ) ;
2016-02-17 12:20:03 +03:00
}
/*
  result of one attempt to take the recovery lock, filled in by
  take_reclock_handler()
 */
struct hold_reclock_state {
	/* set once the handler has run */
	bool done;
	/* true if the handler was called with status '0' */
	bool locked;
	/* latency passed to the handler; only stored for status '0' */
	double latency;
};
2016-05-29 00:25:05 +03:00
static void take_reclock_handler ( char status ,
2016-02-17 12:20:03 +03:00
double latency ,
void * private_data )
{
struct hold_reclock_state * s =
( struct hold_reclock_state * ) private_data ;
switch ( status ) {
case ' 0 ' :
2016-06-01 10:32:42 +03:00
s - > latency = latency ;
2016-02-17 12:20:03 +03:00
break ;
case ' 1 ' :
DEBUG ( DEBUG_ERR ,
( " Unable to take recovery lock - contention \n " ) ) ;
break ;
default :
DEBUG ( DEBUG_ERR , ( " ERROR: when taking recovery lock \n " ) ) ;
}
s - > done = true ;
2016-05-31 11:37:30 +03:00
s - > locked = ( status = = ' 0 ' ) ;
2016-02-17 12:20:03 +03:00
}
2016-05-29 00:25:05 +03:00
static bool ctdb_recovery_lock ( struct ctdb_recoverd * rec ) ;
static void lost_reclock_handler ( void * private_data )
{
struct ctdb_recoverd * rec = talloc_get_type_abort (
private_data , struct ctdb_recoverd ) ;
DEBUG ( DEBUG_ERR ,
( " Recovery lock helper terminated unexpectedly - "
" trying to retake recovery lock \n " ) ) ;
TALLOC_FREE ( rec - > recovery_lock_handle ) ;
if ( ! ctdb_recovery_lock ( rec ) ) {
DEBUG ( DEBUG_ERR , ( " Failed to take recovery lock \n " ) ) ;
}
}
2016-05-24 07:54:39 +03:00
static bool ctdb_recovery_lock ( struct ctdb_recoverd * rec )
2016-02-17 12:20:03 +03:00
{
2016-05-24 07:54:39 +03:00
struct ctdb_context * ctdb = rec - > ctdb ;
2016-02-17 12:20:03 +03:00
struct ctdb_cluster_mutex_handle * h ;
struct hold_reclock_state s = {
. done = false ,
2016-05-31 11:37:30 +03:00
. locked = false ,
2016-06-01 10:32:42 +03:00
. latency = 0 ,
2016-02-17 12:20:03 +03:00
} ;
2016-06-01 11:56:33 +03:00
h = ctdb_cluster_mutex ( rec , ctdb , ctdb - > recovery_lock , 0 ,
2016-05-29 00:25:05 +03:00
take_reclock_handler , & s ,
lost_reclock_handler , rec ) ;
2016-02-17 12:20:03 +03:00
if ( h = = NULL ) {
2016-06-01 08:56:42 +03:00
return false ;
2016-02-17 12:20:03 +03:00
}
while ( ! s . done ) {
tevent_loop_once ( ctdb - > ev ) ;
}
2016-05-24 07:54:39 +03:00
if ( ! s . locked ) {
2016-06-01 10:32:42 +03:00
talloc_free ( h ) ;
2016-05-24 07:54:39 +03:00
return false ;
}
rec - > recovery_lock_handle = h ;
2016-06-01 10:32:42 +03:00
ctdb_ctrl_report_recd_lock_latency ( ctdb , CONTROL_TIMEOUT ( ) ,
s . latency ) ;
2016-05-24 07:54:39 +03:00
return true ;
2016-02-17 12:20:03 +03:00
}
2016-05-24 07:54:39 +03:00
static void ctdb_recovery_unlock ( struct ctdb_recoverd * rec )
2016-02-17 12:20:03 +03:00
{
2016-05-24 07:54:39 +03:00
if ( rec - > recovery_lock_handle ! = NULL ) {
2016-02-17 12:20:03 +03:00
DEBUG ( DEBUG_NOTICE , ( " Releasing recovery lock \n " ) ) ;
2016-05-24 07:54:39 +03:00
TALLOC_FREE ( rec - > recovery_lock_handle ) ;
2016-02-17 12:20:03 +03:00
}
}
2013-06-28 10:31:07 +04:00
static void ban_misbehaving_nodes ( struct ctdb_recoverd * rec , bool * self_ban )
2013-06-28 08:31:02 +04:00
{
struct ctdb_context * ctdb = rec - > ctdb ;
int i ;
struct ctdb_banning_state * ban_state ;
2013-06-28 10:31:07 +04:00
* self_ban = false ;
2013-06-28 08:31:02 +04:00
for ( i = 0 ; i < ctdb - > num_nodes ; i + + ) {
if ( ctdb - > nodes [ i ] - > ban_state = = NULL ) {
continue ;
}
ban_state = ( struct ctdb_banning_state * ) ctdb - > nodes [ i ] - > ban_state ;
if ( ban_state - > count < 2 * ctdb - > num_nodes ) {
continue ;
}
DEBUG ( DEBUG_NOTICE , ( " Node %u reached %u banning credits - banning it for %u seconds \n " ,
ctdb - > nodes [ i ] - > pnn , ban_state - > count ,
ctdb - > tunable . recovery_ban_period ) ) ;
ctdb_ban_node ( rec , ctdb - > nodes [ i ] - > pnn , ctdb - > tunable . recovery_ban_period ) ;
ban_state - > count = 0 ;
2013-06-28 10:31:07 +04:00
/* Banning ourself? */
if ( ctdb - > nodes [ i ] - > pnn = = rec - > ctdb - > pnn ) {
* self_ban = true ;
}
2013-06-28 08:31:02 +04:00
}
}
2013-08-27 06:14:34 +04:00
static bool do_takeover_run ( struct ctdb_recoverd * rec ,
2016-05-03 08:35:08 +03:00
struct ctdb_node_map_old * nodemap )
2013-08-27 06:14:34 +04:00
{
2013-08-27 09:04:40 +04:00
uint32_t * nodes = NULL ;
2015-10-28 10:23:13 +03:00
struct ctdb_disable_message dtr ;
2013-09-03 05:21:09 +04:00
TDB_DATA data ;
2013-08-27 09:04:40 +04:00
int i ;
2013-09-06 05:23:07 +04:00
uint32_t * rebalance_nodes = rec - > force_rebalance_nodes ;
2013-08-27 06:14:34 +04:00
int ret ;
bool ok ;
2013-09-18 11:06:16 +04:00
DEBUG ( DEBUG_NOTICE , ( " Takeover run starting \n " ) ) ;
2015-02-08 12:52:12 +03:00
if ( ctdb_op_is_in_progress ( rec - > takeover_run ) ) {
2013-09-03 05:20:01 +04:00
DEBUG ( DEBUG_ERR , ( __location__
" takeover run already in progress \n " ) ) ;
ok = false ;
goto done ;
}
2015-02-08 12:52:12 +03:00
if ( ! ctdb_op_begin ( rec - > takeover_run ) ) {
2013-08-27 09:04:40 +04:00
ok = false ;
goto done ;
2013-09-03 05:21:09 +04:00
}
2013-08-27 09:04:40 +04:00
/* Disable IP checks (takeover runs, really) on other nodes
* while doing this takeover run . This will stop those other
* nodes from triggering takeover runs when think they should
* be hosting an IP but it isn ' t yet on an interface . Don ' t
* wait for replies since a failure here might cause some
* noise in the logs but will not actually cause a problem .
*/
2016-01-11 09:23:12 +03:00
ZERO_STRUCT ( dtr ) ;
2013-08-27 09:04:40 +04:00
dtr . srvid = 0 ; /* No reply */
dtr . pnn = - 1 ;
data . dptr = ( uint8_t * ) & dtr ;
data . dsize = sizeof ( dtr ) ;
nodes = list_of_connected_nodes ( rec - > ctdb , nodemap , rec , false ) ;
2013-10-24 04:13:16 +04:00
/* Disable for 60 seconds. This can be a tunable later if
2013-08-27 09:04:40 +04:00
* necessary .
*/
2015-10-28 10:23:13 +03:00
dtr . timeout = 60 ;
2013-08-27 09:04:40 +04:00
for ( i = 0 ; i < talloc_array_length ( nodes ) ; i + + ) {
if ( ctdb_client_send_message ( rec - > ctdb , nodes [ i ] ,
CTDB_SRVID_DISABLE_TAKEOVER_RUNS ,
data ) ! = 0 ) {
DEBUG ( DEBUG_INFO , ( " Failed to disable takeover runs \n " ) ) ;
}
}
2013-09-03 05:20:01 +04:00
2013-09-04 08:30:04 +04:00
ret = ctdb_takeover_run ( rec - > ctdb , nodemap ,
2016-05-03 08:35:08 +03:00
rec - > force_rebalance_nodes ) ;
2013-09-03 05:21:09 +04:00
2013-08-27 09:04:40 +04:00
/* Reenable takeover runs and IP checks on other nodes */
2015-10-28 10:23:13 +03:00
dtr . timeout = 0 ;
2013-08-27 09:04:40 +04:00
for ( i = 0 ; i < talloc_array_length ( nodes ) ; i + + ) {
if ( ctdb_client_send_message ( rec - > ctdb , nodes [ i ] ,
CTDB_SRVID_DISABLE_TAKEOVER_RUNS ,
data ) ! = 0 ) {
2015-07-27 00:02:57 +03:00
DEBUG ( DEBUG_INFO , ( " Failed to re-enable takeover runs \n " ) ) ;
2013-08-27 09:04:40 +04:00
}
2013-09-03 05:21:09 +04:00
}
2013-08-27 06:14:34 +04:00
if ( ret ! = 0 ) {
2013-09-18 11:06:16 +04:00
DEBUG ( DEBUG_ERR , ( " ctdb_takeover_run() failed \n " ) ) ;
2013-08-27 06:14:34 +04:00
ok = false ;
goto done ;
}
ok = true ;
2013-09-04 08:30:04 +04:00
/* Takeover run was successful so clear force rebalance targets */
2013-09-06 05:23:07 +04:00
if ( rebalance_nodes = = rec - > force_rebalance_nodes ) {
TALLOC_FREE ( rec - > force_rebalance_nodes ) ;
} else {
DEBUG ( DEBUG_WARNING ,
( " Rebalance target nodes changed during takeover run - not clearing \n " ) ) ;
}
2013-08-27 06:14:34 +04:00
done :
rec - > need_takeover_run = ! ok ;
2013-08-27 09:04:40 +04:00
talloc_free ( nodes ) ;
2015-02-08 12:52:12 +03:00
ctdb_op_end ( rec - > takeover_run ) ;
2013-09-18 11:06:16 +04:00
DEBUG ( DEBUG_NOTICE , ( " Takeover run %s \n " , ok ? " completed successfully " : " unsuccessful " ) ) ;
2013-08-27 06:14:34 +04:00
return ok ;
}
2015-09-17 09:22:38 +03:00
/*
 * Bookkeeping for one run of the external recovery helper, shared between
 * db_recovery_parallel() and ctdb_recovery_handler().
 */
struct recovery_helper_state {
/* Pipe created with pipe(): fd[0] is read by ctdb_recovery_handler(),
 * fd[1] is passed (as a number) to the helper on its command line */
int fd [ 2 ] ;
/* Helper process id as filled in by ctdb_vfork_with_logging();
 * -1 until a helper has been started */
pid_t pid ;
/* Result read from the pipe; set to EPIPE if a complete int could
 * not be read */
int result ;
/* Set to true by ctdb_recovery_handler() once result is valid */
bool done ;
} ;
static void ctdb_recovery_handler ( struct tevent_context * ev ,
struct tevent_fd * fde ,
uint16_t flags , void * private_data )
{
struct recovery_helper_state * state = talloc_get_type_abort (
private_data , struct recovery_helper_state ) ;
int ret ;
ret = sys_read ( state - > fd [ 0 ] , & state - > result , sizeof ( state - > result ) ) ;
if ( ret ! = sizeof ( state - > result ) ) {
state - > result = EPIPE ;
}
state - > done = true ;
}
/*
 * Recover the databases by running the external recovery helper.
 *
 * The helper is started with three arguments: the number of the write end
 * of a pipe, the daemon socket name and a new generation number.  It
 * reports its result as a single int on that pipe, which is collected by
 * ctdb_recovery_handler(); the event loop is run here until that has
 * happened.
 *
 * Returns 0 if the helper reported a result of 0, -1 on any failure.
 * All resources owned by this call (pipe ends, fd event, helper process,
 * state) are released before returning.
 */
static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
{
	static char prog[PATH_MAX+1] = " ";
	const char **args;
	struct recovery_helper_state *state;
	struct tevent_fd *fde;
	int nargs, ret;

	if (!ctdb_set_helper(" recovery_helper ", prog, sizeof(prog),
			     " CTDB_RECOVERY_HELPER ", CTDB_HELPER_BINDIR,
			     " ctdb_recovery_helper ")) {
		ctdb_die(rec->ctdb, " Unable to set recovery helper \n ");
	}

	state = talloc_zero(mem_ctx, struct recovery_helper_state);
	if (state == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " memory error \n "));
		return -1;
	}

	state->pid = -1;

	/*
	 * talloc_zero() leaves both descriptors as 0, which is a valid fd.
	 * Mark them unused so the failure path never closes fd 0 when
	 * pipe() itself fails.
	 */
	state->fd[0] = -1;
	state->fd[1] = -1;

	ret = pipe(state->fd);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,
		      (" Failed to create pipe for recovery helper \n "));
		goto fail;
	}

	/* Only the write end is meant to be inherited by the helper */
	set_close_on_exec(state->fd[0]);

	nargs = 4;
	args = talloc_array(state, const char *, nargs);
	if (args == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " memory error \n "));
		goto fail;
	}

	args[0] = talloc_asprintf(args, " %d ", state->fd[1]);
	args[1] = rec->ctdb->daemon.name;
	args[2] = talloc_asprintf(args, " %u ", new_generation());
	args[3] = NULL;

	if (args[0] == NULL || args[2] == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " memory error \n "));
		goto fail;
	}

	setenv(" CTDB_DBDIR_STATE ", rec->ctdb->db_directory_state, 1);

	if (!ctdb_vfork_with_logging(state, rec->ctdb, " recovery ", prog, nargs,
				     args, NULL, NULL, &state->pid)) {
		DEBUG(DEBUG_ERR,
		      (" Failed to create child for recovery helper \n "));
		goto fail;
	}

	/* The helper has its own copy of the write end now */
	close(state->fd[1]);
	state->fd[1] = -1;

	state->done = false;

	/*
	 * Parent the fd event on state rather than on the long-lived ctdb
	 * context, so it can never outlive the state its handler uses.
	 */
	fde = tevent_add_fd(rec->ctdb->ev, state, state->fd[0],
			    TEVENT_FD_READ, ctdb_recovery_handler, state);
	if (fde == NULL) {
		goto fail;
	}
	tevent_fd_set_auto_close(fde);

	while (!state->done) {
		tevent_loop_once(rec->ctdb->ev);
	}

	/*
	 * With auto-close set the fd event owns the read end: freeing the
	 * event removes it from the event loop and closes the descriptor
	 * exactly once.  Do not close() it again by hand.
	 */
	TALLOC_FREE(fde);
	state->fd[0] = -1;

	if (state->result != 0) {
		goto fail;
	}

	ctdb_kill(rec->ctdb, state->pid, SIGKILL);
	talloc_free(state);
	return 0;

fail:
	if (state->fd[0] != -1) {
		close(state->fd[0]);
	}
	if (state->fd[1] != -1) {
		close(state->fd[1]);
	}
	if (state->pid != -1) {
		ctdb_kill(rec->ctdb, state->pid, SIGKILL);
	}
	talloc_free(state);
	return -1;
}
2015-09-17 09:00:47 +03:00
/*
 * Recover all databases in dbmap from within the recovery daemon itself.
 *
 * Steps, in order: put the cluster into recovery mode, run the
 * "startrecovery" event, bump the local generation, start transactions on
 * the active nodes, recover each database, commit, build and distribute a
 * new vnnmap, and finally switch recovery mode back to normal.
 *
 * Returns 0 on success and -1 as soon as any step fails.
 */
static int db_recovery_serial(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
			      uint32_t pnn, struct ctdb_node_map_old *nodemap,
			      struct ctdb_vnn_map *vnnmap,
			      struct ctdb_dbid_map_old *dbmap)
{
	struct ctdb_context *ctdb = rec->ctdb;
	struct ctdb_vnn_map *new_vnnmap;
	uint32_t generation;
	uint32_t *nodes;
	TDB_DATA data;
	int ret, n;

	/* Put every node into recovery mode */
	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE, true);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster \n "));
		return -1;
	}

	/* Run the "startrecovery" event script cluster-wide */
	ret = run_startrecovery_eventscript(rec, nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster \n "));
		return -1;
	}

	/*
	 * Install a fresh generation number in the vnnmap of this node
	 * only.  Leaving the cluster with inconsistent generation ids is
	 * deliberate: if recovery is abandoned at any later point we can
	 * simply return, and recovery will be re-entered from scratch.
	 */
	generation = new_generation();
	vnnmap->generation = generation;
	ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u \n ", pnn));
		return -1;
	}

	/*
	 * Database generations are updated when the transaction is
	 * committed, so the final generation doubles as the transaction id.
	 */
	generation = new_generation();

	data.dptr = (void *)&generation;
	data.dsize = sizeof(uint32_t);

	nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
				      nodes, 0,
				      CONTROL_TIMEOUT(), false, data,
				      NULL,
				      transaction_start_fail_callback,
				      rec) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed. \n "));
		/* Best effort: cancel whatever transactions did get started */
		if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
					      nodes, 0,
					      CONTROL_TIMEOUT(), false, tdb_null,
					      NULL,
					      NULL,
					      NULL) != 0) {
			DEBUG(DEBUG_ERR, (" Failed to cancel recovery transaction \n "));
		}
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " started transactions on all nodes \n "));

	for (n = 0; n < dbmap->num; n++) {
		ret = recover_database(rec, mem_ctx,
				       dbmap->dbs[n].db_id,
				       dbmap->dbs[n].flags & CTDB_DB_FLAGS_PERSISTENT,
				       pnn, nodemap, generation);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x \n ", dbmap->dbs[n].db_id));
			return -1;
		}
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits \n "));

	/* Commit the recovered contents everywhere */
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
				      nodes, 0,
				      CONTROL_TIMEOUT(), false, data,
				      NULL, NULL,
				      NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed. \n "));
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases \n "));

	/*
	 * Build a new vnnmap from the nodes that are not flagged inactive
	 * and that have the LMASTER capability.
	 */
	new_vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
	CTDB_NO_MEMORY(ctdb, new_vnnmap);
	new_vnnmap->generation = generation;
	new_vnnmap->size = 0;
	new_vnnmap->map = talloc_zero_array(new_vnnmap, uint32_t, new_vnnmap->size);
	CTDB_NO_MEMORY(ctdb, new_vnnmap->map);

	for (n = 0; n < nodemap->num; n++) {
		if (nodemap->nodes[n].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (!ctdb_node_has_capabilities(rec->caps,
						ctdb->nodes[n]->pnn,
						CTDB_CAP_LMASTER)) {
			/* Not eligible to be an lmaster */
			DEBUG(DEBUG_DEBUG, (" Node %d cant be a LMASTER, skipping it \n ", n));
			continue;
		}

		/* Grow the map by one and append this node at the end */
		new_vnnmap->size++;
		new_vnnmap->map = talloc_realloc(new_vnnmap, new_vnnmap->map, uint32_t, new_vnnmap->size);
		CTDB_NO_MEMORY(ctdb, new_vnnmap->map);
		new_vnnmap->map[new_vnnmap->size - 1] = nodemap->nodes[n].pnn;
	}

	if (new_vnnmap->size == 0) {
		DEBUG(DEBUG_NOTICE, (" No suitable lmasters found. Adding local node (recmaster) anyway. \n "));
		new_vnnmap->size++;
		new_vnnmap->map = talloc_realloc(new_vnnmap, new_vnnmap->map, uint32_t, new_vnnmap->size);
		CTDB_NO_MEMORY(ctdb, new_vnnmap->map);
		new_vnnmap->map[0] = pnn;
	}

	/* Push the new vnnmap out to all nodes */
	ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, new_vnnmap, mem_ctx);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes \n "));
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap \n "));

	/* Leave recovery mode again */
	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL, false);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster \n "));
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode \n "));

	return 0;
}
/*
 * We are the recmaster and a recovery is needed: run one.
 *
 * rec->need_recovery is set before anything else so that a failed attempt
 * is retried.  Once ctdb_op_begin(rec->recovery) has succeeded, EVERY exit
 * from this function must pass through ctdb_op_end(rec->recovery) -- either
 * on the success path or via the "fail" label -- so that the recovery
 * operation is never left marked as started.
 *
 * Returns 0 after a complete recovery, -1 otherwise.
 */
static int do_recovery(struct ctdb_recoverd *rec,
		       TALLOC_CTX *mem_ctx, uint32_t pnn,
		       struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
{
	struct ctdb_context *ctdb = rec->ctdb;
	int i, ret;
	struct ctdb_dbid_map_old *dbmap;
	bool self_ban;
	bool par_recovery;

	DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery \n "));

	/* Check if the current node is still the recmaster.  It's possible
	 * that re-election has changed the recmaster.
	 */
	if (pnn != rec->recmaster) {
		DEBUG(DEBUG_NOTICE,
		      (" Recovery master changed to %u, aborting recovery \n ",
		       rec->recmaster));
		return -1;
	}

	/* if recovery fails, force it again */
	rec->need_recovery = true;

	if (!ctdb_op_begin(rec->recovery)) {
		return -1;
	}

	if (rec->election_timeout) {
		/* an election is in progress */
		DEBUG(DEBUG_ERR, (" do_recovery called while election in progress - try again later \n "));
		goto fail;
	}

	ban_misbehaving_nodes(rec, &self_ban);
	if (self_ban) {
		DEBUG(DEBUG_NOTICE, (" This node was banned, aborting recovery \n "));
		goto fail;
	}

	if (ctdb->recovery_lock != NULL) {
		if (ctdb_recovery_have_lock(rec)) {
			DEBUG(DEBUG_NOTICE, (" Already holding recovery lock \n "));
		} else {
			DEBUG(DEBUG_NOTICE, (" Attempting to take recovery lock (%s) \n ",
					     ctdb->recovery_lock));
			if (!ctdb_recovery_lock(rec)) {
				if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
					/* If ctdb is trying first recovery, it's
					 * possible that current node does not know
					 * yet who the recmaster is.
					 */
					DEBUG(DEBUG_ERR, (" Unable to get recovery lock "
							  " - retrying recovery \n "));
					goto fail;
				}

				DEBUG(DEBUG_ERR, (" Unable to get recovery lock - aborting recovery "
						  " and ban ourself for %u seconds \n ",
						  ctdb->tunable.recovery_ban_period));
				ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
				goto fail;
			}
			DEBUG(DEBUG_NOTICE,
			      (" Recovery lock taken successfully by recovery daemon \n "));
		}
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u \n ", rec->last_culprit_node));

	/* get a list of all databases */
	ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u \n ", pnn));
		goto fail;
	}

	/* we do the db creation before we set the recovery mode, so the
	   freeze happens on all databases we will be dealing with. */

	/* verify that we have all the databases any other node has */
	ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases \n "));
		goto fail;
	}

	/* verify that all other nodes have all our databases */
	ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases \n "));
		goto fail;
	}
	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases \n "));

	/* update the database priority for all remote databases;
	   a failure here is logged but does not abort the recovery */
	ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes \n "));
	}
	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases \n "));

	/* Retrieve capabilities from all connected nodes */
	ret = update_capabilities(rec, nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities. \n "));
		/* must end the recovery operation begun above */
		goto fail;
	}

	/*
	  update all nodes to have the same flags that we have
	 */
	for (i = 0; i < nodemap->num; i++) {
		if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
			continue;
		}

		ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
		if (ret != 0) {
			if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
				DEBUG(DEBUG_WARNING, (__location__ " Unable to update flags on inactive node %d \n ", i));
			} else {
				DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d \n ", i));
				/* must end the recovery operation begun above */
				goto fail;
			}
		}
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags \n "));

	/* Check if all participating nodes have parallel recovery capability */
	par_recovery = true;
	for (i = 0; i < nodemap->num; i++) {
		if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		if (!(rec->caps[i].capabilities &
		      CTDB_CAP_PARALLEL_RECOVERY)) {
			par_recovery = false;
			break;
		}
	}

	if (par_recovery) {
		ret = db_recovery_parallel(rec, mem_ctx);
	} else {
		ret = db_recovery_serial(rec, mem_ctx, pnn, nodemap, vnnmap,
					 dbmap);
	}

	if (ret != 0) {
		goto fail;
	}

	/* The result is not checked here: do_takeover_run() records a
	   failure itself in rec->need_takeover_run */
	do_takeover_run(rec, nodemap);

	/* execute the "recovered" event script on all nodes */
	ret = run_recovered_eventscript(rec, nodemap, " do_recovery ");
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed. \n "));
		goto fail;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event \n "));

	/* send a message to all clients telling them that the cluster
	   has been reconfigured */
	ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
				       CTDB_SRVID_RECONFIGURE, tdb_null);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message \n "));
		goto fail;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete \n "));

	rec->need_recovery = false;
	ctdb_op_end(rec->recovery);

	/* we managed to complete a full recovery, make sure to forgive
	   any past sins by the nodes that could now participate in the
	   recovery.
	*/
	DEBUG(DEBUG_ERR, (" Resetting ban count to 0 for all nodes \n "));
	for (i = 0; i < nodemap->num; i++) {
		struct ctdb_banning_state *ban_state;

		if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
			continue;
		}

		ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
		if (ban_state == NULL) {
			continue;
		}

		ban_state->count = 0;
	}

	/* We just finished a recovery successfully.
	   We now wait for rerecovery_timeout before we allow
	   another recovery to take place.
	*/
	DEBUG(DEBUG_NOTICE, (" Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds) \n ", ctdb->tunable.rerecovery_timeout));
	ctdb_op_disable(rec->recovery, ctdb->ev,
			ctdb->tunable.rerecovery_timeout);
	return 0;

fail:
	ctdb_op_end(rec->recovery);
	return -1;
}
2007-05-04 02:30:18 +04:00
2007-05-06 22:41:12 +04:00
2007-06-07 13:17:27 +04:00
/*
elections are won by first checking the number of connected nodes , then
2007-09-04 04:33:10 +04:00
the priority time , then the pnn
2007-06-07 13:17:27 +04:00
*/
2007-05-07 00:51:58 +04:00
/*
 * Payload of an election message, as filled in by ctdb_election_data()
 * and evaluated by ctdb_election_win().
 */
struct election_message {
2007-06-07 13:17:27 +04:00
/* Number of nodes in the sender's node map not flagged DISCONNECTED;
 * forced to 0 when the sender lacks the RECMASTER capability */
uint32_t num_connected ;
2007-06-07 12:37:27 +04:00
/* Sender's rec->priority_time, or the current time when the sender
 * lacks the RECMASTER capability */
struct timeval priority_time ;
2007-09-04 04:33:10 +04:00
/* PNN of the sending node; used as the final tie-breaker */
uint32_t pnn ;
2007-10-05 07:28:21 +04:00
/* Sender's own node flags (checked for BANNED / STOPPED) */
uint32_t node_flags ;
2007-05-07 00:51:58 +04:00
} ;
2007-06-07 13:17:27 +04:00
/*
form this nodes election data
*/
static void ctdb_election_data ( struct ctdb_recoverd * rec , struct election_message * em )
{
int ret , i ;
2015-10-29 09:22:48 +03:00
struct ctdb_node_map_old * nodemap ;
2007-06-07 13:17:27 +04:00
struct ctdb_context * ctdb = rec - > ctdb ;
ZERO_STRUCTP ( em ) ;
2007-09-04 04:33:10 +04:00
em - > pnn = rec - > ctdb - > pnn ;
2007-06-07 13:17:27 +04:00
em - > priority_time = rec - > priority_time ;
ret = ctdb_ctrl_getnodemap ( ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE , rec , & nodemap ) ;
if ( ret ! = 0 ) {
2013-10-30 04:32:28 +04:00
DEBUG ( DEBUG_ERR , ( __location__ " unable to get node map \n " ) ) ;
2007-06-07 13:17:27 +04:00
return ;
}
2009-07-17 05:37:03 +04:00
rec - > node_flags = nodemap - > nodes [ ctdb - > pnn ] . flags ;
em - > node_flags = rec - > node_flags ;
2007-06-07 13:17:27 +04:00
for ( i = 0 ; i < nodemap - > num ; i + + ) {
if ( ! ( nodemap - > nodes [ i ] . flags & NODE_FLAGS_DISCONNECTED ) ) {
em - > num_connected + + ;
}
}
2008-05-06 07:56:56 +04:00
/* we shouldnt try to win this election if we cant be a recmaster */
if ( ( ctdb - > capabilities & CTDB_CAP_RECMASTER ) = = 0 ) {
em - > num_connected = 0 ;
em - > priority_time = timeval_current ( ) ;
}
2007-06-07 13:17:27 +04:00
talloc_free ( nodemap ) ;
}
/*
see if the given election data wins
*/
static bool ctdb_election_win ( struct ctdb_recoverd * rec , struct election_message * em )
{
struct election_message myem ;
2007-10-05 07:28:21 +04:00
int cmp = 0 ;
2007-06-07 13:17:27 +04:00
ctdb_election_data ( rec , & myem ) ;
2015-07-27 00:02:57 +03:00
/* we cant win if we don't have the recmaster capability */
2008-05-06 07:56:56 +04:00
if ( ( rec - > ctdb - > capabilities & CTDB_CAP_RECMASTER ) = = 0 ) {
return false ;
}
2007-10-11 00:16:36 +04:00
/* we cant win if we are banned */
if ( rec - > node_flags & NODE_FLAGS_BANNED ) {
2007-10-15 08:17:49 +04:00
return false ;
2013-06-21 16:06:22 +04:00
}
2007-10-05 07:28:21 +04:00
2009-07-09 08:44:03 +04:00
/* we cant win if we are stopped */
if ( rec - > node_flags & NODE_FLAGS_STOPPED ) {
return false ;
2013-06-21 16:06:22 +04:00
}
2009-07-09 08:44:03 +04:00
2007-10-11 00:16:36 +04:00
/* we will automatically win if the other node is banned */
if ( em - > node_flags & NODE_FLAGS_BANNED ) {
2007-10-15 08:17:49 +04:00
return true ;
2007-10-05 07:28:21 +04:00
}
2009-07-09 08:44:03 +04:00
/* we will automatically win if the other node is banned */
if ( em - > node_flags & NODE_FLAGS_STOPPED ) {
return true ;
}
2007-06-07 13:17:27 +04:00
/* then the longest running node */
if ( cmp = = 0 ) {
2007-06-07 13:21:55 +04:00
cmp = timeval_compare ( & em - > priority_time , & myem . priority_time ) ;
2007-06-07 13:17:27 +04:00
}
if ( cmp = = 0 ) {
2007-09-04 04:33:10 +04:00
cmp = ( int ) myem . pnn - ( int ) em - > pnn ;
2007-06-07 13:17:27 +04:00
}
return cmp > 0 ;
}
2007-06-07 09:18:55 +04:00
/*
send out an election request
*/
2013-10-29 09:38:42 +04:00
static int send_election_request ( struct ctdb_recoverd * rec , uint32_t pnn )
2007-05-07 00:51:58 +04:00
{
int ret ;
TDB_DATA election_data ;
struct election_message emsg ;
uint64_t srvid ;
2007-06-07 12:37:27 +04:00
struct ctdb_context * ctdb = rec - > ctdb ;
2007-10-11 00:16:36 +04:00
2015-10-29 09:51:52 +03:00
srvid = CTDB_SRVID_ELECTION ;
2007-05-07 00:51:58 +04:00
2007-06-07 13:17:27 +04:00
ctdb_election_data ( rec , & emsg ) ;
2007-05-07 00:51:58 +04:00
election_data . dsize = sizeof ( struct election_message ) ;
election_data . dptr = ( unsigned char * ) & emsg ;
2013-10-29 09:38:42 +04:00
/* first we assume we will win the election and set
recoverymaster to be ourself on the current node
*/
2015-10-23 07:27:12 +03:00
ret = ctdb_ctrl_setrecmaster ( ctdb , CONTROL_TIMEOUT ( ) ,
CTDB_CURRENT_NODE , pnn ) ;
2013-10-29 09:38:42 +04:00
if ( ret ! = 0 ) {
2015-10-23 07:27:12 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " failed to set recmaster \n " ) ) ;
2013-10-29 09:38:42 +04:00
return - 1 ;
}
2015-10-23 06:32:41 +03:00
rec - > recmaster = pnn ;
2013-10-29 09:38:42 +04:00
2007-05-07 00:51:58 +04:00
/* send an election message to all active nodes */
2009-07-17 05:37:03 +04:00
DEBUG ( DEBUG_INFO , ( __location__ " Send election request to all active nodes \n " ) ) ;
2013-11-11 05:39:27 +04:00
return ctdb_client_send_message ( ctdb , CTDB_BROADCAST_ALL , srvid , election_data ) ;
2007-05-07 00:51:58 +04:00
}
2007-11-13 02:27:44 +03:00
/*
we think we are winning the election - send a broadcast election request
*/
2015-10-26 08:50:09 +03:00
static void election_send_request ( struct tevent_context * ev ,
struct tevent_timer * te ,
struct timeval t , void * p )
2007-11-13 02:27:44 +03:00
{
struct ctdb_recoverd * rec = talloc_get_type ( p , struct ctdb_recoverd ) ;
int ret ;
2013-10-29 09:38:42 +04:00
ret = send_election_request ( rec , ctdb_get_pnn ( rec - > ctdb ) ) ;
2007-11-13 02:27:44 +03:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( " Failed to send election request! \n " ) ) ;
2007-11-13 02:27:44 +03:00
}
2015-10-23 08:03:38 +03:00
TALLOC_FREE ( rec - > send_election_te ) ;
2007-11-13 02:27:44 +03:00
}
2008-04-01 08:34:54 +04:00
/*
handler for memory dumps
*/
2015-04-08 07:38:26 +03:00
static void mem_dump_handler ( uint64_t srvid , TDB_DATA data , void * private_data )
2008-04-01 08:34:54 +04:00
{
2015-04-08 07:38:26 +03:00
struct ctdb_recoverd * rec = talloc_get_type (
private_data , struct ctdb_recoverd ) ;
struct ctdb_context * ctdb = rec - > ctdb ;
2008-04-01 08:34:54 +04:00
TALLOC_CTX * tmp_ctx = talloc_new ( ctdb ) ;
TDB_DATA * dump ;
int ret ;
2015-10-29 06:32:49 +03:00
struct ctdb_srvid_message * rd ;
2008-04-01 08:34:54 +04:00
2015-10-29 06:32:49 +03:00
if ( data . dsize ! = sizeof ( struct ctdb_srvid_message ) ) {
2008-04-01 08:34:54 +04:00
DEBUG ( DEBUG_ERR , ( __location__ " Wrong size of return address. \n " ) ) ;
2008-09-16 03:00:48 +04:00
talloc_free ( tmp_ctx ) ;
2008-04-01 08:34:54 +04:00
return ;
}
2015-10-29 06:32:49 +03:00
rd = ( struct ctdb_srvid_message * ) data . dptr ;
2008-04-01 08:34:54 +04:00
dump = talloc_zero ( tmp_ctx , TDB_DATA ) ;
if ( dump = = NULL ) {
DEBUG ( DEBUG_ERR , ( __location__ " Failed to allocate memory for memdump \n " ) ) ;
talloc_free ( tmp_ctx ) ;
return ;
}
ret = ctdb_dump_memory ( ctdb , dump ) ;
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " ctdb_dump_memory() failed \n " ) ) ;
talloc_free ( tmp_ctx ) ;
return ;
}
DEBUG ( DEBUG_ERR , ( " recovery master memory dump \n " ) ) ;
2010-06-02 03:45:21 +04:00
ret = ctdb_client_send_message ( ctdb , rd - > pnn , rd - > srvid , * dump ) ;
2008-04-01 08:34:54 +04:00
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( " Failed to send rd memdump reply message \n " ) ) ;
2008-09-16 03:00:48 +04:00
talloc_free ( tmp_ctx ) ;
2008-04-01 08:34:54 +04:00
return ;
}
talloc_free ( tmp_ctx ) ;
}
2009-06-01 08:18:34 +04:00
/*
handler for reload_nodes
*/
2015-04-08 07:38:26 +03:00
static void reload_nodes_handler ( uint64_t srvid , TDB_DATA data ,
void * private_data )
2009-06-01 08:18:34 +04:00
{
2015-04-08 07:38:26 +03:00
struct ctdb_recoverd * rec = talloc_get_type (
private_data , struct ctdb_recoverd ) ;
2009-06-01 08:18:34 +04:00
DEBUG ( DEBUG_ERR , ( __location__ " Reload nodes file from recovery daemon \n " ) ) ;
2013-10-14 06:54:39 +04:00
ctdb_load_nodes_file ( rec - > ctdb ) ;
2009-06-01 08:18:34 +04:00
}
2009-10-06 05:11:32 +04:00
2015-04-08 07:38:26 +03:00
/*
 * Message handler that registers a node as a forced rebalance target.
 *
 * Ignored unless this node is the recmaster.  data must be exactly one
 * uint32_t holding the PNN to add.  The PNN is appended to
 * rec->force_rebalance_nodes by building a new array (allocated on rec)
 * and freeing the old one; duplicates are allowed.
 */
static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
					void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(
		private_data, struct ctdb_recoverd);
	struct ctdb_context *ctdb = rec->ctdb;
	uint32_t pnn;
	uint32_t *t;
	int len;

	if (rec->recmaster != ctdb_get_pnn(ctdb)) {
		return;
	}

	if (data.dsize != sizeof(uint32_t)) {
		/* dsize and sizeof are both size_t, hence %zu */
		DEBUG(DEBUG_ERR, (__location__ " Incorrect size of node rebalance message. Was %zu but expected %zu bytes \n ", data.dsize, sizeof(uint32_t)));
		return;
	}

	/* The payload is a byte buffer: copy the value out rather than
	 * dereferencing it through a cast pointer */
	memcpy(&pnn, data.dptr, sizeof(pnn));

	DEBUG(DEBUG_NOTICE, (" Setting up rebalance of IPs to node %u \n ", pnn));

	/* Copy any existing list of nodes into a new, one-larger array.
	 * A fresh array is used instead of a realloc so that the old
	 * array is explicitly freed below.
	 */
	len = (rec->force_rebalance_nodes != NULL) ?
		talloc_array_length(rec->force_rebalance_nodes) :
		0;

	/* This allows duplicates to be added but they don't cause
	 * harm, so no attempt is made to filter them out.
	 */
	t = talloc_zero_array(rec, uint32_t, len + 1);
	CTDB_NO_MEMORY_VOID(ctdb, t);
	if (len > 0) {
		memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
	}
	t[len] = pnn;

	talloc_free(rec->force_rebalance_nodes);

	rec->force_rebalance_nodes = t;
}
2015-04-08 07:38:26 +03:00
/*
 * Message handler: record an updated public IP assignment.
 *
 * The payload must be exactly one struct ctdb_public_ip.  The message
 * is ignored unless this node is the recovery master; otherwise the
 * payload is handed to update_ip_assignment_tree().
 *
 * srvid        - unused
 * data         - message payload (struct ctdb_public_ip)
 * private_data - the struct ctdb_recoverd registered with the handler
 */
static void recd_update_ip_handler(uint64_t srvid, TDB_DATA data,
				   void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(
		private_data, struct ctdb_recoverd);
	struct ctdb_public_ip *ip;

	if (rec->recmaster != rec->ctdb->pnn) {
		DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
		return;
	}

	if (data.dsize != sizeof(struct ctdb_public_ip)) {
		/* size_t arguments need %zu, not the signed %zd */
		DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zu but expected %zu bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
		return;
	}

	ip = (struct ctdb_public_ip *)data.dptr;

	update_ip_assignment_tree(rec->ctdb, ip);
}
2015-02-06 05:05:12 +03:00
/*
 * Disable an operation for the requested time and reply to the sender.
 *
 * The payload must be exactly one struct ctdb_disable_message.  Its
 * timeout field is passed to ctdb_op_disable() for op_state.  A reply
 * carrying a single int32_t is then sent with srvid_request_reply():
 * this node's PNN if ctdb_op_disable() returned 0, otherwise the
 * non-zero value it returned.
 *
 * No reply is sent when the payload fails validation.
 *
 * ctdb     - ctdb context (event context and local PNN are taken from it)
 * data     - message payload (struct ctdb_disable_message)
 * op_state - state of the operation to be disabled
 */
static void srvid_disable_and_reply(struct ctdb_context *ctdb,
				    TDB_DATA data,
				    struct ctdb_op_state *op_state)
{
	struct ctdb_disable_message *r;
	uint32_t timeout;
	TDB_DATA result;
	int32_t ret = 0;

	/* Validate input data */
	if (data.dsize != sizeof(struct ctdb_disable_message)) {
		/* Report the size that is actually being checked against */
		DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
				 "expecting %lu\n", (long unsigned)data.dsize,
				 (long unsigned)sizeof(struct ctdb_disable_message)));
		return;
	}
	if (data.dptr == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
		return;
	}

	r = (struct ctdb_disable_message *)data.dptr;
	timeout = r->timeout;

	ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
	if (ret != 0) {
		goto done;
	}

	/* Returning our PNN tells the caller that we succeeded */
	ret = ctdb_get_pnn(ctdb);
done:
	result.dsize = sizeof(int32_t);
	result.dptr  = (uint8_t *)&ret;
	/* The disable message is passed on cast to a ctdb_srvid_message so
	 * that srvid_request_reply() can address the reply. */
	srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
}
2015-04-08 07:38:26 +03:00
/*
 * Message handler: disable takeover runs and reply to the requester.
 * Delegates to srvid_disable_and_reply() with rec->takeover_run.
 */
static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
					  void *private_data)
{
	struct ctdb_recoverd *recoverd;

	recoverd = talloc_get_type(private_data, struct ctdb_recoverd);

	srvid_disable_and_reply(recoverd->ctdb, data, recoverd->takeover_run);
}
2015-02-06 07:03:03 +03:00
/* Backward compatibility for this SRVID */
2015-04-08 07:38:26 +03:00
static void disable_ip_check_handler ( uint64_t srvid , TDB_DATA data ,
void * private_data )
2013-08-28 05:32:54 +04:00
{
2015-04-08 07:38:26 +03:00
struct ctdb_recoverd * rec = talloc_get_type (
private_data , struct ctdb_recoverd ) ;
2015-02-06 07:03:03 +03:00
uint32_t timeout ;
2013-08-28 05:32:54 +04:00
if ( data . dsize ! = sizeof ( uint32_t ) ) {
DEBUG ( DEBUG_ERR , ( __location__ " Wrong size for data :%lu "
" expecting %lu \n " , ( long unsigned ) data . dsize ,
( long unsigned ) sizeof ( uint32_t ) ) ) ;
return ;
}
if ( data . dptr = = NULL ) {
DEBUG ( DEBUG_ERR , ( __location__ " No data received \n " ) ) ;
return ;
}
2015-02-06 07:03:03 +03:00
timeout = * ( ( uint32_t * ) data . dptr ) ;
2013-08-28 05:32:54 +04:00
2015-04-08 07:38:26 +03:00
ctdb_op_disable ( rec - > takeover_run , rec - > ctdb - > ev , timeout ) ;
2013-08-28 05:32:54 +04:00
}
2009-10-06 05:11:32 +04:00
2015-04-08 07:38:26 +03:00
/*
 * Message handler: disable recoveries and reply to the requester.
 * Delegates to srvid_disable_and_reply() with rec->recovery.
 */
static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
				       void *private_data)
{
	struct ctdb_recoverd *recoverd;

	recoverd = talloc_get_type(private_data, struct ctdb_recoverd);

	srvid_disable_and_reply(recoverd->ctdb, data, recoverd->recovery);
}
2009-07-02 07:00:26 +04:00
/*
2013-08-16 14:10:10 +04:00
handler for ip reallocate , just add it to the list of requests and
2009-07-02 07:00:26 +04:00
handle this later in the monitor_cluster loop so we do not recurse
2013-08-16 14:10:10 +04:00
with other requests to takeover_run ( )
2009-07-02 07:00:26 +04:00
*/
2015-04-08 07:38:26 +03:00
static void ip_reallocate_handler ( uint64_t srvid , TDB_DATA data ,
void * private_data )
2009-07-02 07:00:26 +04:00
{
2015-10-29 06:32:49 +03:00
struct ctdb_srvid_message * request ;
2015-04-08 07:38:26 +03:00
struct ctdb_recoverd * rec = talloc_get_type (
private_data , struct ctdb_recoverd ) ;
2009-07-02 07:00:26 +04:00
2015-10-29 06:32:49 +03:00
if ( data . dsize ! = sizeof ( struct ctdb_srvid_message ) ) {
2009-07-02 07:00:26 +04:00
DEBUG ( DEBUG_ERR , ( __location__ " Wrong size of return address. \n " ) ) ;
return ;
}
2015-10-29 06:32:49 +03:00
request = ( struct ctdb_srvid_message * ) data . dptr ;
2009-07-02 07:00:26 +04:00
2015-04-08 07:38:26 +03:00
srvid_request_add ( rec - > ctdb , & rec - > reallocate_requests , request ) ;
2009-07-02 07:00:26 +04:00
}
2013-08-16 14:02:34 +04:00
static void process_ipreallocate_requests ( struct ctdb_context * ctdb ,
struct ctdb_recoverd * rec )
2009-07-02 07:00:26 +04:00
{
TDB_DATA result ;
int32_t ret ;
2013-11-22 06:57:03 +04:00
struct srvid_requests * current ;
2009-07-02 07:00:26 +04:00
2013-11-22 06:57:03 +04:00
/* Only process requests that are currently pending. More
* might come in while the takeover run is in progress and
* they will need to be processed later since they might
* be in response flag changes .
*/
current = rec - > reallocate_requests ;
rec - > reallocate_requests = NULL ;
2016-05-03 08:35:08 +03:00
if ( do_takeover_run ( rec , rec - > nodemap ) ) {
2015-10-28 12:04:41 +03:00
ret = ctdb_get_pnn ( ctdb ) ;
} else {
ret = - 1 ;
2010-01-19 10:42:48 +03:00
}
2009-07-02 07:00:26 +04:00
result . dsize = sizeof ( int32_t ) ;
result . dptr = ( uint8_t * ) & ret ;
2013-11-22 06:57:03 +04:00
srvid_requests_reply ( ctdb , & current , result ) ;
2009-07-02 07:00:26 +04:00
}
2009-06-01 08:18:34 +04:00
2016-03-17 09:26:30 +03:00
/*
 * Handler for assigning banning credits.
 *
 * The payload is a single uint32_t PNN.  On the recovery master the
 * culprit count of that node is set to rec->nodemap->num; on any other
 * node the message is ignored.
 */
static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *recoverd;
	uint32_t target;

	recoverd = talloc_get_type(private_data, struct ctdb_recoverd);

	/* Only the recmaster acts on this message */
	if (recoverd->ctdb->pnn != recoverd->recmaster) {
		return;
	}

	if (data.dsize != sizeof(uint32_t)) {
		DEBUG(DEBUG_ERR, (__location__ " invalid data size %zu\n",
				  data.dsize));
		return;
	}

	target = *(uint32_t *)data.dptr;

	ctdb_set_culprit_count(recoverd, target, recoverd->nodemap->num);
}
2009-06-01 08:18:34 +04:00
2007-05-07 00:51:58 +04:00
/*
handler for recovery master elections
*/
2015-04-08 07:38:26 +03:00
static void election_handler ( uint64_t srvid , TDB_DATA data , void * private_data )
2007-05-07 00:51:58 +04:00
{
2015-04-08 07:38:26 +03:00
struct ctdb_recoverd * rec = talloc_get_type (
private_data , struct ctdb_recoverd ) ;
struct ctdb_context * ctdb = rec - > ctdb ;
2007-05-07 00:51:58 +04:00
int ret ;
struct election_message * em = ( struct election_message * ) data . dptr ;
2013-11-01 07:34:20 +04:00
/* Ignore election packets from ourself */
if ( ctdb - > pnn = = em - > pnn ) {
return ;
}
2007-11-13 02:27:44 +03:00
/* we got an election packet - update the timeout for the election */
talloc_free ( rec - > election_timeout ) ;
2015-10-26 08:50:09 +03:00
rec - > election_timeout = tevent_add_timer (
ctdb - > ev , ctdb ,
fast_start ?
timeval_current_ofs ( 0 , 500000 ) :
timeval_current_ofs ( ctdb - > tunable . election_timeout , 0 ) ,
ctdb_election_timeout , rec ) ;
2007-11-13 02:27:44 +03:00
2007-05-07 00:51:58 +04:00
/* someone called an election. check their election data
and if we disagree and we would rather be the elected node ,
send a new election message to all other nodes
*/
2007-06-07 13:17:27 +04:00
if ( ctdb_election_win ( rec , em ) ) {
2007-11-13 02:27:44 +03:00
if ( ! rec - > send_election_te ) {
2015-10-26 08:50:09 +03:00
rec - > send_election_te = tevent_add_timer (
ctdb - > ev , rec ,
timeval_current_ofs ( 0 , 500000 ) ,
election_send_request , rec ) ;
2007-05-07 00:51:58 +04:00
}
return ;
}
2014-12-09 05:50:22 +03:00
2007-11-13 02:27:44 +03:00
/* we didn't win */
2015-03-31 05:59:02 +03:00
TALLOC_FREE ( rec - > send_election_te ) ;
2007-05-07 00:51:58 +04:00
2015-03-31 05:59:49 +03:00
/* Release the recovery lock file */
2016-05-24 07:54:39 +03:00
if ( ctdb_recovery_have_lock ( rec ) ) {
ctdb_recovery_unlock ( rec ) ;
2007-05-23 08:35:19 +04:00
}
2015-06-11 08:49:25 +03:00
clear_ip_assignment_tree ( ctdb ) ;
2007-05-07 00:51:58 +04:00
/* ok, let that guy become recmaster then */
2015-10-23 07:27:12 +03:00
ret = ctdb_ctrl_setrecmaster ( ctdb , CONTROL_TIMEOUT ( ) ,
CTDB_CURRENT_NODE , em - > pnn ) ;
2007-05-07 00:51:58 +04:00
if ( ret ! = 0 ) {
2015-10-23 07:27:12 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " failed to set recmaster " ) ) ;
2007-05-07 00:51:58 +04:00
return ;
}
2015-10-23 06:32:41 +03:00
rec - > recmaster = em - > pnn ;
2007-05-07 00:51:58 +04:00
return ;
}
2007-06-07 09:18:55 +04:00
/*
force the start of the election process
*/
2008-03-03 01:19:30 +03:00
static void force_election ( struct ctdb_recoverd * rec , uint32_t pnn ,
2015-10-29 09:22:48 +03:00
struct ctdb_node_map_old * nodemap )
2007-05-07 00:51:58 +04:00
{
int ret ;
2007-06-07 12:37:27 +04:00
struct ctdb_context * ctdb = rec - > ctdb ;
2007-05-10 03:48:14 +04:00
2009-07-17 05:37:03 +04:00
DEBUG ( DEBUG_INFO , ( __location__ " Force an election \n " ) ) ;
2007-05-10 03:48:14 +04:00
/* set all nodes to recovery mode to stop all internode traffic */
2015-10-06 03:52:06 +03:00
ret = set_recovery_mode ( ctdb , rec , nodemap , CTDB_RECOVERY_ACTIVE , false ) ;
2008-07-07 02:50:12 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to set recovery mode to active on cluster \n " ) ) ;
2007-05-10 03:48:14 +04:00
return ;
}
2007-11-13 02:27:44 +03:00
talloc_free ( rec - > election_timeout ) ;
2015-10-26 08:50:09 +03:00
rec - > election_timeout = tevent_add_timer (
ctdb - > ev , ctdb ,
fast_start ?
timeval_current_ofs ( 0 , 500000 ) :
timeval_current_ofs ( ctdb - > tunable . election_timeout , 0 ) ,
ctdb_election_timeout , rec ) ;
2007-11-13 02:27:44 +03:00
2013-10-29 09:38:42 +04:00
ret = send_election_request ( rec , pnn ) ;
2007-05-07 00:51:58 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " failed to initiate recmaster election " ) ) ;
2007-05-07 00:51:58 +04:00
return ;
}
2007-05-26 08:01:08 +04:00
/* wait for a few seconds to collect all responses */
2007-11-13 02:27:44 +03:00
ctdb_wait_election ( rec ) ;
2007-06-07 09:18:55 +04:00
}
/*
handler for when a node changes its flags
*/
2015-04-08 07:38:26 +03:00
static void monitor_handler ( uint64_t srvid , TDB_DATA data , void * private_data )
2007-06-07 09:18:55 +04:00
{
2015-04-08 07:38:26 +03:00
struct ctdb_recoverd * rec = talloc_get_type (
private_data , struct ctdb_recoverd ) ;
struct ctdb_context * ctdb = rec - > ctdb ;
2007-06-07 09:18:55 +04:00
int ret ;
struct ctdb_node_flag_change * c = ( struct ctdb_node_flag_change * ) data . dptr ;
2015-10-29 09:22:48 +03:00
struct ctdb_node_map_old * nodemap = NULL ;
2007-06-07 09:18:55 +04:00
TALLOC_CTX * tmp_ctx ;
int i ;
if ( data . dsize ! = sizeof ( * c ) ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Invalid data in ctdb_node_flag_change \n " ) ) ;
2007-06-07 09:18:55 +04:00
return ;
}
tmp_ctx = talloc_new ( ctdb ) ;
CTDB_NO_MEMORY_VOID ( ctdb , tmp_ctx ) ;
ret = ctdb_ctrl_getnodemap ( ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE , tmp_ctx , & nodemap ) ;
2007-12-27 02:07:01 +03:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " ctdb_ctrl_getnodemap failed in monitor_handler \n " ) ) ;
2007-12-27 02:07:01 +03:00
talloc_free ( tmp_ctx ) ;
return ;
}
2007-06-07 09:18:55 +04:00
for ( i = 0 ; i < nodemap - > num ; i + + ) {
2007-09-04 04:33:10 +04:00
if ( nodemap - > nodes [ i ] . pnn = = c - > pnn ) break ;
2007-06-07 09:18:55 +04:00
}
if ( i = = nodemap - > num ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_CRIT , ( __location__ " Flag change for non-existant node %u \n " , c - > pnn ) ) ;
2007-06-07 09:18:55 +04:00
talloc_free ( tmp_ctx ) ;
return ;
}
2013-07-11 07:01:13 +04:00
if ( c - > old_flags ! = c - > new_flags ) {
DEBUG ( DEBUG_NOTICE , ( " Node %u has changed flags - now 0x%x was 0x%x \n " , c - > pnn , c - > new_flags , c - > old_flags ) ) ;
2007-06-07 09:18:55 +04:00
}
2007-08-21 11:25:15 +04:00
nodemap - > nodes [ i ] . flags = c - > new_flags ;
2007-06-07 09:18:55 +04:00
talloc_free ( tmp_ctx ) ;
2007-05-07 00:51:58 +04:00
}
2007-05-06 22:41:12 +04:00
2008-11-19 06:43:46 +03:00
/*
handler for when we need to push out flag changes ot all other nodes
*/
2015-04-08 07:38:26 +03:00
static void push_flags_handler ( uint64_t srvid , TDB_DATA data ,
void * private_data )
2008-11-19 06:43:46 +03:00
{
2015-04-08 07:38:26 +03:00
struct ctdb_recoverd * rec = talloc_get_type (
private_data , struct ctdb_recoverd ) ;
struct ctdb_context * ctdb = rec - > ctdb ;
2008-11-19 06:43:46 +03:00
int ret ;
struct ctdb_node_flag_change * c = ( struct ctdb_node_flag_change * ) data . dptr ;
2015-10-29 09:22:48 +03:00
struct ctdb_node_map_old * nodemap = NULL ;
2009-10-09 17:47:49 +04:00
TALLOC_CTX * tmp_ctx = talloc_new ( ctdb ) ;
uint32_t * nodes ;
2008-11-19 06:43:46 +03:00
2009-10-09 17:47:49 +04:00
/* read the node flags from the recmaster */
2015-10-23 07:33:01 +03:00
ret = ctdb_ctrl_getnodemap ( ctdb , CONTROL_TIMEOUT ( ) , rec - > recmaster ,
tmp_ctx , & nodemap ) ;
2009-10-09 17:47:49 +04:00
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " Unable to get nodemap from node %u \n " , c - > pnn ) ) ;
talloc_free ( tmp_ctx ) ;
return ;
2008-11-19 06:43:46 +03:00
}
2009-10-09 17:47:49 +04:00
if ( c - > pnn > = nodemap - > num ) {
DEBUG ( DEBUG_ERR , ( __location__ " Nodemap from recmaster does not contain node %d \n " , c - > pnn ) ) ;
talloc_free ( tmp_ctx ) ;
return ;
}
/* send the flags update to all connected nodes */
nodes = list_of_connected_nodes ( ctdb , nodemap , tmp_ctx , true ) ;
if ( ctdb_client_async_control ( ctdb , CTDB_CONTROL_MODIFY_FLAGS ,
nodes , 0 , CONTROL_TIMEOUT ( ) ,
false , data ,
NULL , NULL ,
NULL ) ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " ctdb_control to modify node flags failed \n " ) ) ;
talloc_free ( tmp_ctx ) ;
return ;
}
talloc_free ( tmp_ctx ) ;
2008-11-19 06:43:46 +03:00
}
2007-06-07 09:18:55 +04:00
2007-08-23 07:48:39 +04:00
2007-08-27 03:40:10 +04:00
/* Shared state for one verify_recmode() pass */
struct verify_recmode_normal_data {
	/* Controls sent by verify_recmode() that have not yet reached
	 * verify_recmode_normal_callback() */
	uint32_t count;
	/* Result of the pass; starts as MONITOR_OK and is changed by the
	 * callback when a node fails to answer or is not in normal mode */
	enum monitor_result status;
};
static void verify_recmode_normal_callback ( struct ctdb_client_control_state * state )
{
2007-09-26 08:25:32 +04:00
struct verify_recmode_normal_data * rmdata = talloc_get_type ( state - > async . private_data , struct verify_recmode_normal_data ) ;
2007-08-27 03:40:10 +04:00
/* one more node has responded with recmode data*/
rmdata - > count - - ;
/* if we failed to get the recmode, then return an error and let
the main loop try again .
*/
if ( state - > state ! = CTDB_CONTROL_DONE ) {
if ( rmdata - > status = = MONITOR_OK ) {
rmdata - > status = MONITOR_FAILED ;
}
return ;
}
/* if we got a response, then the recmode will be stored in the
status field
*/
if ( state - > status ! = CTDB_RECOVERY_NORMAL ) {
2013-06-30 11:57:33 +04:00
DEBUG ( DEBUG_NOTICE , ( " Node:%u was in recovery mode. Start recovery process \n " , state - > c - > hdr . destnode ) ) ;
2007-08-27 03:40:10 +04:00
rmdata - > status = MONITOR_RECOVERY_NEEDED ;
}
return ;
}
/* verify that all nodes are in normal recovery mode */
2015-10-29 09:22:48 +03:00
static enum monitor_result verify_recmode ( struct ctdb_context * ctdb , struct ctdb_node_map_old * nodemap )
2007-08-23 07:48:39 +04:00
{
2007-08-27 03:40:10 +04:00
struct verify_recmode_normal_data * rmdata ;
2007-08-23 13:27:09 +04:00
TALLOC_CTX * mem_ctx = talloc_new ( ctdb ) ;
2007-08-27 03:40:10 +04:00
struct ctdb_client_control_state * state ;
enum monitor_result status ;
int j ;
2007-08-23 07:48:39 +04:00
2007-08-27 03:40:10 +04:00
rmdata = talloc ( mem_ctx , struct verify_recmode_normal_data ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , rmdata ) ;
rmdata - > count = 0 ;
rmdata - > status = MONITOR_OK ;
2007-08-23 07:48:39 +04:00
/* loop over all active nodes and send an async getrecmode call to
them */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
continue ;
}
2007-08-27 03:40:10 +04:00
state = ctdb_ctrl_getrecmode_send ( ctdb , mem_ctx ,
2007-08-23 07:48:39 +04:00
CONTROL_TIMEOUT ( ) ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn ) ;
2007-08-27 03:40:10 +04:00
if ( state = = NULL ) {
/* we failed to send the control, treat this as
an error and try again next iteration
*/
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( " Failed to call ctdb_ctrl_getrecmode_send during monitoring \n " ) ) ;
2007-08-23 13:27:09 +04:00
talloc_free ( mem_ctx ) ;
2007-08-23 07:48:39 +04:00
return MONITOR_FAILED ;
}
2007-08-23 13:27:09 +04:00
2007-08-27 03:40:10 +04:00
/* set up the callback functions */
state - > async . fn = verify_recmode_normal_callback ;
2007-09-26 08:25:32 +04:00
state - > async . private_data = rmdata ;
2007-08-27 03:40:10 +04:00
/* one more control to wait for to complete */
rmdata - > count + + ;
2007-08-23 07:48:39 +04:00
}
2007-08-27 03:40:10 +04:00
/* now wait for up to the maximum number of seconds allowed
or until all nodes we expect a response from has replied
*/
while ( rmdata - > count > 0 ) {
2015-10-26 08:50:09 +03:00
tevent_loop_once ( ctdb - > ev ) ;
2007-08-27 03:40:10 +04:00
}
status = rmdata - > status ;
2007-08-23 13:27:09 +04:00
talloc_free ( mem_ctx ) ;
2007-08-27 03:40:10 +04:00
return status ;
2007-08-23 07:48:39 +04:00
}
2007-08-27 03:40:10 +04:00
2007-08-23 13:27:09 +04:00
struct verify_recmaster_data {
2008-04-21 18:56:27 +04:00
struct ctdb_recoverd * rec ;
2007-08-23 13:27:09 +04:00
uint32_t count ;
2007-09-04 04:33:10 +04:00
uint32_t pnn ;
2007-08-23 13:27:09 +04:00
enum monitor_result status ;
} ;
2007-08-24 04:42:06 +04:00
static void verify_recmaster_callback ( struct ctdb_client_control_state * state )
2007-08-23 13:27:09 +04:00
{
2007-09-26 08:25:32 +04:00
struct verify_recmaster_data * rmdata = talloc_get_type ( state - > async . private_data , struct verify_recmaster_data ) ;
2007-08-23 13:27:09 +04:00
/* one more node has responded with recmaster data*/
rmdata - > count - - ;
/* if we failed to get the recmaster, then return an error and let
the main loop try again .
*/
2007-08-24 04:42:06 +04:00
if ( state - > state ! = CTDB_CONTROL_DONE ) {
2007-08-23 13:27:09 +04:00
if ( rmdata - > status = = MONITOR_OK ) {
rmdata - > status = MONITOR_FAILED ;
}
2007-08-24 04:42:06 +04:00
return ;
2007-08-23 13:27:09 +04:00
}
/* if we got a response, then the recmaster will be stored in the
status field
*/
2007-09-04 04:33:10 +04:00
if ( state - > status ! = rmdata - > pnn ) {
2013-08-14 05:44:12 +04:00
DEBUG ( DEBUG_ERR , ( " Node %d thinks node %d is recmaster. Need a new recmaster election \n " , state - > c - > hdr . destnode , state - > status ) ) ;
2008-04-21 18:56:27 +04:00
ctdb_set_culprit ( rmdata - > rec , state - > c - > hdr . destnode ) ;
2007-08-23 13:27:09 +04:00
rmdata - > status = MONITOR_ELECTION_NEEDED ;
}
2007-08-24 04:42:06 +04:00
return ;
2007-08-23 13:27:09 +04:00
}
/* verify that all nodes agree that we are the recmaster */
2015-10-29 09:22:48 +03:00
static enum monitor_result verify_recmaster ( struct ctdb_recoverd * rec , struct ctdb_node_map_old * nodemap , uint32_t pnn )
2007-08-23 13:27:09 +04:00
{
2008-04-21 18:56:27 +04:00
struct ctdb_context * ctdb = rec - > ctdb ;
2007-08-23 13:27:09 +04:00
struct verify_recmaster_data * rmdata ;
TALLOC_CTX * mem_ctx = talloc_new ( ctdb ) ;
struct ctdb_client_control_state * state ;
enum monitor_result status ;
int j ;
rmdata = talloc ( mem_ctx , struct verify_recmaster_data ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , rmdata ) ;
2008-04-21 18:56:27 +04:00
rmdata - > rec = rec ;
2007-08-23 13:27:09 +04:00
rmdata - > count = 0 ;
2007-09-04 04:33:10 +04:00
rmdata - > pnn = pnn ;
2007-08-23 13:27:09 +04:00
rmdata - > status = MONITOR_OK ;
2015-10-23 07:05:08 +03:00
/* loop over all active nodes and send an async getrecmaster call to
2007-08-23 13:27:09 +04:00
them */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
2015-10-23 07:05:08 +03:00
if ( nodemap - > nodes [ j ] . pnn = = rec - > recmaster ) {
continue ;
}
2007-08-23 13:27:09 +04:00
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
continue ;
}
state = ctdb_ctrl_getrecmaster_send ( ctdb , mem_ctx ,
2007-08-23 13:38:54 +04:00
CONTROL_TIMEOUT ( ) ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn ) ;
2007-08-23 13:27:09 +04:00
if ( state = = NULL ) {
/* we failed to send the control, treat this as
an error and try again next iteration
*/
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( " Failed to call ctdb_ctrl_getrecmaster_send during monitoring \n " ) ) ;
2007-08-23 13:27:09 +04:00
talloc_free ( mem_ctx ) ;
return MONITOR_FAILED ;
}
2007-08-24 04:42:06 +04:00
/* set up the callback functions */
state - > async . fn = verify_recmaster_callback ;
2007-09-26 08:25:32 +04:00
state - > async . private_data = rmdata ;
2007-08-24 04:42:06 +04:00
2007-08-23 13:27:09 +04:00
/* one more control to wait for to complete */
rmdata - > count + + ;
}
/* now wait for up to the maximum number of seconds allowed
or until all nodes we expect a response from has replied
*/
2007-08-23 13:38:54 +04:00
while ( rmdata - > count > 0 ) {
2015-10-26 08:50:09 +03:00
tevent_loop_once ( ctdb - > ev ) ;
2007-08-23 13:27:09 +04:00
}
status = rmdata - > status ;
talloc_free ( mem_ctx ) ;
return status ;
}
2013-02-21 03:43:35 +04:00
static bool interfaces_have_changed ( struct ctdb_context * ctdb ,
struct ctdb_recoverd * rec )
{
2015-10-28 11:43:48 +03:00
struct ctdb_iface_list_old * ifaces = NULL ;
2013-02-21 03:43:35 +04:00
TALLOC_CTX * mem_ctx ;
bool ret = false ;
mem_ctx = talloc_new ( NULL ) ;
/* Read the interfaces from the local node */
if ( ctdb_ctrl_get_ifaces ( ctdb , CONTROL_TIMEOUT ( ) ,
CTDB_CURRENT_NODE , mem_ctx , & ifaces ) ! = 0 ) {
DEBUG ( DEBUG_ERR , ( " Unable to get interfaces from local node %u \n " , ctdb - > pnn ) ) ;
/* We could return an error. However, this will be
* rare so we ' ll decide that the interfaces have
* actually changed , just in case .
*/
talloc_free ( mem_ctx ) ;
return true ;
}
if ( ! rec - > ifaces ) {
/* We haven't been here before so things have changed */
2013-08-15 11:04:01 +04:00
DEBUG ( DEBUG_NOTICE , ( " Initial interface fetched \n " ) ) ;
2013-02-21 03:43:35 +04:00
ret = true ;
} else if ( rec - > ifaces - > num ! = ifaces - > num ) {
/* Number of interfaces has changed */
2013-08-15 11:04:01 +04:00
DEBUG ( DEBUG_NOTICE , ( " Interface count changed from %d to %d \n " ,
rec - > ifaces - > num , ifaces - > num ) ) ;
2013-02-21 03:43:35 +04:00
ret = true ;
} else {
/* See if interface names or link states have changed */
int i ;
for ( i = 0 ; i < rec - > ifaces - > num ; i + + ) {
2015-10-28 11:37:17 +03:00
struct ctdb_iface * iface = & rec - > ifaces - > ifaces [ i ] ;
2013-08-15 11:04:01 +04:00
if ( strcmp ( iface - > name , ifaces - > ifaces [ i ] . name ) ! = 0 ) {
DEBUG ( DEBUG_NOTICE ,
( " Interface in slot %d changed: %s => %s \n " ,
i , iface - > name , ifaces - > ifaces [ i ] . name ) ) ;
ret = true ;
break ;
}
if ( iface - > link_state ! = ifaces - > ifaces [ i ] . link_state ) {
DEBUG ( DEBUG_NOTICE ,
( " Interface %s changed state: %d => %d \n " ,
iface - > name , iface - > link_state ,
ifaces - > ifaces [ i ] . link_state ) ) ;
2013-02-21 03:43:35 +04:00
ret = true ;
break ;
}
}
}
talloc_free ( rec - > ifaces ) ;
rec - > ifaces = talloc_steal ( rec , ifaces ) ;
talloc_free ( mem_ctx ) ;
return ret ;
}
2007-06-07 09:18:55 +04:00
2016-05-03 09:36:37 +03:00
/* Check that the local allocation of public IP addresses is correct
* and do some house - keeping */
static int verify_local_ip_allocation ( struct ctdb_context * ctdb ,
struct ctdb_recoverd * rec ,
uint32_t pnn ,
struct ctdb_node_map_old * nodemap )
2008-07-02 07:55:59 +04:00
{
TALLOC_CTX * mem_ctx = talloc_new ( NULL ) ;
int ret , j ;
2009-12-22 17:21:08 +03:00
bool need_takeover_run = false ;
2015-11-09 07:41:45 +03:00
struct ctdb_public_ip_list_old * ips = NULL ;
2016-05-03 09:36:37 +03:00
/* If we are not the recmaster then do some housekeeping */
if ( rec - > recmaster ! = pnn ) {
/* Ignore any IP reallocate requests - only recmaster
* processes them
*/
TALLOC_FREE ( rec - > reallocate_requests ) ;
/* Clear any nodes that should be force rebalanced in
* the next takeover run . If the recovery master role
* has moved then we don ' t want to process these some
* time in the future .
*/
TALLOC_FREE ( rec - > force_rebalance_nodes ) ;
}
2015-11-09 07:41:45 +03:00
/* Return early if disabled... */
if ( ctdb - > tunable . disable_ip_failover ! = 0 | |
ctdb_op_is_disabled ( rec - > takeover_run ) ) {
return 0 ;
}
2008-07-02 07:55:59 +04:00
2013-02-21 03:43:35 +04:00
if ( interfaces_have_changed ( ctdb , rec ) ) {
2009-12-22 17:21:08 +03:00
need_takeover_run = true ;
}
2015-11-09 08:12:31 +03:00
/* If there are unhosted IPs but this node can host them then
* trigger an IP reallocation */
2012-10-11 08:17:54 +04:00
2015-11-09 08:12:31 +03:00
/* Read *available* IPs from local node */
ret = ctdb_ctrl_get_public_ips_flags (
ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE , mem_ctx ,
CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE , & ips ) ;
2015-11-09 07:41:45 +03:00
if ( ret ! = 0 ) {
2015-11-09 08:12:31 +03:00
DEBUG ( DEBUG_ERR , ( " Unable to retrieve available public IPs \n " ) ) ;
2015-11-09 07:41:45 +03:00
talloc_free ( mem_ctx ) ;
return - 1 ;
}
2012-10-11 08:17:54 +04:00
2015-11-09 07:41:45 +03:00
for ( j = 0 ; j < ips - > num ; j + + ) {
if ( ips - > ips [ j ] . pnn = = - 1 & &
nodemap - > nodes [ pnn ] . flags = = 0 ) {
2015-11-09 08:12:31 +03:00
DEBUG ( DEBUG_WARNING ,
( " Unassigned IP %s can be served by this node \n " ,
ctdb_addr_to_str ( & ips - > ips [ j ] . addr ) ) ) ;
2015-11-09 07:41:45 +03:00
need_takeover_run = true ;
2012-10-11 08:17:54 +04:00
}
2015-11-09 07:41:45 +03:00
}
2012-10-11 08:17:54 +04:00
2015-11-09 07:41:45 +03:00
talloc_free ( ips ) ;
2012-10-11 08:17:54 +04:00
2015-11-09 07:44:15 +03:00
if ( ! ctdb - > do_checkpublicip ) {
goto done ;
}
2015-11-09 08:12:31 +03:00
/* Validate the IP addresses that this node has on network
* interfaces . If there is an inconsistency between reality
* and the state expected by CTDB then try to fix it by
* triggering an IP reallocation or releasing extraneous IP
* addresses . */
/* Read *known* IPs from local node */
ret = ctdb_ctrl_get_public_ips_flags (
ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE , mem_ctx , 0 , & ips ) ;
2015-11-09 07:41:45 +03:00
if ( ret ! = 0 ) {
2015-11-09 08:12:31 +03:00
DEBUG ( DEBUG_ERR , ( " Unable to retrieve known public IPs \n " ) ) ;
2015-11-09 07:41:45 +03:00
talloc_free ( mem_ctx ) ;
return - 1 ;
}
2012-10-11 08:17:54 +04:00
2015-11-09 07:41:45 +03:00
for ( j = 0 ; j < ips - > num ; j + + ) {
if ( ips - > ips [ j ] . pnn = = pnn ) {
2015-11-09 07:44:15 +03:00
if ( ! ctdb_sys_have_ip ( & ips - > ips [ j ] . addr ) ) {
2015-11-09 08:12:31 +03:00
DEBUG ( DEBUG_ERR ,
( " Assigned IP %s not on an interface \n " ,
ctdb_addr_to_str ( & ips - > ips [ j ] . addr ) ) ) ;
2015-11-09 07:41:45 +03:00
need_takeover_run = true ;
}
} else {
2015-11-09 07:44:15 +03:00
if ( ctdb_sys_have_ip ( & ips - > ips [ j ] . addr ) ) {
2015-11-09 08:12:31 +03:00
DEBUG ( DEBUG_ERR ,
( " IP %s incorrectly on an interface - releasing \n " ,
ctdb_addr_to_str ( & ips - > ips [ j ] . addr ) ) ) ;
ret = ctdb_ctrl_release_ip ( ctdb ,
CONTROL_TIMEOUT ( ) ,
CTDB_CURRENT_NODE ,
& ips - > ips [ j ] ) ;
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR ,
( " Failed to release IP address \n " ) ) ;
2010-11-10 04:06:05 +03:00
}
2008-07-02 07:55:59 +04:00
}
}
}
2015-11-09 07:44:15 +03:00
done :
2009-12-22 17:21:08 +03:00
if ( need_takeover_run ) {
2015-10-29 06:32:49 +03:00
struct ctdb_srvid_message rd ;
2009-12-22 17:21:08 +03:00
TDB_DATA data ;
2015-11-09 08:12:31 +03:00
DEBUG ( DEBUG_NOTICE , ( " Trigger takeoverrun \n " ) ) ;
2009-12-22 17:21:08 +03:00
2016-01-11 09:23:12 +03:00
ZERO_STRUCT ( rd ) ;
2009-12-22 17:21:08 +03:00
rd . pnn = ctdb - > pnn ;
rd . srvid = 0 ;
data . dptr = ( uint8_t * ) & rd ;
data . dsize = sizeof ( rd ) ;
2010-06-02 03:45:21 +04:00
ret = ctdb_client_send_message ( ctdb , rec - > recmaster , CTDB_SRVID_TAKEOVER_RUN , data ) ;
2009-12-22 17:21:08 +03:00
if ( ret ! = 0 ) {
2015-11-09 08:12:31 +03:00
DEBUG ( DEBUG_ERR ,
( " Failed to send takeover run request \n " ) ) ;
2009-12-22 17:21:08 +03:00
}
}
2008-07-02 07:55:59 +04:00
talloc_free ( mem_ctx ) ;
return 0 ;
}
2008-12-05 08:32:30 +03:00
/*
 * Per-node callback for the GET_NODEMAP controls issued by
 * get_remote_nodemaps().
 *
 * callback_data is the array of node map pointers; the reply buffer is
 * reparented onto that array and stored in the slot indexed by the
 * replying node's PNN.  Replies from a PNN outside ctdb->num_nodes are
 * logged and dropped.
 */
static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_node_map_old **maps = callback_data;
	void *reply;

	if (node_pnn >= ctdb->num_nodes) {
		DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
		return;
	}

	reply = talloc_steal(maps, outdata.dptr);
	maps[node_pnn] = (struct ctdb_node_map_old *)reply;
}
/*
 * Fetch the node map from each node returned by list_of_active_nodes()
 * for nodemap.
 *
 * Replies are stored into remote_nodemaps by
 * async_getnodemap_callback().
 *
 * Returns 0 on success, -1 if the asynchronous control reports a
 * failure.
 */
static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
	struct ctdb_node_map_old *nodemap,
	struct ctdb_node_map_old **remote_nodemaps)
{
	uint32_t *targets;
	int rc;

	targets = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);

	rc = ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
				       targets, 0,
				       CONTROL_TIMEOUT(), false, tdb_null,
				       async_getnodemap_callback,
				       NULL,
				       remote_nodemaps);
	if (rc == 0) {
		return 0;
	}

	DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
	return -1;
}
2016-04-28 09:58:35 +03:00
/*
  Check that the currently recorded recovery master is still usable.

  Returns true when the recmaster passed every check.  Returns false
  when monitoring should be restarted: either an election was forced
  (recmaster unknown, lacking CTDB_CAP_RECMASTER while we have it,
  deleted, disconnected, or inactive by its own account) or the
  recmaster's nodemap could not be fetched or was unusable (no election
  is forced in that case).

  mem_ctx is the talloc context the recmaster's nodemap is allocated on.
*/
static bool validate_recovery_master(struct ctdb_recoverd *rec,
				     TALLOC_CTX *mem_ctx)
{
	struct ctdb_context *ctdb = rec->ctdb;
	uint32_t pnn = ctdb_get_pnn(ctdb);
	struct ctdb_node_map_old *nodemap = rec->nodemap;
	struct ctdb_node_map_old *recmaster_nodemap = NULL;
	int ret;

	/* When recovery daemon is started, recmaster is set to
	 * "unknown" so it knows to start an election.
	 */
	if (rec->recmaster == CTDB_UNKNOWN_PNN) {
		DEBUG(DEBUG_NOTICE,
		      (" Initial recovery master set - forcing election \n "));
		force_election(rec, pnn, nodemap);
		return false;
	}

	/*
	 * If the current recmaster does not have CTDB_CAP_RECMASTER,
	 * but we have, then force an election and try to become the new
	 * recmaster.
	 */
	if (!ctdb_node_has_capabilities(rec->caps,
					rec->recmaster,
					CTDB_CAP_RECMASTER) &&
	    (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
	    !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
		DEBUG(DEBUG_ERR,
		      (" Current recmaster node %u does not have CAP_RECMASTER, "
		       " but we (node %u) have - force an election \n ",
		       rec->recmaster, pnn));
		force_election(rec, pnn, nodemap);
		return false;
	}

	/* Verify that the master node has not been deleted.  This
	 * should not happen because a node should always be shutdown
	 * before being deleted, causing a new master to be elected
	 * before now.  However, if something strange has happened
	 * then checking here will ensure we don't index beyond the
	 * end of the nodemap array. */
	if (rec->recmaster >= nodemap->num) {
		DEBUG(DEBUG_ERR,
		      (" Recmaster node %u has been deleted. Force election \n ",
		       rec->recmaster));
		force_election(rec, pnn, nodemap);
		return false;
	}

	/* if recovery master is disconnected/deleted we must elect a new recmaster */
	if (nodemap->nodes[rec->recmaster].flags &
	    (NODE_FLAGS_DISCONNECTED | NODE_FLAGS_DELETED)) {
		DEBUG(DEBUG_NOTICE,
		      (" Recmaster node %u is disconnected/deleted. Force election \n ",
		       rec->recmaster));
		force_election(rec, pnn, nodemap);
		return false;
	}

	/* get nodemap from the recovery master to check if it is inactive */
	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
				   mem_ctx, &recmaster_nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,
		      (__location__
		       " Unable to get nodemap from recovery master %u \n ",
		       rec->recmaster));
		/* No election, just error */
		return false;
	}

	/* The reply comes from another node, so only the local nodemap
	 * has been bounds-checked so far.  Make sure the remote nodemap
	 * really has an entry for the recmaster before indexing it. */
	if (recmaster_nodemap == NULL ||
	    rec->recmaster >= recmaster_nodemap->num) {
		DEBUG(DEBUG_ERR,
		      (__location__
		       " Nodemap from recovery master %u has no entry for itself\n",
		       rec->recmaster));
		/* No election, just error */
		return false;
	}

	if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
	    (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
		DEBUG(DEBUG_NOTICE,
		      (" Recmaster node %u is inactive. Force election \n ",
		       rec->recmaster));
		/*
		 * update our nodemap to carry the recmaster's notion of
		 * its own flags, so that we don't keep freezing the
		 * inactive recmaster node...
		 */
		nodemap->nodes[rec->recmaster].flags =
			recmaster_nodemap->nodes[rec->recmaster].flags;
		force_election(rec, pnn, nodemap);
		return false;
	}

	return true;
}
2010-06-22 17:20:23 +04:00
/*
  One iteration of the recovery daemon's monitoring work.

  The function refreshes local state (debug level, tunables, runstate,
  nodemap), handles the case where this node is stopped or banned,
  validates the recovery master and, only when this node is the
  recovery master, cross-checks nodemaps, node flags and vnnmaps against
  the other active nodes.  Any inconsistency triggers do_recovery() or
  force_election() and ends the iteration.  Any failed control also ends
  the iteration; the caller simply invokes main_loop() again later.

  mem_ctx is a per-iteration talloc context owned by the caller; all
  temporary replies are allocated on it.
*/
static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
		      TALLOC_CTX *mem_ctx)
{
	uint32_t pnn;
	struct ctdb_node_map_old *nodemap = NULL;
	struct ctdb_node_map_old **remote_nodemaps = NULL;
	struct ctdb_vnn_map *vnnmap = NULL;
	struct ctdb_vnn_map *remote_vnnmap = NULL;
	uint32_t num_lmasters;
	int32_t debug_level;
	int i, j, ret;
	bool self_ban;

	/* verify that the main daemon is still running */
	if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
		DEBUG(DEBUG_CRIT, (" CTDB daemon is no longer available. Shutting down recovery daemon \n "));
		exit(-1);
	}

	/* ping the local daemon to tell it we are alive */
	ctdb_ctrl_recd_ping(ctdb);

	if (rec->election_timeout) {
		/* an election is in progress */
		return;
	}

	/* read the debug level from the parent and update locally */
	ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent \n "));
		return;
	}
	DEBUGLEVEL = debug_level;

	/* get relevant tunables */
	ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (" Failed to get tunables - retrying \n "));
		return;
	}

	/* get runstate */
	ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
				     CTDB_CURRENT_NODE, &ctdb->runstate);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (" Failed to get runstate - retrying \n "));
		return;
	}

	pnn = ctdb_get_pnn(ctdb);

	/* get nodemap; the previous iteration's copy is released first */
	TALLOC_FREE(rec->nodemap);
	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u \n ", pnn));
		return;
	}
	nodemap = rec->nodemap;

	/* remember our own node flags */
	rec->node_flags = nodemap->nodes[pnn].flags;

	ban_misbehaving_nodes(rec, &self_ban);
	if (self_ban) {
		DEBUG(DEBUG_NOTICE, (" This node was banned, restart main_loop \n "));
		return;
	}

	/* if the local daemon is STOPPED or BANNED, we verify that the databases are
	   also frozen and that the recmode is set to active.
	*/
	if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
		/* If this node has become inactive then we want to
		 * reduce the chances of it taking over the recovery
		 * master role when it becomes active again.  This
		 * helps to stabilise the recovery master role so that
		 * it stays on the most stable node.
		 */
		rec->priority_time = timeval_current();

		ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Failed to read recmode from local node \n "));
			/* Do not act on a stale recovery mode; retry on
			 * the next iteration like every other failed
			 * control in this function. */
			return;
		}

		if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
			DEBUG(DEBUG_ERR, (" Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases \n "));
			ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Failed to activate recovery mode in STOPPED or BANNED state \n "));
				return;
			}
		}

		/* Freeze only once per period of inactivity */
		if (!rec->frozen_on_inactive) {
			ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
					       CTDB_CURRENT_NODE);
			if (ret != 0) {
				DEBUG(DEBUG_ERR,
				      (__location__ " Failed to freeze node "
				       " in STOPPED or BANNED state \n "));
				return;
			}
			rec->frozen_on_inactive = true;
		}

		/* If this node is stopped or banned then it is not the recovery
		 * master, so don't do anything.  This prevents stopped or banned
		 * node from starting election and sending unnecessary controls.
		 */
		return;
	}

	rec->frozen_on_inactive = false;

	/* Retrieve capabilities from all connected nodes */
	ret = update_capabilities(rec, nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities. \n "));
		return;
	}

	if (!validate_recovery_master(rec, mem_ctx)) {
		return;
	}

	/* Check if an IP takeover run is needed and trigger one if
	 * necessary */
	verify_local_ip_allocation(ctdb, rec, pnn, nodemap);

	/* if we are not the recmaster then we do not need to check
	   if recovery is needed
	*/
	if (pnn != rec->recmaster) {
		return;
	}

	/* ensure our local copies of flags are right */
	ret = update_local_flags(rec, nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (" Unable to update local flags \n "));
		return;
	}

	if (ctdb->num_nodes != nodemap->num) {
		DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file \n ", ctdb->num_nodes, nodemap->num));
		ctdb_load_nodes_file(ctdb);
		return;
	}

	/* verify that all active nodes agree that we are the recmaster */
	switch (verify_recmaster(rec, nodemap, pnn)) {
	case MONITOR_RECOVERY_NEEDED:
		/* can not happen */
		return;
	case MONITOR_ELECTION_NEEDED:
		force_election(rec, pnn, nodemap);
		return;
	case MONITOR_OK:
		break;
	case MONITOR_FAILED:
		return;
	}

	/* get the vnnmap */
	ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u \n ", pnn));
		return;
	}

	if (rec->need_recovery) {
		/* a previous recovery didn't finish */
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	}

	/* verify that all active nodes are in normal mode
	   and not in recovery mode
	*/
	switch (verify_recmode(ctdb, nodemap)) {
	case MONITOR_RECOVERY_NEEDED:
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	case MONITOR_FAILED:
		return;
	case MONITOR_ELECTION_NEEDED:
		/* can not happen */
		/* fallthrough */
	case MONITOR_OK:
		break;
	}

	if (ctdb->recovery_lock != NULL) {
		/* We must already hold the recovery lock */
		if (!ctdb_recovery_have_lock(rec)) {
			DEBUG(DEBUG_ERR, (" Failed recovery lock sanity check. Force a recovery \n "));
			ctdb_set_culprit(rec, ctdb->pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}
	}

	/* If recoveries are disabled then there is no use doing any
	 * nodemap or flags checks.  Recoveries might be disabled due
	 * to "reloadnodes", so doing these checks might cause an
	 * unnecessary recovery. */
	if (ctdb_op_is_disabled(rec->recovery)) {
		goto takeover_run_checks;
	}

	/* get the nodemap for all active remote nodes
	 */
	remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
	if (remote_nodemaps == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array \n "));
		return;
	}
	for (i = 0; i < nodemap->num; i++) {
		remote_nodemaps[i] = NULL;
	}
	if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to read remote nodemaps \n "));
		return;
	}

	/* verify that all other nodes have the same nodemap as we have
	 */
	for (j = 0; j < nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		if (remote_nodemaps[j] == NULL) {
			DEBUG(DEBUG_ERR, (__location__ " Did not get a remote nodemap for node %d, restarting monitoring \n ", j));
			ctdb_set_culprit(rec, j);
			return;
		}

		/* if the nodes disagree on how many nodes there are
		   then this is a good reason to try recovery
		 */
		if (remote_nodemaps[j]->num != nodemap->num) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node \n ",
					  nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* if the nodes disagree on which nodes exist and are
		   active, then that is also a good reason to do recovery
		 */
		for (i = 0; i < nodemap->num; i++) {
			if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u). \n ",
						  nodemap->nodes[j].pnn, i,
						  remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
				do_recovery(rec, mem_ctx, pnn, nodemap,
					    vnnmap);
				return;
			}
		}
	}

	/*
	 * Update node flags obtained from each active node. This ensure we have
	 * up-to-date information for all the nodes.
	 */
	for (j = 0; j < nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
	}

	for (j = 0; j < nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		/* verify the flags are consistent
		 */
		for (i = 0; i < nodemap->num; i++) {
			if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
				continue;
			}

			if (nodemap->nodes[i].flags == remote_nodemaps[j]->nodes[i].flags) {
				continue;
			}

			DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x \n ",
					  nodemap->nodes[j].pnn,
					  nodemap->nodes[i].pnn,
					  remote_nodemaps[j]->nodes[i].flags,
					  nodemap->nodes[i].flags));
			if (i == j) {
				/* A node is the authority on its own flags */
				DEBUG(DEBUG_ERR, (" Use flags 0x%02x from remote node %d for cluster update of its own flags \n ", remote_nodemaps[j]->nodes[i].flags, j));
				update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
			} else {
				/* For third-party nodes our local view is pushed out */
				DEBUG(DEBUG_ERR, (" Use flags 0x%02x from local recmaster node for cluster update of node %d flags \n ", nodemap->nodes[i].flags, i));
				update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
			}
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap,
				    vnnmap);
			return;
		}
	}

	/* count how many active nodes there are */
	num_lmasters = 0;
	for (i = 0; i < nodemap->num; i++) {
		if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
			if (ctdb_node_has_capabilities(rec->caps,
						       ctdb->nodes[i]->pnn,
						       CTDB_CAP_LMASTER)) {
				num_lmasters++;
			}
		}
	}

	/* There must be the same number of lmasters in the vnn map as
	 * there are active nodes with the lmaster capability...  or
	 * do a recovery.
	 */
	if (vnnmap->size != num_lmasters) {
		DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u \n ",
				  vnnmap->size, num_lmasters));
		ctdb_set_culprit(rec, ctdb->pnn);
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	}

	/* verify that all active nodes in the nodemap also exist in
	   the vnnmap.
	 */
	for (j = 0; j < nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}

		for (i = 0; i < vnnmap->size; i++) {
			if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
				break;
			}
		}
		if (i == vnnmap->size) {
			DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap \n ",
					  nodemap->nodes[j].pnn));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}
	}

	/* verify that all other nodes have the same vnnmap
	   and are from the same generation
	 */
	for (j = 0; j < nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}

		ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					  mem_ctx, &remote_vnnmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u \n ",
					  nodemap->nodes[j].pnn));
			return;
		}

		/* verify the vnnmap generation is the same */
		if (vnnmap->generation != remote_vnnmap->generation) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours) \n ",
					  nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* verify the vnnmap size is the same */
		if (vnnmap->size != remote_vnnmap->size) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours) \n ",
					  nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* verify the vnnmap is the same */
		for (i = 0; i < vnnmap->size; i++) {
			if (remote_vnnmap->map[i] != vnnmap->map[i]) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap. \n ",
						  nodemap->nodes[j].pnn));
				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
				do_recovery(rec, mem_ctx, pnn, nodemap,
					    vnnmap);
				return;
			}
		}
	}

takeover_run_checks:

	/* If there are IP takeover runs requested or the previous one
	 * failed then perform one and notify the waiters */
	if (!ctdb_op_is_disabled(rec->takeover_run) &&
	    (rec->reallocate_requests || rec->need_takeover_run)) {
		process_ipreallocate_requests(ctdb, rec);
	}
}
2016-06-02 02:26:40 +03:00
/*
  SIGTERM handler for the recovery daemon: calls ctdb_recovery_unlock()
  on the ctdb_recoverd passed as private_data, then exits with status 0.
*/
static void recd_sig_term_handler(struct tevent_context *ev,
				  struct tevent_signal *se, int signum,
				  int count, void *dont_care,
				  void *private_data)
{
	struct ctdb_recoverd *recd;

	/* Aborts if private_data is not a struct ctdb_recoverd */
	recd = talloc_get_type_abort(private_data, struct ctdb_recoverd);

	ctdb_recovery_unlock(recd);
	exit(0);
}
2010-06-22 17:20:23 +04:00
/*
the main monitoring loop
*/
static void monitor_cluster ( struct ctdb_context * ctdb )
{
2016-06-02 02:26:40 +03:00
struct tevent_signal * se ;
2010-06-22 17:20:23 +04:00
struct ctdb_recoverd * rec ;
DEBUG ( DEBUG_NOTICE , ( " monitor_cluster starting \n " ) ) ;
rec = talloc_zero ( ctdb , struct ctdb_recoverd ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , rec ) ;
rec - > ctdb = ctdb ;
2015-11-10 05:54:47 +03:00
rec - > recmaster = CTDB_UNKNOWN_PNN ;
2016-05-24 07:54:39 +03:00
rec - > recovery_lock_handle = NULL ;
2007-06-06 04:25:46 +04:00
2015-02-08 12:52:12 +03:00
rec - > takeover_run = ctdb_op_init ( rec , " takeover runs " ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , rec - > takeover_run ) ;
2013-09-03 05:20:01 +04:00
2015-02-06 06:47:33 +03:00
rec - > recovery = ctdb_op_init ( rec , " recoveries " ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , rec - > recovery ) ;
2010-06-22 17:20:23 +04:00
rec - > priority_time = timeval_current ( ) ;
2016-06-01 05:10:46 +03:00
rec - > frozen_on_inactive = false ;
2008-06-26 07:08:37 +04:00
2016-06-02 02:26:40 +03:00
se = tevent_add_signal ( ctdb - > ev , ctdb , SIGTERM , 0 ,
recd_sig_term_handler , rec ) ;
if ( se = = NULL ) {
DEBUG ( DEBUG_ERR , ( " Failed to install SIGTERM handler \n " ) ) ;
exit ( 1 ) ;
}
2010-06-22 17:20:23 +04:00
/* register a message port for sending memory dumps */
ctdb_client_set_message_handler ( ctdb , CTDB_SRVID_MEM_DUMP , mem_dump_handler , rec ) ;
2007-05-04 03:45:53 +04:00
2016-03-17 09:26:30 +03:00
/* when a node is assigned banning credits */
ctdb_client_set_message_handler ( ctdb , CTDB_SRVID_BANNING ,
banning_handler , rec ) ;
2010-06-22 17:20:23 +04:00
/* register a message port for recovery elections */
2015-10-29 09:51:52 +03:00
ctdb_client_set_message_handler ( ctdb , CTDB_SRVID_ELECTION , election_handler , rec ) ;
2010-06-22 17:20:23 +04:00
/* when nodes are disabled/enabled */
ctdb_client_set_message_handler ( ctdb , CTDB_SRVID_SET_NODE_FLAGS , monitor_handler , rec ) ;
/* when we are asked to puch out a flag change */
ctdb_client_set_message_handler ( ctdb , CTDB_SRVID_PUSH_NODE_FLAGS , push_flags_handler , rec ) ;
/* register a message port for vacuum fetch */
ctdb_client_set_message_handler ( ctdb , CTDB_SRVID_VACUUM_FETCH , vacuum_fetch_handler , rec ) ;
/* register a message port for reloadnodes */
ctdb_client_set_message_handler ( ctdb , CTDB_SRVID_RELOAD_NODES , reload_nodes_handler , rec ) ;
/* register a message port for performing a takeover run */
ctdb_client_set_message_handler ( ctdb , CTDB_SRVID_TAKEOVER_RUN , ip_reallocate_handler , rec ) ;
/* register a message port for disabling the ip check for a short while */
ctdb_client_set_message_handler ( ctdb , CTDB_SRVID_DISABLE_IP_CHECK , disable_ip_check_handler , rec ) ;
/* register a message port for updating the recovery daemons node assignment for an ip */
ctdb_client_set_message_handler ( ctdb , CTDB_SRVID_RECD_UPDATE_IP , recd_update_ip_handler , rec ) ;
2012-02-27 23:56:04 +04:00
/* register a message port for forcing a rebalance of a node next
reallocation */
ctdb_client_set_message_handler ( ctdb , CTDB_SRVID_REBALANCE_NODE , recd_node_rebalance_handler , rec ) ;
2013-08-27 09:04:40 +04:00
/* Register a message port for disabling takeover runs */
ctdb_client_set_message_handler ( ctdb ,
CTDB_SRVID_DISABLE_TAKEOVER_RUNS ,
disable_takeover_runs_handler , rec ) ;
2015-02-06 07:06:44 +03:00
/* Register a message port for disabling recoveries */
ctdb_client_set_message_handler ( ctdb ,
CTDB_SRVID_DISABLE_RECOVERIES ,
disable_recoveries_handler , rec ) ;
2014-04-22 09:24:49 +04:00
/* register a message port for detaching database */
ctdb_client_set_message_handler ( ctdb ,
CTDB_SRVID_DETACH_DATABASE ,
detach_database_handler , rec ) ;
2010-06-22 17:20:23 +04:00
for ( ; ; ) {
TALLOC_CTX * mem_ctx = talloc_new ( ctdb ) ;
2010-06-22 17:20:35 +04:00
struct timeval start ;
double elapsed ;
2010-06-22 17:20:23 +04:00
if ( ! mem_ctx ) {
DEBUG ( DEBUG_CRIT , ( __location__
" Failed to create temp context \n " ) ) ;
exit ( - 1 ) ;
}
2010-06-22 17:20:35 +04:00
start = timeval_current ( ) ;
2010-06-22 17:20:23 +04:00
main_loop ( ctdb , rec , mem_ctx ) ;
talloc_free ( mem_ctx ) ;
/* we only check for recovery once every second */
2010-06-22 17:20:35 +04:00
elapsed = timeval_elapsed ( & start ) ;
if ( elapsed < ctdb - > tunable . recover_interval ) {
ctdb_wait_timeout ( ctdb , ctdb - > tunable . recover_interval
- elapsed ) ;
}
2010-06-22 17:20:23 +04:00
}
2007-05-04 02:30:18 +04:00
}
2007-06-06 04:25:46 +04:00
/*
2007-06-07 09:18:55 +04:00
event handler for when the main ctdbd dies
*/
2015-10-26 08:50:09 +03:00
static void ctdb_recoverd_parent ( struct tevent_context * ev ,
struct tevent_fd * fde ,
2007-05-15 09:13:36 +04:00
uint16_t flags , void * private_data )
{
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ALERT , ( " recovery daemon parent died - exiting \n " ) ) ;
2007-05-15 09:13:36 +04:00
_exit ( 1 ) ;
}
2008-05-06 05:19:17 +04:00
/*
called regularly to verify that the recovery daemon is still running
*/
2015-10-26 08:50:09 +03:00
static void ctdb_check_recd ( struct tevent_context * ev ,
struct tevent_timer * te ,
struct timeval yt , void * p )
2008-05-06 05:19:17 +04:00
{
struct ctdb_context * ctdb = talloc_get_type ( p , struct ctdb_context ) ;
2012-05-03 05:42:41 +04:00
if ( ctdb_kill ( ctdb , ctdb - > recoverd_pid , 0 ) ! = 0 ) {
2011-03-01 04:09:42 +03:00
DEBUG ( DEBUG_ERR , ( " Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon. \n " , ( int ) ctdb - > recoverd_pid ) ) ;
2008-05-06 05:19:17 +04:00
2015-10-26 08:50:09 +03:00
tevent_add_timer ( ctdb - > ev , ctdb , timeval_zero ( ) ,
ctdb_restart_recd , ctdb ) ;
2008-05-06 05:19:17 +04:00
2011-03-01 04:09:42 +03:00
return ;
2008-05-06 05:19:17 +04:00
}
2015-10-26 08:50:09 +03:00
tevent_add_timer ( ctdb - > ev , ctdb - > recd_ctx ,
timeval_current_ofs ( 30 , 0 ) ,
ctdb_check_recd , ctdb ) ;
2008-05-06 05:19:17 +04:00
}
2015-10-26 08:50:09 +03:00
static void recd_sig_child_handler ( struct tevent_context * ev ,
struct tevent_signal * se , int signum ,
int count , void * dont_care ,
void * private_data )
2008-07-09 08:02:54 +04:00
{
// struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
int status ;
pid_t pid = - 1 ;
while ( pid ! = 0 ) {
pid = waitpid ( - 1 , & status , WNOHANG ) ;
if ( pid = = - 1 ) {
2009-06-19 09:55:13 +04:00
if ( errno ! = ECHILD ) {
DEBUG ( DEBUG_ERR , ( __location__ " waitpid() returned error. errno:%s(%d) \n " , strerror ( errno ) , errno ) ) ;
}
2008-07-09 08:02:54 +04:00
return ;
}
if ( pid > 0 ) {
DEBUG ( DEBUG_DEBUG , ( " RECD SIGCHLD from %d \n " , ( int ) pid ) ) ;
}
}
}
2007-06-07 09:18:55 +04:00
/*
startup the recovery daemon as a child of the main ctdb daemon
*/
2007-05-15 09:13:36 +04:00
int ctdb_start_recoverd ( struct ctdb_context * ctdb )
2007-05-04 02:30:18 +04:00
{
2007-05-15 09:13:36 +04:00
int fd [ 2 ] ;
2015-10-26 08:50:09 +03:00
struct tevent_signal * se ;
2010-08-18 03:46:31 +04:00
struct tevent_fd * fde ;
2007-05-04 02:30:18 +04:00
2007-05-15 09:13:36 +04:00
if ( pipe ( fd ) ! = 0 ) {
return - 1 ;
2007-05-04 02:30:18 +04:00
}
2014-08-08 06:51:03 +04:00
ctdb - > recoverd_pid = ctdb_fork ( ctdb ) ;
2007-10-22 06:34:08 +04:00
if ( ctdb - > recoverd_pid = = - 1 ) {
2007-05-15 09:13:36 +04:00
return - 1 ;
2007-05-04 02:30:18 +04:00
}
2012-12-04 08:05:44 +04:00
2007-10-22 06:34:08 +04:00
if ( ctdb - > recoverd_pid ! = 0 ) {
2012-12-04 08:05:44 +04:00
talloc_free ( ctdb - > recd_ctx ) ;
ctdb - > recd_ctx = talloc_new ( ctdb ) ;
CTDB_NO_MEMORY ( ctdb , ctdb - > recd_ctx ) ;
2007-05-15 09:13:36 +04:00
close ( fd [ 0 ] ) ;
2015-10-26 08:50:09 +03:00
tevent_add_timer ( ctdb - > ev , ctdb - > recd_ctx ,
timeval_current_ofs ( 30 , 0 ) ,
ctdb_check_recd , ctdb ) ;
2007-05-15 09:13:36 +04:00
return 0 ;
2007-05-04 02:30:18 +04:00
}
2007-05-15 09:13:36 +04:00
close ( fd [ 1 ] ) ;
srandom ( getpid ( ) ^ time ( NULL ) ) ;
2007-05-04 02:30:18 +04:00
2015-09-24 02:10:59 +03:00
prctl_set_comment ( " ctdb_recovered " ) ;
2010-07-19 13:59:09 +04:00
if ( switch_from_server_to_client ( ctdb , " recoverd " ) ! = 0 ) {
2009-03-23 04:37:30 +03:00
DEBUG ( DEBUG_CRIT , ( __location__ " ERROR: failed to switch recovery daemon into client mode. shutting down. \n " ) ) ;
2007-05-04 02:30:18 +04:00
exit ( 1 ) ;
}
2010-02-03 22:37:41 +03:00
DEBUG ( DEBUG_DEBUG , ( __location__ " Created PIPE FD:%d to recovery daemon \n " , fd [ 0 ] ) ) ;
2009-10-15 04:24:54 +04:00
2015-10-26 08:50:09 +03:00
fde = tevent_add_fd ( ctdb - > ev , ctdb , fd [ 0 ] , TEVENT_FD_READ ,
ctdb_recoverd_parent , & fd [ 0 ] ) ;
2010-08-18 03:46:31 +04:00
tevent_fd_set_auto_close ( fde ) ;
2009-03-23 04:37:30 +03:00
2008-07-09 08:02:54 +04:00
/* set up a handler to pick up sigchld */
2015-10-26 08:50:09 +03:00
se = tevent_add_signal ( ctdb - > ev , ctdb , SIGCHLD , 0 ,
recd_sig_child_handler , ctdb ) ;
2008-07-09 08:02:54 +04:00
if ( se = = NULL ) {
DEBUG ( DEBUG_CRIT , ( " Failed to set up signal handler for SIGCHLD in recovery daemon \n " ) ) ;
exit ( 1 ) ;
}
2007-05-15 09:13:36 +04:00
monitor_cluster ( ctdb ) ;
2007-05-07 00:51:58 +04:00
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ALERT , ( " ERROR: ctdb_recoverd finished!? \n " ) ) ;
2007-05-15 09:13:36 +04:00
return - 1 ;
2007-05-04 02:30:18 +04:00
}
2007-10-22 06:34:08 +04:00
/*
shutdown the recovery daemon
*/
void ctdb_stop_recoverd ( struct ctdb_context * ctdb )
{
if ( ctdb - > recoverd_pid = = 0 ) {
return ;
}
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " Shutting down recovery daemon \n " ) ) ;
2012-05-03 05:42:41 +04:00
ctdb_kill ( ctdb , ctdb - > recoverd_pid , SIGTERM ) ;
2012-12-04 08:05:44 +04:00
TALLOC_FREE ( ctdb - > recd_ctx ) ;
TALLOC_FREE ( ctdb - > recd_ping_count ) ;
2007-10-22 06:34:08 +04:00
}
2011-03-01 04:09:42 +03:00
2015-10-26 08:50:09 +03:00
static void ctdb_restart_recd ( struct tevent_context * ev ,
struct tevent_timer * te ,
struct timeval t , void * private_data )
2011-03-01 04:09:42 +03:00
{
struct ctdb_context * ctdb = talloc_get_type ( private_data , struct ctdb_context ) ;
DEBUG ( DEBUG_ERR , ( " Restarting recovery daemon \n " ) ) ;
ctdb_stop_recoverd ( ctdb ) ;
ctdb_start_recoverd ( ctdb ) ;
}