2007-05-04 08:30:18 +10:00
/*
ctdb recovery daemon
Copyright ( C ) Ronnie Sahlberg 2007
2007-05-31 13:50:53 +10:00
This program is free software ; you can redistribute it and / or modify
it under the terms of the GNU General Public License as published by
2007-07-10 15:29:31 +10:00
the Free Software Foundation ; either version 3 of the License , or
2007-05-31 13:50:53 +10:00
( at your option ) any later version .
This program is distributed in the hope that it will be useful ,
2007-05-04 08:30:18 +10:00
but WITHOUT ANY WARRANTY ; without even the implied warranty of
2007-05-31 13:50:53 +10:00
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
GNU General Public License for more details .
You should have received a copy of the GNU General Public License
2007-07-10 15:29:31 +10:00
along with this program ; if not , see < http : //www.gnu.org/licenses/>.
2007-05-04 08:30:18 +10:00
*/
2015-10-26 16:50:46 +11:00
# include "replace.h"
2007-05-04 08:30:18 +10:00
# include "system/filesys.h"
2007-05-10 14:06:48 +10:00
# include "system/time.h"
2007-09-14 10:16:36 +10:00
# include "system/network.h"
2007-10-22 12:34:08 +10:00
# include "system/wait.h"
2015-10-26 16:50:46 +11:00
# include <popt.h>
# include <talloc.h>
# include <tevent.h>
# include <tdb.h>
2014-08-15 15:46:33 +10:00
# include "lib/tdb_wrap/tdb_wrap.h"
2014-08-15 16:18:05 +10:00
# include "lib/util/dlinklist.h"
2015-10-26 16:50:46 +11:00
# include "lib/util/debug.h"
# include "lib/util/samba_util.h"
2016-11-29 12:55:06 +11:00
# include "lib/util/sys_rw.h"
2015-09-23 16:10:59 -07:00
# include "lib/util/util_process.h"
2015-10-26 16:50:46 +11:00
# include "ctdb_private.h"
# include "ctdb_client.h"
2015-10-23 14:11:53 +11:00
# include "common/system.h"
2015-10-23 14:17:34 +11:00
# include "common/common.h"
2015-11-11 15:41:10 +11:00
# include "common/logging.h"
2007-05-04 08:30:18 +10:00
2016-02-17 20:20:03 +11:00
# include "ctdb_cluster_mutex.h"
2007-06-07 16:34:33 +10:00
2013-08-16 20:02:34 +10:00
/*
 * One entry in the list of SRVID requests that need to be processed.
 * Entries are linked through next/prev (see DLIST_ADD in
 * srvid_request_add()).
 */
struct srvid_list {
	struct srvid_list *next;
	struct srvid_list *prev;
	/* The queued request message */
	struct ctdb_srvid_message *request;
};
/* Container holding the head of a list of queued SRVID requests */
struct srvid_requests {
	/* First entry of the list, NULL when nothing is queued */
	struct srvid_list *requests;
};
2013-08-16 20:02:34 +10:00
static void srvid_request_reply ( struct ctdb_context * ctdb ,
2015-10-29 14:32:49 +11:00
struct ctdb_srvid_message * request ,
2013-08-16 20:02:34 +10:00
TDB_DATA result )
{
/* Someone that sent srvid==0 does not want a reply */
if ( request - > srvid = = 0 ) {
talloc_free ( request ) ;
return ;
}
if ( ctdb_client_send_message ( ctdb , request - > pnn , request - > srvid ,
result ) = = 0 ) {
DEBUG ( DEBUG_INFO , ( " Sent SRVID reply to %u:%llu \n " ,
( unsigned ) request - > pnn ,
( unsigned long long ) request - > srvid ) ) ;
} else {
DEBUG ( DEBUG_ERR , ( " Failed to send SRVID reply to %u:%llu \n " ,
( unsigned ) request - > pnn ,
( unsigned long long ) request - > srvid ) ) ;
}
talloc_free ( request ) ;
}
/*
 * Reply with "result" to every request queued in *requests, then free
 * the queue and reset *requests to NULL.  Does nothing when *requests
 * is already NULL.
 */
static void srvid_requests_reply(struct ctdb_context *ctdb,
				 struct srvid_requests **requests,
				 TDB_DATA result)
{
	struct srvid_list *entry;

	if (*requests == NULL) {
		return;
	}

	entry = (*requests)->requests;
	while (entry != NULL) {
		/* Frees entry->request; the list entry itself stays valid */
		srvid_request_reply(ctdb, entry->request, result);
		entry = entry->next;
	}

	/* Free the list structure... */
	TALLOC_FREE(*requests);
}
static void srvid_request_add ( struct ctdb_context * ctdb ,
struct srvid_requests * * requests ,
2015-10-29 14:32:49 +11:00
struct ctdb_srvid_message * request )
2013-08-16 20:02:34 +10:00
{
struct srvid_list * t ;
int32_t ret ;
TDB_DATA result ;
if ( * requests = = NULL ) {
* requests = talloc_zero ( ctdb , struct srvid_requests ) ;
if ( * requests = = NULL ) {
goto nomem ;
}
}
t = talloc_zero ( * requests , struct srvid_list ) ;
if ( t = = NULL ) {
/* If *requests was just allocated above then free it */
if ( ( * requests ) - > requests = = NULL ) {
TALLOC_FREE ( * requests ) ;
}
goto nomem ;
}
2015-10-29 14:32:49 +11:00
t - > request = ( struct ctdb_srvid_message * ) talloc_steal ( t , request ) ;
2013-08-16 20:02:34 +10:00
DLIST_ADD ( ( * requests ) - > requests , t ) ;
return ;
nomem :
/* Failed to add the request to the list. Send a fail. */
DEBUG ( DEBUG_ERR , ( __location__
" Out of memory, failed to queue SRVID request \n " ) ) ;
ret = - ENOMEM ;
result . dsize = sizeof ( ret ) ;
result . dptr = ( uint8_t * ) & ret ;
srvid_request_reply ( ctdb , request , result ) ;
}
2015-02-08 20:50:38 +11:00
/*
 * An abstraction to allow an operation (takeover runs, recoveries,
 * ...) to be disabled for a given timeout.
 */
struct ctdb_op_state {
	/* Non-NULL while the operation is disabled; see ctdb_op_is_disabled() */
	struct tevent_timer *timer;
	/* Set by ctdb_op_begin(), cleared by ctdb_op_end() */
	bool in_progress;
	/* Label used in log messages about this operation */
	const char *name;
};
/*
 * Allocate a zeroed operation state on mem_ctx, labelled "name".
 * The name pointer is stored, not copied.  Returns NULL if the
 * allocation fails.
 */
static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
{
	struct ctdb_op_state *op;

	op = talloc_zero(mem_ctx, struct ctdb_op_state);
	if (op == NULL) {
		return NULL;
	}

	op->in_progress = false;
	op->name = name;

	return op;
}
static bool ctdb_op_is_disabled ( struct ctdb_op_state * state )
{
return state - > timer ! = NULL ;
}
static bool ctdb_op_begin ( struct ctdb_op_state * state )
{
if ( ctdb_op_is_disabled ( state ) ) {
DEBUG ( DEBUG_NOTICE ,
( " Unable to begin - %s are disabled \n " , state - > name ) ) ;
return false ;
}
state - > in_progress = true ;
return true ;
}
static bool ctdb_op_end ( struct ctdb_op_state * state )
{
return state - > in_progress = false ;
}
static bool ctdb_op_is_in_progress ( struct ctdb_op_state * state )
{
return state - > in_progress ;
}
static void ctdb_op_enable ( struct ctdb_op_state * state )
{
TALLOC_FREE ( state - > timer ) ;
}
2015-10-26 16:50:09 +11:00
static void ctdb_op_timeout_handler ( struct tevent_context * ev ,
struct tevent_timer * te ,
2015-02-08 20:50:38 +11:00
struct timeval yt , void * p )
{
struct ctdb_op_state * state =
talloc_get_type ( p , struct ctdb_op_state ) ;
DEBUG ( DEBUG_NOTICE , ( " Reenabling %s after timeout \n " , state - > name ) ) ;
ctdb_op_enable ( state ) ;
}
static int ctdb_op_disable ( struct ctdb_op_state * state ,
struct tevent_context * ev ,
uint32_t timeout )
{
if ( timeout = = 0 ) {
DEBUG ( DEBUG_NOTICE , ( " Reenabling %s \n " , state - > name ) ) ;
ctdb_op_enable ( state ) ;
return 0 ;
}
if ( state - > in_progress ) {
DEBUG ( DEBUG_ERR ,
( " Unable to disable %s - in progress \n " , state - > name ) ) ;
return - EAGAIN ;
}
DEBUG ( DEBUG_NOTICE , ( " Disabling %s for %u seconds \n " ,
state - > name , timeout ) ) ;
/* Clear any old timers */
talloc_free ( state - > timer ) ;
/* Arrange for the timeout to occur */
state - > timer = tevent_add_timer ( ev , state ,
timeval_current_ofs ( timeout , 0 ) ,
ctdb_op_timeout_handler , state ) ;
if ( state - > timer = = NULL ) {
DEBUG ( DEBUG_ERR , ( __location__ " Unable to setup timer \n " ) ) ;
return - ENOMEM ;
}
return 0 ;
}
2009-09-04 02:20:39 +10:00
/* Per-node misbehaviour record maintained by ctdb_set_culprit_count() */
struct ctdb_banning_state {
	/* Accumulated culprit count; reset after a quiet grace period */
	uint32_t count;
	/* When the node was last reported as a culprit */
	struct timeval last_reported_time;
};
2007-06-07 15:18:55 +10:00
/*
  private state of recovery daemon
 */
struct ctdb_recoverd {
	struct ctdb_context *ctdb;
	uint32_t recmaster;
	/* Last node recorded by ctdb_set_culprit_count() */
	uint32_t last_culprit_node;
	struct ctdb_node_map_old *nodemap;
	struct timeval priority_time;
	bool need_takeover_run;
	bool need_recovery;
	/* Checked against NODE_FLAGS_INACTIVE before blaming other nodes */
	uint32_t node_flags;
	struct tevent_timer *send_election_te;
	/* Non-NULL while an election is running; see ctdb_wait_election() */
	struct tevent_timer *election_timeout;
	struct srvid_requests *reallocate_requests;
	struct ctdb_op_state *takeover_run;
	struct ctdb_op_state *recovery;
	struct ctdb_iface_list_old *ifaces;
	uint32_t *force_rebalance_nodes;
	/* Replaced on every successful update_capabilities() */
	struct ctdb_node_capabilities *caps;
	bool frozen_on_inactive;
	/* Non-NULL means the recovery lock is held; see ctdb_recovery_have_lock() */
	struct ctdb_cluster_mutex_handle *recovery_lock_handle;
};
2007-05-04 08:30:18 +10:00
2007-06-04 20:22:44 +10:00
/*
 * Timeout value passed to controls, derived from the recover_timeout
 * tunable.  Expands to an expression using a variable named "ctdb",
 * which must be in scope wherever the macro is used.
 */
# define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
2007-06-06 10:25:46 +10:00
/*
 * Timeout value derived from the recover_interval tunable.  Like
 * CONTROL_TIMEOUT(), it requires a variable named "ctdb" in scope.
 */
# define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
2007-05-24 13:49:27 +10:00
2015-10-26 16:50:09 +11:00
/*
 * Forward declaration; the definition appears further down in this
 * file.  The parameter list has the same shape as the other tevent
 * timer callbacks in this file (e.g. ctdb_wait_handler()).
 */
static void ctdb_restart_recd ( struct tevent_context * ev ,
struct tevent_timer * te , struct timeval t ,
void * private_data ) ;
2008-01-05 09:35:43 +11:00
2007-06-07 16:34:33 +10:00
/*
ban a node for a period of time
*/
2007-09-04 10:33:10 +10:00
static void ctdb_ban_node ( struct ctdb_recoverd * rec , uint32_t pnn , uint32_t ban_time )
2007-06-07 16:34:33 +10:00
{
2009-09-04 02:20:39 +10:00
int ret ;
2007-06-07 16:34:33 +10:00
struct ctdb_context * ctdb = rec - > ctdb ;
2015-10-28 18:18:33 +11:00
struct ctdb_ban_state bantime ;
2007-09-04 10:33:10 +10:00
if ( ! ctdb_validate_pnn ( ctdb , pnn ) ) {
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_ERR , ( " Bad pnn %u in ctdb_ban_node \n " , pnn ) ) ;
2007-06-07 16:48:31 +10:00
return ;
}
2013-06-24 14:18:58 +10:00
DEBUG ( DEBUG_NOTICE , ( " Banning node %u for %u seconds \n " , pnn , ban_time ) ) ;
2009-09-04 02:20:39 +10:00
bantime . pnn = pnn ;
bantime . time = ban_time ;
2007-11-23 12:36:14 +11:00
2009-09-04 02:20:39 +10:00
ret = ctdb_ctrl_set_ban ( ctdb , CONTROL_TIMEOUT ( ) , pnn , & bantime ) ;
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " Failed to ban node %d \n " , pnn ) ) ;
2007-12-03 15:45:53 +11:00
return ;
2007-06-07 18:37:27 +10:00
}
2007-06-07 16:34:33 +10:00
}
2007-08-27 10:31:22 +10:00
/* Result codes; presumably returned by monitoring checks defined later in this file */
enum monitor_result {
	MONITOR_OK,
	MONITOR_RECOVERY_NEEDED,
	MONITOR_ELECTION_NEEDED,
	MONITOR_FAILED
};
2008-06-12 16:53:36 +10:00
/*
remember the trouble maker
*/
2009-09-04 02:20:39 +10:00
static void ctdb_set_culprit_count ( struct ctdb_recoverd * rec , uint32_t culprit , uint32_t count )
2008-06-12 16:53:36 +10:00
{
2009-09-04 02:20:39 +10:00
struct ctdb_context * ctdb = talloc_get_type ( rec - > ctdb , struct ctdb_context ) ;
struct ctdb_banning_state * ban_state ;
if ( culprit > ctdb - > num_nodes ) {
DEBUG ( DEBUG_ERR , ( " Trying to set culprit %d but num_nodes is %d \n " , culprit , ctdb - > num_nodes ) ) ;
return ;
}
2013-06-28 14:10:47 +10:00
/* If we are banned or stopped, do not set other nodes as culprits */
if ( rec - > node_flags & NODE_FLAGS_INACTIVE ) {
DEBUG ( DEBUG_NOTICE , ( " This node is INACTIVE, cannot set culprit node %d \n " , culprit ) ) ;
return ;
}
2009-09-04 02:20:39 +10:00
if ( ctdb - > nodes [ culprit ] - > ban_state = = NULL ) {
ctdb - > nodes [ culprit ] - > ban_state = talloc_zero ( ctdb - > nodes [ culprit ] , struct ctdb_banning_state ) ;
CTDB_NO_MEMORY_VOID ( ctdb , ctdb - > nodes [ culprit ] - > ban_state ) ;
2008-06-12 16:53:36 +10:00
2009-09-04 02:20:39 +10:00
}
ban_state = ctdb - > nodes [ culprit ] - > ban_state ;
if ( timeval_elapsed ( & ban_state - > last_reported_time ) > ctdb - > tunable . recovery_grace_period ) {
/* this was the first time in a long while this node
misbehaved so we will forgive any old transgressions .
*/
ban_state - > count = 0 ;
2008-06-12 16:53:36 +10:00
}
2009-09-04 02:20:39 +10:00
ban_state - > count + = count ;
ban_state - > last_reported_time = timeval_current ( ) ;
rec - > last_culprit_node = culprit ;
2008-06-12 16:53:36 +10:00
}
2009-04-24 13:58:32 +10:00
/*
  remember the trouble maker

  Convenience wrapper: charges node "culprit" with a single count.
 */
static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
{
	const uint32_t single_offence = 1;

	ctdb_set_culprit_count(rec, culprit, single_offence);
}
2008-06-12 16:53:36 +10:00
2008-05-06 15:42:59 +10:00
/*
2015-10-27 15:09:33 +11:00
Retrieve capabilities from all connected nodes
2008-05-06 15:42:59 +10:00
*/
2014-07-31 15:26:03 +10:00
static int update_capabilities ( struct ctdb_recoverd * rec ,
2015-10-29 17:22:48 +11:00
struct ctdb_node_map_old * nodemap )
2008-05-06 15:42:59 +10:00
{
2014-07-31 15:26:03 +10:00
uint32_t * capp ;
2008-05-06 15:42:59 +10:00
TALLOC_CTX * tmp_ctx ;
2014-07-31 15:26:03 +10:00
struct ctdb_node_capabilities * caps ;
struct ctdb_context * ctdb = rec - > ctdb ;
2008-05-06 15:42:59 +10:00
2014-07-31 15:26:03 +10:00
tmp_ctx = talloc_new ( rec ) ;
2008-05-06 15:42:59 +10:00
CTDB_NO_MEMORY ( ctdb , tmp_ctx ) ;
2014-07-31 15:26:03 +10:00
caps = ctdb_get_capabilities ( ctdb , tmp_ctx ,
CONTROL_TIMEOUT ( ) , nodemap ) ;
if ( caps = = NULL ) {
DEBUG ( DEBUG_ERR ,
( __location__ " Failed to get node capabilities \n " ) ) ;
2008-05-06 15:42:59 +10:00
talloc_free ( tmp_ctx ) ;
return - 1 ;
}
2014-07-31 15:26:03 +10:00
capp = ctdb_get_node_capabilities ( caps , ctdb_get_pnn ( ctdb ) ) ;
if ( capp = = NULL ) {
DEBUG ( DEBUG_ERR ,
( __location__
" Capabilities don't include current node. \n " ) ) ;
talloc_free ( tmp_ctx ) ;
return - 1 ;
}
ctdb - > capabilities = * capp ;
TALLOC_FREE ( rec - > caps ) ;
rec - > caps = talloc_steal ( rec , caps ) ;
2008-05-06 15:42:59 +10:00
talloc_free ( tmp_ctx ) ;
return 0 ;
}
2007-06-07 15:18:55 +10:00
/*
change recovery mode on all nodes
*/
2015-10-06 11:52:06 +11:00
static int set_recovery_mode ( struct ctdb_context * ctdb ,
struct ctdb_recoverd * rec ,
2015-10-29 17:22:48 +11:00
struct ctdb_node_map_old * nodemap ,
2016-09-13 15:45:54 +10:00
uint32_t rec_mode )
2007-05-06 09:53:12 +10:00
{
2008-01-06 12:38:01 +11:00
TDB_DATA data ;
2008-01-29 13:59:28 +11:00
uint32_t * nodes ;
TALLOC_CTX * tmp_ctx ;
tmp_ctx = talloc_new ( ctdb ) ;
CTDB_NO_MEMORY ( ctdb , tmp_ctx ) ;
2008-06-12 16:53:36 +10:00
nodes = list_of_active_nodes ( ctdb , nodemap , tmp_ctx , true ) ;
2014-05-06 14:24:52 +10:00
data . dsize = sizeof ( uint32_t ) ;
data . dptr = ( unsigned char * ) & rec_mode ;
if ( ctdb_client_async_control ( ctdb , CTDB_CONTROL_SET_RECMODE ,
nodes , 0 ,
CONTROL_TIMEOUT ( ) ,
false , data ,
NULL , NULL ,
NULL ) ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " Unable to set recovery mode. Recovery failed. \n " ) ) ;
talloc_free ( tmp_ctx ) ;
return - 1 ;
}
2008-01-29 13:59:28 +11:00
talloc_free ( tmp_ctx ) ;
2007-05-06 09:53:12 +10:00
return 0 ;
}
2007-06-07 15:18:55 +10:00
/*
ensure all other nodes have attached to any databases that we have
*/
2015-10-29 17:22:48 +11:00
static int create_missing_remote_databases ( struct ctdb_context * ctdb , struct ctdb_node_map_old * nodemap ,
2015-10-29 17:46:05 +11:00
uint32_t pnn , struct ctdb_dbid_map_old * dbmap , TALLOC_CTX * mem_ctx )
2007-05-04 09:45:53 +10:00
{
2007-05-04 15:21:40 +10:00
int i , j , db , ret ;
2015-10-29 17:46:05 +11:00
struct ctdb_dbid_map_old * remote_dbmap ;
2007-05-04 15:21:40 +10:00
2007-05-06 06:58:01 +10:00
/* verify that all other nodes have all our databases */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
2015-07-26 23:02:57 +02:00
/* we don't need to ourself ourselves */
2007-09-04 10:33:10 +10:00
if ( nodemap - > nodes [ j ] . pnn = = pnn ) {
2007-05-06 06:58:01 +10:00
continue ;
}
2015-07-26 23:02:57 +02:00
/* don't check nodes that are unavailable */
2007-06-07 15:18:55 +10:00
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
2007-05-06 06:58:01 +10:00
continue ;
}
2007-09-04 09:50:07 +10:00
ret = ctdb_ctrl_getdbmap ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ j ] . pnn ,
2007-06-07 18:39:37 +10:00
mem_ctx , & remote_dbmap ) ;
2007-05-06 06:58:01 +10:00
if ( ret ! = 0 ) {
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to get dbids from node %u \n " , pnn ) ) ;
2007-05-06 06:58:01 +10:00
return - 1 ;
}
/* step through all local databases */
for ( db = 0 ; db < dbmap - > num ; db + + ) {
const char * name ;
for ( i = 0 ; i < remote_dbmap - > num ; i + + ) {
2015-10-29 17:46:05 +11:00
if ( dbmap - > dbs [ db ] . db_id = = remote_dbmap - > dbs [ i ] . db_id ) {
2007-05-06 06:58:01 +10:00
break ;
}
}
/* the remote node already have this database */
if ( i ! = remote_dbmap - > num ) {
continue ;
}
/* ok so we need to create this database */
2013-11-11 12:39:27 +11:00
ret = ctdb_ctrl_getdbname ( ctdb , CONTROL_TIMEOUT ( ) , pnn ,
2015-10-29 17:46:05 +11:00
dbmap - > dbs [ db ] . db_id , mem_ctx ,
2013-11-11 12:39:27 +11:00
& name ) ;
2007-05-06 06:58:01 +10:00
if ( ret ! = 0 ) {
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to get dbname from node %u \n " , pnn ) ) ;
2007-05-06 06:58:01 +10:00
return - 1 ;
}
2013-11-11 12:39:27 +11:00
ret = ctdb_ctrl_createdb ( ctdb , CONTROL_TIMEOUT ( ) ,
nodemap - > nodes [ j ] . pnn ,
mem_ctx , name ,
2017-08-23 12:09:22 +10:00
dbmap - > dbs [ db ] . flags , NULL ) ;
2007-05-06 06:58:01 +10:00
if ( ret ! = 0 ) {
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to create remote db:%s \n " , name ) ) ;
2007-05-06 06:58:01 +10:00
return - 1 ;
}
}
}
2007-05-04 15:21:40 +10:00
2007-05-06 10:04:37 +10:00
return 0 ;
}
2007-06-07 15:18:55 +10:00
/*
ensure we are attached to any databases that anyone else is attached to
*/
2015-10-29 17:22:48 +11:00
static int create_missing_local_databases ( struct ctdb_context * ctdb , struct ctdb_node_map_old * nodemap ,
2015-10-29 17:46:05 +11:00
uint32_t pnn , struct ctdb_dbid_map_old * * dbmap , TALLOC_CTX * mem_ctx )
2007-05-06 10:12:42 +10:00
{
int i , j , db , ret ;
2015-10-29 17:46:05 +11:00
struct ctdb_dbid_map_old * remote_dbmap ;
2007-05-06 10:12:42 +10:00
/* verify that we have all database any other node has */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
2015-07-26 23:02:57 +02:00
/* we don't need to ourself ourselves */
2007-09-04 10:33:10 +10:00
if ( nodemap - > nodes [ j ] . pnn = = pnn ) {
2007-05-06 10:12:42 +10:00
continue ;
}
2015-07-26 23:02:57 +02:00
/* don't check nodes that are unavailable */
2007-06-07 15:18:55 +10:00
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
2007-05-06 10:12:42 +10:00
continue ;
}
2007-09-04 09:50:07 +10:00
ret = ctdb_ctrl_getdbmap ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ j ] . pnn ,
2007-06-07 18:39:37 +10:00
mem_ctx , & remote_dbmap ) ;
2007-05-06 10:12:42 +10:00
if ( ret ! = 0 ) {
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to get dbids from node %u \n " , pnn ) ) ;
2007-05-06 10:12:42 +10:00
return - 1 ;
}
/* step through all databases on the remote node */
for ( db = 0 ; db < remote_dbmap - > num ; db + + ) {
const char * name ;
for ( i = 0 ; i < ( * dbmap ) - > num ; i + + ) {
2015-10-29 17:46:05 +11:00
if ( remote_dbmap - > dbs [ db ] . db_id = = ( * dbmap ) - > dbs [ i ] . db_id ) {
2007-05-06 10:12:42 +10:00
break ;
}
}
/* we already have this db locally */
if ( i ! = ( * dbmap ) - > num ) {
continue ;
}
/* ok so we need to create this database and
rebuild dbmap
*/
2007-09-04 09:50:07 +10:00
ctdb_ctrl_getdbname ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ j ] . pnn ,
2015-10-29 17:46:05 +11:00
remote_dbmap - > dbs [ db ] . db_id , mem_ctx , & name ) ;
2007-05-06 10:12:42 +10:00
if ( ret ! = 0 ) {
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to get dbname from node %u \n " ,
2007-09-04 09:50:07 +10:00
nodemap - > nodes [ j ] . pnn ) ) ;
2007-05-06 10:12:42 +10:00
return - 1 ;
}
2017-08-18 13:50:39 +10:00
ctdb_ctrl_createdb ( ctdb , CONTROL_TIMEOUT ( ) , pnn ,
mem_ctx , name ,
2017-08-23 12:09:22 +10:00
remote_dbmap - > dbs [ db ] . flags , NULL ) ;
2007-05-06 10:12:42 +10:00
if ( ret ! = 0 ) {
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to create local db:%s \n " , name ) ) ;
2007-05-06 10:12:42 +10:00
return - 1 ;
}
2007-09-04 10:33:10 +10:00
ret = ctdb_ctrl_getdbmap ( ctdb , CONTROL_TIMEOUT ( ) , pnn , mem_ctx , dbmap ) ;
2007-05-06 10:12:42 +10:00
if ( ret ! = 0 ) {
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to reread dbmap on node %u \n " , pnn ) ) ;
2007-05-06 10:12:42 +10:00
return - 1 ;
}
}
}
return 0 ;
}
2007-06-07 15:18:55 +10:00
/*
update flags on all active nodes
*/
2015-10-29 17:22:48 +11:00
static int update_flags_on_all_nodes ( struct ctdb_context * ctdb , struct ctdb_node_map_old * nodemap , uint32_t pnn , uint32_t flags )
2008-06-26 11:08:09 +10:00
{
2008-11-19 14:43:46 +11:00
int ret ;
2008-06-26 11:08:09 +10:00
2008-12-05 16:32:30 +11:00
ret = ctdb_ctrl_modflags ( ctdb , CONTROL_TIMEOUT ( ) , pnn , flags , ~ flags ) ;
if ( ret ! = 0 ) {
2008-11-19 14:43:46 +11:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to update nodeflags on remote nodes \n " ) ) ;
return - 1 ;
}
2008-06-26 11:08:09 +10:00
return 0 ;
}
2007-05-06 10:38:44 +10:00
2008-01-08 17:23:27 +11:00
/*
  called when a vacuum fetch has completed - just free it and do the next one

  Installed as the async completion function by
  vacuum_fetch_process_one(); the call result is not inspected.
 */
static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
{
	(void)talloc_free(state);
}
2015-06-02 21:39:00 +02:00
/**
* Process one elements of the vacuum fetch list :
* Migrate it over to us with the special flag
* CTDB_CALL_FLAG_VACUUM_MIGRATION .
*/
static bool vacuum_fetch_process_one ( struct ctdb_db_context * ctdb_db ,
uint32_t pnn ,
2015-10-29 17:30:30 +11:00
struct ctdb_rec_data_old * r )
2015-06-02 21:39:00 +02:00
{
struct ctdb_client_call_state * state ;
TDB_DATA data ;
struct ctdb_ltdb_header * hdr ;
struct ctdb_call call ;
ZERO_STRUCT ( call ) ;
call . call_id = CTDB_NULL_FUNC ;
call . flags = CTDB_IMMEDIATE_MIGRATION ;
call . flags | = CTDB_CALL_FLAG_VACUUM_MIGRATION ;
call . key . dptr = & r - > data [ 0 ] ;
call . key . dsize = r - > keylen ;
/* ensure we don't block this daemon - just skip a record if we can't get
the chainlock */
if ( tdb_chainlock_nonblock ( ctdb_db - > ltdb - > tdb , call . key ) ! = 0 ) {
return true ;
}
data = tdb_fetch ( ctdb_db - > ltdb - > tdb , call . key ) ;
if ( data . dptr = = NULL ) {
tdb_chainunlock ( ctdb_db - > ltdb - > tdb , call . key ) ;
return true ;
}
if ( data . dsize < sizeof ( struct ctdb_ltdb_header ) ) {
free ( data . dptr ) ;
tdb_chainunlock ( ctdb_db - > ltdb - > tdb , call . key ) ;
return true ;
}
hdr = ( struct ctdb_ltdb_header * ) data . dptr ;
if ( hdr - > dmaster = = pnn ) {
/* its already local */
free ( data . dptr ) ;
tdb_chainunlock ( ctdb_db - > ltdb - > tdb , call . key ) ;
return true ;
}
free ( data . dptr ) ;
state = ctdb_call_send ( ctdb_db , & call ) ;
tdb_chainunlock ( ctdb_db - > ltdb - > tdb , call . key ) ;
if ( state = = NULL ) {
DEBUG ( DEBUG_ERR , ( __location__ " Failed to setup vacuum fetch call \n " ) ) ;
return false ;
}
state - > async . fn = vacuum_fetch_callback ;
state - > async . private_data = NULL ;
return true ;
}
2008-01-08 21:28:42 +11:00
2008-01-08 17:23:27 +11:00
/*
handler for vacuum fetch
*/
2015-04-08 14:38:26 +10:00
static void vacuum_fetch_handler ( uint64_t srvid , TDB_DATA data ,
void * private_data )
2008-01-08 17:23:27 +11:00
{
2015-04-08 14:38:26 +10:00
struct ctdb_recoverd * rec = talloc_get_type (
private_data , struct ctdb_recoverd ) ;
struct ctdb_context * ctdb = rec - > ctdb ;
2008-07-30 14:24:56 +10:00
struct ctdb_marshall_buffer * recs ;
2008-01-08 17:23:27 +11:00
int ret , i ;
TALLOC_CTX * tmp_ctx = talloc_new ( ctdb ) ;
const char * name ;
2015-10-29 17:46:05 +11:00
struct ctdb_dbid_map_old * dbmap = NULL ;
2017-08-18 14:00:47 +10:00
uint8_t db_flags = 0 ;
2008-01-08 17:23:27 +11:00
struct ctdb_db_context * ctdb_db ;
2015-10-29 17:30:30 +11:00
struct ctdb_rec_data_old * r ;
2008-01-08 17:23:27 +11:00
2008-07-30 14:24:56 +10:00
recs = ( struct ctdb_marshall_buffer * ) data . dptr ;
2008-01-08 21:28:42 +11:00
if ( recs - > count = = 0 ) {
2015-06-02 21:57:54 +02:00
goto done ;
2008-01-08 21:28:42 +11:00
}
2008-01-08 17:23:27 +11:00
/* work out if the database is persistent */
ret = ctdb_ctrl_getdbmap ( ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE , tmp_ctx , & dbmap ) ;
if ( ret ! = 0 ) {
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to get dbids from local node \n " ) ) ;
2015-06-02 21:57:54 +02:00
goto done ;
2008-01-08 17:23:27 +11:00
}
for ( i = 0 ; i < dbmap - > num ; i + + ) {
2015-10-29 17:46:05 +11:00
if ( dbmap - > dbs [ i ] . db_id = = recs - > db_id ) {
2017-08-18 14:00:47 +10:00
db_flags = dbmap - > dbs [ i ] . flags ;
2008-01-08 17:23:27 +11:00
break ;
}
}
if ( i = = dbmap - > num ) {
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to find db_id 0x%x on local node \n " , recs - > db_id ) ) ;
2015-06-02 21:57:54 +02:00
goto done ;
2008-01-08 17:23:27 +11:00
}
/* find the name of this database */
if ( ctdb_ctrl_getdbname ( ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE , recs - > db_id , tmp_ctx , & name ) ! = 0 ) {
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_ERR , ( __location__ " Failed to get name of db 0x%x \n " , recs - > db_id ) ) ;
2015-06-02 21:57:54 +02:00
goto done ;
2008-01-08 17:23:27 +11:00
}
/* attach to it */
2017-08-18 14:00:47 +10:00
ctdb_db = ctdb_attach ( ctdb , CONTROL_TIMEOUT ( ) , name , db_flags ) ;
2008-01-08 17:23:27 +11:00
if ( ctdb_db = = NULL ) {
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_ERR , ( __location__ " Failed to attach to database '%s' \n " , name ) ) ;
2015-06-02 21:57:54 +02:00
goto done ;
2008-01-08 17:23:27 +11:00
}
2015-10-29 17:30:30 +11:00
r = ( struct ctdb_rec_data_old * ) & recs - > data [ 0 ] ;
2015-06-05 16:35:48 +10:00
while ( recs - > count ) {
2015-06-02 22:17:03 +02:00
bool ok ;
2015-06-05 16:35:48 +10:00
ok = vacuum_fetch_process_one ( ctdb_db , rec - > ctdb - > pnn , r ) ;
2015-06-02 22:17:03 +02:00
if ( ! ok ) {
break ;
}
2015-10-29 17:30:30 +11:00
r = ( struct ctdb_rec_data_old * ) ( r - > length + ( uint8_t * ) r ) ;
2015-06-05 16:35:48 +10:00
recs - > count - - ;
2015-06-02 22:17:03 +02:00
}
2015-06-02 21:57:54 +02:00
done :
2008-09-16 07:55:57 +10:00
talloc_free ( tmp_ctx ) ;
2008-01-08 17:23:27 +11:00
}
2007-06-07 16:34:33 +10:00
2014-04-22 15:24:49 +10:00
/*
* handler for database detach
*/
2015-04-08 14:38:26 +10:00
static void detach_database_handler ( uint64_t srvid , TDB_DATA data ,
void * private_data )
2014-04-22 15:24:49 +10:00
{
2015-04-08 14:38:26 +10:00
struct ctdb_recoverd * rec = talloc_get_type (
private_data , struct ctdb_recoverd ) ;
struct ctdb_context * ctdb = rec - > ctdb ;
2014-04-22 15:24:49 +10:00
uint32_t db_id ;
struct ctdb_db_context * ctdb_db ;
if ( data . dsize ! = sizeof ( db_id ) ) {
return ;
}
db_id = * ( uint32_t * ) data . dptr ;
ctdb_db = find_ctdb_db ( ctdb , db_id ) ;
if ( ctdb_db = = NULL ) {
/* database is not attached */
return ;
}
DLIST_REMOVE ( ctdb - > db_list , ctdb_db ) ;
DEBUG ( DEBUG_NOTICE , ( " Detached from database '%s' \n " ,
ctdb_db - > db_name ) ) ;
talloc_free ( ctdb_db ) ;
}
2007-07-04 08:36:59 +10:00
/*
  called when ctdb_wait_timeout should finish

  "p" points at the uint32_t flag owned by ctdb_wait_timeout(); setting
  it to 1 ends that function's wait loop.  ev, te and yt are unused.
 */
static void ctdb_wait_handler(struct tevent_context *ev,
			      struct tevent_timer *te,
			      struct timeval yt, void *p)
{
	*(uint32_t *)p = 1;
}
/*
wait for a given number of seconds
*/
2010-06-22 22:50:35 +09:30
static void ctdb_wait_timeout ( struct ctdb_context * ctdb , double secs )
2007-07-04 08:36:59 +10:00
{
uint32_t timed_out = 0 ;
2010-06-22 22:50:35 +09:30
time_t usecs = ( secs - ( time_t ) secs ) * 1000000 ;
2015-10-26 16:50:09 +11:00
tevent_add_timer ( ctdb - > ev , ctdb , timeval_current_ofs ( secs , usecs ) ,
ctdb_wait_handler , & timed_out ) ;
2007-07-04 08:36:59 +10:00
while ( ! timed_out ) {
2015-10-26 16:50:09 +11:00
tevent_loop_once ( ctdb - > ev ) ;
2007-07-04 08:36:59 +10:00
}
}
2007-11-13 10:27:44 +11:00
/*
called when an election times out ( ends )
*/
2015-10-26 16:50:09 +11:00
static void ctdb_election_timeout ( struct tevent_context * ev ,
struct tevent_timer * te ,
2007-11-13 10:27:44 +11:00
struct timeval t , void * p )
{
struct ctdb_recoverd * rec = talloc_get_type ( p , struct ctdb_recoverd ) ;
rec - > election_timeout = NULL ;
2010-06-22 22:55:20 +09:30
fast_start = false ;
2009-07-17 11:37:03 +10:00
2014-06-20 13:36:25 +10:00
DEBUG ( DEBUG_WARNING , ( " Election period ended \n " ) ) ;
2007-11-13 10:27:44 +11:00
}
/*
wait for an election to finish . It finished election_timeout seconds after
the last election packet is received
*/
static void ctdb_wait_election ( struct ctdb_recoverd * rec )
{
struct ctdb_context * ctdb = rec - > ctdb ;
while ( rec - > election_timeout ) {
2015-10-26 16:50:09 +11:00
tevent_loop_once ( ctdb - > ev ) ;
2007-11-13 10:27:44 +11:00
}
}
2007-10-15 14:28:51 +10:00
/*
2007-11-23 11:31:42 +11:00
Update our local flags from all remote connected nodes .
This is only run when we are or we belive we are the recovery master
2007-10-15 14:28:51 +10:00
*/
2015-10-29 17:22:48 +11:00
static int update_local_flags ( struct ctdb_recoverd * rec , struct ctdb_node_map_old * nodemap )
2007-10-15 14:28:51 +10:00
{
int j ;
2007-11-30 08:44:34 +11:00
struct ctdb_context * ctdb = rec - > ctdb ;
2007-10-15 14:28:51 +10:00
TALLOC_CTX * mem_ctx = talloc_new ( ctdb ) ;
/* get the nodemap for all active remote nodes and verify
they are the same as for this node
*/
for ( j = 0 ; j < nodemap - > num ; j + + ) {
2015-10-29 17:22:48 +11:00
struct ctdb_node_map_old * remote_nodemap = NULL ;
2007-10-15 14:28:51 +10:00
int ret ;
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_DISCONNECTED ) {
continue ;
}
if ( nodemap - > nodes [ j ] . pnn = = ctdb - > pnn ) {
continue ;
}
ret = ctdb_ctrl_getnodemap ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ j ] . pnn ,
mem_ctx , & remote_nodemap ) ;
if ( ret ! = 0 ) {
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to get nodemap from remote node %u \n " ,
2007-10-15 14:28:51 +10:00
nodemap - > nodes [ j ] . pnn ) ) ;
2007-11-28 15:04:20 +11:00
ctdb_set_culprit ( rec , nodemap - > nodes [ j ] . pnn ) ;
2007-10-15 14:28:51 +10:00
talloc_free ( mem_ctx ) ;
2016-04-27 21:47:08 +10:00
return - 1 ;
2007-10-15 14:28:51 +10:00
}
if ( nodemap - > nodes [ j ] . flags ! = remote_nodemap - > nodes [ j ] . flags ) {
2007-11-23 11:53:06 +11:00
/* We should tell our daemon about this so it
2007-11-23 10:52:29 +11:00
updates its flags or else we will log the same
message again in the next iteration of recovery .
2007-11-23 11:31:42 +11:00
Since we are the recovery master we can just as
well update the flags on all nodes .
2007-11-23 10:52:29 +11:00
*/
2013-06-26 15:22:46 +10:00
ret = ctdb_ctrl_modflags ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ j ] . pnn , remote_nodemap - > nodes [ j ] . flags , ~ remote_nodemap - > nodes [ j ] . flags ) ;
2008-11-19 14:43:46 +11:00
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " Unable to update nodeflags on remote nodes \n " ) ) ;
return - 1 ;
}
2007-11-23 10:52:29 +11:00
2007-11-23 11:53:06 +11:00
/* Update our local copy of the flags in the recovery
daemon .
*/
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_NOTICE , ( " Remote node %u had flags 0x%x, local had 0x%x - updating local \n " ,
2007-11-23 11:53:06 +11:00
nodemap - > nodes [ j ] . pnn , remote_nodemap - > nodes [ j ] . flags ,
nodemap - > nodes [ j ] . flags ) ) ;
2007-10-15 14:28:51 +10:00
nodemap - > nodes [ j ] . flags = remote_nodemap - > nodes [ j ] . flags ;
}
talloc_free ( remote_nodemap ) ;
}
talloc_free ( mem_ctx ) ;
2016-04-27 21:47:08 +10:00
return 0 ;
2007-10-15 14:28:51 +10:00
}
2015-10-12 16:52:49 +02:00
/* Create a new random generation id.
2007-08-22 12:38:31 +10:00
The generation id can not be the INVALID_GENERATION id
*/
static uint32_t new_generation ( void )
{
uint32_t generation ;
while ( 1 ) {
generation = random ( ) ;
if ( generation ! = INVALID_GENERATION ) {
break ;
}
}
return generation ;
}
2007-10-05 12:01:40 +10:00
2016-05-24 14:54:39 +10:00
static bool ctdb_recovery_have_lock ( struct ctdb_recoverd * rec )
2016-02-17 20:20:03 +11:00
{
2016-05-24 14:54:39 +10:00
return ( rec - > recovery_lock_handle ! = NULL ) ;
2016-02-17 20:20:03 +11:00
}
/* Result slot filled in by take_reclock_handler() */
struct hold_reclock_state {
	/* Set once the handler has run, whatever the outcome */
	bool done;
	/* True only when the handler reported success */
	bool locked;
	/* Latency reported by the handler on success */
	double latency;
};
2016-05-29 07:25:05 +10:00
static void take_reclock_handler ( char status ,
2016-02-17 20:20:03 +11:00
double latency ,
void * private_data )
{
struct hold_reclock_state * s =
( struct hold_reclock_state * ) private_data ;
switch ( status ) {
case ' 0 ' :
2016-06-01 17:32:42 +10:00
s - > latency = latency ;
2016-02-17 20:20:03 +11:00
break ;
case ' 1 ' :
DEBUG ( DEBUG_ERR ,
( " Unable to take recovery lock - contention \n " ) ) ;
break ;
default :
DEBUG ( DEBUG_ERR , ( " ERROR: when taking recovery lock \n " ) ) ;
}
s - > done = true ;
2016-05-31 18:37:30 +10:00
s - > locked = ( status = = ' 0 ' ) ;
2016-02-17 20:20:03 +11:00
}
2016-05-29 07:25:05 +10:00
static bool ctdb_recovery_lock ( struct ctdb_recoverd * rec ) ;
static void lost_reclock_handler ( void * private_data )
{
struct ctdb_recoverd * rec = talloc_get_type_abort (
private_data , struct ctdb_recoverd ) ;
DEBUG ( DEBUG_ERR ,
( " Recovery lock helper terminated unexpectedly - "
" trying to retake recovery lock \n " ) ) ;
TALLOC_FREE ( rec - > recovery_lock_handle ) ;
if ( ! ctdb_recovery_lock ( rec ) ) {
DEBUG ( DEBUG_ERR , ( " Failed to take recovery lock \n " ) ) ;
}
}
2016-05-24 14:54:39 +10:00
static bool ctdb_recovery_lock ( struct ctdb_recoverd * rec )
2016-02-17 20:20:03 +11:00
{
2016-05-24 14:54:39 +10:00
struct ctdb_context * ctdb = rec - > ctdb ;
2016-02-17 20:20:03 +11:00
struct ctdb_cluster_mutex_handle * h ;
struct hold_reclock_state s = {
. done = false ,
2016-05-31 18:37:30 +10:00
. locked = false ,
2016-06-01 17:32:42 +10:00
. latency = 0 ,
2016-02-17 20:20:03 +11:00
} ;
2016-06-01 18:56:33 +10:00
h = ctdb_cluster_mutex ( rec , ctdb , ctdb - > recovery_lock , 0 ,
2016-05-29 07:25:05 +10:00
take_reclock_handler , & s ,
lost_reclock_handler , rec ) ;
2016-02-17 20:20:03 +11:00
if ( h = = NULL ) {
2016-06-01 15:56:42 +10:00
return false ;
2016-02-17 20:20:03 +11:00
}
while ( ! s . done ) {
tevent_loop_once ( ctdb - > ev ) ;
}
2016-05-24 14:54:39 +10:00
if ( ! s . locked ) {
2016-06-01 17:32:42 +10:00
talloc_free ( h ) ;
2016-05-24 14:54:39 +10:00
return false ;
}
rec - > recovery_lock_handle = h ;
2016-06-01 17:32:42 +10:00
ctdb_ctrl_report_recd_lock_latency ( ctdb , CONTROL_TIMEOUT ( ) ,
s . latency ) ;
2016-05-24 14:54:39 +10:00
return true ;
2016-02-17 20:20:03 +11:00
}
2016-05-24 14:54:39 +10:00
static void ctdb_recovery_unlock ( struct ctdb_recoverd * rec )
2016-02-17 20:20:03 +11:00
{
2016-05-24 14:54:39 +10:00
if ( rec - > recovery_lock_handle ! = NULL ) {
2016-02-17 20:20:03 +11:00
DEBUG ( DEBUG_NOTICE , ( " Releasing recovery lock \n " ) ) ;
2016-05-24 14:54:39 +10:00
TALLOC_FREE ( rec - > recovery_lock_handle ) ;
2016-02-17 20:20:03 +11:00
}
}
2013-06-28 16:31:07 +10:00
static void ban_misbehaving_nodes ( struct ctdb_recoverd * rec , bool * self_ban )
2013-06-28 14:31:02 +10:00
{
struct ctdb_context * ctdb = rec - > ctdb ;
int i ;
struct ctdb_banning_state * ban_state ;
2013-06-28 16:31:07 +10:00
* self_ban = false ;
2013-06-28 14:31:02 +10:00
for ( i = 0 ; i < ctdb - > num_nodes ; i + + ) {
if ( ctdb - > nodes [ i ] - > ban_state = = NULL ) {
continue ;
}
ban_state = ( struct ctdb_banning_state * ) ctdb - > nodes [ i ] - > ban_state ;
if ( ban_state - > count < 2 * ctdb - > num_nodes ) {
continue ;
}
DEBUG ( DEBUG_NOTICE , ( " Node %u reached %u banning credits - banning it for %u seconds \n " ,
ctdb - > nodes [ i ] - > pnn , ban_state - > count ,
ctdb - > tunable . recovery_ban_period ) ) ;
ctdb_ban_node ( rec , ctdb - > nodes [ i ] - > pnn , ctdb - > tunable . recovery_ban_period ) ;
ban_state - > count = 0 ;
2013-06-28 16:31:07 +10:00
/* Banning ourself? */
if ( ctdb - > nodes [ i ] - > pnn = = rec - > ctdb - > pnn ) {
* self_ban = true ;
}
2013-06-28 14:31:02 +10:00
}
}
2016-12-09 15:04:03 +11:00
/* Bookkeeping for one run of an external helper, see helper_run() */
struct helper_state {
	/* Pipe pair: fd[0] is read by helper_handler(), fd[1] is handed
	 * to the helper on its command line */
	int fd[2];
	/* Helper process id, -1 while no helper has been started */
	pid_t pid;
	/* Status read from the pipe, or EPIPE on a short/failed read */
	int result;
	/* Set by helper_handler() once a result has been recorded */
	bool done;
};
static void helper_handler ( struct tevent_context * ev ,
struct tevent_fd * fde ,
uint16_t flags , void * private_data )
{
struct helper_state * state = talloc_get_type_abort (
private_data , struct helper_state ) ;
int ret ;
ret = sys_read ( state - > fd [ 0 ] , & state - > result , sizeof ( state - > result ) ) ;
if ( ret ! = sizeof ( state - > result ) ) {
state - > result = EPIPE ;
}
state - > done = true ;
}
/*
 * Run an external helper program and wait for the status it reports.
 *
 * A pipe is created and the helper is started via ctdb_vfork_exec()
 * with the arguments: <write-fd number>, rec->ctdb->daemon.name and,
 * when non-NULL, arg.  The helper's int status is collected from the
 * pipe by helper_handler().  While waiting, the event loop is run; the
 * wait is abandoned (and treated as failure) if rec->recmaster changes.
 *
 * rec:     recovery daemon context
 * mem_ctx: talloc parent for the temporary state
 * prog:    path of the helper to execute
 * arg:     optional extra argument for the helper, may be NULL
 * type:    short name of the helper, used in log messages only
 *
 * Returns 0 if the helper reported status 0, -1 on any failure.  If a
 * helper process was started it is sent SIGKILL before returning.
 */
static int helper_run(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
		      const char *prog, const char *arg, const char *type)
{
	struct helper_state *state;
	struct tevent_fd *fde;
	const char **args;
	int nargs, ret;
	uint32_t recmaster = rec->recmaster;

	state = talloc_zero(mem_ctx, struct helper_state);
	if (state == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
		return -1;
	}

	state->pid = -1;
	/* talloc_zero() leaves both slots at 0, which is a valid
	 * descriptor.  Mark them unopened so that the fail path never
	 * closes fd 0 when pipe() itself fails.
	 */
	state->fd[0] = -1;
	state->fd[1] = -1;

	ret = pipe(state->fd);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,
		      ("Failed to create pipe for %s helper\n", type));
		goto fail;
	}

	/* Only the write end may be inherited by the helper */
	set_close_on_exec(state->fd[0]);

	nargs = 4;
	args = talloc_array(state, const char *, nargs);
	if (args == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
		goto fail;
	}

	args[0] = talloc_asprintf(args, "%d", state->fd[1]);
	if (args[0] == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
		goto fail;
	}
	args[1] = rec->ctdb->daemon.name;
	args[2] = arg;
	args[3] = NULL;

	/* Without the optional argument the list is one entry shorter */
	if (args[2] == NULL) {
		nargs = 3;
	}

	state->pid = ctdb_vfork_exec(state, rec->ctdb, prog, nargs, args);
	if (state->pid == -1) {
		DEBUG(DEBUG_ERR,
		      ("Failed to create child for %s helper\n", type));
		goto fail;
	}

	/* The helper owns the write end now */
	close(state->fd[1]);
	state->fd[1] = -1;

	state->done = false;

	/* Parent the event on state so it can never outlive the
	 * private data that helper_handler() dereferences.
	 */
	fde = tevent_add_fd(rec->ctdb->ev, state, state->fd[0],
			    TEVENT_FD_READ, helper_handler, state);
	if (fde == NULL) {
		DEBUG(DEBUG_ERR,
		      ("Failed to create fd event for %s helper\n", type));
		goto fail;
	}
	tevent_fd_set_auto_close(fde);

	while (!state->done) {
		tevent_loop_once(rec->ctdb->ev);

		/* If recmaster changes, we have lost election */
		if (recmaster != rec->recmaster) {
			D_ERR("Recmaster changed to %u, aborting %s\n",
			      rec->recmaster, type);
			state->result = 1;
			break;
		}
	}

	/* Auto-close is set, so freeing the event both unregisters the
	 * handler and closes the read end exactly once.
	 */
	TALLOC_FREE(fde);
	state->fd[0] = -1;

	if (state->result != 0) {
		goto fail;
	}

	ctdb_kill(rec->ctdb, state->pid, SIGKILL);
	talloc_free(state);
	return 0;

fail:
	if (state->fd[0] != -1) {
		close(state->fd[0]);
	}
	if (state->fd[1] != -1) {
		close(state->fd[1]);
	}
	if (state->pid != -1) {
		ctdb_kill(rec->ctdb, state->pid, SIGKILL);
	}
	talloc_free(state);
	return -1;
}
2016-12-09 16:21:39 +11:00
static int ctdb_takeover ( struct ctdb_recoverd * rec ,
uint32_t * force_rebalance_nodes )
{
static char prog [ PATH_MAX + 1 ] = " " ;
char * arg ;
int i ;
if ( ! ctdb_set_helper ( " takeover_helper " , prog , sizeof ( prog ) ,
" CTDB_TAKEOVER_HELPER " , CTDB_HELPER_BINDIR ,
" ctdb_takeover_helper " ) ) {
ctdb_die ( rec - > ctdb , " Unable to set takeover helper \n " ) ;
}
arg = NULL ;
for ( i = 0 ; i < talloc_array_length ( force_rebalance_nodes ) ; i + + ) {
uint32_t pnn = force_rebalance_nodes [ i ] ;
if ( arg = = NULL ) {
arg = talloc_asprintf ( rec , " %u " , pnn ) ;
} else {
arg = talloc_asprintf_append ( arg , " ,%u " , pnn ) ;
}
if ( arg = = NULL ) {
DEBUG ( DEBUG_ERR , ( __location__ " memory error \n " ) ) ;
return - 1 ;
}
}
return helper_run ( rec , rec , prog , arg , " takeover " ) ;
}
2016-12-09 15:04:03 +11:00
2013-08-27 12:14:34 +10:00
static bool do_takeover_run ( struct ctdb_recoverd * rec ,
2016-05-03 15:35:08 +10:00
struct ctdb_node_map_old * nodemap )
2013-08-27 12:14:34 +10:00
{
2013-08-27 15:04:40 +10:00
uint32_t * nodes = NULL ;
2015-10-28 18:23:13 +11:00
struct ctdb_disable_message dtr ;
2013-09-03 11:21:09 +10:00
TDB_DATA data ;
2013-08-27 15:04:40 +10:00
int i ;
2013-09-06 11:23:07 +10:00
uint32_t * rebalance_nodes = rec - > force_rebalance_nodes ;
2013-08-27 12:14:34 +10:00
int ret ;
bool ok ;
2013-09-18 17:06:16 +10:00
DEBUG ( DEBUG_NOTICE , ( " Takeover run starting \n " ) ) ;
2015-02-08 20:52:12 +11:00
if ( ctdb_op_is_in_progress ( rec - > takeover_run ) ) {
2013-09-03 11:20:01 +10:00
DEBUG ( DEBUG_ERR , ( __location__
" takeover run already in progress \n " ) ) ;
ok = false ;
goto done ;
}
2015-02-08 20:52:12 +11:00
if ( ! ctdb_op_begin ( rec - > takeover_run ) ) {
2013-08-27 15:04:40 +10:00
ok = false ;
goto done ;
2013-09-03 11:21:09 +10:00
}
2013-08-27 15:04:40 +10:00
/* Disable IP checks (takeover runs, really) on other nodes
* while doing this takeover run . This will stop those other
* nodes from triggering takeover runs when think they should
* be hosting an IP but it isn ' t yet on an interface . Don ' t
* wait for replies since a failure here might cause some
* noise in the logs but will not actually cause a problem .
*/
2016-01-11 17:23:12 +11:00
ZERO_STRUCT ( dtr ) ;
2013-08-27 15:04:40 +10:00
dtr . srvid = 0 ; /* No reply */
dtr . pnn = - 1 ;
data . dptr = ( uint8_t * ) & dtr ;
data . dsize = sizeof ( dtr ) ;
nodes = list_of_connected_nodes ( rec - > ctdb , nodemap , rec , false ) ;
2013-10-24 11:13:16 +11:00
/* Disable for 60 seconds. This can be a tunable later if
2013-08-27 15:04:40 +10:00
* necessary .
*/
2015-10-28 18:23:13 +11:00
dtr . timeout = 60 ;
2013-08-27 15:04:40 +10:00
for ( i = 0 ; i < talloc_array_length ( nodes ) ; i + + ) {
if ( ctdb_client_send_message ( rec - > ctdb , nodes [ i ] ,
CTDB_SRVID_DISABLE_TAKEOVER_RUNS ,
data ) ! = 0 ) {
DEBUG ( DEBUG_INFO , ( " Failed to disable takeover runs \n " ) ) ;
}
}
2013-09-03 11:20:01 +10:00
2016-12-09 16:21:39 +11:00
ret = ctdb_takeover ( rec , rec - > force_rebalance_nodes ) ;
2013-09-03 11:21:09 +10:00
2013-08-27 15:04:40 +10:00
/* Reenable takeover runs and IP checks on other nodes */
2015-10-28 18:23:13 +11:00
dtr . timeout = 0 ;
2013-08-27 15:04:40 +10:00
for ( i = 0 ; i < talloc_array_length ( nodes ) ; i + + ) {
if ( ctdb_client_send_message ( rec - > ctdb , nodes [ i ] ,
CTDB_SRVID_DISABLE_TAKEOVER_RUNS ,
data ) ! = 0 ) {
2015-07-26 23:02:57 +02:00
DEBUG ( DEBUG_INFO , ( " Failed to re-enable takeover runs \n " ) ) ;
2013-08-27 15:04:40 +10:00
}
2013-09-03 11:21:09 +10:00
}
2013-08-27 12:14:34 +10:00
if ( ret ! = 0 ) {
2013-09-18 17:06:16 +10:00
DEBUG ( DEBUG_ERR , ( " ctdb_takeover_run() failed \n " ) ) ;
2013-08-27 12:14:34 +10:00
ok = false ;
goto done ;
}
ok = true ;
2013-09-04 14:30:04 +10:00
/* Takeover run was successful so clear force rebalance targets */
2013-09-06 11:23:07 +10:00
if ( rebalance_nodes = = rec - > force_rebalance_nodes ) {
TALLOC_FREE ( rec - > force_rebalance_nodes ) ;
} else {
DEBUG ( DEBUG_WARNING ,
( " Rebalance target nodes changed during takeover run - not clearing \n " ) ) ;
}
2013-08-27 12:14:34 +10:00
done :
rec - > need_takeover_run = ! ok ;
2013-08-27 15:04:40 +10:00
talloc_free ( nodes ) ;
2015-02-08 20:52:12 +11:00
ctdb_op_end ( rec - > takeover_run ) ;
2013-09-18 17:06:16 +10:00
DEBUG ( DEBUG_NOTICE , ( " Takeover run %s \n " , ok ? " completed successfully " : " unsuccessful " ) ) ;
2013-08-27 12:14:34 +10:00
return ok ;
}
2015-09-17 16:22:38 +10:00
/*
 * Run the recovery helper with a freshly generated generation id.
 *
 * The helper path is resolved with ctdb_set_helper(); failure to do so
 * is fatal (ctdb_die).  CTDB_DBDIR_STATE is exported to the
 * environment from rec->ctdb->db_directory_state before the helper is
 * started.
 *
 * Returns the result of helper_run(), or -1 on memory allocation
 * failure.
 */
static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
{
	static char prog[PATH_MAX+1] = "";
	const char *generation_arg;
	bool have_helper;

	have_helper = ctdb_set_helper("recovery_helper", prog, sizeof(prog),
				      "CTDB_RECOVERY_HELPER",
				      CTDB_HELPER_BINDIR,
				      "ctdb_recovery_helper");
	if (!have_helper) {
		ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
	}

	generation_arg = talloc_asprintf(mem_ctx, "%u", new_generation());
	if (generation_arg == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
		return -1;
	}

	setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);

	return helper_run(rec, mem_ctx, prog, generation_arg, "recovery");
}
2015-09-17 16:00:47 +10:00
/*
 * We are the recmaster and a recovery is needed - run one.
 *
 * Steps, in order:
 *  - bail out if pnn is no longer rec->recmaster;
 *  - set rec->need_recovery and begin the rec->recovery operation;
 *  - abort if an election is in progress, or if banning misbehaving
 *    nodes ends up banning this node;
 *  - take the recovery lock if one is configured and not already held;
 *  - make sure every database exists both locally and remotely;
 *  - refresh node capabilities and push node flags to all nodes;
 *  - run the recovery helper, then a takeover run;
 *  - broadcast CTDB_SRVID_RECONFIGURE, reset ban counts and disable
 *    further recoveries for tunable.rerecovery_timeout.
 *
 * vnnmap is accepted but not used by this function.
 *
 * Returns 0 on success, -1 on failure.  Every failure after
 * ctdb_op_begin() leaves through the fail label so that the matching
 * ctdb_op_end() is always executed.
 */
static int do_recovery(struct ctdb_recoverd *rec,
		       TALLOC_CTX *mem_ctx, uint32_t pnn,
		       struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
{
	struct ctdb_context *ctdb = rec->ctdb;
	int i, ret;
	struct ctdb_dbid_map_old *dbmap;
	bool self_ban;

	DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));

	/* Check if the current node is still the recmaster.  It's possible that
	 * re-election has changed the recmaster.
	 */
	if (pnn != rec->recmaster) {
		DEBUG(DEBUG_NOTICE,
		      ("Recovery master changed to %u, aborting recovery\n",
		       rec->recmaster));
		return -1;
	}

	/* if recovery fails, force it again */
	rec->need_recovery = true;

	if (!ctdb_op_begin(rec->recovery)) {
		return -1;
	}

	if (rec->election_timeout) {
		/* an election is in progress */
		DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
		goto fail;
	}

	ban_misbehaving_nodes(rec, &self_ban);
	if (self_ban) {
		DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
		goto fail;
	}

	if (ctdb->recovery_lock != NULL) {
		if (ctdb_recovery_have_lock(rec)) {
			DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
		} else {
			DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
					     ctdb->recovery_lock));
			if (!ctdb_recovery_lock(rec)) {
				if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
					/* If ctdb is trying first recovery, it's
					 * possible that current node does not know
					 * yet who the recmaster is.
					 */
					DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
							  " - retrying recovery\n"));
					goto fail;
				}

				DEBUG(DEBUG_ERR, ("Unable to get recovery lock - aborting recovery "
						  "and ban ourself for %u seconds\n",
						  ctdb->tunable.recovery_ban_period));
				ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
				goto fail;
			}
			DEBUG(DEBUG_NOTICE,
			      ("Recovery lock taken successfully by recovery daemon\n"));
		}
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));

	/* get a list of all databases */
	ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
		goto fail;
	}

	/* we do the db creation before we set the recovery mode, so the freeze happens
	   on all databases we will be dealing with. */

	/* verify that we have all the databases any other node has */
	ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
		goto fail;
	}

	/* verify that all other nodes have all our databases */
	ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
		goto fail;
	}
	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));

	/* Retrieve capabilities from all connected nodes */
	ret = update_capabilities(rec, nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
		/* The recovery operation has been begun: end it on the way out */
		goto fail;
	}

	/*
	  update all nodes to have the same flags that we have
	 */
	for (i = 0; i < nodemap->num; i++) {
		if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
			continue;
		}

		ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
		if (ret != 0) {
			if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
				/* Failure for an inactive node is only worth a warning */
				DEBUG(DEBUG_WARNING, (__location__ " Unable to update flags on inactive node %d\n", i));
			} else {
				DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
				/* The recovery operation has been begun: end it on the way out */
				goto fail;
			}
		}
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));

	ret = db_recovery_parallel(rec, mem_ctx);
	if (ret != 0) {
		goto fail;
	}

	/* The result is deliberately not checked here: do_takeover_run()
	 * records a failure in rec->need_takeover_run itself.
	 */
	do_takeover_run(rec, nodemap);

	/* send a message to all clients telling them that the cluster
	   has been reconfigured */
	ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
				       CTDB_SRVID_RECONFIGURE, tdb_null);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
		goto fail;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));

	rec->need_recovery = false;
	ctdb_op_end(rec->recovery);

	/* we managed to complete a full recovery, make sure to forgive
	   any past sins by the nodes that could now participate in the
	   recovery.
	*/
	DEBUG(DEBUG_ERR, ("Resetting ban count to 0 for all nodes\n"));
	for (i = 0; i < nodemap->num; i++) {
		struct ctdb_banning_state *ban_state;

		if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
			continue;
		}

		/* NOTE(review): ctdb->nodes is indexed by PNN here but by
		 * array position in ban_misbehaving_nodes() - confirm the
		 * two always agree.
		 */
		ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
		if (ban_state == NULL) {
			continue;
		}

		ban_state->count = 0;
	}

	/* We just finished a recovery successfully.
	   We now wait for rerecovery_timeout before we allow
	   another recovery to take place.
	*/
	DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
	ctdb_op_disable(rec->recovery, ctdb->ev,
			ctdb->tunable.rerecovery_timeout);
	return 0;

fail:
	ctdb_op_end(rec->recovery);
	return -1;
}
2007-05-04 08:30:18 +10:00
2007-05-07 04:41:12 +10:00
2007-06-07 19:17:27 +10:00
/*
 * Payload of an election message.
 *
 * An instance is sent as-is (raw bytes, sizeof the struct) by
 * send_election_request(), so the field order and types are part of
 * the message format and must not be changed.
 *
 * Elections are documented as being decided on the number of connected
 * nodes, then the priority time, then the pnn.
 * NOTE(review): ctdb_election_win() in this file compares node flags,
 * priority_time and pnn only; num_connected is filled in but not
 * compared there.
 */
struct election_message {
	/* Nodes not flagged disconnected; 0 if we cannot be recmaster */
	uint32_t num_connected;
	/* Sender's priority time */
	struct timeval priority_time;
	/* Sender's node number */
	uint32_t pnn;
	/* Sender's own node flags */
	uint32_t node_flags;
};
2007-06-07 19:17:27 +10:00
/*
form this nodes election data
*/
static void ctdb_election_data ( struct ctdb_recoverd * rec , struct election_message * em )
{
int ret , i ;
2015-10-29 17:22:48 +11:00
struct ctdb_node_map_old * nodemap ;
2007-06-07 19:17:27 +10:00
struct ctdb_context * ctdb = rec - > ctdb ;
ZERO_STRUCTP ( em ) ;
2007-09-04 10:33:10 +10:00
em - > pnn = rec - > ctdb - > pnn ;
2007-06-07 19:17:27 +10:00
em - > priority_time = rec - > priority_time ;
ret = ctdb_ctrl_getnodemap ( ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE , rec , & nodemap ) ;
if ( ret ! = 0 ) {
2013-10-30 11:32:28 +11:00
DEBUG ( DEBUG_ERR , ( __location__ " unable to get node map \n " ) ) ;
2007-06-07 19:17:27 +10:00
return ;
}
2009-07-17 11:37:03 +10:00
rec - > node_flags = nodemap - > nodes [ ctdb - > pnn ] . flags ;
em - > node_flags = rec - > node_flags ;
2007-06-07 19:17:27 +10:00
for ( i = 0 ; i < nodemap - > num ; i + + ) {
if ( ! ( nodemap - > nodes [ i ] . flags & NODE_FLAGS_DISCONNECTED ) ) {
em - > num_connected + + ;
}
}
2008-05-06 13:56:56 +10:00
/* we shouldnt try to win this election if we cant be a recmaster */
if ( ( ctdb - > capabilities & CTDB_CAP_RECMASTER ) = = 0 ) {
em - > num_connected = 0 ;
em - > priority_time = timeval_current ( ) ;
}
2007-06-07 19:17:27 +10:00
talloc_free ( nodemap ) ;
}
/*
see if the given election data wins
*/
static bool ctdb_election_win ( struct ctdb_recoverd * rec , struct election_message * em )
{
struct election_message myem ;
2007-10-05 13:28:21 +10:00
int cmp = 0 ;
2007-06-07 19:17:27 +10:00
ctdb_election_data ( rec , & myem ) ;
2015-07-26 23:02:57 +02:00
/* we cant win if we don't have the recmaster capability */
2008-05-06 13:56:56 +10:00
if ( ( rec - > ctdb - > capabilities & CTDB_CAP_RECMASTER ) = = 0 ) {
return false ;
}
2007-10-11 06:16:36 +10:00
/* we cant win if we are banned */
if ( rec - > node_flags & NODE_FLAGS_BANNED ) {
2007-10-15 14:17:49 +10:00
return false ;
2013-06-21 14:06:22 +02:00
}
2007-10-05 13:28:21 +10:00
2009-07-09 14:44:03 +10:00
/* we cant win if we are stopped */
if ( rec - > node_flags & NODE_FLAGS_STOPPED ) {
return false ;
2013-06-21 14:06:22 +02:00
}
2009-07-09 14:44:03 +10:00
2007-10-11 06:16:36 +10:00
/* we will automatically win if the other node is banned */
if ( em - > node_flags & NODE_FLAGS_BANNED ) {
2007-10-15 14:17:49 +10:00
return true ;
2007-10-05 13:28:21 +10:00
}
2009-07-09 14:44:03 +10:00
/* we will automatically win if the other node is banned */
if ( em - > node_flags & NODE_FLAGS_STOPPED ) {
return true ;
}
2007-06-07 19:17:27 +10:00
/* then the longest running node */
if ( cmp = = 0 ) {
2007-06-07 19:21:55 +10:00
cmp = timeval_compare ( & em - > priority_time , & myem . priority_time ) ;
2007-06-07 19:17:27 +10:00
}
if ( cmp = = 0 ) {
2007-09-04 10:33:10 +10:00
cmp = ( int ) myem . pnn - ( int ) em - > pnn ;
2007-06-07 19:17:27 +10:00
}
return cmp > 0 ;
}
2007-06-07 15:18:55 +10:00
/*
send out an election request
*/
2013-10-29 16:38:42 +11:00
static int send_election_request ( struct ctdb_recoverd * rec , uint32_t pnn )
2007-05-07 06:51:58 +10:00
{
int ret ;
TDB_DATA election_data ;
struct election_message emsg ;
uint64_t srvid ;
2007-06-07 18:37:27 +10:00
struct ctdb_context * ctdb = rec - > ctdb ;
2007-10-11 06:16:36 +10:00
2015-10-29 17:51:52 +11:00
srvid = CTDB_SRVID_ELECTION ;
2007-05-07 06:51:58 +10:00
2007-06-07 19:17:27 +10:00
ctdb_election_data ( rec , & emsg ) ;
2007-05-07 06:51:58 +10:00
election_data . dsize = sizeof ( struct election_message ) ;
election_data . dptr = ( unsigned char * ) & emsg ;
2013-10-29 16:38:42 +11:00
/* first we assume we will win the election and set
recoverymaster to be ourself on the current node
*/
2015-10-23 15:27:12 +11:00
ret = ctdb_ctrl_setrecmaster ( ctdb , CONTROL_TIMEOUT ( ) ,
CTDB_CURRENT_NODE , pnn ) ;
2013-10-29 16:38:42 +11:00
if ( ret ! = 0 ) {
2015-10-23 15:27:12 +11:00
DEBUG ( DEBUG_ERR , ( __location__ " failed to set recmaster \n " ) ) ;
2013-10-29 16:38:42 +11:00
return - 1 ;
}
2015-10-23 14:32:41 +11:00
rec - > recmaster = pnn ;
2013-10-29 16:38:42 +11:00
2007-05-07 06:51:58 +10:00
/* send an election message to all active nodes */
2009-07-17 11:37:03 +10:00
DEBUG ( DEBUG_INFO , ( __location__ " Send election request to all active nodes \n " ) ) ;
2013-11-11 12:39:27 +11:00
return ctdb_client_send_message ( ctdb , CTDB_BROADCAST_ALL , srvid , election_data ) ;
2007-05-07 06:51:58 +10:00
}
2007-11-13 10:27:44 +11:00
/*
we think we are winning the election - send a broadcast election request
*/
2015-10-26 16:50:09 +11:00
static void election_send_request ( struct tevent_context * ev ,
struct tevent_timer * te ,
struct timeval t , void * p )
2007-11-13 10:27:44 +11:00
{
struct ctdb_recoverd * rec = talloc_get_type ( p , struct ctdb_recoverd ) ;
int ret ;
2013-10-29 16:38:42 +11:00
ret = send_election_request ( rec , ctdb_get_pnn ( rec - > ctdb ) ) ;
2007-11-13 10:27:44 +11:00
if ( ret ! = 0 ) {
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_ERR , ( " Failed to send election request! \n " ) ) ;
2007-11-13 10:27:44 +11:00
}
2015-10-23 16:03:38 +11:00
TALLOC_FREE ( rec - > send_election_te ) ;
2007-11-13 10:27:44 +11:00
}
2008-04-01 15:34:54 +11:00
/*
handler for memory dumps
*/
2015-04-08 14:38:26 +10:00
static void mem_dump_handler ( uint64_t srvid , TDB_DATA data , void * private_data )
2008-04-01 15:34:54 +11:00
{
2015-04-08 14:38:26 +10:00
struct ctdb_recoverd * rec = talloc_get_type (
private_data , struct ctdb_recoverd ) ;
struct ctdb_context * ctdb = rec - > ctdb ;
2008-04-01 15:34:54 +11:00
TALLOC_CTX * tmp_ctx = talloc_new ( ctdb ) ;
TDB_DATA * dump ;
int ret ;
2015-10-29 14:32:49 +11:00
struct ctdb_srvid_message * rd ;
2008-04-01 15:34:54 +11:00
2015-10-29 14:32:49 +11:00
if ( data . dsize ! = sizeof ( struct ctdb_srvid_message ) ) {
2008-04-01 15:34:54 +11:00
DEBUG ( DEBUG_ERR , ( __location__ " Wrong size of return address. \n " ) ) ;
2008-09-16 09:00:48 +10:00
talloc_free ( tmp_ctx ) ;
2008-04-01 15:34:54 +11:00
return ;
}
2015-10-29 14:32:49 +11:00
rd = ( struct ctdb_srvid_message * ) data . dptr ;
2008-04-01 15:34:54 +11:00
dump = talloc_zero ( tmp_ctx , TDB_DATA ) ;
if ( dump = = NULL ) {
DEBUG ( DEBUG_ERR , ( __location__ " Failed to allocate memory for memdump \n " ) ) ;
talloc_free ( tmp_ctx ) ;
return ;
}
ret = ctdb_dump_memory ( ctdb , dump ) ;
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " ctdb_dump_memory() failed \n " ) ) ;
talloc_free ( tmp_ctx ) ;
return ;
}
DEBUG ( DEBUG_ERR , ( " recovery master memory dump \n " ) ) ;
2010-06-02 09:45:21 +10:00
ret = ctdb_client_send_message ( ctdb , rd - > pnn , rd - > srvid , * dump ) ;
2008-04-01 15:34:54 +11:00
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( " Failed to send rd memdump reply message \n " ) ) ;
2008-09-16 09:00:48 +10:00
talloc_free ( tmp_ctx ) ;
2008-04-01 15:34:54 +11:00
return ;
}
talloc_free ( tmp_ctx ) ;
}
2009-06-01 14:18:34 +10:00
/*
handler for reload_nodes
*/
2015-04-08 14:38:26 +10:00
static void reload_nodes_handler ( uint64_t srvid , TDB_DATA data ,
void * private_data )
2009-06-01 14:18:34 +10:00
{
2015-04-08 14:38:26 +10:00
struct ctdb_recoverd * rec = talloc_get_type (
private_data , struct ctdb_recoverd ) ;
2009-06-01 14:18:34 +10:00
DEBUG ( DEBUG_ERR , ( __location__ " Reload nodes file from recovery daemon \n " ) ) ;
2013-10-14 13:54:39 +11:00
ctdb_load_nodes_file ( rec - > ctdb ) ;
2009-06-01 14:18:34 +10:00
}
2009-10-06 12:11:32 +11:00
2015-04-08 14:38:26 +10:00
/*
 * Handler for node rebalance requests.
 *
 * Only acted upon when this node is the recmaster.  data must carry
 * exactly one uint32_t: the PNN to add to rec->force_rebalance_nodes.
 * The list is replaced by a newly allocated copy with the PNN
 * appended; the old list is freed.
 */
static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
					void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(
		private_data, struct ctdb_recoverd);
	struct ctdb_context *ctdb = rec->ctdb;
	uint32_t pnn;
	uint32_t *t;
	size_t len;

	if (rec->recmaster != ctdb_get_pnn(ctdb)) {
		return;
	}

	if (data.dsize != sizeof(uint32_t)) {
		/* Both values are size_t, hence %zu */
		DEBUG(DEBUG_ERR, (__location__ " Incorrect size of node rebalance message. Was %zu but expected %zu bytes\n", data.dsize, sizeof(uint32_t)));
		return;
	}

	if (data.dptr == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " No data received\n"));
		return;
	}

	/* The payload is a byte buffer with no alignment guarantee, so
	 * copy the value out instead of dereferencing a cast pointer.
	 */
	memcpy(&pnn, data.dptr, sizeof(pnn));

	DEBUG(DEBUG_NOTICE, ("Setting up rebalance of IPs to node %u\n", pnn));

	/* Copy any existing list of nodes into a new array rather than
	 * reallocating in place.
	 * NOTE(review): the original comment motivates this with a
	 * timeout timer that is cancelled when the old array is freed;
	 * no such timer is set up in this function - confirm whether it
	 * still exists.
	 */
	len = (rec->force_rebalance_nodes != NULL) ?
		talloc_array_length(rec->force_rebalance_nodes) :
		0;

	/* This allows duplicates to be added but they don't cause
	 * harm.  A call to add a duplicate PNN arguably means that
	 * the timeout should be reset, so this is the simplest
	 * solution.
	 */
	t = talloc_zero_array(rec, uint32_t, len + 1);
	CTDB_NO_MEMORY_VOID(ctdb, t);
	if (len > 0) {
		memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
	}
	t[len] = pnn;

	talloc_free(rec->force_rebalance_nodes);

	rec->force_rebalance_nodes = t;
}
2015-02-06 13:05:12 +11:00
/*
 * Disable an operation for the timeout given in a disable message and
 * reply to the sender.
 *
 * data must be exactly one struct ctdb_disable_message.  The operation
 * described by op_state is disabled for the message's timeout.  The
 * reply is a single int32_t: our PNN on success, otherwise the
 * non-zero result of ctdb_op_disable().
 *
 * Malformed input is logged and dropped without a reply.
 */
static void srvid_disable_and_reply(struct ctdb_context *ctdb,
				    TDB_DATA data,
				    struct ctdb_op_state *op_state)
{
	struct ctdb_disable_message *r;
	uint32_t timeout;
	TDB_DATA result;
	int32_t ret = 0;

	/* Validate input data */
	if (data.dsize != sizeof(struct ctdb_disable_message)) {
		/* Report the size that is actually being checked for */
		DEBUG(DEBUG_ERR, (__location__ " Wrong size for data :%lu "
				  "expecting %lu\n", (long unsigned)data.dsize,
				  (long unsigned)sizeof(struct ctdb_disable_message)));
		return;
	}
	if (data.dptr == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " No data received\n"));
		return;
	}

	r = (struct ctdb_disable_message *)data.dptr;
	timeout = r->timeout;

	ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
	if (ret != 0) {
		goto done;
	}

	/* Returning our PNN tells the caller that we succeeded */
	ret = ctdb_get_pnn(ctdb);

done:
	result.dsize = sizeof(int32_t);
	result.dptr = (uint8_t *)&ret;
	/* NOTE(review): the cast assumes the disable message starts with
	 * the same pnn/srvid fields as struct ctdb_srvid_message -
	 * confirm against the structure definitions.
	 */
	srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
}
2015-04-08 14:38:26 +10:00
/*
 * Handler for requests to disable takeover runs: applies the request
 * to rec->takeover_run and replies to the sender.
 */
static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
					  void *private_data)
{
	struct ctdb_recoverd *rec;

	rec = talloc_get_type(private_data, struct ctdb_recoverd);
	srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
}
2015-02-06 15:03:03 +11:00
/* Backward compatibility for this SRVID */
2015-04-08 14:38:26 +10:00
static void disable_ip_check_handler ( uint64_t srvid , TDB_DATA data ,
void * private_data )
2013-08-28 11:32:54 +10:00
{
2015-04-08 14:38:26 +10:00
struct ctdb_recoverd * rec = talloc_get_type (
private_data , struct ctdb_recoverd ) ;
2015-02-06 15:03:03 +11:00
uint32_t timeout ;
2013-08-28 11:32:54 +10:00
if ( data . dsize ! = sizeof ( uint32_t ) ) {
DEBUG ( DEBUG_ERR , ( __location__ " Wrong size for data :%lu "
" expecting %lu \n " , ( long unsigned ) data . dsize ,
( long unsigned ) sizeof ( uint32_t ) ) ) ;
return ;
}
if ( data . dptr = = NULL ) {
DEBUG ( DEBUG_ERR , ( __location__ " No data received \n " ) ) ;
return ;
}
2015-02-06 15:03:03 +11:00
timeout = * ( ( uint32_t * ) data . dptr ) ;
2013-08-28 11:32:54 +10:00
2015-04-08 14:38:26 +10:00
ctdb_op_disable ( rec - > takeover_run , rec - > ctdb - > ev , timeout ) ;
2013-08-28 11:32:54 +10:00
}
2009-10-06 12:11:32 +11:00
2015-04-08 14:38:26 +10:00
/*
 * Handler for requests to disable recoveries: applies the request to
 * rec->recovery and replies to the sender.
 */
static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
				       void *private_data)
{
	struct ctdb_recoverd *rec;

	rec = talloc_get_type(private_data, struct ctdb_recoverd);
	srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
}
2009-07-02 13:00:26 +10:00
/*
2013-08-16 20:10:10 +10:00
handler for ip reallocate , just add it to the list of requests and
2009-07-02 13:00:26 +10:00
handle this later in the monitor_cluster loop so we do not recurse
2013-08-16 20:10:10 +10:00
with other requests to takeover_run ( )
2009-07-02 13:00:26 +10:00
*/
2015-04-08 14:38:26 +10:00
static void ip_reallocate_handler ( uint64_t srvid , TDB_DATA data ,
void * private_data )
2009-07-02 13:00:26 +10:00
{
2015-10-29 14:32:49 +11:00
struct ctdb_srvid_message * request ;
2015-04-08 14:38:26 +10:00
struct ctdb_recoverd * rec = talloc_get_type (
private_data , struct ctdb_recoverd ) ;
2009-07-02 13:00:26 +10:00
2015-10-29 14:32:49 +11:00
if ( data . dsize ! = sizeof ( struct ctdb_srvid_message ) ) {
2009-07-02 13:00:26 +10:00
DEBUG ( DEBUG_ERR , ( __location__ " Wrong size of return address. \n " ) ) ;
return ;
}
2015-10-29 14:32:49 +11:00
request = ( struct ctdb_srvid_message * ) data . dptr ;
2009-07-02 13:00:26 +10:00
2015-04-08 14:38:26 +10:00
srvid_request_add ( rec - > ctdb , & rec - > reallocate_requests , request ) ;
2009-07-02 13:00:26 +10:00
}
2013-08-16 20:02:34 +10:00
static void process_ipreallocate_requests ( struct ctdb_context * ctdb ,
struct ctdb_recoverd * rec )
2009-07-02 13:00:26 +10:00
{
TDB_DATA result ;
int32_t ret ;
2013-11-22 13:57:03 +11:00
struct srvid_requests * current ;
2009-07-02 13:00:26 +10:00
2013-11-22 13:57:03 +11:00
/* Only process requests that are currently pending. More
* might come in while the takeover run is in progress and
* they will need to be processed later since they might
* be in response flag changes .
*/
current = rec - > reallocate_requests ;
rec - > reallocate_requests = NULL ;
2016-05-03 15:35:08 +10:00
if ( do_takeover_run ( rec , rec - > nodemap ) ) {
2015-10-28 20:04:41 +11:00
ret = ctdb_get_pnn ( ctdb ) ;
} else {
ret = - 1 ;
2010-01-19 08:42:48 +01:00
}
2009-07-02 13:00:26 +10:00
result . dsize = sizeof ( int32_t ) ;
result . dptr = ( uint8_t * ) & ret ;
2013-11-22 13:57:03 +11:00
srvid_requests_reply ( ctdb , & current , result ) ;
2009-07-02 13:00:26 +10:00
}
2009-06-01 14:18:34 +10:00
2016-03-17 17:26:30 +11:00
/*
 * SRVID handler for assigning banning credits.
 *
 * Only acted upon when this node is the recovery master.  The payload
 * must be exactly one uint32_t holding the PNN of the node to charge;
 * that node's culprit count is set to rec->nodemap->num.
 */
static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec;
	uint32_t culprit;

	rec = talloc_get_type(private_data, struct ctdb_recoverd);

	/* Nothing to do unless we are the recmaster */
	if (rec->recmaster != rec->ctdb->pnn) {
		return;
	}

	if (data.dsize != sizeof(uint32_t)) {
		DEBUG(DEBUG_ERR, (__location__ " invalid data size %zu\n",
				  data.dsize));
		return;
	}

	culprit = *(uint32_t *)data.dptr;

	ctdb_set_culprit_count(rec, culprit, rec->nodemap->num);
}
2009-06-01 14:18:34 +10:00
2007-05-07 06:51:58 +10:00
/*
handler for recovery master elections
*/
2015-04-08 14:38:26 +10:00
static void election_handler ( uint64_t srvid , TDB_DATA data , void * private_data )
2007-05-07 06:51:58 +10:00
{
2015-04-08 14:38:26 +10:00
struct ctdb_recoverd * rec = talloc_get_type (
private_data , struct ctdb_recoverd ) ;
struct ctdb_context * ctdb = rec - > ctdb ;
2007-05-07 06:51:58 +10:00
int ret ;
struct election_message * em = ( struct election_message * ) data . dptr ;
2013-11-01 14:34:20 +11:00
/* Ignore election packets from ourself */
if ( ctdb - > pnn = = em - > pnn ) {
return ;
}
2007-11-13 10:27:44 +11:00
/* we got an election packet - update the timeout for the election */
talloc_free ( rec - > election_timeout ) ;
2015-10-26 16:50:09 +11:00
rec - > election_timeout = tevent_add_timer (
ctdb - > ev , ctdb ,
fast_start ?
timeval_current_ofs ( 0 , 500000 ) :
timeval_current_ofs ( ctdb - > tunable . election_timeout , 0 ) ,
ctdb_election_timeout , rec ) ;
2007-11-13 10:27:44 +11:00
2007-05-07 06:51:58 +10:00
/* someone called an election. check their election data
and if we disagree and we would rather be the elected node ,
send a new election message to all other nodes
*/
2007-06-07 19:17:27 +10:00
if ( ctdb_election_win ( rec , em ) ) {
2007-11-13 10:27:44 +11:00
if ( ! rec - > send_election_te ) {
2015-10-26 16:50:09 +11:00
rec - > send_election_te = tevent_add_timer (
ctdb - > ev , rec ,
timeval_current_ofs ( 0 , 500000 ) ,
election_send_request , rec ) ;
2007-05-07 06:51:58 +10:00
}
return ;
}
2014-12-09 13:50:22 +11:00
2007-11-13 10:27:44 +11:00
/* we didn't win */
2015-03-31 13:59:02 +11:00
TALLOC_FREE ( rec - > send_election_te ) ;
2007-05-07 06:51:58 +10:00
2015-03-31 13:59:49 +11:00
/* Release the recovery lock file */
2016-05-24 14:54:39 +10:00
if ( ctdb_recovery_have_lock ( rec ) ) {
ctdb_recovery_unlock ( rec ) ;
2007-05-23 14:35:19 +10:00
}
2007-05-07 06:51:58 +10:00
/* ok, let that guy become recmaster then */
2015-10-23 15:27:12 +11:00
ret = ctdb_ctrl_setrecmaster ( ctdb , CONTROL_TIMEOUT ( ) ,
CTDB_CURRENT_NODE , em - > pnn ) ;
2007-05-07 06:51:58 +10:00
if ( ret ! = 0 ) {
2015-10-23 15:27:12 +11:00
DEBUG ( DEBUG_ERR , ( __location__ " failed to set recmaster " ) ) ;
2007-05-07 06:51:58 +10:00
return ;
}
2015-10-23 14:32:41 +11:00
rec - > recmaster = em - > pnn ;
2007-05-07 06:51:58 +10:00
return ;
}
2007-06-07 15:18:55 +10:00
/*
force the start of the election process
*/
2008-03-03 09:19:30 +11:00
static void force_election ( struct ctdb_recoverd * rec , uint32_t pnn ,
2015-10-29 17:22:48 +11:00
struct ctdb_node_map_old * nodemap )
2007-05-07 06:51:58 +10:00
{
int ret ;
2007-06-07 18:37:27 +10:00
struct ctdb_context * ctdb = rec - > ctdb ;
2007-05-10 09:48:14 +10:00
2009-07-17 11:37:03 +10:00
DEBUG ( DEBUG_INFO , ( __location__ " Force an election \n " ) ) ;
2007-05-10 09:48:14 +10:00
/* set all nodes to recovery mode to stop all internode traffic */
2016-09-13 15:45:54 +10:00
ret = set_recovery_mode ( ctdb , rec , nodemap , CTDB_RECOVERY_ACTIVE ) ;
2008-07-07 08:50:12 +10:00
if ( ret ! = 0 ) {
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to set recovery mode to active on cluster \n " ) ) ;
2007-05-10 09:48:14 +10:00
return ;
}
2007-11-13 10:27:44 +11:00
talloc_free ( rec - > election_timeout ) ;
2015-10-26 16:50:09 +11:00
rec - > election_timeout = tevent_add_timer (
ctdb - > ev , ctdb ,
fast_start ?
timeval_current_ofs ( 0 , 500000 ) :
timeval_current_ofs ( ctdb - > tunable . election_timeout , 0 ) ,
ctdb_election_timeout , rec ) ;
2007-11-13 10:27:44 +11:00
2013-10-29 16:38:42 +11:00
ret = send_election_request ( rec , pnn ) ;
2007-05-07 06:51:58 +10:00
if ( ret ! = 0 ) {
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_ERR , ( __location__ " failed to initiate recmaster election " ) ) ;
2007-05-07 06:51:58 +10:00
return ;
}
2007-05-26 14:01:08 +10:00
/* wait for a few seconds to collect all responses */
2007-11-13 10:27:44 +11:00
ctdb_wait_election ( rec ) ;
2007-06-07 15:18:55 +10:00
}
/*
handler for when a node changes its flags
*/
2015-04-08 14:38:26 +10:00
static void monitor_handler ( uint64_t srvid , TDB_DATA data , void * private_data )
2007-06-07 15:18:55 +10:00
{
2015-04-08 14:38:26 +10:00
struct ctdb_recoverd * rec = talloc_get_type (
private_data , struct ctdb_recoverd ) ;
struct ctdb_context * ctdb = rec - > ctdb ;
2007-06-07 15:18:55 +10:00
int ret ;
struct ctdb_node_flag_change * c = ( struct ctdb_node_flag_change * ) data . dptr ;
2015-10-29 17:22:48 +11:00
struct ctdb_node_map_old * nodemap = NULL ;
2007-06-07 15:18:55 +10:00
TALLOC_CTX * tmp_ctx ;
int i ;
if ( data . dsize ! = sizeof ( * c ) ) {
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_ERR , ( __location__ " Invalid data in ctdb_node_flag_change \n " ) ) ;
2007-06-07 15:18:55 +10:00
return ;
}
tmp_ctx = talloc_new ( ctdb ) ;
CTDB_NO_MEMORY_VOID ( ctdb , tmp_ctx ) ;
ret = ctdb_ctrl_getnodemap ( ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE , tmp_ctx , & nodemap ) ;
2007-12-27 10:07:01 +11:00
if ( ret ! = 0 ) {
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_ERR , ( __location__ " ctdb_ctrl_getnodemap failed in monitor_handler \n " ) ) ;
2007-12-27 10:07:01 +11:00
talloc_free ( tmp_ctx ) ;
return ;
}
2007-06-07 15:18:55 +10:00
for ( i = 0 ; i < nodemap - > num ; i + + ) {
2007-09-04 10:33:10 +10:00
if ( nodemap - > nodes [ i ] . pnn = = c - > pnn ) break ;
2007-06-07 15:18:55 +10:00
}
if ( i = = nodemap - > num ) {
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_CRIT , ( __location__ " Flag change for non-existant node %u \n " , c - > pnn ) ) ;
2007-06-07 15:18:55 +10:00
talloc_free ( tmp_ctx ) ;
return ;
}
2013-07-11 13:01:13 +10:00
if ( c - > old_flags ! = c - > new_flags ) {
DEBUG ( DEBUG_NOTICE , ( " Node %u has changed flags - now 0x%x was 0x%x \n " , c - > pnn , c - > new_flags , c - > old_flags ) ) ;
2007-06-07 15:18:55 +10:00
}
2007-08-21 17:25:15 +10:00
nodemap - > nodes [ i ] . flags = c - > new_flags ;
2007-06-07 15:18:55 +10:00
talloc_free ( tmp_ctx ) ;
2007-05-07 06:51:58 +10:00
}
2007-05-07 04:41:12 +10:00
2008-11-19 14:43:46 +11:00
/*
handler for when we need to push out flag changes ot all other nodes
*/
2015-04-08 14:38:26 +10:00
static void push_flags_handler ( uint64_t srvid , TDB_DATA data ,
void * private_data )
2008-11-19 14:43:46 +11:00
{
2015-04-08 14:38:26 +10:00
struct ctdb_recoverd * rec = talloc_get_type (
private_data , struct ctdb_recoverd ) ;
struct ctdb_context * ctdb = rec - > ctdb ;
2008-11-19 14:43:46 +11:00
int ret ;
struct ctdb_node_flag_change * c = ( struct ctdb_node_flag_change * ) data . dptr ;
2015-10-29 17:22:48 +11:00
struct ctdb_node_map_old * nodemap = NULL ;
2009-10-09 15:47:49 +02:00
TALLOC_CTX * tmp_ctx = talloc_new ( ctdb ) ;
uint32_t * nodes ;
2008-11-19 14:43:46 +11:00
2009-10-09 15:47:49 +02:00
/* read the node flags from the recmaster */
2015-10-23 15:33:01 +11:00
ret = ctdb_ctrl_getnodemap ( ctdb , CONTROL_TIMEOUT ( ) , rec - > recmaster ,
tmp_ctx , & nodemap ) ;
2009-10-09 15:47:49 +02:00
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " Unable to get nodemap from node %u \n " , c - > pnn ) ) ;
talloc_free ( tmp_ctx ) ;
return ;
2008-11-19 14:43:46 +11:00
}
2009-10-09 15:47:49 +02:00
if ( c - > pnn > = nodemap - > num ) {
DEBUG ( DEBUG_ERR , ( __location__ " Nodemap from recmaster does not contain node %d \n " , c - > pnn ) ) ;
talloc_free ( tmp_ctx ) ;
return ;
}
/* send the flags update to all connected nodes */
nodes = list_of_connected_nodes ( ctdb , nodemap , tmp_ctx , true ) ;
if ( ctdb_client_async_control ( ctdb , CTDB_CONTROL_MODIFY_FLAGS ,
nodes , 0 , CONTROL_TIMEOUT ( ) ,
false , data ,
NULL , NULL ,
NULL ) ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " ctdb_control to modify node flags failed \n " ) ) ;
talloc_free ( tmp_ctx ) ;
return ;
}
talloc_free ( tmp_ctx ) ;
2008-11-19 14:43:46 +11:00
}
2007-06-07 15:18:55 +10:00
2007-08-23 13:48:39 +10:00
2007-08-27 09:40:10 +10:00
/* State shared between verify_recmode() and its per-node reply callback */
struct verify_recmode_normal_data {
/* number of getrecmode controls sent for which no reply has arrived yet */
uint32_t count ;
/* combined outcome of all replies; verify_recmode() starts it at MONITOR_OK */
enum monitor_result status ;
} ;
static void verify_recmode_normal_callback ( struct ctdb_client_control_state * state )
{
2007-09-26 14:25:32 +10:00
struct verify_recmode_normal_data * rmdata = talloc_get_type ( state - > async . private_data , struct verify_recmode_normal_data ) ;
2007-08-27 09:40:10 +10:00
/* one more node has responded with recmode data*/
rmdata - > count - - ;
/* if we failed to get the recmode, then return an error and let
the main loop try again .
*/
if ( state - > state ! = CTDB_CONTROL_DONE ) {
if ( rmdata - > status = = MONITOR_OK ) {
rmdata - > status = MONITOR_FAILED ;
}
return ;
}
/* if we got a response, then the recmode will be stored in the
status field
*/
if ( state - > status ! = CTDB_RECOVERY_NORMAL ) {
2013-06-30 17:57:33 +10:00
DEBUG ( DEBUG_NOTICE , ( " Node:%u was in recovery mode. Start recovery process \n " , state - > c - > hdr . destnode ) ) ;
2007-08-27 09:40:10 +10:00
rmdata - > status = MONITOR_RECOVERY_NEEDED ;
}
return ;
}
/* verify that all nodes are in normal recovery mode */
2015-10-29 17:22:48 +11:00
static enum monitor_result verify_recmode ( struct ctdb_context * ctdb , struct ctdb_node_map_old * nodemap )
2007-08-23 13:48:39 +10:00
{
2007-08-27 09:40:10 +10:00
struct verify_recmode_normal_data * rmdata ;
2007-08-23 19:27:09 +10:00
TALLOC_CTX * mem_ctx = talloc_new ( ctdb ) ;
2007-08-27 09:40:10 +10:00
struct ctdb_client_control_state * state ;
enum monitor_result status ;
int j ;
2007-08-23 13:48:39 +10:00
2007-08-27 09:40:10 +10:00
rmdata = talloc ( mem_ctx , struct verify_recmode_normal_data ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , rmdata ) ;
rmdata - > count = 0 ;
rmdata - > status = MONITOR_OK ;
2007-08-23 13:48:39 +10:00
/* loop over all active nodes and send an async getrecmode call to
them */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
continue ;
}
2007-08-27 09:40:10 +10:00
state = ctdb_ctrl_getrecmode_send ( ctdb , mem_ctx ,
2007-08-23 13:48:39 +10:00
CONTROL_TIMEOUT ( ) ,
2007-09-04 09:50:07 +10:00
nodemap - > nodes [ j ] . pnn ) ;
2007-08-27 09:40:10 +10:00
if ( state = = NULL ) {
/* we failed to send the control, treat this as
an error and try again next iteration
*/
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_ERR , ( " Failed to call ctdb_ctrl_getrecmode_send during monitoring \n " ) ) ;
2007-08-23 19:27:09 +10:00
talloc_free ( mem_ctx ) ;
2007-08-23 13:48:39 +10:00
return MONITOR_FAILED ;
}
2007-08-23 19:27:09 +10:00
2007-08-27 09:40:10 +10:00
/* set up the callback functions */
state - > async . fn = verify_recmode_normal_callback ;
2007-09-26 14:25:32 +10:00
state - > async . private_data = rmdata ;
2007-08-27 09:40:10 +10:00
/* one more control to wait for to complete */
rmdata - > count + + ;
2007-08-23 13:48:39 +10:00
}
2007-08-27 09:40:10 +10:00
/* now wait for up to the maximum number of seconds allowed
or until all nodes we expect a response from has replied
*/
while ( rmdata - > count > 0 ) {
2015-10-26 16:50:09 +11:00
tevent_loop_once ( ctdb - > ev ) ;
2007-08-27 09:40:10 +10:00
}
status = rmdata - > status ;
2007-08-23 19:27:09 +10:00
talloc_free ( mem_ctx ) ;
2007-08-27 09:40:10 +10:00
return status ;
2007-08-23 13:48:39 +10:00
}
2007-08-27 09:40:10 +10:00
2007-08-23 19:27:09 +10:00
struct verify_recmaster_data {
2008-04-22 00:56:27 +10:00
struct ctdb_recoverd * rec ;
2007-08-23 19:27:09 +10:00
uint32_t count ;
2007-09-04 10:33:10 +10:00
uint32_t pnn ;
2007-08-23 19:27:09 +10:00
enum monitor_result status ;
} ;
2007-08-24 10:42:06 +10:00
static void verify_recmaster_callback ( struct ctdb_client_control_state * state )
2007-08-23 19:27:09 +10:00
{
2007-09-26 14:25:32 +10:00
struct verify_recmaster_data * rmdata = talloc_get_type ( state - > async . private_data , struct verify_recmaster_data ) ;
2007-08-23 19:27:09 +10:00
/* one more node has responded with recmaster data*/
rmdata - > count - - ;
/* if we failed to get the recmaster, then return an error and let
the main loop try again .
*/
2007-08-24 10:42:06 +10:00
if ( state - > state ! = CTDB_CONTROL_DONE ) {
2007-08-23 19:27:09 +10:00
if ( rmdata - > status = = MONITOR_OK ) {
rmdata - > status = MONITOR_FAILED ;
}
2007-08-24 10:42:06 +10:00
return ;
2007-08-23 19:27:09 +10:00
}
/* if we got a response, then the recmaster will be stored in the
status field
*/
2007-09-04 10:33:10 +10:00
if ( state - > status ! = rmdata - > pnn ) {
2013-08-14 11:44:12 +10:00
DEBUG ( DEBUG_ERR , ( " Node %d thinks node %d is recmaster. Need a new recmaster election \n " , state - > c - > hdr . destnode , state - > status ) ) ;
2008-04-22 00:56:27 +10:00
ctdb_set_culprit ( rmdata - > rec , state - > c - > hdr . destnode ) ;
2007-08-23 19:27:09 +10:00
rmdata - > status = MONITOR_ELECTION_NEEDED ;
}
2007-08-24 10:42:06 +10:00
return ;
2007-08-23 19:27:09 +10:00
}
/* verify that all nodes agree that we are the recmaster */
2015-10-29 17:22:48 +11:00
static enum monitor_result verify_recmaster ( struct ctdb_recoverd * rec , struct ctdb_node_map_old * nodemap , uint32_t pnn )
2007-08-23 19:27:09 +10:00
{
2008-04-22 00:56:27 +10:00
struct ctdb_context * ctdb = rec - > ctdb ;
2007-08-23 19:27:09 +10:00
struct verify_recmaster_data * rmdata ;
TALLOC_CTX * mem_ctx = talloc_new ( ctdb ) ;
struct ctdb_client_control_state * state ;
enum monitor_result status ;
int j ;
rmdata = talloc ( mem_ctx , struct verify_recmaster_data ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , rmdata ) ;
2008-04-22 00:56:27 +10:00
rmdata - > rec = rec ;
2007-08-23 19:27:09 +10:00
rmdata - > count = 0 ;
2007-09-04 10:33:10 +10:00
rmdata - > pnn = pnn ;
2007-08-23 19:27:09 +10:00
rmdata - > status = MONITOR_OK ;
2015-10-23 15:05:08 +11:00
/* loop over all active nodes and send an async getrecmaster call to
2007-08-23 19:27:09 +10:00
them */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
2015-10-23 15:05:08 +11:00
if ( nodemap - > nodes [ j ] . pnn = = rec - > recmaster ) {
continue ;
}
2007-08-23 19:27:09 +10:00
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
continue ;
}
state = ctdb_ctrl_getrecmaster_send ( ctdb , mem_ctx ,
2007-08-23 19:38:54 +10:00
CONTROL_TIMEOUT ( ) ,
2007-09-04 09:50:07 +10:00
nodemap - > nodes [ j ] . pnn ) ;
2007-08-23 19:27:09 +10:00
if ( state = = NULL ) {
/* we failed to send the control, treat this as
an error and try again next iteration
*/
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_ERR , ( " Failed to call ctdb_ctrl_getrecmaster_send during monitoring \n " ) ) ;
2007-08-23 19:27:09 +10:00
talloc_free ( mem_ctx ) ;
return MONITOR_FAILED ;
}
2007-08-24 10:42:06 +10:00
/* set up the callback functions */
state - > async . fn = verify_recmaster_callback ;
2007-09-26 14:25:32 +10:00
state - > async . private_data = rmdata ;
2007-08-24 10:42:06 +10:00
2007-08-23 19:27:09 +10:00
/* one more control to wait for to complete */
rmdata - > count + + ;
}
/* now wait for up to the maximum number of seconds allowed
or until all nodes we expect a response from has replied
*/
2007-08-23 19:38:54 +10:00
while ( rmdata - > count > 0 ) {
2015-10-26 16:50:09 +11:00
tevent_loop_once ( ctdb - > ev ) ;
2007-08-23 19:27:09 +10:00
}
status = rmdata - > status ;
talloc_free ( mem_ctx ) ;
return status ;
}
2013-02-21 10:43:35 +11:00
static bool interfaces_have_changed ( struct ctdb_context * ctdb ,
struct ctdb_recoverd * rec )
{
2015-10-28 19:43:48 +11:00
struct ctdb_iface_list_old * ifaces = NULL ;
2013-02-21 10:43:35 +11:00
TALLOC_CTX * mem_ctx ;
bool ret = false ;
mem_ctx = talloc_new ( NULL ) ;
/* Read the interfaces from the local node */
if ( ctdb_ctrl_get_ifaces ( ctdb , CONTROL_TIMEOUT ( ) ,
CTDB_CURRENT_NODE , mem_ctx , & ifaces ) ! = 0 ) {
DEBUG ( DEBUG_ERR , ( " Unable to get interfaces from local node %u \n " , ctdb - > pnn ) ) ;
/* We could return an error. However, this will be
* rare so we ' ll decide that the interfaces have
* actually changed , just in case .
*/
talloc_free ( mem_ctx ) ;
return true ;
}
if ( ! rec - > ifaces ) {
/* We haven't been here before so things have changed */
2013-08-15 17:04:01 +10:00
DEBUG ( DEBUG_NOTICE , ( " Initial interface fetched \n " ) ) ;
2013-02-21 10:43:35 +11:00
ret = true ;
} else if ( rec - > ifaces - > num ! = ifaces - > num ) {
/* Number of interfaces has changed */
2013-08-15 17:04:01 +10:00
DEBUG ( DEBUG_NOTICE , ( " Interface count changed from %d to %d \n " ,
rec - > ifaces - > num , ifaces - > num ) ) ;
2013-02-21 10:43:35 +11:00
ret = true ;
} else {
/* See if interface names or link states have changed */
int i ;
for ( i = 0 ; i < rec - > ifaces - > num ; i + + ) {
2015-10-28 19:37:17 +11:00
struct ctdb_iface * iface = & rec - > ifaces - > ifaces [ i ] ;
2013-08-15 17:04:01 +10:00
if ( strcmp ( iface - > name , ifaces - > ifaces [ i ] . name ) ! = 0 ) {
DEBUG ( DEBUG_NOTICE ,
( " Interface in slot %d changed: %s => %s \n " ,
i , iface - > name , ifaces - > ifaces [ i ] . name ) ) ;
ret = true ;
break ;
}
if ( iface - > link_state ! = ifaces - > ifaces [ i ] . link_state ) {
DEBUG ( DEBUG_NOTICE ,
( " Interface %s changed state: %d => %d \n " ,
iface - > name , iface - > link_state ,
ifaces - > ifaces [ i ] . link_state ) ) ;
2013-02-21 10:43:35 +11:00
ret = true ;
break ;
}
}
}
talloc_free ( rec - > ifaces ) ;
rec - > ifaces = talloc_steal ( rec , ifaces ) ;
talloc_free ( mem_ctx ) ;
return ret ;
}
2007-06-07 15:18:55 +10:00
2016-05-03 16:36:37 +10:00
/* Check that the local allocation of public IP addresses is correct
* and do some house - keeping */
static int verify_local_ip_allocation ( struct ctdb_context * ctdb ,
struct ctdb_recoverd * rec ,
uint32_t pnn ,
struct ctdb_node_map_old * nodemap )
2008-07-02 13:55:59 +10:00
{
TALLOC_CTX * mem_ctx = talloc_new ( NULL ) ;
int ret , j ;
2009-12-22 15:21:08 +01:00
bool need_takeover_run = false ;
2015-11-09 15:41:45 +11:00
struct ctdb_public_ip_list_old * ips = NULL ;
2016-05-03 16:36:37 +10:00
/* If we are not the recmaster then do some housekeeping */
if ( rec - > recmaster ! = pnn ) {
/* Ignore any IP reallocate requests - only recmaster
* processes them
*/
TALLOC_FREE ( rec - > reallocate_requests ) ;
/* Clear any nodes that should be force rebalanced in
* the next takeover run . If the recovery master role
* has moved then we don ' t want to process these some
* time in the future .
*/
TALLOC_FREE ( rec - > force_rebalance_nodes ) ;
}
2015-11-09 15:41:45 +11:00
/* Return early if disabled... */
if ( ctdb - > tunable . disable_ip_failover ! = 0 | |
ctdb_op_is_disabled ( rec - > takeover_run ) ) {
return 0 ;
}
2008-07-02 13:55:59 +10:00
2013-02-21 10:43:35 +11:00
if ( interfaces_have_changed ( ctdb , rec ) ) {
2009-12-22 15:21:08 +01:00
need_takeover_run = true ;
}
2015-11-09 16:12:31 +11:00
/* If there are unhosted IPs but this node can host them then
* trigger an IP reallocation */
2012-10-11 15:17:54 +11:00
2015-11-09 16:12:31 +11:00
/* Read *available* IPs from local node */
ret = ctdb_ctrl_get_public_ips_flags (
ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE , mem_ctx ,
CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE , & ips ) ;
2015-11-09 15:41:45 +11:00
if ( ret ! = 0 ) {
2015-11-09 16:12:31 +11:00
DEBUG ( DEBUG_ERR , ( " Unable to retrieve available public IPs \n " ) ) ;
2015-11-09 15:41:45 +11:00
talloc_free ( mem_ctx ) ;
return - 1 ;
}
2012-10-11 15:17:54 +11:00
2015-11-09 15:41:45 +11:00
for ( j = 0 ; j < ips - > num ; j + + ) {
if ( ips - > ips [ j ] . pnn = = - 1 & &
nodemap - > nodes [ pnn ] . flags = = 0 ) {
2015-11-09 16:12:31 +11:00
DEBUG ( DEBUG_WARNING ,
( " Unassigned IP %s can be served by this node \n " ,
ctdb_addr_to_str ( & ips - > ips [ j ] . addr ) ) ) ;
2015-11-09 15:41:45 +11:00
need_takeover_run = true ;
2012-10-11 15:17:54 +11:00
}
2015-11-09 15:41:45 +11:00
}
2012-10-11 15:17:54 +11:00
2015-11-09 15:41:45 +11:00
talloc_free ( ips ) ;
2012-10-11 15:17:54 +11:00
2015-11-09 15:44:15 +11:00
if ( ! ctdb - > do_checkpublicip ) {
goto done ;
}
2015-11-09 16:12:31 +11:00
/* Validate the IP addresses that this node has on network
* interfaces . If there is an inconsistency between reality
* and the state expected by CTDB then try to fix it by
* triggering an IP reallocation or releasing extraneous IP
* addresses . */
/* Read *known* IPs from local node */
ret = ctdb_ctrl_get_public_ips_flags (
ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE , mem_ctx , 0 , & ips ) ;
2015-11-09 15:41:45 +11:00
if ( ret ! = 0 ) {
2015-11-09 16:12:31 +11:00
DEBUG ( DEBUG_ERR , ( " Unable to retrieve known public IPs \n " ) ) ;
2015-11-09 15:41:45 +11:00
talloc_free ( mem_ctx ) ;
return - 1 ;
}
2012-10-11 15:17:54 +11:00
2015-11-09 15:41:45 +11:00
for ( j = 0 ; j < ips - > num ; j + + ) {
if ( ips - > ips [ j ] . pnn = = pnn ) {
2015-11-09 15:44:15 +11:00
if ( ! ctdb_sys_have_ip ( & ips - > ips [ j ] . addr ) ) {
2015-11-09 16:12:31 +11:00
DEBUG ( DEBUG_ERR ,
( " Assigned IP %s not on an interface \n " ,
ctdb_addr_to_str ( & ips - > ips [ j ] . addr ) ) ) ;
2015-11-09 15:41:45 +11:00
need_takeover_run = true ;
}
} else {
2015-11-09 15:44:15 +11:00
if ( ctdb_sys_have_ip ( & ips - > ips [ j ] . addr ) ) {
2015-11-09 16:12:31 +11:00
DEBUG ( DEBUG_ERR ,
2016-08-02 12:18:15 +10:00
( " IP %s incorrectly on an interface \n " ,
2015-11-09 16:12:31 +11:00
ctdb_addr_to_str ( & ips - > ips [ j ] . addr ) ) ) ;
2016-08-02 12:18:15 +10:00
need_takeover_run = true ;
2008-07-02 13:55:59 +10:00
}
}
}
2015-11-09 15:44:15 +11:00
done :
2009-12-22 15:21:08 +01:00
if ( need_takeover_run ) {
2015-10-29 14:32:49 +11:00
struct ctdb_srvid_message rd ;
2009-12-22 15:21:08 +01:00
TDB_DATA data ;
2015-11-09 16:12:31 +11:00
DEBUG ( DEBUG_NOTICE , ( " Trigger takeoverrun \n " ) ) ;
2009-12-22 15:21:08 +01:00
2016-01-11 17:23:12 +11:00
ZERO_STRUCT ( rd ) ;
2009-12-22 15:21:08 +01:00
rd . pnn = ctdb - > pnn ;
rd . srvid = 0 ;
data . dptr = ( uint8_t * ) & rd ;
data . dsize = sizeof ( rd ) ;
2010-06-02 09:45:21 +10:00
ret = ctdb_client_send_message ( ctdb , rec - > recmaster , CTDB_SRVID_TAKEOVER_RUN , data ) ;
2009-12-22 15:21:08 +01:00
if ( ret ! = 0 ) {
2015-11-09 16:12:31 +11:00
DEBUG ( DEBUG_ERR ,
( " Failed to send takeover run request \n " ) ) ;
2009-12-22 15:21:08 +01:00
}
}
2008-07-02 13:55:59 +10:00
talloc_free ( mem_ctx ) ;
return 0 ;
}
2008-12-05 16:32:30 +11:00
/*
 * Per-node callback for the asynchronous GET_NODEMAP control.
 *
 * callback_data is the array of nodemap pointers.  The reply buffer
 * of node_pnn is moved under that array with talloc_steal() and
 * stored in the slot for node_pnn.  Replies from a PNN outside
 * ctdb->num_nodes are logged and dropped.  res is not examined.
 */
static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_node_map_old **maps = callback_data;

	if (node_pnn < ctdb->num_nodes) {
		maps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(
			maps, outdata.dptr);
		return;
	}

	DEBUG(DEBUG_ERR, (__location__ " pnn from invalid node\n"));
}
/*
 * Fetch the nodemap from every active node listed in nodemap.
 *
 * The node list is allocated on mem_ctx.  Each reply is stored in
 * remote_nodemaps by async_getnodemap_callback().
 *
 * Returns 0 on success, -1 if the asynchronous control reports
 * failure.
 */
static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
			       struct ctdb_node_map_old *nodemap,
			       struct ctdb_node_map_old **remote_nodemaps)
{
	uint32_t *active;
	int ret;

	active = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);

	ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
					active, 0,
					CONTROL_TIMEOUT(), false, tdb_null,
					async_getnodemap_callback,
					NULL,
					remote_nodemaps);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
		return -1;
	}

	return 0;
}
2016-04-28 16:58:35 +10:00
/*
 * Check whether the node currently recorded as recovery master is
 * still an acceptable choice.
 *
 * Returns true when the recmaster is fine.  Returns false when an
 * election has been forced, or when the recmaster's nodemap could not
 * be fetched (no election is forced in that case).
 *
 * The recmaster's nodemap is allocated on mem_ctx.
 *
 * rec->recmaster is deliberately re-read at each use rather than
 * cached in a local.
 */
static bool validate_recovery_master(struct ctdb_recoverd *rec,
				     TALLOC_CTX *mem_ctx)
{
	struct ctdb_context *ctdb = rec->ctdb;
	struct ctdb_node_map_old *nodemap = rec->nodemap;
	struct ctdb_node_map_old *master_map = NULL;
	uint32_t pnn = ctdb_get_pnn(ctdb);
	uint32_t master_flags;
	int ret;

	/* When recovery daemon is started, recmaster is set to
	 * "unknown" so it knows to start an election.
	 */
	if (rec->recmaster == CTDB_UNKNOWN_PNN) {
		DEBUG(DEBUG_NOTICE,
		      ("Initial recovery master set - forcing election\n"));
		force_election(rec, pnn, nodemap);
		return false;
	}

	/*
	 * If the current recmaster does not have CTDB_CAP_RECMASTER,
	 * but we have, then force an election and try to become the
	 * new recmaster.  We only do so while we are active.
	 */
	if (!ctdb_node_has_capabilities(rec->caps,
					rec->recmaster,
					CTDB_CAP_RECMASTER)) {
		if ((ctdb->capabilities & CTDB_CAP_RECMASTER) &&
		    !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
			DEBUG(DEBUG_ERR,
			      ("Current recmaster node %u does not have CAP_RECMASTER,"
			       " but we (node %u) have - force an election\n",
			       rec->recmaster, pnn));
			force_election(rec, pnn, nodemap);
			return false;
		}
	}

	/* Verify that the master node has not been deleted.  This
	 * should not happen because a node should always be shutdown
	 * before being deleted, causing a new master to be elected
	 * before now.  However, if something strange has happened
	 * then checking here will ensure we don't index beyond the
	 * end of the nodemap array. */
	if (rec->recmaster >= nodemap->num) {
		DEBUG(DEBUG_ERR,
		      ("Recmaster node %u has been deleted. Force election\n",
		       rec->recmaster));
		force_election(rec, pnn, nodemap);
		return false;
	}

	/* if recovery master is disconnected/deleted we must elect a
	 * new recmaster */
	if ((nodemap->nodes[rec->recmaster].flags &
	     (NODE_FLAGS_DISCONNECTED | NODE_FLAGS_DELETED)) != 0) {
		DEBUG(DEBUG_NOTICE,
		      ("Recmaster node %u is disconnected/deleted. Force election\n",
		       rec->recmaster));
		force_election(rec, pnn, nodemap);
		return false;
	}

	/* get nodemap from the recovery master to check if it is
	 * inactive */
	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
				   mem_ctx, &master_map);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,
		      (__location__
		       " Unable to get nodemap from recovery master %u\n",
		       rec->recmaster));
		/* No election, just error */
		return false;
	}

	/* The recmaster's own view of its flags */
	master_flags = master_map->nodes[rec->recmaster].flags;

	if ((master_flags & NODE_FLAGS_INACTIVE) &&
	    (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
		DEBUG(DEBUG_NOTICE,
		      ("Recmaster node %u is inactive. Force election\n",
		       rec->recmaster));
		/*
		 * update our nodemap to carry the recmaster's notion of
		 * its own flags, so that we don't keep freezing the
		 * inactive recmaster node...
		 */
		nodemap->nodes[rec->recmaster].flags = master_flags;
		force_election(rec, pnn, nodemap);
		return false;
	}

	return true;
}
2010-06-22 22:50:23 +09:30
/*
 * Run a single pass of the recovery daemon's monitoring logic.
 *
 * ctdb    - client context used for all controls sent to the daemons
 * rec     - recovery daemon state (recmaster, node flags, pending ops)
 * mem_ctx - talloc context for per-pass allocations (vnnmaps, remote
 *           nodemaps); the caller frees it after each pass
 *
 * The pass returns early whenever a control fails, an election or a
 * recovery is triggered, or this node has nothing further to do
 * (election in progress, node stopped/banned, not the recmaster).
 */
static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
		      TALLOC_CTX *mem_ctx)
{
	uint32_t pnn;
	struct ctdb_node_map_old *nodemap = NULL;
	struct ctdb_node_map_old **peer_maps = NULL;
	struct ctdb_vnn_map *vnnmap = NULL;
	struct ctdb_vnn_map *peer_vnnmap = NULL;
	uint32_t num_lmasters;
	int32_t level;
	int idx, node, rc;
	bool banned_self;

	/* Signal 0 only probes: bail out if the main daemon is gone */
	if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
		DEBUG(DEBUG_CRIT, (" CTDB daemon is no longer available. Shutting down recovery daemon \n "));
		exit(-1);
	}

	/* Let the local daemon know the recovery daemon is alive */
	ctdb_ctrl_recd_ping(ctdb);

	/* Nothing to do while an election is in progress */
	if (rec->election_timeout) {
		return;
	}

	/* Mirror the parent's debug level locally */
	rc = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &level);
	if (rc != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent \n "));
		return;
	}
	DEBUGLEVEL = level;

	/* Refresh the tunables */
	rc = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(),
					CTDB_CURRENT_NODE, &ctdb->tunable);
	if (rc != 0) {
		DEBUG(DEBUG_ERR, (" Failed to get tunables - retrying \n "));
		return;
	}

	/* Refresh the runstate */
	rc = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
				    CTDB_CURRENT_NODE, &ctdb->runstate);
	if (rc != 0) {
		DEBUG(DEBUG_ERR, (" Failed to get runstate - retrying \n "));
		return;
	}

	pnn = ctdb_get_pnn(ctdb);

	/* Replace the cached nodemap with a fresh copy from this node */
	TALLOC_FREE(rec->nodemap);
	rc = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec,
				  &rec->nodemap);
	if (rc != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u \n ", pnn));
		return;
	}
	nodemap = rec->nodemap;

	/* Cache this node's own flags */
	rec->node_flags = nodemap->nodes[pnn].flags;

	ban_misbehaving_nodes(rec, &banned_self);
	if (banned_self) {
		DEBUG(DEBUG_NOTICE, (" This node was banned, restart main_loop \n "));
		return;
	}

	rc = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(),
				  CTDB_CURRENT_NODE, &ctdb->recovery_mode);
	if (rc != 0) {
		D_ERR(" Failed to read recmode from local node \n ");
		return;
	}

	/*
	 * A STOPPED or BANNED node must have recovery mode active and
	 * its databases frozen.
	 */
	if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
		/*
		 * Resetting the priority time makes this node less
		 * likely to take over the recovery master role when it
		 * becomes active again, which keeps that role on the
		 * most stable node.
		 */
		rec->priority_time = timeval_current();

		if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
			DEBUG(DEBUG_ERR, (" Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases \n "));
			rc = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(),
						  CTDB_CURRENT_NODE,
						  CTDB_RECOVERY_ACTIVE);
			if (rc != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Failed to activate recovery mode in STOPPED or BANNED state \n "));
				return;
			}
		}

		/* Freeze only once per inactive period */
		if (!rec->frozen_on_inactive) {
			rc = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
					      CTDB_CURRENT_NODE);
			if (rc != 0) {
				DEBUG(DEBUG_ERR,
				      (__location__ " Failed to freeze node "
				       " in STOPPED or BANNED state \n "));
				return;
			}
			rec->frozen_on_inactive = true;
		}

		/*
		 * A stopped or banned node is not the recovery master,
		 * so stop here: no elections, no further controls.
		 */
		return;
	}

	rec->frozen_on_inactive = false;

	/* Collect capabilities from all connected nodes */
	rc = update_capabilities(rec, nodemap);
	if (rc != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities. \n "));
		return;
	}

	if (!validate_recovery_master(rec, mem_ctx)) {
		return;
	}

	if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
		/* Trigger an IP takeover run if one is needed */
		verify_local_ip_allocation(ctdb, rec, pnn, nodemap);
	}

	/* Only the recmaster checks whether a recovery is needed */
	if (pnn != rec->recmaster) {
		return;
	}

	/* Make sure the local copies of the flags are right */
	rc = update_local_flags(rec, nodemap);
	if (rc != 0) {
		DEBUG(DEBUG_ERR, (" Unable to update local flags \n "));
		return;
	}

	if (ctdb->num_nodes != nodemap->num) {
		DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file \n ", ctdb->num_nodes, nodemap->num));
		ctdb_load_nodes_file(ctdb);
		return;
	}

	/* All active nodes must agree that this node is the recmaster */
	switch (verify_recmaster(rec, nodemap, pnn)) {
	case MONITOR_ELECTION_NEEDED:
		force_election(rec, pnn, nodemap);
		return;
	case MONITOR_RECOVERY_NEEDED:
		/* can not happen */
	case MONITOR_FAILED:
		return;
	case MONITOR_OK:
		break;
	}

	/* Fetch the local vnnmap */
	rc = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx,
				 &vnnmap);
	if (rc != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u \n ", pnn));
		return;
	}

	/* Resume a recovery that did not finish on a previous pass */
	if (rec->need_recovery) {
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	}

	/* All active nodes must be in normal mode, not recovery mode */
	switch (verify_recmode(ctdb, nodemap)) {
	case MONITOR_RECOVERY_NEEDED:
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	case MONITOR_FAILED:
		return;
	case MONITOR_ELECTION_NEEDED:
		/* can not happen */
	case MONITOR_OK:
		break;
	}

	/* With a recovery lock configured, the recmaster must hold it */
	if (ctdb->recovery_lock != NULL && !ctdb_recovery_have_lock(rec)) {
		DEBUG(DEBUG_ERR, (" Failed recovery lock sanity check. Force a recovery \n "));
		ctdb_set_culprit(rec, ctdb->pnn);
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	}

	/*
	 * With recoveries disabled (possibly because of "reloadnodes")
	 * the nodemap and flag checks could only cause an unnecessary
	 * recovery, so skip them.
	 */
	if (ctdb_op_is_disabled(rec->recovery)) {
		goto takeover_run_checks;
	}

	/* Fetch the nodemap of every active remote node */
	peer_maps = talloc_array(mem_ctx, struct ctdb_node_map_old *,
				 nodemap->num);
	if (peer_maps == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array \n "));
		return;
	}
	for (idx = 0; idx < nodemap->num; idx++) {
		peer_maps[idx] = NULL;
	}
	if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, peer_maps) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to read remote nodemaps \n "));
		return;
	}

	/* Every active node must have the same nodemap as this node */
	for (node = 0; node < nodemap->num; node++) {
		if (nodemap->nodes[node].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		if (peer_maps[node] == NULL) {
			DEBUG(DEBUG_ERR, (__location__ " Did not get a remote nodemap for node %d, restarting monitoring \n ", node));
			ctdb_set_culprit(rec, node);
			return;
		}

		/* Disagreement on the node count warrants a recovery */
		if (peer_maps[node]->num != nodemap->num) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node \n ",
					  nodemap->nodes[node].pnn,
					  peer_maps[node]->num,
					  nodemap->num));
			ctdb_set_culprit(rec, nodemap->nodes[node].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* So does disagreement on which nodes exist */
		for (idx = 0; idx < nodemap->num; idx++) {
			if (peer_maps[node]->nodes[idx].pnn ==
			    nodemap->nodes[idx].pnn) {
				continue;
			}
			DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u). \n ",
					  nodemap->nodes[node].pnn, idx,
					  peer_maps[node]->nodes[idx].pnn,
					  nodemap->nodes[idx].pnn));
			ctdb_set_culprit(rec, nodemap->nodes[node].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}
	}

	/*
	 * Take each active node's own view of its flags, so the local
	 * nodemap holds up-to-date information for all nodes.
	 */
	for (node = 0; node < nodemap->num; node++) {
		if (nodemap->nodes[node].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		nodemap->nodes[node].flags = peer_maps[node]->nodes[node].flags;
	}

	/* The flags seen by every active node must be consistent */
	for (node = 0; node < nodemap->num; node++) {
		if (nodemap->nodes[node].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		for (idx = 0; idx < nodemap->num; idx++) {
			if (nodemap->nodes[idx].flags & NODE_FLAGS_DISCONNECTED) {
				continue;
			}
			if (nodemap->nodes[idx].flags ==
			    peer_maps[node]->nodes[idx].flags) {
				continue;
			}

			DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x \n ",
					  nodemap->nodes[node].pnn,
					  nodemap->nodes[idx].pnn,
					  peer_maps[node]->nodes[idx].flags,
					  nodemap->nodes[idx].flags));

			if (idx == node) {
				/* A node's view of its own flags wins */
				DEBUG(DEBUG_ERR, (" Use flags 0x%02x from remote node %d for cluster update of its own flags \n ", peer_maps[node]->nodes[idx].flags, node));
				update_flags_on_all_nodes(ctdb, nodemap,
							  nodemap->nodes[idx].pnn,
							  peer_maps[node]->nodes[idx].flags);
			} else {
				/* Otherwise the recmaster's view wins */
				DEBUG(DEBUG_ERR, (" Use flags 0x%02x from local recmaster node for cluster update of node %d flags \n ", nodemap->nodes[idx].flags, idx));
				update_flags_on_all_nodes(ctdb, nodemap,
							  nodemap->nodes[idx].pnn,
							  nodemap->nodes[idx].flags);
			}

			/* Either way the mismatch leads to a recovery */
			ctdb_set_culprit(rec, nodemap->nodes[node].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}
	}

	/* Count the active nodes that have the lmaster capability */
	num_lmasters = 0;
	for (idx = 0; idx < nodemap->num; idx++) {
		if (nodemap->nodes[idx].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (ctdb_node_has_capabilities(rec->caps,
					       ctdb->nodes[idx]->pnn,
					       CTDB_CAP_LMASTER)) {
			num_lmasters++;
		}
	}

	/*
	 * The vnnmap must have exactly one entry per active
	 * lmaster-capable node; otherwise do a recovery.
	 */
	if (vnnmap->size != num_lmasters) {
		DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u \n ",
				  vnnmap->size, num_lmasters));
		ctdb_set_culprit(rec, ctdb->pnn);
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	}

	/* Every other active node must appear in the vnnmap */
	for (node = 0; node < nodemap->num; node++) {
		if (nodemap->nodes[node].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (nodemap->nodes[node].pnn == pnn) {
			continue;
		}

		for (idx = 0; idx < vnnmap->size; idx++) {
			if (vnnmap->map[idx] == nodemap->nodes[node].pnn) {
				break;
			}
		}
		/* The scan ran off the end: the node is missing */
		if (idx == vnnmap->size) {
			DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap \n ",
					  nodemap->nodes[node].pnn));
			ctdb_set_culprit(rec, nodemap->nodes[node].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}
	}

	/*
	 * Every other active node must have the same vnnmap, from the
	 * same generation.
	 */
	for (node = 0; node < nodemap->num; node++) {
		if (nodemap->nodes[node].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (nodemap->nodes[node].pnn == pnn) {
			continue;
		}

		rc = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(),
					 nodemap->nodes[node].pnn,
					 mem_ctx, &peer_vnnmap);
		if (rc != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u \n ",
					  nodemap->nodes[node].pnn));
			return;
		}

		/* Same generation? */
		if (vnnmap->generation != peer_vnnmap->generation) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours) \n ",
					  nodemap->nodes[node].pnn,
					  peer_vnnmap->generation,
					  vnnmap->generation));
			ctdb_set_culprit(rec, nodemap->nodes[node].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* Same size? */
		if (vnnmap->size != peer_vnnmap->size) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours) \n ",
					  nodemap->nodes[node].pnn,
					  peer_vnnmap->size,
					  vnnmap->size));
			ctdb_set_culprit(rec, nodemap->nodes[node].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* Same entries? */
		for (idx = 0; idx < vnnmap->size; idx++) {
			if (peer_vnnmap->map[idx] == vnnmap->map[idx]) {
				continue;
			}
			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap. \n ",
					  nodemap->nodes[node].pnn));
			ctdb_set_culprit(rec, nodemap->nodes[node].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}
	}

	/* FIXME: Add remote public IP checking to ensure that nodes
	 * have the IP addresses that are allocated to them. */

takeover_run_checks:

	/*
	 * Perform a takeover run and notify the waiters if one was
	 * requested or the previous one failed, unless takeover runs
	 * are disabled.
	 */
	if (!ctdb_op_is_disabled(rec->takeover_run) &&
	    (rec->reallocate_requests || rec->need_takeover_run)) {
		process_ipreallocate_requests(ctdb, rec);
	}
}
2016-06-02 09:26:40 +10:00
static void recd_sig_term_handler ( struct tevent_context * ev ,
struct tevent_signal * se , int signum ,
int count , void * dont_care ,
void * private_data )
{
struct ctdb_recoverd * rec = talloc_get_type_abort (
private_data , struct ctdb_recoverd ) ;
2016-11-25 14:57:30 +11:00
DEBUG ( DEBUG_ERR , ( " Received SIGTERM, exiting \n " ) ) ;
2016-06-02 09:26:40 +10:00
ctdb_recovery_unlock ( rec ) ;
exit ( 0 ) ;
}
2010-06-22 22:50:23 +09:30
/*
the main monitoring loop
*/
static void monitor_cluster ( struct ctdb_context * ctdb )
{
2016-06-02 09:26:40 +10:00
struct tevent_signal * se ;
2010-06-22 22:50:23 +09:30
struct ctdb_recoverd * rec ;
DEBUG ( DEBUG_NOTICE , ( " monitor_cluster starting \n " ) ) ;
rec = talloc_zero ( ctdb , struct ctdb_recoverd ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , rec ) ;
rec - > ctdb = ctdb ;
2015-11-10 13:54:47 +11:00
rec - > recmaster = CTDB_UNKNOWN_PNN ;
2016-05-24 14:54:39 +10:00
rec - > recovery_lock_handle = NULL ;
2007-06-06 10:25:46 +10:00
2015-02-08 20:52:12 +11:00
rec - > takeover_run = ctdb_op_init ( rec , " takeover runs " ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , rec - > takeover_run ) ;
2013-09-03 11:20:01 +10:00
2015-02-06 14:47:33 +11:00
rec - > recovery = ctdb_op_init ( rec , " recoveries " ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , rec - > recovery ) ;
2010-06-22 22:50:23 +09:30
rec - > priority_time = timeval_current ( ) ;
2016-06-01 12:10:46 +10:00
rec - > frozen_on_inactive = false ;
2008-06-26 13:08:37 +10:00
2016-06-02 09:26:40 +10:00
se = tevent_add_signal ( ctdb - > ev , ctdb , SIGTERM , 0 ,
recd_sig_term_handler , rec ) ;
if ( se = = NULL ) {
DEBUG ( DEBUG_ERR , ( " Failed to install SIGTERM handler \n " ) ) ;
exit ( 1 ) ;
}
2010-06-22 22:50:23 +09:30
/* register a message port for sending memory dumps */
ctdb_client_set_message_handler ( ctdb , CTDB_SRVID_MEM_DUMP , mem_dump_handler , rec ) ;
2007-05-04 09:45:53 +10:00
2016-03-17 17:26:30 +11:00
/* when a node is assigned banning credits */
ctdb_client_set_message_handler ( ctdb , CTDB_SRVID_BANNING ,
banning_handler , rec ) ;
2010-06-22 22:50:23 +09:30
/* register a message port for recovery elections */
2015-10-29 17:51:52 +11:00
ctdb_client_set_message_handler ( ctdb , CTDB_SRVID_ELECTION , election_handler , rec ) ;
2010-06-22 22:50:23 +09:30
/* when nodes are disabled/enabled */
ctdb_client_set_message_handler ( ctdb , CTDB_SRVID_SET_NODE_FLAGS , monitor_handler , rec ) ;
/* when we are asked to puch out a flag change */
ctdb_client_set_message_handler ( ctdb , CTDB_SRVID_PUSH_NODE_FLAGS , push_flags_handler , rec ) ;
/* register a message port for vacuum fetch */
ctdb_client_set_message_handler ( ctdb , CTDB_SRVID_VACUUM_FETCH , vacuum_fetch_handler , rec ) ;
/* register a message port for reloadnodes */
ctdb_client_set_message_handler ( ctdb , CTDB_SRVID_RELOAD_NODES , reload_nodes_handler , rec ) ;
/* register a message port for performing a takeover run */
ctdb_client_set_message_handler ( ctdb , CTDB_SRVID_TAKEOVER_RUN , ip_reallocate_handler , rec ) ;
/* register a message port for disabling the ip check for a short while */
ctdb_client_set_message_handler ( ctdb , CTDB_SRVID_DISABLE_IP_CHECK , disable_ip_check_handler , rec ) ;
2012-02-28 06:56:04 +11:00
/* register a message port for forcing a rebalance of a node next
reallocation */
ctdb_client_set_message_handler ( ctdb , CTDB_SRVID_REBALANCE_NODE , recd_node_rebalance_handler , rec ) ;
2013-08-27 15:04:40 +10:00
/* Register a message port for disabling takeover runs */
ctdb_client_set_message_handler ( ctdb ,
CTDB_SRVID_DISABLE_TAKEOVER_RUNS ,
disable_takeover_runs_handler , rec ) ;
2015-02-06 15:06:44 +11:00
/* Register a message port for disabling recoveries */
ctdb_client_set_message_handler ( ctdb ,
CTDB_SRVID_DISABLE_RECOVERIES ,
disable_recoveries_handler , rec ) ;
2014-04-22 15:24:49 +10:00
/* register a message port for detaching database */
ctdb_client_set_message_handler ( ctdb ,
CTDB_SRVID_DETACH_DATABASE ,
detach_database_handler , rec ) ;
2010-06-22 22:50:23 +09:30
for ( ; ; ) {
TALLOC_CTX * mem_ctx = talloc_new ( ctdb ) ;
2010-06-22 22:50:35 +09:30
struct timeval start ;
double elapsed ;
2010-06-22 22:50:23 +09:30
if ( ! mem_ctx ) {
DEBUG ( DEBUG_CRIT , ( __location__
" Failed to create temp context \n " ) ) ;
exit ( - 1 ) ;
}
2010-06-22 22:50:35 +09:30
start = timeval_current ( ) ;
2010-06-22 22:50:23 +09:30
main_loop ( ctdb , rec , mem_ctx ) ;
talloc_free ( mem_ctx ) ;
/* we only check for recovery once every second */
2010-06-22 22:50:35 +09:30
elapsed = timeval_elapsed ( & start ) ;
if ( elapsed < ctdb - > tunable . recover_interval ) {
ctdb_wait_timeout ( ctdb , ctdb - > tunable . recover_interval
- elapsed ) ;
}
2010-06-22 22:50:23 +09:30
}
2007-05-04 08:30:18 +10:00
}
2007-06-06 10:25:46 +10:00
/*
2007-06-07 15:18:55 +10:00
event handler for when the main ctdbd dies
*/
2015-10-26 16:50:09 +11:00
static void ctdb_recoverd_parent ( struct tevent_context * ev ,
struct tevent_fd * fde ,
2007-05-15 15:13:36 +10:00
uint16_t flags , void * private_data )
{
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_ALERT , ( " recovery daemon parent died - exiting \n " ) ) ;
2007-05-15 15:13:36 +10:00
_exit ( 1 ) ;
}
2008-05-06 11:19:17 +10:00
/*
called regularly to verify that the recovery daemon is still running
*/
2015-10-26 16:50:09 +11:00
static void ctdb_check_recd ( struct tevent_context * ev ,
struct tevent_timer * te ,
struct timeval yt , void * p )
2008-05-06 11:19:17 +10:00
{
struct ctdb_context * ctdb = talloc_get_type ( p , struct ctdb_context ) ;
2012-05-03 11:42:41 +10:00
if ( ctdb_kill ( ctdb , ctdb - > recoverd_pid , 0 ) ! = 0 ) {
2011-03-01 12:09:42 +11:00
DEBUG ( DEBUG_ERR , ( " Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon. \n " , ( int ) ctdb - > recoverd_pid ) ) ;
2008-05-06 11:19:17 +10:00
2015-10-26 16:50:09 +11:00
tevent_add_timer ( ctdb - > ev , ctdb , timeval_zero ( ) ,
ctdb_restart_recd , ctdb ) ;
2008-05-06 11:19:17 +10:00
2011-03-01 12:09:42 +11:00
return ;
2008-05-06 11:19:17 +10:00
}
2015-10-26 16:50:09 +11:00
tevent_add_timer ( ctdb - > ev , ctdb - > recd_ctx ,
timeval_current_ofs ( 30 , 0 ) ,
ctdb_check_recd , ctdb ) ;
2008-05-06 11:19:17 +10:00
}
2015-10-26 16:50:09 +11:00
static void recd_sig_child_handler ( struct tevent_context * ev ,
struct tevent_signal * se , int signum ,
int count , void * dont_care ,
void * private_data )
2008-07-09 14:02:54 +10:00
{
// struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
int status ;
pid_t pid = - 1 ;
while ( pid ! = 0 ) {
pid = waitpid ( - 1 , & status , WNOHANG ) ;
if ( pid = = - 1 ) {
2009-06-19 15:55:13 +10:00
if ( errno ! = ECHILD ) {
DEBUG ( DEBUG_ERR , ( __location__ " waitpid() returned error. errno:%s(%d) \n " , strerror ( errno ) , errno ) ) ;
}
2008-07-09 14:02:54 +10:00
return ;
}
if ( pid > 0 ) {
DEBUG ( DEBUG_DEBUG , ( " RECD SIGCHLD from %d \n " , ( int ) pid ) ) ;
}
}
}
2007-06-07 15:18:55 +10:00
/*
startup the recovery daemon as a child of the main ctdb daemon
*/
2007-05-15 15:13:36 +10:00
int ctdb_start_recoverd ( struct ctdb_context * ctdb )
2007-05-04 08:30:18 +10:00
{
2007-05-15 15:13:36 +10:00
int fd [ 2 ] ;
2015-10-26 16:50:09 +11:00
struct tevent_signal * se ;
2010-08-18 09:16:31 +09:30
struct tevent_fd * fde ;
2016-11-29 16:49:41 +11:00
int ret ;
2007-05-04 08:30:18 +10:00
2007-05-15 15:13:36 +10:00
if ( pipe ( fd ) ! = 0 ) {
return - 1 ;
2007-05-04 08:30:18 +10:00
}
2014-08-08 12:51:03 +10:00
ctdb - > recoverd_pid = ctdb_fork ( ctdb ) ;
2007-10-22 12:34:08 +10:00
if ( ctdb - > recoverd_pid = = - 1 ) {
2007-05-15 15:13:36 +10:00
return - 1 ;
2007-05-04 08:30:18 +10:00
}
2012-12-04 15:05:44 +11:00
2007-10-22 12:34:08 +10:00
if ( ctdb - > recoverd_pid ! = 0 ) {
2012-12-04 15:05:44 +11:00
talloc_free ( ctdb - > recd_ctx ) ;
ctdb - > recd_ctx = talloc_new ( ctdb ) ;
CTDB_NO_MEMORY ( ctdb , ctdb - > recd_ctx ) ;
2007-05-15 15:13:36 +10:00
close ( fd [ 0 ] ) ;
2015-10-26 16:50:09 +11:00
tevent_add_timer ( ctdb - > ev , ctdb - > recd_ctx ,
timeval_current_ofs ( 30 , 0 ) ,
ctdb_check_recd , ctdb ) ;
2007-05-15 15:13:36 +10:00
return 0 ;
2007-05-04 08:30:18 +10:00
}
2007-05-15 15:13:36 +10:00
close ( fd [ 1 ] ) ;
srandom ( getpid ( ) ^ time ( NULL ) ) ;
2007-05-04 08:30:18 +10:00
2016-11-29 16:49:41 +11:00
ret = logging_init ( ctdb , NULL , NULL , " ctdb-recoverd " ) ;
if ( ret ! = 0 ) {
return - 1 ;
}
2015-09-23 16:10:59 -07:00
prctl_set_comment ( " ctdb_recovered " ) ;
2016-11-25 14:44:10 +11:00
if ( switch_from_server_to_client ( ctdb ) ! = 0 ) {
2009-03-23 12:37:30 +11:00
DEBUG ( DEBUG_CRIT , ( __location__ " ERROR: failed to switch recovery daemon into client mode. shutting down. \n " ) ) ;
2007-05-04 08:30:18 +10:00
exit ( 1 ) ;
}
2010-02-04 06:37:41 +11:00
DEBUG ( DEBUG_DEBUG , ( __location__ " Created PIPE FD:%d to recovery daemon \n " , fd [ 0 ] ) ) ;
2009-10-15 11:24:54 +11:00
2015-10-26 16:50:09 +11:00
fde = tevent_add_fd ( ctdb - > ev , ctdb , fd [ 0 ] , TEVENT_FD_READ ,
ctdb_recoverd_parent , & fd [ 0 ] ) ;
2010-08-18 09:16:31 +09:30
tevent_fd_set_auto_close ( fde ) ;
2009-03-23 12:37:30 +11:00
2008-07-09 14:02:54 +10:00
/* set up a handler to pick up sigchld */
2015-10-26 16:50:09 +11:00
se = tevent_add_signal ( ctdb - > ev , ctdb , SIGCHLD , 0 ,
recd_sig_child_handler , ctdb ) ;
2008-07-09 14:02:54 +10:00
if ( se = = NULL ) {
DEBUG ( DEBUG_CRIT , ( " Failed to set up signal handler for SIGCHLD in recovery daemon \n " ) ) ;
exit ( 1 ) ;
}
2007-05-15 15:13:36 +10:00
monitor_cluster ( ctdb ) ;
2007-05-07 06:51:58 +10:00
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_ALERT , ( " ERROR: ctdb_recoverd finished!? \n " ) ) ;
2007-05-15 15:13:36 +10:00
return - 1 ;
2007-05-04 08:30:18 +10:00
}
2007-10-22 12:34:08 +10:00
/*
shutdown the recovery daemon
*/
void ctdb_stop_recoverd ( struct ctdb_context * ctdb )
{
if ( ctdb - > recoverd_pid = = 0 ) {
return ;
}
2008-02-04 20:07:15 +11:00
DEBUG ( DEBUG_NOTICE , ( " Shutting down recovery daemon \n " ) ) ;
2012-05-03 11:42:41 +10:00
ctdb_kill ( ctdb , ctdb - > recoverd_pid , SIGTERM ) ;
2012-12-04 15:05:44 +11:00
TALLOC_FREE ( ctdb - > recd_ctx ) ;
TALLOC_FREE ( ctdb - > recd_ping_count ) ;
2007-10-22 12:34:08 +10:00
}
2011-03-01 12:09:42 +11:00
2015-10-26 16:50:09 +11:00
static void ctdb_restart_recd ( struct tevent_context * ev ,
struct tevent_timer * te ,
struct timeval t , void * private_data )
2011-03-01 12:09:42 +11:00
{
struct ctdb_context * ctdb = talloc_get_type ( private_data , struct ctdb_context ) ;
DEBUG ( DEBUG_ERR , ( " Restarting recovery daemon \n " ) ) ;
ctdb_stop_recoverd ( ctdb ) ;
ctdb_start_recoverd ( ctdb ) ;
}