2007-05-04 02:30:18 +04:00
/*
ctdb recovery daemon
Copyright ( C ) Ronnie Sahlberg 2007
2007-05-31 07:50:53 +04:00
This program is free software ; you can redistribute it and / or modify
it under the terms of the GNU General Public License as published by
2007-07-10 09:29:31 +04:00
the Free Software Foundation ; either version 3 of the License , or
2007-05-31 07:50:53 +04:00
( at your option ) any later version .
This program is distributed in the hope that it will be useful ,
2007-05-04 02:30:18 +04:00
but WITHOUT ANY WARRANTY ; without even the implied warranty of
2007-05-31 07:50:53 +04:00
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
GNU General Public License for more details .
You should have received a copy of the GNU General Public License
2007-07-10 09:29:31 +04:00
along with this program ; if not , see < http : //www.gnu.org/licenses/>.
2007-05-04 02:30:18 +04:00
*/
# include "includes.h"
# include "lib/events/events.h"
# include "system/filesys.h"
2007-05-10 08:06:48 +04:00
# include "system/time.h"
2007-05-04 02:30:18 +04:00
# include "popt.h"
# include "cmdline.h"
# include "../include/ctdb.h"
# include "../include/ctdb_private.h"
2007-06-07 10:34:33 +04:00
/*
  ban record for one node; ctdb_ban_node() stores one of these in
  ctdb_recoverd.banned_nodes[] for the node it bans
 */
struct ban_state {
struct ctdb_recoverd * rec ; /* recovery daemon state that owns this record */
uint32_t banned_node ; /* node number of the banned node */
} ;
2007-06-07 09:18:55 +04:00
/*
  private state of the recovery daemon
 */
struct ctdb_recoverd {
	/* ctdb context the daemon operates on */
	struct ctdb_context *ctdb;

	/* node blamed for the most recent recovery */
	uint32_t last_culprit;

	/* recoveries counted against last_culprit since first_recover_time */
	uint32_t culprit_counter;

	/* when counting against last_culprit was last restarted */
	struct timeval first_recover_time;

	/* per-node ban records, indexed by node number; NULL when not banned */
	struct ban_state **banned_nodes;

	/* timestamp used as election priority; refreshed when we ban ourselves */
	struct timeval priority_time;
};
2007-05-04 02:30:18 +04:00
2007-06-04 14:22:44 +04:00
/*
  timeout passed to the ctdb_ctrl_*() calls in this file, built from the
  recover_timeout tunable.
  NOTE: expands to an expression that uses a variable named 'ctdb', so it
  can only be used where such a variable is in scope.
 */
# define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
2007-06-06 04:25:46 +04:00
/*
  timeout built from the recover_interval tunable.
  NOTE: expands to an expression that uses a variable named 'ctdb', so it
  can only be used where such a variable is in scope.
 */
# define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
2007-05-24 07:49:27 +04:00
2007-06-07 10:34:33 +04:00
/*
unban a node
*/
static void ctdb_unban_node ( struct ctdb_recoverd * rec , uint32_t vnn )
{
struct ctdb_context * ctdb = rec - > ctdb ;
2007-09-04 04:09:58 +04:00
if ( ! ctdb_validate_pnn ( ctdb , vnn ) ) {
DEBUG ( 0 , ( " Bad pnn %u in ctdb_ban_node \n " , vnn ) ) ;
2007-06-07 10:48:31 +04:00
return ;
}
2007-06-07 10:34:33 +04:00
if ( rec - > banned_nodes [ vnn ] = = NULL ) {
return ;
}
ctdb_ctrl_modflags ( ctdb , CONTROL_TIMEOUT ( ) , vnn , 0 , NODE_FLAGS_BANNED ) ;
talloc_free ( rec - > banned_nodes [ vnn ] ) ;
rec - > banned_nodes [ vnn ] = NULL ;
}
/*
called when a ban has timed out
*/
static void ctdb_ban_timeout ( struct event_context * ev , struct timed_event * te , struct timeval t , void * p )
{
struct ban_state * state = talloc_get_type ( p , struct ban_state ) ;
struct ctdb_recoverd * rec = state - > rec ;
uint32_t vnn = state - > banned_node ;
2007-06-07 12:37:27 +04:00
DEBUG ( 0 , ( " Node %u is now unbanned \n " , vnn ) ) ;
2007-06-07 10:34:33 +04:00
ctdb_unban_node ( rec , vnn ) ;
}
/*
ban a node for a period of time
*/
static void ctdb_ban_node ( struct ctdb_recoverd * rec , uint32_t vnn , uint32_t ban_time )
{
struct ctdb_context * ctdb = rec - > ctdb ;
2007-09-04 04:09:58 +04:00
if ( ! ctdb_validate_pnn ( ctdb , vnn ) ) {
DEBUG ( 0 , ( " Bad pnn %u in ctdb_ban_node \n " , vnn ) ) ;
2007-06-07 10:48:31 +04:00
return ;
}
2007-09-04 04:06:36 +04:00
if ( vnn = = ctdb - > pnn ) {
2007-06-07 13:21:55 +04:00
DEBUG ( 0 , ( " self ban - lowering our election priority \n " ) ) ;
2007-06-07 12:37:27 +04:00
/* banning ourselves - lower our election priority */
rec - > priority_time = timeval_current ( ) ;
}
2007-06-07 10:34:33 +04:00
ctdb_ctrl_modflags ( ctdb , CONTROL_TIMEOUT ( ) , vnn , NODE_FLAGS_BANNED , 0 ) ;
rec - > banned_nodes [ vnn ] = talloc ( rec , struct ban_state ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , rec - > banned_nodes [ vnn ] ) ;
rec - > banned_nodes [ vnn ] - > rec = rec ;
rec - > banned_nodes [ vnn ] - > banned_node = vnn ;
if ( ban_time ! = 0 ) {
event_add_timed ( ctdb - > ev , rec - > banned_nodes [ vnn ] ,
timeval_current_ofs ( ban_time , 0 ) ,
ctdb_ban_timeout , rec - > banned_nodes [ vnn ] ) ;
}
}
2007-08-27 04:31:22 +04:00
/* outcome codes; freeze_all_nodes() returns MONITOR_OK or MONITOR_RECOVERY_NEEDED */
enum monitor_result { MONITOR_OK , MONITOR_RECOVERY_NEEDED , MONITOR_ELECTION_NEEDED , MONITOR_FAILED } ;
/* bookkeeping shared between freeze_all_nodes() and freeze_node_callback() */
struct freeze_node_data {
uint32_t count ; /* freeze controls sent that have not called back yet */
enum monitor_result status ; /* MONITOR_OK until some node fails to freeze */
} ;
static void freeze_node_callback ( struct ctdb_client_control_state * state )
{
struct freeze_node_data * fndata = talloc_get_type ( state - > async . private , struct freeze_node_data ) ;
/* one more node has responded to our freeze node*/
fndata - > count - - ;
/* if we failed to freeze the node, we must trigger another recovery */
if ( ( state - > state ! = CTDB_CONTROL_DONE ) | | ( state - > status ! = 0 ) ) {
DEBUG ( 0 , ( __location__ " Failed to freeze node:%u. recovery failed \n " , state - > c - > hdr . destnode ) ) ;
fndata - > status = MONITOR_RECOVERY_NEEDED ;
}
return ;
}
/* freeze all nodes */
static enum monitor_result freeze_all_nodes ( struct ctdb_context * ctdb , struct ctdb_node_map * nodemap )
{
struct freeze_node_data * fndata ;
TALLOC_CTX * mem_ctx = talloc_new ( ctdb ) ;
struct ctdb_client_control_state * state ;
enum monitor_result status ;
int j ;
fndata = talloc ( mem_ctx , struct freeze_node_data ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , fndata ) ;
fndata - > count = 0 ;
fndata - > status = MONITOR_OK ;
/* loop over all active nodes and send an async freeze call to
them */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
continue ;
}
state = ctdb_ctrl_freeze_send ( ctdb , mem_ctx ,
CONTROL_TIMEOUT ( ) ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn ) ;
2007-08-27 04:31:22 +04:00
if ( state = = NULL ) {
/* we failed to send the control, treat this as
an error and try again next iteration
*/
DEBUG ( 0 , ( " Failed to call ctdb_ctrl_freeze_send during recovery \n " ) ) ;
talloc_free ( mem_ctx ) ;
return MONITOR_RECOVERY_NEEDED ;
}
/* set up the callback functions */
state - > async . fn = freeze_node_callback ;
state - > async . private = fndata ;
/* one more control to wait for to complete */
fndata - > count + + ;
}
/* now wait for up to the maximum number of seconds allowed
or until all nodes we expect a response from has replied
*/
while ( fndata - > count > 0 ) {
event_loop_once ( ctdb - > ev ) ;
}
status = fndata - > status ;
talloc_free ( mem_ctx ) ;
return status ;
}
2007-06-07 10:34:33 +04:00
2007-06-07 09:18:55 +04:00
/*
change recovery mode on all nodes
*/
2007-05-06 03:53:12 +04:00
static int set_recovery_mode ( struct ctdb_context * ctdb , struct ctdb_node_map * nodemap , uint32_t rec_mode )
{
int j , ret ;
2007-08-27 04:31:22 +04:00
/* freeze all nodes */
if ( rec_mode = = CTDB_RECOVERY_ACTIVE ) {
ret = freeze_all_nodes ( ctdb , nodemap ) ;
if ( ret ! = MONITOR_OK ) {
DEBUG ( 0 , ( __location__ " Unable to freeze nodes. Recovery failed. \n " ) ) ;
return - 1 ;
}
}
2007-06-11 17:03:23 +04:00
2007-05-06 03:53:12 +04:00
/* set recovery mode to active on all nodes */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
/* dont change it for nodes that are unavailable */
2007-06-07 09:18:55 +04:00
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
2007-05-06 03:53:12 +04:00
continue ;
}
2007-09-04 03:50:07 +04:00
ret = ctdb_ctrl_setrecmode ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ j ] . pnn , rec_mode ) ;
2007-05-06 03:53:12 +04:00
if ( ret ! = 0 ) {
2007-09-04 03:50:07 +04:00
DEBUG ( 0 , ( __location__ " Unable to set recmode on node %u \n " , nodemap - > nodes [ j ] . pnn ) ) ;
2007-05-06 03:53:12 +04:00
return - 1 ;
}
2007-05-12 09:15:27 +04:00
if ( rec_mode = = CTDB_RECOVERY_NORMAL ) {
2007-09-04 03:50:07 +04:00
ret = ctdb_ctrl_thaw ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ j ] . pnn ) ;
2007-05-12 09:15:27 +04:00
if ( ret ! = 0 ) {
2007-09-04 03:50:07 +04:00
DEBUG ( 0 , ( __location__ " Unable to thaw node %u \n " , nodemap - > nodes [ j ] . pnn ) ) ;
2007-05-12 09:15:27 +04:00
return - 1 ;
}
}
2007-05-06 03:53:12 +04:00
}
return 0 ;
}
2007-06-07 09:18:55 +04:00
/*
change recovery master on all node
*/
2007-05-07 00:51:58 +04:00
static int set_recovery_master ( struct ctdb_context * ctdb , struct ctdb_node_map * nodemap , uint32_t vnn )
{
int j , ret ;
/* set recovery master to vnn on all nodes */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
/* dont change it for nodes that are unavailable */
2007-06-07 09:18:55 +04:00
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
2007-05-07 00:51:58 +04:00
continue ;
}
2007-09-04 03:50:07 +04:00
ret = ctdb_ctrl_setrecmaster ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ j ] . pnn , vnn ) ;
2007-05-07 00:51:58 +04:00
if ( ret ! = 0 ) {
2007-09-04 03:50:07 +04:00
DEBUG ( 0 , ( __location__ " Unable to set recmaster on node %u \n " , nodemap - > nodes [ j ] . pnn ) ) ;
2007-05-07 00:51:58 +04:00
return - 1 ;
}
}
return 0 ;
}
2007-06-07 09:18:55 +04:00
/*
  ensure all other nodes have attached to any databases that we have

  ctdb      ctdb context
  nodemap   nodes to check; our own node and nodes with
            NODE_FLAGS_INACTIVE set are skipped
  vnn       our own node number
  dbmap     databases attached on our node
  mem_ctx   talloc context for the results of the controls issued here

  Returns 0 on success, -1 as soon as any control fails.
 */
static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
					   uint32_t vnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
{
	int i, j, db, ret;
	struct ctdb_dbid_map *remote_dbmap;

	/* verify that all other nodes have all our databases */
	for (j=0; j<nodemap->num; j++) {
		/* we don't need to check ourselves */
		if (nodemap->nodes[j].pnn == vnn) {
			continue;
		}
		/* don't check nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					 mem_ctx, &remote_dbmap);
		if (ret != 0) {
			/* name the node that was queried, not ourselves */
			DEBUG(0,(__location__ " Unable to get dbids from node %u \n ", nodemap->nodes[j].pnn));
			return -1;
		}

		/* step through all local databases */
		for (db=0; db<dbmap->num; db++) {
			const char *name;

			for (i=0; i<remote_dbmap->num; i++) {
				if (dbmap->dbids[db] == remote_dbmap->dbids[i]) {
					break;
				}
			}
			/* the remote node already has this database */
			if (i != remote_dbmap->num) {
				continue;
			}

			/* ok so we need to create this database; look up its
			   name on our own node first.  The result must be
			   stored in ret, otherwise the check below would test
			   the stale value left by getdbmap and 'name' could be
			   used uninitialised. */
			ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), vnn, dbmap->dbids[db], mem_ctx, &name);
			if (ret != 0) {
				DEBUG(0,(__location__ " Unable to get dbname from node %u \n ", vnn));
				return -1;
			}

			ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, name);
			if (ret != 0) {
				DEBUG(0,(__location__ " Unable to create remote db:%s \n ", name));
				return -1;
			}
		}
	}

	return 0;
}
2007-06-07 09:18:55 +04:00
/*
  ensure we are attached to any databases that anyone else is attached to

  ctdb      ctdb context
  nodemap   nodes to check; our own node and nodes with
            NODE_FLAGS_INACTIVE set are skipped
  vnn       our own node number
  dbmap     in/out: databases attached on our node; re-read from our
            node each time a database has been created locally
  mem_ctx   talloc context for the results of the controls issued here

  Returns 0 on success, -1 as soon as any control fails.
 */
static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
					  uint32_t vnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
{
	int i, j, db, ret;
	struct ctdb_dbid_map *remote_dbmap;

	/* verify that we have all databases any other node has */
	for (j=0; j<nodemap->num; j++) {
		/* we don't need to check ourselves */
		if (nodemap->nodes[j].pnn == vnn) {
			continue;
		}
		/* don't check nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					 mem_ctx, &remote_dbmap);
		if (ret != 0) {
			/* name the node that was queried, not ourselves */
			DEBUG(0,(__location__ " Unable to get dbids from node %u \n ", nodemap->nodes[j].pnn));
			return -1;
		}

		/* step through all databases on the remote node */
		for (db=0; db<remote_dbmap->num; db++) {
			const char *name;

			for (i=0; i<(*dbmap)->num; i++) {
				if (remote_dbmap->dbids[db] == (*dbmap)->dbids[i]) {
					break;
				}
			}
			/* we already have this db locally */
			if (i != (*dbmap)->num) {
				continue;
			}

			/* ok so we need to create this database and rebuild
			   dbmap.  Each result must be stored in ret, otherwise
			   the checks below would test a stale value and 'name'
			   could be used uninitialised. */
			ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
						  remote_dbmap->dbids[db], mem_ctx, &name);
			if (ret != 0) {
				DEBUG(0,(__location__ " Unable to get dbname from node %u \n ",
					 nodemap->nodes[j].pnn));
				return -1;
			}

			ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), vnn, mem_ctx, name);
			if (ret != 0) {
				DEBUG(0,(__location__ " Unable to create local db:%s \n ", name));
				return -1;
			}

			/* pick up the database we just created */
			ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), vnn, mem_ctx, dbmap);
			if (ret != 0) {
				DEBUG(0,(__location__ " Unable to reread dbmap on node %u \n ", vnn));
				return -1;
			}
		}
	}

	return 0;
}
2007-05-06 04:16:48 +04:00
2007-06-07 09:18:55 +04:00
/*
  pull all the remote database contents into ours

  For every database in dbmap, copies it from each other available node
  onto node vnn. Returns 0 on success, -1 at the first failed copy.
 */
static int pull_all_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
				     uint32_t vnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
{
	int db_idx, node_idx, ret;

	for (db_idx = 0; db_idx < dbmap->num; db_idx++) {
		for (node_idx = 0; node_idx < nodemap->num; node_idx++) {
			/* nothing to merge from ourselves */
			if (nodemap->nodes[node_idx].pnn == vnn) {
				continue;
			}
			/* unavailable nodes cannot be merged from */
			if (nodemap->nodes[node_idx].flags & NODE_FLAGS_INACTIVE) {
				continue;
			}

			/* source is the remote node, destination is us */
			ret = ctdb_ctrl_copydb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[node_idx].pnn,
					       vnn, dbmap->dbids[db_idx], CTDB_LMASTER_ANY, mem_ctx);
			if (ret != 0) {
				DEBUG(0,(__location__ " Unable to copy db from node %u to node %u \n ",
					 nodemap->nodes[node_idx].pnn, vnn));
				return -1;
			}
		}
	}

	return 0;
}
2007-06-07 09:18:55 +04:00
/*
change the dmaster on all databases to point to us
*/
2007-05-23 11:21:14 +04:00
static int update_dmaster_on_all_databases ( struct ctdb_context * ctdb , struct ctdb_node_map * nodemap ,
uint32_t vnn , struct ctdb_dbid_map * dbmap , TALLOC_CTX * mem_ctx )
2007-05-06 04:22:13 +04:00
{
int i , j , ret ;
/* update dmaster to point to this node for all databases/nodes */
for ( i = 0 ; i < dbmap - > num ; i + + ) {
for ( j = 0 ; j < nodemap - > num ; j + + ) {
/* dont repoint nodes that are unavailable */
2007-06-07 09:18:55 +04:00
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
2007-05-06 04:22:13 +04:00
continue ;
}
2007-09-04 03:50:07 +04:00
ret = ctdb_ctrl_setdmaster ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ j ] . pnn , ctdb , dbmap - > dbids [ i ] , vnn ) ;
2007-05-06 04:22:13 +04:00
if ( ret ! = 0 ) {
2007-09-04 03:50:07 +04:00
DEBUG ( 0 , ( __location__ " Unable to set dmaster for node %u db:0x%08x \n " , nodemap - > nodes [ j ] . pnn , dbmap - > dbids [ i ] ) ) ;
2007-05-06 04:22:13 +04:00
return - 1 ;
}
}
}
return 0 ;
}
2007-06-07 09:18:55 +04:00
/*
update flags on all active nodes
*/
static int update_flags_on_all_nodes ( struct ctdb_context * ctdb , struct ctdb_node_map * nodemap )
{
int i ;
for ( i = 0 ; i < nodemap - > num ; i + + ) {
struct ctdb_node_flag_change c ;
TDB_DATA data ;
2007-09-04 03:50:07 +04:00
c . vnn = nodemap - > nodes [ i ] . pnn ;
2007-08-21 11:25:15 +04:00
c . old_flags = nodemap - > nodes [ i ] . flags ;
c . new_flags = nodemap - > nodes [ i ] . flags ;
2007-06-07 09:18:55 +04:00
data . dptr = ( uint8_t * ) & c ;
data . dsize = sizeof ( c ) ;
2007-06-09 15:58:50 +04:00
ctdb_send_message ( ctdb , CTDB_BROADCAST_CONNECTED ,
2007-06-07 09:18:55 +04:00
CTDB_SRVID_NODE_FLAGS_CHANGED , data ) ;
}
return 0 ;
}
2007-05-23 11:21:14 +04:00
/*
vacuum one database
*/
static int vacuum_db ( struct ctdb_context * ctdb , uint32_t db_id , struct ctdb_node_map * nodemap )
{
uint64_t max_rsn ;
int ret , i ;
/* find max rsn on our local node for this db */
2007-05-24 07:49:27 +04:00
ret = ctdb_ctrl_get_max_rsn ( ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE , db_id , & max_rsn ) ;
2007-05-23 11:21:14 +04:00
if ( ret ! = 0 ) {
return - 1 ;
}
/* set rsn on non-empty records to max_rsn+1 */
for ( i = 0 ; i < nodemap - > num ; i + + ) {
2007-06-07 09:18:55 +04:00
if ( nodemap - > nodes [ i ] . flags & NODE_FLAGS_INACTIVE ) {
2007-05-23 11:21:14 +04:00
continue ;
}
2007-09-04 03:50:07 +04:00
ret = ctdb_ctrl_set_rsn_nonempty ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ i ] . pnn ,
2007-05-23 11:21:14 +04:00
db_id , max_rsn + 1 ) ;
if ( ret ! = 0 ) {
DEBUG ( 0 , ( __location__ " Failed to set rsn on node %u to %llu \n " ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ i ] . pnn , ( unsigned long long ) max_rsn + 1 ) ) ;
2007-05-23 11:21:14 +04:00
return - 1 ;
}
}
/* delete records with rsn < max_rsn+1 on all nodes */
for ( i = 0 ; i < nodemap - > num ; i + + ) {
2007-06-07 09:18:55 +04:00
if ( nodemap - > nodes [ i ] . flags & NODE_FLAGS_INACTIVE ) {
2007-05-23 11:21:14 +04:00
continue ;
}
2007-09-04 03:50:07 +04:00
ret = ctdb_ctrl_delete_low_rsn ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ i ] . pnn ,
2007-05-23 11:21:14 +04:00
db_id , max_rsn + 1 ) ;
if ( ret ! = 0 ) {
DEBUG ( 0 , ( __location__ " Failed to delete records on node %u with rsn below %llu \n " ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ i ] . pnn , ( unsigned long long ) max_rsn + 1 ) ) ;
2007-05-23 11:21:14 +04:00
return - 1 ;
}
}
2007-05-06 04:30:18 +04:00
2007-05-23 11:21:14 +04:00
return 0 ;
}
2007-06-07 09:18:55 +04:00
/*
vacuum all attached databases
*/
2007-05-23 11:21:14 +04:00
static int vacuum_all_databases ( struct ctdb_context * ctdb , struct ctdb_node_map * nodemap ,
struct ctdb_dbid_map * dbmap )
{
int i ;
/* update dmaster to point to this node for all databases/nodes */
for ( i = 0 ; i < dbmap - > num ; i + + ) {
if ( vacuum_db ( ctdb , dbmap - > dbids [ i ] , nodemap ) ! = 0 ) {
return - 1 ;
}
}
return 0 ;
}
2007-06-07 09:18:55 +04:00
/*
push out all our database contents to all other nodes
*/
2007-05-23 11:21:14 +04:00
static int push_all_local_databases ( struct ctdb_context * ctdb , struct ctdb_node_map * nodemap ,
uint32_t vnn , struct ctdb_dbid_map * dbmap , TALLOC_CTX * mem_ctx )
2007-05-06 04:38:44 +04:00
{
int i , j , ret ;
/* push all records out to the nodes again */
for ( i = 0 ; i < dbmap - > num ; i + + ) {
for ( j = 0 ; j < nodemap - > num ; j + + ) {
/* we dont need to push to ourselves */
2007-09-04 03:50:07 +04:00
if ( nodemap - > nodes [ j ] . pnn = = vnn ) {
2007-05-06 04:38:44 +04:00
continue ;
}
/* dont push to nodes that are unavailable */
2007-06-07 09:18:55 +04:00
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
2007-05-06 04:38:44 +04:00
continue ;
}
2007-09-04 03:50:07 +04:00
ret = ctdb_ctrl_copydb ( ctdb , CONTROL_TIMEOUT ( ) , vnn , nodemap - > nodes [ j ] . pnn ,
2007-06-07 12:39:37 +04:00
dbmap - > dbids [ i ] , CTDB_LMASTER_ANY , mem_ctx ) ;
2007-05-06 04:38:44 +04:00
if ( ret ! = 0 ) {
2007-06-07 12:39:37 +04:00
DEBUG ( 0 , ( __location__ " Unable to copy db from node %u to node %u \n " ,
2007-09-04 03:50:07 +04:00
vnn , nodemap - > nodes [ j ] . pnn ) ) ;
2007-05-06 04:38:44 +04:00
return - 1 ;
}
}
}
return 0 ;
}
2007-06-07 09:18:55 +04:00
/*
ensure all nodes have the same vnnmap we do
*/
2007-05-23 11:21:14 +04:00
static int update_vnnmap_on_all_nodes ( struct ctdb_context * ctdb , struct ctdb_node_map * nodemap ,
uint32_t vnn , struct ctdb_vnn_map * vnnmap , TALLOC_CTX * mem_ctx )
2007-05-06 04:42:18 +04:00
{
int j , ret ;
/* push the new vnn map out to all the nodes */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
/* dont push to nodes that are unavailable */
2007-06-07 09:18:55 +04:00
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
2007-05-06 04:42:18 +04:00
continue ;
}
2007-09-04 03:50:07 +04:00
ret = ctdb_ctrl_setvnnmap ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ j ] . pnn , mem_ctx , vnnmap ) ;
2007-05-06 04:42:18 +04:00
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to set vnnmap for node %u \n " , vnn ) ) ;
2007-05-06 04:42:18 +04:00
return - 1 ;
}
}
return 0 ;
}
2007-06-07 09:18:55 +04:00
2007-06-07 10:34:33 +04:00
/*
  handler for when the admin bans a node

  data carries a struct ctdb_ban_info; private_data is the
  struct ctdb_recoverd. The request is only acted upon when the local
  node is the recovery master.
 */
static void ban_handler(struct ctdb_context *ctdb, uint64_t srvid,
			TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
	struct ctdb_ban_info *b = (struct ctdb_ban_info *)data.dptr;
	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
	uint32_t recmaster;
	int ret;

	/* the payload must be exactly one ban request */
	if (data.dsize != sizeof(*b)) {
		DEBUG(0,(" Bad data in ban_handler \n "));
		goto done;
	}

	ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
	if (ret != 0) {
		DEBUG(0,(__location__ " Failed to find the recmaster \n "));
		goto done;
	}

	if (recmaster != ctdb->pnn) {
		DEBUG(0,(" We are not the recmaster - ignoring ban request \n "));
		goto done;
	}

	DEBUG(0,(" Node %u has been banned for %u seconds by the administrator \n ",
		 b->vnn, b->ban_time));
	ctdb_ban_node(rec, b->vnn, b->ban_time);

done:
	talloc_free(tmp_ctx);
}
2007-06-07 09:18:55 +04:00
/*
  handler for when the admin unbans a node

  data carries the uint32_t node number to unban; private_data is the
  struct ctdb_recoverd. The request is only acted upon when the local
  node is the recovery master.
 */
static void unban_handler(struct ctdb_context *ctdb, uint64_t srvid,
			  TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
	uint32_t recmaster;
	uint32_t vnn;
	int ret;

	/* the payload must be exactly one node number */
	if (data.dsize != sizeof(uint32_t)) {
		DEBUG(0,(" Bad data in unban_handler \n "));
		goto done;
	}
	vnn = *(uint32_t *)data.dptr;

	ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
	if (ret != 0) {
		DEBUG(0,(__location__ " Failed to find the recmaster \n "));
		goto done;
	}

	if (recmaster != ctdb->pnn) {
		DEBUG(0,(" We are not the recmaster - ignoring unban request \n "));
		goto done;
	}

	DEBUG(0,(" Node %u has been unbanned by the administrator \n ", vnn));
	ctdb_unban_node(rec, vnn);

done:
	talloc_free(tmp_ctx);
}
2007-06-07 10:34:33 +04:00
2007-07-04 02:36:59 +04:00
/*
  timed event callback used by ctdb_wait_timeout()

  p points at a uint32_t flag that is set to 1 to signal that the wait
  is over
 */
static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
			      struct timeval yt, void *p)
{
	uint32_t *expired = p;

	*expired = 1;
}
/*
wait for a given number of seconds
*/
static void ctdb_wait_timeout ( struct ctdb_context * ctdb , uint32_t secs )
{
uint32_t timed_out = 0 ;
event_add_timed ( ctdb - > ev , ctdb , timeval_current_ofs ( secs , 0 ) , ctdb_wait_handler , & timed_out ) ;
while ( ! timed_out ) {
event_loop_once ( ctdb - > ev ) ;
}
}
2007-08-22 06:38:31 +04:00
/* Create a new random generation ip.
The generation id can not be the INVALID_GENERATION id
*/
static uint32_t new_generation ( void )
{
uint32_t generation ;
while ( 1 ) {
generation = random ( ) ;
if ( generation ! = INVALID_GENERATION ) {
break ;
}
}
return generation ;
}
2007-06-02 04:03:28 +04:00
/*
we are the recmaster , and recovery is needed - start a recovery run
*/
2007-06-07 09:18:55 +04:00
static int do_recovery ( struct ctdb_recoverd * rec ,
2007-05-25 11:04:13 +04:00
TALLOC_CTX * mem_ctx , uint32_t vnn , uint32_t num_active ,
2007-06-07 09:18:55 +04:00
struct ctdb_node_map * nodemap , struct ctdb_vnn_map * vnnmap ,
uint32_t culprit )
2007-05-06 04:04:37 +04:00
{
2007-06-07 09:18:55 +04:00
struct ctdb_context * ctdb = rec - > ctdb ;
2007-05-06 04:12:42 +04:00
int i , j , ret ;
2007-05-06 04:04:37 +04:00
uint32_t generation ;
struct ctdb_dbid_map * dbmap ;
2007-06-02 04:03:28 +04:00
2007-06-07 09:18:55 +04:00
if ( rec - > last_culprit ! = culprit | |
timeval_elapsed ( & rec - > first_recover_time ) > ctdb - > tunable . recovery_grace_period ) {
/* either a new node is the culprit, or we've decide to forgive them */
rec - > last_culprit = culprit ;
rec - > first_recover_time = timeval_current ( ) ;
rec - > culprit_counter = 0 ;
}
rec - > culprit_counter + + ;
if ( rec - > culprit_counter > 2 * nodemap - > num ) {
DEBUG ( 0 , ( " Node %u has caused %u recoveries in %.0f seconds - banning it for %u seconds \n " ,
culprit , rec - > culprit_counter , timeval_elapsed ( & rec - > first_recover_time ) ,
ctdb - > tunable . recovery_ban_period ) ) ;
2007-06-07 10:34:33 +04:00
ctdb_ban_node ( rec , culprit , ctdb - > tunable . recovery_ban_period ) ;
2007-06-07 09:18:55 +04:00
}
2007-06-02 05:36:42 +04:00
if ( ! ctdb_recovery_lock ( ctdb , true ) ) {
DEBUG ( 0 , ( " Unable to get recovery lock - aborting recovery \n " ) ) ;
2007-06-02 04:03:28 +04:00
return - 1 ;
}
2007-05-06 04:04:37 +04:00
2007-05-12 09:59:49 +04:00
/* set recovery mode to active on all nodes */
ret = set_recovery_mode ( ctdb , nodemap , CTDB_RECOVERY_ACTIVE ) ;
if ( ret ! = 0 ) {
DEBUG ( 0 , ( __location__ " Unable to set recovery mode to active on cluster \n " ) ) ;
return - 1 ;
}
2007-05-06 04:04:37 +04:00
2007-06-07 09:18:55 +04:00
DEBUG ( 0 , ( __location__ " Recovery initiated due to problem with node %u \n " , culprit ) ) ;
2007-05-23 08:35:19 +04:00
2007-05-06 04:04:37 +04:00
/* pick a new generation number */
2007-08-22 06:38:31 +04:00
generation = new_generation ( ) ;
2007-05-06 04:04:37 +04:00
/* change the vnnmap on this node to use the new generation
number but not on any other nodes .
this guarantees that if we abort the recovery prematurely
for some reason ( a node stops responding ? )
that we can just return immediately and we will reenter
recovery shortly again .
I . e . we deliberately leave the cluster with an inconsistent
generation id to allow us to abort recovery at any stage and
just restart it from scratch .
*/
vnnmap - > generation = generation ;
2007-05-24 07:49:27 +04:00
ret = ctdb_ctrl_setvnnmap ( ctdb , CONTROL_TIMEOUT ( ) , vnn , mem_ctx , vnnmap ) ;
2007-05-06 04:04:37 +04:00
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to set vnnmap for node %u \n " , vnn ) ) ;
2007-05-06 04:04:37 +04:00
return - 1 ;
}
/* get a list of all databases */
2007-05-24 07:49:27 +04:00
ret = ctdb_ctrl_getdbmap ( ctdb , CONTROL_TIMEOUT ( ) , vnn , mem_ctx , & dbmap ) ;
2007-05-06 04:04:37 +04:00
if ( ret ! = 0 ) {
2007-05-23 14:15:09 +04:00
DEBUG ( 0 , ( __location__ " Unable to get dbids from node :%u \n " , vnn ) ) ;
2007-05-06 04:04:37 +04:00
return - 1 ;
}
2007-05-06 04:12:42 +04:00
2007-05-06 04:04:37 +04:00
/* verify that all other nodes have all our databases */
ret = create_missing_remote_databases ( ctdb , nodemap , vnn , dbmap , mem_ctx ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to create missing remote databases \n " ) ) ;
2007-05-06 04:04:37 +04:00
return - 1 ;
}
2007-05-06 04:12:42 +04:00
/* verify that we have all the databases any other node has */
ret = create_missing_local_databases ( ctdb , nodemap , vnn , & dbmap , mem_ctx ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to create missing local databases \n " ) ) ;
2007-05-06 04:12:42 +04:00
return - 1 ;
2007-05-04 09:21:40 +04:00
}
2007-05-06 04:12:42 +04:00
2007-05-04 09:21:40 +04:00
/* verify that all other nodes have all our databases */
2007-05-06 04:04:37 +04:00
ret = create_missing_remote_databases ( ctdb , nodemap , vnn , dbmap , mem_ctx ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to create missing remote databases \n " ) ) ;
2007-05-06 04:04:37 +04:00
return - 1 ;
2007-05-04 09:21:40 +04:00
}
2007-05-06 04:04:37 +04:00
2007-06-17 17:31:44 +04:00
DEBUG ( 1 , ( __location__ " Recovery - created remote databases \n " ) ) ;
2007-05-08 08:51:55 +04:00
/* pull all remote databases onto the local node */
ret = pull_all_remote_databases ( ctdb , nodemap , vnn , dbmap , mem_ctx ) ;
2007-05-06 04:22:13 +04:00
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to pull remote databases \n " ) ) ;
2007-05-06 04:22:13 +04:00
return - 1 ;
2007-05-04 09:21:40 +04:00
}
2007-06-17 17:31:44 +04:00
DEBUG ( 1 , ( __location__ " Recovery - pulled remote databases \n " ) ) ;
2007-05-06 04:22:13 +04:00
2007-05-06 04:38:44 +04:00
/* push all local databases to the remote nodes */
ret = push_all_local_databases ( ctdb , nodemap , vnn , dbmap , mem_ctx ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to push local databases \n " ) ) ;
2007-05-06 04:38:44 +04:00
return - 1 ;
2007-05-04 09:21:40 +04:00
}
2007-06-17 17:31:44 +04:00
DEBUG ( 1 , ( __location__ " Recovery - pushed remote databases \n " ) ) ;
2007-05-06 04:38:44 +04:00
2007-06-07 09:18:55 +04:00
/* build a new vnn map with all the currently active and
unbanned nodes */
2007-08-22 06:38:31 +04:00
generation = new_generation ( ) ;
2007-05-10 02:49:57 +04:00
vnnmap = talloc ( mem_ctx , struct ctdb_vnn_map ) ;
CTDB_NO_MEMORY ( ctdb , vnnmap ) ;
2007-05-04 09:21:40 +04:00
vnnmap - > generation = generation ;
vnnmap - > size = num_active ;
2007-06-07 10:34:33 +04:00
vnnmap - > map = talloc_zero_array ( vnnmap , uint32_t , vnnmap - > size ) ;
2007-05-04 09:21:40 +04:00
for ( i = j = 0 ; i < nodemap - > num ; i + + ) {
2007-06-07 09:18:55 +04:00
if ( ! ( nodemap - > nodes [ i ] . flags & NODE_FLAGS_INACTIVE ) ) {
2007-09-04 03:50:07 +04:00
vnnmap - > map [ j + + ] = nodemap - > nodes [ i ] . pnn ;
2007-05-04 09:21:40 +04:00
}
}
2007-05-06 04:42:18 +04:00
/* update to the new vnnmap on all nodes */
ret = update_vnnmap_on_all_nodes ( ctdb , nodemap , vnn , vnnmap , mem_ctx ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to update vnnmap on all nodes \n " ) ) ;
2007-05-06 04:42:18 +04:00
return - 1 ;
2007-05-04 09:21:40 +04:00
}
2007-06-17 17:31:44 +04:00
DEBUG ( 1 , ( __location__ " Recovery - updated vnnmap \n " ) ) ;
2007-05-04 09:21:40 +04:00
2007-05-07 00:51:58 +04:00
/* update recmaster to point to us for all nodes */
ret = set_recovery_master ( ctdb , nodemap , vnn ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to set recovery master \n " ) ) ;
2007-05-07 00:51:58 +04:00
return - 1 ;
}
2007-06-17 17:31:44 +04:00
DEBUG ( 1 , ( __location__ " Recovery - updated recmaster \n " ) ) ;
2007-05-07 00:51:58 +04:00
2007-05-08 08:51:55 +04:00
/* repoint all local and remote database records to the local
node as being dmaster
*/
ret = update_dmaster_on_all_databases ( ctdb , nodemap , vnn , dbmap , mem_ctx ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to update dmaster on all databases \n " ) ) ;
2007-05-08 08:51:55 +04:00
return - 1 ;
}
2007-06-17 17:31:44 +04:00
DEBUG ( 1 , ( __location__ " Recovery - updated dmaster on all databases \n " ) ) ;
2007-06-07 09:18:55 +04:00
/*
update all nodes to have the same flags that we have
*/
ret = update_flags_on_all_nodes ( ctdb , nodemap ) ;
if ( ret ! = 0 ) {
DEBUG ( 0 , ( __location__ " Unable to update flags on all nodes \n " ) ) ;
return - 1 ;
}
2007-06-17 17:31:44 +04:00
DEBUG ( 1 , ( __location__ " Recovery - updated flags \n " ) ) ;
2007-05-23 11:21:14 +04:00
/*
run a vacuum operation on empty records
*/
ret = vacuum_all_databases ( ctdb , nodemap , dbmap ) ;
if ( ret ! = 0 ) {
DEBUG ( 0 , ( __location__ " Unable to vacuum all databases \n " ) ) ;
return - 1 ;
}
2007-06-17 17:31:44 +04:00
DEBUG ( 1 , ( __location__ " Recovery - vacuumed all databases \n " ) ) ;
2007-05-25 11:04:13 +04:00
/*
if enabled , tell nodes to takeover their public IPs
*/
2007-09-04 03:50:07 +04:00
if ( ctdb - > vnn_list ) {
2007-05-25 11:04:13 +04:00
ret = ctdb_takeover_run ( ctdb , nodemap ) ;
if ( ret ! = 0 ) {
2007-05-29 07:48:30 +04:00
DEBUG ( 0 , ( __location__ " Unable to setup public takeover addresses \n " ) ) ;
2007-05-25 11:04:13 +04:00
return - 1 ;
}
2007-06-17 17:31:44 +04:00
DEBUG ( 1 , ( __location__ " Recovery - done takeover \n " ) ) ;
2007-05-25 11:04:13 +04:00
}
2007-05-08 08:51:55 +04:00
2007-06-17 17:31:44 +04:00
2007-05-04 09:21:40 +04:00
/* disable recovery mode */
2007-05-06 03:53:12 +04:00
ret = set_recovery_mode ( ctdb , nodemap , CTDB_RECOVERY_NORMAL ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to set recovery mode to normal on cluster \n " ) ) ;
2007-05-06 03:53:12 +04:00
return - 1 ;
2007-05-04 09:21:40 +04:00
}
2007-05-25 18:05:30 +04:00
/* send a message to all clients telling them that the cluster
has been reconfigured */
2007-08-21 11:25:15 +04:00
ctdb_send_message ( ctdb , CTDB_BROADCAST_CONNECTED , CTDB_SRVID_RECONFIGURE , tdb_null ) ;
2007-05-04 09:21:40 +04:00
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Recovery complete \n " ) ) ;
2007-07-04 02:36:59 +04:00
/* We just finished a recovery successfully.
We now wait for rerecovery_timeout before we allow
another recovery to take place .
*/
DEBUG ( 0 , ( __location__ " New recoveries supressed for the rerecovery timeout \n " ) ) ;
ctdb_wait_timeout ( ctdb , ctdb - > tunable . rerecovery_timeout ) ;
DEBUG ( 0 , ( __location__ " Rerecovery timeout elapsed. Recovery reactivated. \n " ) ) ;
2007-05-04 09:21:40 +04:00
return 0 ;
2007-05-04 03:45:53 +04:00
}
2007-05-04 02:30:18 +04:00
2007-05-06 22:41:12 +04:00
2007-06-07 13:17:27 +04:00
/*
  payload describing one node's election candidacy.

  ctdb_election_win() compares the fields in declaration order: the
  number of connected nodes first, then the priority time, then the vnn.
 */
struct election_message {
	/* nodes the sender sees without NODE_FLAGS_DISCONNECTED */
	uint32_t num_connected;

	/* the sender's priority timestamp */
	struct timeval priority_time;

	/* the sender's node number; the final tie breaker */
	uint32_t vnn;
};
2007-06-07 13:17:27 +04:00
/*
form this nodes election data
*/
static void ctdb_election_data ( struct ctdb_recoverd * rec , struct election_message * em )
{
int ret , i ;
struct ctdb_node_map * nodemap ;
struct ctdb_context * ctdb = rec - > ctdb ;
ZERO_STRUCTP ( em ) ;
2007-09-04 04:06:36 +04:00
em - > vnn = rec - > ctdb - > pnn ;
2007-06-07 13:17:27 +04:00
em - > priority_time = rec - > priority_time ;
ret = ctdb_ctrl_getnodemap ( ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE , rec , & nodemap ) ;
if ( ret ! = 0 ) {
return ;
}
for ( i = 0 ; i < nodemap - > num ; i + + ) {
if ( ! ( nodemap - > nodes [ i ] . flags & NODE_FLAGS_DISCONNECTED ) ) {
em - > num_connected + + ;
}
}
talloc_free ( nodemap ) ;
}
/*
see if the given election data wins
*/
static bool ctdb_election_win ( struct ctdb_recoverd * rec , struct election_message * em )
{
struct election_message myem ;
int cmp ;
ctdb_election_data ( rec , & myem ) ;
/* try to use the most connected node */
cmp = ( int ) myem . num_connected - ( int ) em - > num_connected ;
/* then the longest running node */
if ( cmp = = 0 ) {
2007-06-07 13:21:55 +04:00
cmp = timeval_compare ( & em - > priority_time , & myem . priority_time ) ;
2007-06-07 13:17:27 +04:00
}
if ( cmp = = 0 ) {
cmp = ( int ) myem . vnn - ( int ) em - > vnn ;
}
return cmp > 0 ;
}
2007-06-07 09:18:55 +04:00
/*
send out an election request
*/
2007-06-07 12:37:27 +04:00
static int send_election_request ( struct ctdb_recoverd * rec , TALLOC_CTX * mem_ctx , uint32_t vnn )
2007-05-07 00:51:58 +04:00
{
int ret ;
TDB_DATA election_data ;
struct election_message emsg ;
uint64_t srvid ;
2007-06-07 12:37:27 +04:00
struct ctdb_context * ctdb = rec - > ctdb ;
2007-05-07 00:51:58 +04:00
2007-06-06 04:25:46 +04:00
srvid = CTDB_SRVID_RECOVERY ;
2007-05-07 00:51:58 +04:00
2007-06-07 13:17:27 +04:00
ctdb_election_data ( rec , & emsg ) ;
2007-05-07 00:51:58 +04:00
election_data . dsize = sizeof ( struct election_message ) ;
election_data . dptr = ( unsigned char * ) & emsg ;
/* first we assume we will win the election and set
recoverymaster to be ourself on the current node
*/
2007-05-24 07:49:27 +04:00
ret = ctdb_ctrl_setrecmaster ( ctdb , CONTROL_TIMEOUT ( ) , vnn , vnn ) ;
2007-05-07 00:51:58 +04:00
if ( ret ! = 0 ) {
2007-06-07 13:17:27 +04:00
DEBUG ( 0 , ( __location__ " failed to send recmaster election request \n " ) ) ;
2007-05-07 00:51:58 +04:00
return - 1 ;
}
/* send an election message to all active nodes */
ctdb_send_message ( ctdb , CTDB_BROADCAST_ALL , srvid , election_data ) ;
return 0 ;
}
2007-06-09 14:11:51 +04:00
/*
this function will unban all nodes in the cluster
*/
static void unban_all_nodes ( struct ctdb_context * ctdb )
{
int ret , i ;
struct ctdb_node_map * nodemap ;
TALLOC_CTX * tmp_ctx = talloc_new ( ctdb ) ;
ret = ctdb_ctrl_getnodemap ( ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE , tmp_ctx , & nodemap ) ;
if ( ret ! = 0 ) {
DEBUG ( 0 , ( __location__ " failed to get nodemap to unban all nodes \n " ) ) ;
return ;
}
for ( i = 0 ; i < nodemap - > num ; i + + ) {
if ( ( ! ( nodemap - > nodes [ i ] . flags & NODE_FLAGS_DISCONNECTED ) )
& & ( nodemap - > nodes [ i ] . flags & NODE_FLAGS_BANNED ) ) {
2007-09-04 03:50:07 +04:00
ctdb_ctrl_modflags ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ i ] . pnn , 0 , NODE_FLAGS_BANNED ) ;
2007-06-09 14:11:51 +04:00
}
}
talloc_free ( tmp_ctx ) ;
}
2007-05-07 00:51:58 +04:00
/*
handler for recovery master elections
*/
static void election_handler ( struct ctdb_context * ctdb , uint64_t srvid ,
2007-06-07 09:18:55 +04:00
TDB_DATA data , void * private_data )
2007-05-07 00:51:58 +04:00
{
2007-06-07 09:18:55 +04:00
struct ctdb_recoverd * rec = talloc_get_type ( private_data , struct ctdb_recoverd ) ;
2007-05-07 00:51:58 +04:00
int ret ;
struct election_message * em = ( struct election_message * ) data . dptr ;
TALLOC_CTX * mem_ctx ;
2007-05-10 07:10:23 +04:00
mem_ctx = talloc_new ( ctdb ) ;
2007-06-07 13:17:27 +04:00
2007-05-07 00:51:58 +04:00
/* someone called an election. check their election data
and if we disagree and we would rather be the elected node ,
send a new election message to all other nodes
*/
2007-06-07 13:17:27 +04:00
if ( ctdb_election_win ( rec , em ) ) {
2007-09-04 04:18:44 +04:00
ret = send_election_request ( rec , mem_ctx , ctdb_get_pnn ( ctdb ) ) ;
2007-05-07 00:51:58 +04:00
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " failed to initiate recmaster election " ) ) ;
2007-05-07 00:51:58 +04:00
}
talloc_free ( mem_ctx ) ;
2007-06-09 14:13:25 +04:00
/*unban_all_nodes(ctdb);*/
2007-05-07 00:51:58 +04:00
return ;
}
2007-05-23 08:35:19 +04:00
/* release the recmaster lock */
2007-09-04 04:06:36 +04:00
if ( em - > vnn ! = ctdb - > pnn & &
2007-06-03 04:29:14 +04:00
ctdb - > recovery_lock_fd ! = - 1 ) {
2007-06-02 05:36:42 +04:00
close ( ctdb - > recovery_lock_fd ) ;
ctdb - > recovery_lock_fd = - 1 ;
2007-06-09 14:11:51 +04:00
unban_all_nodes ( ctdb ) ;
2007-05-23 08:35:19 +04:00
}
2007-05-07 00:51:58 +04:00
/* ok, let that guy become recmaster then */
2007-09-04 04:18:44 +04:00
ret = ctdb_ctrl_setrecmaster ( ctdb , CONTROL_TIMEOUT ( ) , ctdb_get_pnn ( ctdb ) , em - > vnn ) ;
2007-05-07 00:51:58 +04:00
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " failed to send recmaster election request " ) ) ;
2007-05-07 00:51:58 +04:00
talloc_free ( mem_ctx ) ;
return ;
}
2007-06-07 10:34:33 +04:00
/* release any bans */
2007-06-07 09:18:55 +04:00
rec - > last_culprit = ( uint32_t ) - 1 ;
talloc_free ( rec - > banned_nodes ) ;
2007-06-07 10:34:33 +04:00
rec - > banned_nodes = talloc_zero_array ( rec , struct ban_state * , ctdb - > num_nodes ) ;
2007-06-07 09:18:55 +04:00
CTDB_NO_MEMORY_FATAL ( ctdb , rec - > banned_nodes ) ;
2007-05-07 00:51:58 +04:00
talloc_free ( mem_ctx ) ;
return ;
}
2007-06-07 09:18:55 +04:00
/*
force the start of the election process
*/
2007-06-07 12:37:27 +04:00
static void force_election ( struct ctdb_recoverd * rec , TALLOC_CTX * mem_ctx , uint32_t vnn ,
struct ctdb_node_map * nodemap )
2007-05-07 00:51:58 +04:00
{
int ret ;
2007-06-07 12:37:27 +04:00
struct ctdb_context * ctdb = rec - > ctdb ;
2007-05-10 03:48:14 +04:00
/* set all nodes to recovery mode to stop all internode traffic */
ret = set_recovery_mode ( ctdb , nodemap , CTDB_RECOVERY_ACTIVE ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to set recovery mode to active on cluster \n " ) ) ;
2007-05-10 03:48:14 +04:00
return ;
}
2007-05-07 00:51:58 +04:00
2007-06-07 12:37:27 +04:00
ret = send_election_request ( rec , mem_ctx , vnn ) ;
2007-05-07 00:51:58 +04:00
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " failed to initiate recmaster election " ) ) ;
2007-05-07 00:51:58 +04:00
return ;
}
2007-05-26 08:01:08 +04:00
/* wait for a few seconds to collect all responses */
2007-06-07 09:18:55 +04:00
ctdb_wait_timeout ( ctdb , ctdb - > tunable . election_timeout ) ;
}
/*
  handler for when a node changes its flags

  data carries a struct ctdb_node_flag_change (node, old flags, new
  flags).  The change is applied to a freshly fetched copy of the
  local nodemap, and if we are the recmaster, the cluster is in normal
  mode and public addresses are configured (ctdb->vnn_list), a change
  in the NODE_FLAGS_DISABLED bits triggers a takeover run followed by
  a CTDB_SRVID_RECONFIGURE broadcast.

  Note that c->new_flags inside the received message is modified in
  place.
 */
static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
			    TDB_DATA data, void *private_data)
{
	int ret;
	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
	struct ctdb_node_map *nodemap = NULL;
	TALLOC_CTX *tmp_ctx;
	uint32_t changed_flags;
	int i;

	if (data.dsize != sizeof(*c)) {
		DEBUG(0, (__location__ " Invalid data in ctdb_node_flag_change \n "));
		return;
	}

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);

	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
	if (ret != 0 || nodemap == NULL) {
		/* without a nodemap there is nothing we can update, and
		   dereferencing it below would crash the daemon */
		DEBUG(0, (__location__ " Unable to get nodemap in monitor_handler\n"));
		talloc_free(tmp_ctx);
		return;
	}

	/* locate the node the message is about */
	for (i = 0; i < nodemap->num; i++) {
		if (nodemap->nodes[i].pnn == c->vnn) break;
	}

	if (i == nodemap->num) {
		DEBUG(0, (__location__ " Flag change for non-existant node %u \n ", c->vnn));
		talloc_free(tmp_ctx);
		return;
	}

	/* computed before new_flags is adjusted below, so it reflects
	   what the sender reported */
	changed_flags = c->old_flags ^ c->new_flags;

	/* Dont let messages from remote nodes change the DISCONNECTED flag.
	   This flag is handled locally based on whether the local node
	   can communicate with the node or not.
	 */
	c->new_flags &= ~NODE_FLAGS_DISCONNECTED;
	if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
		c->new_flags |= NODE_FLAGS_DISCONNECTED;
	}

	if (nodemap->nodes[i].flags != c->new_flags) {
		DEBUG(0, (" Node %u has changed flags - now 0x%x was 0x%x \n ", c->vnn, c->new_flags, c->old_flags));
	}

	nodemap->nodes[i].flags = c->new_flags;

	ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
				     CTDB_CURRENT_NODE, &ctdb->recovery_master);

	if (ret == 0) {
		ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
					   CTDB_CURRENT_NODE, &ctdb->recovery_mode);
	}

	if (ret == 0 &&
	    ctdb->recovery_master == ctdb->pnn &&
	    ctdb->recovery_mode == CTDB_RECOVERY_NORMAL &&
	    ctdb->vnn_list) {
		/* Only do the takeover run if the perm disabled or unhealthy
		   flags changed since these will cause an ip failover but not
		   a recovery.
		   If the node became disconnected or banned this will also
		   lead to an ip address failover but that is handled
		   during recovery
		 */
		if (changed_flags & NODE_FLAGS_DISABLED) {
			ret = ctdb_takeover_run(ctdb, nodemap);
			if (ret != 0) {
				DEBUG(0, (__location__ " Unable to setup public takeover addresses \n "));
			}
			/* send a message to all clients telling them that the
			   cluster has been reconfigured */
			ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
		}
	}

	talloc_free(tmp_ctx);
}
2007-05-06 22:41:12 +04:00
2007-06-07 09:18:55 +04:00
2007-08-23 07:48:39 +04:00
2007-08-27 03:40:10 +04:00
/*
  shared state for the asynchronous recovery-mode check done by
  verify_recmode()
 */
struct verify_recmode_normal_data {
	/* number of getrecmode controls still outstanding */
	uint32_t count;
	/* combined verdict over all replies seen so far */
	enum monitor_result status;
};
static void verify_recmode_normal_callback ( struct ctdb_client_control_state * state )
{
struct verify_recmode_normal_data * rmdata = talloc_get_type ( state - > async . private , struct verify_recmode_normal_data ) ;
/* one more node has responded with recmode data*/
rmdata - > count - - ;
/* if we failed to get the recmode, then return an error and let
the main loop try again .
*/
if ( state - > state ! = CTDB_CONTROL_DONE ) {
if ( rmdata - > status = = MONITOR_OK ) {
rmdata - > status = MONITOR_FAILED ;
}
return ;
}
/* if we got a response, then the recmode will be stored in the
status field
*/
if ( state - > status ! = CTDB_RECOVERY_NORMAL ) {
DEBUG ( 0 , ( __location__ " Node:%u was in recovery mode. Restart recovery process \n " , state - > c - > hdr . destnode ) ) ;
rmdata - > status = MONITOR_RECOVERY_NEEDED ;
}
return ;
}
/* verify that all nodes are in normal recovery mode */
2007-08-23 13:27:09 +04:00
static enum monitor_result verify_recmode ( struct ctdb_context * ctdb , struct ctdb_node_map * nodemap )
2007-08-23 07:48:39 +04:00
{
2007-08-27 03:40:10 +04:00
struct verify_recmode_normal_data * rmdata ;
2007-08-23 13:27:09 +04:00
TALLOC_CTX * mem_ctx = talloc_new ( ctdb ) ;
2007-08-27 03:40:10 +04:00
struct ctdb_client_control_state * state ;
enum monitor_result status ;
int j ;
2007-08-23 07:48:39 +04:00
2007-08-27 03:40:10 +04:00
rmdata = talloc ( mem_ctx , struct verify_recmode_normal_data ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , rmdata ) ;
rmdata - > count = 0 ;
rmdata - > status = MONITOR_OK ;
2007-08-23 07:48:39 +04:00
/* loop over all active nodes and send an async getrecmode call to
them */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
continue ;
}
2007-08-27 03:40:10 +04:00
state = ctdb_ctrl_getrecmode_send ( ctdb , mem_ctx ,
2007-08-23 07:48:39 +04:00
CONTROL_TIMEOUT ( ) ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn ) ;
2007-08-27 03:40:10 +04:00
if ( state = = NULL ) {
/* we failed to send the control, treat this as
an error and try again next iteration
*/
DEBUG ( 0 , ( " Failed to call ctdb_ctrl_getrecmode_send during monitoring \n " ) ) ;
2007-08-23 13:27:09 +04:00
talloc_free ( mem_ctx ) ;
2007-08-23 07:48:39 +04:00
return MONITOR_FAILED ;
}
2007-08-23 13:27:09 +04:00
2007-08-27 03:40:10 +04:00
/* set up the callback functions */
state - > async . fn = verify_recmode_normal_callback ;
state - > async . private = rmdata ;
/* one more control to wait for to complete */
rmdata - > count + + ;
2007-08-23 07:48:39 +04:00
}
2007-08-27 03:40:10 +04:00
/* now wait for up to the maximum number of seconds allowed
or until all nodes we expect a response from has replied
*/
while ( rmdata - > count > 0 ) {
event_loop_once ( ctdb - > ev ) ;
}
status = rmdata - > status ;
2007-08-23 13:27:09 +04:00
talloc_free ( mem_ctx ) ;
2007-08-27 03:40:10 +04:00
return status ;
2007-08-23 07:48:39 +04:00
}
2007-08-27 03:40:10 +04:00
2007-08-23 13:27:09 +04:00
/*
  shared state for the asynchronous recmaster check done by
  verify_recmaster()
 */
struct verify_recmaster_data {
	/* number of getrecmaster controls still outstanding */
	uint32_t count;
	/* the node number every node is expected to report as recmaster */
	uint32_t vnn;
	/* combined verdict over all replies seen so far */
	enum monitor_result status;
};
2007-08-24 04:42:06 +04:00
static void verify_recmaster_callback ( struct ctdb_client_control_state * state )
2007-08-23 13:27:09 +04:00
{
2007-08-24 04:42:06 +04:00
struct verify_recmaster_data * rmdata = talloc_get_type ( state - > async . private , struct verify_recmaster_data ) ;
2007-08-23 13:27:09 +04:00
/* one more node has responded with recmaster data*/
rmdata - > count - - ;
/* if we failed to get the recmaster, then return an error and let
the main loop try again .
*/
2007-08-24 04:42:06 +04:00
if ( state - > state ! = CTDB_CONTROL_DONE ) {
2007-08-23 13:27:09 +04:00
if ( rmdata - > status = = MONITOR_OK ) {
rmdata - > status = MONITOR_FAILED ;
}
2007-08-24 04:42:06 +04:00
return ;
2007-08-23 13:27:09 +04:00
}
/* if we got a response, then the recmaster will be stored in the
status field
*/
2007-08-24 04:42:06 +04:00
if ( state - > status ! = rmdata - > vnn ) {
DEBUG ( 0 , ( " Node %d does not agree we are the recmaster. Need a new recmaster election \n " , state - > c - > hdr . destnode ) ) ;
2007-08-23 13:27:09 +04:00
rmdata - > status = MONITOR_ELECTION_NEEDED ;
}
2007-08-24 04:42:06 +04:00
return ;
2007-08-23 13:27:09 +04:00
}
/* verify that all nodes agree that we are the recmaster */
static enum monitor_result verify_recmaster ( struct ctdb_context * ctdb , struct ctdb_node_map * nodemap , uint32_t vnn )
{
struct verify_recmaster_data * rmdata ;
TALLOC_CTX * mem_ctx = talloc_new ( ctdb ) ;
struct ctdb_client_control_state * state ;
enum monitor_result status ;
int j ;
rmdata = talloc ( mem_ctx , struct verify_recmaster_data ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , rmdata ) ;
rmdata - > count = 0 ;
rmdata - > vnn = vnn ;
rmdata - > status = MONITOR_OK ;
/* loop over all active nodes and send an async getrecmaster call to
them */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
continue ;
}
state = ctdb_ctrl_getrecmaster_send ( ctdb , mem_ctx ,
2007-08-23 13:38:54 +04:00
CONTROL_TIMEOUT ( ) ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn ) ;
2007-08-23 13:27:09 +04:00
if ( state = = NULL ) {
/* we failed to send the control, treat this as
an error and try again next iteration
*/
DEBUG ( 0 , ( " Failed to call ctdb_ctrl_getrecmaster_send during monitoring \n " ) ) ;
talloc_free ( mem_ctx ) ;
return MONITOR_FAILED ;
}
2007-08-24 04:42:06 +04:00
/* set up the callback functions */
state - > async . fn = verify_recmaster_callback ;
state - > async . private = rmdata ;
2007-08-23 13:27:09 +04:00
/* one more control to wait for to complete */
rmdata - > count + + ;
}
/* now wait for up to the maximum number of seconds allowed
or until all nodes we expect a response from has replied
*/
2007-08-23 13:38:54 +04:00
while ( rmdata - > count > 0 ) {
2007-08-23 13:27:09 +04:00
event_loop_once ( ctdb - > ev ) ;
}
status = rmdata - > status ;
talloc_free ( mem_ctx ) ;
return status ;
}
2007-06-07 09:18:55 +04:00
2007-06-04 14:22:44 +04:00
/*
the main monitoring loop

Allocates the recovery daemon state ( struct ctdb_recoverd ) , registers
the election , flag - change , ban and unban message handlers , and then
loops forever via " goto again " . There is no return statement : every
path either jumps back to " again " or exits the process .

Each pass of the loop :
- recreates the temporary talloc context and waits recover_interval
- refreshes the tunables , the local vnn , the vnnmap and the nodemap
- applies the local ban table to the nodemap and counts active nodes
- forces an election if the recmaster is unset , unknown or inactive
- stops here unless this node is the recmaster
- verifies that all active nodes agree on the recmaster and are in
normal recovery mode
- compares nodemaps and vnnmaps with every active remote node and
calls do_recovery ( ) on the first mismatch
- finally does a takeover run if a remote node reported different
flags for itself and public addresses are configured
*/
2007-06-05 11:57:07 +04:00
static void monitor_cluster ( struct ctdb_context * ctdb )
2007-05-04 02:30:18 +04:00
{
2007-08-23 07:48:39 +04:00
uint32_t vnn , num_active , recmaster ;
2007-05-04 02:30:18 +04:00
TALLOC_CTX * mem_ctx = NULL ;
2007-05-04 03:01:01 +04:00
struct ctdb_node_map * nodemap = NULL ;
2007-05-04 03:45:53 +04:00
struct ctdb_node_map * remote_nodemap = NULL ;
struct ctdb_vnn_map * vnnmap = NULL ;
struct ctdb_vnn_map * remote_vnnmap = NULL ;
int i , j , ret ;
2007-06-06 04:25:46 +04:00
bool need_takeover_run ;
2007-06-07 09:18:55 +04:00
struct ctdb_recoverd * rec ;
/* one - time setup of the recovery daemon state ; priority_time is
set once here and is what elections compare */
rec = talloc_zero ( ctdb , struct ctdb_recoverd ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , rec ) ;
rec - > ctdb = ctdb ;
2007-06-07 10:34:33 +04:00
rec - > banned_nodes = talloc_zero_array ( rec , struct ban_state * , ctdb - > num_nodes ) ;
2007-06-07 09:18:55 +04:00
CTDB_NO_MEMORY_FATAL ( ctdb , rec - > banned_nodes ) ;
2007-06-07 12:37:27 +04:00
rec - > priority_time = timeval_current ( ) ;
2007-06-07 09:18:55 +04:00
/* register a message port for recovery elections */
ctdb_set_message_handler ( ctdb , CTDB_SRVID_RECOVERY , election_handler , rec ) ;
/* and one for when nodes are disabled/enabled */
ctdb_set_message_handler ( ctdb , CTDB_SRVID_NODE_FLAGS_CHANGED , monitor_handler , rec ) ;
2007-06-07 10:34:33 +04:00
/* and one for when nodes are banned */
ctdb_set_message_handler ( ctdb , CTDB_SRVID_BAN_NODE , ban_handler , rec ) ;
/* and one for when nodes are unbanned */
ctdb_set_message_handler ( ctdb , CTDB_SRVID_UNBAN_NODE , unban_handler , rec ) ;
2007-05-04 02:30:18 +04:00
/* start of one monitoring pass ; everything allocated on mem_ctx
during the previous pass is released here */
again :
2007-06-06 04:25:46 +04:00
need_takeover_run = false ;
2007-05-04 02:30:18 +04:00
if ( mem_ctx ) {
talloc_free ( mem_ctx ) ;
mem_ctx = NULL ;
}
mem_ctx = talloc_new ( ctdb ) ;
if ( ! mem_ctx ) {
DEBUG ( 0 , ( " Failed to create temporary context \n " ) ) ;
exit ( - 1 ) ;
}
/* wait recover_interval ( a tunable ) between passes */
2007-06-07 09:18:55 +04:00
ctdb_wait_timeout ( ctdb , ctdb - > tunable . recover_interval ) ;
2007-06-04 14:22:44 +04:00
/* get relevant tunables */
2007-06-07 12:05:25 +04:00
ret = ctdb_ctrl_get_all_tunables ( ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE , & ctdb - > tunable ) ;
if ( ret ! = 0 ) {
DEBUG ( 0 , ( " Failed to get tunables - retrying \n " ) ) ;
goto again ;
}
2007-05-04 02:30:18 +04:00
2007-05-24 07:49:27 +04:00
/* ask the local daemon for our own node number */
vnn = ctdb_ctrl_getvnn ( ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE ) ;
2007-05-23 08:35:19 +04:00
if ( vnn = = ( uint32_t ) - 1 ) {
DEBUG ( 0 , ( " Failed to get local vnn - retrying \n " ) ) ;
goto again ;
}
2007-05-04 02:30:18 +04:00
2007-05-06 22:41:12 +04:00
/* get the vnnmap */
2007-05-24 07:49:27 +04:00
ret = ctdb_ctrl_getvnnmap ( ctdb , CONTROL_TIMEOUT ( ) , vnn , mem_ctx , & vnnmap ) ;
2007-05-06 22:41:12 +04:00
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to get vnnmap from node %u \n " , vnn ) ) ;
2007-05-06 22:41:12 +04:00
goto again ;
}
2007-05-04 02:30:18 +04:00
/* get number of nodes */
2007-05-24 07:49:27 +04:00
ret = ctdb_ctrl_getnodemap ( ctdb , CONTROL_TIMEOUT ( ) , vnn , mem_ctx , & nodemap ) ;
2007-05-04 03:01:01 +04:00
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to get nodemap from node %u \n " , vnn ) ) ;
2007-05-04 03:01:01 +04:00
goto again ;
}
2007-05-04 02:30:18 +04:00
2007-05-06 22:41:12 +04:00
2007-05-04 03:45:53 +04:00
/* count how many active nodes there are ; the BANNED flag in our
copy of the nodemap is first overwritten from the local ban
table , which is indexed by pnn */
num_active = 0 ;
for ( i = 0 ; i < nodemap - > num ; i + + ) {
2007-09-04 03:50:07 +04:00
if ( rec - > banned_nodes [ nodemap - > nodes [ i ] . pnn ] ! = NULL ) {
2007-06-07 09:18:55 +04:00
nodemap - > nodes [ i ] . flags | = NODE_FLAGS_BANNED ;
} else {
nodemap - > nodes [ i ] . flags & = ~ NODE_FLAGS_BANNED ;
}
if ( ! ( nodemap - > nodes [ i ] . flags & NODE_FLAGS_INACTIVE ) ) {
2007-05-04 03:45:53 +04:00
num_active + + ;
}
}
2007-05-07 00:51:58 +04:00
/* check which node is the recovery master */
2007-08-23 13:27:09 +04:00
ret = ctdb_ctrl_getrecmaster ( ctdb , mem_ctx , CONTROL_TIMEOUT ( ) , vnn , & recmaster ) ;
2007-05-07 00:51:58 +04:00
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to get recmaster from node %u \n " , vnn ) ) ;
2007-05-07 00:51:58 +04:00
goto again ;
}
2007-05-23 08:35:19 +04:00
/* ( uint32_t ) - 1 means no recmaster has been chosen yet */
if ( recmaster = = ( uint32_t ) - 1 ) {
DEBUG ( 0 , ( __location__ " Initial recovery master set - forcing election \n " ) ) ;
2007-06-07 12:37:27 +04:00
force_election ( rec , mem_ctx , vnn , nodemap ) ;
2007-05-23 08:35:19 +04:00
goto again ;
}
2007-05-07 00:51:58 +04:00
/* verify that the recmaster node is still active ; after this
loop j is the index of the recmaster in nodemap , or
nodemap - > num if it is not listed */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
2007-09-04 03:50:07 +04:00
if ( nodemap - > nodes [ j ] . pnn = = recmaster ) {
2007-05-07 00:51:58 +04:00
break ;
}
2007-05-10 07:10:23 +04:00
}
2007-05-23 08:35:19 +04:00
if ( j = = nodemap - > num ) {
DEBUG ( 0 , ( " Recmaster node %u not in list. Force reelection \n " , recmaster ) ) ;
2007-06-07 12:37:27 +04:00
force_election ( rec , mem_ctx , vnn , nodemap ) ;
2007-05-23 08:35:19 +04:00
goto again ;
}
2007-06-07 09:18:55 +04:00
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
2007-09-04 03:50:07 +04:00
DEBUG ( 0 , ( " Recmaster node %u no longer available. Force reelection \n " , nodemap - > nodes [ j ] . pnn ) ) ;
2007-06-07 12:37:27 +04:00
force_election ( rec , mem_ctx , vnn , nodemap ) ;
2007-05-07 00:51:58 +04:00
goto again ;
}
/* if we are not the recmaster then we do not need to check
if recovery is needed
*/
if ( vnn ! = recmaster ) {
goto again ;
}
/* verify that all active nodes agree that we are the recmaster */
2007-08-23 13:27:09 +04:00
switch ( verify_recmaster ( ctdb , nodemap , vnn ) ) {
case MONITOR_RECOVERY_NEEDED :
/* can not happen */
goto again ;
case MONITOR_ELECTION_NEEDED :
force_election ( rec , mem_ctx , vnn , nodemap ) ;
goto again ;
case MONITOR_OK :
break ;
case MONITOR_FAILED :
goto again ;
2007-05-07 00:51:58 +04:00
}
2007-05-06 22:41:12 +04:00
/* verify that all active nodes are in normal mode
and not in recovery mode
*/
2007-08-23 13:27:09 +04:00
switch ( verify_recmode ( ctdb , nodemap ) ) {
2007-08-23 07:48:39 +04:00
case MONITOR_RECOVERY_NEEDED :
2007-09-04 03:50:07 +04:00
/* NOTE(review): j still holds the index found by the recmaster
search above , so the last argument is the recmaster's pnn ,
which equals vnn at this point . Presumably this argument names
the node blamed for the recovery - confirm against do_recovery */
do_recovery ( rec , mem_ctx , vnn , num_active , nodemap , vnnmap , nodemap - > nodes [ j ] . pnn ) ;
2007-08-23 07:48:39 +04:00
goto again ;
case MONITOR_FAILED :
goto again ;
2007-08-23 13:27:09 +04:00
case MONITOR_ELECTION_NEEDED :
/* can not happen */
/* no break here : this case falls through to MONITOR_OK */
2007-08-23 07:48:39 +04:00
case MONITOR_OK :
break ;
2007-05-06 22:41:12 +04:00
}
2007-08-23 07:48:39 +04:00
2007-05-04 03:45:53 +04:00
/* get the nodemap for all active remote nodes and verify
they are the same as for this node
*/
for ( j = 0 ; j < nodemap - > num ; j + + ) {
2007-06-07 09:18:55 +04:00
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
2007-05-04 03:45:53 +04:00
continue ;
}
2007-09-04 03:50:07 +04:00
if ( nodemap - > nodes [ j ] . pnn = = vnn ) {
2007-05-04 03:45:53 +04:00
continue ;
}
2007-09-04 03:50:07 +04:00
/* remote_nodemap is allocated on mem_ctx , so it is only released
at the start of the next pass */
ret = ctdb_ctrl_getnodemap ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ j ] . pnn ,
2007-06-07 12:39:37 +04:00
mem_ctx , & remote_nodemap ) ;
2007-05-04 03:45:53 +04:00
if ( ret ! = 0 ) {
2007-06-07 12:39:37 +04:00
DEBUG ( 0 , ( __location__ " Unable to get nodemap from remote node %u \n " ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn ) ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
/* if the nodes disagree on how many nodes there are
then this is a good reason to try recovery
*/
if ( remote_nodemap - > num ! = nodemap - > num ) {
2007-06-07 09:18:55 +04:00
DEBUG ( 0 , ( __location__ " Remote node:%u has different node count. %u vs %u of the local node \n " ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn , remote_nodemap - > num , nodemap - > num ) ) ;
do_recovery ( rec , mem_ctx , vnn , num_active , nodemap , vnnmap , nodemap - > nodes [ j ] . pnn ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
/* if the nodes disagree on which nodes exist and are
active , then that is also a good reason to do recovery
*/
for ( i = 0 ; i < nodemap - > num ; i + + ) {
2007-09-04 03:50:07 +04:00
if ( remote_nodemap - > nodes [ i ] . pnn ! = nodemap - > nodes [ i ] . pnn ) {
2007-06-11 15:37:09 +04:00
DEBUG ( 0 , ( __location__ " Remote node:%u has different nodemap vnn for %d (%u vs %u). \n " ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn , i ,
remote_nodemap - > nodes [ i ] . pnn , nodemap - > nodes [ i ] . pnn ) ) ;
2007-06-11 15:37:09 +04:00
do_recovery ( rec , mem_ctx , vnn , num_active , nodemap ,
2007-09-04 03:50:07 +04:00
vnnmap , nodemap - > nodes [ j ] . pnn ) ;
2007-06-11 15:37:09 +04:00
goto again ;
}
2007-07-09 06:55:15 +04:00
/* only the INACTIVE bits are compared here ; other flag
differences do not trigger a recovery */
if ( ( remote_nodemap - > nodes [ i ] . flags & NODE_FLAGS_INACTIVE ) ! =
( nodemap - > nodes [ i ] . flags & NODE_FLAGS_INACTIVE ) ) {
DEBUG ( 0 , ( __location__ " Remote node:%u has different nodemap flag for %d (0x%x vs 0x%x) \n " ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn , i ,
2007-06-11 15:37:09 +04:00
remote_nodemap - > nodes [ i ] . flags , nodemap - > nodes [ i ] . flags ) ) ;
2007-06-07 12:39:37 +04:00
do_recovery ( rec , mem_ctx , vnn , num_active , nodemap ,
2007-09-04 03:50:07 +04:00
vnnmap , nodemap - > nodes [ j ] . pnn ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
}
2007-06-06 04:25:46 +04:00
/* update our nodemap flags according to the other
server - this gets the NODE_FLAGS_DISABLED
flag . Note that the remote node is authoritative
for its flags ( except CONNECTED , which we know
matches in this code ) */
if ( nodemap - > nodes [ j ] . flags ! = remote_nodemap - > nodes [ j ] . flags ) {
nodemap - > nodes [ j ] . flags = remote_nodemap - > nodes [ j ] . flags ;
need_takeover_run = true ;
}
2007-05-04 03:45:53 +04:00
}
/* there better be the same number of lmasters in the vnn map
2007-05-10 07:10:23 +04:00
as there are active nodes or we will have to do a recovery
2007-05-04 03:45:53 +04:00
*/
if ( vnnmap - > size ! = num_active ) {
2007-06-07 09:18:55 +04:00
DEBUG ( 0 , ( __location__ " The vnnmap count is different from the number of active nodes. %u vs %u \n " ,
vnnmap - > size , num_active ) ) ;
2007-09-04 04:06:36 +04:00
do_recovery ( rec , mem_ctx , vnn , num_active , nodemap , vnnmap , ctdb - > pnn ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
/* verify that all active nodes in the nodemap also exist in
the vnnmap .
*/
for ( j = 0 ; j < nodemap - > num ; j + + ) {
2007-06-07 09:18:55 +04:00
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
2007-05-04 03:45:53 +04:00
continue ;
}
2007-09-04 03:50:07 +04:00
if ( nodemap - > nodes [ j ] . pnn = = vnn ) {
2007-05-04 03:45:53 +04:00
continue ;
}
for ( i = 0 ; i < vnnmap - > size ; i + + ) {
2007-09-04 03:50:07 +04:00
if ( vnnmap - > map [ i ] = = nodemap - > nodes [ j ] . pnn ) {
2007-05-04 03:45:53 +04:00
break ;
}
}
2007-06-07 09:18:55 +04:00
if ( i = = vnnmap - > size ) {
DEBUG ( 0 , ( __location__ " Node %u is active in the nodemap but did not exist in the vnnmap \n " ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn ) ) ;
do_recovery ( rec , mem_ctx , vnn , num_active , nodemap , vnnmap , nodemap - > nodes [ j ] . pnn ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
}
2007-05-04 05:57:45 +04:00
/* verify that all other nodes have the same vnnmap
and are from the same generation
*/
2007-05-04 03:45:53 +04:00
for ( j = 0 ; j < nodemap - > num ; j + + ) {
2007-06-07 09:18:55 +04:00
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
2007-05-04 03:45:53 +04:00
continue ;
}
2007-09-04 03:50:07 +04:00
if ( nodemap - > nodes [ j ] . pnn = = vnn ) {
2007-05-04 03:45:53 +04:00
continue ;
}
2007-09-04 03:50:07 +04:00
ret = ctdb_ctrl_getvnnmap ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ j ] . pnn ,
2007-06-07 12:39:37 +04:00
mem_ctx , & remote_vnnmap ) ;
2007-05-04 03:45:53 +04:00
if ( ret ! = 0 ) {
2007-06-07 12:39:37 +04:00
DEBUG ( 0 , ( __location__ " Unable to get vnnmap from remote node %u \n " ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn ) ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
2007-05-04 05:57:45 +04:00
/* verify the vnnmap generation is the same */
if ( vnnmap - > generation ! = remote_vnnmap - > generation ) {
2007-06-07 09:18:55 +04:00
DEBUG ( 0 , ( __location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours) \n " ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn , remote_vnnmap - > generation , vnnmap - > generation ) ) ;
do_recovery ( rec , mem_ctx , vnn , num_active , nodemap , vnnmap , nodemap - > nodes [ j ] . pnn ) ;
2007-05-04 05:57:45 +04:00
goto again ;
}
2007-05-04 03:45:53 +04:00
/* verify the vnnmap size is the same */
if ( vnnmap - > size ! = remote_vnnmap - > size ) {
2007-06-07 09:18:55 +04:00
DEBUG ( 0 , ( __location__ " Remote node %u has different size of vnnmap. %u vs %u (ours) \n " ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn , remote_vnnmap - > size , vnnmap - > size ) ) ;
do_recovery ( rec , mem_ctx , vnn , num_active , nodemap , vnnmap , nodemap - > nodes [ j ] . pnn ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
/* verify the vnnmap is the same */
for ( i = 0 ; i < vnnmap - > size ; i + + ) {
if ( remote_vnnmap - > map [ i ] ! = vnnmap - > map [ i ] ) {
2007-06-07 09:18:55 +04:00
DEBUG ( 0 , ( __location__ " Remote node %u has different vnnmap. \n " ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn ) ) ;
2007-06-07 12:39:37 +04:00
do_recovery ( rec , mem_ctx , vnn , num_active , nodemap ,
2007-09-04 03:50:07 +04:00
vnnmap , nodemap - > nodes [ j ] . pnn ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
}
}
2007-06-06 04:25:46 +04:00
/* we might need to change who has what IP assigned ; a failed
takeover run is only logged and the loop carries on */
2007-09-04 03:50:07 +04:00
if ( need_takeover_run & & ctdb - > vnn_list ) {
2007-06-06 04:25:46 +04:00
ret = ctdb_takeover_run ( ctdb , nodemap ) ;
if ( ret ! = 0 ) {
DEBUG ( 0 , ( __location__ " Unable to setup public takeover addresses \n " ) ) ;
}
}
2007-05-04 03:45:53 +04:00
goto again ;
2007-05-04 02:30:18 +04:00
}
2007-06-06 04:25:46 +04:00
/*
  fd event handler installed by ctdb_start_recoverd() on the read end
  of the pipe shared with the main ctdbd

  When the event fires the main daemon is taken to have died: log the
  fact and terminate the recovery daemon at once with exit status 1.
  None of the arguments are used.
 */
static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
				 uint16_t flags, void *private_data)
{
	DEBUG(0, (" recovery daemon parent died - exiting \n "));

	/* _exit(): leave immediately, without running exit handlers */
	_exit(1);
}
2007-06-07 09:18:55 +04:00
/*
startup the recovery daemon as a child of the main ctdb daemon
*/
2007-05-15 09:13:36 +04:00
int ctdb_start_recoverd ( struct ctdb_context * ctdb )
2007-05-04 02:30:18 +04:00
{
int ret ;
2007-05-15 09:13:36 +04:00
int fd [ 2 ] ;
pid_t child ;
2007-05-04 02:30:18 +04:00
2007-05-15 09:13:36 +04:00
if ( pipe ( fd ) ! = 0 ) {
return - 1 ;
2007-05-04 02:30:18 +04:00
}
2007-05-15 09:13:36 +04:00
child = fork ( ) ;
if ( child = = - 1 ) {
return - 1 ;
2007-05-04 02:30:18 +04:00
}
2007-05-15 09:13:36 +04:00
if ( child ! = 0 ) {
close ( fd [ 0 ] ) ;
return 0 ;
2007-05-04 02:30:18 +04:00
}
2007-05-15 09:13:36 +04:00
close ( fd [ 1 ] ) ;
2007-06-02 02:41:19 +04:00
/* shutdown the transport */
ctdb - > methods - > shutdown ( ctdb ) ;
/* get a new event context */
2007-05-30 07:26:50 +04:00
talloc_free ( ctdb - > ev ) ;
ctdb - > ev = event_context_init ( ctdb ) ;
2007-05-15 09:13:36 +04:00
event_add_fd ( ctdb - > ev , ctdb , fd [ 0 ] , EVENT_FD_READ | EVENT_FD_AUTOCLOSE ,
ctdb_recoverd_parent , & fd [ 0 ] ) ;
2007-05-10 07:10:23 +04:00
2007-05-15 09:13:36 +04:00
close ( ctdb - > daemon . sd ) ;
ctdb - > daemon . sd = - 1 ;
srandom ( getpid ( ) ^ time ( NULL ) ) ;
2007-05-04 02:30:18 +04:00
/* initialise ctdb */
2007-05-15 09:13:36 +04:00
ret = ctdb_socket_connect ( ctdb ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Failed to init ctdb \n " ) ) ;
2007-05-04 02:30:18 +04:00
exit ( 1 ) ;
}
2007-05-15 09:13:36 +04:00
monitor_cluster ( ctdb ) ;
2007-05-07 00:51:58 +04:00
2007-05-15 09:13:36 +04:00
DEBUG ( 0 , ( " ERROR: ctdb_recoverd finished!? \n " ) ) ;
return - 1 ;
2007-05-04 02:30:18 +04:00
}