2007-05-04 02:30:18 +04:00
/*
ctdb recovery daemon
Copyright ( C ) Ronnie Sahlberg 2007
2007-05-31 07:50:53 +04:00
This program is free software ; you can redistribute it and / or modify
it under the terms of the GNU General Public License as published by
2007-07-10 09:29:31 +04:00
the Free Software Foundation ; either version 3 of the License , or
2007-05-31 07:50:53 +04:00
( at your option ) any later version .
This program is distributed in the hope that it will be useful ,
2007-05-04 02:30:18 +04:00
but WITHOUT ANY WARRANTY ; without even the implied warranty of
2007-05-31 07:50:53 +04:00
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
GNU General Public License for more details .
You should have received a copy of the GNU General Public License
2007-07-10 09:29:31 +04:00
along with this program; if not, see <http://www.gnu.org/licenses/>.
2007-05-04 02:30:18 +04:00
*/
# include "includes.h"
# include "lib/events/events.h"
# include "system/filesys.h"
2007-05-10 08:06:48 +04:00
# include "system/time.h"
2007-09-14 04:16:36 +04:00
# include "system/network.h"
2007-10-22 06:34:08 +04:00
# include "system/wait.h"
2007-05-04 02:30:18 +04:00
# include "popt.h"
# include "cmdline.h"
# include "../include/ctdb.h"
# include "../include/ctdb_private.h"
2008-01-06 04:38:01 +03:00
# include "db_wrap.h"
2008-01-08 13:28:42 +03:00
# include "dlinklist.h"
2007-05-04 02:30:18 +04:00
2007-06-07 10:34:33 +04:00
/*
  record of one active ban; allocated in ctdb_ban_node() and stored in
  ctdb_recoverd.banned_nodes[] at the index of the banned pnn
*/
struct ban_state {
	struct ctdb_recoverd *rec;	/* recovery daemon state that owns this ban */
	uint32_t banned_node;		/* pnn of the banned node */
};
2007-06-07 09:18:55 +04:00
/*
  private state of recovery daemon
*/
struct ctdb_recoverd {
	struct ctdb_context *ctdb;
	int rec_file_fd;
	uint32_t recmaster;
	uint32_t num_active;
	uint32_t num_connected;
	struct ctdb_node_map *nodemap;

	/* culprit bookkeeping, maintained by ctdb_set_culprit() */
	uint32_t last_culprit;
	uint32_t culprit_counter;
	struct timeval first_recover_time;

	/* ban records indexed by pnn; an entry is NULL when no ban is recorded */
	struct ban_state **banned_nodes;

	/* reset to the current time when this node bans itself */
	struct timeval priority_time;

	bool need_takeover_run;
	bool need_recovery;

	/* NODE_FLAGS_BANNED is set and cleared here by the ban/unban code */
	uint32_t node_flags;

	struct timed_event *send_election_te;
	struct timed_event *election_timeout;

	/* list of struct vacuum_info; entries unlink themselves in
	   vacuum_info_destructor() */
	struct vacuum_info *vacuum_info;
};
2007-05-04 02:30:18 +04:00
2007-06-04 14:22:44 +04:00
/*
  timeouts derived from the tunables.  Both macros expand to an
  expression that reads a variable called "ctdb", so one must be in
  scope wherever they are used.
*/
#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
2007-05-24 07:49:27 +04:00
2008-01-05 01:35:43 +03:00
2007-06-07 10:34:33 +04:00
/*
unban a node
*/
2007-09-04 04:33:10 +04:00
static void ctdb_unban_node ( struct ctdb_recoverd * rec , uint32_t pnn )
2007-06-07 10:34:33 +04:00
{
struct ctdb_context * ctdb = rec - > ctdb ;
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " Unbanning node %u \n " , pnn ) ) ;
2007-11-23 04:36:14 +03:00
2007-09-04 04:33:10 +04:00
if ( ! ctdb_validate_pnn ( ctdb , pnn ) ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( " Bad pnn %u in ctdb_unban_node \n " , pnn ) ) ;
2007-06-07 10:48:31 +04:00
return ;
}
2007-12-03 07:45:53 +03:00
/* If we are unbanning a different node then just pass the ban info on */
if ( pnn ! = ctdb - > pnn ) {
TDB_DATA data ;
int ret ;
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " Unanning remote node %u. Passing the ban request on to the remote node. \n " , pnn ) ) ;
2007-12-03 07:45:53 +03:00
data . dptr = ( uint8_t * ) & pnn ;
data . dsize = sizeof ( uint32_t ) ;
ret = ctdb_send_message ( ctdb , pnn , CTDB_SRVID_UNBAN_NODE , data ) ;
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( " Failed to unban node %u \n " , pnn ) ) ;
2007-12-03 07:45:53 +03:00
return ;
}
return ;
2007-11-23 04:36:14 +03:00
}
2007-12-03 07:45:53 +03:00
/* make sure we remember we are no longer banned in case
there is an election */
rec - > node_flags & = ~ NODE_FLAGS_BANNED ;
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_INFO , ( " Clearing ban flag on node %u \n " , pnn ) ) ;
2007-12-03 07:45:53 +03:00
ctdb_ctrl_modflags ( ctdb , CONTROL_TIMEOUT ( ) , pnn , 0 , NODE_FLAGS_BANNED ) ;
2007-09-04 04:33:10 +04:00
if ( rec - > banned_nodes [ pnn ] = = NULL ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_INFO , ( " No ban recorded for this node. ctdb_unban_node() request ignored \n " ) ) ;
2007-06-07 10:34:33 +04:00
return ;
}
2007-09-04 04:33:10 +04:00
talloc_free ( rec - > banned_nodes [ pnn ] ) ;
rec - > banned_nodes [ pnn ] = NULL ;
2007-06-07 10:34:33 +04:00
}
/*
called when a ban has timed out
*/
static void ctdb_ban_timeout ( struct event_context * ev , struct timed_event * te , struct timeval t , void * p )
{
struct ban_state * state = talloc_get_type ( p , struct ban_state ) ;
struct ctdb_recoverd * rec = state - > rec ;
2007-09-04 04:33:10 +04:00
uint32_t pnn = state - > banned_node ;
2007-06-07 10:34:33 +04:00
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " Ban timeout. Node %u is now unbanned \n " , pnn ) ) ;
2007-09-04 04:33:10 +04:00
ctdb_unban_node ( rec , pnn ) ;
2007-06-07 10:34:33 +04:00
}
/*
ban a node for a period of time
*/
2007-09-04 04:33:10 +04:00
static void ctdb_ban_node ( struct ctdb_recoverd * rec , uint32_t pnn , uint32_t ban_time )
2007-06-07 10:34:33 +04:00
{
struct ctdb_context * ctdb = rec - > ctdb ;
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " Banning node %u for %u seconds \n " , pnn , ban_time ) ) ;
2007-11-23 04:36:14 +03:00
2007-09-04 04:33:10 +04:00
if ( ! ctdb_validate_pnn ( ctdb , pnn ) ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( " Bad pnn %u in ctdb_ban_node \n " , pnn ) ) ;
2007-06-07 10:48:31 +04:00
return ;
}
2007-10-15 07:22:58 +04:00
if ( 0 = = ctdb - > tunable . enable_bans ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_INFO , ( " Bans are disabled - ignoring ban of node %u \n " , pnn ) ) ;
2007-10-15 07:22:58 +04:00
return ;
}
2007-12-03 07:45:53 +03:00
/* If we are banning a different node then just pass the ban info on */
if ( pnn ! = ctdb - > pnn ) {
struct ctdb_ban_info b ;
TDB_DATA data ;
int ret ;
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " Banning remote node %u for %u seconds. Passing the ban request on to the remote node. \n " , pnn , ban_time ) ) ;
2007-12-03 07:45:53 +03:00
b . pnn = pnn ;
b . ban_time = ban_time ;
data . dptr = ( uint8_t * ) & b ;
data . dsize = sizeof ( b ) ;
ret = ctdb_send_message ( ctdb , pnn , CTDB_SRVID_BAN_NODE , data ) ;
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( " Failed to ban node %u \n " , pnn ) ) ;
2007-12-03 07:45:53 +03:00
return ;
}
2007-11-23 04:36:14 +03:00
2007-12-03 07:45:53 +03:00
return ;
2007-06-07 12:37:27 +04:00
}
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " self ban - lowering our election priority \n " ) ) ;
2007-09-04 04:33:10 +04:00
ctdb_ctrl_modflags ( ctdb , CONTROL_TIMEOUT ( ) , pnn , NODE_FLAGS_BANNED , 0 ) ;
2007-06-07 10:34:33 +04:00
2007-12-03 07:45:53 +03:00
/* banning ourselves - lower our election priority */
rec - > priority_time = timeval_current ( ) ;
/* make sure we remember we are banned in case there is an
election */
rec - > node_flags | = NODE_FLAGS_BANNED ;
2007-11-23 04:38:37 +03:00
if ( rec - > banned_nodes [ pnn ] ! = NULL ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " Re-banning an already banned node. Remove previous ban and set a new ban. \n " ) ) ;
2007-11-23 04:38:37 +03:00
talloc_free ( rec - > banned_nodes [ pnn ] ) ;
rec - > banned_nodes [ pnn ] = NULL ;
}
2007-12-03 03:39:17 +03:00
rec - > banned_nodes [ pnn ] = talloc ( rec - > banned_nodes , struct ban_state ) ;
2007-09-04 04:33:10 +04:00
CTDB_NO_MEMORY_FATAL ( ctdb , rec - > banned_nodes [ pnn ] ) ;
2007-06-07 10:34:33 +04:00
2007-09-04 04:33:10 +04:00
rec - > banned_nodes [ pnn ] - > rec = rec ;
rec - > banned_nodes [ pnn ] - > banned_node = pnn ;
2007-06-07 10:34:33 +04:00
if ( ban_time ! = 0 ) {
2007-09-04 04:33:10 +04:00
event_add_timed ( ctdb - > ev , rec - > banned_nodes [ pnn ] ,
2007-06-07 10:34:33 +04:00
timeval_current_ofs ( ban_time , 0 ) ,
2007-09-04 04:33:10 +04:00
ctdb_ban_timeout , rec - > banned_nodes [ pnn ] ) ;
2007-06-07 10:34:33 +04:00
}
}
2007-08-27 04:31:22 +04:00
/*
  monitoring result codes
  NOTE(review): the code that produces and consumes these values is
  outside this part of the file
*/
enum monitor_result {
	MONITOR_OK,
	MONITOR_RECOVERY_NEEDED,
	MONITOR_ELECTION_NEEDED,
	MONITOR_FAILED
};
2008-01-29 05:59:28 +03:00
/*
run the " recovered " eventscript on all nodes
2008-01-06 04:38:01 +03:00
*/
2008-05-15 06:28:52 +04:00
static int run_recovered_eventscript ( struct ctdb_context * ctdb , struct ctdb_node_map * nodemap , const char * caller )
2007-08-27 04:31:22 +04:00
{
2008-01-29 05:59:28 +03:00
TALLOC_CTX * tmp_ctx ;
2008-06-12 10:53:36 +04:00
uint32_t * nodes ;
2007-08-27 04:31:22 +04:00
2008-01-29 05:59:28 +03:00
tmp_ctx = talloc_new ( ctdb ) ;
CTDB_NO_MEMORY ( ctdb , tmp_ctx ) ;
2007-08-27 04:31:22 +04:00
2008-06-12 10:53:36 +04:00
nodes = list_of_active_nodes ( ctdb , nodemap , tmp_ctx , true ) ;
2008-01-29 05:59:28 +03:00
if ( ctdb_client_async_control ( ctdb , CTDB_CONTROL_END_RECOVERY ,
2008-06-12 10:53:36 +04:00
nodes ,
CONTROL_TIMEOUT ( ) , false , tdb_null ,
NULL , NULL ,
NULL ) ! = 0 ) {
2008-05-15 06:28:52 +04:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to run the 'recovered' event when called from %s \n " , caller ) ) ;
2008-01-29 05:59:28 +03:00
talloc_free ( tmp_ctx ) ;
2008-01-06 04:38:01 +03:00
return - 1 ;
2007-08-27 04:31:22 +04:00
}
2008-01-29 05:59:28 +03:00
talloc_free ( tmp_ctx ) ;
2008-01-06 04:38:01 +03:00
return 0 ;
2007-08-27 04:31:22 +04:00
}
2008-06-12 10:53:36 +04:00
/*
remember the trouble maker
*/
static void ctdb_set_culprit ( struct ctdb_recoverd * rec , uint32_t culprit )
{
struct ctdb_context * ctdb = rec - > ctdb ;
if ( rec - > last_culprit ! = culprit | |
timeval_elapsed ( & rec - > first_recover_time ) > ctdb - > tunable . recovery_grace_period ) {
DEBUG ( DEBUG_NOTICE , ( " New recovery culprit %u \n " , culprit ) ) ;
/* either a new node is the culprit, or we've decided to forgive them */
rec - > last_culprit = culprit ;
rec - > first_recover_time = timeval_current ( ) ;
rec - > culprit_counter = 0 ;
}
rec - > culprit_counter + + ;
}
/*
  failure callback handed to the START_RECOVERY async control in
  run_startrecovery_eventscript(): the failing node is recorded as the
  recovery culprit.  callback_data is the struct ctdb_recoverd.
*/
static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *recoverd;

	recoverd = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit \n ", node_pnn));

	ctdb_set_culprit(recoverd, node_pnn);
}
2008-01-29 05:59:28 +03:00
/*
run the " startrecovery " eventscript on all nodes
*/
2008-06-12 10:53:36 +04:00
static int run_startrecovery_eventscript ( struct ctdb_recoverd * rec , struct ctdb_node_map * nodemap )
2008-01-29 05:59:28 +03:00
{
TALLOC_CTX * tmp_ctx ;
2008-06-12 10:53:36 +04:00
uint32_t * nodes ;
struct ctdb_context * ctdb = rec - > ctdb ;
2007-06-07 10:34:33 +04:00
2008-01-29 05:59:28 +03:00
tmp_ctx = talloc_new ( ctdb ) ;
CTDB_NO_MEMORY ( ctdb , tmp_ctx ) ;
2008-06-12 10:53:36 +04:00
nodes = list_of_active_nodes ( ctdb , nodemap , tmp_ctx , true ) ;
2008-01-29 05:59:28 +03:00
if ( ctdb_client_async_control ( ctdb , CTDB_CONTROL_START_RECOVERY ,
2008-06-12 10:53:36 +04:00
nodes ,
CONTROL_TIMEOUT ( ) , false , tdb_null ,
NULL ,
startrecovery_fail_callback ,
rec ) ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to run the 'startrecovery' event. Recovery failed. \n " ) ) ;
2008-01-29 05:59:28 +03:00
talloc_free ( tmp_ctx ) ;
return - 1 ;
}
talloc_free ( tmp_ctx ) ;
return 0 ;
}
2008-01-06 04:38:01 +03:00
2008-06-12 10:53:36 +04:00
/*
  per-node reply callback for CTDB_CONTROL_GET_CAPABILITIES (see
  update_capabilities()): stores the uint32_t carried in outdata as the
  capabilities of node node_pnn.  A reply that is not exactly one
  uint32_t, or that has no data pointer, is logged and ignored.
*/
static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	if ((outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL)) {
		/* %p requires a void pointer argument */
		DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p \n ", (unsigned)outdata.dsize, (void *)outdata.dptr));
		return;
	}

	ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
}
/*
update the node capabilities for all connected nodes
*/
static int update_capabilities ( struct ctdb_context * ctdb , struct ctdb_node_map * nodemap )
{
uint32_t * nodes ;
TALLOC_CTX * tmp_ctx ;
tmp_ctx = talloc_new ( ctdb ) ;
CTDB_NO_MEMORY ( ctdb , tmp_ctx ) ;
nodes = list_of_active_nodes ( ctdb , nodemap , tmp_ctx , true ) ;
if ( ctdb_client_async_control ( ctdb , CTDB_CONTROL_GET_CAPABILITIES ,
nodes , CONTROL_TIMEOUT ( ) ,
2008-06-12 10:53:36 +04:00
false , tdb_null ,
async_getcap_callback , NULL ,
NULL ) ! = 0 ) {
2008-05-06 09:42:59 +04:00
DEBUG ( DEBUG_ERR , ( __location__ " Failed to read node capabilities. \n " ) ) ;
talloc_free ( tmp_ctx ) ;
return - 1 ;
}
talloc_free ( tmp_ctx ) ;
return 0 ;
}
2007-06-07 09:18:55 +04:00
/*
change recovery mode on all nodes
*/
2007-05-06 03:53:12 +04:00
static int set_recovery_mode ( struct ctdb_context * ctdb , struct ctdb_node_map * nodemap , uint32_t rec_mode )
{
2008-01-06 04:38:01 +03:00
TDB_DATA data ;
2008-01-29 05:59:28 +03:00
uint32_t * nodes ;
TALLOC_CTX * tmp_ctx ;
tmp_ctx = talloc_new ( ctdb ) ;
CTDB_NO_MEMORY ( ctdb , tmp_ctx ) ;
2007-08-27 04:31:22 +04:00
/* freeze all nodes */
2008-06-12 10:53:36 +04:00
nodes = list_of_active_nodes ( ctdb , nodemap , tmp_ctx , true ) ;
2007-08-27 04:31:22 +04:00
if ( rec_mode = = CTDB_RECOVERY_ACTIVE ) {
2008-01-29 05:59:28 +03:00
if ( ctdb_client_async_control ( ctdb , CTDB_CONTROL_FREEZE ,
nodes , CONTROL_TIMEOUT ( ) ,
2008-06-12 10:53:36 +04:00
false , tdb_null ,
NULL , NULL ,
NULL ) ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to freeze nodes. Recovery failed. \n " ) ) ;
2008-01-29 05:59:28 +03:00
talloc_free ( tmp_ctx ) ;
2007-08-27 04:31:22 +04:00
return - 1 ;
}
}
2007-06-11 17:03:23 +04:00
2008-01-06 04:38:01 +03:00
data . dsize = sizeof ( uint32_t ) ;
data . dptr = ( unsigned char * ) & rec_mode ;
2007-05-06 03:53:12 +04:00
2008-01-29 05:59:28 +03:00
if ( ctdb_client_async_control ( ctdb , CTDB_CONTROL_SET_RECMODE ,
nodes , CONTROL_TIMEOUT ( ) ,
2008-06-12 10:53:36 +04:00
false , data ,
NULL , NULL ,
NULL ) ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to set recovery mode. Recovery failed. \n " ) ) ;
2008-01-29 05:59:28 +03:00
talloc_free ( tmp_ctx ) ;
2008-01-06 04:38:01 +03:00
return - 1 ;
}
2007-05-12 09:15:27 +04:00
2008-01-29 05:59:28 +03:00
talloc_free ( tmp_ctx ) ;
2007-05-06 03:53:12 +04:00
return 0 ;
}
2007-06-07 09:18:55 +04:00
/*
change recovery master on all node
*/
2007-09-04 04:33:10 +04:00
static int set_recovery_master ( struct ctdb_context * ctdb , struct ctdb_node_map * nodemap , uint32_t pnn )
2007-05-07 00:51:58 +04:00
{
2008-01-06 04:38:01 +03:00
TDB_DATA data ;
2008-01-29 05:59:28 +03:00
TALLOC_CTX * tmp_ctx ;
2008-06-12 10:53:36 +04:00
uint32_t * nodes ;
2008-01-29 05:59:28 +03:00
tmp_ctx = talloc_new ( ctdb ) ;
CTDB_NO_MEMORY ( ctdb , tmp_ctx ) ;
2007-05-07 00:51:58 +04:00
2008-01-06 04:38:01 +03:00
data . dsize = sizeof ( uint32_t ) ;
data . dptr = ( unsigned char * ) & pnn ;
2007-05-07 00:51:58 +04:00
2008-06-12 10:53:36 +04:00
nodes = list_of_active_nodes ( ctdb , nodemap , tmp_ctx , true ) ;
2008-01-29 05:59:28 +03:00
if ( ctdb_client_async_control ( ctdb , CTDB_CONTROL_SET_RECMASTER ,
2008-06-12 10:53:36 +04:00
nodes ,
CONTROL_TIMEOUT ( ) , false , data ,
NULL , NULL ,
NULL ) ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to set recmaster. Recovery failed. \n " ) ) ;
2008-01-29 05:59:28 +03:00
talloc_free ( tmp_ctx ) ;
2008-01-06 04:38:01 +03:00
return - 1 ;
2007-05-07 00:51:58 +04:00
}
2008-01-29 05:59:28 +03:00
talloc_free ( tmp_ctx ) ;
2007-05-07 00:51:58 +04:00
return 0 ;
}
2007-06-07 09:18:55 +04:00
/*
  ensure all other nodes have attached to any databases that we have

  For every node in nodemap that is neither "pnn" nor flagged
  NODE_FLAGS_INACTIVE, the node's dbid map is fetched and every entry
  of the local "dbmap" missing from it is created on that node, using
  the name looked up on "pnn" and the local persistent flag.
  Allocations made by the controls use mem_ctx.
  Returns 0 on success and -1 as soon as any control fails.
*/
static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
					   uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
{
	int i, j, db, ret;
	struct ctdb_dbid_map *remote_dbmap;

	/* verify that all other nodes have all our databases */
	for (j = 0; j < nodemap->num; j++) {
		/* we don't need to check ourselves */
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}
		/* don't check nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					 mem_ctx, &remote_dbmap);
		if (ret != 0) {
			/* report the node that was queried, not ourselves */
			DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u \n ", nodemap->nodes[j].pnn));
			return -1;
		}

		/* step through all local databases */
		for (db = 0; db < dbmap->num; db++) {
			const char *name;

			for (i = 0; i < remote_dbmap->num; i++) {
				if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
					break;
				}
			}
			/* the remote node already has this database */
			if (i != remote_dbmap->num) {
				continue;
			}

			/* ok so we need to create this database; the result
			   must be captured or the check below tests a stale
			   value and "name" may be used uninitialised */
			ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
						  mem_ctx, &name);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u \n ", pnn));
				return -1;
			}

			ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
						 mem_ctx, name, dbmap->dbs[db].persistent);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s \n ", name));
				return -1;
			}
		}
	}

	return 0;
}
2007-06-07 09:18:55 +04:00
/*
  ensure we are attached to any databases that anyone else is attached to

  For every node in nodemap that is neither "pnn" nor flagged
  NODE_FLAGS_INACTIVE, the node's dbid map is fetched and every
  database in it that is missing from *dbmap is created on "pnn", using
  the name and persistent flag reported by the remote node.  *dbmap is
  re-read from "pnn" after each creation, so the caller's pointer is
  replaced.  Allocations made by the controls use mem_ctx.
  Returns 0 on success and -1 as soon as any control fails.
*/
static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
					  uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
{
	int i, j, db, ret;
	struct ctdb_dbid_map *remote_dbmap;

	/* verify that we have all databases any other node has */
	for (j = 0; j < nodemap->num; j++) {
		/* we don't need to check ourselves */
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}
		/* don't check nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					 mem_ctx, &remote_dbmap);
		if (ret != 0) {
			/* report the node that was queried, not ourselves */
			DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u \n ", nodemap->nodes[j].pnn));
			return -1;
		}

		/* step through all databases on the remote node */
		for (db = 0; db < remote_dbmap->num; db++) {
			const char *name;

			for (i = 0; i < (*dbmap)->num; i++) {
				if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
					break;
				}
			}
			/* we already have this db locally */
			if (i != (*dbmap)->num) {
				continue;
			}

			/* ok so we need to create this database and rebuild
			   dbmap; each result must be captured or the checks
			   below test a stale value */
			ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
						  remote_dbmap->dbs[db].dbid, mem_ctx, &name);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u \n ",
						  nodemap->nodes[j].pnn));
				return -1;
			}

			ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
						 remote_dbmap->dbs[db].persistent);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s \n ", name));
				return -1;
			}

			ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u \n ", pnn));
				return -1;
			}
		}
	}

	return 0;
}
2007-05-06 04:16:48 +04:00
2007-06-07 09:18:55 +04:00
/*
2008-01-06 04:38:01 +03:00
pull the remote database contents from one node into the recdb
2007-06-07 09:18:55 +04:00
*/
2008-01-06 04:38:01 +03:00
static int pull_one_remote_database ( struct ctdb_context * ctdb , uint32_t srcnode ,
struct tdb_wrap * recdb , uint32_t dbid )
2007-05-06 04:16:48 +04:00
{
2008-01-06 04:38:01 +03:00
int ret ;
TDB_DATA outdata ;
struct ctdb_control_pulldb_reply * reply ;
struct ctdb_rec_data * rec ;
int i ;
TALLOC_CTX * tmp_ctx = talloc_new ( recdb ) ;
2007-05-06 04:16:48 +04:00
2008-01-06 04:38:01 +03:00
ret = ctdb_ctrl_pulldb ( ctdb , srcnode , dbid , CTDB_LMASTER_ANY , tmp_ctx ,
CONTROL_TIMEOUT ( ) , & outdata ) ;
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to copy db from node %u \n " , srcnode ) ) ;
2008-01-06 04:38:01 +03:00
talloc_free ( tmp_ctx ) ;
return - 1 ;
}
reply = ( struct ctdb_control_pulldb_reply * ) outdata . dptr ;
if ( outdata . dsize < offsetof ( struct ctdb_control_pulldb_reply , data ) ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " invalid data in pulldb reply \n " ) ) ;
2008-01-06 04:38:01 +03:00
talloc_free ( tmp_ctx ) ;
return - 1 ;
}
rec = ( struct ctdb_rec_data * ) & reply - > data [ 0 ] ;
for ( i = 0 ;
i < reply - > count ;
rec = ( struct ctdb_rec_data * ) ( rec - > length + ( uint8_t * ) rec ) , i + + ) {
TDB_DATA key , data ;
struct ctdb_ltdb_header * hdr ;
TDB_DATA existing ;
key . dptr = & rec - > data [ 0 ] ;
key . dsize = rec - > keylen ;
data . dptr = & rec - > data [ key . dsize ] ;
data . dsize = rec - > datalen ;
hdr = ( struct ctdb_ltdb_header * ) data . dptr ;
if ( data . dsize < sizeof ( struct ctdb_ltdb_header ) ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_CRIT , ( __location__ " bad ltdb record \n " ) ) ;
2008-01-06 04:38:01 +03:00
talloc_free ( tmp_ctx ) ;
return - 1 ;
}
/* fetch the existing record, if any */
existing = tdb_fetch ( recdb - > tdb , key ) ;
if ( existing . dptr ! = NULL ) {
struct ctdb_ltdb_header header ;
if ( existing . dsize < sizeof ( struct ctdb_ltdb_header ) ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_CRIT , ( __location__ " Bad record size %u from node %u \n " ,
2008-01-07 06:08:25 +03:00
( unsigned ) existing . dsize , srcnode ) ) ;
2008-01-06 04:38:01 +03:00
free ( existing . dptr ) ;
talloc_free ( tmp_ctx ) ;
return - 1 ;
2007-05-06 04:16:48 +04:00
}
2008-01-06 04:38:01 +03:00
header = * ( struct ctdb_ltdb_header * ) existing . dptr ;
free ( existing . dptr ) ;
if ( ! ( header . rsn < hdr - > rsn | |
( header . dmaster ! = ctdb - > recovery_master & & header . rsn = = hdr - > rsn ) ) ) {
2007-05-06 04:16:48 +04:00
continue ;
}
2008-01-06 04:38:01 +03:00
}
if ( tdb_store ( recdb - > tdb , key , data , TDB_REPLACE ) ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_CRIT , ( __location__ " Failed to store record \n " ) ) ;
2008-01-06 04:38:01 +03:00
talloc_free ( tmp_ctx ) ;
return - 1 ;
2007-05-06 04:16:48 +04:00
}
}
2008-01-06 04:38:01 +03:00
talloc_free ( tmp_ctx ) ;
2007-05-06 04:16:48 +04:00
return 0 ;
}
2007-06-07 09:18:55 +04:00
/*
2008-01-06 04:38:01 +03:00
pull all the remote database contents into the recdb
2007-06-07 09:18:55 +04:00
*/
2008-01-06 04:38:01 +03:00
static int pull_remote_database ( struct ctdb_context * ctdb , struct ctdb_node_map * nodemap ,
struct tdb_wrap * recdb , uint32_t dbid )
2007-05-06 04:22:13 +04:00
{
2008-01-06 04:38:01 +03:00
int j ;
2007-05-06 04:22:13 +04:00
2008-01-06 04:38:01 +03:00
/* pull all records from all other nodes across onto this node
( this merges based on rsn )
*/
for ( j = 0 ; j < nodemap - > num ; j + + ) {
/* dont merge from nodes that are unavailable */
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
continue ;
}
if ( pull_one_remote_database ( ctdb , nodemap - > nodes [ j ] . pnn , recdb , dbid ) ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Failed to pull remote database from node %u \n " ,
2008-01-06 04:38:01 +03:00
nodemap - > nodes [ j ] . pnn ) ) ;
2008-01-02 14:44:46 +03:00
return - 1 ;
2007-05-06 04:22:13 +04:00
}
}
2008-01-06 04:38:01 +03:00
2007-05-06 04:22:13 +04:00
return 0 ;
}
2007-06-07 09:18:55 +04:00
/*
update flags on all active nodes
*/
static int update_flags_on_all_nodes ( struct ctdb_context * ctdb , struct ctdb_node_map * nodemap )
{
int i ;
for ( i = 0 ; i < nodemap - > num ; i + + ) {
struct ctdb_node_flag_change c ;
TDB_DATA data ;
2007-09-04 04:33:10 +04:00
c . pnn = nodemap - > nodes [ i ] . pnn ;
2007-08-21 11:25:15 +04:00
c . old_flags = nodemap - > nodes [ i ] . flags ;
c . new_flags = nodemap - > nodes [ i ] . flags ;
2007-06-07 09:18:55 +04:00
data . dptr = ( uint8_t * ) & c ;
data . dsize = sizeof ( c ) ;
2007-06-09 15:58:50 +04:00
ctdb_send_message ( ctdb , CTDB_BROADCAST_CONNECTED ,
2007-06-07 09:18:55 +04:00
CTDB_SRVID_NODE_FLAGS_CHANGED , data ) ;
}
return 0 ;
}
2008-06-26 05:08:09 +04:00
static int update_our_flags_on_all_nodes ( struct ctdb_context * ctdb , uint32_t pnn , struct ctdb_node_map * nodemap )
{
struct ctdb_node_flag_change c ;
TDB_DATA data ;
c . pnn = nodemap - > nodes [ pnn ] . pnn ;
c . old_flags = nodemap - > nodes [ pnn ] . flags ;
c . new_flags = nodemap - > nodes [ pnn ] . flags ;
data . dptr = ( uint8_t * ) & c ;
data . dsize = sizeof ( c ) ;
ctdb_send_message ( ctdb , CTDB_BROADCAST_CONNECTED ,
CTDB_SRVID_NODE_FLAGS_CHANGED , data ) ;
return 0 ;
}
2007-05-06 04:38:44 +04:00
2007-06-07 09:18:55 +04:00
/*
ensure all nodes have the same vnnmap we do
*/
2007-05-23 11:21:14 +04:00
static int update_vnnmap_on_all_nodes ( struct ctdb_context * ctdb , struct ctdb_node_map * nodemap ,
2007-09-04 04:33:10 +04:00
uint32_t pnn , struct ctdb_vnn_map * vnnmap , TALLOC_CTX * mem_ctx )
2007-05-06 04:42:18 +04:00
{
int j , ret ;
/* push the new vnn map out to all the nodes */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
/* dont push to nodes that are unavailable */
2007-06-07 09:18:55 +04:00
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
2007-05-06 04:42:18 +04:00
continue ;
}
2007-09-04 03:50:07 +04:00
ret = ctdb_ctrl_setvnnmap ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ j ] . pnn , mem_ctx , vnnmap ) ;
2007-05-06 04:42:18 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to set vnnmap for node %u \n " , pnn ) ) ;
2007-05-06 04:42:18 +04:00
return - 1 ;
}
}
return 0 ;
}
2007-06-07 09:18:55 +04:00
2007-06-07 10:34:33 +04:00
/*
  handler for when the admin bans a node

  "data" must hold exactly one struct ctdb_ban_info.  Requests naming a
  pnn other than our own are logged and ignored; otherwise the ban is
  applied with ctdb_ban_node().  private_data is the struct
  ctdb_recoverd.  The handler allocates nothing, so every return path
  is leak free.
*/
static void ban_handler(struct ctdb_context *ctdb, uint64_t srvid,
			TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
	struct ctdb_ban_info *b = (struct ctdb_ban_info *)data.dptr;

	/* b is only dereferenced once the size has been verified */
	if (data.dsize != sizeof(*b)) {
		DEBUG(DEBUG_ERR, (" Bad data in ban_handler \n "));
		return;
	}

	if (b->pnn != ctdb->pnn) {
		DEBUG(DEBUG_ERR, (" Got a ban request for pnn:%u but our pnn is %u. Ignoring ban request \n ", b->pnn, ctdb->pnn));
		return;
	}

	DEBUG(DEBUG_NOTICE, (" Node %u has been banned for %u seconds \n ",
			     b->pnn, b->ban_time));

	ctdb_ban_node(rec, b->pnn, b->ban_time);
}
2007-06-07 09:18:55 +04:00
/*
  handler for when the admin unbans a node

  "data" must hold exactly one uint32_t pnn.  Requests naming a pnn
  other than our own are logged and ignored; otherwise the ban is
  lifted with ctdb_unban_node().  private_data is the struct
  ctdb_recoverd.  The handler allocates nothing, so every return path
  is leak free.
*/
static void unban_handler(struct ctdb_context *ctdb, uint64_t srvid,
			  TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
	uint32_t pnn;

	if (data.dsize != sizeof(uint32_t)) {
		DEBUG(DEBUG_ERR, (" Bad data in unban_handler \n "));
		return;
	}
	pnn = *(uint32_t *)data.dptr;

	if (pnn != ctdb->pnn) {
		DEBUG(DEBUG_ERR, (" Got an unban request for pnn:%u but our pnn is %u. Ignoring unban request \n ", pnn, ctdb->pnn));
		return;
	}

	DEBUG(DEBUG_NOTICE, (" Node %u has been unbanned. \n ", pnn));

	ctdb_unban_node(rec, pnn);
}
2008-01-08 13:28:42 +03:00
/*
  state of one vacuum fetch run; linked into ctdb_recoverd.vacuum_info
  and consumed record by record in vacuum_fetch_next()
*/
struct vacuum_info {
	struct vacuum_info *next, *prev;	/* list linkage */
	struct ctdb_recoverd *rec;		/* daemon state holding the list head */
	uint32_t srcnode;
	struct ctdb_db_context *ctdb_db;	/* database whose records are fetched */
	struct ctdb_control_pulldb_reply *recs;	/* recs->count is the number of records left */
	struct ctdb_rec_data *r;		/* next record to process */
};

static void vacuum_fetch_next(struct vacuum_info *v);
2008-01-08 09:23:27 +03:00
/*
2008-01-08 13:28:42 +03:00
called when a vacuum fetch has completed - just free it and do the next one
2008-01-08 09:23:27 +03:00
*/
static void vacuum_fetch_callback ( struct ctdb_client_call_state * state )
{
2008-01-08 13:28:42 +03:00
struct vacuum_info * v = talloc_get_type ( state - > async . private , struct vacuum_info ) ;
2008-01-08 09:23:27 +03:00
talloc_free ( state ) ;
2008-01-08 13:28:42 +03:00
vacuum_fetch_next ( v ) ;
}
/*
process the next element from the vacuum list
*/
static void vacuum_fetch_next ( struct vacuum_info * v )
{
struct ctdb_call call ;
struct ctdb_rec_data * r ;
while ( v - > recs - > count ) {
struct ctdb_client_call_state * state ;
TDB_DATA data ;
struct ctdb_ltdb_header * hdr ;
ZERO_STRUCT ( call ) ;
call . call_id = CTDB_NULL_FUNC ;
call . flags = CTDB_IMMEDIATE_MIGRATION ;
r = v - > r ;
v - > r = ( struct ctdb_rec_data * ) ( r - > length + ( uint8_t * ) r ) ;
v - > recs - > count - - ;
call . key . dptr = & r - > data [ 0 ] ;
call . key . dsize = r - > keylen ;
/* ensure we don't block this daemon - just skip a record if we can't get
the chainlock */
if ( tdb_chainlock_nonblock ( v - > ctdb_db - > ltdb - > tdb , call . key ) ! = 0 ) {
continue ;
}
data = tdb_fetch ( v - > ctdb_db - > ltdb - > tdb , call . key ) ;
2008-01-15 12:11:44 +03:00
if ( data . dptr = = NULL ) {
tdb_chainunlock ( v - > ctdb_db - > ltdb - > tdb , call . key ) ;
continue ;
}
if ( data . dsize < sizeof ( struct ctdb_ltdb_header ) ) {
free ( data . dptr ) ;
2008-01-08 13:28:42 +03:00
tdb_chainunlock ( v - > ctdb_db - > ltdb - > tdb , call . key ) ;
continue ;
}
hdr = ( struct ctdb_ltdb_header * ) data . dptr ;
if ( hdr - > dmaster = = v - > rec - > ctdb - > pnn ) {
/* its already local */
2008-01-15 12:11:44 +03:00
free ( data . dptr ) ;
2008-01-08 13:28:42 +03:00
tdb_chainunlock ( v - > ctdb_db - > ltdb - > tdb , call . key ) ;
continue ;
}
2008-01-15 12:11:44 +03:00
free ( data . dptr ) ;
2008-01-08 13:28:42 +03:00
state = ctdb_call_send ( v - > ctdb_db , & call ) ;
tdb_chainunlock ( v - > ctdb_db - > ltdb - > tdb , call . key ) ;
if ( state = = NULL ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Failed to setup vacuum fetch call \n " ) ) ;
2008-01-08 13:28:42 +03:00
talloc_free ( v ) ;
return ;
}
state - > async . fn = vacuum_fetch_callback ;
state - > async . private = v ;
return ;
}
talloc_free ( v ) ;
}
/*
destroy a vacuum info structure
*/
static int vacuum_info_destructor ( struct vacuum_info * v )
{
DLIST_REMOVE ( v - > rec - > vacuum_info , v ) ;
return 0 ;
2008-01-08 09:23:27 +03:00
}
/*
  handler for vacuum fetch

  message handler invoked with a blob of records (data) that another node
  wants this node to migrate. It looks up and attaches to the database
  named by recs->db_id, takes a private copy of the blob and starts
  fetching the records one at a time via vacuum_fetch_next().

  A message is ignored when it holds no records, or when a run for the
  same source node and database is already in progress.

  tmp_ctx only holds scratch data (db map, db name), so it is released on
  every exit path.
*/
static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
				 TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
	struct ctdb_control_pulldb_reply *recs;
	int ret, i;
	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
	const char *name;
	struct ctdb_dbid_map *dbmap = NULL;
	bool persistent = false;
	struct ctdb_db_context *ctdb_db;
	struct ctdb_rec_data *r;
	uint32_t srcnode;
	struct vacuum_info *v;

	recs = (struct ctdb_control_pulldb_reply *)data.dptr;
	r = (struct ctdb_rec_data *)&recs->data[0];

	if (recs->count == 0) {
		goto done;
	}

	/* the source node is carried in the reqid field of the first record */
	srcnode = r->reqid;

	for (v = rec->vacuum_info; v; v = v->next) {
		if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
			/* we're already working on records from this node */
			goto done;
		}
	}

	/* work out if the database is persistent */
	ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node \n "));
		goto done;
	}

	for (i = 0; i < dbmap->num; i++) {
		if (dbmap->dbs[i].dbid == recs->db_id) {
			persistent = dbmap->dbs[i].persistent;
			break;
		}
	}
	if (i == dbmap->num) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node \n ", recs->db_id));
		goto done;
	}

	/* find the name of this database */
	if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to get name of db 0x%x \n ", recs->db_id));
		goto done;
	}

	/* attach to it */
	ctdb_db = ctdb_attach(ctdb, name, persistent, 0);
	if (ctdb_db == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to attach to database '%s' \n ", name));
		goto done;
	}

	v = talloc_zero(rec, struct vacuum_info);
	if (v == NULL) {
		DEBUG(DEBUG_CRIT, (__location__ " Out of memory \n "));
		goto done;
	}

	v->rec = rec;
	v->srcnode = srcnode;
	v->ctdb_db = ctdb_db;
	/* keep our own copy: the message buffer is not ours to hold on to */
	v->recs = talloc_memdup(v, recs, data.dsize);
	if (v->recs == NULL) {
		DEBUG(DEBUG_CRIT, (__location__ " Out of memory \n "));
		/* the destructor is not installed yet, so this does not touch the list */
		talloc_free(v);
		goto done;
	}
	v->r = (struct ctdb_rec_data *)&v->recs->data[0];

	DLIST_ADD(rec->vacuum_info, v);

	talloc_set_destructor(v, vacuum_info_destructor);

	/* v may already have been freed when this returns */
	vacuum_fetch_next(v);

done:
	talloc_free(tmp_ctx);
}
2007-06-07 10:34:33 +04:00
2007-07-04 02:36:59 +04:00
/*
  timed event callback used by ctdb_wait_timeout: p points at the
  caller's uint32_t flag, which is set to 1 to signal that the wait
  is over. The other arguments are unused.
*/
static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
			      struct timeval yt, void *p)
{
	uint32_t *expired_flag = p;

	*expired_flag = 1;
}
/*
wait for a given number of seconds
*/
static void ctdb_wait_timeout ( struct ctdb_context * ctdb , uint32_t secs )
{
uint32_t timed_out = 0 ;
event_add_timed ( ctdb - > ev , ctdb , timeval_current_ofs ( secs , 0 ) , ctdb_wait_handler , & timed_out ) ;
while ( ! timed_out ) {
event_loop_once ( ctdb - > ev ) ;
}
}
2007-11-13 02:27:44 +03:00
/*
called when an election times out ( ends )
*/
static void ctdb_election_timeout ( struct event_context * ev , struct timed_event * te ,
struct timeval t , void * p )
{
struct ctdb_recoverd * rec = talloc_get_type ( p , struct ctdb_recoverd ) ;
rec - > election_timeout = NULL ;
}
/*
wait for an election to finish . It finished election_timeout seconds after
the last election packet is received
*/
static void ctdb_wait_election ( struct ctdb_recoverd * rec )
{
struct ctdb_context * ctdb = rec - > ctdb ;
while ( rec - > election_timeout ) {
event_loop_once ( ctdb - > ev ) ;
}
}
2007-10-15 08:28:51 +04:00
/*
2007-11-23 03:31:42 +03:00
Update our local flags from all remote connected nodes .
This is only run when we are or we belive we are the recovery master
2007-10-15 08:28:51 +04:00
*/
2007-11-30 00:44:34 +03:00
static int update_local_flags ( struct ctdb_recoverd * rec , struct ctdb_node_map * nodemap )
2007-10-15 08:28:51 +04:00
{
int j ;
2007-11-30 00:44:34 +03:00
struct ctdb_context * ctdb = rec - > ctdb ;
2007-10-15 08:28:51 +04:00
TALLOC_CTX * mem_ctx = talloc_new ( ctdb ) ;
/* get the nodemap for all active remote nodes and verify
they are the same as for this node
*/
for ( j = 0 ; j < nodemap - > num ; j + + ) {
struct ctdb_node_map * remote_nodemap = NULL ;
int ret ;
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_DISCONNECTED ) {
continue ;
}
if ( nodemap - > nodes [ j ] . pnn = = ctdb - > pnn ) {
continue ;
}
ret = ctdb_ctrl_getnodemap ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ j ] . pnn ,
mem_ctx , & remote_nodemap ) ;
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to get nodemap from remote node %u \n " ,
2007-10-15 08:28:51 +04:00
nodemap - > nodes [ j ] . pnn ) ) ;
2007-11-28 07:04:20 +03:00
ctdb_set_culprit ( rec , nodemap - > nodes [ j ] . pnn ) ;
2007-10-15 08:28:51 +04:00
talloc_free ( mem_ctx ) ;
2007-11-23 03:53:06 +03:00
return MONITOR_FAILED ;
2007-10-15 08:28:51 +04:00
}
if ( nodemap - > nodes [ j ] . flags ! = remote_nodemap - > nodes [ j ] . flags ) {
2007-11-23 03:53:06 +03:00
struct ctdb_node_flag_change c ;
TDB_DATA data ;
2007-11-23 02:52:29 +03:00
2007-11-23 03:53:06 +03:00
/* We should tell our daemon about this so it
2007-11-23 02:52:29 +03:00
updates its flags or else we will log the same
message again in the next iteration of recovery .
2007-11-23 03:31:42 +03:00
Since we are the recovery master we can just as
well update the flags on all nodes .
2007-11-23 02:52:29 +03:00
*/
c . pnn = nodemap - > nodes [ j ] . pnn ;
c . old_flags = nodemap - > nodes [ j ] . flags ;
c . new_flags = remote_nodemap - > nodes [ j ] . flags ;
data . dptr = ( uint8_t * ) & c ;
data . dsize = sizeof ( c ) ;
2007-12-03 07:45:53 +03:00
ctdb_send_message ( ctdb , ctdb - > pnn ,
2007-11-23 02:52:29 +03:00
CTDB_SRVID_NODE_FLAGS_CHANGED ,
data ) ;
2007-11-23 03:53:06 +03:00
/* Update our local copy of the flags in the recovery
daemon .
*/
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " Remote node %u had flags 0x%x, local had 0x%x - updating local \n " ,
2007-11-23 03:53:06 +03:00
nodemap - > nodes [ j ] . pnn , remote_nodemap - > nodes [ j ] . flags ,
nodemap - > nodes [ j ] . flags ) ) ;
2007-10-15 08:28:51 +04:00
nodemap - > nodes [ j ] . flags = remote_nodemap - > nodes [ j ] . flags ;
2007-11-23 03:53:06 +03:00
/* If the BANNED flag has changed for the node
this is a good reason to do a new election .
*/
if ( ( c . old_flags ^ c . new_flags ) & NODE_FLAGS_BANNED ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " Remote node %u had different BANNED flags 0x%x, local had 0x%x - trigger a re-election \n " ,
2007-11-23 03:53:06 +03:00
nodemap - > nodes [ j ] . pnn , c . new_flags ,
c . old_flags ) ) ;
talloc_free ( mem_ctx ) ;
return MONITOR_ELECTION_NEEDED ;
}
2007-10-15 08:28:51 +04:00
}
talloc_free ( remote_nodemap ) ;
}
talloc_free ( mem_ctx ) ;
2007-11-23 03:53:06 +03:00
return MONITOR_OK ;
2007-10-15 08:28:51 +04:00
}
2007-08-22 06:38:31 +04:00
/* Create a new random generation ip.
The generation id can not be the INVALID_GENERATION id
*/
static uint32_t new_generation ( void )
{
uint32_t generation ;
while ( 1 ) {
generation = random ( ) ;
if ( generation ! = INVALID_GENERATION ) {
break ;
}
}
return generation ;
}
2007-10-05 06:01:40 +04:00
2008-01-06 04:38:01 +03:00
/*
  create a temporary working database

  the file lives in ctdb->db_directory; any previous file of the same name
  is unlinked first and the new one is opened with O_EXCL. The returned
  tdb_wrap is allocated on mem_ctx; NULL is returned on failure.
*/
static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
{
	struct tdb_wrap *db;
	unsigned open_flags = TDB_NOLOCK;
	char *path;

	/* open up the temporary recovery database */
	path = talloc_asprintf(mem_ctx, " %s/recdb.tdb ", ctdb->db_directory);
	if (path == NULL) {
		return NULL;
	}
	unlink(path);

	/* mmap is turned off when ctdb->do_setsched is not set */
	if (!ctdb->do_setsched) {
		open_flags |= TDB_NOMMAP;
	}

	db = tdb_wrap_open(mem_ctx, path, ctdb->tunable.database_hash_size,
			   open_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
	if (db == NULL) {
		DEBUG(DEBUG_CRIT, (__location__ " Failed to create temp recovery database '%s' \n ", path));
	}

	/* the path string is only needed for the open and the log message */
	talloc_free(path);

	return db;
}
/*
  state shared between push_recdb_database() and traverse_recdb()
*/
struct recdb_data {
	/* used to stamp our pnn into each record as its dmaster */
	struct ctdb_context *ctdb;
	/* marshalled blob being built; grows as records are appended */
	struct ctdb_control_pulldb_reply *recdata;
	/* number of bytes of recdata used so far */
	uint32_t len;
	/* set when the traverse aborted because of an error */
	bool failed;
};

/*
  a traverse function for pulling all relevant records from recdb

  every non-empty record gets its dmaster pointed at this node and is
  appended to params->recdata. Returns 0 to continue the traverse, or -1
  (with params->failed set) when the record can not be marshalled or the
  blob can not be grown.

  The grown blob is only stored in params->recdata once the realloc has
  succeeded, so on failure the caller still holds (and can free) the
  previous blob and the error message can safely report its record count.
*/
static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
{
	struct recdb_data *params = (struct recdb_data *)p;
	struct ctdb_control_pulldb_reply *grown;
	struct ctdb_rec_data *rec;
	struct ctdb_ltdb_header *hdr;

	/* skip empty records */
	if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
		return 0;
	}

	/* update the dmaster field to point to us */
	hdr = (struct ctdb_ltdb_header *)data.dptr;
	hdr->dmaster = params->ctdb->pnn;

	/* add the record to the blob ready to send to the nodes */
	rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
	if (rec == NULL) {
		params->failed = true;
		return -1;
	}

	grown = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
	if (grown == NULL) {
		DEBUG(DEBUG_CRIT, (__location__ " Failed to expand recdata to %u (%u records) \n ",
			  rec->length + params->len, params->recdata->count));
		talloc_free(rec);
		params->failed = true;
		return -1;
	}
	params->recdata = grown;

	params->recdata->count++;
	memcpy(params->len + (uint8_t *)params->recdata, rec, rec->length);
	params->len += rec->length;

	/* the marshalled copy has been appended; the temporary is done */
	talloc_free(rec);

	return 0;
}
/*
push the recdb database out to all nodes
*/
static int push_recdb_database ( struct ctdb_context * ctdb , uint32_t dbid ,
struct tdb_wrap * recdb , struct ctdb_node_map * nodemap )
{
struct recdb_data params ;
struct ctdb_control_pulldb_reply * recdata ;
TDB_DATA outdata ;
2008-01-29 05:59:28 +03:00
TALLOC_CTX * tmp_ctx ;
2008-06-12 10:53:36 +04:00
uint32_t * nodes ;
2008-01-29 05:59:28 +03:00
tmp_ctx = talloc_new ( ctdb ) ;
CTDB_NO_MEMORY ( ctdb , tmp_ctx ) ;
2008-01-06 04:38:01 +03:00
recdata = talloc_zero ( recdb , struct ctdb_control_pulldb_reply ) ;
CTDB_NO_MEMORY ( ctdb , recdata ) ;
recdata - > db_id = dbid ;
params . ctdb = ctdb ;
params . recdata = recdata ;
params . len = offsetof ( struct ctdb_control_pulldb_reply , data ) ;
2008-01-07 06:08:25 +03:00
params . failed = false ;
2008-01-06 04:38:01 +03:00
if ( tdb_traverse_read ( recdb - > tdb , traverse_recdb , & params ) = = - 1 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Failed to traverse recdb database \n " ) ) ;
2008-01-07 06:08:25 +03:00
talloc_free ( params . recdata ) ;
2008-01-29 05:59:28 +03:00
talloc_free ( tmp_ctx ) ;
2008-01-06 04:38:01 +03:00
return - 1 ;
}
2008-01-07 06:08:25 +03:00
if ( params . failed ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Failed to traverse recdb database \n " ) ) ;
2008-01-07 06:08:25 +03:00
talloc_free ( params . recdata ) ;
2008-01-29 05:59:28 +03:00
talloc_free ( tmp_ctx ) ;
2008-01-07 06:08:25 +03:00
return - 1 ;
}
2008-01-06 04:38:01 +03:00
recdata = params . recdata ;
outdata . dptr = ( void * ) recdata ;
outdata . dsize = params . len ;
2008-06-12 10:53:36 +04:00
nodes = list_of_active_nodes ( ctdb , nodemap , tmp_ctx , true ) ;
2008-01-29 05:59:28 +03:00
if ( ctdb_client_async_control ( ctdb , CTDB_CONTROL_PUSH_DB ,
2008-06-12 10:53:36 +04:00
nodes ,
CONTROL_TIMEOUT ( ) , false , outdata ,
NULL , NULL ,
NULL ) ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Failed to push recdb records to nodes for db 0x%x \n " , dbid ) ) ;
2008-01-06 04:38:01 +03:00
talloc_free ( recdata ) ;
2008-01-29 05:59:28 +03:00
talloc_free ( tmp_ctx ) ;
2008-01-06 04:38:01 +03:00
return - 1 ;
}
2008-02-04 09:44:24 +03:00
DEBUG ( DEBUG_NOTICE , ( __location__ " Recovery - pushed remote database 0x%x of size %u \n " ,
2008-01-06 04:38:01 +03:00
dbid , recdata - > count ) ) ;
talloc_free ( recdata ) ;
2008-01-29 05:59:28 +03:00
talloc_free ( tmp_ctx ) ;
2008-01-06 04:38:01 +03:00
return 0 ;
}
/*
  go through a full recovery on one database

  the copies of database dbid are pulled from the nodes into a temporary
  recdb, the database is wiped on the nodes returned by
  list_of_active_nodes(), and the merged content is pushed back out.
  transaction_id is passed along in the wipe request. Returns 0 on
  success, -1 on failure.

  The temporary recdb is released on every exit path, including a failed
  pull.
*/
static int recover_database(struct ctdb_recoverd *rec,
			    TALLOC_CTX *mem_ctx,
			    uint32_t dbid,
			    uint32_t pnn,
			    struct ctdb_node_map *nodemap,
			    uint32_t transaction_id)
{
	struct tdb_wrap *recdb;
	int ret;
	int status = -1;
	struct ctdb_context *ctdb = rec->ctdb;
	TDB_DATA data;
	struct ctdb_control_wipe_database w;
	uint32_t *nodes;

	recdb = create_recdb(ctdb, mem_ctx);
	if (recdb == NULL) {
		return -1;
	}

	/* pull all remote databases onto the recdb */
	ret = pull_remote_database(ctdb, nodemap, recdb, dbid);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x \n ", dbid));
		goto done;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x \n ", dbid));

	/* wipe all the remote databases. This is safe as we are in a transaction */
	w.db_id = dbid;
	w.transaction_id = transaction_id;

	data.dptr = (void *)&w;
	data.dsize = sizeof(w);

	/* the node list hangs off recdb, so it is released together with it */
	nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
				      nodes,
				      CONTROL_TIMEOUT(), false, data,
				      NULL, NULL,
				      NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed. \n "));
		goto done;
	}

	/* push out the correct database. This sets the dmaster and skips
	   the empty records */
	ret = push_recdb_database(ctdb, dbid, recdb, nodemap);
	if (ret != 0) {
		goto done;
	}

	/* all done with this database */
	status = 0;

done:
	talloc_free(recdb);
	return status;
}
2007-08-22 06:38:31 +04:00
2007-06-02 04:03:28 +04:00
/*
we are the recmaster , and recovery is needed - start a recovery run
*/
2007-06-07 09:18:55 +04:00
static int do_recovery ( struct ctdb_recoverd * rec ,
2008-02-29 04:55:20 +03:00
TALLOC_CTX * mem_ctx , uint32_t pnn ,
2007-06-07 09:18:55 +04:00
struct ctdb_node_map * nodemap , struct ctdb_vnn_map * vnnmap ,
2008-06-13 05:47:42 +04:00
int32_t culprit )
2007-05-06 04:04:37 +04:00
{
2007-06-07 09:18:55 +04:00
struct ctdb_context * ctdb = rec - > ctdb ;
2007-05-06 04:12:42 +04:00
int i , j , ret ;
2007-05-06 04:04:37 +04:00
uint32_t generation ;
struct ctdb_dbid_map * dbmap ;
2008-01-06 05:24:55 +03:00
TDB_DATA data ;
2008-06-12 10:53:36 +04:00
uint32_t * nodes ;
2007-06-02 04:03:28 +04:00
2008-02-04 09:44:24 +03:00
DEBUG ( DEBUG_NOTICE , ( __location__ " Starting do_recovery \n " ) ) ;
2007-10-18 10:27:36 +04:00
2007-09-14 03:49:12 +04:00
/* if recovery fails, force it again */
rec - > need_recovery = true ;
2008-06-13 05:47:42 +04:00
if ( culprit ! = - 1 ) {
ctdb_set_culprit ( rec , culprit ) ;
}
2007-06-07 09:18:55 +04:00
if ( rec - > culprit_counter > 2 * nodemap - > num ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " Node %u has caused %u recoveries in %.0f seconds - banning it for %u seconds \n " ,
2007-06-07 09:18:55 +04:00
culprit , rec - > culprit_counter , timeval_elapsed ( & rec - > first_recover_time ) ,
ctdb - > tunable . recovery_ban_period ) ) ;
2007-06-07 10:34:33 +04:00
ctdb_ban_node ( rec , culprit , ctdb - > tunable . recovery_ban_period ) ;
2007-06-07 09:18:55 +04:00
}
2007-06-02 05:36:42 +04:00
if ( ! ctdb_recovery_lock ( ctdb , true ) ) {
2007-10-05 06:01:40 +04:00
ctdb_set_culprit ( rec , pnn ) ;
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( " Unable to get recovery lock - aborting recovery \n " ) ) ;
2007-06-02 04:03:28 +04:00
return - 1 ;
}
2007-05-06 04:04:37 +04:00
2008-02-04 09:44:24 +03:00
DEBUG ( DEBUG_NOTICE , ( __location__ " Recovery initiated due to problem with node %u \n " , culprit ) ) ;
2007-05-23 08:35:19 +04:00
2007-05-06 04:04:37 +04:00
/* get a list of all databases */
2007-09-04 04:33:10 +04:00
ret = ctdb_ctrl_getdbmap ( ctdb , CONTROL_TIMEOUT ( ) , pnn , mem_ctx , & dbmap ) ;
2007-05-06 04:04:37 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to get dbids from node :%u \n " , pnn ) ) ;
2007-05-06 04:04:37 +04:00
return - 1 ;
}
2008-01-06 04:38:01 +03:00
/* we do the db creation before we set the recovery mode, so the freeze happens
on all databases we will be dealing with . */
2007-05-06 04:04:37 +04:00
2007-05-06 04:12:42 +04:00
/* verify that we have all the databases any other node has */
2007-09-04 04:33:10 +04:00
ret = create_missing_local_databases ( ctdb , nodemap , pnn , & dbmap , mem_ctx ) ;
2007-05-06 04:12:42 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to create missing local databases \n " ) ) ;
2007-05-06 04:12:42 +04:00
return - 1 ;
2007-05-04 09:21:40 +04:00
}
/* verify that all other nodes have all our databases */
2007-09-04 04:33:10 +04:00
ret = create_missing_remote_databases ( ctdb , nodemap , pnn , dbmap , mem_ctx ) ;
2007-05-06 04:04:37 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to create missing remote databases \n " ) ) ;
2007-05-06 04:04:37 +04:00
return - 1 ;
2007-05-04 09:21:40 +04:00
}
2008-02-04 09:44:24 +03:00
DEBUG ( DEBUG_NOTICE , ( __location__ " Recovery - created remote databases \n " ) ) ;
2007-06-17 17:31:44 +04:00
2008-01-29 05:59:28 +03:00
2008-01-06 04:38:01 +03:00
/* set recovery mode to active on all nodes */
ret = set_recovery_mode ( ctdb , nodemap , CTDB_RECOVERY_ACTIVE ) ;
2008-07-07 02:50:12 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to set recovery mode to active on cluster \n " ) ) ;
2007-05-06 04:22:13 +04:00
return - 1 ;
2007-05-04 09:21:40 +04:00
}
2008-01-29 05:59:28 +03:00
/* execute the "startrecovery" event script on all nodes */
2008-06-12 10:53:36 +04:00
ret = run_startrecovery_eventscript ( rec , nodemap ) ;
2008-01-29 05:59:28 +03:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to run the 'startrecovery' event on cluster \n " ) ) ;
2008-01-29 05:59:28 +03:00
return - 1 ;
}
2008-01-06 04:38:01 +03:00
/* pick a new generation number */
generation = new_generation ( ) ;
2007-05-06 04:22:13 +04:00
2008-01-06 04:38:01 +03:00
/* change the vnnmap on this node to use the new generation
number but not on any other nodes .
this guarantees that if we abort the recovery prematurely
for some reason ( a node stops responding ? )
that we can just return immediately and we will reenter
recovery shortly again .
I . e . we deliberately leave the cluster with an inconsistent
generation id to allow us to abort recovery at any stage and
just restart it from scratch .
2008-01-02 14:44:46 +03:00
*/
2008-01-06 04:38:01 +03:00
vnnmap - > generation = generation ;
ret = ctdb_ctrl_setvnnmap ( ctdb , CONTROL_TIMEOUT ( ) , pnn , mem_ctx , vnnmap ) ;
2008-01-02 14:44:46 +03:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to set vnnmap for node %u \n " , pnn ) ) ;
2008-01-02 14:44:46 +03:00
return - 1 ;
}
2008-01-06 05:24:55 +03:00
data . dptr = ( void * ) & generation ;
data . dsize = sizeof ( uint32_t ) ;
2008-06-12 10:53:36 +04:00
nodes = list_of_active_nodes ( ctdb , nodemap , mem_ctx , true ) ;
2008-01-29 05:59:28 +03:00
if ( ctdb_client_async_control ( ctdb , CTDB_CONTROL_TRANSACTION_START ,
2008-06-12 10:53:36 +04:00
nodes ,
CONTROL_TIMEOUT ( ) , false , data ,
NULL , NULL ,
NULL ) ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to start transactions. Recovery failed. \n " ) ) ;
2008-01-06 04:38:01 +03:00
return - 1 ;
}
2008-01-02 14:44:46 +03:00
2008-02-04 09:44:24 +03:00
DEBUG ( DEBUG_NOTICE , ( __location__ " started transactions on all nodes \n " ) ) ;
2008-01-02 14:44:46 +03:00
2008-01-06 04:38:01 +03:00
for ( i = 0 ; i < dbmap - > num ; i + + ) {
2008-01-06 05:24:55 +03:00
if ( recover_database ( rec , mem_ctx , dbmap - > dbs [ i ] . dbid , pnn , nodemap , generation ) ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Failed to recover database 0x%x \n " , dbmap - > dbs [ i ] . dbid ) ) ;
2008-01-06 04:38:01 +03:00
return - 1 ;
}
}
2008-02-04 09:44:24 +03:00
DEBUG ( DEBUG_NOTICE , ( __location__ " Recovery - starting database commits \n " ) ) ;
2008-01-06 04:38:01 +03:00
/* commit all the changes */
2008-01-29 05:59:28 +03:00
if ( ctdb_client_async_control ( ctdb , CTDB_CONTROL_TRANSACTION_COMMIT ,
2008-06-12 10:53:36 +04:00
nodes ,
CONTROL_TIMEOUT ( ) , false , data ,
NULL , NULL ,
NULL ) ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to commit recovery changes. Recovery failed. \n " ) ) ;
2007-05-06 04:38:44 +04:00
return - 1 ;
2007-05-04 09:21:40 +04:00
}
2008-02-04 09:44:24 +03:00
DEBUG ( DEBUG_NOTICE , ( __location__ " Recovery - committed databases \n " ) ) ;
2008-01-06 04:38:01 +03:00
2007-05-06 04:38:44 +04:00
2008-05-06 09:42:59 +04:00
/* update the capabilities for all nodes */
ret = update_capabilities ( ctdb , nodemap ) ;
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " Unable to update node capabilities. \n " ) ) ;
return - 1 ;
}
2007-06-07 09:18:55 +04:00
/* build a new vnn map with all the currently active and
unbanned nodes */
2007-08-22 06:38:31 +04:00
generation = new_generation ( ) ;
2007-05-10 02:49:57 +04:00
vnnmap = talloc ( mem_ctx , struct ctdb_vnn_map ) ;
CTDB_NO_MEMORY ( ctdb , vnnmap ) ;
2007-05-04 09:21:40 +04:00
vnnmap - > generation = generation ;
2008-05-06 09:42:59 +04:00
vnnmap - > size = 0 ;
2007-06-07 10:34:33 +04:00
vnnmap - > map = talloc_zero_array ( vnnmap , uint32_t , vnnmap - > size ) ;
2008-05-06 09:42:59 +04:00
CTDB_NO_MEMORY ( ctdb , vnnmap - > map ) ;
2007-05-04 09:21:40 +04:00
for ( i = j = 0 ; i < nodemap - > num ; i + + ) {
2008-05-06 09:42:59 +04:00
if ( nodemap - > nodes [ i ] . flags & NODE_FLAGS_INACTIVE ) {
continue ;
2007-05-04 09:21:40 +04:00
}
2008-05-06 09:42:59 +04:00
if ( ! ( ctdb - > nodes [ i ] - > capabilities & CTDB_CAP_LMASTER ) ) {
/* this node can not be an lmaster */
DEBUG ( DEBUG_DEBUG , ( " Node %d cant be a LMASTER, skipping it \n " , i ) ) ;
continue ;
}
vnnmap - > size + + ;
2008-05-08 13:59:24 +04:00
vnnmap - > map = talloc_realloc ( vnnmap , vnnmap - > map , uint32_t , vnnmap - > size ) ;
2008-05-06 09:42:59 +04:00
CTDB_NO_MEMORY ( ctdb , vnnmap - > map ) ;
vnnmap - > map [ j + + ] = nodemap - > nodes [ i ] . pnn ;
2007-05-04 09:21:40 +04:00
}
2008-05-06 09:42:59 +04:00
if ( vnnmap - > size = = 0 ) {
DEBUG ( DEBUG_NOTICE , ( " No suitable lmasters found. Adding local node (recmaster) anyway. \n " ) ) ;
vnnmap - > size + + ;
2008-05-08 13:59:24 +04:00
vnnmap - > map = talloc_realloc ( vnnmap , vnnmap - > map , uint32_t , vnnmap - > size ) ;
2008-05-06 09:42:59 +04:00
CTDB_NO_MEMORY ( ctdb , vnnmap - > map ) ;
vnnmap - > map [ 0 ] = pnn ;
}
2007-05-04 09:21:40 +04:00
2007-05-06 04:42:18 +04:00
/* update to the new vnnmap on all nodes */
2007-09-04 04:33:10 +04:00
ret = update_vnnmap_on_all_nodes ( ctdb , nodemap , pnn , vnnmap , mem_ctx ) ;
2007-05-06 04:42:18 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to update vnnmap on all nodes \n " ) ) ;
2007-05-06 04:42:18 +04:00
return - 1 ;
2007-05-04 09:21:40 +04:00
}
2008-02-04 09:44:24 +03:00
DEBUG ( DEBUG_NOTICE , ( __location__ " Recovery - updated vnnmap \n " ) ) ;
2007-05-04 09:21:40 +04:00
2007-05-07 00:51:58 +04:00
/* update recmaster to point to us for all nodes */
2007-09-04 04:33:10 +04:00
ret = set_recovery_master ( ctdb , nodemap , pnn ) ;
2007-05-07 00:51:58 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to set recovery master \n " ) ) ;
2007-05-07 00:51:58 +04:00
return - 1 ;
}
2008-02-04 09:44:24 +03:00
DEBUG ( DEBUG_NOTICE , ( __location__ " Recovery - updated recmaster \n " ) ) ;
2007-05-07 00:51:58 +04:00
2007-06-07 09:18:55 +04:00
/*
update all nodes to have the same flags that we have
*/
ret = update_flags_on_all_nodes ( ctdb , nodemap ) ;
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to update flags on all nodes \n " ) ) ;
2007-06-07 09:18:55 +04:00
return - 1 ;
}
2008-02-04 09:44:24 +03:00
DEBUG ( DEBUG_NOTICE , ( __location__ " Recovery - updated flags \n " ) ) ;
2007-06-17 17:31:44 +04:00
2008-05-14 14:57:04 +04:00
/* disable recovery mode */
ret = set_recovery_mode ( ctdb , nodemap , CTDB_RECOVERY_NORMAL ) ;
2008-07-07 02:50:12 +04:00
if ( ret ! = 0 ) {
2008-05-14 14:57:04 +04:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to set recovery mode to normal on cluster \n " ) ) ;
return - 1 ;
}
DEBUG ( DEBUG_NOTICE , ( __location__ " Recovery - disabled recovery mode \n " ) ) ;
2007-05-25 11:04:13 +04:00
/*
2008-05-09 07:41:31 +04:00
tell nodes to takeover their public IPs
2007-05-25 11:04:13 +04:00
*/
2008-05-09 07:41:31 +04:00
rec - > need_takeover_run = false ;
ret = ctdb_takeover_run ( ctdb , nodemap ) ;
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " Unable to setup public takeover addresses \n " ) ) ;
return - 1 ;
2007-05-25 11:04:13 +04:00
}
2008-02-18 11:38:04 +03:00
DEBUG ( DEBUG_NOTICE , ( __location__ " Recovery - takeip finished \n " ) ) ;
2008-01-29 05:59:28 +03:00
/* execute the "recovered" event script on all nodes */
2008-05-15 06:28:52 +04:00
ret = run_recovered_eventscript ( ctdb , nodemap , " do_recovery " ) ;
2008-01-29 05:59:28 +03:00
if ( ret ! = 0 ) {
2008-05-15 06:28:52 +04:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to run the 'recovered' event on cluster. Recovery process failed. \n " ) ) ;
2008-01-29 05:59:28 +03:00
return - 1 ;
}
2008-02-18 11:38:04 +03:00
DEBUG ( DEBUG_NOTICE , ( __location__ " Recovery - finished the recovered event \n " ) ) ;
2007-05-25 18:05:30 +04:00
/* send a message to all clients telling them that the cluster
has been reconfigured */
2007-08-21 11:25:15 +04:00
ctdb_send_message ( ctdb , CTDB_BROADCAST_CONNECTED , CTDB_SRVID_RECONFIGURE , tdb_null ) ;
2007-05-04 09:21:40 +04:00
2008-02-04 09:44:24 +03:00
DEBUG ( DEBUG_NOTICE , ( __location__ " Recovery complete \n " ) ) ;
2007-07-04 02:36:59 +04:00
2007-09-14 03:49:12 +04:00
rec - > need_recovery = false ;
2007-07-04 02:36:59 +04:00
/* We just finished a recovery successfully.
We now wait for rerecovery_timeout before we allow
another recovery to take place .
*/
2008-02-04 09:44:24 +03:00
DEBUG ( DEBUG_NOTICE , ( __location__ " New recoveries supressed for the rerecovery timeout \n " ) ) ;
2007-07-04 02:36:59 +04:00
ctdb_wait_timeout ( ctdb , ctdb - > tunable . rerecovery_timeout ) ;
2008-02-04 09:44:24 +03:00
DEBUG ( DEBUG_NOTICE , ( __location__ " Rerecovery timeout elapsed. Recovery reactivated. \n " ) ) ;
2007-07-04 02:36:59 +04:00
2007-05-04 09:21:40 +04:00
return 0 ;
2007-05-04 03:45:53 +04:00
}
2007-05-04 02:30:18 +04:00
2007-05-06 22:41:12 +04:00
2007-06-07 13:17:27 +04:00
/*
  payload of an election message

  elections are won by first checking the number of connected nodes, then
  the priority time, then the pnn (see ctdb_election_win)
*/
struct election_message {
	/* how many nodes the sender sees as connected */
	uint32_t num_connected;
	/* sender's priority time; the earlier one wins a tie on num_connected */
	struct timeval priority_time;
	/* node number of the sender */
	uint32_t pnn;
	/* sender's node flags; a node with NODE_FLAGS_BANNED set always loses */
	uint32_t node_flags;
};
2007-06-07 13:17:27 +04:00
/*
form this nodes election data
*/
static void ctdb_election_data ( struct ctdb_recoverd * rec , struct election_message * em )
{
int ret , i ;
struct ctdb_node_map * nodemap ;
struct ctdb_context * ctdb = rec - > ctdb ;
ZERO_STRUCTP ( em ) ;
2007-09-04 04:33:10 +04:00
em - > pnn = rec - > ctdb - > pnn ;
2007-06-07 13:17:27 +04:00
em - > priority_time = rec - > priority_time ;
2007-10-05 07:28:21 +04:00
em - > node_flags = rec - > node_flags ;
2007-06-07 13:17:27 +04:00
ret = ctdb_ctrl_getnodemap ( ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE , rec , & nodemap ) ;
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " unable to get election data \n " ) ) ;
2007-06-07 13:17:27 +04:00
return ;
}
for ( i = 0 ; i < nodemap - > num ; i + + ) {
if ( ! ( nodemap - > nodes [ i ] . flags & NODE_FLAGS_DISCONNECTED ) ) {
em - > num_connected + + ;
}
}
2008-05-06 07:56:56 +04:00
/* we shouldnt try to win this election if we cant be a recmaster */
if ( ( ctdb - > capabilities & CTDB_CAP_RECMASTER ) = = 0 ) {
em - > num_connected = 0 ;
em - > priority_time = timeval_current ( ) ;
}
2007-06-07 13:17:27 +04:00
talloc_free ( nodemap ) ;
}
/*
see if the given election data wins
*/
static bool ctdb_election_win ( struct ctdb_recoverd * rec , struct election_message * em )
{
struct election_message myem ;
2007-10-05 07:28:21 +04:00
int cmp = 0 ;
2007-06-07 13:17:27 +04:00
ctdb_election_data ( rec , & myem ) ;
2008-05-06 07:56:56 +04:00
/* we cant win if we dont have the recmaster capability */
if ( ( rec - > ctdb - > capabilities & CTDB_CAP_RECMASTER ) = = 0 ) {
return false ;
}
2007-10-11 00:16:36 +04:00
/* we cant win if we are banned */
if ( rec - > node_flags & NODE_FLAGS_BANNED ) {
2007-10-15 08:17:49 +04:00
return false ;
2007-10-11 00:16:36 +04:00
}
2007-10-05 07:28:21 +04:00
2007-10-11 00:16:36 +04:00
/* we will automatically win if the other node is banned */
if ( em - > node_flags & NODE_FLAGS_BANNED ) {
2007-10-15 08:17:49 +04:00
return true ;
2007-10-05 07:28:21 +04:00
}
2007-06-07 13:17:27 +04:00
/* try to use the most connected node */
2007-10-05 07:28:21 +04:00
if ( cmp = = 0 ) {
cmp = ( int ) myem . num_connected - ( int ) em - > num_connected ;
}
2007-06-07 13:17:27 +04:00
/* then the longest running node */
if ( cmp = = 0 ) {
2007-06-07 13:21:55 +04:00
cmp = timeval_compare ( & em - > priority_time , & myem . priority_time ) ;
2007-06-07 13:17:27 +04:00
}
if ( cmp = = 0 ) {
2007-09-04 04:33:10 +04:00
cmp = ( int ) myem . pnn - ( int ) em - > pnn ;
2007-06-07 13:17:27 +04:00
}
return cmp > 0 ;
}
2007-06-07 09:18:55 +04:00
/*
send out an election request
*/
2007-11-13 02:27:44 +03:00
static int send_election_request ( struct ctdb_recoverd * rec , uint32_t pnn )
2007-05-07 00:51:58 +04:00
{
int ret ;
TDB_DATA election_data ;
struct election_message emsg ;
uint64_t srvid ;
2007-06-07 12:37:27 +04:00
struct ctdb_context * ctdb = rec - > ctdb ;
2007-10-11 00:16:36 +04:00
2007-06-06 04:25:46 +04:00
srvid = CTDB_SRVID_RECOVERY ;
2007-05-07 00:51:58 +04:00
2007-06-07 13:17:27 +04:00
ctdb_election_data ( rec , & emsg ) ;
2007-05-07 00:51:58 +04:00
election_data . dsize = sizeof ( struct election_message ) ;
election_data . dptr = ( unsigned char * ) & emsg ;
/* first we assume we will win the election and set
recoverymaster to be ourself on the current node
*/
2007-09-04 04:33:10 +04:00
ret = ctdb_ctrl_setrecmaster ( ctdb , CONTROL_TIMEOUT ( ) , pnn , pnn ) ;
2007-05-07 00:51:58 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " failed to send recmaster election request \n " ) ) ;
2007-05-07 00:51:58 +04:00
return - 1 ;
}
/* send an election message to all active nodes */
ctdb_send_message ( ctdb , CTDB_BROADCAST_ALL , srvid , election_data ) ;
return 0 ;
}
2007-06-09 14:11:51 +04:00
/*
this function will unban all nodes in the cluster
*/
static void unban_all_nodes ( struct ctdb_context * ctdb )
{
int ret , i ;
struct ctdb_node_map * nodemap ;
TALLOC_CTX * tmp_ctx = talloc_new ( ctdb ) ;
ret = ctdb_ctrl_getnodemap ( ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE , tmp_ctx , & nodemap ) ;
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " failed to get nodemap to unban all nodes \n " ) ) ;
2007-06-09 14:11:51 +04:00
return ;
}
for ( i = 0 ; i < nodemap - > num ; i + + ) {
if ( ( ! ( nodemap - > nodes [ i ] . flags & NODE_FLAGS_DISCONNECTED ) )
& & ( nodemap - > nodes [ i ] . flags & NODE_FLAGS_BANNED ) ) {
2007-09-04 03:50:07 +04:00
ctdb_ctrl_modflags ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ i ] . pnn , 0 , NODE_FLAGS_BANNED ) ;
2007-06-09 14:11:51 +04:00
}
}
talloc_free ( tmp_ctx ) ;
}
2007-05-07 00:51:58 +04:00
2007-11-13 02:27:44 +03:00
/*
we think we are winning the election - send a broadcast election request
*/
static void election_send_request ( struct event_context * ev , struct timed_event * te , struct timeval t , void * p )
{
struct ctdb_recoverd * rec = talloc_get_type ( p , struct ctdb_recoverd ) ;
int ret ;
ret = send_election_request ( rec , ctdb_get_pnn ( rec - > ctdb ) ) ;
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( " Failed to send election request! \n " ) ) ;
2007-11-13 02:27:44 +03:00
}
talloc_free ( rec - > send_election_te ) ;
rec - > send_election_te = NULL ;
}
2008-04-01 08:34:54 +04:00
/*
  handler for memory dumps

  data must hold a struct rd_memdump_reply naming the pnn and srvid
  the dump should be sent back to. The dump is produced with
  ctdb_dump_memory() and returned with ctdb_send_message().

  All temporary allocations hang off tmp_ctx, which is released on
  every exit path.
 */
static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid, 
			     TDB_DATA data, void *private_data)
{
	TALLOC_CTX *tmp_ctx;
	TDB_DATA *dump;
	int ret;
	struct rd_memdump_reply *rd;

	/* validate the request before allocating anything */
	if (data.dsize != sizeof(struct rd_memdump_reply)) {
		DEBUG(DEBUG_ERR,(__location__ " Wrong size of return address. \n "));
		return;
	}
	rd = (struct rd_memdump_reply *)data.dptr;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);

	dump = talloc_zero(tmp_ctx, TDB_DATA);
	if (dump == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to allocate memory for memdump \n "));
		talloc_free(tmp_ctx);
		return;
	}

	ret = ctdb_dump_memory(ctdb, dump);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " ctdb_dump_memory() failed \n "));
		talloc_free(tmp_ctx);
		return;
	}

	DEBUG(DEBUG_ERR,(" recovery master memory dump \n "));

	ret = ctdb_send_message(ctdb, rd->pnn, rd->srvid, *dump);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(" Failed to send rd memdump reply message \n "));
	}

	talloc_free(tmp_ctx);
}
2007-05-07 00:51:58 +04:00
/*
handler for recovery master elections
*/
static void election_handler ( struct ctdb_context * ctdb , uint64_t srvid ,
2007-06-07 09:18:55 +04:00
TDB_DATA data , void * private_data )
2007-05-07 00:51:58 +04:00
{
2007-06-07 09:18:55 +04:00
struct ctdb_recoverd * rec = talloc_get_type ( private_data , struct ctdb_recoverd ) ;
2007-05-07 00:51:58 +04:00
int ret ;
struct election_message * em = ( struct election_message * ) data . dptr ;
TALLOC_CTX * mem_ctx ;
2007-11-13 02:27:44 +03:00
/* we got an election packet - update the timeout for the election */
talloc_free ( rec - > election_timeout ) ;
rec - > election_timeout = event_add_timed ( ctdb - > ev , ctdb ,
timeval_current_ofs ( ctdb - > tunable . election_timeout , 0 ) ,
ctdb_election_timeout , rec ) ;
2007-05-10 07:10:23 +04:00
mem_ctx = talloc_new ( ctdb ) ;
2007-06-07 13:17:27 +04:00
2007-05-07 00:51:58 +04:00
/* someone called an election. check their election data
and if we disagree and we would rather be the elected node ,
send a new election message to all other nodes
*/
2007-06-07 13:17:27 +04:00
if ( ctdb_election_win ( rec , em ) ) {
2007-11-13 02:27:44 +03:00
if ( ! rec - > send_election_te ) {
rec - > send_election_te = event_add_timed ( ctdb - > ev , rec ,
timeval_current_ofs ( 0 , 500000 ) ,
election_send_request , rec ) ;
2007-05-07 00:51:58 +04:00
}
talloc_free ( mem_ctx ) ;
2007-06-09 14:13:25 +04:00
/*unban_all_nodes(ctdb);*/
2007-05-07 00:51:58 +04:00
return ;
}
2007-11-13 02:27:44 +03:00
/* we didn't win */
talloc_free ( rec - > send_election_te ) ;
rec - > send_election_te = NULL ;
2007-05-07 00:51:58 +04:00
2007-05-23 08:35:19 +04:00
/* release the recmaster lock */
2007-09-04 04:33:10 +04:00
if ( em - > pnn ! = ctdb - > pnn & &
2007-06-03 04:29:14 +04:00
ctdb - > recovery_lock_fd ! = - 1 ) {
2007-06-02 05:36:42 +04:00
close ( ctdb - > recovery_lock_fd ) ;
ctdb - > recovery_lock_fd = - 1 ;
2007-06-09 14:11:51 +04:00
unban_all_nodes ( ctdb ) ;
2007-05-23 08:35:19 +04:00
}
2007-05-07 00:51:58 +04:00
/* ok, let that guy become recmaster then */
2007-09-04 04:33:10 +04:00
ret = ctdb_ctrl_setrecmaster ( ctdb , CONTROL_TIMEOUT ( ) , ctdb_get_pnn ( ctdb ) , em - > pnn ) ;
2007-05-07 00:51:58 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " failed to send recmaster election request " ) ) ;
2007-05-07 00:51:58 +04:00
talloc_free ( mem_ctx ) ;
return ;
}
2007-06-07 10:34:33 +04:00
/* release any bans */
2007-06-07 09:18:55 +04:00
rec - > last_culprit = ( uint32_t ) - 1 ;
talloc_free ( rec - > banned_nodes ) ;
2007-06-07 10:34:33 +04:00
rec - > banned_nodes = talloc_zero_array ( rec , struct ban_state * , ctdb - > num_nodes ) ;
2007-06-07 09:18:55 +04:00
CTDB_NO_MEMORY_FATAL ( ctdb , rec - > banned_nodes ) ;
2007-05-07 00:51:58 +04:00
talloc_free ( mem_ctx ) ;
return ;
}
2007-06-07 09:18:55 +04:00
/*
force the start of the election process
*/
2008-03-03 01:19:30 +03:00
static void force_election ( struct ctdb_recoverd * rec , uint32_t pnn ,
2007-06-07 12:37:27 +04:00
struct ctdb_node_map * nodemap )
2007-05-07 00:51:58 +04:00
{
int ret ;
2007-06-07 12:37:27 +04:00
struct ctdb_context * ctdb = rec - > ctdb ;
2007-05-10 03:48:14 +04:00
/* set all nodes to recovery mode to stop all internode traffic */
ret = set_recovery_mode ( ctdb , nodemap , CTDB_RECOVERY_ACTIVE ) ;
2008-07-07 02:50:12 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to set recovery mode to active on cluster \n " ) ) ;
2007-05-10 03:48:14 +04:00
return ;
}
2007-11-13 02:27:44 +03:00
talloc_free ( rec - > election_timeout ) ;
rec - > election_timeout = event_add_timed ( ctdb - > ev , ctdb ,
timeval_current_ofs ( ctdb - > tunable . election_timeout , 0 ) ,
ctdb_election_timeout , rec ) ;
ret = send_election_request ( rec , pnn ) ;
2007-05-07 00:51:58 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " failed to initiate recmaster election " ) ) ;
2007-05-07 00:51:58 +04:00
return ;
}
2007-05-26 08:01:08 +04:00
/* wait for a few seconds to collect all responses */
2007-11-13 02:27:44 +03:00
ctdb_wait_election ( rec ) ;
2007-06-07 09:18:55 +04:00
}
/*
  message handler invoked when a node reports a change of its flags

  data must hold exactly one struct ctdb_node_flag_change.
  private_data is the struct ctdb_recoverd of this daemon.

  The DISCONNECTED bit of the reported flags is always replaced by the
  locally known one. When the local node is recmaster and the cluster
  is in normal mode, a change of any NODE_FLAGS_DISABLED bit requests
  a takeover run.
 */
static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid, 
			    TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
	struct ctdb_node_map *nodemap = NULL;
	TALLOC_CTX *tmp_ctx;
	uint32_t changed_flags;
	int idx;
	int ret;

	if (data.dsize != sizeof(*c)) {
		DEBUG(DEBUG_ERR,(__location__ " Invalid data in ctdb_node_flag_change \n "));
		return;
	}

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);

	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_getnodemap failed in monitor_handler \n "));
		goto done;
	}

	/* locate the nodemap entry the message talks about */
	idx = 0;
	while (idx < nodemap->num && nodemap->nodes[idx].pnn != c->pnn) {
		idx++;
	}
	if (idx == nodemap->num) {
		DEBUG(DEBUG_CRIT,(__location__ " Flag change for non-existant node %u \n ", c->pnn));
		goto done;
	}

	/* computed from the flags as reported, before DISCONNECTED is overridden */
	changed_flags = c->old_flags ^ c->new_flags;

	/* Remote nodes are not allowed to change the DISCONNECTED flag.
	   It is maintained locally, depending on whether this node can
	   talk to the other node, so carry over our own view of it.
	*/
	c->new_flags &= ~NODE_FLAGS_DISCONNECTED;
	if (nodemap->nodes[idx].flags & NODE_FLAGS_DISCONNECTED) {
		c->new_flags |= NODE_FLAGS_DISCONNECTED;
	}

	if (nodemap->nodes[idx].flags != c->new_flags) {
		DEBUG(DEBUG_NOTICE,(" Node %u has changed flags - now 0x%x was 0x%x \n ", c->pnn, c->new_flags, c->old_flags));
	}

	nodemap->nodes[idx].flags = c->new_flags;

	/* refresh our idea of the recmaster and, if that worked, the recmode */
	ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), 
				     CTDB_CURRENT_NODE, &ctdb->recovery_master);
	if (ret != 0) {
		goto done;
	}

	ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(), 
				   CTDB_CURRENT_NODE, &ctdb->recovery_mode);
	if (ret != 0) {
		goto done;
	}

	if (ctdb->recovery_master != ctdb->pnn ||
	    ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
		goto done;
	}

	/* Only a change of the perm disabled or unhealthy flags asks
	   for a takeover run here: those lead to an ip failover without
	   a recovery. A node becoming disconnected or banned also moves
	   ip addresses, but that case is dealt with during recovery.
	*/
	if (changed_flags & NODE_FLAGS_DISABLED) {
		rec->need_takeover_run = true;
	}

done:
	talloc_free(tmp_ctx);
}
2007-05-06 22:41:12 +04:00
2007-06-07 09:18:55 +04:00
2007-08-23 07:48:39 +04:00
2007-08-27 03:40:10 +04:00
/*
  state shared between verify_recmode() and its per-node callback
 */
struct verify_recmode_normal_data {
	uint32_t count;			/* getrecmode controls still waiting for a reply */
	enum monitor_result status;	/* starts as MONITOR_OK, changed by the callback on failure */
};
static void verify_recmode_normal_callback ( struct ctdb_client_control_state * state )
{
2007-09-26 08:25:32 +04:00
struct verify_recmode_normal_data * rmdata = talloc_get_type ( state - > async . private_data , struct verify_recmode_normal_data ) ;
2007-08-27 03:40:10 +04:00
/* one more node has responded with recmode data*/
rmdata - > count - - ;
/* if we failed to get the recmode, then return an error and let
the main loop try again .
*/
if ( state - > state ! = CTDB_CONTROL_DONE ) {
if ( rmdata - > status = = MONITOR_OK ) {
rmdata - > status = MONITOR_FAILED ;
}
return ;
}
/* if we got a response, then the recmode will be stored in the
status field
*/
if ( state - > status ! = CTDB_RECOVERY_NORMAL ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( __location__ " Node:%u was in recovery mode. Restart recovery process \n " , state - > c - > hdr . destnode ) ) ;
2007-08-27 03:40:10 +04:00
rmdata - > status = MONITOR_RECOVERY_NEEDED ;
}
return ;
}
/* verify that all nodes are in normal recovery mode */
2007-08-23 13:27:09 +04:00
static enum monitor_result verify_recmode ( struct ctdb_context * ctdb , struct ctdb_node_map * nodemap )
2007-08-23 07:48:39 +04:00
{
2007-08-27 03:40:10 +04:00
struct verify_recmode_normal_data * rmdata ;
2007-08-23 13:27:09 +04:00
TALLOC_CTX * mem_ctx = talloc_new ( ctdb ) ;
2007-08-27 03:40:10 +04:00
struct ctdb_client_control_state * state ;
enum monitor_result status ;
int j ;
2007-08-23 07:48:39 +04:00
2007-08-27 03:40:10 +04:00
rmdata = talloc ( mem_ctx , struct verify_recmode_normal_data ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , rmdata ) ;
rmdata - > count = 0 ;
rmdata - > status = MONITOR_OK ;
2007-08-23 07:48:39 +04:00
/* loop over all active nodes and send an async getrecmode call to
them */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
continue ;
}
2007-08-27 03:40:10 +04:00
state = ctdb_ctrl_getrecmode_send ( ctdb , mem_ctx ,
2007-08-23 07:48:39 +04:00
CONTROL_TIMEOUT ( ) ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn ) ;
2007-08-27 03:40:10 +04:00
if ( state = = NULL ) {
/* we failed to send the control, treat this as
an error and try again next iteration
*/
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( " Failed to call ctdb_ctrl_getrecmode_send during monitoring \n " ) ) ;
2007-08-23 13:27:09 +04:00
talloc_free ( mem_ctx ) ;
2007-08-23 07:48:39 +04:00
return MONITOR_FAILED ;
}
2007-08-23 13:27:09 +04:00
2007-08-27 03:40:10 +04:00
/* set up the callback functions */
state - > async . fn = verify_recmode_normal_callback ;
2007-09-26 08:25:32 +04:00
state - > async . private_data = rmdata ;
2007-08-27 03:40:10 +04:00
/* one more control to wait for to complete */
rmdata - > count + + ;
2007-08-23 07:48:39 +04:00
}
2007-08-27 03:40:10 +04:00
/* now wait for up to the maximum number of seconds allowed
or until all nodes we expect a response from has replied
*/
while ( rmdata - > count > 0 ) {
event_loop_once ( ctdb - > ev ) ;
}
status = rmdata - > status ;
2007-08-23 13:27:09 +04:00
talloc_free ( mem_ctx ) ;
2007-08-27 03:40:10 +04:00
return status ;
2007-08-23 07:48:39 +04:00
}
2007-08-27 03:40:10 +04:00
2007-08-23 13:27:09 +04:00
struct verify_recmaster_data {
2008-04-21 18:56:27 +04:00
struct ctdb_recoverd * rec ;
2007-08-23 13:27:09 +04:00
uint32_t count ;
2007-09-04 04:33:10 +04:00
uint32_t pnn ;
2007-08-23 13:27:09 +04:00
enum monitor_result status ;
} ;
2007-08-24 04:42:06 +04:00
static void verify_recmaster_callback ( struct ctdb_client_control_state * state )
2007-08-23 13:27:09 +04:00
{
2007-09-26 08:25:32 +04:00
struct verify_recmaster_data * rmdata = talloc_get_type ( state - > async . private_data , struct verify_recmaster_data ) ;
2007-08-23 13:27:09 +04:00
/* one more node has responded with recmaster data*/
rmdata - > count - - ;
/* if we failed to get the recmaster, then return an error and let
the main loop try again .
*/
2007-08-24 04:42:06 +04:00
if ( state - > state ! = CTDB_CONTROL_DONE ) {
2007-08-23 13:27:09 +04:00
if ( rmdata - > status = = MONITOR_OK ) {
rmdata - > status = MONITOR_FAILED ;
}
2007-08-24 04:42:06 +04:00
return ;
2007-08-23 13:27:09 +04:00
}
/* if we got a response, then the recmaster will be stored in the
status field
*/
2007-09-04 04:33:10 +04:00
if ( state - > status ! = rmdata - > pnn ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( " Node %d does not agree we are the recmaster. Need a new recmaster election \n " , state - > c - > hdr . destnode ) ) ;
2008-04-21 18:56:27 +04:00
ctdb_set_culprit ( rmdata - > rec , state - > c - > hdr . destnode ) ;
2007-08-23 13:27:09 +04:00
rmdata - > status = MONITOR_ELECTION_NEEDED ;
}
2007-08-24 04:42:06 +04:00
return ;
2007-08-23 13:27:09 +04:00
}
/* verify that all nodes agree that we are the recmaster */
2008-04-21 18:56:27 +04:00
static enum monitor_result verify_recmaster ( struct ctdb_recoverd * rec , struct ctdb_node_map * nodemap , uint32_t pnn )
2007-08-23 13:27:09 +04:00
{
2008-04-21 18:56:27 +04:00
struct ctdb_context * ctdb = rec - > ctdb ;
2007-08-23 13:27:09 +04:00
struct verify_recmaster_data * rmdata ;
TALLOC_CTX * mem_ctx = talloc_new ( ctdb ) ;
struct ctdb_client_control_state * state ;
enum monitor_result status ;
int j ;
rmdata = talloc ( mem_ctx , struct verify_recmaster_data ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , rmdata ) ;
2008-04-21 18:56:27 +04:00
rmdata - > rec = rec ;
2007-08-23 13:27:09 +04:00
rmdata - > count = 0 ;
2007-09-04 04:33:10 +04:00
rmdata - > pnn = pnn ;
2007-08-23 13:27:09 +04:00
rmdata - > status = MONITOR_OK ;
/* loop over all active nodes and send an async getrecmaster call to
them */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
continue ;
}
state = ctdb_ctrl_getrecmaster_send ( ctdb , mem_ctx ,
2007-08-23 13:38:54 +04:00
CONTROL_TIMEOUT ( ) ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn ) ;
2007-08-23 13:27:09 +04:00
if ( state = = NULL ) {
/* we failed to send the control, treat this as
an error and try again next iteration
*/
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( " Failed to call ctdb_ctrl_getrecmaster_send during monitoring \n " ) ) ;
2007-08-23 13:27:09 +04:00
talloc_free ( mem_ctx ) ;
return MONITOR_FAILED ;
}
2007-08-24 04:42:06 +04:00
/* set up the callback functions */
state - > async . fn = verify_recmaster_callback ;
2007-09-26 08:25:32 +04:00
state - > async . private_data = rmdata ;
2007-08-24 04:42:06 +04:00
2007-08-23 13:27:09 +04:00
/* one more control to wait for to complete */
rmdata - > count + + ;
}
/* now wait for up to the maximum number of seconds allowed
or until all nodes we expect a response from has replied
*/
2007-08-23 13:38:54 +04:00
while ( rmdata - > count > 0 ) {
2007-08-23 13:27:09 +04:00
event_loop_once ( ctdb - > ev ) ;
}
status = rmdata - > status ;
talloc_free ( mem_ctx ) ;
return status ;
}
2008-02-29 04:37:42 +03:00
/*
this function writes the number of connected nodes we have for this pnn
to the pnn slot in the reclock file
*/
static void
2008-02-29 05:14:47 +03:00
ctdb_recoverd_write_pnn_connect_count ( struct ctdb_recoverd * rec )
2008-02-29 04:37:42 +03:00
{
2008-03-03 02:24:17 +03:00
const char count = rec - > num_connected ;
2008-02-29 04:37:42 +03:00
struct ctdb_context * ctdb = talloc_get_type ( rec - > ctdb , struct ctdb_context ) ;
2008-05-06 07:27:17 +04:00
if ( rec - > rec_file_fd = = - 1 ) {
DEBUG ( DEBUG_CRIT , ( __location__ " Unable to write pnn count. pnnfile is not open. \n " ) ) ;
return ;
}
2008-02-29 04:37:42 +03:00
if ( pwrite ( rec - > rec_file_fd , & count , 1 , ctdb - > pnn ) = = - 1 ) {
DEBUG ( DEBUG_CRIT , ( __location__ " Failed to write pnn count \n " ) ) ;
2008-05-06 07:27:17 +04:00
close ( rec - > rec_file_fd ) ;
rec - > rec_file_fd = - 1 ;
2008-02-29 04:37:42 +03:00
}
}
/*
this function opens the reclock file and sets a byterage lock for the single
byte at position pnn + 1.
the existence / non - existence of such a lock provides an alternative mechanism
to know whether a remote node ( recovery daemon ) is running or not .
*/
static void
ctdb_recoverd_get_pnn_lock ( struct ctdb_recoverd * rec )
{
struct ctdb_context * ctdb = talloc_get_type ( rec - > ctdb , struct ctdb_context ) ;
struct flock lock ;
char * pnnfile = NULL ;
DEBUG ( DEBUG_INFO , ( " Setting PNN lock for pnn:%d \n " , ctdb - > pnn ) ) ;
if ( rec - > rec_file_fd ! = - 1 ) {
2008-05-06 07:27:17 +04:00
close ( rec - > rec_file_fd ) ;
rec - > rec_file_fd = - 1 ;
2008-02-29 04:37:42 +03:00
}
pnnfile = talloc_asprintf ( rec , " %s.pnn " , ctdb - > recovery_lock_file ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , pnnfile ) ;
rec - > rec_file_fd = open ( pnnfile , O_RDWR | O_CREAT , 0600 ) ;
if ( rec - > rec_file_fd = = - 1 ) {
DEBUG ( DEBUG_CRIT , ( __location__ " Unable to open %s - (%s) \n " ,
pnnfile , strerror ( errno ) ) ) ;
2008-05-06 07:27:17 +04:00
talloc_free ( pnnfile ) ;
return ;
2008-02-29 04:37:42 +03:00
}
set_close_on_exec ( rec - > rec_file_fd ) ;
lock . l_type = F_WRLCK ;
lock . l_whence = SEEK_SET ;
lock . l_start = ctdb - > pnn ;
lock . l_len = 1 ;
lock . l_pid = 0 ;
if ( fcntl ( rec - > rec_file_fd , F_SETLK , & lock ) ! = 0 ) {
close ( rec - > rec_file_fd ) ;
rec - > rec_file_fd = - 1 ;
DEBUG ( DEBUG_CRIT , ( __location__ " Failed to get pnn lock on '%s' \n " , pnnfile ) ) ;
2008-05-06 07:27:17 +04:00
talloc_free ( pnnfile ) ;
return ;
2008-02-29 04:37:42 +03:00
}
DEBUG ( DEBUG_NOTICE , ( __location__ " Got pnn lock on '%s' \n " , pnnfile ) ) ;
talloc_free ( pnnfile ) ;
/* we start out with 0 connected nodes */
2008-02-29 05:14:47 +03:00
ctdb_recoverd_write_pnn_connect_count ( rec ) ;
}
/*
called when we need to do the periodical reclock pnn count update
*/
static void ctdb_update_pnn_count ( struct event_context * ev , struct timed_event * te ,
struct timeval t , void * p )
{
2008-03-03 01:19:30 +03:00
int i , count ;
struct ctdb_recoverd * rec = talloc_get_type ( p , struct ctdb_recoverd ) ;
struct ctdb_context * ctdb = rec - > ctdb ;
struct ctdb_node_map * nodemap = rec - > nodemap ;
2008-02-29 05:14:47 +03:00
2008-05-06 07:27:17 +04:00
/* close and reopen the pnn lock file */
ctdb_recoverd_get_pnn_lock ( rec ) ;
2008-02-29 05:14:47 +03:00
ctdb_recoverd_write_pnn_connect_count ( rec ) ;
2008-03-03 01:19:30 +03:00
event_add_timed ( rec - > ctdb - > ev , rec - > ctdb ,
timeval_current_ofs ( ctdb - > tunable . reclock_ping_period , 0 ) ,
ctdb_update_pnn_count , rec ) ;
/* check if there is a split cluster and yeld the recmaster role
it the other half of the cluster is larger
*/
DEBUG ( DEBUG_DEBUG , ( " CHECK FOR SPLIT CLUSTER \n " ) ) ;
if ( rec - > nodemap = = NULL ) {
return ;
}
if ( rec - > rec_file_fd = = - 1 ) {
return ;
}
/* only test this if we think we are the recmaster */
if ( ctdb - > pnn ! = rec - > recmaster ) {
DEBUG ( DEBUG_DEBUG , ( " We are not recmaster, skip test \n " ) ) ;
return ;
}
if ( ctdb - > recovery_lock_fd = = - 1 ) {
2008-05-06 07:27:17 +04:00
DEBUG ( DEBUG_ERR , ( __location__ " Lost reclock pnn file. Yielding recmaster role \n " ) ) ;
close ( ctdb - > recovery_lock_fd ) ;
ctdb - > recovery_lock_fd = - 1 ;
force_election ( rec , ctdb - > pnn , rec - > nodemap ) ;
2008-03-03 01:19:30 +03:00
return ;
}
for ( i = 0 ; i < nodemap - > num ; i + + ) {
/* we dont need to check ourself */
if ( nodemap - > nodes [ i ] . pnn = = ctdb - > pnn ) {
continue ;
}
/* dont check nodes that are connected to us */
if ( ! ( nodemap - > nodes [ i ] . flags & NODE_FLAGS_DISCONNECTED ) ) {
continue ;
}
/* check if the node is "connected" and how connected it it */
count = ctdb_read_pnn_lock ( rec - > rec_file_fd , nodemap - > nodes [ i ] . pnn ) ;
if ( count < 0 ) {
continue ;
}
/* check if that node is more connected that us */
2008-03-03 02:24:17 +03:00
if ( count > rec - > num_connected ) {
2008-03-03 01:19:30 +03:00
DEBUG ( DEBUG_ERR , ( " DISCONNECTED Node %u is more connected than we are, yielding recmaster role \n " , nodemap - > nodes [ i ] . pnn ) ) ;
close ( ctdb - > recovery_lock_fd ) ;
ctdb - > recovery_lock_fd = - 1 ;
force_election ( rec , ctdb - > pnn , rec - > nodemap ) ;
return ;
}
}
2008-02-29 04:37:42 +03:00
}
2007-06-07 09:18:55 +04:00
2008-07-02 07:55:59 +04:00
/* called to check that the allocation of public ip addresses is ok.
*/
static int verify_ip_allocation ( struct ctdb_context * ctdb , uint32_t pnn )
{
TALLOC_CTX * mem_ctx = talloc_new ( NULL ) ;
struct ctdb_all_public_ips * ips = NULL ;
struct ctdb_uptime * uptime1 = NULL ;
struct ctdb_uptime * uptime2 = NULL ;
int ret , j ;
ret = ctdb_ctrl_uptime ( ctdb , ctdb , CONTROL_TIMEOUT ( ) ,
CTDB_CURRENT_NODE , & uptime1 ) ;
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( " Unable to get uptime from local node %u \n " , pnn ) ) ;
talloc_free ( mem_ctx ) ;
return - 1 ;
}
/* read the ip allocation from the local node */
ret = ctdb_ctrl_get_public_ips ( ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE , mem_ctx , & ips ) ;
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( " Unable to get public ips from local node %u \n " , pnn ) ) ;
talloc_free ( mem_ctx ) ;
return - 1 ;
}
ret = ctdb_ctrl_uptime ( ctdb , ctdb , CONTROL_TIMEOUT ( ) ,
CTDB_CURRENT_NODE , & uptime2 ) ;
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( " Unable to get uptime from local node %u \n " , pnn ) ) ;
talloc_free ( mem_ctx ) ;
return - 1 ;
}
/* skip the check if the startrecovery time has changed */
if ( timeval_compare ( & uptime1 - > last_recovery_started ,
& uptime2 - > last_recovery_started ) ! = 0 ) {
DEBUG ( DEBUG_NOTICE , ( __location__ " last recovery time changed while we read the public ip list. skipping public ip address check \n " ) ) ;
return 0 ;
}
/* skip the check if the endrecovery time has changed */
if ( timeval_compare ( & uptime1 - > last_recovery_finished ,
& uptime2 - > last_recovery_finished ) ! = 0 ) {
DEBUG ( DEBUG_NOTICE , ( __location__ " last recovery time changed while we read the public ip list. skipping public ip address check \n " ) ) ;
return 0 ;
}
/* skip the check if we have started but not finished recovery */
if ( timeval_compare ( & uptime1 - > last_recovery_finished ,
& uptime1 - > last_recovery_started ) ! = 1 ) {
DEBUG ( DEBUG_NOTICE , ( __location__ " in the middle of recovery. skipping public ip address check \n " ) ) ;
return 0 ;
}
/* verify that we have the ip addresses we should have
and we dont have ones we shouldnt have .
if we find an inconsistency we set recmode to
active on the local node and wait for the recmaster
to do a full blown recovery
*/
for ( j = 0 ; j < ips - > num ; j + + ) {
if ( ips - > ips [ j ] . pnn = = pnn ) {
if ( ! ctdb_sys_have_ip ( ips - > ips [ j ] . sin ) ) {
DEBUG ( DEBUG_CRIT , ( " Public address '%s' is missing and we should serve this ip \n " , inet_ntoa ( ips - > ips [ j ] . sin . sin_addr ) ) ) ;
ret = ctdb_ctrl_freeze ( ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE ) ;
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " Failed to freeze node due to public ip address mismatches \n " ) ) ;
talloc_free ( mem_ctx ) ;
return - 1 ;
}
ret = ctdb_ctrl_setrecmode ( ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE , CTDB_RECOVERY_ACTIVE ) ;
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " Failed to activate recovery mode due to public ip address mismatches \n " ) ) ;
talloc_free ( mem_ctx ) ;
return - 1 ;
}
}
} else {
if ( ctdb_sys_have_ip ( ips - > ips [ j ] . sin ) ) {
DEBUG ( DEBUG_CRIT , ( " We are still serving a public address '%s' that we should not be serving. \n " , inet_ntoa ( ips - > ips [ j ] . sin . sin_addr ) ) ) ;
ret = ctdb_ctrl_freeze ( ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE ) ;
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " Failed to freeze node due to public ip address mismatches \n " ) ) ;
talloc_free ( mem_ctx ) ;
return - 1 ;
}
ret = ctdb_ctrl_setrecmode ( ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE , CTDB_RECOVERY_ACTIVE ) ;
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " Failed to activate recovery mode due to public ip address mismatches \n " ) ) ;
talloc_free ( mem_ctx ) ;
return - 1 ;
}
}
}
}
talloc_free ( mem_ctx ) ;
return 0 ;
}
2007-06-04 14:22:44 +04:00
/*
the main monitoring loop
*/
2007-06-05 11:57:07 +04:00
static void monitor_cluster ( struct ctdb_context * ctdb )
2007-05-04 02:30:18 +04:00
{
2008-03-02 23:53:46 +03:00
uint32_t pnn ;
2007-05-04 02:30:18 +04:00
TALLOC_CTX * mem_ctx = NULL ;
2007-05-04 03:01:01 +04:00
struct ctdb_node_map * nodemap = NULL ;
2007-05-04 03:45:53 +04:00
struct ctdb_node_map * remote_nodemap = NULL ;
struct ctdb_vnn_map * vnnmap = NULL ;
struct ctdb_vnn_map * remote_vnnmap = NULL ;
2008-02-18 11:38:04 +03:00
int32_t debug_level ;
2007-05-04 03:45:53 +04:00
int i , j , ret ;
2007-06-07 09:18:55 +04:00
struct ctdb_recoverd * rec ;
2007-10-05 07:28:21 +04:00
char c ;
2007-06-07 09:18:55 +04:00
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " monitor_cluster starting \n " ) ) ;
2007-10-18 10:27:36 +04:00
2007-06-07 09:18:55 +04:00
rec = talloc_zero ( ctdb , struct ctdb_recoverd ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , rec ) ;
rec - > ctdb = ctdb ;
2007-06-07 10:34:33 +04:00
rec - > banned_nodes = talloc_zero_array ( rec , struct ban_state * , ctdb - > num_nodes ) ;
2007-06-07 09:18:55 +04:00
CTDB_NO_MEMORY_FATAL ( ctdb , rec - > banned_nodes ) ;
2007-06-07 12:37:27 +04:00
rec - > priority_time = timeval_current ( ) ;
2008-02-29 04:37:42 +03:00
/* open the rec file fd and lock our slot */
rec - > rec_file_fd = - 1 ;
ctdb_recoverd_get_pnn_lock ( rec ) ;
2008-04-01 08:34:54 +04:00
/* register a message port for sending memory dumps */
ctdb_set_message_handler ( ctdb , CTDB_SRVID_MEM_DUMP , mem_dump_handler , rec ) ;
2007-06-07 09:18:55 +04:00
/* register a message port for recovery elections */
ctdb_set_message_handler ( ctdb , CTDB_SRVID_RECOVERY , election_handler , rec ) ;
/* and one for when nodes are disabled/enabled */
ctdb_set_message_handler ( ctdb , CTDB_SRVID_NODE_FLAGS_CHANGED , monitor_handler , rec ) ;
2007-06-07 10:34:33 +04:00
/* and one for when nodes are banned */
ctdb_set_message_handler ( ctdb , CTDB_SRVID_BAN_NODE , ban_handler , rec ) ;
/* and one for when nodes are unbanned */
ctdb_set_message_handler ( ctdb , CTDB_SRVID_UNBAN_NODE , unban_handler , rec ) ;
2008-01-08 09:23:27 +03:00
/* register a message port for vacuum fetch */
ctdb_set_message_handler ( ctdb , CTDB_SRVID_VACUUM_FETCH , vacuum_fetch_handler , rec ) ;
2008-02-29 05:14:47 +03:00
/* update the reclock pnn file connected count on a regular basis */
2008-03-03 01:19:30 +03:00
event_add_timed ( ctdb - > ev , ctdb ,
timeval_current_ofs ( ctdb - > tunable . reclock_ping_period , 0 ) ,
ctdb_update_pnn_count , rec ) ;
2008-02-29 05:14:47 +03:00
2007-05-04 02:30:18 +04:00
again :
if ( mem_ctx ) {
talloc_free ( mem_ctx ) ;
mem_ctx = NULL ;
}
mem_ctx = talloc_new ( ctdb ) ;
if ( ! mem_ctx ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_CRIT , ( __location__ " Failed to create temporary context \n " ) ) ;
2007-05-04 02:30:18 +04:00
exit ( - 1 ) ;
}
/* we only check for recovery once every second */
2007-06-07 09:18:55 +04:00
ctdb_wait_timeout ( ctdb , ctdb - > tunable . recover_interval ) ;
2007-06-04 14:22:44 +04:00
2008-01-07 08:17:22 +03:00
/* verify that the main daemon is still running */
if ( kill ( ctdb - > ctdbd_pid , 0 ) ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_CRIT , ( " CTDB daemon is no longer available. Shutting down recovery daemon \n " ) ) ;
2008-01-07 08:17:22 +03:00
exit ( - 1 ) ;
}
2007-11-13 02:27:44 +03:00
if ( rec - > election_timeout ) {
/* an election is in progress */
goto again ;
}
2008-02-18 11:38:04 +03:00
/* read the debug level from the parent and update locally */
ret = ctdb_ctrl_get_debuglevel ( ctdb , CTDB_CURRENT_NODE , & debug_level ) ;
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " Failed to read debuglevel from parent \n " ) ) ;
goto again ;
}
LogLevel = debug_level ;
2007-11-28 07:04:20 +03:00
/* We must check if we need to ban a node here but we want to do this
as early as possible so we dont wait until we have pulled the node
map from the local node . thats why we have the hardcoded value 20
*/
if ( rec - > culprit_counter > 20 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " Node %u has caused %u failures in %.0f seconds - banning it for %u seconds \n " ,
2007-11-28 07:04:20 +03:00
rec - > last_culprit , rec - > culprit_counter , timeval_elapsed ( & rec - > first_recover_time ) ,
ctdb - > tunable . recovery_ban_period ) ) ;
ctdb_ban_node ( rec , rec - > last_culprit , ctdb - > tunable . recovery_ban_period ) ;
}
2007-06-04 14:22:44 +04:00
/* get relevant tunables */
2007-06-07 12:05:25 +04:00
ret = ctdb_ctrl_get_all_tunables ( ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE , & ctdb - > tunable ) ;
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( " Failed to get tunables - retrying \n " ) ) ;
2007-06-07 12:05:25 +04:00
goto again ;
}
2007-05-04 02:30:18 +04:00
2007-09-04 04:38:48 +04:00
pnn = ctdb_ctrl_getpnn ( ctdb , CONTROL_TIMEOUT ( ) , CTDB_CURRENT_NODE ) ;
2007-09-04 04:33:10 +04:00
if ( pnn = = ( uint32_t ) - 1 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( " Failed to get local pnn - retrying \n " ) ) ;
2007-05-23 08:35:19 +04:00
goto again ;
}
2007-05-04 02:30:18 +04:00
2007-05-06 22:41:12 +04:00
/* get the vnnmap */
2007-09-04 04:33:10 +04:00
ret = ctdb_ctrl_getvnnmap ( ctdb , CONTROL_TIMEOUT ( ) , pnn , mem_ctx , & vnnmap ) ;
2007-05-06 22:41:12 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to get vnnmap from node %u \n " , pnn ) ) ;
2007-05-06 22:41:12 +04:00
goto again ;
}
2007-05-04 02:30:18 +04:00
/* get number of nodes */
2008-03-03 01:19:30 +03:00
if ( rec - > nodemap ) {
talloc_free ( rec - > nodemap ) ;
rec - > nodemap = NULL ;
nodemap = NULL ;
}
ret = ctdb_ctrl_getnodemap ( ctdb , CONTROL_TIMEOUT ( ) , pnn , rec , & rec - > nodemap ) ;
2007-05-04 03:01:01 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to get nodemap from node %u \n " , pnn ) ) ;
2007-05-04 03:01:01 +04:00
goto again ;
}
2008-03-03 01:19:30 +03:00
nodemap = rec - > nodemap ;
2007-05-04 02:30:18 +04:00
2008-01-04 04:11:29 +03:00
/* check which node is the recovery master */
2008-03-02 23:53:46 +03:00
ret = ctdb_ctrl_getrecmaster ( ctdb , mem_ctx , CONTROL_TIMEOUT ( ) , pnn , & rec - > recmaster ) ;
2008-01-04 04:11:29 +03:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to get recmaster from node %u \n " , pnn ) ) ;
2008-01-04 04:11:29 +03:00
goto again ;
}
2008-03-02 23:53:46 +03:00
if ( rec - > recmaster = = ( uint32_t ) - 1 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( __location__ " Initial recovery master set - forcing election \n " ) ) ;
2008-03-03 01:19:30 +03:00
force_election ( rec , pnn , nodemap ) ;
2008-01-04 04:11:29 +03:00
goto again ;
}
2007-11-23 04:41:29 +03:00
/* check that we (recovery daemon) and the local ctdb daemon
agrees on whether we are banned or not
*/
if ( nodemap - > nodes [ pnn ] . flags & NODE_FLAGS_BANNED ) {
if ( rec - > banned_nodes [ pnn ] = = NULL ) {
2008-03-02 23:53:46 +03:00
if ( rec - > recmaster = = pnn ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " Local ctdb daemon on recmaster thinks this node is BANNED but the recovery master disagrees. Unbanning the node \n " ) ) ;
2008-01-04 04:11:29 +03:00
ctdb_unban_node ( rec , pnn ) ;
} else {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " Local ctdb daemon on non-recmaster thinks this node is BANNED but the recovery master disagrees. Re-banning the node \n " ) ) ;
2008-01-04 04:11:29 +03:00
ctdb_ban_node ( rec , pnn , ctdb - > tunable . recovery_ban_period ) ;
ctdb_set_culprit ( rec , pnn ) ;
}
2007-11-23 04:41:29 +03:00
goto again ;
}
} else {
if ( rec - > banned_nodes [ pnn ] ! = NULL ) {
2008-03-02 23:53:46 +03:00
if ( rec - > recmaster = = pnn ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " Local ctdb daemon on recmaster does not think this node is BANNED but the recovery master disagrees. Unbanning the node \n " ) ) ;
2007-11-23 04:41:29 +03:00
2008-01-04 04:11:29 +03:00
ctdb_unban_node ( rec , pnn ) ;
} else {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " Local ctdb daemon on non-recmaster does not think this node is BANNED but the recovery master disagrees. Re-banning the node \n " ) ) ;
2007-11-23 04:41:29 +03:00
2008-01-04 04:11:29 +03:00
ctdb_ban_node ( rec , pnn , ctdb - > tunable . recovery_ban_period ) ;
ctdb_set_culprit ( rec , pnn ) ;
}
2007-11-23 04:41:29 +03:00
goto again ;
}
}
2007-10-05 07:28:21 +04:00
/* remember our own node flags */
rec - > node_flags = nodemap - > nodes [ pnn ] . flags ;
2007-05-06 22:41:12 +04:00
2007-05-04 03:45:53 +04:00
/* count how many active nodes there are */
2008-03-03 02:24:17 +03:00
rec - > num_active = 0 ;
rec - > num_connected = 0 ;
2007-05-04 03:45:53 +04:00
for ( i = 0 ; i < nodemap - > num ; i + + ) {
2007-06-07 09:18:55 +04:00
if ( ! ( nodemap - > nodes [ i ] . flags & NODE_FLAGS_INACTIVE ) ) {
2008-02-29 04:55:20 +03:00
rec - > num_active + + ;
2007-05-04 03:45:53 +04:00
}
2008-03-03 02:24:17 +03:00
if ( ! ( nodemap - > nodes [ i ] . flags & NODE_FLAGS_DISCONNECTED ) ) {
rec - > num_connected + + ;
}
2007-05-04 03:45:53 +04:00
}
2007-05-07 00:51:58 +04:00
/* verify that the recmaster node is still active */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
2008-03-02 23:53:46 +03:00
if ( nodemap - > nodes [ j ] . pnn = = rec - > recmaster ) {
2007-05-07 00:51:58 +04:00
break ;
}
2007-05-10 07:10:23 +04:00
}
2007-05-23 08:35:19 +04:00
if ( j = = nodemap - > num ) {
2008-03-02 23:53:46 +03:00
DEBUG ( DEBUG_ERR , ( " Recmaster node %u not in list. Force reelection \n " , rec - > recmaster ) ) ;
2008-03-03 01:19:30 +03:00
force_election ( rec , pnn , nodemap ) ;
2007-05-23 08:35:19 +04:00
goto again ;
}
2007-10-11 01:10:17 +04:00
/* if recovery master is disconnected we must elect a new recmaster */
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_DISCONNECTED ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " Recmaster node %u is disconnected. Force reelection \n " , nodemap - > nodes [ j ] . pnn ) ) ;
2008-03-03 01:19:30 +03:00
force_election ( rec , pnn , nodemap ) ;
2007-10-11 01:10:17 +04:00
goto again ;
}
2007-10-15 08:17:49 +04:00
/* grap the nodemap from the recovery master to check if it is banned */
2007-10-11 00:16:36 +04:00
ret = ctdb_ctrl_getnodemap ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ j ] . pnn ,
mem_ctx , & remote_nodemap ) ;
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to get nodemap from recovery master %u \n " ,
2007-10-11 00:16:36 +04:00
nodemap - > nodes [ j ] . pnn ) ) ;
goto again ;
}
if ( remote_nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " Recmaster node %u no longer available. Force reelection \n " , nodemap - > nodes [ j ] . pnn ) ) ;
2008-03-03 01:19:30 +03:00
force_election ( rec , pnn , nodemap ) ;
2007-05-07 00:51:58 +04:00
goto again ;
}
2007-09-14 04:16:36 +04:00
2008-06-26 05:08:09 +04:00
/* verify that we and the recmaster agrees on our flags */
if ( nodemap - > nodes [ pnn ] . flags ! = remote_nodemap - > nodes [ pnn ] . flags ) {
DEBUG ( DEBUG_ERR , ( __location__ " Recmaster disagrees on our flags flags:0x%x recmaster_flags:0x%x Broadcasting out flags. \n " , nodemap - > nodes [ pnn ] . flags , remote_nodemap - > nodes [ pnn ] . flags ) ) ;
update_our_flags_on_all_nodes ( ctdb , pnn , nodemap ) ;
}
2008-07-02 07:55:59 +04:00
/* verify that we have all ip addresses we should have and we dont
* have addresses we shouldnt have .
*/
if ( verify_ip_allocation ( ctdb , pnn ) ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " Public IPs were inconsistent. \n " ) ) ;
2008-05-15 07:28:19 +04:00
goto again ;
}
2008-07-02 07:55:59 +04:00
2007-05-07 00:51:58 +04:00
/* if we are not the recmaster then we do not need to check
if recovery is needed
*/
2008-03-02 23:53:46 +03:00
if ( pnn ! = rec - > recmaster ) {
2007-05-07 00:51:58 +04:00
goto again ;
}
2007-10-11 00:16:36 +04:00
2007-10-15 08:28:51 +04:00
/* ensure our local copies of flags are right */
2007-11-30 00:44:34 +03:00
ret = update_local_flags ( rec , nodemap ) ;
2007-11-23 03:53:06 +03:00
if ( ret = = MONITOR_ELECTION_NEEDED ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " update_local_flags() called for a re-election. \n " ) ) ;
2008-03-03 01:19:30 +03:00
force_election ( rec , pnn , nodemap ) ;
2007-11-23 03:53:06 +03:00
goto again ;
}
if ( ret ! = MONITOR_OK ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( " Unable to update local flags \n " ) ) ;
2007-10-15 08:28:51 +04:00
goto again ;
2007-10-11 00:16:36 +04:00
}
2007-09-04 17:15:23 +04:00
/* update the list of public ips that a node can handle for
all connected nodes
*/
for ( j = 0 ; j < nodemap - > num ; j + + ) {
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
continue ;
}
/* release any existing data */
if ( ctdb - > nodes [ j ] - > public_ips ) {
talloc_free ( ctdb - > nodes [ j ] - > public_ips ) ;
ctdb - > nodes [ j ] - > public_ips = NULL ;
}
/* grab a new shiny list of public ips from the node */
if ( ctdb_ctrl_get_public_ips ( ctdb , CONTROL_TIMEOUT ( ) ,
ctdb - > nodes [ j ] - > pnn ,
ctdb - > nodes ,
& ctdb - > nodes [ j ] - > public_ips ) ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( " Failed to read public ips from node : %u \n " ,
2007-09-04 17:15:23 +04:00
ctdb - > nodes [ j ] - > pnn ) ) ;
goto again ;
}
}
2007-05-07 00:51:58 +04:00
/* verify that all active nodes agree that we are the recmaster */
2008-04-21 18:56:27 +04:00
switch ( verify_recmaster ( rec , nodemap , pnn ) ) {
2007-08-23 13:27:09 +04:00
case MONITOR_RECOVERY_NEEDED :
/* can not happen */
goto again ;
case MONITOR_ELECTION_NEEDED :
2008-03-03 01:19:30 +03:00
force_election ( rec , pnn , nodemap ) ;
2007-08-23 13:27:09 +04:00
goto again ;
case MONITOR_OK :
break ;
case MONITOR_FAILED :
goto again ;
2007-05-07 00:51:58 +04:00
}
2007-09-14 03:49:12 +04:00
if ( rec - > need_recovery ) {
/* a previous recovery didn't finish */
2008-06-13 05:47:42 +04:00
do_recovery ( rec , mem_ctx , pnn , nodemap , vnnmap , - 1 ) ;
2007-09-14 03:49:12 +04:00
goto again ;
}
2007-05-06 22:41:12 +04:00
/* verify that all active nodes are in normal mode
and not in recovery mode
*/
2007-08-23 13:27:09 +04:00
switch ( verify_recmode ( ctdb , nodemap ) ) {
2007-08-23 07:48:39 +04:00
case MONITOR_RECOVERY_NEEDED :
2008-02-29 04:55:20 +03:00
do_recovery ( rec , mem_ctx , pnn , nodemap , vnnmap , ctdb - > pnn ) ;
2007-08-23 07:48:39 +04:00
goto again ;
case MONITOR_FAILED :
goto again ;
2007-08-23 13:27:09 +04:00
case MONITOR_ELECTION_NEEDED :
/* can not happen */
2007-08-23 07:48:39 +04:00
case MONITOR_OK :
break ;
2007-05-06 22:41:12 +04:00
}
2007-10-05 07:28:21 +04:00
/* we should have the reclock - check its not stale */
if ( ctdb - > recovery_lock_fd = = - 1 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_CRIT , ( " recovery master doesn't have the recovery lock \n " ) ) ;
2008-02-29 04:55:20 +03:00
do_recovery ( rec , mem_ctx , pnn , nodemap , vnnmap , ctdb - > pnn ) ;
2007-10-05 07:28:21 +04:00
goto again ;
}
2008-02-29 02:03:39 +03:00
if ( pread ( ctdb - > recovery_lock_fd , & c , 1 , 0 ) = = - 1 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_CRIT , ( " failed read from recovery_lock_fd - %s \n " , strerror ( errno ) ) ) ;
2007-10-05 07:28:21 +04:00
close ( ctdb - > recovery_lock_fd ) ;
ctdb - > recovery_lock_fd = - 1 ;
2008-02-29 04:55:20 +03:00
do_recovery ( rec , mem_ctx , pnn , nodemap , vnnmap , ctdb - > pnn ) ;
2007-10-05 07:28:21 +04:00
goto again ;
}
2007-08-23 07:48:39 +04:00
2007-05-04 03:45:53 +04:00
/* get the nodemap for all active remote nodes and verify
they are the same as for this node
*/
for ( j = 0 ; j < nodemap - > num ; j + + ) {
2007-06-07 09:18:55 +04:00
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
2007-05-04 03:45:53 +04:00
continue ;
}
2007-09-04 04:33:10 +04:00
if ( nodemap - > nodes [ j ] . pnn = = pnn ) {
2007-05-04 03:45:53 +04:00
continue ;
}
2007-09-04 03:50:07 +04:00
ret = ctdb_ctrl_getnodemap ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ j ] . pnn ,
2007-06-07 12:39:37 +04:00
mem_ctx , & remote_nodemap ) ;
2007-05-04 03:45:53 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to get nodemap from remote node %u \n " ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn ) ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
/* if the nodes disagree on how many nodes there are
then this is a good reason to try recovery
*/
if ( remote_nodemap - > num ! = nodemap - > num ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Remote node:%u has different node count. %u vs %u of the local node \n " ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn , remote_nodemap - > num , nodemap - > num ) ) ;
2008-02-29 04:55:20 +03:00
do_recovery ( rec , mem_ctx , pnn , nodemap , vnnmap , nodemap - > nodes [ j ] . pnn ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
/* if the nodes disagree on which nodes exist and are
active , then that is also a good reason to do recovery
*/
for ( i = 0 ; i < nodemap - > num ; i + + ) {
2007-09-04 03:50:07 +04:00
if ( remote_nodemap - > nodes [ i ] . pnn ! = nodemap - > nodes [ i ] . pnn ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u). \n " ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn , i ,
remote_nodemap - > nodes [ i ] . pnn , nodemap - > nodes [ i ] . pnn ) ) ;
2008-02-29 04:55:20 +03:00
do_recovery ( rec , mem_ctx , pnn , nodemap ,
2007-09-04 03:50:07 +04:00
vnnmap , nodemap - > nodes [ j ] . pnn ) ;
2007-06-11 15:37:09 +04:00
goto again ;
}
2007-07-09 06:55:15 +04:00
if ( ( remote_nodemap - > nodes [ i ] . flags & NODE_FLAGS_INACTIVE ) ! =
( nodemap - > nodes [ i ] . flags & NODE_FLAGS_INACTIVE ) ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Remote node:%u has different nodemap flag for %d (0x%x vs 0x%x) \n " ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn , i ,
2007-06-11 15:37:09 +04:00
remote_nodemap - > nodes [ i ] . flags , nodemap - > nodes [ i ] . flags ) ) ;
2008-02-29 04:55:20 +03:00
do_recovery ( rec , mem_ctx , pnn , nodemap ,
2007-09-04 03:50:07 +04:00
vnnmap , nodemap - > nodes [ j ] . pnn ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
}
}
/* there better be the same number of lmasters in the vnn map
2007-05-10 07:10:23 +04:00
as there are active nodes or we will have to do a recovery
2007-05-04 03:45:53 +04:00
*/
2008-02-29 04:55:20 +03:00
if ( vnnmap - > size ! = rec - > num_active ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " The vnnmap count is different from the number of active nodes. %u vs %u \n " ,
2008-02-29 04:55:20 +03:00
vnnmap - > size , rec - > num_active ) ) ;
do_recovery ( rec , mem_ctx , pnn , nodemap , vnnmap , ctdb - > pnn ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
/* verify that all active nodes in the nodemap also exist in
the vnnmap .
*/
for ( j = 0 ; j < nodemap - > num ; j + + ) {
2007-06-07 09:18:55 +04:00
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
2007-05-04 03:45:53 +04:00
continue ;
}
2007-09-04 04:33:10 +04:00
if ( nodemap - > nodes [ j ] . pnn = = pnn ) {
2007-05-04 03:45:53 +04:00
continue ;
}
for ( i = 0 ; i < vnnmap - > size ; i + + ) {
2007-09-04 03:50:07 +04:00
if ( vnnmap - > map [ i ] = = nodemap - > nodes [ j ] . pnn ) {
2007-05-04 03:45:53 +04:00
break ;
}
}
2007-06-07 09:18:55 +04:00
if ( i = = vnnmap - > size ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Node %u is active in the nodemap but did not exist in the vnnmap \n " ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn ) ) ;
2008-02-29 04:55:20 +03:00
do_recovery ( rec , mem_ctx , pnn , nodemap , vnnmap , nodemap - > nodes [ j ] . pnn ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
}
2007-05-04 05:57:45 +04:00
/* verify that all other nodes have the same vnnmap
and are from the same generation
*/
2007-05-04 03:45:53 +04:00
for ( j = 0 ; j < nodemap - > num ; j + + ) {
2007-06-07 09:18:55 +04:00
if ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_INACTIVE ) {
2007-05-04 03:45:53 +04:00
continue ;
}
2007-09-04 04:33:10 +04:00
if ( nodemap - > nodes [ j ] . pnn = = pnn ) {
2007-05-04 03:45:53 +04:00
continue ;
}
2007-09-04 03:50:07 +04:00
ret = ctdb_ctrl_getvnnmap ( ctdb , CONTROL_TIMEOUT ( ) , nodemap - > nodes [ j ] . pnn ,
2007-06-07 12:39:37 +04:00
mem_ctx , & remote_vnnmap ) ;
2007-05-04 03:45:53 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to get vnnmap from remote node %u \n " ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn ) ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
2007-05-04 05:57:45 +04:00
/* verify the vnnmap generation is the same */
if ( vnnmap - > generation ! = remote_vnnmap - > generation ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours) \n " ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn , remote_vnnmap - > generation , vnnmap - > generation ) ) ;
2008-02-29 04:55:20 +03:00
do_recovery ( rec , mem_ctx , pnn , nodemap , vnnmap , nodemap - > nodes [ j ] . pnn ) ;
2007-05-04 05:57:45 +04:00
goto again ;
}
2007-05-04 03:45:53 +04:00
/* verify the vnnmap size is the same */
if ( vnnmap - > size ! = remote_vnnmap - > size ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Remote node %u has different size of vnnmap. %u vs %u (ours) \n " ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn , remote_vnnmap - > size , vnnmap - > size ) ) ;
2008-02-29 04:55:20 +03:00
do_recovery ( rec , mem_ctx , pnn , nodemap , vnnmap , nodemap - > nodes [ j ] . pnn ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
/* verify the vnnmap is the same */
for ( i = 0 ; i < vnnmap - > size ; i + + ) {
if ( remote_vnnmap - > map [ i ] ! = vnnmap - > map [ i ] ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Remote node %u has different vnnmap. \n " ,
2007-09-04 03:50:07 +04:00
nodemap - > nodes [ j ] . pnn ) ) ;
2008-02-29 04:55:20 +03:00
do_recovery ( rec , mem_ctx , pnn , nodemap ,
2007-09-04 03:50:07 +04:00
vnnmap , nodemap - > nodes [ j ] . pnn ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
}
}
2007-06-06 04:25:46 +04:00
/* we might need to change who has what IP assigned */
2007-09-13 08:08:18 +04:00
if ( rec - > need_takeover_run ) {
rec - > need_takeover_run = false ;
2008-01-29 05:59:28 +03:00
/* execute the "startrecovery" event script on all nodes */
2008-06-12 10:53:36 +04:00
ret = run_startrecovery_eventscript ( rec , nodemap ) ;
2008-01-29 05:59:28 +03:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to run the 'startrecovery' event on cluster \n " ) ) ;
2008-02-29 04:55:20 +03:00
do_recovery ( rec , mem_ctx , pnn , nodemap ,
2008-01-29 05:59:28 +03:00
vnnmap , ctdb - > pnn ) ;
}
2007-06-06 04:25:46 +04:00
ret = ctdb_takeover_run ( ctdb , nodemap ) ;
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to setup public takeover addresses - starting recovery \n " ) ) ;
2008-02-29 04:55:20 +03:00
do_recovery ( rec , mem_ctx , pnn , nodemap ,
2007-10-05 07:51:31 +04:00
vnnmap , ctdb - > pnn ) ;
2007-06-06 04:25:46 +04:00
}
2008-01-29 05:59:28 +03:00
/* execute the "recovered" event script on all nodes */
2008-05-15 06:28:52 +04:00
ret = run_recovered_eventscript ( ctdb , nodemap , " monitor_cluster " ) ;
2008-05-15 09:01:01 +04:00
#if 0
// we cant check whether the event completed successfully
// since this script WILL fail if the node is in recovery mode
// and if that race happens, the code here would just cause a second
// cascading recovery.
2008-01-29 05:59:28 +03:00
if ( ret ! = 0 ) {
2008-05-15 06:28:52 +04:00
DEBUG ( DEBUG_ERR , ( __location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed. \n " ) ) ;
2008-02-29 04:55:20 +03:00
do_recovery ( rec , mem_ctx , pnn , nodemap ,
2008-01-29 05:59:28 +03:00
vnnmap , ctdb - > pnn ) ;
}
2008-05-15 09:01:01 +04:00
# endif
2007-06-06 04:25:46 +04:00
}
2008-06-26 07:08:37 +04:00
2008-06-26 07:15:41 +04:00
DEBUG ( DEBUG_INFO , ( __location__ " Update flags on all nodes \n " ) ) ;
2008-06-26 07:08:37 +04:00
/*
update all nodes to have the same flags that we have
*/
ret = update_flags_on_all_nodes ( ctdb , nodemap ) ;
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " Unable to update flags on all nodes \n " ) ) ;
2008-06-26 08:14:37 +04:00
goto again ;
2008-06-26 07:08:37 +04:00
}
2007-05-04 03:45:53 +04:00
goto again ;
2007-05-04 02:30:18 +04:00
}
2007-06-06 04:25:46 +04:00
/*
2007-06-07 09:18:55 +04:00
event handler for when the main ctdbd dies
*/
2007-05-15 09:13:36 +04:00
static void ctdb_recoverd_parent ( struct event_context * ev , struct fd_event * fde ,
uint16_t flags , void * private_data )
{
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ALERT , ( " recovery daemon parent died - exiting \n " ) ) ;
2007-05-15 09:13:36 +04:00
_exit ( 1 ) ;
}
2008-05-06 05:19:17 +04:00
/*
called regularly to verify that the recovery daemon is still running
*/
static void ctdb_check_recd ( struct event_context * ev , struct timed_event * te ,
struct timeval yt , void * p )
{
struct ctdb_context * ctdb = talloc_get_type ( p , struct ctdb_context ) ;
if ( kill ( ctdb - > recoverd_pid , 0 ) ! = 0 ) {
DEBUG ( DEBUG_ERR , ( " Recovery daemon (pid:%d) is no longer running. Shutting down main daemon \n " , ( int ) ctdb - > recoverd_pid ) ) ;
ctdb_stop_recoverd ( ctdb ) ;
ctdb_stop_keepalive ( ctdb ) ;
ctdb_stop_monitoring ( ctdb ) ;
ctdb_release_all_ips ( ctdb ) ;
2008-05-11 08:28:33 +04:00
if ( ctdb - > methods ! = NULL ) {
ctdb - > methods - > shutdown ( ctdb ) ;
}
2008-05-06 05:19:17 +04:00
ctdb_event_script ( ctdb , " shutdown " ) ;
exit ( 10 ) ;
}
event_add_timed ( ctdb - > ev , ctdb ,
timeval_current_ofs ( 30 , 0 ) ,
ctdb_check_recd , ctdb ) ;
}
2007-06-07 09:18:55 +04:00
/*
startup the recovery daemon as a child of the main ctdb daemon
*/
2007-05-15 09:13:36 +04:00
int ctdb_start_recoverd ( struct ctdb_context * ctdb )
2007-05-04 02:30:18 +04:00
{
int ret ;
2007-05-15 09:13:36 +04:00
int fd [ 2 ] ;
2007-05-04 02:30:18 +04:00
2007-05-15 09:13:36 +04:00
if ( pipe ( fd ) ! = 0 ) {
return - 1 ;
2007-05-04 02:30:18 +04:00
}
2008-01-07 08:17:22 +03:00
ctdb - > ctdbd_pid = getpid ( ) ;
2007-10-22 06:34:08 +04:00
ctdb - > recoverd_pid = fork ( ) ;
if ( ctdb - > recoverd_pid = = - 1 ) {
2007-05-15 09:13:36 +04:00
return - 1 ;
2007-05-04 02:30:18 +04:00
}
2007-05-15 09:13:36 +04:00
2007-10-22 06:34:08 +04:00
if ( ctdb - > recoverd_pid ! = 0 ) {
2007-05-15 09:13:36 +04:00
close ( fd [ 0 ] ) ;
2008-05-06 05:19:17 +04:00
event_add_timed ( ctdb - > ev , ctdb ,
timeval_current_ofs ( 30 , 0 ) ,
ctdb_check_recd , ctdb ) ;
2007-05-15 09:13:36 +04:00
return 0 ;
2007-05-04 02:30:18 +04:00
}
2007-05-15 09:13:36 +04:00
close ( fd [ 1 ] ) ;
2007-06-02 02:41:19 +04:00
/* shutdown the transport */
2008-05-11 08:28:33 +04:00
if ( ctdb - > methods ) {
ctdb - > methods - > shutdown ( ctdb ) ;
}
2007-06-02 02:41:19 +04:00
/* get a new event context */
2007-05-30 07:26:50 +04:00
talloc_free ( ctdb - > ev ) ;
ctdb - > ev = event_context_init ( ctdb ) ;
2007-05-15 09:13:36 +04:00
event_add_fd ( ctdb - > ev , ctdb , fd [ 0 ] , EVENT_FD_READ | EVENT_FD_AUTOCLOSE ,
ctdb_recoverd_parent , & fd [ 0 ] ) ;
2007-05-10 07:10:23 +04:00
2007-05-15 09:13:36 +04:00
close ( ctdb - > daemon . sd ) ;
ctdb - > daemon . sd = - 1 ;
srandom ( getpid ( ) ^ time ( NULL ) ) ;
2007-05-04 02:30:18 +04:00
2008-01-16 14:08:33 +03:00
/* the recovery daemon does not need to be realtime */
if ( ctdb - > do_setsched ) {
ctdb_restore_scheduler ( ctdb ) ;
}
2007-05-04 02:30:18 +04:00
/* initialise ctdb */
2007-05-15 09:13:36 +04:00
ret = ctdb_socket_connect ( ctdb ) ;
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ALERT , ( __location__ " Failed to init ctdb \n " ) ) ;
2007-05-04 02:30:18 +04:00
exit ( 1 ) ;
}
2007-05-15 09:13:36 +04:00
monitor_cluster ( ctdb ) ;
2007-05-07 00:51:58 +04:00
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ALERT , ( " ERROR: ctdb_recoverd finished!? \n " ) ) ;
2007-05-15 09:13:36 +04:00
return - 1 ;
2007-05-04 02:30:18 +04:00
}
2007-10-22 06:34:08 +04:00
/*
shutdown the recovery daemon
*/
void ctdb_stop_recoverd ( struct ctdb_context * ctdb )
{
if ( ctdb - > recoverd_pid = = 0 ) {
return ;
}
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " Shutting down recovery daemon \n " ) ) ;
2007-10-22 06:34:08 +04:00
kill ( ctdb - > recoverd_pid , SIGTERM ) ;
}