2007-05-04 02:30:18 +04:00
/*
ctdb recovery daemon
Copyright ( C ) Ronnie Sahlberg 2007
This library is free software ; you can redistribute it and / or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation ; either
version 2 of the License , or ( at your option ) any later version .
This library is distributed in the hope that it will be useful ,
but WITHOUT ANY WARRANTY ; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
Lesser General Public License for more details .
You should have received a copy of the GNU Lesser General Public
License along with this library ; if not , write to the Free Software
Foundation , Inc . , 59 Temple Place , Suite 330 , Boston , MA 02111 - 1307 USA
*/
# include "includes.h"
# include "lib/events/events.h"
# include "system/filesys.h"
2007-05-10 08:06:48 +04:00
# include "system/time.h"
2007-05-04 02:30:18 +04:00
# include "popt.h"
# include "cmdline.h"
# include "../include/ctdb.h"
# include "../include/ctdb_private.h"
2007-05-04 03:01:01 +04:00
/*
  flag raised by timeout_func() when a timed event fires.
  the code that arms the timer clears it before it starts waiting
*/
static int timed_out = 0;

/*
  timed event callback: record that the wait period has elapsed.
  none of the arguments are looked at
*/
static void timeout_func(struct event_context *ev, struct timed_event *te,
			 struct timeval t, void *private_data)
{
	timed_out = 1;
}
2007-05-06 03:53:12 +04:00
static int set_recovery_mode ( struct ctdb_context * ctdb , struct ctdb_node_map * nodemap , uint32_t rec_mode )
{
int j , ret ;
/* set recovery mode to active on all nodes */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
/* dont change it for nodes that are unavailable */
if ( ! ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_CONNECTED ) ) {
continue ;
}
2007-05-12 09:15:27 +04:00
if ( rec_mode = = CTDB_RECOVERY_ACTIVE ) {
ret = ctdb_ctrl_freeze ( ctdb , timeval_current_ofs ( 5 , 0 ) , nodemap - > nodes [ j ] . vnn ) ;
if ( ret ! = 0 ) {
DEBUG ( 0 , ( __location__ " Unable to freeze node %u \n " , nodemap - > nodes [ j ] . vnn ) ) ;
return - 1 ;
}
}
2007-05-06 03:53:12 +04:00
ret = ctdb_ctrl_setrecmode ( ctdb , timeval_current_ofs ( 1 , 0 ) , nodemap - > nodes [ j ] . vnn , rec_mode ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to set recmode on node %u \n " , nodemap - > nodes [ j ] . vnn ) ) ;
2007-05-06 03:53:12 +04:00
return - 1 ;
}
2007-05-12 09:15:27 +04:00
if ( rec_mode = = CTDB_RECOVERY_NORMAL ) {
ret = ctdb_ctrl_thaw ( ctdb , timeval_current_ofs ( 5 , 0 ) , nodemap - > nodes [ j ] . vnn ) ;
if ( ret ! = 0 ) {
DEBUG ( 0 , ( __location__ " Unable to thaw node %u \n " , nodemap - > nodes [ j ] . vnn ) ) ;
return - 1 ;
}
}
2007-05-06 03:53:12 +04:00
}
return 0 ;
}
2007-05-07 00:51:58 +04:00
static int set_recovery_master ( struct ctdb_context * ctdb , struct ctdb_node_map * nodemap , uint32_t vnn )
{
int j , ret ;
/* set recovery master to vnn on all nodes */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
/* dont change it for nodes that are unavailable */
if ( ! ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_CONNECTED ) ) {
continue ;
}
ret = ctdb_ctrl_setrecmaster ( ctdb , timeval_current_ofs ( 1 , 0 ) , nodemap - > nodes [ j ] . vnn , vnn ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to set recmaster on node %u \n " , nodemap - > nodes [ j ] . vnn ) ) ;
2007-05-07 00:51:58 +04:00
return - 1 ;
}
}
return 0 ;
}
2007-05-06 04:04:37 +04:00
/*
  make sure every other connected node has all the databases listed in
  dbmap (the database list of node vnn).

  for each connected remote node its dbid list is fetched; every dbid in
  dbmap that is missing from that list is looked up by name on node vnn
  and then created on the remote node.

  returns 0 on success, -1 as soon as any control fails
*/
static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t vnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
{
	int i, j, db, ret;
	struct ctdb_dbid_map *remote_dbmap;

	/* verify that all other nodes have all our databases */
	for (j = 0; j < nodemap->num; j++) {
		/* we dont need to check ourselves */
		if (nodemap->nodes[j].vnn == vnn) {
			continue;
		}
		/* dont check nodes that are unavailable */
		if (!(nodemap->nodes[j].flags & NODE_FLAGS_CONNECTED)) {
			continue;
		}

		ret = ctdb_ctrl_getdbmap(ctdb, timeval_current_ofs(1, 0), nodemap->nodes[j].vnn, mem_ctx, &remote_dbmap);
		if (ret != 0) {
			/* report the node that was actually queried */
			DEBUG(0, (__location__ " Unable to get dbids from node %u \n ", nodemap->nodes[j].vnn));
			return -1;
		}

		/* step through all local databases */
		for (db = 0; db < dbmap->num; db++) {
			const char *name;

			for (i = 0; i < remote_dbmap->num; i++) {
				if (dbmap->dbids[db] == remote_dbmap->dbids[i]) {
					break;
				}
			}
			/* the remote node already has this database */
			if (i != remote_dbmap->num) {
				continue;
			}

			/* ok so we need to create this database.
			   the result of each control must be stored in ret,
			   otherwise the checks below test a stale value and
			   name may be used without having been set
			*/
			ret = ctdb_ctrl_getdbname(ctdb, timeval_current_ofs(1, 0), vnn, dbmap->dbids[db], mem_ctx, &name);
			if (ret != 0) {
				DEBUG(0, (__location__ " Unable to get dbname from node %u \n ", vnn));
				return -1;
			}
			ret = ctdb_ctrl_createdb(ctdb, timeval_current_ofs(1, 0), nodemap->nodes[j].vnn, mem_ctx, name);
			if (ret != 0) {
				DEBUG(0, (__location__ " Unable to create remote db:%s \n ", name));
				return -1;
			}
		}
	}

	return 0;
}
2007-05-06 04:12:42 +04:00
/*
  make sure node vnn has every database that any other connected node
  has.

  for each connected remote node its dbid list is fetched; every dbid
  that is missing from *dbmap is looked up by name on that remote node
  and created on node vnn, after which *dbmap is re-read from node vnn
  so that it includes the new database.

  returns 0 on success, -1 as soon as any control fails
*/
static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t vnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
{
	int i, j, db, ret;
	struct ctdb_dbid_map *remote_dbmap;

	/* verify that we have all databases any other node has */
	for (j = 0; j < nodemap->num; j++) {
		/* we dont need to check ourselves */
		if (nodemap->nodes[j].vnn == vnn) {
			continue;
		}
		/* dont check nodes that are unavailable */
		if (!(nodemap->nodes[j].flags & NODE_FLAGS_CONNECTED)) {
			continue;
		}

		ret = ctdb_ctrl_getdbmap(ctdb, timeval_current_ofs(1, 0), nodemap->nodes[j].vnn, mem_ctx, &remote_dbmap);
		if (ret != 0) {
			/* report the node that was actually queried */
			DEBUG(0, (__location__ " Unable to get dbids from node %u \n ", nodemap->nodes[j].vnn));
			return -1;
		}

		/* step through all databases on the remote node */
		for (db = 0; db < remote_dbmap->num; db++) {
			const char *name;

			for (i = 0; i < (*dbmap)->num; i++) {
				if (remote_dbmap->dbids[db] == (*dbmap)->dbids[i]) {
					break;
				}
			}
			/* we already have this db locally */
			if (i != (*dbmap)->num) {
				continue;
			}

			/* ok so we need to create this database and
			   rebuild dbmap.
			   the result of each control must be stored in ret,
			   otherwise the checks below test a stale value and
			   name may be used without having been set
			*/
			ret = ctdb_ctrl_getdbname(ctdb, timeval_current_ofs(1, 0), nodemap->nodes[j].vnn, remote_dbmap->dbids[db], mem_ctx, &name);
			if (ret != 0) {
				DEBUG(0, (__location__ " Unable to get dbname from node %u \n ", nodemap->nodes[j].vnn));
				return -1;
			}
			ret = ctdb_ctrl_createdb(ctdb, timeval_current_ofs(1, 0), vnn, mem_ctx, name);
			if (ret != 0) {
				DEBUG(0, (__location__ " Unable to create local db:%s \n ", name));
				return -1;
			}
			ret = ctdb_ctrl_getdbmap(ctdb, timeval_current_ofs(1, 0), vnn, mem_ctx, dbmap);
			if (ret != 0) {
				DEBUG(0, (__location__ " Unable to reread dbmap on node %u \n ", vnn));
				return -1;
			}
		}
	}

	return 0;
}
2007-05-06 04:16:48 +04:00
/*
  copy every database in dbmap from each other connected node onto
  node vnn (the original author notes that this merges based on rsn).

  returns 0 on success, -1 as soon as one copy fails
*/
static int pull_all_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t vnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
{
	int db, node;

	for (db = 0; db < dbmap->num; db++) {
		for (node = 0; node < nodemap->num; node++) {
			int status;

			/* nothing to pull from ourselves or from nodes
			   that are unavailable
			*/
			if (nodemap->nodes[node].vnn == vnn ||
			    (nodemap->nodes[node].flags & NODE_FLAGS_CONNECTED) == 0) {
				continue;
			}

			status = ctdb_ctrl_copydb(ctdb, timeval_current_ofs(2, 0), nodemap->nodes[node].vnn, vnn, dbmap->dbids[db], CTDB_LMASTER_ANY, mem_ctx);
			if (status != 0) {
				DEBUG(0, (__location__ " Unable to copy db from node %u to node %u \n ", nodemap->nodes[node].vnn, vnn));
				return -1;
			}
		}
	}

	return 0;
}
2007-05-06 04:22:13 +04:00
/*
  for every database in dbmap, ask each connected node (including
  node vnn itself) to set the dmaster to vnn.

  returns 0 on success, -1 as soon as one control fails
*/
static int update_dmaster_on_all_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t vnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
{
	int i, j, ret;

	/* update dmaster to point to this node for all databases/nodes */
	for (i = 0; i < dbmap->num; i++) {
		for (j = 0; j < nodemap->num; j++) {
			/* dont repoint nodes that are unavailable */
			if (!(nodemap->nodes[j].flags & NODE_FLAGS_CONNECTED)) {
				continue;
			}

			/* pass the caller's temporary context like every other
			   control in this file, instead of the long lived ctdb
			   context.
			   NOTE(review): assumes the 4th argument is the talloc
			   context used for this call - confirm against the
			   ctdb_ctrl_setdmaster prototype
			*/
			ret = ctdb_ctrl_setdmaster(ctdb, timeval_current_ofs(1, 0), nodemap->nodes[j].vnn, mem_ctx, dbmap->dbids[i], vnn);
			if (ret != 0) {
				DEBUG(0, (__location__ " Unable to set dmaster for node %u db:0x%08x \n ", nodemap->nodes[j].vnn, dbmap->dbids[i]));
				return -1;
			}
		}
	}

	return 0;
}
2007-05-06 04:30:18 +04:00
2007-05-06 04:38:44 +04:00
/*
  copy every database in dbmap from node vnn out to each other
  connected node.

  returns 0 on success, -1 as soon as one copy fails
*/
static int push_all_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t vnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
{
	int db, node;

	for (db = 0; db < dbmap->num; db++) {
		for (node = 0; node < nodemap->num; node++) {
			int status;

			/* nothing to push to ourselves or to nodes that
			   are unavailable
			*/
			if (nodemap->nodes[node].vnn == vnn ||
			    (nodemap->nodes[node].flags & NODE_FLAGS_CONNECTED) == 0) {
				continue;
			}

			status = ctdb_ctrl_copydb(ctdb, timeval_current_ofs(1, 0), vnn, nodemap->nodes[node].vnn, dbmap->dbids[db], CTDB_LMASTER_ANY, mem_ctx);
			if (status != 0) {
				DEBUG(0, (__location__ " Unable to copy db from node %u to node %u \n ", vnn, nodemap->nodes[node].vnn));
				return -1;
			}
		}
	}

	return 0;
}
2007-05-06 04:42:18 +04:00
/*
  send vnnmap to every connected node in the nodemap (including the
  local node if it is listed as connected).

  vnn is the local node number; it is not used when selecting targets.

  returns 0 on success, -1 as soon as one node fails to take the map
*/
static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t vnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
{
	int j, ret;

	/* push the new vnn map out to all the nodes */
	for (j = 0; j < nodemap->num; j++) {
		/* dont push to nodes that are unavailable */
		if (!(nodemap->nodes[j].flags & NODE_FLAGS_CONNECTED)) {
			continue;
		}

		ret = ctdb_ctrl_setvnnmap(ctdb, timeval_current_ofs(1, 0), nodemap->nodes[j].vnn, mem_ctx, vnnmap);
		if (ret != 0) {
			/* name the node that rejected the map, not ourselves */
			DEBUG(0, (__location__ " Unable to set vnnmap for node %u \n ", nodemap->nodes[j].vnn));
			return -1;
		}
	}

	return 0;
}
2007-05-15 09:13:36 +04:00
static int do_recovery ( struct ctdb_context * ctdb ,
2007-05-06 04:04:37 +04:00
TALLOC_CTX * mem_ctx , uint32_t vnn , uint32_t num_active ,
struct ctdb_node_map * nodemap , struct ctdb_vnn_map * vnnmap )
{
2007-05-06 04:12:42 +04:00
int i , j , ret ;
2007-05-06 04:04:37 +04:00
uint32_t generation ;
struct ctdb_dbid_map * dbmap ;
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Recovery initiated \n " ) ) ;
2007-05-06 04:04:37 +04:00
2007-05-12 09:59:49 +04:00
/* set recovery mode to active on all nodes */
ret = set_recovery_mode ( ctdb , nodemap , CTDB_RECOVERY_ACTIVE ) ;
if ( ret ! = 0 ) {
DEBUG ( 0 , ( __location__ " Unable to set recovery mode to active on cluster \n " ) ) ;
return - 1 ;
}
2007-05-06 04:04:37 +04:00
/* pick a new generation number */
generation = random ( ) ;
/* change the vnnmap on this node to use the new generation
number but not on any other nodes .
this guarantees that if we abort the recovery prematurely
for some reason ( a node stops responding ? )
that we can just return immediately and we will reenter
recovery shortly again .
I . e . we deliberately leave the cluster with an inconsistent
generation id to allow us to abort recovery at any stage and
just restart it from scratch .
*/
vnnmap - > generation = generation ;
ret = ctdb_ctrl_setvnnmap ( ctdb , timeval_current_ofs ( 1 , 0 ) , vnn , mem_ctx , vnnmap ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to set vnnmap for node %u \n " , vnn ) ) ;
2007-05-06 04:04:37 +04:00
return - 1 ;
}
/* get a list of all databases */
ret = ctdb_ctrl_getdbmap ( ctdb , timeval_current_ofs ( 1 , 0 ) , vnn , mem_ctx , & dbmap ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to get dbids from node :%d \n " , vnn ) ) ;
2007-05-06 04:04:37 +04:00
return - 1 ;
}
2007-05-06 04:12:42 +04:00
2007-05-06 04:04:37 +04:00
/* verify that all other nodes have all our databases */
ret = create_missing_remote_databases ( ctdb , nodemap , vnn , dbmap , mem_ctx ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to create missing remote databases \n " ) ) ;
2007-05-06 04:04:37 +04:00
return - 1 ;
}
2007-05-04 09:21:40 +04:00
2007-05-06 04:12:42 +04:00
/* verify that we have all the databases any other node has */
ret = create_missing_local_databases ( ctdb , nodemap , vnn , & dbmap , mem_ctx ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to create missing local databases \n " ) ) ;
2007-05-06 04:12:42 +04:00
return - 1 ;
2007-05-04 09:21:40 +04:00
}
2007-05-06 04:12:42 +04:00
2007-05-04 09:21:40 +04:00
/* verify that all other nodes have all our databases */
2007-05-06 04:04:37 +04:00
ret = create_missing_remote_databases ( ctdb , nodemap , vnn , dbmap , mem_ctx ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to create missing remote databases \n " ) ) ;
2007-05-06 04:04:37 +04:00
return - 1 ;
2007-05-04 09:21:40 +04:00
}
2007-05-06 04:04:37 +04:00
2007-05-08 08:51:55 +04:00
/* pull all remote databases onto the local node */
ret = pull_all_remote_databases ( ctdb , nodemap , vnn , dbmap , mem_ctx ) ;
2007-05-06 04:22:13 +04:00
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to pull remote databases \n " ) ) ;
2007-05-06 04:22:13 +04:00
return - 1 ;
2007-05-04 09:21:40 +04:00
}
2007-05-06 04:22:13 +04:00
2007-05-06 04:38:44 +04:00
/* push all local databases to the remote nodes */
ret = push_all_local_databases ( ctdb , nodemap , vnn , dbmap , mem_ctx ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to push local databases \n " ) ) ;
2007-05-06 04:38:44 +04:00
return - 1 ;
2007-05-04 09:21:40 +04:00
}
2007-05-06 04:38:44 +04:00
2007-05-06 06:46:56 +04:00
/* build a new vnn map with all the currently active nodes */
2007-05-10 02:49:57 +04:00
vnnmap = talloc ( mem_ctx , struct ctdb_vnn_map ) ;
CTDB_NO_MEMORY ( ctdb , vnnmap ) ;
2007-05-04 09:21:40 +04:00
vnnmap - > generation = generation ;
vnnmap - > size = num_active ;
2007-05-10 02:49:57 +04:00
vnnmap - > map = talloc_array ( vnnmap , uint32_t , vnnmap - > size ) ;
2007-05-04 09:21:40 +04:00
for ( i = j = 0 ; i < nodemap - > num ; i + + ) {
if ( nodemap - > nodes [ i ] . flags & NODE_FLAGS_CONNECTED ) {
vnnmap - > map [ j + + ] = nodemap - > nodes [ i ] . vnn ;
}
}
2007-05-06 04:42:18 +04:00
/* update to the new vnnmap on all nodes */
ret = update_vnnmap_on_all_nodes ( ctdb , nodemap , vnn , vnnmap , mem_ctx ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to update vnnmap on all nodes \n " ) ) ;
2007-05-06 04:42:18 +04:00
return - 1 ;
2007-05-04 09:21:40 +04:00
}
2007-05-07 00:51:58 +04:00
/* update recmaster to point to us for all nodes */
ret = set_recovery_master ( ctdb , nodemap , vnn ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to set recovery master \n " ) ) ;
2007-05-07 00:51:58 +04:00
return - 1 ;
}
2007-05-08 08:51:55 +04:00
/* repoint all local and remote database records to the local
node as being dmaster
*/
ret = update_dmaster_on_all_databases ( ctdb , nodemap , vnn , dbmap , mem_ctx ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to update dmaster on all databases \n " ) ) ;
2007-05-08 08:51:55 +04:00
return - 1 ;
}
2007-05-04 09:21:40 +04:00
/* disable recovery mode */
2007-05-06 03:53:12 +04:00
ret = set_recovery_mode ( ctdb , nodemap , CTDB_RECOVERY_NORMAL ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to set recovery mode to normal on cluster \n " ) ) ;
2007-05-06 03:53:12 +04:00
return - 1 ;
2007-05-04 09:21:40 +04:00
}
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Recovery complete \n " ) ) ;
2007-05-04 09:21:40 +04:00
return 0 ;
2007-05-04 03:45:53 +04:00
}
2007-05-04 02:30:18 +04:00
2007-05-06 22:41:12 +04:00
2007-05-07 00:51:58 +04:00
/*
  payload of a recovery master election message.
  send_election_request() sends this struct as raw bytes and
  election_handler() reads it back the same way, so its layout must
  not change
*/
struct election_message {
	uint32_t vnn;	/* node number of the candidate */
};
/*
  stand as candidate vnn in a recovery master election.

  the recovery master is first set to vnn on node vnn itself, then an
  election message naming vnn is sent to CTDB_BROADCAST_ALL on the
  recovery srvid.

  mem_ctx is not used.

  returns 0 once the message has been handed to ctdb_send_message()
  (its result is not checked), or -1 if the recovery master could not
  be set on node vnn
*/
static int send_election_request(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, uint32_t vnn)
{
	struct election_message emsg;
	TDB_DATA election_data;
	uint64_t srvid = ((uint64_t)CTDB_SRVTYPE_RECOVERY) << 32;

	/* first we assume we will win the election and set
	   recoverymaster to be ourself on the current node
	*/
	if (ctdb_ctrl_setrecmaster(ctdb, timeval_current_ofs(1, 0), vnn, vnn) != 0) {
		DEBUG(0, (__location__ " failed to send recmaster election request "));
		return -1;
	}

	/* the message body is just the candidate's node number */
	emsg.vnn = vnn;
	election_data.dptr  = (unsigned char *)&emsg;
	election_data.dsize = sizeof(struct election_message);

	/* send an election message to all active nodes */
	ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);

	return 0;
}
/*
  handler for recovery master elections.

  called with the payload of an election message in data. messages that
  are too short to hold a struct election_message are logged and
  dropped. a message naming ourselves is ignored. if the candidate has
  a higher vnn than ours we answer with our own election request,
  otherwise we record the candidate as recovery master on the local
  node.

  srvid and private_data are not used
*/
static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
			     TDB_DATA data, void *private_data)
{
	int ret;
	struct election_message *em;
	TALLOC_CTX *mem_ctx;

	/* the payload is supplied by the sender: do not look inside it
	   unless it is large enough to hold an election message
	*/
	if (data.dptr == NULL || data.dsize < sizeof(struct election_message)) {
		DEBUG(0, (__location__ " Election message too short (%u bytes). Ignoring it\n", (unsigned)data.dsize));
		return;
	}
	em = (struct election_message *)data.dptr;

	/* our own election message: nothing to decide */
	if (em->vnn == ctdb_get_vnn(ctdb)) {
		return;
	}

	mem_ctx = talloc_new(ctdb);

	/* someone called an election. check their election data
	   and if we disagree and we would rather be the elected node,
	   send a new election message to all other nodes
	*/
	/* for now we just check the vnn number and allow the lowest
	   vnn number to become recovery master
	*/
	if (em->vnn > ctdb_get_vnn(ctdb)) {
		ret = send_election_request(ctdb, mem_ctx, ctdb_get_vnn(ctdb));
		if (ret != 0) {
			DEBUG(0, (__location__ " failed to initiate recmaster election "));
		}
		talloc_free(mem_ctx);
		return;
	}

	/* ok, let that guy become recmaster then */
	ret = ctdb_ctrl_setrecmaster(ctdb, timeval_current_ofs(1, 0), ctdb_get_vnn(ctdb), em->vnn);
	if (ret != 0) {
		DEBUG(0, (__location__ " failed to send recmaster election request "));
		talloc_free(mem_ctx);
		return;
	}

	talloc_free(mem_ctx);
	return;
}
/*
  start a new recovery master election with ourselves (vnn) as the
  candidate.

  all connected nodes are first put into recovery mode, then the
  election request is sent and the event loop is run for one second.
  if either of the first two steps fails the function logs the failure
  and returns without waiting
*/
static void force_election(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, uint32_t vnn, struct ctdb_node_map *nodemap)
{
	/* set all nodes to recovery mode to stop all internode traffic */
	if (set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE) != 0) {
		DEBUG(0, (__location__ " Unable to set recovery mode to active on cluster \n "));
		return;
	}

	if (send_election_request(ctdb, mem_ctx, vnn) != 0) {
		DEBUG(0, (__location__ " failed to initiate recmaster election "));
		return;
	}

	/* wait for one second to collect all responses: keep servicing
	   events until timeout_func() raises timed_out
	*/
	timed_out = 0;
	event_add_timed(ctdb->ev, mem_ctx, timeval_current_ofs(1, 0), timeout_func, ctdb);
	while (timed_out == 0) {
		event_loop_once(ctdb->ev);
	}
}
2007-05-06 22:41:12 +04:00
2007-05-15 09:13:36 +04:00
void monitor_cluster ( struct ctdb_context * ctdb )
2007-05-04 02:30:18 +04:00
{
2007-05-07 00:51:58 +04:00
uint32_t vnn , num_active , recmode , recmaster ;
2007-05-04 02:30:18 +04:00
TALLOC_CTX * mem_ctx = NULL ;
2007-05-04 03:01:01 +04:00
struct ctdb_node_map * nodemap = NULL ;
2007-05-04 03:45:53 +04:00
struct ctdb_node_map * remote_nodemap = NULL ;
struct ctdb_vnn_map * vnnmap = NULL ;
struct ctdb_vnn_map * remote_vnnmap = NULL ;
int i , j , ret ;
2007-05-04 02:30:18 +04:00
again :
if ( mem_ctx ) {
talloc_free ( mem_ctx ) ;
mem_ctx = NULL ;
}
mem_ctx = talloc_new ( ctdb ) ;
if ( ! mem_ctx ) {
DEBUG ( 0 , ( " Failed to create temporary context \n " ) ) ;
exit ( - 1 ) ;
}
/* we only check for recovery once every second */
2007-05-04 03:01:01 +04:00
timed_out = 0 ;
2007-05-04 02:30:18 +04:00
event_add_timed ( ctdb - > ev , mem_ctx , timeval_current_ofs ( 1 , 0 ) , timeout_func , ctdb ) ;
2007-05-04 03:01:01 +04:00
while ( ! timed_out ) {
2007-05-15 09:13:36 +04:00
event_loop_once ( ctdb - > ev ) ;
2007-05-04 02:30:18 +04:00
}
/* get our vnn number */
vnn = ctdb_get_vnn ( ctdb ) ;
2007-05-06 22:41:12 +04:00
/* get the vnnmap */
ret = ctdb_ctrl_getvnnmap ( ctdb , timeval_current_ofs ( 1 , 0 ) , vnn , mem_ctx , & vnnmap ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to get vnnmap from node %u \n " , vnn ) ) ;
2007-05-06 22:41:12 +04:00
goto again ;
}
2007-05-04 02:30:18 +04:00
/* get number of nodes */
2007-05-04 03:01:01 +04:00
ret = ctdb_ctrl_getnodemap ( ctdb , timeval_current_ofs ( 1 , 0 ) , vnn , mem_ctx , & nodemap ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to get nodemap from node %u \n " , vnn ) ) ;
2007-05-04 03:01:01 +04:00
goto again ;
}
2007-05-04 02:30:18 +04:00
2007-05-06 22:41:12 +04:00
2007-05-04 03:45:53 +04:00
/* count how many active nodes there are */
num_active = 0 ;
for ( i = 0 ; i < nodemap - > num ; i + + ) {
if ( nodemap - > nodes [ i ] . flags & NODE_FLAGS_CONNECTED ) {
num_active + + ;
}
}
2007-05-07 00:51:58 +04:00
/* check which node is the recovery master */
ret = ctdb_ctrl_getrecmaster ( ctdb , timeval_current_ofs ( 1 , 0 ) , vnn , & recmaster ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to get recmaster from node %u \n " , vnn ) ) ;
2007-05-07 00:51:58 +04:00
goto again ;
}
/* verify that the recmaster node is still active */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
if ( nodemap - > nodes [ j ] . vnn = = recmaster ) {
break ;
}
2007-05-10 07:10:23 +04:00
}
2007-05-07 00:51:58 +04:00
if ( ! ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_CONNECTED ) ) {
DEBUG ( 0 , ( " Recmaster node %u no longer available. Force reelection \n " , nodemap - > nodes [ j ] . vnn ) ) ;
force_election ( ctdb , mem_ctx , vnn , nodemap ) ;
goto again ;
}
/* if we are not the recmaster then we do not need to check
if recovery is needed
*/
if ( vnn ! = recmaster ) {
goto again ;
}
/* verify that all active nodes agree that we are the recmaster */
for ( j = 0 ; j < nodemap - > num ; j + + ) {
if ( ! ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_CONNECTED ) ) {
continue ;
}
if ( nodemap - > nodes [ j ] . vnn = = vnn ) {
continue ;
}
2007-05-10 03:43:01 +04:00
ret = ctdb_ctrl_getrecmaster ( ctdb , timeval_current_ofs ( 1 , 0 ) , nodemap - > nodes [ j ] . vnn , & recmaster ) ;
2007-05-07 00:51:58 +04:00
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to get recmaster from node %u \n " , vnn ) ) ;
2007-05-07 00:51:58 +04:00
goto again ;
}
if ( recmaster ! = vnn ) {
DEBUG ( 0 , ( " Node %d does not agree we are the recmaster. Force reelection \n " , nodemap - > nodes [ j ] . vnn ) ) ;
force_election ( ctdb , mem_ctx , vnn , nodemap ) ;
goto again ;
}
}
2007-05-06 22:41:12 +04:00
/* verify that all active nodes are in normal mode
and not in recovery mode
*/
for ( j = 0 ; j < nodemap - > num ; j + + ) {
if ( ! ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_CONNECTED ) ) {
continue ;
}
2007-05-10 03:43:01 +04:00
ret = ctdb_ctrl_getrecmode ( ctdb , timeval_current_ofs ( 1 , 0 ) , nodemap - > nodes [ j ] . vnn , & recmode ) ;
2007-05-06 22:41:12 +04:00
if ( ret ! = 0 ) {
DEBUG ( 0 , ( " Unable to get recmode from node %u \n " , vnn ) ) ;
goto again ;
}
if ( recmode ! = CTDB_RECOVERY_NORMAL ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Node:%d was in recovery mode. Restart recovery process \n " , nodemap - > nodes [ j ] . vnn ) ) ;
2007-05-15 09:13:36 +04:00
do_recovery ( ctdb , mem_ctx , vnn , num_active , nodemap , vnnmap ) ;
2007-05-06 22:41:12 +04:00
goto again ;
}
}
2007-05-04 03:45:53 +04:00
/* get the nodemap for all active remote nodes and verify
they are the same as for this node
*/
for ( j = 0 ; j < nodemap - > num ; j + + ) {
if ( ! ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_CONNECTED ) ) {
continue ;
}
if ( nodemap - > nodes [ j ] . vnn = = vnn ) {
continue ;
}
ret = ctdb_ctrl_getnodemap ( ctdb , timeval_current_ofs ( 1 , 0 ) , nodemap - > nodes [ j ] . vnn , mem_ctx , & remote_nodemap ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to get nodemap from remote node %u \n " , nodemap - > nodes [ j ] . vnn ) ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
/* if the nodes disagree on how many nodes there are
then this is a good reason to try recovery
*/
if ( remote_nodemap - > num ! = nodemap - > num ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Remote node:%d has different node count. %d vs %d of the local node \n " , nodemap - > nodes [ j ] . vnn , remote_nodemap - > num , nodemap - > num ) ) ;
2007-05-15 09:13:36 +04:00
do_recovery ( ctdb , mem_ctx , vnn , num_active , nodemap , vnnmap ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
/* if the nodes disagree on which nodes exist and are
active , then that is also a good reason to do recovery
*/
for ( i = 0 ; i < nodemap - > num ; i + + ) {
if ( ( remote_nodemap - > nodes [ i ] . vnn ! = nodemap - > nodes [ i ] . vnn )
| | ( remote_nodemap - > nodes [ i ] . flags ! = nodemap - > nodes [ i ] . flags ) ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Remote node:%d has different nodemap. \n " , nodemap - > nodes [ j ] . vnn ) ) ;
2007-05-15 09:13:36 +04:00
do_recovery ( ctdb , mem_ctx , vnn , num_active , nodemap , vnnmap ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
}
}
/* there better be the same number of lmasters in the vnn map
2007-05-10 07:10:23 +04:00
as there are active nodes or we will have to do a recovery
2007-05-04 03:45:53 +04:00
*/
if ( vnnmap - > size ! = num_active ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " The vnnmap count is different from the number of active nodes. %d vs %d \n " , vnnmap - > size , num_active ) ) ;
2007-05-15 09:13:36 +04:00
do_recovery ( ctdb , mem_ctx , vnn , num_active , nodemap , vnnmap ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
/* verify that all active nodes in the nodemap also exist in
the vnnmap .
*/
for ( j = 0 ; j < nodemap - > num ; j + + ) {
if ( ! ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_CONNECTED ) ) {
continue ;
}
if ( nodemap - > nodes [ j ] . vnn = = vnn ) {
continue ;
}
for ( i = 0 ; i < vnnmap - > size ; i + + ) {
if ( vnnmap - > map [ i ] = = nodemap - > nodes [ j ] . vnn ) {
break ;
}
}
if ( i = = vnnmap - > size ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Node %d is active in the nodemap but did not exist in the vnnmap \n " , nodemap - > nodes [ j ] . vnn ) ) ;
2007-05-15 09:13:36 +04:00
do_recovery ( ctdb , mem_ctx , vnn , num_active , nodemap , vnnmap ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
}
2007-05-04 05:57:45 +04:00
/* verify that all other nodes have the same vnnmap
and are from the same generation
*/
2007-05-04 03:45:53 +04:00
for ( j = 0 ; j < nodemap - > num ; j + + ) {
if ( ! ( nodemap - > nodes [ j ] . flags & NODE_FLAGS_CONNECTED ) ) {
continue ;
}
if ( nodemap - > nodes [ j ] . vnn = = vnn ) {
continue ;
}
ret = ctdb_ctrl_getvnnmap ( ctdb , timeval_current_ofs ( 1 , 0 ) , nodemap - > nodes [ j ] . vnn , mem_ctx , & remote_vnnmap ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Unable to get vnnmap from remote node %u \n " , nodemap - > nodes [ j ] . vnn ) ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
2007-05-04 05:57:45 +04:00
/* verify the vnnmap generation is the same */
if ( vnnmap - > generation ! = remote_vnnmap - > generation ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Remote node %d has different generation of vnnmap. %d vs %d (ours) \n " , nodemap - > nodes [ j ] . vnn , remote_vnnmap - > generation , vnnmap - > generation ) ) ;
2007-05-15 09:13:36 +04:00
do_recovery ( ctdb , mem_ctx , vnn , num_active , nodemap , vnnmap ) ;
2007-05-04 05:57:45 +04:00
goto again ;
}
2007-05-04 03:45:53 +04:00
/* verify the vnnmap size is the same */
if ( vnnmap - > size ! = remote_vnnmap - > size ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Remote node %d has different size of vnnmap. %d vs %d (ours) \n " , nodemap - > nodes [ j ] . vnn , remote_vnnmap - > size , vnnmap - > size ) ) ;
2007-05-15 09:13:36 +04:00
do_recovery ( ctdb , mem_ctx , vnn , num_active , nodemap , vnnmap ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
/* verify the vnnmap is the same */
for ( i = 0 ; i < vnnmap - > size ; i + + ) {
if ( remote_vnnmap - > map [ i ] ! = vnnmap - > map [ i ] ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Remote node %d has different vnnmap. \n " , nodemap - > nodes [ j ] . vnn ) ) ;
2007-05-15 09:13:36 +04:00
do_recovery ( ctdb , mem_ctx , vnn , num_active , nodemap , vnnmap ) ;
2007-05-04 03:45:53 +04:00
goto again ;
}
}
}
goto again ;
2007-05-04 02:30:18 +04:00
}
2007-05-15 09:13:36 +04:00
/*
  fd event callback registered by ctdb_start_recoverd() on the read end
  of the pipe shared with the parent process.
  logs that the parent has died and terminates this process with exit
  status 1. none of the arguments are looked at
*/
static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
				 uint16_t flags, void *private_data)
{
	DEBUG(0, (" recovery daemon parent died - exiting \n "));
	_exit(1);
}
int ctdb_start_recoverd ( struct ctdb_context * ctdb )
2007-05-04 02:30:18 +04:00
{
int ret ;
2007-05-07 00:51:58 +04:00
uint64_t srvid ;
2007-05-15 09:13:36 +04:00
int fd [ 2 ] ;
pid_t child ;
2007-05-04 02:30:18 +04:00
2007-05-15 09:13:36 +04:00
if ( pipe ( fd ) ! = 0 ) {
return - 1 ;
2007-05-04 02:30:18 +04:00
}
2007-05-15 09:13:36 +04:00
child = fork ( ) ;
if ( child = = - 1 ) {
return - 1 ;
2007-05-04 02:30:18 +04:00
}
2007-05-15 09:13:36 +04:00
if ( child ! = 0 ) {
close ( fd [ 0 ] ) ;
return 0 ;
2007-05-04 02:30:18 +04:00
}
2007-05-15 09:13:36 +04:00
close ( fd [ 1 ] ) ;
event_add_fd ( ctdb - > ev , ctdb , fd [ 0 ] , EVENT_FD_READ | EVENT_FD_AUTOCLOSE ,
ctdb_recoverd_parent , & fd [ 0 ] ) ;
2007-05-10 07:10:23 +04:00
2007-05-15 09:13:36 +04:00
close ( ctdb - > daemon . sd ) ;
ctdb - > daemon . sd = - 1 ;
srandom ( getpid ( ) ^ time ( NULL ) ) ;
2007-05-04 02:30:18 +04:00
/* initialise ctdb */
2007-05-15 09:13:36 +04:00
ret = ctdb_socket_connect ( ctdb ) ;
if ( ret ! = 0 ) {
2007-05-12 08:34:21 +04:00
DEBUG ( 0 , ( __location__ " Failed to init ctdb \n " ) ) ;
2007-05-04 02:30:18 +04:00
exit ( 1 ) ;
}
2007-05-07 00:51:58 +04:00
/* register a message port for recovery elections */
srvid = CTDB_SRVTYPE_RECOVERY ;
srvid < < = 32 ;
ctdb_set_message_handler ( ctdb , srvid , election_handler , NULL ) ;
2007-05-15 09:13:36 +04:00
monitor_cluster ( ctdb ) ;
2007-05-07 00:51:58 +04:00
2007-05-15 09:13:36 +04:00
DEBUG ( 0 , ( " ERROR: ctdb_recoverd finished!? \n " ) ) ;
return - 1 ;
2007-05-04 02:30:18 +04:00
}