2007-05-18 14:06:29 +04:00
/*
   monitoring links to all other nodes to detect dead nodes

   Copyright (C) Ronnie Sahlberg  2007

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
2015-10-26 08:50:46 +03:00
# include "replace.h"
2007-05-18 14:06:29 +04:00
# include "system/filesys.h"
2015-10-26 08:50:46 +03:00
# include "system/network.h"
2007-05-18 14:06:29 +04:00
# include "system/wait.h"
2015-10-26 08:50:46 +03:00
# include <talloc.h>
# include <tevent.h>
# include "lib/util/debug.h"
# include "lib/util/samba_util.h"
2015-09-24 02:10:59 +03:00
# include "lib/util/util_process.h"
2015-10-26 08:50:46 +03:00
# include "ctdb_private.h"
2015-10-23 06:11:53 +03:00
# include "common/system.h"
2015-10-23 06:17:34 +03:00
# include "common/common.h"
2015-11-11 07:41:10 +03:00
# include "common/logging.h"
2007-05-18 14:06:29 +04:00
2008-01-10 06:40:56 +03:00
/*
  Per-daemon monitoring state, hung off ctdb->monitor.
*/
struct ctdb_monitor_state {
	/* CTDB_MONITORING_ENABLED or CTDB_MONITORING_DISABLED */
	uint32_t monitoring_mode;

	/* talloc parent for all monitoring timers and event script runs;
	 * set to NULL by ctdb_stop_monitoring() */
	TALLOC_CTX *monitor_context;

	/* delay, in seconds, before the next health check is scheduled */
	uint32_t next_interval;

	/* number of consecutive "monitor" events that ended with -ETIME */
	uint32_t event_script_timeouts;
};
2015-10-26 08:50:09 +03:00
/* Timer handler that launches the "monitor" event; defined further down. */
static void ctdb_check_health(struct tevent_context *ev,
			      struct tevent_timer *te,
			      struct timeval t,
			      void *private_data);
2009-03-31 07:23:31 +04:00
/*
setup the notification script
*/
int ctdb_set_notification_script ( struct ctdb_context * ctdb , const char * script )
{
ctdb - > notification_script = talloc_strdup ( ctdb , script ) ;
CTDB_NO_MEMORY ( ctdb , ctdb - > notification_script ) ;
return 0 ;
}
static int ctdb_run_notification_script_child ( struct ctdb_context * ctdb , const char * event )
{
struct stat st ;
int ret ;
char * cmd ;
if ( stat ( ctdb - > notification_script , & st ) ! = 0 ) {
DEBUG ( DEBUG_ERR , ( " Could not stat notification script %s. Can not send notifications. \n " , ctdb - > notification_script ) ) ;
return - 1 ;
}
if ( ! ( st . st_mode & S_IXUSR ) ) {
DEBUG ( DEBUG_ERR , ( " Notification script %s is not executable. \n " , ctdb - > notification_script ) ) ;
return - 1 ;
}
cmd = talloc_asprintf ( ctdb , " %s %s \n " , ctdb - > notification_script , event ) ;
CTDB_NO_MEMORY ( ctdb , cmd ) ;
ret = system ( cmd ) ;
/* if the system() call was successful, translate ret into the
return code from the command
*/
if ( ret ! = - 1 ) {
ret = WEXITSTATUS ( ret ) ;
}
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( " Notification script \" %s \" failed with error %d \n " , cmd , ret ) ) ;
}
return ret ;
}
2010-01-19 12:07:14 +03:00
void ctdb_run_notification_script ( struct ctdb_context * ctdb , const char * event )
2009-03-31 07:23:31 +04:00
{
pid_t child ;
if ( ctdb - > notification_script = = NULL ) {
return ;
}
2011-01-10 05:57:49 +03:00
child = ctdb_fork ( ctdb ) ;
2009-03-31 07:23:31 +04:00
if ( child = = ( pid_t ) - 1 ) {
DEBUG ( DEBUG_ERR , ( " Failed to fork() a notification child process \n " ) ) ;
return ;
}
if ( child = = 0 ) {
int ret ;
2015-09-24 02:10:59 +03:00
prctl_set_comment ( " ctdb_notification " ) ;
2010-07-19 13:59:09 +04:00
debug_extra = talloc_asprintf ( NULL , " notification-%s: " , event ) ;
2009-03-31 07:23:31 +04:00
ret = ctdb_run_notification_script_child ( ctdb , event ) ;
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( __location__ " Notification script failed \n " ) ) ;
}
_exit ( 0 ) ;
}
return ;
}
2007-06-06 04:25:46 +04:00
/*
called when a health monitoring event script finishes
*/
static void ctdb_health_callback ( struct ctdb_context * ctdb , int status , void * p )
{
2007-09-04 04:06:36 +04:00
struct ctdb_node * node = ctdb - > nodes [ ctdb - > pnn ] ;
2007-06-06 04:25:46 +04:00
TDB_DATA data ;
struct ctdb_node_flag_change c ;
2007-09-24 04:12:18 +04:00
uint32_t next_interval ;
2009-10-14 04:59:16 +04:00
int ret ;
TDB_DATA rddata ;
2015-10-29 06:32:49 +03:00
struct ctdb_srvid_message rd ;
2010-08-24 11:22:49 +04:00
const char * state_str = NULL ;
2007-06-06 04:25:46 +04:00
2007-09-04 04:33:10 +04:00
c . pnn = ctdb - > pnn ;
2007-08-21 11:25:15 +04:00
c . old_flags = node - > flags ;
2016-01-11 09:23:12 +03:00
ZERO_STRUCT ( rd ) ;
2009-10-14 04:59:16 +04:00
rd . pnn = ctdb - > pnn ;
2016-04-22 23:45:01 +03:00
rd . srvid = 0 ;
2009-10-14 04:59:16 +04:00
rddata . dptr = ( uint8_t * ) & rd ;
rddata . dsize = sizeof ( rd ) ;
2011-11-17 06:34:29 +04:00
if ( status = = - ECANCELED ) {
DEBUG ( DEBUG_ERR , ( " Monitoring event was cancelled \n " ) ) ;
goto after_change_status ;
}
2009-12-07 16:18:57 +03:00
if ( status = = - ETIME ) {
2015-10-28 08:51:59 +03:00
ctdb - > monitor - > event_script_timeouts + + ;
if ( ctdb - > monitor - > event_script_timeouts > =
2015-10-28 09:03:01 +03:00
ctdb - > tunable . monitor_timeout_count ) {
2015-10-28 08:51:59 +03:00
DEBUG ( DEBUG_ERR ,
( " Maximum monitor timeout count %u reached. "
" Making node unhealthy \n " ,
2015-10-28 09:03:01 +03:00
ctdb - > tunable . monitor_timeout_count ) ) ;
2009-12-07 16:18:57 +03:00
} else {
/* We pretend this is OK. */
2009-12-07 16:22:01 +03:00
goto after_change_status ;
2009-12-07 16:18:57 +03:00
}
2015-10-28 08:42:41 +03:00
} else {
2015-10-28 08:51:59 +03:00
ctdb - > monitor - > event_script_timeouts = 0 ;
2009-12-07 16:18:57 +03:00
}
2007-06-07 05:15:22 +04:00
if ( status ! = 0 & & ! ( node - > flags & NODE_FLAGS_UNHEALTHY ) ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " monitor event failed - disabling node \n " ) ) ;
2007-06-07 05:15:22 +04:00
node - > flags | = NODE_FLAGS_UNHEALTHY ;
2009-10-27 05:51:45 +03:00
ctdb - > monitor - > next_interval = 5 ;
2008-02-22 02:33:09 +03:00
2009-03-31 07:23:31 +04:00
ctdb_run_notification_script ( ctdb , " unhealthy " ) ;
2007-06-07 05:15:22 +04:00
} else if ( status = = 0 & & ( node - > flags & NODE_FLAGS_UNHEALTHY ) ) {
2008-02-04 09:44:24 +03:00
DEBUG ( DEBUG_NOTICE , ( " monitor event OK - node re-enabled \n " ) ) ;
2007-09-24 04:12:18 +04:00
node - > flags & = ~ NODE_FLAGS_UNHEALTHY ;
2009-10-27 05:51:45 +03:00
ctdb - > monitor - > next_interval = 5 ;
2009-03-31 07:23:31 +04:00
ctdb_run_notification_script ( ctdb , " healthy " ) ;
2007-09-24 04:12:18 +04:00
}
2009-12-07 16:22:01 +03:00
after_change_status :
2008-01-10 06:40:56 +03:00
next_interval = ctdb - > monitor - > next_interval ;
ctdb - > monitor - > next_interval * = 2 ;
if ( ctdb - > monitor - > next_interval > ctdb - > tunable . monitor_interval ) {
ctdb - > monitor - > next_interval = ctdb - > tunable . monitor_interval ;
2007-09-24 04:12:18 +04:00
}
2015-10-26 08:50:09 +03:00
tevent_add_timer ( ctdb - > ev , ctdb - > monitor - > monitor_context ,
timeval_current_ofs ( next_interval , 0 ) ,
ctdb_check_health , ctdb ) ;
2007-09-24 04:12:18 +04:00
if ( c . old_flags = = node - > flags ) {
2007-06-06 04:25:46 +04:00
return ;
}
2007-08-21 11:25:15 +04:00
c . new_flags = node - > flags ;
2007-06-06 04:25:46 +04:00
data . dptr = ( uint8_t * ) & c ;
data . dsize = sizeof ( c ) ;
2008-11-19 06:43:46 +03:00
/* ask the recovery daemon to push these changes out to all nodes */
ctdb_daemon_send_message ( ctdb , ctdb - > pnn ,
CTDB_SRVID_PUSH_NODE_FLAGS , data ) ;
2007-06-07 05:15:22 +04:00
2010-08-24 11:22:49 +04:00
if ( c . new_flags & NODE_FLAGS_UNHEALTHY ) {
state_str = " UNHEALTHY " ;
} else {
state_str = " HEALTHY " ;
}
/* ask the recmaster to reallocate all addresses */
2015-03-30 12:51:51 +03:00
DEBUG ( DEBUG_ERR ,
( " Node became %s. Ask recovery master to reallocate IPs \n " ,
state_str ) ) ;
ret = ctdb_daemon_send_message ( ctdb , CTDB_BROADCAST_CONNECTED , CTDB_SRVID_TAKEOVER_RUN , rddata ) ;
2010-08-24 11:22:49 +04:00
if ( ret ! = 0 ) {
2015-03-30 12:51:51 +03:00
DEBUG ( DEBUG_ERR ,
( __location__
" Failed to send IP takeover run request \n " ) ) ;
2010-08-24 11:22:49 +04:00
}
2007-06-06 04:25:46 +04:00
}
2015-10-26 08:50:09 +03:00
/* Timer handler that launches the "startup" event; defined further down. */
static void ctdb_run_startup(struct tevent_context *ev,
			     struct tevent_timer *te,
			     struct timeval t,
			     void *private_data);
2007-11-12 02:53:11 +03:00
/*
called when the startup event script finishes
*/
static void ctdb_startup_callback ( struct ctdb_context * ctdb , int status , void * p )
{
if ( status ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( " startup event failed \n " ) ) ;
2015-10-26 08:50:09 +03:00
tevent_add_timer ( ctdb - > ev , ctdb - > monitor - > monitor_context ,
timeval_current_ofs ( 5 , 0 ) ,
ctdb_run_startup , ctdb ) ;
2013-12-18 08:37:11 +04:00
return ;
2007-11-12 02:53:11 +03:00
}
2013-12-18 08:37:11 +04:00
DEBUG ( DEBUG_NOTICE , ( " startup event OK - enabling monitoring \n " ) ) ;
ctdb_set_runstate ( ctdb , CTDB_RUNSTATE_RUNNING ) ;
ctdb - > monitor - > next_interval = 2 ;
ctdb_run_notification_script ( ctdb , " startup " ) ;
2016-07-22 08:41:59 +03:00
ctdb - > monitor - > monitoring_mode = CTDB_MONITORING_ENABLED ;
2013-12-18 08:37:11 +04:00
2015-10-26 08:50:09 +03:00
tevent_add_timer ( ctdb - > ev , ctdb - > monitor - > monitor_context ,
timeval_current_ofs ( ctdb - > monitor - > next_interval , 0 ) ,
ctdb_check_health , ctdb ) ;
2007-11-12 02:53:11 +03:00
}
2015-10-26 08:50:09 +03:00
static void ctdb_run_startup ( struct tevent_context * ev ,
struct tevent_timer * te ,
2013-12-18 08:37:11 +04:00
struct timeval t , void * private_data )
{
struct ctdb_context * ctdb = talloc_get_type ( private_data ,
struct ctdb_context ) ;
int ret ;
/* This is necessary to avoid the "startup" event colliding
* with the " ipreallocated " event from the takeover run
* following the first recovery . We might as well serialise
* these things if we can .
*/
if ( ctdb - > runstate < CTDB_RUNSTATE_STARTUP ) {
DEBUG ( DEBUG_NOTICE ,
( " Not yet in startup runstate. Wait one more second \n " ) ) ;
2015-10-26 08:50:09 +03:00
tevent_add_timer ( ctdb - > ev , ctdb - > monitor - > monitor_context ,
timeval_current_ofs ( 1 , 0 ) ,
ctdb_run_startup , ctdb ) ;
2013-12-18 08:37:11 +04:00
return ;
}
2015-05-08 13:00:35 +03:00
/* release any IPs we hold from previous runs of the daemon */
ctdb_release_all_ips ( ctdb ) ;
2013-12-18 08:37:11 +04:00
DEBUG ( DEBUG_NOTICE , ( " Running the \" startup \" event. \n " ) ) ;
ret = ctdb_event_script_callback ( ctdb ,
ctdb - > monitor - > monitor_context ,
ctdb_startup_callback ,
ctdb , CTDB_EVENT_STARTUP , " %s " , " " ) ;
if ( ret ! = 0 ) {
DEBUG ( DEBUG_ERR , ( " Unable to launch startup event script \n " ) ) ;
2015-10-26 08:50:09 +03:00
tevent_add_timer ( ctdb - > ev , ctdb - > monitor - > monitor_context ,
timeval_current_ofs ( 5 , 0 ) ,
ctdb_run_startup , ctdb ) ;
2013-12-18 08:37:11 +04:00
}
}
2007-11-12 02:53:11 +03:00
2009-12-01 05:19:58 +03:00
/*
wait until we have finished initial recoveries before we start the
monitoring events
*/
2015-10-26 08:50:09 +03:00
static void ctdb_wait_until_recovered ( struct tevent_context * ev ,
struct tevent_timer * te ,
struct timeval t , void * private_data )
2009-12-01 05:19:58 +03:00
{
struct ctdb_context * ctdb = talloc_get_type ( private_data , struct ctdb_context ) ;
2009-12-07 15:28:11 +03:00
int ret ;
2011-05-04 02:54:02 +04:00
static int count = 0 ;
2009-12-01 05:19:58 +03:00
2011-05-04 02:54:02 +04:00
count + + ;
if ( count < 60 | | count % 600 = = 0 ) {
DEBUG ( DEBUG_NOTICE , ( " CTDB_WAIT_UNTIL_RECOVERED \n " ) ) ;
if ( ctdb - > nodes [ ctdb - > pnn ] - > flags & NODE_FLAGS_STOPPED ) {
DEBUG ( DEBUG_NOTICE , ( " Node is STOPPED. Node will NOT recover. \n " ) ) ;
}
2011-01-31 09:40:26 +03:00
}
2009-12-01 05:19:58 +03:00
if ( ctdb - > vnn_map - > generation = = INVALID_GENERATION ) {
2009-12-07 15:28:11 +03:00
ctdb - > db_persistent_startup_generation = INVALID_GENERATION ;
2015-10-26 08:50:09 +03:00
tevent_add_timer ( ctdb - > ev , ctdb - > monitor - > monitor_context ,
timeval_current_ofs ( 1 , 0 ) ,
ctdb_wait_until_recovered , ctdb ) ;
2009-12-01 05:19:58 +03:00
return ;
}
if ( ctdb - > recovery_mode ! = CTDB_RECOVERY_NORMAL ) {
2009-12-07 15:28:11 +03:00
ctdb - > db_persistent_startup_generation = INVALID_GENERATION ;
2009-12-01 05:19:58 +03:00
DEBUG ( DEBUG_NOTICE , ( __location__ " in recovery. Wait one more second \n " ) ) ;
2015-10-26 08:50:09 +03:00
tevent_add_timer ( ctdb - > ev , ctdb - > monitor - > monitor_context ,
timeval_current_ofs ( 1 , 0 ) ,
ctdb_wait_until_recovered , ctdb ) ;
2009-12-01 05:19:58 +03:00
return ;
}
2010-06-22 17:22:34 +04:00
if ( ! fast_start & & timeval_elapsed ( & ctdb - > last_recovery_finished ) < ( ctdb - > tunable . rerecovery_timeout + 3 ) ) {
2009-12-07 15:28:11 +03:00
ctdb - > db_persistent_startup_generation = INVALID_GENERATION ;
2009-12-01 05:19:58 +03:00
DEBUG ( DEBUG_NOTICE , ( __location__ " wait for pending recoveries to end. Wait one more second. \n " ) ) ;
2015-10-26 08:50:09 +03:00
tevent_add_timer ( ctdb - > ev , ctdb - > monitor - > monitor_context ,
timeval_current_ofs ( 1 , 0 ) ,
ctdb_wait_until_recovered , ctdb ) ;
2009-12-01 05:19:58 +03:00
return ;
}
2009-12-07 15:28:11 +03:00
if ( ctdb - > vnn_map - > generation = = ctdb - > db_persistent_startup_generation ) {
DEBUG ( DEBUG_INFO , ( __location__ " skip ctdb_recheck_persistent_health() "
" until the next recovery \n " ) ) ;
2015-10-26 08:50:09 +03:00
tevent_add_timer ( ctdb - > ev , ctdb - > monitor - > monitor_context ,
timeval_current_ofs ( 1 , 0 ) ,
ctdb_wait_until_recovered , ctdb ) ;
2009-12-07 15:28:11 +03:00
return ;
}
ctdb - > db_persistent_startup_generation = ctdb - > vnn_map - > generation ;
ret = ctdb_recheck_persistent_health ( ctdb ) ;
if ( ret ! = 0 ) {
ctdb - > db_persistent_check_errors + + ;
if ( ctdb - > db_persistent_check_errors < ctdb - > max_persistent_check_errors ) {
2016-08-10 09:46:51 +03:00
DEBUG ( DEBUG_ERR ,
2009-12-07 15:28:11 +03:00
( __location__ " ctdb_recheck_persistent_health() "
" failed (%llu of %llu times) - retry later \n " ,
( unsigned long long ) ctdb - > db_persistent_check_errors ,
( unsigned long long ) ctdb - > max_persistent_check_errors ) ) ;
2015-10-26 08:50:09 +03:00
tevent_add_timer ( ctdb - > ev ,
ctdb - > monitor - > monitor_context ,
timeval_current_ofs ( 1 , 0 ) ,
ctdb_wait_until_recovered , ctdb ) ;
2009-12-07 15:28:11 +03:00
return ;
}
DEBUG ( DEBUG_ALERT , ( __location__
" ctdb_recheck_persistent_health() failed (%llu times) - prepare shutdown \n " ,
( unsigned long long ) ctdb - > db_persistent_check_errors ) ) ;
2013-06-19 04:58:14 +04:00
ctdb_shutdown_sequence ( ctdb , 11 ) ;
2013-06-22 09:44:28 +04:00
/* In case above returns due to duplicate shutdown */
return ;
2009-12-07 15:28:11 +03:00
}
ctdb - > db_persistent_check_errors = 0 ;
2009-12-01 05:19:58 +03:00
2015-10-26 08:50:09 +03:00
tevent_add_timer ( ctdb - > ev , ctdb - > monitor - > monitor_context ,
timeval_current ( ) , ctdb_run_startup , ctdb ) ;
2009-12-01 05:19:58 +03:00
}
2007-06-06 04:25:46 +04:00
/*
see if the event scripts think we are healthy
*/
2015-10-26 08:50:09 +03:00
static void ctdb_check_health ( struct tevent_context * ev ,
struct tevent_timer * te ,
2007-06-06 04:25:46 +04:00
struct timeval t , void * private_data )
{
struct ctdb_context * ctdb = talloc_get_type ( private_data , struct ctdb_context ) ;
2013-12-18 08:37:11 +04:00
bool skip_monitoring = false ;
2009-10-28 08:11:54 +03:00
int ret = 0 ;
2007-06-06 04:25:46 +04:00
2007-11-12 05:10:15 +03:00
if ( ctdb - > recovery_mode ! = CTDB_RECOVERY_NORMAL | |
2013-12-18 08:37:11 +04:00
ctdb - > monitor - > monitoring_mode = = CTDB_MONITORING_DISABLED ) {
skip_monitoring = true ;
2007-11-12 02:53:11 +03:00
} else {
2015-09-15 05:22:17 +03:00
if ( ctdb_db_all_frozen ( ctdb ) ) {
DEBUG ( DEBUG_ERR ,
( " Skip monitoring since databases are frozen \n " ) ) ;
skip_monitoring = true ;
2009-10-15 09:03:43 +04:00
}
2007-11-12 02:53:11 +03:00
}
2013-12-18 08:37:11 +04:00
if ( skip_monitoring ) {
2015-10-26 08:50:09 +03:00
tevent_add_timer ( ctdb - > ev , ctdb - > monitor - > monitor_context ,
timeval_current_ofs ( ctdb - > monitor - > next_interval , 0 ) ,
ctdb_check_health , ctdb ) ;
2013-12-18 08:37:11 +04:00
return ;
}
ret = ctdb_event_script_callback ( ctdb ,
ctdb - > monitor - > monitor_context ,
ctdb_health_callback ,
ctdb , CTDB_EVENT_MONITOR , " %s " , " " ) ;
2007-06-06 04:25:46 +04:00
if ( ret ! = 0 ) {
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_ERR , ( " Unable to launch monitor event script \n " ) ) ;
2009-10-28 08:11:54 +03:00
ctdb - > monitor - > next_interval = 5 ;
2015-10-26 08:50:09 +03:00
tevent_add_timer ( ctdb - > ev , ctdb - > monitor - > monitor_context ,
timeval_current_ofs ( 5 , 0 ) ,
ctdb_check_health , ctdb ) ;
2009-10-28 08:11:54 +03:00
}
2007-06-06 04:25:46 +04:00
}
2007-11-30 02:09:54 +03:00
/*
( Temporaily ) Disabling monitoring will stop the monitor event scripts
from running but node health checks will still occur
*/
void ctdb_disable_monitoring ( struct ctdb_context * ctdb )
{
2008-01-10 06:40:56 +03:00
ctdb - > monitor - > monitoring_mode = CTDB_MONITORING_DISABLED ;
2008-02-04 09:44:24 +03:00
DEBUG ( DEBUG_INFO , ( " Monitoring has been disabled \n " ) ) ;
2007-11-30 02:09:54 +03:00
}
/*
Re - enable running monitor events after they have been disabled
*/
void ctdb_enable_monitoring ( struct ctdb_context * ctdb )
{
2016-07-22 08:41:59 +03:00
ctdb - > monitor - > monitoring_mode = CTDB_MONITORING_ENABLED ;
2009-10-28 08:11:54 +03:00
ctdb - > monitor - > next_interval = 5 ;
2008-02-04 09:44:24 +03:00
DEBUG ( DEBUG_INFO , ( " Monitoring has been enabled \n " ) ) ;
2007-11-30 02:09:54 +03:00
}
/* stop any monitoring
this should only be done when shutting down the daemon
*/
2007-06-06 07:45:12 +04:00
void ctdb_stop_monitoring ( struct ctdb_context * ctdb )
{
2008-01-10 06:40:56 +03:00
talloc_free ( ctdb - > monitor - > monitor_context ) ;
ctdb - > monitor - > monitor_context = NULL ;
2007-11-30 00:44:34 +03:00
2008-01-10 06:40:56 +03:00
ctdb - > monitor - > monitoring_mode = CTDB_MONITORING_DISABLED ;
2009-10-28 08:11:54 +03:00
ctdb - > monitor - > next_interval = 5 ;
2008-02-04 12:07:15 +03:00
DEBUG ( DEBUG_NOTICE , ( " Monitoring has been stopped \n " ) ) ;
2007-06-06 07:45:12 +04:00
}
2007-06-06 04:25:46 +04:00
2007-05-18 17:23:36 +04:00
/*
start watching for nodes that might be dead
*/
2013-12-18 08:37:11 +04:00
void ctdb_wait_for_first_recovery ( struct ctdb_context * ctdb )
2007-05-18 14:06:29 +04:00
{
2013-12-18 08:37:11 +04:00
ctdb_set_runstate ( ctdb , CTDB_RUNSTATE_FIRST_RECOVERY ) ;
2007-11-30 00:44:34 +03:00
2008-01-10 06:40:56 +03:00
ctdb - > monitor = talloc ( ctdb , struct ctdb_monitor_state ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , ctdb - > monitor ) ;
ctdb - > monitor - > monitor_context = talloc_new ( ctdb - > monitor ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , ctdb - > monitor - > monitor_context ) ;
2007-11-30 00:44:34 +03:00
2015-10-26 08:50:09 +03:00
tevent_add_timer ( ctdb - > ev , ctdb - > monitor - > monitor_context ,
timeval_current_ofs ( 1 , 0 ) ,
ctdb_wait_until_recovered , ctdb ) ;
2007-05-18 14:06:29 +04:00
}
2007-06-07 05:15:22 +04:00
/*
2007-06-07 09:18:55 +04:00
modify flags on a node
2007-06-07 05:15:22 +04:00
*/
2007-06-07 09:18:55 +04:00
int32_t ctdb_control_modflags ( struct ctdb_context * ctdb , TDB_DATA indata )
2007-06-07 05:15:22 +04:00
{
2008-11-19 06:43:46 +03:00
struct ctdb_node_flag_change * c = ( struct ctdb_node_flag_change * ) indata . dptr ;
struct ctdb_node * node ;
2009-07-09 07:20:14 +04:00
uint32_t old_flags ;
2009-10-12 05:08:39 +04:00
2008-11-19 06:43:46 +03:00
if ( c - > pnn > = ctdb - > num_nodes ) {
DEBUG ( DEBUG_ERR , ( __location__ " Node %d is invalid, num_nodes :%d \n " , c - > pnn , ctdb - > num_nodes ) ) ;
return - 1 ;
}
2007-06-07 05:15:22 +04:00
2008-11-19 06:43:46 +03:00
node = ctdb - > nodes [ c - > pnn ] ;
2009-07-09 07:20:14 +04:00
old_flags = node - > flags ;
2009-10-09 17:47:49 +04:00
if ( c - > pnn ! = ctdb - > pnn ) {
c - > old_flags = node - > flags ;
}
2008-11-19 06:43:46 +03:00
node - > flags = c - > new_flags & ~ NODE_FLAGS_DISCONNECTED ;
node - > flags | = ( c - > old_flags & NODE_FLAGS_DISCONNECTED ) ;
2007-06-07 09:18:55 +04:00
2015-07-27 00:02:57 +03:00
/* we don't let other nodes modify our STOPPED status */
2009-07-09 07:20:14 +04:00
if ( c - > pnn = = ctdb - > pnn ) {
node - > flags & = ~ NODE_FLAGS_STOPPED ;
if ( old_flags & NODE_FLAGS_STOPPED ) {
node - > flags | = NODE_FLAGS_STOPPED ;
}
}
2015-07-27 00:02:57 +03:00
/* we don't let other nodes modify our BANNED status */
2009-09-03 20:20:39 +04:00
if ( c - > pnn = = ctdb - > pnn ) {
node - > flags & = ~ NODE_FLAGS_BANNED ;
if ( old_flags & NODE_FLAGS_BANNED ) {
node - > flags | = NODE_FLAGS_BANNED ;
}
}
2008-11-19 06:43:46 +03:00
if ( node - > flags = = c - > old_flags ) {
DEBUG ( DEBUG_INFO , ( " Control modflags on node %u - Unchanged - flags 0x%x \n " , c - > pnn , node - > flags ) ) ;
2007-06-07 09:18:55 +04:00
return 0 ;
2007-06-07 05:15:22 +04:00
}
2008-11-19 06:43:46 +03:00
DEBUG ( DEBUG_INFO , ( " Control modflags on node %u - flags now 0x%x \n " , c - > pnn , node - > flags ) ) ;
2007-06-07 10:34:33 +04:00
2013-06-24 09:49:48 +04:00
if ( node - > flags = = 0 & & ctdb - > runstate < = CTDB_RUNSTATE_STARTUP ) {
2009-12-07 15:28:11 +03:00
DEBUG ( DEBUG_ERR , ( __location__ " Node %u became healthy - force recovery for startup \n " ,
c - > pnn ) ) ;
ctdb - > recovery_mode = CTDB_RECOVERY_ACTIVE ;
}
2007-06-07 05:15:22 +04:00
2008-11-19 06:43:46 +03:00
/* tell the recovery daemon something has changed */
2015-02-04 09:18:12 +03:00
c - > new_flags = node - > flags ;
2008-11-19 06:43:46 +03:00
ctdb_daemon_send_message ( ctdb , ctdb - > pnn ,
CTDB_SRVID_SET_NODE_FLAGS , indata ) ;
2007-06-07 10:34:33 +04:00
2008-11-19 06:43:46 +03:00
/* if we have become banned, we should go into recovery mode */
2009-06-10 04:28:47 +04:00
if ( ( node - > flags & NODE_FLAGS_BANNED ) & & ! ( c - > old_flags & NODE_FLAGS_BANNED ) & & ( node - > pnn = = ctdb - > pnn ) ) {
2013-06-28 08:04:18 +04:00
ctdb_local_node_got_banned ( ctdb ) ;
2007-06-07 10:34:33 +04:00
}
2007-06-07 05:15:22 +04:00
return 0 ;
}
2008-01-10 06:40:56 +03:00
/*
return the monitoring mode
*/
int32_t ctdb_monitoring_mode ( struct ctdb_context * ctdb )
{
if ( ctdb - > monitor = = NULL ) {
return CTDB_MONITORING_DISABLED ;
}
return ctdb - > monitor - > monitoring_mode ;
}
2008-04-02 04:13:30 +04:00
2012-10-29 07:56:10 +04:00
/*
* Check if monitoring has been stopped
*/
bool ctdb_stopped_monitoring ( struct ctdb_context * ctdb )
{
return ( ctdb - > monitor - > monitor_context = = NULL ? true : false ) ;
}