/*
   monitoring links to all other nodes to detect dead nodes

   Copyright (C) Ronnie Sahlberg  2007

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
# include "includes.h"
# include "lib/events/events.h"
# include "system/filesys.h"
# include "system/wait.h"
# include "../include/ctdb_private.h"
/*
  see if any of the other nodes are dead
*/
static void ctdb_check_for_dead_nodes ( struct event_context * ev , struct timed_event * te ,
2007-06-06 07:45:12 +04:00
struct timeval t , void * private_data )
2007-05-18 14:06:29 +04:00
{
struct ctdb_context * ctdb = talloc_get_type ( private_data , struct ctdb_context ) ;
int i ;
2007-06-06 04:25:46 +04:00
if ( ctdb - > monitoring_mode = = CTDB_MONITORING_DISABLED ) {
2007-06-06 07:45:12 +04:00
event_add_timed ( ctdb - > ev , ctdb - > monitor_context ,
2007-06-06 04:25:46 +04:00
timeval_current_ofs ( ctdb - > tunable . keepalive_interval , 0 ) ,
2007-05-21 03:24:34 +04:00
ctdb_check_for_dead_nodes , ctdb ) ;
return ;
}
2007-05-18 14:06:29 +04:00
/* send a keepalive to all other nodes, unless */
for ( i = 0 ; i < ctdb - > num_nodes ; i + + ) {
2007-05-18 17:23:36 +04:00
struct ctdb_node * node = ctdb - > nodes [ i ] ;
2007-09-04 03:50:07 +04:00
if ( node - > pnn = = ctdb - > vnn ) {
2007-05-18 14:06:29 +04:00
continue ;
}
2007-05-18 17:23:36 +04:00
2007-06-07 09:18:55 +04:00
if ( node - > flags & NODE_FLAGS_DISCONNECTED ) {
2007-06-02 04:03:28 +04:00
/* it might have come alive again */
if ( node - > rx_cnt ! = 0 ) {
ctdb_node_connected ( node ) ;
}
2007-05-19 11:21:58 +04:00
continue ;
2007-05-18 14:06:29 +04:00
}
2007-06-02 04:03:28 +04:00
2007-05-18 17:23:36 +04:00
if ( node - > rx_cnt = = 0 ) {
node - > dead_count + + ;
2007-05-18 14:06:29 +04:00
} else {
2007-05-18 17:23:36 +04:00
node - > dead_count = 0 ;
2007-05-18 14:06:29 +04:00
}
2007-05-18 17:23:36 +04:00
node - > rx_cnt = 0 ;
2007-06-06 04:25:46 +04:00
if ( node - > dead_count > = ctdb - > tunable . keepalive_limit ) {
2007-09-04 03:50:07 +04:00
DEBUG ( 0 , ( " dead count reached for node %u \n " , node - > pnn ) ) ;
2007-05-19 10:59:10 +04:00
ctdb_node_dead ( node ) ;
2007-09-04 03:50:07 +04:00
ctdb_send_keepalive ( ctdb , node - > pnn ) ;
2007-05-18 17:23:36 +04:00
/* maybe tell the transport layer to kill the
sockets as well ?
2007-05-18 14:06:29 +04:00
*/
continue ;
}
2007-05-19 04:20:19 +04:00
2007-06-02 04:03:28 +04:00
if ( node - > tx_cnt = = 0 ) {
2007-09-04 03:50:07 +04:00
DEBUG ( 5 , ( " sending keepalive to %u \n " , node - > pnn ) ) ;
ctdb_send_keepalive ( ctdb , node - > pnn ) ;
2007-05-19 04:20:19 +04:00
}
2007-05-18 14:06:29 +04:00
2007-05-19 04:20:19 +04:00
node - > tx_cnt = 0 ;
2007-05-18 14:06:29 +04:00
}
2007-06-06 07:45:12 +04:00
event_add_timed ( ctdb - > ev , ctdb - > monitor_context ,
2007-06-06 04:25:46 +04:00
timeval_current_ofs ( ctdb - > tunable . keepalive_interval , 0 ) ,
2007-05-18 14:06:29 +04:00
ctdb_check_for_dead_nodes , ctdb ) ;
}
2007-06-06 04:25:46 +04:00
static void ctdb_check_health ( struct event_context * ev , struct timed_event * te ,
struct timeval t , void * private_data ) ;
/*
called when a health monitoring event script finishes
*/
static void ctdb_health_callback ( struct ctdb_context * ctdb , int status , void * p )
{
struct ctdb_node * node = ctdb - > nodes [ ctdb - > vnn ] ;
TDB_DATA data ;
struct ctdb_node_flag_change c ;
2007-06-06 07:45:12 +04:00
event_add_timed ( ctdb - > ev , ctdb - > monitor_context ,
2007-06-06 04:25:46 +04:00
timeval_current_ofs ( ctdb - > tunable . monitor_interval , 0 ) ,
ctdb_check_health , ctdb ) ;
2007-08-21 11:25:15 +04:00
c . vnn = ctdb - > vnn ;
c . old_flags = node - > flags ;
2007-06-07 05:15:22 +04:00
if ( status ! = 0 & & ! ( node - > flags & NODE_FLAGS_UNHEALTHY ) ) {
2007-06-06 04:25:46 +04:00
DEBUG ( 0 , ( " monitor event failed - disabling node \n " ) ) ;
2007-06-07 05:15:22 +04:00
node - > flags | = NODE_FLAGS_UNHEALTHY ;
} else if ( status = = 0 & & ( node - > flags & NODE_FLAGS_UNHEALTHY ) ) {
2007-06-06 04:25:46 +04:00
DEBUG ( 0 , ( " monitor event OK - node re-enabled \n " ) ) ;
2007-06-07 05:15:22 +04:00
ctdb - > nodes [ ctdb - > vnn ] - > flags & = ~ NODE_FLAGS_UNHEALTHY ;
2007-06-06 04:25:46 +04:00
} else {
/* no change */
return ;
}
2007-08-21 11:25:15 +04:00
c . new_flags = node - > flags ;
2007-06-06 04:25:46 +04:00
data . dptr = ( uint8_t * ) & c ;
data . dsize = sizeof ( c ) ;
2007-06-06 15:27:09 +04:00
/* tell the other nodes that something has changed */
2007-06-09 15:58:50 +04:00
ctdb_daemon_send_message ( ctdb , CTDB_BROADCAST_CONNECTED ,
2007-06-06 15:27:09 +04:00
CTDB_SRVID_NODE_FLAGS_CHANGED , data ) ;
2007-06-07 05:15:22 +04:00
2007-06-06 04:25:46 +04:00
}
/*
see if the event scripts think we are healthy
*/
static void ctdb_check_health ( struct event_context * ev , struct timed_event * te ,
struct timeval t , void * private_data )
{
struct ctdb_context * ctdb = talloc_get_type ( private_data , struct ctdb_context ) ;
int ret ;
if ( ctdb - > monitoring_mode = = CTDB_MONITORING_DISABLED ) {
2007-06-06 07:45:12 +04:00
event_add_timed ( ctdb - > ev , ctdb - > monitor_context ,
2007-06-06 04:25:46 +04:00
timeval_current_ofs ( ctdb - > tunable . monitor_interval , 0 ) ,
ctdb_check_health , ctdb ) ;
return ;
}
2007-06-06 07:45:12 +04:00
ret = ctdb_event_script_callback ( ctdb ,
timeval_current_ofs ( ctdb - > tunable . script_timeout , 0 ) ,
ctdb - > monitor_context , ctdb_health_callback , ctdb , " monitor " ) ;
2007-06-06 04:25:46 +04:00
if ( ret ! = 0 ) {
DEBUG ( 0 , ( " Unable to launch monitor event script \n " ) ) ;
2007-06-06 07:45:12 +04:00
event_add_timed ( ctdb - > ev , ctdb - > monitor_context ,
2007-06-06 04:25:46 +04:00
timeval_current_ofs ( ctdb - > tunable . monitor_interval , 0 ) ,
ctdb_check_health , ctdb ) ;
}
}
/* stop any monitoring */
void ctdb_stop_monitoring ( struct ctdb_context * ctdb )
{
talloc_free ( ctdb - > monitor_context ) ;
ctdb - > monitor_context = talloc_new ( ctdb ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , ctdb - > monitor_context ) ;
}
/*
  start monitoring: watch for nodes that might be dead and
  check our own health
*/
void ctdb_start_monitoring ( struct ctdb_context * ctdb )
2007-05-18 14:06:29 +04:00
{
2007-06-06 07:45:12 +04:00
struct timed_event * te ;
ctdb_stop_monitoring ( ctdb ) ;
te = event_add_timed ( ctdb - > ev , ctdb - > monitor_context ,
timeval_current_ofs ( ctdb - > tunable . keepalive_interval , 0 ) ,
ctdb_check_for_dead_nodes , ctdb ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , te ) ;
te = event_add_timed ( ctdb - > ev , ctdb - > monitor_context ,
timeval_current_ofs ( ctdb - > tunable . monitor_interval , 0 ) ,
ctdb_check_health , ctdb ) ;
CTDB_NO_MEMORY_FATAL ( ctdb , te ) ;
2007-05-18 14:06:29 +04:00
}
/*
  modify the flags of this node
*/
/*
  Control handler that sets and clears flag bits on the local node.

  indata.dptr is interpreted as a struct ctdb_node_modflags; indata.dsize
  is not checked here (presumably the caller validates it - confirm).
  The bits in m->set are set first, then the bits in m->clear are
  cleared, so a bit present in both ends up cleared.

  If the flags did not change nothing else happens. Otherwise the old
  and new flags are broadcast to the connected nodes as a
  CTDB_SRVID_NODE_FLAGS_CHANGED message, and if NODE_FLAGS_BANNED has
  just been turned on the node additionally invalidates its generation
  id, starts a freeze, releases all its IPs and enters recovery mode.

  Always returns 0.
*/
int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata)
{
	struct ctdb_node_modflags *m = (struct ctdb_node_modflags *)indata.dptr;
	struct ctdb_node *self = ctdb->nodes[ctdb->vnn];
	uint32_t before = self->flags;
	struct ctdb_node_flag_change change;
	TDB_DATA msg;

	self->flags |= m->set;
	self->flags &= ~m->clear;

	if (before == self->flags) {
		/* no change */
		return 0;
	}

	DEBUG(0, (" Control modflags on node %u - flags now 0x%x \n ", ctdb->vnn, self->flags));

	change.vnn       = ctdb->vnn;
	change.old_flags = before;
	change.new_flags = self->flags;

	msg.dptr  = (uint8_t *)&change;
	msg.dsize = sizeof(change);

	/* tell the other nodes that something has changed */
	ctdb_daemon_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
				 CTDB_SRVID_NODE_FLAGS_CHANGED, msg);

	/* the rest only applies when this change is what banned us */
	if (!(self->flags & NODE_FLAGS_BANNED) || (before & NODE_FLAGS_BANNED)) {
		return 0;
	}

	/* we have just been banned: make sure we are frozen and go into
	   recovery mode */
	DEBUG(0,(" This node has been banned - forcing freeze and recovery \n "));

	/* Mark the generation id as invalid to make us ignore any
	   REQ/REPLY CALL/DMASTER someone sends to us.
	   We are now banned so we should not service database calls
	   anymore.
	*/
	ctdb->vnn_map->generation = INVALID_GENERATION;

	ctdb_start_freeze(ctdb);
	ctdb_release_all_ips(ctdb);
	ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;

	return 0;
}