1
0
mirror of https://github.com/samba-team/samba.git synced 2025-01-25 06:04:04 +03:00

First step in health monitoring of cluster nodes. Nodes that are not healthy will be marked disabled.

(This used to be ctdb commit d3dbd9fc4db21632075b56fc52cf95435c63374a)
This commit is contained in:
Andrew Tridgell 2007-06-05 17:43:19 +10:00
parent ee747b5bd6
commit ac55bc4166
9 changed files with 22 additions and 11 deletions

View File

@ -222,14 +222,16 @@ uint32_t ctdb_get_vnn(struct ctdb_context *ctdb)
}
/*
return the number of connected nodes
return the number of enabled nodes
*/
uint32_t ctdb_get_num_connected_nodes(struct ctdb_context *ctdb)
uint32_t ctdb_get_num_enabled_nodes(struct ctdb_context *ctdb)
{
int i;
uint32_t count=0;
for (i=0;i<ctdb->vnn_map->size;i++) {
if (ctdb->nodes[ctdb->vnn_map->map[i]]->flags & NODE_FLAGS_CONNECTED) {
struct ctdb_node *node = ctdb->nodes[ctdb->vnn_map->map[i]];
if ((node->flags & NODE_FLAGS_CONNECTED) &&
!(node->flags & NODE_FLAGS_DISABLED)) {
count++;
}
}

View File

@ -1364,7 +1364,7 @@ struct ctdb_db_context *ctdb_attach(struct ctdb_context *ctdb, const char *name)
ctdb_db->db_id = *(uint32_t *)data.dptr;
talloc_free(data.dptr);
ret = ctdb_ctrl_getdbpath(ctdb, timeval_current_ofs(1, 0), CTDB_CURRENT_NODE, ctdb_db->db_id, ctdb_db, &ctdb_db->db_path);
ret = ctdb_ctrl_getdbpath(ctdb, timeval_current_ofs(2, 0), CTDB_CURRENT_NODE, ctdb_db->db_id, ctdb_db, &ctdb_db->db_path);
if (ret != 0) {
DEBUG(0,("Failed to get dbpath for database '%s'\n", name));
talloc_free(ctdb_db);

View File

@ -697,6 +697,8 @@ again:
"MonitorFrequency", &ctdb->tunable.monitor_frequency);
ctdb_ctrl_get_tunable(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE,
"ElectionTimeout", &ctdb->tunable.election_timeout);
ctdb_ctrl_get_tunable(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE,
"TakeoverTimeout", &ctdb->tunable.takeover_timeout);
vnn = ctdb_ctrl_getvnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
if (vnn == (uint32_t)-1) {

View File

@ -372,7 +372,7 @@ int32_t ctdb_control_traverse_data(struct ctdb_context *ctdb, TDB_DATA data, TDB
if (key.dsize == 0 && data.dsize == 0) {
state->null_count++;
if (state->null_count != ctdb_get_num_connected_nodes(ctdb)) {
if (state->null_count != ctdb_get_num_enabled_nodes(ctdb)) {
return 0;
}
}

View File

@ -35,6 +35,7 @@ static const struct {
{ "RecoverTimeout", 5, offsetof(struct ctdb_tunable, recover_timeout) },
{ "MonitorFrequency", 1, offsetof(struct ctdb_tunable, monitor_frequency) },
{ "ElectionTimeout", 3, offsetof(struct ctdb_tunable, election_timeout) },
{ "TakeoverTimeout", 5, offsetof(struct ctdb_tunable, takeover_timeout) },
};
/*

View File

@ -21,8 +21,10 @@ case $cmd in
service smb stop > /dev/null 2>&1
service winbind stop > /dev/null 2>&1
# start Samba service
service smb start
# start Samba service. Start it reniced, as under very heavy load
# the sheer number of smbd processes would otherwise leave few CPU
# cycles for anything else
nice service smb start
service winbind start
# wait for the Samba tcp ports to become available

View File

@ -50,6 +50,7 @@ struct ctdb_tunable {
uint32_t recover_timeout;
uint32_t monitor_frequency;
uint32_t election_timeout;
uint32_t takeover_timeout;
};
/*
@ -109,6 +110,7 @@ struct ctdb_node {
void *private_data; /* private to transport */
uint32_t vnn;
#define NODE_FLAGS_CONNECTED 0x00000001
#define NODE_FLAGS_DISABLED 0x00000002
uint32_t flags;
/* used by the dead node monitoring */
@ -905,7 +907,7 @@ int32_t ctdb_control_thaw(struct ctdb_context *ctdb);
int ctdb_start_recoverd(struct ctdb_context *ctdb);
uint32_t ctdb_get_num_connected_nodes(struct ctdb_context *ctdb);
uint32_t ctdb_get_num_enabled_nodes(struct ctdb_context *ctdb);
int ctdb_start_monitoring(struct ctdb_context *ctdb);
void ctdb_send_keepalive(struct ctdb_context *ctdb, uint32_t destnode);

View File

@ -27,7 +27,7 @@
#include "../include/ctdb_private.h"
#define TAKEOVER_TIMEOUT() timeval_current_ofs(5,0)
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT 3
@ -403,7 +403,8 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
/* work out which node will look after each public IP */
for (i=0;i<nodemap->num;i++) {
if (nodemap->nodes[i].flags & NODE_FLAGS_CONNECTED) {
if ((nodemap->nodes[i].flags & NODE_FLAGS_CONNECTED) &&
!(nodemap->nodes[i].flags & NODE_FLAGS_DISABLED)) {
ctdb->nodes[i]->takeover_vnn = nodemap->nodes[i].vnn;
} else {
/* assign this dead node's IP to the next higher node */
@ -411,6 +412,7 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
j != i;
j=(j+1)%nodemap->num) {
if ((nodemap->nodes[j].flags & NODE_FLAGS_CONNECTED) &&
!(nodemap->nodes[j].flags & NODE_FLAGS_DISABLED) &&
ctdb_same_subnet(ctdb->nodes[j]->public_address,
ctdb->nodes[i]->public_address,
ctdb->nodes[j]->public_netmask_bits)) {

View File

@ -383,7 +383,7 @@ static int control_shutdown(struct ctdb_context *ctdb, int argc, const char **ar
{
int ret;
ret = ctdb_ctrl_shutdown(ctdb, timeval_current_ofs(1, 0), options.vnn);
ret = ctdb_ctrl_shutdown(ctdb, TIMELIMIT(), options.vnn);
if (ret != 0) {
printf("Unable to shutdown node %u\n", options.vnn);
return ret;