1
0
mirror of https://github.com/samba-team/samba.git synced 2025-01-10 01:18:15 +03:00

Implement a scheme where nodes are banned if they continuously cause the cluster

to start a recovery session. The node is banned from the cluster for the RecoveryBanPeriod (default of 5 minutes)

(This used to be ctdb commit 4ad43dd07f526b6002477177fbf55483246c2c0c)
This commit is contained in:
Andrew Tridgell 2007-06-07 15:18:55 +10:00
commit 2ed57a9ae1
13 changed files with 654 additions and 435 deletions

View File

@ -116,10 +116,13 @@ static int ctdb_add_node(struct ctdb_context *ctdb, char *nstr)
/* this assumes that the nodes are kept in sorted order, and no gaps */
node->vnn = ctdb->num_nodes;
/* nodes start out disconnected */
node->flags |= NODE_FLAGS_DISCONNECTED;
if (ctdb->address.address &&
ctdb_same_address(&ctdb->address, &node->address)) {
ctdb->vnn = node->vnn;
node->flags |= NODE_FLAGS_CONNECTED;
node->flags &= ~NODE_FLAGS_DISCONNECTED;
}
ctdb->num_nodes++;
@ -222,8 +225,7 @@ uint32_t ctdb_get_num_enabled_nodes(struct ctdb_context *ctdb)
uint32_t count=0;
for (i=0;i<ctdb->vnn_map->size;i++) {
struct ctdb_node *node = ctdb->nodes[ctdb->vnn_map->map[i]];
if ((node->flags & NODE_FLAGS_CONNECTED) &&
!(node->flags & NODE_FLAGS_DISABLED)) {
if (!(node->flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
count++;
}
}
@ -354,14 +356,14 @@ static void ctdb_recv_pkt(struct ctdb_context *ctdb, uint8_t *data, uint32_t len
*/
void ctdb_node_dead(struct ctdb_node *node)
{
if (!(node->flags & NODE_FLAGS_CONNECTED)) {
if (node->flags & NODE_FLAGS_DISCONNECTED) {
DEBUG(1,("%s: node %s is already marked disconnected: %u connected\n",
node->ctdb->name, node->name,
node->ctdb->num_connected));
return;
}
node->ctdb->num_connected--;
node->flags &= ~NODE_FLAGS_CONNECTED;
node->flags |= NODE_FLAGS_DISCONNECTED;
node->rx_cnt = 0;
node->dead_count = 0;
DEBUG(1,("%s: node %s is dead: %u connected\n",
@ -374,7 +376,7 @@ void ctdb_node_dead(struct ctdb_node *node)
*/
void ctdb_node_connected(struct ctdb_node *node)
{
if (node->flags & NODE_FLAGS_CONNECTED) {
if (!(node->flags & NODE_FLAGS_DISCONNECTED)) {
DEBUG(1,("%s: node %s is already marked connected: %u connected\n",
node->ctdb->name, node->name,
node->ctdb->num_connected));
@ -382,7 +384,7 @@ void ctdb_node_connected(struct ctdb_node *node)
}
node->ctdb->num_connected++;
node->dead_count = 0;
node->flags |= NODE_FLAGS_CONNECTED;
node->flags &= ~NODE_FLAGS_DISCONNECTED;
DEBUG(1,("%s: connected to %s - %u connected\n",
node->ctdb->name, node->name, node->ctdb->num_connected));
}

View File

@ -1277,7 +1277,7 @@ uint32_t *ctdb_get_connected_nodes(struct ctdb_context *ctdb,
}
for (i=0;i<map->num;i++) {
if (map->nodes[i].flags & NODE_FLAGS_CONNECTED) {
if (!(map->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
nodes[*num_nodes] = map->nodes[i].vnn;
(*num_nodes)++;
}
@ -1921,20 +1921,25 @@ int ctdb_ctrl_get_public_ips(struct ctdb_context *ctdb,
/*
set/clear the permanent disabled bit on a remote node
*/
int ctdb_ctrl_permdisable(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t mode)
int ctdb_ctrl_modflags(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode,
uint32_t set, uint32_t clear)
{
int ret;
TDB_DATA data;
struct ctdb_node_modflags m;
int32_t res;
data.dsize = sizeof(uint32_t);
data.dptr = (unsigned char *)&mode;
m.set = set;
m.clear = clear;
data.dsize = sizeof(m);
data.dptr = (unsigned char *)&m;
ret = ctdb_control(ctdb, destnode, 0,
CTDB_CONTROL_PERMANENTLY_DISABLE, 0, data,
CTDB_CONTROL_MODIFY_FLAGS, 0, data,
NULL, NULL, &res, &timeout, NULL);
if (ret != 0 || res != 0) {
DEBUG(0,(__location__ " ctdb_control for setpermdisable failed\n"));
DEBUG(0,(__location__ " ctdb_control for modflags failed\n"));
return -1;
}

View File

@ -288,9 +288,9 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb,
case CTDB_CONTROL_LIST_TUNABLES:
return ctdb_control_list_tunables(ctdb, outdata);
case CTDB_CONTROL_PERMANENTLY_DISABLE:
CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
return ctdb_control_permdisable(ctdb, indata);
case CTDB_CONTROL_MODIFY_FLAGS:
CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_node_modflags));
return ctdb_control_modflags(ctdb, indata);
default:
DEBUG(0,(__location__ " Unknown CTDB control opcode %u\n", opcode));
@ -445,7 +445,7 @@ int ctdb_daemon_send_control(struct ctdb_context *ctdb, uint32_t destnode,
if (destnode != CTDB_BROADCAST_VNNMAP && destnode != CTDB_BROADCAST_ALL &&
(!ctdb_validate_vnn(ctdb, destnode) ||
!(ctdb->nodes[destnode]->flags & NODE_FLAGS_CONNECTED))) {
(ctdb->nodes[destnode]->flags & NODE_FLAGS_DISCONNECTED))) {
if (!(flags & CTDB_CTRL_FLAG_NOREPLY)) {
callback(ctdb, -1, tdb_null, "ctdb_control to disconnected node", private_data);
}

View File

@ -44,10 +44,10 @@ static void flag_change_handler(struct ctdb_context *ctdb, uint64_t srvid,
return;
}
/* don't get the connected flag from the other node */
/* don't get the disconnected flag from the other node */
ctdb->nodes[c->vnn]->flags =
(ctdb->nodes[c->vnn]->flags&NODE_FLAGS_CONNECTED)
| (c->flags & ~NODE_FLAGS_CONNECTED);
(ctdb->nodes[c->vnn]->flags&NODE_FLAGS_DISCONNECTED)
| (c->flags & ~NODE_FLAGS_DISCONNECTED);
}
/* called when the "startup" event script has finished */

View File

@ -48,7 +48,7 @@ static void ctdb_check_for_dead_nodes(struct event_context *ev, struct timed_eve
continue;
}
if (!(node->flags & NODE_FLAGS_CONNECTED)) {
if (node->flags & NODE_FLAGS_DISCONNECTED) {
/* it might have come alive again */
if (node->rx_cnt != 0) {
ctdb_node_connected(node);
@ -185,19 +185,22 @@ void ctdb_start_monitoring(struct ctdb_context *ctdb)
/*
administratively disable/enable a node
modify flags on a node
*/
int32_t ctdb_control_permdisable(struct ctdb_context *ctdb, TDB_DATA indata)
int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata)
{
uint32_t set = *(uint32_t *)indata.dptr;
struct ctdb_node_modflags *m = (struct ctdb_node_modflags *)indata.dptr;
TDB_DATA data;
struct ctdb_node_flag_change c;
struct ctdb_node *node = ctdb->nodes[ctdb->vnn];
uint32_t old_flags = node->flags;
if (set) {
node->flags |= NODE_FLAGS_PERMANENTLY_DISABLED;
} else {
node->flags &= ~NODE_FLAGS_PERMANENTLY_DISABLED;
node->flags |= m->set;
node->flags &= ~m->clear;
if (node->flags == old_flags) {
/* no change */
return 0;
}
c.vnn = ctdb->vnn;

File diff suppressed because it is too large Load Diff

View File

@ -25,19 +25,21 @@ static const struct {
uint32_t default_v;
size_t offset;
} tunable_map[] = {
{ "MaxRedirectCount", 3, offsetof(struct ctdb_tunable, max_redirect_count) },
{ "SeqnumFrequency", 1, offsetof(struct ctdb_tunable, seqnum_frequency) },
{ "ControlTimeout", 60, offsetof(struct ctdb_tunable, control_timeout) },
{ "TraverseTimeout", 20, offsetof(struct ctdb_tunable, traverse_timeout) },
{ "KeepaliveInterval", 2, offsetof(struct ctdb_tunable, keepalive_interval) },
{ "KeepaliveLimit", 3, offsetof(struct ctdb_tunable, keepalive_limit) },
{ "MaxLACount", 7, offsetof(struct ctdb_tunable, max_lacount) },
{ "RecoverTimeout", 5, offsetof(struct ctdb_tunable, recover_timeout) },
{ "RecoverInterval", 1, offsetof(struct ctdb_tunable, recover_interval) },
{ "ElectionTimeout", 3, offsetof(struct ctdb_tunable, election_timeout) },
{ "TakeoverTimeout", 5, offsetof(struct ctdb_tunable, takeover_timeout) },
{ "MonitorInterval", 15, offsetof(struct ctdb_tunable, monitor_interval) },
{ "EventScriptTimeout", 20, offsetof(struct ctdb_tunable, script_timeout) },
{ "MaxRedirectCount", 3, offsetof(struct ctdb_tunable, max_redirect_count) },
{ "SeqnumFrequency", 1, offsetof(struct ctdb_tunable, seqnum_frequency) },
{ "ControlTimeout", 60, offsetof(struct ctdb_tunable, control_timeout) },
{ "TraverseTimeout", 20, offsetof(struct ctdb_tunable, traverse_timeout) },
{ "KeepaliveInterval", 2, offsetof(struct ctdb_tunable, keepalive_interval) },
{ "KeepaliveLimit", 3, offsetof(struct ctdb_tunable, keepalive_limit) },
{ "MaxLACount", 7, offsetof(struct ctdb_tunable, max_lacount) },
{ "RecoverTimeout", 5, offsetof(struct ctdb_tunable, recover_timeout) },
{ "RecoverInterval", 1, offsetof(struct ctdb_tunable, recover_interval) },
{ "ElectionTimeout", 3, offsetof(struct ctdb_tunable, election_timeout) },
{ "TakeoverTimeout", 5, offsetof(struct ctdb_tunable, takeover_timeout) },
{ "MonitorInterval", 15, offsetof(struct ctdb_tunable, monitor_interval) },
{ "EventScriptTimeout", 20, offsetof(struct ctdb_tunable, script_timeout) },
{ "RecoveryGracePeriod", 60, offsetof(struct ctdb_tunable, recovery_grace_period) },
{ "RecoveryBanPeriod", 300, offsetof(struct ctdb_tunable, recovery_ban_period) },
};
/*

View File

@ -351,9 +351,9 @@ int ctdb_ctrl_list_tunables(struct ctdb_context *ctdb,
TALLOC_CTX *mem_ctx,
const char ***list, uint32_t *count);
int ctdb_ctrl_permdisable(struct ctdb_context *ctdb,
struct timeval timeout,
uint32_t destnode,
uint32_t mode);
int ctdb_ctrl_modflags(struct ctdb_context *ctdb,
struct timeval timeout,
uint32_t destnode,
uint32_t set, uint32_t clear);
#endif

View File

@ -53,6 +53,8 @@ struct ctdb_tunable {
uint32_t takeover_timeout;
uint32_t monitor_interval;
uint32_t script_timeout;
uint32_t recovery_grace_period;
uint32_t recovery_ban_period;
};
/*
@ -111,10 +113,12 @@ struct ctdb_node {
const char *name; /* for debug messages */
void *private_data; /* private to transport */
uint32_t vnn;
#define NODE_FLAGS_CONNECTED 0x00000001
#define NODE_FLAGS_UNHEALTHY 0x00000002
#define NODE_FLAGS_PERMANENTLY_DISABLED 0x00000004
#define NODE_FLAGS_DISCONNECTED 0x00000001 /* node isn't connected */
#define NODE_FLAGS_UNHEALTHY 0x00000002 /* monitoring says node is unhealthy */
#define NODE_FLAGS_PERMANENTLY_DISABLED 0x00000004 /* administrator has disabled node */
#define NODE_FLAGS_BANNED 0x00000008 /* recovery daemon has banned the node */
#define NODE_FLAGS_DISABLED (NODE_FLAGS_UNHEALTHY|NODE_FLAGS_PERMANENTLY_DISABLED)
#define NODE_FLAGS_INACTIVE (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_BANNED)
uint32_t flags;
/* used by the dead node monitoring */
@ -414,7 +418,7 @@ enum ctdb_controls {CTDB_CONTROL_PROCESS_EXISTS = 0,
CTDB_CONTROL_GET_TUNABLE = 49,
CTDB_CONTROL_LIST_TUNABLES = 50,
CTDB_CONTROL_GET_PUBLIC_IPS = 51,
CTDB_CONTROL_PERMANENTLY_DISABLE = 52,
CTDB_CONTROL_MODIFY_FLAGS = 52,
};
/*
@ -467,6 +471,14 @@ struct ctdb_node_flag_change {
uint32_t flags;
};
/*
structure to change flags on a node

This is the data payload of the CTDB_CONTROL_MODIFY_FLAGS control.
The receiving node first ORs 'set' into its own flags and then removes
the bits in 'clear', so a bit present in both masks ends up cleared.
*/
struct ctdb_node_modflags {
/* NODE_FLAGS_* bits to turn on */
uint32_t set;
/* NODE_FLAGS_* bits to turn off (applied after 'set') */
uint32_t clear;
};
enum call_state {CTDB_CALL_WAIT, CTDB_CALL_DONE, CTDB_CALL_ERROR};
#define CTDB_LMASTER_ANY 0xffffffff
@ -1010,6 +1022,6 @@ int32_t ctdb_control_list_tunables(struct ctdb_context *ctdb, TDB_DATA *outdata)
void ctdb_tunables_set_defaults(struct ctdb_context *ctdb);
int32_t ctdb_control_permdisable(struct ctdb_context *ctdb, TDB_DATA indata);
int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata);
#endif

View File

@ -404,6 +404,29 @@ static bool ctdb_same_subnet(const char *ip1, const char *ip2, uint8_t netmask_b
return true;
}
/*
  Look for a node that can take over the public IP of node 'start_node'.

  The other entries of the node map are visited in order, beginning with
  the one just after start_node and wrapping round. The first node that
  has none of the 'mask_flags' bits set and whose public address is on
  the same subnet as start_node's public address is chosen, and its vnn
  is stored in start_node's takeover_vnn.

  If no node qualifies, takeover_vnn is left as it was.
*/
static void ctdb_takeover_find_node(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
				    int start_node, uint32_t mask_flags)
{
	int candidate;

	for (candidate = (start_node+1)%nodemap->num;
	     candidate != start_node;
	     candidate = (candidate+1)%nodemap->num) {
		/* skip nodes carrying any of the excluded flags */
		if (nodemap->nodes[candidate].flags & mask_flags) {
			continue;
		}

		/* the candidate must be able to host the address on its network */
		if (!ctdb_same_subnet(ctdb->nodes[candidate]->public_address,
				      ctdb->nodes[start_node]->public_address,
				      ctdb->nodes[candidate]->public_netmask_bits)) {
			continue;
		}

		/* first match wins */
		ctdb->nodes[start_node]->takeover_vnn = nodemap->nodes[candidate].vnn;
		return;
	}
}
/*
make any IP alias changes for public addresses that are necessary
*/
@ -413,51 +436,32 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
int ret;
struct ctdb_public_ip ip;
/* work out which node will look after each public IP */
/* Work out which node will look after each public IP.
* takeover_node cycles over the nodes and is incremented each time a
* node has been assigned to take over for another node.
* This spreads the failed nodes out across the remaining
* nodes more evenly
*/
for (i=0;i<nodemap->num;i++) {
if ((nodemap->nodes[i].flags & NODE_FLAGS_CONNECTED) &&
!(nodemap->nodes[i].flags & NODE_FLAGS_DISABLED)) {
if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
ctdb->nodes[i]->takeover_vnn = nodemap->nodes[i].vnn;
} else {
/* assign this dead nodes IP to the next higher node */
for (j=(i+1)%nodemap->num;
j != i;
j=(j+1)%nodemap->num) {
if ((nodemap->nodes[j].flags & NODE_FLAGS_CONNECTED) &&
!(nodemap->nodes[j].flags & NODE_FLAGS_DISABLED) &&
ctdb_same_subnet(ctdb->nodes[j]->public_address,
ctdb->nodes[i]->public_address,
ctdb->nodes[j]->public_netmask_bits)) {
ctdb->nodes[i]->takeover_vnn = nodemap->nodes[j].vnn;
break;
}
}
ctdb->nodes[i]->takeover_vnn = (uint32_t)-1;
ctdb_takeover_find_node(ctdb, nodemap, i, NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED);
/* if no enabled node can take it, then we
might as well use any connected node. It
might as well use any active node. It
probably means that some subsystem (such as
NFS) is sick on all nodes. Best we can do
is to keep the other services up. */
if (j == i) {
for (j=(i+1)%nodemap->num;
j != i;
j=(j+1)%nodemap->num) {
if ((nodemap->nodes[j].flags & NODE_FLAGS_CONNECTED) &&
ctdb_same_subnet(ctdb->nodes[j]->public_address,
ctdb->nodes[i]->public_address,
ctdb->nodes[j]->public_netmask_bits)) {
ctdb->nodes[i]->takeover_vnn = nodemap->nodes[j].vnn;
DEBUG(0,("All available nodes disabled for %s - using a connected node\n",
ctdb->nodes[i]->public_address));
break;
}
}
if (ctdb->nodes[i]->takeover_vnn == (uint32_t)-1) {
ctdb_takeover_find_node(ctdb, nodemap, i, NODE_FLAGS_INACTIVE);
}
if (j == i) {
if (ctdb->nodes[i]->takeover_vnn == (uint32_t)-1) {
DEBUG(0,(__location__ " No node available on same network to take %s\n",
ctdb->nodes[i]->public_address));
ctdb->nodes[i]->takeover_vnn = -1;
}
}
}
@ -470,7 +474,9 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
hold the given alias */
for (i=0;i<nodemap->num;i++) {
/* don't talk to unconnected nodes */
if (!(nodemap->nodes[i].flags & NODE_FLAGS_CONNECTED)) continue;
if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
continue;
}
/* tell this node to delete all of the aliases that it should not have */
for (j=0;j<nodemap->num;j++) {

View File

@ -274,7 +274,7 @@ static int ctdb_tcp_listen_automatic(struct ctdb_context *ctdb)
ctdb->address.address,
ctdb->address.port);
ctdb->vnn = ctdb->nodes[i]->vnn;
ctdb->nodes[i]->flags |= NODE_FLAGS_CONNECTED;
ctdb->nodes[i]->flags &= ~NODE_FLAGS_DISCONNECTED;
DEBUG(1,("ctdb chose network address %s:%u vnn %u\n",
ctdb->address.address,
ctdb->address.port,

View File

@ -1,3 +1,4 @@
127.0.0.1
127.0.0.2
127.0.0.3
127.0.0.4

View File

@ -285,13 +285,13 @@ static int control_status(struct ctdb_context *ctdb, int argc, const char **argv
}
if(options.machinereadable){
printf(":Node:IP:Connected:Disabled:Permanently Disabled:\n");
printf(":Node:IP:Disonnected:Disabled:Permanently Disabled:\n");
for(i=0;i<nodemap->num;i++){
printf(":%d:%s:%d:%d:%d:\n", nodemap->nodes[i].vnn,
inet_ntoa(nodemap->nodes[i].sin.sin_addr),
!!(nodemap->nodes[i].flags&NODE_FLAGS_CONNECTED),
!!(nodemap->nodes[i].flags&NODE_FLAGS_UNHEALTHY),
!!(nodemap->nodes[i].flags&NODE_FLAGS_PERMANENTLY_DISABLED));
!!(nodemap->nodes[i].flags&NODE_FLAGS_DISCONNECTED),
!!(nodemap->nodes[i].flags&NODE_FLAGS_UNHEALTHY),
!!(nodemap->nodes[i].flags&NODE_FLAGS_PERMANENTLY_DISABLED));
}
return 0;
}
@ -303,10 +303,10 @@ static int control_status(struct ctdb_context *ctdb, int argc, const char **argv
flags_str = "DISABLED";
} else if (nodemap->nodes[i].flags & NODE_FLAGS_UNHEALTHY) {
flags_str = "UNHEALTHY";
} else if (nodemap->nodes[i].flags & NODE_FLAGS_CONNECTED) {
flags_str = "CONNECTED";
} else if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
flags_str = "DISCONNECTED";
} else {
flags_str = "UNAVAILABLE";
flags_str = "OK";
}
printf("vnn:%d %-16s %s%s\n", nodemap->nodes[i].vnn,
inet_ntoa(nodemap->nodes[i].sin.sin_addr),
@ -405,7 +405,7 @@ static int control_disable(struct ctdb_context *ctdb, int argc, const char **arg
{
int ret;
ret = ctdb_ctrl_permdisable(ctdb, TIMELIMIT(), options.vnn, NODE_FLAGS_PERMANENTLY_DISABLED);
ret = ctdb_ctrl_modflags(ctdb, TIMELIMIT(), options.vnn, NODE_FLAGS_PERMANENTLY_DISABLED, 0);
if (ret != 0) {
printf("Unable to disable node %u\n", options.vnn);
return ret;
@ -421,7 +421,7 @@ static int control_enable(struct ctdb_context *ctdb, int argc, const char **argv
{
int ret;
ret = ctdb_ctrl_permdisable(ctdb, TIMELIMIT(), options.vnn, 0);
ret = ctdb_ctrl_modflags(ctdb, TIMELIMIT(), options.vnn, 0, NODE_FLAGS_PERMANENTLY_DISABLED);
if (ret != 0) {
printf("Unable to enable node %u\n", options.vnn);
return ret;
@ -618,7 +618,7 @@ static int control_getvar(struct ctdb_context *ctdb, int argc, const char **argv
return -1;
}
printf("%-18s = %u\n", name, value);
printf("%-19s = %u\n", name, value);
return 0;
}