mirror of
https://github.com/samba-team/samba.git
synced 2025-03-08 04:58:40 +03:00
Merge root@10.1.1.27:/shared/ctdb/ctdb-git
(This used to be ctdb commit b869bb0e32d32422a5ba6b235864acba07f2b412)
This commit is contained in:
commit
a1084c687f
@ -54,7 +54,7 @@ CTDB_SERVER_OBJ = server/ctdbd.o server/ctdb_daemon.o server/ctdb_lockwait.o \
|
||||
server/ctdb_traverse.o server/eventscript.o server/ctdb_takeover.o \
|
||||
server/ctdb_serverids.o server/ctdb_persistent.o \
|
||||
server/ctdb_keepalive.o server/ctdb_logging.o server/ctdb_uptime.o \
|
||||
server/ctdb_vacuum.o \
|
||||
server/ctdb_vacuum.o server/ctdb_banning.o \
|
||||
$(CTDB_CLIENT_OBJ) $(CTDB_TCP_OBJ) @INFINIBAND_WRAPPER_OBJ@
|
||||
|
||||
TEST_BINS=tests/bin/ctdb_bench tests/bin/ctdb_fetch tests/bin/ctdb_store \
|
||||
|
@ -3842,3 +3842,48 @@ int ctdb_ctrl_disablescript(struct ctdb_context *ctdb, struct timeval timeout, u
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int ctdb_ctrl_set_ban(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, struct ctdb_ban_time *bantime)
|
||||
{
|
||||
int ret;
|
||||
TDB_DATA data;
|
||||
int32_t res;
|
||||
|
||||
data.dsize = sizeof(*bantime);
|
||||
data.dptr = (uint8_t *)bantime;
|
||||
|
||||
ret = ctdb_control(ctdb, destnode, 0,
|
||||
CTDB_CONTROL_SET_BAN_STATE, 0, data,
|
||||
NULL, NULL, &res, &timeout, NULL);
|
||||
if (ret != 0 || res != 0) {
|
||||
DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set ban state failed\n"));
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
  send a CTDB_CONTROL_GET_BAN_STATE control to destnode and hand the
  reply back to the caller.

  on success *bantime points at a struct ctdb_ban_time that has been
  moved onto mem_ctx (the caller owns it) and 0 is returned.
  on failure -1 is returned and *bantime is left untouched.
 */
int ctdb_ctrl_get_ban(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, TALLOC_CTX *mem_ctx, struct ctdb_ban_time **bantime)
{
	int ret;
	TDB_DATA outdata;
	int32_t res;
	TALLOC_CTX *tmp_ctx = talloc_new(NULL);

	ret = ctdb_control(ctdb, destnode, 0,
			   CTDB_CONTROL_GET_BAN_STATE, 0, tdb_null,
			   tmp_ctx, &outdata, &res, &timeout, NULL);
	if (ret != 0 || res != 0) {
		DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ban state failed\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	/* never hand a short (or missing) reply to the caller as if it
	   were a complete struct ctdb_ban_time */
	if (outdata.dptr == NULL || outdata.dsize != sizeof(struct ctdb_ban_time)) {
		DEBUG(DEBUG_ERR,(__location__ " Invalid reply size %u for get ban state\n", (unsigned)outdata.dsize));
		talloc_free(tmp_ctx);
		return -1;
	}

	/* move the reply out of the temporary context before dropping it */
	*bantime = (struct ctdb_ban_time *)talloc_steal(mem_ctx, outdata.dptr);
	talloc_free(tmp_ctx);

	return 0;
}
|
||||
|
||||
|
||||
|
@ -75,16 +75,6 @@ struct ctdb_call_info {
|
||||
*/
|
||||
#define CTDB_SRVID_SET_NODE_FLAGS 0xF400000000000000LL
|
||||
|
||||
/*
|
||||
a message ID meaning that a node should be banned
|
||||
*/
|
||||
#define CTDB_SRVID_BAN_NODE 0xF500000000000000LL
|
||||
|
||||
/*
|
||||
a message ID meaning that a node should be unbanned
|
||||
*/
|
||||
#define CTDB_SRVID_UNBAN_NODE 0xF600000000000000LL
|
||||
|
||||
/*
|
||||
a message to tell the recovery daemon to fetch a set of records
|
||||
*/
|
||||
@ -669,4 +659,13 @@ int ctdb_ctrl_setrecmasterrole(struct ctdb_context *ctdb, struct timeval timeout
|
||||
int ctdb_ctrl_enablescript(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, const char *script);
|
||||
int ctdb_ctrl_disablescript(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, const char *script);
|
||||
|
||||
/*
  payload of the SET_BAN_STATE / GET_BAN_STATE controls
 */
struct ctdb_ban_time {
	/* node number the ban applies to */
	uint32_t pnn;
	/* length of the ban in seconds; 0 means the node is (to be) unbanned */
	uint32_t time;
};
|
||||
|
||||
int ctdb_ctrl_set_ban(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, struct ctdb_ban_time *bantime);
|
||||
int ctdb_ctrl_get_ban(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, TALLOC_CTX *mem_ctx, struct ctdb_ban_time **bantime);
|
||||
|
||||
|
||||
#endif
|
||||
|
@ -226,6 +226,8 @@ struct ctdb_node {
|
||||
by each node.
|
||||
*/
|
||||
struct ctdb_all_public_ips *public_ips;
|
||||
/* used by the recovery daemon to track when a node should be banned */
|
||||
struct ctdb_banning_state *ban_state;
|
||||
};
|
||||
|
||||
/*
|
||||
@ -429,6 +431,7 @@ struct ctdb_context {
|
||||
TALLOC_CTX *release_ips_ctx; /* a context used to automatically drop all IPs if we fail to recover the node */
|
||||
TALLOC_CTX *script_monitoring_ctx; /* a context where we store results while running the monitor event */
|
||||
TALLOC_CTX *last_monitoring_ctx;
|
||||
TALLOC_CTX *banning_ctx;
|
||||
};
|
||||
|
||||
struct ctdb_db_context {
|
||||
@ -587,6 +590,8 @@ enum ctdb_controls {CTDB_CONTROL_PROCESS_EXISTS = 0,
|
||||
CTDB_CONTROL_EVENT_SCRIPT_DISABLED = 106,
|
||||
CTDB_CONTROL_ENABLE_SCRIPT = 107,
|
||||
CTDB_CONTROL_DISABLE_SCRIPT = 108,
|
||||
CTDB_CONTROL_SET_BAN_STATE = 109,
|
||||
CTDB_CONTROL_GET_BAN_STATE = 110,
|
||||
};
|
||||
|
||||
/*
|
||||
@ -1469,4 +1474,7 @@ int ctdb_vacuum_init(struct ctdb_db_context *ctdb_db);
|
||||
int32_t ctdb_control_enable_script(struct ctdb_context *ctdb, TDB_DATA indata);
|
||||
int32_t ctdb_control_disable_script(struct ctdb_context *ctdb, TDB_DATA indata);
|
||||
|
||||
int32_t ctdb_control_set_ban_state(struct ctdb_context *ctdb, TDB_DATA indata);
|
||||
int32_t ctdb_control_get_ban_state(struct ctdb_context *ctdb, TDB_DATA *outdata);
|
||||
|
||||
#endif
|
||||
|
121
ctdb/server/ctdb_banning.c
Normal file
121
ctdb/server/ctdb_banning.c
Normal file
@ -0,0 +1,121 @@
|
||||
/*
|
||||
ctdb banning code
|
||||
|
||||
Copyright (C) Ronnie Sahlberg 2009
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
#include "includes.h"
|
||||
#include "lib/events/events.h"
|
||||
#include "lib/tdb/include/tdb.h"
|
||||
#include "system/time.h"
|
||||
#include "system/network.h"
|
||||
#include "system/filesys.h"
|
||||
#include "system/wait.h"
|
||||
#include "../include/ctdb.h"
|
||||
#include "../include/ctdb_private.h"
|
||||
|
||||
|
||||
static void
|
||||
ctdb_ban_node_event(struct event_context *ev, struct timed_event *te,
|
||||
struct timeval t, void *private_data)
|
||||
{
|
||||
struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
|
||||
|
||||
DEBUG(DEBUG_ERR,("Banning timedout\n"));
|
||||
ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_BANNED;
|
||||
|
||||
if (ctdb->banning_ctx != NULL) {
|
||||
talloc_free(ctdb->banning_ctx);
|
||||
ctdb->banning_ctx = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/*
  handler for CTDB_CONTROL_SET_BAN_STATE.

  indata holds a struct ctdb_ban_time.

  - ban for another node: only the BANNED flag in our copy of that
    node is updated (set when time != 0, cleared when time == 0).
  - ban for this node: any previously stored ban state (and the timer
    hanging off it) is dropped; for time != 0 the request is stored in
    ctdb->banning_ctx, the BANNED flag is set and a timed event is
    scheduled to lift the ban after 'time' seconds.

  a request to ban is ignored (returning 0) when the enable_bans
  tunable is 0.

  returns 0 on success, -1 on an invalid pnn or allocation failure.
 */
int32_t ctdb_control_set_ban_state(struct ctdb_context *ctdb, TDB_DATA indata)
{
	struct ctdb_ban_time *bantime = (struct ctdb_ban_time *)indata.dptr;

	DEBUG(DEBUG_INFO,("SET BAN STATE\n"));

	if (bantime->pnn != ctdb->pnn) {
		/* pnn is unsigned, so only the upper bound can be violated */
		if (bantime->pnn >= ctdb->num_nodes) {
			DEBUG(DEBUG_ERR,(__location__ " ERROR: Invalid ban request. PNN:%u is invalid. Max nodes %u\n", (unsigned)bantime->pnn, (unsigned)ctdb->num_nodes));
			return -1;
		}
		if (bantime->time == 0) {
			DEBUG(DEBUG_INFO,("unbanning node %u\n", (unsigned)bantime->pnn));
			ctdb->nodes[bantime->pnn]->flags &= ~NODE_FLAGS_BANNED;
		} else {
			DEBUG(DEBUG_INFO,("banning node %u\n", (unsigned)bantime->pnn));
			if (ctdb->tunable.enable_bans == 0) {
				DEBUG(DEBUG_INFO,("Bans are disabled - ignoring ban of node %u\n", (unsigned)bantime->pnn));
				return 0;
			}

			ctdb->nodes[bantime->pnn]->flags |= NODE_FLAGS_BANNED;
		}
		return 0;
	}

	/* the request is for this node: forget any earlier ban first,
	   which also cancels its pending timeout event */
	if (ctdb->banning_ctx != NULL) {
		talloc_free(ctdb->banning_ctx);
		ctdb->banning_ctx = NULL;
	}

	if (bantime->time == 0) {
		DEBUG(DEBUG_ERR,("Unbanning this node\n"));
		ctdb->nodes[bantime->pnn]->flags &= ~NODE_FLAGS_BANNED;
		return 0;
	}

	if (ctdb->tunable.enable_bans == 0) {
		DEBUG(DEBUG_ERR,("Bans are disabled - ignoring ban of node %u\n", (unsigned)bantime->pnn));
		return 0;
	}

	/* keep a copy of the request so GET_BAN_STATE can report it */
	ctdb->banning_ctx = talloc(ctdb, struct ctdb_ban_time);
	if (ctdb->banning_ctx == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " ERROR Failed to allocate new banning state\n"));
		return -1;
	}
	*((struct ctdb_ban_time *)(ctdb->banning_ctx)) = *bantime;

	DEBUG(DEBUG_ERR,("Banning this node for %u seconds\n", (unsigned)bantime->time));
	ctdb->nodes[bantime->pnn]->flags |= NODE_FLAGS_BANNED;

	/* the event is parented to banning_ctx, so freeing the context
	   above is enough to cancel it */
	event_add_timed(ctdb->ev, ctdb->banning_ctx, timeval_current_ofs(bantime->time,0), ctdb_ban_node_event, ctdb);

	return 0;
}
|
||||
|
||||
/*
  handler for CTDB_CONTROL_GET_BAN_STATE.

  fills outdata with a freshly allocated struct ctdb_ban_time: a copy
  of the stored ban request when ctdb->banning_ctx is set, otherwise
  our own pnn with a time of 0.
 */
int32_t ctdb_control_get_ban_state(struct ctdb_context *ctdb, TDB_DATA *outdata)
{
	struct ctdb_ban_time *reply = talloc(outdata, struct ctdb_ban_time);

	CTDB_NO_MEMORY(ctdb, reply);

	if (ctdb->banning_ctx == NULL) {
		/* no ban stored for this node */
		reply->time = 0;
		reply->pnn  = ctdb->pnn;
	} else {
		*reply = *(struct ctdb_ban_time *)(ctdb->banning_ctx);
	}

	outdata->dsize = sizeof(struct ctdb_ban_time);
	outdata->dptr  = (uint8_t *)reply;

	return 0;
}
|
@ -518,6 +518,14 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb,
|
||||
case CTDB_CONTROL_DISABLE_SCRIPT:
|
||||
return ctdb_control_disable_script(ctdb, indata);
|
||||
|
||||
case CTDB_CONTROL_SET_BAN_STATE:
|
||||
CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_ban_time));
|
||||
return ctdb_control_set_ban_state(ctdb, indata);
|
||||
|
||||
case CTDB_CONTROL_GET_BAN_STATE:
|
||||
CHECK_CONTROL_DATA_SIZE(0);
|
||||
return ctdb_control_get_ban_state(ctdb, outdata);
|
||||
|
||||
default:
|
||||
DEBUG(DEBUG_CRIT,(__location__ " Unknown CTDB control opcode %u\n", opcode));
|
||||
return -1;
|
||||
|
@ -306,6 +306,14 @@ int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata)
|
||||
}
|
||||
}
|
||||
|
||||
/* we dont let other nodes modify our BANNED status */
|
||||
if (c->pnn == ctdb->pnn) {
|
||||
node->flags &= ~NODE_FLAGS_BANNED;
|
||||
if (old_flags & NODE_FLAGS_BANNED) {
|
||||
node->flags |= NODE_FLAGS_BANNED;
|
||||
}
|
||||
}
|
||||
|
||||
if (node->flags == c->old_flags) {
|
||||
DEBUG(DEBUG_INFO, ("Control modflags on node %u - Unchanged - flags 0x%x\n", c->pnn, node->flags));
|
||||
return 0;
|
||||
|
@ -31,11 +31,6 @@
|
||||
#include "dlinklist.h"
|
||||
|
||||
|
||||
struct ban_state {
|
||||
struct ctdb_recoverd *rec;
|
||||
uint32_t banned_node;
|
||||
};
|
||||
|
||||
/* list of "ctdb ipreallocate" processes to call back when we have
|
||||
finished the takeover run.
|
||||
*/
|
||||
@ -44,6 +39,11 @@ struct ip_reallocate_list {
|
||||
struct rd_memdump_reply *rd;
|
||||
};
|
||||
|
||||
/*
  per-node bookkeeping the recovery daemon uses to decide when a node
  should be banned
 */
struct ctdb_banning_state {
	/* culprit reports accumulated against the node; reset to 0 once the
	   node has been banned or after recovery_grace_period without reports */
	uint32_t count;
	/* when the node was last reported as a culprit */
	struct timeval last_reported_time;
};
|
||||
|
||||
/*
|
||||
private state of recovery daemon
|
||||
*/
|
||||
@ -52,11 +52,8 @@ struct ctdb_recoverd {
|
||||
uint32_t recmaster;
|
||||
uint32_t num_active;
|
||||
uint32_t num_connected;
|
||||
uint32_t last_culprit_node;
|
||||
struct ctdb_node_map *nodemap;
|
||||
uint32_t last_culprit;
|
||||
uint32_t culprit_counter;
|
||||
struct timeval first_recover_time;
|
||||
struct ban_state **banned_nodes;
|
||||
struct timeval priority_time;
|
||||
bool need_takeover_run;
|
||||
bool need_recovery;
|
||||
@ -72,76 +69,15 @@ struct ctdb_recoverd {
|
||||
#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
|
||||
|
||||
|
||||
/*
|
||||
unban a node
|
||||
*/
|
||||
static void ctdb_unban_node(struct ctdb_recoverd *rec, uint32_t pnn)
|
||||
{
|
||||
struct ctdb_context *ctdb = rec->ctdb;
|
||||
|
||||
DEBUG(DEBUG_NOTICE,("Unbanning node %u\n", pnn));
|
||||
|
||||
if (!ctdb_validate_pnn(ctdb, pnn)) {
|
||||
DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_unban_node\n", pnn));
|
||||
return;
|
||||
}
|
||||
|
||||
/* If we are unbanning a different node then just pass the ban info on */
|
||||
if (pnn != ctdb->pnn) {
|
||||
TDB_DATA data;
|
||||
int ret;
|
||||
|
||||
DEBUG(DEBUG_NOTICE,("Unanning remote node %u. Passing the ban request on to the remote node.\n", pnn));
|
||||
|
||||
data.dptr = (uint8_t *)&pnn;
|
||||
data.dsize = sizeof(uint32_t);
|
||||
|
||||
ret = ctdb_send_message(ctdb, pnn, CTDB_SRVID_UNBAN_NODE, data);
|
||||
if (ret != 0) {
|
||||
DEBUG(DEBUG_ERR,("Failed to unban node %u\n", pnn));
|
||||
return;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/* make sure we remember we are no longer banned in case
|
||||
there is an election */
|
||||
rec->node_flags &= ~NODE_FLAGS_BANNED;
|
||||
|
||||
DEBUG(DEBUG_INFO,("Clearing ban flag on node %u\n", pnn));
|
||||
ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, 0, NODE_FLAGS_BANNED);
|
||||
|
||||
if (rec->banned_nodes[pnn] == NULL) {
|
||||
DEBUG(DEBUG_INFO,("No ban recorded for this node. ctdb_unban_node() request ignored\n"));
|
||||
return;
|
||||
}
|
||||
|
||||
talloc_free(rec->banned_nodes[pnn]);
|
||||
rec->banned_nodes[pnn] = NULL;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
called when a ban has timed out
|
||||
*/
|
||||
static void ctdb_ban_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
|
||||
{
|
||||
struct ban_state *state = talloc_get_type(p, struct ban_state);
|
||||
struct ctdb_recoverd *rec = state->rec;
|
||||
uint32_t pnn = state->banned_node;
|
||||
|
||||
DEBUG(DEBUG_NOTICE,("Ban timeout. Node %u is now unbanned\n", pnn));
|
||||
ctdb_unban_node(rec, pnn);
|
||||
}
|
||||
|
||||
/*
|
||||
ban a node for a period of time
|
||||
*/
|
||||
static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
|
||||
{
|
||||
int ret;
|
||||
struct ctdb_context *ctdb = rec->ctdb;
|
||||
|
||||
struct ctdb_ban_time bantime;
|
||||
|
||||
DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
|
||||
|
||||
if (!ctdb_validate_pnn(ctdb, pnn)) {
|
||||
@ -149,61 +85,15 @@ static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_
|
||||
return;
|
||||
}
|
||||
|
||||
if (0 == ctdb->tunable.enable_bans) {
|
||||
DEBUG(DEBUG_INFO,("Bans are disabled - ignoring ban of node %u\n", pnn));
|
||||
bantime.pnn = pnn;
|
||||
bantime.time = ban_time;
|
||||
|
||||
ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
|
||||
if (ret != 0) {
|
||||
DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
|
||||
return;
|
||||
}
|
||||
|
||||
/* If we are banning a different node then just pass the ban info on */
|
||||
if (pnn != ctdb->pnn) {
|
||||
struct ctdb_ban_info b;
|
||||
TDB_DATA data;
|
||||
int ret;
|
||||
|
||||
DEBUG(DEBUG_NOTICE,("Banning remote node %u for %u seconds. Passing the ban request on to the remote node.\n", pnn, ban_time));
|
||||
|
||||
b.pnn = pnn;
|
||||
b.ban_time = ban_time;
|
||||
|
||||
data.dptr = (uint8_t *)&b;
|
||||
data.dsize = sizeof(b);
|
||||
|
||||
ret = ctdb_send_message(ctdb, pnn, CTDB_SRVID_BAN_NODE, data);
|
||||
if (ret != 0) {
|
||||
DEBUG(DEBUG_ERR,("Failed to ban node %u\n", pnn));
|
||||
return;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
DEBUG(DEBUG_NOTICE,("self ban - lowering our election priority\n"));
|
||||
ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, NODE_FLAGS_BANNED, 0);
|
||||
|
||||
/* banning ourselves - lower our election priority */
|
||||
rec->priority_time = timeval_current();
|
||||
|
||||
/* make sure we remember we are banned in case there is an
|
||||
election */
|
||||
rec->node_flags |= NODE_FLAGS_BANNED;
|
||||
|
||||
if (rec->banned_nodes[pnn] != NULL) {
|
||||
DEBUG(DEBUG_NOTICE,("Re-banning an already banned node. Remove previous ban and set a new ban.\n"));
|
||||
talloc_free(rec->banned_nodes[pnn]);
|
||||
rec->banned_nodes[pnn] = NULL;
|
||||
}
|
||||
|
||||
rec->banned_nodes[pnn] = talloc(rec->banned_nodes, struct ban_state);
|
||||
CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes[pnn]);
|
||||
|
||||
rec->banned_nodes[pnn]->rec = rec;
|
||||
rec->banned_nodes[pnn]->banned_node = pnn;
|
||||
|
||||
if (ban_time != 0) {
|
||||
event_add_timed(ctdb->ev, rec->banned_nodes[pnn],
|
||||
timeval_current_ofs(ban_time, 0),
|
||||
ctdb_ban_timeout, rec->banned_nodes[pnn]);
|
||||
}
|
||||
}
|
||||
|
||||
enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
|
||||
@ -239,39 +129,44 @@ static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node
|
||||
/*
|
||||
remember the trouble maker
|
||||
*/
|
||||
static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
|
||||
static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
|
||||
{
|
||||
struct ctdb_context *ctdb = rec->ctdb;
|
||||
struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
|
||||
struct ctdb_banning_state *ban_state;
|
||||
|
||||
if (rec->last_culprit != culprit ||
|
||||
timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) {
|
||||
DEBUG(DEBUG_NOTICE,("New recovery culprit %u\n", culprit));
|
||||
/* either a new node is the culprit, or we've decided to forgive them */
|
||||
rec->last_culprit = culprit;
|
||||
rec->first_recover_time = timeval_current();
|
||||
rec->culprit_counter = 0;
|
||||
if (culprit > ctdb->num_nodes) {
|
||||
DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
|
||||
return;
|
||||
}
|
||||
rec->culprit_counter++;
|
||||
|
||||
if (ctdb->nodes[culprit]->ban_state == NULL) {
|
||||
ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
|
||||
CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
|
||||
|
||||
|
||||
}
|
||||
ban_state = ctdb->nodes[culprit]->ban_state;
|
||||
if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
|
||||
/* this was the first time in a long while this node
|
||||
misbehaved so we will forgive any old transgressions.
|
||||
*/
|
||||
ban_state->count = 0;
|
||||
}
|
||||
|
||||
ban_state->count += count;
|
||||
ban_state->last_reported_time = timeval_current();
|
||||
rec->last_culprit_node = culprit;
|
||||
}
|
||||
|
||||
/*
|
||||
remember the trouble maker
|
||||
*/
|
||||
static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
|
||||
static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
|
||||
{
|
||||
struct ctdb_context *ctdb = rec->ctdb;
|
||||
|
||||
if (rec->last_culprit != culprit ||
|
||||
timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) {
|
||||
DEBUG(DEBUG_NOTICE,("New recovery culprit %u\n", culprit));
|
||||
/* either a new node is the culprit, or we've decided to forgive them */
|
||||
rec->last_culprit = culprit;
|
||||
rec->first_recover_time = timeval_current();
|
||||
rec->culprit_counter = 0;
|
||||
}
|
||||
rec->culprit_counter += count;
|
||||
ctdb_set_culprit_count(rec, culprit, 1);
|
||||
}
|
||||
|
||||
|
||||
/* this callback is called for every node that failed to execute the
|
||||
start recovery event
|
||||
*/
|
||||
@ -708,62 +603,6 @@ static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_nod
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
handler for when the admin bans a node
|
||||
*/
|
||||
static void ban_handler(struct ctdb_context *ctdb, uint64_t srvid,
|
||||
TDB_DATA data, void *private_data)
|
||||
{
|
||||
struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
|
||||
struct ctdb_ban_info *b = (struct ctdb_ban_info *)data.dptr;
|
||||
TALLOC_CTX *mem_ctx = talloc_new(ctdb);
|
||||
|
||||
if (data.dsize != sizeof(*b)) {
|
||||
DEBUG(DEBUG_ERR,("Bad data in ban_handler\n"));
|
||||
talloc_free(mem_ctx);
|
||||
return;
|
||||
}
|
||||
|
||||
if (b->pnn != ctdb->pnn) {
|
||||
DEBUG(DEBUG_ERR,("Got a ban request for pnn:%u but our pnn is %u. Ignoring ban request\n", b->pnn, ctdb->pnn));
|
||||
return;
|
||||
}
|
||||
|
||||
DEBUG(DEBUG_NOTICE,("Node %u has been banned for %u seconds\n",
|
||||
b->pnn, b->ban_time));
|
||||
|
||||
ctdb_ban_node(rec, b->pnn, b->ban_time);
|
||||
talloc_free(mem_ctx);
|
||||
}
|
||||
|
||||
/*
|
||||
handler for when the admin unbans a node
|
||||
*/
|
||||
static void unban_handler(struct ctdb_context *ctdb, uint64_t srvid,
|
||||
TDB_DATA data, void *private_data)
|
||||
{
|
||||
struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
|
||||
TALLOC_CTX *mem_ctx = talloc_new(ctdb);
|
||||
uint32_t pnn;
|
||||
|
||||
if (data.dsize != sizeof(uint32_t)) {
|
||||
DEBUG(DEBUG_ERR,("Bad data in unban_handler\n"));
|
||||
talloc_free(mem_ctx);
|
||||
return;
|
||||
}
|
||||
pnn = *(uint32_t *)data.dptr;
|
||||
|
||||
if (pnn != ctdb->pnn) {
|
||||
DEBUG(DEBUG_ERR,("Got an unban request for pnn:%u but our pnn is %u. Ignoring unban request\n", pnn, ctdb->pnn));
|
||||
return;
|
||||
}
|
||||
|
||||
DEBUG(DEBUG_NOTICE,("Node %u has been unbanned.\n", pnn));
|
||||
ctdb_unban_node(rec, pnn);
|
||||
talloc_free(mem_ctx);
|
||||
}
|
||||
|
||||
|
||||
struct vacuum_info {
|
||||
struct vacuum_info *next, *prev;
|
||||
struct ctdb_recoverd *rec;
|
||||
@ -1331,8 +1170,7 @@ static void reload_nodes_file(struct ctdb_context *ctdb)
|
||||
*/
|
||||
static int do_recovery(struct ctdb_recoverd *rec,
|
||||
TALLOC_CTX *mem_ctx, uint32_t pnn,
|
||||
struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap,
|
||||
int32_t culprit)
|
||||
struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
|
||||
{
|
||||
struct ctdb_context *ctdb = rec->ctdb;
|
||||
int i, j, ret;
|
||||
@ -1347,15 +1185,21 @@ static int do_recovery(struct ctdb_recoverd *rec,
|
||||
/* if recovery fails, force it again */
|
||||
rec->need_recovery = true;
|
||||
|
||||
if (culprit != -1) {
|
||||
ctdb_set_culprit(rec, culprit);
|
||||
}
|
||||
for (i=0; i<ctdb->num_nodes; i++) {
|
||||
struct ctdb_banning_state *ban_state;
|
||||
|
||||
if (rec->culprit_counter > 2*nodemap->num) {
|
||||
DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries in %.0f seconds - banning it for %u seconds\n",
|
||||
rec->last_culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time),
|
||||
ctdb->tunable.recovery_ban_period));
|
||||
ctdb_ban_node(rec, rec->last_culprit, ctdb->tunable.recovery_ban_period);
|
||||
if (ctdb->nodes[i]->ban_state == NULL) {
|
||||
continue;
|
||||
}
|
||||
ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
|
||||
if (ban_state->count < 2*ctdb->num_nodes) {
|
||||
continue;
|
||||
}
|
||||
DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
|
||||
ctdb->nodes[i]->pnn, ban_state->count,
|
||||
ctdb->tunable.recovery_ban_period));
|
||||
ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
|
||||
ban_state->count = 0;
|
||||
}
|
||||
|
||||
|
||||
@ -1371,7 +1215,7 @@ static int do_recovery(struct ctdb_recoverd *rec,
|
||||
DEBUG(DEBUG_ERR,("Recovery lock taken successfully by recovery daemon\n"));
|
||||
}
|
||||
|
||||
DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", culprit));
|
||||
DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
|
||||
|
||||
/* get a list of all databases */
|
||||
ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
|
||||
@ -1953,12 +1797,6 @@ static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
|
||||
return;
|
||||
}
|
||||
|
||||
/* release any bans */
|
||||
rec->last_culprit = (uint32_t)-1;
|
||||
talloc_free(rec->banned_nodes);
|
||||
rec->banned_nodes = talloc_zero_array(rec, struct ban_state *, ctdb->num_nodes);
|
||||
CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes);
|
||||
|
||||
talloc_free(mem_ctx);
|
||||
return;
|
||||
}
|
||||
@ -2666,8 +2504,6 @@ static void monitor_cluster(struct ctdb_context *ctdb)
|
||||
CTDB_NO_MEMORY_FATAL(ctdb, rec);
|
||||
|
||||
rec->ctdb = ctdb;
|
||||
rec->banned_nodes = talloc_zero_array(rec, struct ban_state *, ctdb->num_nodes);
|
||||
CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes);
|
||||
|
||||
rec->priority_time = timeval_current();
|
||||
|
||||
@ -2683,12 +2519,6 @@ static void monitor_cluster(struct ctdb_context *ctdb)
|
||||
/* when we are asked to push out a flag change */
|
||||
ctdb_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
|
||||
|
||||
/* when nodes are banned */
|
||||
ctdb_set_message_handler(ctdb, CTDB_SRVID_BAN_NODE, ban_handler, rec);
|
||||
|
||||
/* and one for when nodes are unbanned */
|
||||
ctdb_set_message_handler(ctdb, CTDB_SRVID_UNBAN_NODE, unban_handler, rec);
|
||||
|
||||
/* register a message port for vacuum fetch */
|
||||
ctdb_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
|
||||
|
||||
@ -2739,11 +2569,21 @@ again:
|
||||
as early as possible so we dont wait until we have pulled the node
|
||||
map from the local node. thats why we have the hardcoded value 20
|
||||
*/
|
||||
if (rec->culprit_counter > 20) {
|
||||
DEBUG(DEBUG_NOTICE,("Node %u has caused %u failures in %.0f seconds - banning it for %u seconds\n",
|
||||
rec->last_culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time),
|
||||
ctdb->tunable.recovery_ban_period));
|
||||
ctdb_ban_node(rec, rec->last_culprit, ctdb->tunable.recovery_ban_period);
|
||||
for (i=0; i<ctdb->num_nodes; i++) {
|
||||
struct ctdb_banning_state *ban_state;
|
||||
|
||||
if (ctdb->nodes[i]->ban_state == NULL) {
|
||||
continue;
|
||||
}
|
||||
ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
|
||||
if (ban_state->count < 20) {
|
||||
continue;
|
||||
}
|
||||
DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
|
||||
ctdb->nodes[i]->pnn, ban_state->count,
|
||||
ctdb->tunable.recovery_ban_period));
|
||||
ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
|
||||
ban_state->count = 0;
|
||||
}
|
||||
|
||||
/* get relevant tunables */
|
||||
@ -2860,34 +2700,7 @@ again:
|
||||
/* check that we (recovery daemon) and the local ctdb daemon
|
||||
agrees on whether we are banned or not
|
||||
*/
|
||||
if (nodemap->nodes[pnn].flags & NODE_FLAGS_BANNED) {
|
||||
if (rec->banned_nodes[pnn] == NULL) {
|
||||
if (rec->recmaster == pnn) {
|
||||
DEBUG(DEBUG_NOTICE,("Local ctdb daemon on recmaster thinks this node is BANNED but the recovery master disagrees. Unbanning the node\n"));
|
||||
|
||||
ctdb_unban_node(rec, pnn);
|
||||
} else {
|
||||
DEBUG(DEBUG_NOTICE,("Local ctdb daemon on non-recmaster thinks this node is BANNED but the recovery master disagrees. Re-banning the node\n"));
|
||||
ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
|
||||
ctdb_set_culprit(rec, pnn);
|
||||
}
|
||||
goto again;
|
||||
}
|
||||
} else {
|
||||
if (rec->banned_nodes[pnn] != NULL) {
|
||||
if (rec->recmaster == pnn) {
|
||||
DEBUG(DEBUG_NOTICE,("Local ctdb daemon on recmaster does not think this node is BANNED but the recovery master disagrees. Unbanning the node\n"));
|
||||
|
||||
ctdb_unban_node(rec, pnn);
|
||||
} else {
|
||||
DEBUG(DEBUG_NOTICE,("Local ctdb daemon on non-recmaster does not think this node is BANNED but the recovery master disagrees. Re-banning the node\n"));
|
||||
|
||||
ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
|
||||
ctdb_set_culprit(rec, pnn);
|
||||
}
|
||||
goto again;
|
||||
}
|
||||
}
|
||||
//qqq
|
||||
|
||||
/* remember our own node flags */
|
||||
rec->node_flags = nodemap->nodes[pnn].flags;
|
||||
@ -3021,16 +2834,17 @@ again:
|
||||
|
||||
if (rec->need_recovery) {
|
||||
/* a previous recovery didn't finish */
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, -1);
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
|
||||
goto again;
|
||||
}
|
||||
|
||||
/* verify that all active nodes are in normal mode
|
||||
and not in recovery mode
|
||||
*/
|
||||
*/
|
||||
switch (verify_recmode(ctdb, nodemap)) {
|
||||
case MONITOR_RECOVERY_NEEDED:
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, ctdb->pnn);
|
||||
ctdb_set_culprit(rec, ctdb->pnn);
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
|
||||
goto again;
|
||||
case MONITOR_FAILED:
|
||||
goto again;
|
||||
@ -3046,7 +2860,8 @@ again:
|
||||
ret = check_recovery_lock(ctdb);
|
||||
if (ret != 0) {
|
||||
DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, ctdb->pnn);
|
||||
ctdb_set_culprit(rec, ctdb->pnn);
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
|
||||
goto again;
|
||||
}
|
||||
}
|
||||
@ -3086,7 +2901,8 @@ again:
|
||||
if (remote_nodemaps[j]->num != nodemap->num) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
|
||||
nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, nodemap->nodes[j].pnn);
|
||||
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
|
||||
goto again;
|
||||
}
|
||||
|
||||
@ -3098,8 +2914,9 @@ again:
|
||||
DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
|
||||
nodemap->nodes[j].pnn, i,
|
||||
remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
|
||||
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap,
|
||||
vnnmap, nodemap->nodes[j].pnn);
|
||||
vnnmap);
|
||||
goto again;
|
||||
}
|
||||
}
|
||||
@ -3120,14 +2937,16 @@ again:
|
||||
if (i == j) {
|
||||
DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
|
||||
update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
|
||||
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap,
|
||||
vnnmap, nodemap->nodes[j].pnn);
|
||||
vnnmap);
|
||||
goto again;
|
||||
} else {
|
||||
DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
|
||||
update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
|
||||
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap,
|
||||
vnnmap, nodemap->nodes[j].pnn);
|
||||
vnnmap);
|
||||
goto again;
|
||||
}
|
||||
}
|
||||
@ -3141,7 +2960,8 @@ again:
|
||||
if (vnnmap->size != rec->num_active) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
|
||||
vnnmap->size, rec->num_active));
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, ctdb->pnn);
|
||||
ctdb_set_culprit(rec, ctdb->pnn);
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
|
||||
goto again;
|
||||
}
|
||||
|
||||
@ -3164,7 +2984,8 @@ again:
|
||||
if (i == vnnmap->size) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
|
||||
nodemap->nodes[j].pnn));
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, nodemap->nodes[j].pnn);
|
||||
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
|
||||
goto again;
|
||||
}
|
||||
}
|
||||
@ -3193,7 +3014,8 @@ again:
|
||||
if (vnnmap->generation != remote_vnnmap->generation) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
|
||||
nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, nodemap->nodes[j].pnn);
|
||||
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
|
||||
goto again;
|
||||
}
|
||||
|
||||
@ -3201,7 +3023,8 @@ again:
|
||||
if (vnnmap->size != remote_vnnmap->size) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
|
||||
nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, nodemap->nodes[j].pnn);
|
||||
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
|
||||
goto again;
|
||||
}
|
||||
|
||||
@ -3210,8 +3033,9 @@ again:
|
||||
if (remote_vnnmap->map[i] != vnnmap->map[i]) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
|
||||
nodemap->nodes[j].pnn));
|
||||
ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap,
|
||||
vnnmap, nodemap->nodes[j].pnn);
|
||||
vnnmap);
|
||||
goto again;
|
||||
}
|
||||
}
|
||||
@ -3225,15 +3049,15 @@ again:
|
||||
ret = run_startrecovery_eventscript(rec, nodemap);
|
||||
if (ret!=0) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap,
|
||||
vnnmap, ctdb->pnn);
|
||||
ctdb_set_culprit(rec, ctdb->pnn);
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
|
||||
}
|
||||
|
||||
ret = ctdb_takeover_run(ctdb, nodemap);
|
||||
if (ret != 0) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap,
|
||||
vnnmap, ctdb->pnn);
|
||||
ctdb_set_culprit(rec, ctdb->pnn);
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
|
||||
}
|
||||
|
||||
/* execute the "recovered" event script on all nodes */
|
||||
@ -3245,8 +3069,8 @@ again:
|
||||
// cascading recovery.
|
||||
if (ret!=0) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap,
|
||||
vnnmap, ctdb->pnn);
|
||||
ctdb_set_culprit(rec, ctdb->pnn);
|
||||
do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
@ -625,21 +625,16 @@ static void ctdb_event_script_handler(struct event_context *ev, struct fd_event
|
||||
|
||||
static void ctdb_ban_self(struct ctdb_context *ctdb, uint32_t ban_period)
|
||||
{
|
||||
int ret;
|
||||
struct ctdb_ban_info b;
|
||||
TDB_DATA data;
|
||||
struct ctdb_ban_time bantime;
|
||||
|
||||
b.pnn = ctdb->pnn;
|
||||
b.ban_time = ban_period;
|
||||
bantime.pnn = ctdb->pnn;
|
||||
bantime.time = ban_period;
|
||||
|
||||
data.dptr = (uint8_t *)&b;
|
||||
data.dsize = sizeof(b);
|
||||
data.dsize = sizeof(bantime);
|
||||
data.dptr = (uint8_t *)&bantime;
|
||||
|
||||
ret = ctdb_daemon_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
|
||||
CTDB_SRVID_BAN_NODE, data);
|
||||
if (ret != 0) {
|
||||
DEBUG(DEBUG_ERR,(__location__ " Failed to send ban message\n"));
|
||||
}
|
||||
ctdb_control_set_ban_state(ctdb, data);
|
||||
}
|
||||
|
||||
|
||||
|
@ -1678,6 +1678,17 @@ again:
|
||||
exit(10);
|
||||
}
|
||||
|
||||
|
||||
/* check that there are nodes available that can act as a recmaster */
|
||||
for (i=0; i<nodemap->num; i++) {
|
||||
if (nodemap->nodes[i].flags & (NODE_FLAGS_DELETED|NODE_FLAGS_BANNED|NODE_FLAGS_STOPPED)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (i == nodemap->num) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* verify the recovery master is not STOPPED, nor BANNED */
|
||||
if (nodemap->nodes[recmaster].flags & (NODE_FLAGS_DELETED|NODE_FLAGS_BANNED|NODE_FLAGS_STOPPED)) {
|
||||
DEBUG(DEBUG_ERR,("No suitable recmaster found. Try again\n"));
|
||||
@ -1898,20 +1909,13 @@ static uint32_t get_generation(struct ctdb_context *ctdb)
|
||||
static int control_ban(struct ctdb_context *ctdb, int argc, const char **argv)
|
||||
{
|
||||
int ret;
|
||||
struct ctdb_ban_info b;
|
||||
TDB_DATA data;
|
||||
uint32_t ban_time;
|
||||
struct ctdb_node_map *nodemap=NULL;
|
||||
uint32_t generation, next_generation;
|
||||
struct ctdb_ban_time bantime;
|
||||
|
||||
if (argc < 1) {
|
||||
usage();
|
||||
}
|
||||
|
||||
/* record the current generation number */
|
||||
generation = get_generation(ctdb);
|
||||
|
||||
|
||||
/* verify the node exists */
|
||||
ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &nodemap);
|
||||
if (ret != 0) {
|
||||
@ -1924,27 +1928,19 @@ static int control_ban(struct ctdb_context *ctdb, int argc, const char **argv)
|
||||
return -1;
|
||||
}
|
||||
|
||||
ban_time = strtoul(argv[0], NULL, 0);
|
||||
bantime.pnn = options.pnn;
|
||||
bantime.time = strtoul(argv[0], NULL, 0);
|
||||
|
||||
b.pnn = options.pnn;
|
||||
b.ban_time = ban_time;
|
||||
|
||||
data.dptr = (uint8_t *)&b;
|
||||
data.dsize = sizeof(b);
|
||||
|
||||
ret = ctdb_send_message(ctdb, options.pnn, CTDB_SRVID_BAN_NODE, data);
|
||||
ret = ctdb_ctrl_set_ban(ctdb, TIMELIMIT(), options.pnn, &bantime);
|
||||
if (ret != 0) {
|
||||
DEBUG(DEBUG_ERR,("Failed to ban node %u\n", options.pnn));
|
||||
DEBUG(DEBUG_ERR,("Banning node %d for %d seconds failed.\n", bantime.pnn, bantime.time));
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
/* wait until we are in a new generation */
|
||||
while (1) {
|
||||
next_generation = get_generation(ctdb);
|
||||
if (next_generation != generation) {
|
||||
return 0;
|
||||
}
|
||||
sleep(1);
|
||||
ret = control_ipreallocate(ctdb, argc, argv);
|
||||
if (ret != 0) {
|
||||
DEBUG(DEBUG_ERR, ("IP Reallocate failed on node %u\n", options.pnn));
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -1957,16 +1953,10 @@ static int control_ban(struct ctdb_context *ctdb, int argc, const char **argv)
|
||||
static int control_unban(struct ctdb_context *ctdb, int argc, const char **argv)
|
||||
{
|
||||
int ret;
|
||||
TDB_DATA data;
|
||||
uint32_t generation, next_generation;
|
||||
struct ctdb_node_map *nodemap=NULL;
|
||||
struct ctdb_ban_time bantime;
|
||||
|
||||
/* record the current generation number */
|
||||
generation = get_generation(ctdb);
|
||||
|
||||
data.dptr = (uint8_t *)&options.pnn;
|
||||
data.dsize = sizeof(uint32_t);
|
||||
|
||||
/* verify the node exists */
|
||||
ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &nodemap);
|
||||
if (ret != 0) {
|
||||
DEBUG(DEBUG_ERR, ("Unable to get nodemap from local node\n"));
|
||||
@ -1974,29 +1964,60 @@ static int control_unban(struct ctdb_context *ctdb, int argc, const char **argv)
|
||||
}
|
||||
|
||||
if (!(nodemap->nodes[options.pnn].flags & NODE_FLAGS_BANNED)) {
|
||||
DEBUG(DEBUG_ERR, ("Node %d is not banned. Can not unban\n", options.pnn));
|
||||
DEBUG(DEBUG_ERR,("Node %u is not banned.\n", options.pnn));
|
||||
return -1;
|
||||
}
|
||||
|
||||
ret = ctdb_send_message(ctdb, options.pnn, CTDB_SRVID_UNBAN_NODE, data);
|
||||
bantime.pnn = options.pnn;
|
||||
bantime.time = 0;
|
||||
|
||||
ret = ctdb_ctrl_set_ban(ctdb, TIMELIMIT(), options.pnn, &bantime);
|
||||
if (ret != 0) {
|
||||
DEBUG(DEBUG_ERR,("Failed to to unban node %u\n", options.pnn));
|
||||
DEBUG(DEBUG_ERR,("Unbanning node %d failed.\n", bantime.pnn));
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* wait until we are in a new generation */
|
||||
while (1) {
|
||||
next_generation = get_generation(ctdb);
|
||||
if (next_generation != generation) {
|
||||
return 0;
|
||||
}
|
||||
sleep(1);
|
||||
}
|
||||
|
||||
ret = control_ipreallocate(ctdb, argc, argv);
|
||||
if (ret != 0) {
|
||||
DEBUG(DEBUG_ERR, ("IP Reallocate failed on node %u\n", options.pnn));
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
show ban information for a node
|
||||
*/
|
||||
static int control_showban(struct ctdb_context *ctdb, int argc, const char **argv)
|
||||
{
|
||||
int ret;
|
||||
struct ctdb_node_map *nodemap=NULL;
|
||||
struct ctdb_ban_time *bantime;
|
||||
|
||||
/* verify the node exists */
|
||||
ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &nodemap);
|
||||
if (ret != 0) {
|
||||
DEBUG(DEBUG_ERR, ("Unable to get nodemap from local node\n"));
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = ctdb_ctrl_get_ban(ctdb, TIMELIMIT(), options.pnn, ctdb, &bantime);
|
||||
if (ret != 0) {
|
||||
DEBUG(DEBUG_ERR,("Showing ban info for node %d failed.\n", options.pnn));
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (bantime->time == 0) {
|
||||
printf("Node %u is not banned\n", bantime->pnn);
|
||||
} else {
|
||||
printf("Node %u is banned banned for %d seconds\n", bantime->pnn, bantime->time);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
shutdown a daemon
|
||||
*/
|
||||
@ -3360,7 +3381,8 @@ static const struct {
|
||||
{ "stop", control_stop, true, false, "stop a node" },
|
||||
{ "continue", control_continue, true, false, "re-start a stopped node" },
|
||||
{ "ban", control_ban, true, false, "ban a node from the cluster", "<bantime|0>"},
|
||||
{ "unban", control_unban, true, false, "unban a node from the cluster" },
|
||||
{ "unban", control_unban, true, false, "unban a node" },
|
||||
{ "showban", control_showban, true, false, "show ban information"},
|
||||
{ "shutdown", control_shutdown, true, false, "shutdown ctdbd" },
|
||||
{ "recover", control_recover, true, false, "force recovery" },
|
||||
{ "ipreallocate", control_ipreallocate, true, false, "force the recovery daemon to perform a ip reallocation procedure" },
|
||||
|
Loading…
x
Reference in New Issue
Block a user