mirror of
https://github.com/samba-team/samba.git
synced 2024-12-22 13:34:15 +03:00
Expand the client async framework so that it can take a callback function.
This allows us to use the async framework also for controls that return outdata. Add a "capabilities" field to the ctdb_node structure. This field is only initialized and kept valid inside the recovery daemon context, not inside the main ctdb daemon. Change the GET_CAPABILITIES control to return the capabilities in outdata instead of in the res return variable. When performing a recovery inside the recovery daemon, read the capabilities from all connected nodes and update the ctdb->nodes list of nodes. When building the new vnnmap after the database rebuild in recovery, do not include any nodes which lack the LMASTER capability. If no connected node has the LMASTER capability, let the local node (recmaster) take on the lmaster role temporarily (i.e. become a member of the vnnmap list). (This used to be ctdb commit 0f1883c69c689b28b0c04148774840b2c4081df6)
This commit is contained in:
parent
2c23959616
commit
92b61cd7d5
@ -2671,8 +2671,11 @@ int ctdb_ctrl_end_recovery(struct ctdb_context *ctdb, struct timeval timeout, ui
|
||||
static void async_callback(struct ctdb_client_control_state *state)
|
||||
{
|
||||
struct client_async_data *data = talloc_get_type(state->async.private_data, struct client_async_data);
|
||||
struct ctdb_context *ctdb = talloc_get_type(state->ctdb, struct ctdb_context);
|
||||
int ret;
|
||||
TDB_DATA outdata;
|
||||
int32_t res;
|
||||
uint32_t destnode = state->c->hdr.destnode;
|
||||
|
||||
/* one more node has responded with recmode data */
|
||||
data->count--;
|
||||
@ -2690,13 +2693,16 @@ static void async_callback(struct ctdb_client_control_state *state)
|
||||
|
||||
state->async.fn = NULL;
|
||||
|
||||
ret = ctdb_control_recv(state->ctdb, state, data, NULL, &res, NULL);
|
||||
ret = ctdb_control_recv(ctdb, state, data, &outdata, &res, NULL);
|
||||
if ((ret != 0) || (res != 0)) {
|
||||
if ( !data->dont_log_errors) {
|
||||
DEBUG(DEBUG_ERR,("Async operation failed with ret=%d res=%d\n", ret, (int)res));
|
||||
}
|
||||
data->fail_count++;
|
||||
}
|
||||
if ((ret == 0) && (data->callback != NULL)) {
|
||||
data->callback(ctdb, destnode, res, outdata);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -2739,15 +2745,17 @@ int ctdb_client_async_control(struct ctdb_context *ctdb,
|
||||
uint32_t *nodes,
|
||||
struct timeval timeout,
|
||||
bool dont_log_errors,
|
||||
TDB_DATA data)
|
||||
TDB_DATA data,
|
||||
client_async_callback client_callback)
|
||||
{
|
||||
struct client_async_data *async_data;
|
||||
struct ctdb_client_control_state *state;
|
||||
int j, num_nodes;
|
||||
|
||||
|
||||
async_data = talloc_zero(ctdb, struct client_async_data);
|
||||
CTDB_NO_MEMORY_FATAL(ctdb, async_data);
|
||||
async_data->dont_log_errors = dont_log_errors;
|
||||
async_data->callback = client_callback;
|
||||
|
||||
num_nodes = talloc_get_size(nodes) / sizeof(uint32_t);
|
||||
|
||||
@ -2886,15 +2894,16 @@ int ctdb_ctrl_getcapabilities_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ct
|
||||
{
|
||||
int ret;
|
||||
int32_t res;
|
||||
TDB_DATA outdata;
|
||||
|
||||
ret = ctdb_control_recv(ctdb, state, mem_ctx, NULL, &res, NULL);
|
||||
if (ret != 0) {
|
||||
ret = ctdb_control_recv(ctdb, state, mem_ctx, &outdata, &res, NULL);
|
||||
if ( (ret != 0) || (res != 0) ) {
|
||||
DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_getcapabilities_recv failed\n"));
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (capabilities) {
|
||||
*capabilities = (uint32_t)res;
|
||||
*capabilities = *((uint32_t *)outdata.dptr);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -199,6 +199,11 @@ struct ctdb_node {
|
||||
uint32_t rx_cnt;
|
||||
uint32_t tx_cnt;
|
||||
|
||||
/* used to track node capabilities, is only valid/tracked inside the
|
||||
recovery daemon.
|
||||
*/
|
||||
uint32_t capabilities;
|
||||
|
||||
/* a list of controls pending to this node, so we can time them out quickly
|
||||
if the node becomes disconnected */
|
||||
struct daemon_control_state *pending_controls;
|
||||
@ -1276,10 +1281,13 @@ int32_t ctdb_monitoring_mode(struct ctdb_context *ctdb);
|
||||
int ctdb_set_child_logging(struct ctdb_context *ctdb);
|
||||
|
||||
|
||||
/* Per-node completion hook for the client async framework.
   async_callback() invokes it with the context, the destination node of
   the control, the control's result code and the returned outdata. */
typedef void (*client_async_callback)(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata);
|
||||
|
||||
/* Bookkeeping shared by a batch of async controls. */
struct client_async_data {
|
||||
/* when set, async_callback() does not log failed controls */
bool dont_log_errors;
|
||||
/* decremented by async_callback() each time a node has responded */
uint32_t count;
|
||||
/* incremented by async_callback() when a control fails (ret != 0 or res != 0) */
uint32_t fail_count;
|
||||
/* optional; called by async_callback() when the reply was received
   successfully (ret == 0) and this pointer is not NULL */
client_async_callback callback;
|
||||
};
|
||||
void ctdb_client_async_add(struct client_async_data *data, struct ctdb_client_control_state *state);
|
||||
int ctdb_client_async_wait(struct ctdb_context *ctdb, struct client_async_data *data);
|
||||
@ -1288,12 +1296,14 @@ int ctdb_client_async_control(struct ctdb_context *ctdb,
|
||||
uint32_t *nodes,
|
||||
struct timeval timeout,
|
||||
bool dont_log_errors,
|
||||
TDB_DATA data);
|
||||
TDB_DATA data,
|
||||
client_async_callback client_callback);
|
||||
|
||||
void ctdb_load_nodes_file(struct ctdb_context *ctdb);
|
||||
|
||||
int ctdb_control_reload_nodes_file(struct ctdb_context *ctdb, uint32_t opcode);
|
||||
|
||||
int32_t ctdb_dump_memory(struct ctdb_context *ctdb, TDB_DATA *outdata);
|
||||
int32_t ctdb_control_get_capabilities(struct ctdb_context *ctdb, TDB_DATA *outdata);
|
||||
|
||||
#endif
|
||||
|
@ -390,7 +390,7 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb,
|
||||
return ctdb_control_del_public_address(ctdb, indata);
|
||||
|
||||
case CTDB_CONTROL_GET_CAPABILITIES:
|
||||
return ctdb->capabilities;
|
||||
return ctdb_control_get_capabilities(ctdb, outdata);
|
||||
|
||||
default:
|
||||
DEBUG(DEBUG_CRIT,(__location__ " Unknown CTDB control opcode %u\n", opcode));
|
||||
|
@ -957,3 +957,21 @@ int32_t ctdb_control_try_delete_records(struct ctdb_context *ctdb, TDB_DATA inda
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
report capabilities
|
||||
*/
|
||||
int32_t ctdb_control_get_capabilities(struct ctdb_context *ctdb, TDB_DATA *outdata)
|
||||
{
|
||||
uint32_t *capabilities = NULL;
|
||||
|
||||
capabilities = talloc(outdata, uint32_t);
|
||||
CTDB_NO_MEMORY(ctdb, capabilities);
|
||||
*capabilities = ctdb->capabilities;
|
||||
|
||||
outdata->dsize = sizeof(uint32_t);
|
||||
outdata->dptr = (uint8_t *)capabilities;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -212,7 +212,7 @@ static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node
|
||||
|
||||
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
|
||||
list_of_active_nodes(ctdb, nodemap, tmp_ctx, true),
|
||||
CONTROL_TIMEOUT(), false, tdb_null) != 0) {
|
||||
CONTROL_TIMEOUT(), false, tdb_null, NULL) != 0) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event. Recovery failed.\n"));
|
||||
talloc_free(tmp_ctx);
|
||||
return -1;
|
||||
@ -234,7 +234,7 @@ static int run_startrecovery_eventscript(struct ctdb_context *ctdb, struct ctdb_
|
||||
|
||||
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
|
||||
list_of_active_nodes(ctdb, nodemap, tmp_ctx, true),
|
||||
CONTROL_TIMEOUT(), false, tdb_null) != 0) {
|
||||
CONTROL_TIMEOUT(), false, tdb_null, NULL) != 0) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
|
||||
talloc_free(tmp_ctx);
|
||||
return -1;
|
||||
@ -244,6 +244,40 @@ static int run_startrecovery_eventscript(struct ctdb_context *ctdb, struct ctdb_
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
  Per-node completion callback for the GET_CAPABILITIES async control:
  record the capabilities word returned by a node in ctdb->nodes[].

  ctdb     - context whose nodes[] array is updated
  node_pnn - node the reply belongs to; used as index into ctdb->nodes[]
  res      - result code of the control (not used here)
  outdata  - reply payload; must be exactly one uint32_t, otherwise the
             reply is logged and ignored
 */
static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata)
{
	if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
		/* cast the arguments so they really match the %d and %p
		   conversions (dsize is wider than int on LP64) */
		DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %d %p\n", (int)outdata.dsize, (void *)outdata.dptr));
		return;
	}

	/* Never index past the end of ctdb->nodes[].
	   NOTE(review): relies on ctdb->num_nodes being the element count of
	   ctdb->nodes[]; the field is declared outside this excerpt - confirm. */
	if (node_pnn >= ctdb->num_nodes) {
		DEBUG(DEBUG_ERR, (__location__ " Ignoring capabilities from unknown node %u\n", (unsigned)node_pnn));
		return;
	}

	ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
}
|
||||
|
||||
/*
|
||||
update the node capabilities for all connected nodes
|
||||
*/
|
||||
static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
|
||||
{
|
||||
uint32_t *nodes;
|
||||
TALLOC_CTX *tmp_ctx;
|
||||
|
||||
tmp_ctx = talloc_new(ctdb);
|
||||
CTDB_NO_MEMORY(ctdb, tmp_ctx);
|
||||
|
||||
nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
|
||||
|
||||
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
|
||||
nodes, CONTROL_TIMEOUT(),
|
||||
false, tdb_null, async_getcap_callback) != 0) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
|
||||
talloc_free(tmp_ctx);
|
||||
return -1;
|
||||
}
|
||||
|
||||
talloc_free(tmp_ctx);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
change recovery mode on all nodes
|
||||
*/
|
||||
@ -262,7 +296,7 @@ static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_node_map *no
|
||||
if (rec_mode == CTDB_RECOVERY_ACTIVE) {
|
||||
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
|
||||
nodes, CONTROL_TIMEOUT(),
|
||||
false, tdb_null) != 0) {
|
||||
false, tdb_null, NULL) != 0) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
|
||||
talloc_free(tmp_ctx);
|
||||
return -1;
|
||||
@ -275,7 +309,7 @@ static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_node_map *no
|
||||
|
||||
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
|
||||
nodes, CONTROL_TIMEOUT(),
|
||||
false, data) != 0) {
|
||||
false, data, NULL) != 0) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
|
||||
talloc_free(tmp_ctx);
|
||||
return -1;
|
||||
@ -284,7 +318,7 @@ static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_node_map *no
|
||||
if (rec_mode == CTDB_RECOVERY_NORMAL) {
|
||||
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_THAW,
|
||||
nodes, CONTROL_TIMEOUT(),
|
||||
false, tdb_null) != 0) {
|
||||
false, tdb_null, NULL) != 0) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " Unable to thaw nodes. Recovery failed.\n"));
|
||||
talloc_free(tmp_ctx);
|
||||
return -1;
|
||||
@ -311,7 +345,7 @@ static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *
|
||||
|
||||
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
|
||||
list_of_active_nodes(ctdb, nodemap, tmp_ctx, true),
|
||||
CONTROL_TIMEOUT(), false, data) != 0) {
|
||||
CONTROL_TIMEOUT(), false, data, NULL) != 0) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
|
||||
talloc_free(tmp_ctx);
|
||||
return -1;
|
||||
@ -1142,7 +1176,7 @@ static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
|
||||
|
||||
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
|
||||
list_of_active_nodes(ctdb, nodemap, tmp_ctx, true),
|
||||
CONTROL_TIMEOUT(), false, outdata) != 0) {
|
||||
CONTROL_TIMEOUT(), false, outdata, NULL) != 0) {
|
||||
DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
|
||||
talloc_free(recdata);
|
||||
talloc_free(tmp_ctx);
|
||||
@ -1198,7 +1232,7 @@ static int recover_database(struct ctdb_recoverd *rec,
|
||||
|
||||
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
|
||||
list_of_active_nodes(ctdb, nodemap, recdb, true),
|
||||
CONTROL_TIMEOUT(), false, data) != 0) {
|
||||
CONTROL_TIMEOUT(), false, data, NULL) != 0) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
|
||||
talloc_free(recdb);
|
||||
return -1;
|
||||
@ -1321,7 +1355,7 @@ static int do_recovery(struct ctdb_recoverd *rec,
|
||||
|
||||
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
|
||||
list_of_active_nodes(ctdb, nodemap, mem_ctx, true),
|
||||
CONTROL_TIMEOUT(), false, data) != 0) {
|
||||
CONTROL_TIMEOUT(), false, data, NULL) != 0) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
|
||||
return -1;
|
||||
}
|
||||
@ -1340,7 +1374,7 @@ static int do_recovery(struct ctdb_recoverd *rec,
|
||||
/* commit all the changes */
|
||||
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
|
||||
list_of_active_nodes(ctdb, nodemap, mem_ctx, true),
|
||||
CONTROL_TIMEOUT(), false, data) != 0) {
|
||||
CONTROL_TIMEOUT(), false, data, NULL) != 0) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
|
||||
return -1;
|
||||
}
|
||||
@ -1348,19 +1382,45 @@ static int do_recovery(struct ctdb_recoverd *rec,
|
||||
DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
|
||||
|
||||
|
||||
/* update the capabilities for all nodes */
|
||||
ret = update_capabilities(ctdb, nodemap);
|
||||
if (ret!=0) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* build a new vnn map with all the currently active and
|
||||
unbanned nodes */
|
||||
generation = new_generation();
|
||||
vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
|
||||
CTDB_NO_MEMORY(ctdb, vnnmap);
|
||||
vnnmap->generation = generation;
|
||||
vnnmap->size = rec->num_active;
|
||||
vnnmap->size = 0;
|
||||
vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
|
||||
CTDB_NO_MEMORY(ctdb, vnnmap->map);
|
||||
for (i=j=0;i<nodemap->num;i++) {
|
||||
if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
|
||||
vnnmap->map[j++] = nodemap->nodes[i].pnn;
|
||||
if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
|
||||
continue;
|
||||
}
|
||||
if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
|
||||
/* this node can not be an lmaster */
|
||||
DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
|
||||
continue;
|
||||
}
|
||||
|
||||
vnnmap->size++;
|
||||
vnnmap->map = talloc_realloc_size(vnnmap, vnnmap->map, vnnmap->size);
|
||||
CTDB_NO_MEMORY(ctdb, vnnmap->map);
|
||||
vnnmap->map[j++] = nodemap->nodes[i].pnn;
|
||||
|
||||
}
|
||||
if (vnnmap->size == 0) {
|
||||
DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
|
||||
vnnmap->size++;
|
||||
vnnmap->map = talloc_realloc_size(vnnmap, vnnmap->map, vnnmap->size);
|
||||
CTDB_NO_MEMORY(ctdb, vnnmap->map);
|
||||
vnnmap->map[0] = pnn;
|
||||
}
|
||||
|
||||
/* update to the new vnnmap on all nodes */
|
||||
ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
|
||||
|
@ -443,7 +443,7 @@ struct sockaddr_in *sin)
|
||||
/* send release ip to all nodes */
|
||||
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RELEASE_IP,
|
||||
list_of_active_nodes(ctdb, nodemap, ctdb, true),
|
||||
TIMELIMIT(), false, data) != 0) {
|
||||
TIMELIMIT(), false, data, NULL) != 0) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " Unable to send 'ReleaseIP' to all nodes.\n"));
|
||||
return -1;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user