1
0
mirror of https://github.com/samba-team/samba.git synced 2024-12-22 13:34:15 +03:00

Expand the client async framework so that it can take a callback function.

This allows us to use the async framework also for controls that return
outdata.

Add a "capabilities" field to the ctdb_node structure. This field is
only initialized and kept valid inside the recovery daemon context and not
inside the main ctdb daemon.

Change the GET_CAPABILITIES control to return the capabilities in outdata instead of in the res return variable.

When performing a recovery inside the recovery daemon, read the capabilities from all connected nodes and update the ctdb->nodes list of nodes.
When building the new vnnmap after the database rebuild in recovery, do not include any nodes which lack the LMASTER capability in the new vnnmap.
If no connected node with the LMASTER capability is available, the local node (recmaster) takes on the lmaster role temporarily (i.e. becomes a member of the vnnmap list).

(This used to be ctdb commit 0f1883c69c689b28b0c04148774840b2c4081df6)
This commit is contained in:
Ronnie Sahlberg 2008-05-06 15:42:59 +10:00
parent 2c23959616
commit 92b61cd7d5
6 changed files with 119 additions and 22 deletions

View File

@ -2671,8 +2671,11 @@ int ctdb_ctrl_end_recovery(struct ctdb_context *ctdb, struct timeval timeout, ui
static void async_callback(struct ctdb_client_control_state *state)
{
struct client_async_data *data = talloc_get_type(state->async.private_data, struct client_async_data);
struct ctdb_context *ctdb = talloc_get_type(state->ctdb, struct ctdb_context);
int ret;
TDB_DATA outdata;
int32_t res;
uint32_t destnode = state->c->hdr.destnode;
/* one more node has responded with recmode data */
data->count--;
@ -2690,13 +2693,16 @@ static void async_callback(struct ctdb_client_control_state *state)
state->async.fn = NULL;
ret = ctdb_control_recv(state->ctdb, state, data, NULL, &res, NULL);
ret = ctdb_control_recv(ctdb, state, data, &outdata, &res, NULL);
if ((ret != 0) || (res != 0)) {
if ( !data->dont_log_errors) {
DEBUG(DEBUG_ERR,("Async operation failed with ret=%d res=%d\n", ret, (int)res));
}
data->fail_count++;
}
if ((ret == 0) && (data->callback != NULL)) {
data->callback(ctdb, destnode, res, outdata);
}
}
@ -2739,15 +2745,17 @@ int ctdb_client_async_control(struct ctdb_context *ctdb,
uint32_t *nodes,
struct timeval timeout,
bool dont_log_errors,
TDB_DATA data)
TDB_DATA data,
client_async_callback client_callback)
{
struct client_async_data *async_data;
struct ctdb_client_control_state *state;
int j, num_nodes;
async_data = talloc_zero(ctdb, struct client_async_data);
CTDB_NO_MEMORY_FATAL(ctdb, async_data);
async_data->dont_log_errors = dont_log_errors;
async_data->callback = client_callback;
num_nodes = talloc_get_size(nodes) / sizeof(uint32_t);
@ -2886,15 +2894,16 @@ int ctdb_ctrl_getcapabilities_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ct
{
int ret;
int32_t res;
TDB_DATA outdata;
ret = ctdb_control_recv(ctdb, state, mem_ctx, NULL, &res, NULL);
if (ret != 0) {
ret = ctdb_control_recv(ctdb, state, mem_ctx, &outdata, &res, NULL);
if ( (ret != 0) || (res != 0) ) {
DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_getcapabilities_recv failed\n"));
return -1;
}
if (capabilities) {
*capabilities = (uint32_t)res;
*capabilities = *((uint32_t *)outdata.dptr);
}
return 0;

View File

@ -199,6 +199,11 @@ struct ctdb_node {
uint32_t rx_cnt;
uint32_t tx_cnt;
/* used to track node capabilities, is only valid/tracked inside the
recovery daemon.
*/
uint32_t capabilities;
/* a list of controls pending to this node, so we can time them out quickly
if the node becomes disconnected */
struct daemon_control_state *pending_controls;
@ -1276,10 +1281,13 @@ int32_t ctdb_monitoring_mode(struct ctdb_context *ctdb);
int ctdb_set_child_logging(struct ctdb_context *ctdb);
/*
  Optional per-reply hook for the client async framework.
  async_callback() invokes it with the pnn of the node the control was
  sent to, the control's res value and the outdata returned by
  ctdb_control_recv(), but only when ctdb_control_recv() itself
  returned 0.
*/
typedef void (*client_async_callback)(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata);
/*
  Bookkeeping shared by all controls sent in one async batch.
*/
struct client_async_data {
/* when true, async_callback() does not log failed operations */
bool dont_log_errors;
/* decremented by async_callback() each time a node has responded */
uint32_t count;
/* incremented by async_callback() for every reply with ret != 0 or res != 0 */
uint32_t fail_count;
/* may be NULL; see client_async_callback above */
client_async_callback callback;
};
void ctdb_client_async_add(struct client_async_data *data, struct ctdb_client_control_state *state);
int ctdb_client_async_wait(struct ctdb_context *ctdb, struct client_async_data *data);
@ -1288,12 +1296,14 @@ int ctdb_client_async_control(struct ctdb_context *ctdb,
uint32_t *nodes,
struct timeval timeout,
bool dont_log_errors,
TDB_DATA data);
TDB_DATA data,
client_async_callback client_callback);
void ctdb_load_nodes_file(struct ctdb_context *ctdb);
int ctdb_control_reload_nodes_file(struct ctdb_context *ctdb, uint32_t opcode);
int32_t ctdb_dump_memory(struct ctdb_context *ctdb, TDB_DATA *outdata);
int32_t ctdb_control_get_capabilities(struct ctdb_context *ctdb, TDB_DATA *outdata);
#endif

View File

@ -390,7 +390,7 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb,
return ctdb_control_del_public_address(ctdb, indata);
case CTDB_CONTROL_GET_CAPABILITIES:
return ctdb->capabilities;
return ctdb_control_get_capabilities(ctdb, outdata);
default:
DEBUG(DEBUG_CRIT,(__location__ " Unknown CTDB control opcode %u\n", opcode));

View File

@ -957,3 +957,21 @@ int32_t ctdb_control_try_delete_records(struct ctdb_context *ctdb, TDB_DATA inda
return 0;
}
/*
  GET_CAPABILITIES control handler.

  Copies ctdb->capabilities into a freshly allocated uint32_t that is
  parented to outdata, and points outdata at it.
  Returns 0 once outdata has been filled in.
*/
int32_t ctdb_control_get_capabilities(struct ctdb_context *ctdb, TDB_DATA *outdata)
{
	uint32_t *caps = talloc(outdata, uint32_t);

	CTDB_NO_MEMORY(ctdb, caps);
	*caps = ctdb->capabilities;

	/* hand the buffer over to the caller through outdata */
	outdata->dptr = (uint8_t *)caps;
	outdata->dsize = sizeof(uint32_t);

	return 0;
}

View File

@ -212,7 +212,7 @@ static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
list_of_active_nodes(ctdb, nodemap, tmp_ctx, true),
CONTROL_TIMEOUT(), false, tdb_null) != 0) {
CONTROL_TIMEOUT(), false, tdb_null, NULL) != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event. Recovery failed.\n"));
talloc_free(tmp_ctx);
return -1;
@ -234,7 +234,7 @@ static int run_startrecovery_eventscript(struct ctdb_context *ctdb, struct ctdb_
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
list_of_active_nodes(ctdb, nodemap, tmp_ctx, true),
CONTROL_TIMEOUT(), false, tdb_null) != 0) {
CONTROL_TIMEOUT(), false, tdb_null, NULL) != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
talloc_free(tmp_ctx);
return -1;
@ -244,6 +244,40 @@ static int run_startrecovery_eventscript(struct ctdb_context *ctdb, struct ctdb_
return 0;
}
/*
  Per-node callback for the GET_CAPABILITIES async control.

  ctdb     - context whose nodes[] array receives the capabilities
  node_pnn - node the reply came from; used as index into ctdb->nodes[]
  res      - result code of the control (not examined here)
  outdata  - reply payload, expected to be exactly one uint32_t

  Replies with an unexpected payload, or from a node number outside
  ctdb->nodes[], are logged and ignored.
*/
static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata)
{
	if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
		/* dsize is not an int, so cast it to match the %u conversion */
		DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, (void *)outdata.dptr));
		return;
	}

	/* never index past the end of the node array */
	if (node_pnn >= ctdb->num_nodes) {
		DEBUG(DEBUG_ERR, (__location__ " Ignoring capabilities from unknown node %u\n", (unsigned)node_pnn));
		return;
	}

	ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
}
/*
  Refresh the capabilities of the nodes returned by
  list_of_active_nodes().

  Sends CTDB_CONTROL_GET_CAPABILITIES to each of them; every reply is
  handed to async_getcap_callback.
  Returns 0 on success, -1 if the async control reports a failure.
*/
static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
{
	TALLOC_CTX *scratch = talloc_new(ctdb);
	uint32_t *targets;
	int status = 0;

	CTDB_NO_MEMORY(ctdb, scratch);

	/* the node list lives on the scratch context and is released with it */
	targets = list_of_active_nodes(ctdb, nodemap, scratch, true);

	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
				      targets, CONTROL_TIMEOUT(),
				      false, tdb_null, async_getcap_callback) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
		status = -1;
	}

	talloc_free(scratch);
	return status;
}
/*
change recovery mode on all nodes
*/
@ -262,7 +296,7 @@ static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_node_map *no
if (rec_mode == CTDB_RECOVERY_ACTIVE) {
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
nodes, CONTROL_TIMEOUT(),
false, tdb_null) != 0) {
false, tdb_null, NULL) != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
talloc_free(tmp_ctx);
return -1;
@ -275,7 +309,7 @@ static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_node_map *no
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
nodes, CONTROL_TIMEOUT(),
false, data) != 0) {
false, data, NULL) != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
talloc_free(tmp_ctx);
return -1;
@ -284,7 +318,7 @@ static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_node_map *no
if (rec_mode == CTDB_RECOVERY_NORMAL) {
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_THAW,
nodes, CONTROL_TIMEOUT(),
false, tdb_null) != 0) {
false, tdb_null, NULL) != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to thaw nodes. Recovery failed.\n"));
talloc_free(tmp_ctx);
return -1;
@ -311,7 +345,7 @@ static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
list_of_active_nodes(ctdb, nodemap, tmp_ctx, true),
CONTROL_TIMEOUT(), false, data) != 0) {
CONTROL_TIMEOUT(), false, data, NULL) != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
talloc_free(tmp_ctx);
return -1;
@ -1142,7 +1176,7 @@ static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
list_of_active_nodes(ctdb, nodemap, tmp_ctx, true),
CONTROL_TIMEOUT(), false, outdata) != 0) {
CONTROL_TIMEOUT(), false, outdata, NULL) != 0) {
DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
talloc_free(recdata);
talloc_free(tmp_ctx);
@ -1198,7 +1232,7 @@ static int recover_database(struct ctdb_recoverd *rec,
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
list_of_active_nodes(ctdb, nodemap, recdb, true),
CONTROL_TIMEOUT(), false, data) != 0) {
CONTROL_TIMEOUT(), false, data, NULL) != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
talloc_free(recdb);
return -1;
@ -1321,7 +1355,7 @@ static int do_recovery(struct ctdb_recoverd *rec,
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
list_of_active_nodes(ctdb, nodemap, mem_ctx, true),
CONTROL_TIMEOUT(), false, data) != 0) {
CONTROL_TIMEOUT(), false, data, NULL) != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
return -1;
}
@ -1340,7 +1374,7 @@ static int do_recovery(struct ctdb_recoverd *rec,
/* commit all the changes */
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
list_of_active_nodes(ctdb, nodemap, mem_ctx, true),
CONTROL_TIMEOUT(), false, data) != 0) {
CONTROL_TIMEOUT(), false, data, NULL) != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
return -1;
}
@ -1348,19 +1382,45 @@ static int do_recovery(struct ctdb_recoverd *rec,
DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
/* update the capabilities for all nodes */
ret = update_capabilities(ctdb, nodemap);
if (ret!=0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
return -1;
}
/* build a new vnn map with all the currently active and
unbanned nodes */
generation = new_generation();
vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
CTDB_NO_MEMORY(ctdb, vnnmap);
vnnmap->generation = generation;
vnnmap->size = rec->num_active;
vnnmap->size = 0;
vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
CTDB_NO_MEMORY(ctdb, vnnmap->map);
for (i=j=0;i<nodemap->num;i++) {
if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
vnnmap->map[j++] = nodemap->nodes[i].pnn;
if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
continue;
}
if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
/* this node can not be an lmaster */
DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
continue;
}
vnnmap->size++;
vnnmap->map = talloc_realloc_size(vnnmap, vnnmap->map, vnnmap->size);
CTDB_NO_MEMORY(ctdb, vnnmap->map);
vnnmap->map[j++] = nodemap->nodes[i].pnn;
}
if (vnnmap->size == 0) {
DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
vnnmap->size++;
vnnmap->map = talloc_realloc_size(vnnmap, vnnmap->map, vnnmap->size);
CTDB_NO_MEMORY(ctdb, vnnmap->map);
vnnmap->map[0] = pnn;
}
/* update to the new vnnmap on all nodes */
ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);

View File

@ -443,7 +443,7 @@ struct sockaddr_in *sin)
/* send release ip to all nodes */
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RELEASE_IP,
list_of_active_nodes(ctdb, nodemap, ctdb, true),
TIMELIMIT(), false, data) != 0) {
TIMELIMIT(), false, data, NULL) != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to send 'ReleaseIP' to all nodes.\n"));
return -1;
}