Mirror of git://sourceware.org/git/lvm2.git
- Updating cluster log with latest code changes/bug fixes before altering to the new kernel structures.
commit ceeb1eca9d
parent 5800560602
@@ -31,7 +31,6 @@ static void process_signals(void);
 static void daemonize(void);
 static void init_all(void);
 static void cleanup_all(void);
-static void set_priority(void);
 
 int main(int argc, char *argv[])
 {
@@ -42,8 +41,6 @@ int main(int argc, char *argv[])
 	/* Parent can now exit, we're ready to handle requests */
 	kill(getppid(), SIGTERM);
 
-	/* set_priority(); -- let's try to do w/o this */
-
 	LOG_PRINT("Starting clogd:");
 	LOG_PRINT(" Built: "__DATE__" "__TIME__"\n");
 	LOG_DBG(" Compiled with debugging.");
@@ -266,18 +263,3 @@ static void cleanup_all(void)
 	cleanup_local();
 	cleanup_cluster();
 }
-
-static void set_priority(void)
-{
-	struct sched_param sched_param;
-	int res;
-
-	res = sched_get_priority_max(SCHED_RR);
-	if (res != -1) {
-		sched_param.sched_priority = res;
-		res = sched_setscheduler(0, SCHED_RR, &sched_param);
-	}
-
-	if (res == -1)
-		LOG_ERROR("Unable to set SCHED_RR priority.");
-}
@@ -68,9 +68,14 @@ static SaCkptHandleT ckpt_handle = 0;
 static SaCkptCallbacksT callbacks = { 0, 0 };
 static SaVersionT version = { 'B', 1, 1 };
 
-#define DEBUGGING_HISTORY 50
+#define DEBUGGING_HISTORY 100
 static char debugging[DEBUGGING_HISTORY][128];
 static int idx = 0;
+#define LOG_SPRINT(f, arg...) do {\
+		idx++; \
+		idx = idx % DEBUGGING_HISTORY; \
+		sprintf(debugging[idx], f, ## arg); \
+	} while (0)
 
 static int log_resp_rec = 0;
 
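The LOG_SPRINT macro added above keeps the most recent DEBUGGING_HISTORY messages in a fixed-size ring indexed by idx, and later hunks in this commit raise the history to 100 entries and raise SIGUSR1 when something looks wrong. The dump side is not part of this diff, so the following is only a minimal sketch of how such a ring could be printed oldest-first; the helper name dump_debug_history is an assumption, and it presumes LOG_PRINT accepts printf-style arguments like the other LOG_* macros here:

/*
 * Hypothetical helper (not in this commit): walk the ring starting
 * one slot past the most recent entry so messages come out
 * oldest-first; slots that were never written are skipped.
 */
static void dump_debug_history(void)
{
	int i;

	for (i = 0; i < DEBUGGING_HISTORY; i++) {
		int slot = (idx + 1 + i) % DEBUGGING_HISTORY;

		if (debugging[slot][0])
			LOG_PRINT("%s", debugging[slot]);
	}
}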
@@ -213,8 +218,17 @@ static int handle_cluster_request(struct clog_cpg *entry,
 	 * a cluster action to co-ordinate reading
 	 * the disk and checkpointing
 	 */
-	if ((t->request_type != DM_CLOG_RESUME) ||
-	    (t->originator == my_cluster_id))
+	if (t->request_type == DM_CLOG_RESUME) {
+		if (t->originator == my_cluster_id) {
+			r = do_request(t, server);
+
+			r = kernel_send(t);
+			if (r < 0)
+				LOG_ERROR("Failed to send resume response to kernel");
+		}
+		return r;
+	}
+
 	r = do_request(t, server);
 
 	if (server &&
@@ -337,7 +351,7 @@ static struct checkpoint_data *prepare_checkpoint(struct clog_cpg *entry,
 	strncpy(new->uuid, entry->name.value, entry->name.length);
 
 	new->bitmap_size = push_state(entry->name.value, "clean_bits",
-				      &new->clean_bits);
+				      &new->clean_bits, cp_requester);
 	if (new->bitmap_size <= 0) {
 		LOG_ERROR("Failed to store clean_bits to checkpoint for node %u",
 			  new->requester);
@@ -346,7 +360,7 @@ static struct checkpoint_data *prepare_checkpoint(struct clog_cpg *entry,
 	}
 
 	new->bitmap_size = push_state(entry->name.value,
-				      "sync_bits", &new->sync_bits);
+				      "sync_bits", &new->sync_bits, cp_requester);
 	if (new->bitmap_size <= 0) {
 		LOG_ERROR("Failed to store sync_bits to checkpoint for node %u",
 			  new->requester);
@@ -355,7 +369,7 @@ static struct checkpoint_data *prepare_checkpoint(struct clog_cpg *entry,
 		return NULL;
 	}
 
-	r = push_state(entry->name.value, "recovering_region", &new->recovering_region);
+	r = push_state(entry->name.value, "recovering_region", &new->recovering_region, cp_requester);
 	if (r <= 0) {
 		LOG_ERROR("Failed to store recovering_region to checkpoint for node %u",
 			  new->requester);
@@ -541,6 +555,7 @@ rr_create_retry:
 	tfr->request_type = DM_CLOG_CHECKPOINT_READY;
 	tfr->originator = cp->requester; /* FIXME: hack to overload meaning of originator */
 	strncpy(tfr->uuid, cp->uuid, CPG_MAX_NAME_LENGTH);
+	tfr->seq = my_cluster_id;
 
 	r = cluster_send(tfr);
 	if (r)
@@ -704,15 +719,11 @@ no_read:
 	return rtn;
 }
 
-static void do_checkpoints(struct clog_cpg *entry)
+static void do_checkpoints(struct clog_cpg *entry, int leaving)
 {
 	struct checkpoint_data *cp;
 
 	for (cp = entry->checkpoint_list; cp;) {
-		LOG_COND(log_checkpoint,
-			 "[%s] Checkpoint data available for node %u",
-			 SHORT_UUID(entry->name.value), cp->requester);
-
 		/*
 		 * FIXME: Check return code. Could send failure
 		 * notice in tfr in export_checkpoint function
@@ -720,18 +731,34 @@ static void do_checkpoints(struct clog_cpg *entry)
 		 */
 		switch (export_checkpoint(cp)) {
 		case -EEXIST:
+			LOG_SPRINT("[%s] Checkpoint for %u already handled%s",
+				   SHORT_UUID(entry->name.value), cp->requester,
+				   (leaving) ? "(L)": "");
 			LOG_COND(log_checkpoint,
-				 "[%s] Checkpoint for %u already handled",
-				 SHORT_UUID(entry->name.value), cp->requester);
+				 "[%s] Checkpoint for %u already handled%s",
+				 SHORT_UUID(entry->name.value), cp->requester,
+				 (leaving) ? "(L)": "");
+			entry->checkpoint_list = cp->next;
+			free_checkpoint(cp);
+			cp = entry->checkpoint_list;
+			break;
 		case 0:
+			LOG_SPRINT("[%s] Checkpoint data available for node %u%s",
+				   SHORT_UUID(entry->name.value), cp->requester,
+				   (leaving) ? "(L)": "");
+			LOG_COND(log_checkpoint,
+				 "[%s] Checkpoint data available for node %u%s",
+				 SHORT_UUID(entry->name.value), cp->requester,
+				 (leaving) ? "(L)": "");
 			entry->checkpoint_list = cp->next;
 			free_checkpoint(cp);
 			cp = entry->checkpoint_list;
 			break;
 		default:
 			/* FIXME: Skipping will cause list corruption */
-			LOG_ERROR("[%s] Failed to export checkpoint for %u",
-				  SHORT_UUID(entry->name.value), cp->requester);
+			LOG_ERROR("[%s] Failed to export checkpoint for %u%s",
+				  SHORT_UUID(entry->name.value), cp->requester,
+				  (leaving) ? "(L)": "");
 		}
 	}
 }
@@ -763,8 +790,6 @@ static int resend_requests(struct clog_cpg *entry)
 		}
 
 		switch (tfr->request_type) {
-		case DM_CLOG_RESUME:
-			/* We are only concerned about this request locally */
 		case DM_CLOG_SET_REGION_SYNC:
 			/*
 			 * Some requests simply do not need to be resent.
@@ -776,11 +801,10 @@ static int resend_requests(struct clog_cpg *entry)
 				 "[%s] Skipping resend of %s/#%u...",
 				 SHORT_UUID(entry->name.value),
 				 _RQ_TYPE(tfr->request_type), tfr->seq);
-			idx++;
-			idx = idx % DEBUGGING_HISTORY;
-			sprintf(debugging[idx], "### No resend: [%s] %s/%u ###",
-				SHORT_UUID(entry->name.value), _RQ_TYPE(tfr->request_type),
-				tfr->seq);
+			LOG_SPRINT("### No resend: [%s] %s/%u ###",
+				   SHORT_UUID(entry->name.value),
+				   _RQ_TYPE(tfr->request_type), tfr->seq);
 			tfr->data_size = 0;
 			kernel_send(tfr);
 
@@ -796,11 +820,9 @@ static int resend_requests(struct clog_cpg *entry)
 				 SHORT_UUID(entry->name.value),
 				 _RQ_TYPE(tfr->request_type),
 				 tfr->seq, entry->lowest_id);
-			idx++;
-			idx = idx % DEBUGGING_HISTORY;
-			sprintf(debugging[idx], "*** Resending: [%s] %s/%u ***",
-				SHORT_UUID(entry->name.value), _RQ_TYPE(tfr->request_type),
-				tfr->seq);
+			LOG_SPRINT("*** Resending: [%s] %s/%u ***",
+				   SHORT_UUID(entry->name.value),
+				   _RQ_TYPE(tfr->request_type), tfr->seq);
 			r = cluster_send(tfr);
 			if (r < 0)
 				LOG_ERROR("Failed resend");
@@ -825,7 +847,7 @@ static int do_cluster_work(void *data)
 				free(entry);
 				continue;
 			}
-			do_checkpoints(entry);
+			do_checkpoints(entry, 0);
 
 			resend_requests(entry);
 		}
@@ -858,6 +880,8 @@ static int flush_startup_list(struct clog_cpg *entry)
 				free(tfr);
 				continue;
 			}
+			LOG_SPRINT("[%s] Checkpoint prepared for %u",
+				   SHORT_UUID(entry->name.value), tfr->originator);
 			LOG_COND(log_checkpoint, "[%s] Checkpoint prepared for %u",
 				 SHORT_UUID(entry->name.value), tfr->originator);
 			new->next = entry->checkpoint_list;
@@ -878,6 +902,7 @@ static int flush_startup_list(struct clog_cpg *entry)
 		}
 		free(tfr);
 	}
 
 	return 0;
 }
 
@@ -901,6 +926,7 @@ static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname,
 
 	if ((nodeid == my_cluster_id) &&
 	    !(tfr->request_type & DM_CLOG_RESPONSE) &&
+	    (tfr->request_type != DM_CLOG_RESUME) &&
 	    (tfr->request_type != DM_CLOG_CLEAR_REGION) &&
 	    (tfr->request_type != DM_CLOG_CHECKPOINT_READY)) {
 		tmp_tfr = malloc(DM_CLOG_TFR_SIZE);
@@ -915,6 +941,7 @@ static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname,
 			return;
 		}
 		memcpy(tmp_tfr, tfr, sizeof(*tfr) + tfr->data_size);
+		INIT_LIST_HEAD((struct list_head *)&tmp_tfr->private);
 		list_add_tail((struct list_head *)&tmp_tfr->private, &match->working_list);
 	}
 
@@ -952,6 +979,7 @@ static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname,
 			LOG_COND(log_resend_requests, "[%s] %u is leaving, delay = %d",
 				 SHORT_UUID(tfr->uuid), nodeid, match->delay);
 		}
+		tfr->originator = nodeid; /* don't really need this, but nice for debug */
 		goto out;
 	}
 }
@@ -969,45 +997,33 @@ static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname,
 	if (tfr->request_type == DM_CLOG_CHECKPOINT_READY) {
 		if (my_cluster_id == tfr->originator) {
 			/* Redundant checkpoints ignored if match->valid */
+			LOG_SPRINT("[%s] CHECKPOINT_READY notification from %u",
+				   SHORT_UUID(tfr->uuid), nodeid);
 			if (import_checkpoint(match, (match->state != INVALID))) {
+				LOG_SPRINT("[%s] Failed to import checkpoint from %u",
+					   SHORT_UUID(tfr->uuid), nodeid);
 				LOG_ERROR("[%s] Failed to import checkpoint from %u",
 					  SHORT_UUID(tfr->uuid), nodeid);
+				kill(getpid(), SIGUSR1);
 				/* Could we retry? */
 				goto out;
 			} else if (match->state == INVALID) {
+				LOG_SPRINT("[%s] Checkpoint data received from %u. Log is now valid",
+					   SHORT_UUID(match->name.value), nodeid);
 				LOG_COND(log_checkpoint,
 					 "[%s] Checkpoint data received from %u. Log is now valid",
 					 SHORT_UUID(match->name.value), nodeid);
 				match->state = VALID;
 
 				flush_startup_list(match);
+			} else {
+				LOG_SPRINT("[%s] Redundant checkpoint from %u ignored.",
+					   SHORT_UUID(tfr->uuid), nodeid);
 			}
 		}
 		goto out;
 	}
 
-	/*
-	 * If the log is now valid, we can queue the checkpoints
-	 */
-	for (i = match->checkpoints_needed; i; ) {
-		struct checkpoint_data *new;
-
-		i--;
-		new = prepare_checkpoint(match, match->checkpoint_requesters[i]);
-		if (!new) {
-			/* FIXME: Need better error handling */
-			LOG_ERROR("[%s] Failed to prepare checkpoint for %u!!!",
-				  SHORT_UUID(tfr->uuid), match->checkpoint_requesters[i]);
-			break;
-		}
-		LOG_COND(log_checkpoint, "[%s] Checkpoint prepared for %u*",
-			 SHORT_UUID(tfr->uuid), match->checkpoint_requesters[i]);
-		match->checkpoints_needed--;
-
-		new->next = match->checkpoint_list;
-		match->checkpoint_list = new;
-	}
-
 	if (tfr->request_type & DM_CLOG_RESPONSE) {
 		response = 1;
 		r = handle_cluster_response(match, tfr);
@@ -1033,6 +1049,7 @@ static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname,
 
 		memcpy(tmp_tfr, tfr, sizeof(*tfr) + tfr->data_size);
 		tmp_tfr->error = match->lowest_id;
+		INIT_LIST_HEAD((struct list_head *)&tmp_tfr->private);
 		list_add_tail((struct list_head *)&tmp_tfr->private,
 			      &match->startup_list);
 		goto out;
@@ -1041,6 +1058,37 @@ static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname,
 		r = handle_cluster_request(match, tfr, i_am_server);
 	}
 
+	/*
+	 * If the log is now valid, we can queue the checkpoints
+	 */
+	for (i = match->checkpoints_needed; i; ) {
+		struct checkpoint_data *new;
+
+		if (log_get_state(tfr) != LOG_RESUMED) {
+			LOG_DBG("[%s] Withholding checkpoints until log is valid (%s from %u)",
+				SHORT_UUID(tfr->uuid), _RQ_TYPE(tfr->request_type), nodeid);
+			break;
+		}
+
+		i--;
+		new = prepare_checkpoint(match, match->checkpoint_requesters[i]);
+		if (!new) {
+			/* FIXME: Need better error handling */
+			LOG_ERROR("[%s] Failed to prepare checkpoint for %u!!!",
+				  SHORT_UUID(tfr->uuid), match->checkpoint_requesters[i]);
+			break;
+		}
+		LOG_SPRINT("[%s] Checkpoint prepared for %u* (%s)",
+			   SHORT_UUID(tfr->uuid), match->checkpoint_requesters[i],
+			   (log_get_state(tfr) != LOG_RESUMED)? "LOG_RESUMED": "LOG_SUSPENDED");
+		LOG_COND(log_checkpoint, "[%s] Checkpoint prepared for %u*",
+			 SHORT_UUID(tfr->uuid), match->checkpoint_requesters[i]);
+		match->checkpoints_needed--;
+
+		new->next = match->checkpoint_list;
+		match->checkpoint_list = new;
+	}
+
 out:
 	/* nothing happens after this point. It is just for debugging */
 	if (r) {
@@ -1066,17 +1114,17 @@ out:
 		}
 	} else if (!(tfr->request_type & DM_CLOG_RESPONSE) ||
 		   (tfr->originator == my_cluster_id)) {
-		int len;
-		idx++;
-		idx = idx % DEBUGGING_HISTORY;
-		len = sprintf(debugging[idx],
-			      "SEQ#=%u, UUID=%s, TYPE=%s, ORIG=%u, RESP=%s",
-			      tfr->seq,
-			      SHORT_UUID(tfr->uuid),
+		if (!response)
+			LOG_SPRINT("SEQ#=%u, UUID=%s, TYPE=%s, ORIG=%u, RESP=%s",
+				   tfr->seq, SHORT_UUID(tfr->uuid),
 				   _RQ_TYPE(tfr->request_type),
 				   tfr->originator, (response) ? "YES" : "NO");
-		if (response)
-			sprintf(debugging[idx] + len, ", RSPR=%u", nodeid);
+		else
+			LOG_SPRINT("SEQ#=%u, UUID=%s, TYPE=%s, ORIG=%u, RESP=%s, RSPR=%u",
+				   tfr->seq, SHORT_UUID(tfr->uuid),
+				   _RQ_TYPE(tfr->request_type),
+				   tfr->originator, (response) ? "YES" : "NO",
+				   nodeid);
 	}
 }
 
@@ -1089,6 +1137,7 @@ static void cpg_join_callback(struct clog_cpg *match,
 	int my_pid = getpid();
 	uint32_t lowest = match->lowest_id;
 	struct clog_tfr *tfr;
+	char dbuf[32];
 
 	/* Assign my_cluster_id */
 	if ((my_cluster_id == 0xDEAD) && (joined->pid == my_pid))
@@ -1104,8 +1153,12 @@ static void cpg_join_callback(struct clog_cpg *match,
 	if (joined->nodeid == my_cluster_id)
 		goto out;
 
-	LOG_COND(log_checkpoint, "[%s] Joining node, %u needs checkpoint",
-		 SHORT_UUID(match->name.value), joined->nodeid);
+	memset(dbuf, 0, sizeof(dbuf));
+	for (i = 0; i < (member_list_entries-1); i++)
+		sprintf(dbuf+strlen(dbuf), "%u-", member_list[i].nodeid);
+	sprintf(dbuf+strlen(dbuf), "(%u)", joined->nodeid);
+	LOG_COND(log_checkpoint, "[%s] Joining node, %u needs checkpoint [%s]",
+		 SHORT_UUID(match->name.value), joined->nodeid, dbuf);
 
 	/*
 	 * FIXME: remove checkpoint_requesters/checkpoints_needed, and use
@@ -1127,6 +1180,7 @@ static void cpg_join_callback(struct clog_cpg *match,
 	}
 	tfr->request_type = DM_CLOG_MEMBER_JOIN;
 	tfr->originator = joined->nodeid;
+	INIT_LIST_HEAD((struct list_head *)&tfr->private);
 	list_add_tail((struct list_head *)&tfr->private, &match->startup_list);
 
 out:
@@ -1149,9 +1203,7 @@ out:
 		LOG_COND(log_membership_change, "[%s] Server unchanged at %u (%u joined)",
 			 SHORT_UUID(match->name.value),
 			 lowest, joined->nodeid);
-	idx++;
-	idx = idx % DEBUGGING_HISTORY;
-	sprintf(debugging[idx], "+++ UUID=%s %u join +++",
+	LOG_SPRINT("+++ UUID=%s %u join +++",
 		SHORT_UUID(match->name.value), joined->nodeid);
 }
 
@@ -1160,17 +1212,14 @@ static void cpg_leave_callback(struct clog_cpg *match,
 			       struct cpg_address *member_list,
 			       int member_list_entries)
 {
-	int i, fd;
+	int i, j, fd;
 	struct list_head *p, *n;
 	uint32_t lowest = match->lowest_id;
 	struct clog_tfr *tfr;
+	struct checkpoint_data *p_cp, *c_cp;
 
-	{
-		idx++;
-		idx = idx % DEBUGGING_HISTORY;
-		sprintf(debugging[idx], "--- UUID=%s %u left ---",
-			SHORT_UUID(match->name.value), left->nodeid);
-	}
+	LOG_SPRINT("--- UUID=%s %u left ---",
+		   SHORT_UUID(match->name.value), left->nodeid);
 
 	/* Am I leaving? */
 	if (my_cluster_id == left->nodeid) {
@@ -1198,6 +1247,42 @@ static void cpg_leave_callback(struct clog_cpg *match,
 		match->state = INVALID;
 	}
 
+	/* Remove any pending checkpoints for the leaving node. */
+	for (p_cp = NULL, c_cp = match->checkpoint_list;
+	     c_cp && (c_cp->requester != left->nodeid);
+	     p_cp = c_cp, c_cp = c_cp->next);
+	if (c_cp) {
+		if (p_cp)
+			p_cp->next = c_cp->next;
+		else
+			match->checkpoint_list = c_cp->next;
+
+		LOG_COND(log_checkpoint,
+			 "[%s] Removing pending checkpoint (%u is leaving)",
+			 SHORT_UUID(match->name.value), left->nodeid);
+		free_checkpoint(c_cp);
+	}
+	list_for_each_safe(p, n, &match->startup_list) {
+		tfr = (struct clog_tfr *)p;
+		if ((tfr->request_type == DM_CLOG_MEMBER_JOIN) &&
+		    (tfr->originator == left->nodeid)) {
+			LOG_COND(log_checkpoint,
+				 "[%s] Removing pending ckpt from startup list (%u is leaving)",
+				 SHORT_UUID(match->name.value), left->nodeid);
+			list_del_init(p);
+			free(tfr);
+		}
+	}
+	for (i = 0, j = 0; i < match->checkpoints_needed; i++, j++) {
+		match->checkpoint_requesters[j] = match->checkpoint_requesters[i];
+		if (match->checkpoint_requesters[i] == left->nodeid) {
+			LOG_ERROR("[%s] Removing pending ckpt from needed list (%u is leaving)",
+				  SHORT_UUID(match->name.value), left->nodeid);
+			j--;
+		}
+	}
+	match->checkpoints_needed = j;
+
 	if (left->nodeid < my_cluster_id) {
 		match->delay = (match->delay > 0) ? match->delay - 1 : 0;
 		if (!match->delay && list_empty(&match->working_list))
@@ -1379,9 +1464,7 @@ int create_cluster_cpg(char *str)
 	new->name.length = size;
 
 	/*
-	 * Look for checkpoints before joining to see if
-	 * someone wrote a checkpoint after I left a previous
-	 * session.
+	 * Ensure there are no stale checkpoints around before we join
 	 */
 	if (remove_checkpoint(new) == 1)
 		LOG_COND(log_checkpoint,
@@ -1437,6 +1520,7 @@ static void abort_startup(struct clog_cpg *del)
 static int _destroy_cluster_cpg(struct clog_cpg *del)
 {
 	int r;
+	int state;
 
 	LOG_COND(log_resend_requests, "[%s] I am leaving.2.....",
 		 SHORT_UUID(del->name.value));
@@ -1445,13 +1529,27 @@ static int _destroy_cluster_cpg(struct clog_cpg *del)
 	 * We must send any left over checkpoints before
 	 * leaving. If we don't, an incoming node could
 	 * be stuck with no checkpoint and stall.
+	   do_checkpoints(del); --- THIS COULD BE CAUSING OUR PROBLEMS:
+
+	   - Incoming node deletes old checkpoints before joining
+	   - A stale checkpoint is issued here by leaving node
+	   - (leaving node leaves)
+	   - Incoming node joins cluster and finds stale checkpoint.
+	   - (leaving node leaves - option 2)
 	 */
-	do_checkpoints(del);
+	do_checkpoints(del, 1);
 
+	state = del->state;
+
 	del->cpg_state = INVALID;
 	del->state = LEAVING;
 
-	if (!list_empty(&del->startup_list))
+	/*
+	 * If the state is VALID, we might be processing the
+	 * startup list. If so, we certainly don't want to
+	 * clear the startup_list here by calling abort_startup
+	 */
+	if (!list_empty(&del->startup_list) && (state != VALID))
 		abort_startup(del);
 
 	r = cpg_leave(del->handle, &del->name);
@@ -1473,13 +1571,11 @@ int destroy_cluster_cpg(char *str)
 
 int init_cluster(void)
 {
+	int i;
 	SaAisErrorT rv;
 
-	{
-		int i;
 	for (i = 0; i < DEBUGGING_HISTORY; i++)
 		debugging[i][0] = '\0';
-	}
 
 	INIT_LIST_HEAD(&clog_cpg_list);
 	rv = saCkptInitialize(&ckpt_handle, &callbacks, &version);
@@ -11,6 +11,7 @@
 #include <linux/kdev_t.h>
 #define __USE_GNU /* for O_DIRECT */
 #include <fcntl.h>
+#include <time.h>
 #include "linux/dm-clog-tfr.h"
 #include "list.h"
 #include "functions.h"
@@ -50,6 +51,7 @@ struct log_c {
 	char uuid[DM_UUID_LEN];
 	uint32_t ref_count;
 
+	time_t delay; /* limits how fast a resume can happen after suspend */
 	int touched;
 	uint32_t region_size;
 	uint32_t region_count;
@@ -60,6 +62,7 @@ struct log_c {
 	uint32_t *sync_bits;
 	uint32_t recoverer;
 	uint64_t recovering_region; /* -1 means not recovering */
+	uint64_t skip_bit_warning; /* used to warn if region skipped */
 	int sync_search;
 
 	int resume_override;
@@ -429,6 +432,7 @@ static int _clog_ctr(int argc, char **argv, uint64_t device_size)
 	lc->block_on_error = block_on_error;
 	lc->sync_search = 0;
 	lc->recovering_region = (uint64_t)-1;
+	lc->skip_bit_warning = region_count;
 	lc->disk_fd = -1;
 	lc->log_dev_failed = 0;
 	lc->ref_count = 1;
@@ -645,7 +649,6 @@ static int clog_presuspend(struct clog_tfr *tfr)
 	if (lc->touched)
 		LOG_DBG("WARNING: log still marked as 'touched' during suspend");
 
-	lc->state = LOG_SUSPENDED;
 	lc->recovery_halted = 1;
 
 	return 0;
@@ -666,8 +669,10 @@ static int clog_postsuspend(struct clog_tfr *tfr)
 	LOG_DBG("[%s] clog_postsuspend: leaving CPG", SHORT_UUID(lc->uuid));
 	destroy_cluster_cpg(tfr->uuid);
 
+	lc->state = LOG_SUSPENDED;
 	lc->recovering_region = (uint64_t)-1;
 	lc->recoverer = (uint32_t)-1;
+	lc->delay = time(NULL);
 
 	return 0;
 }
@@ -714,6 +719,9 @@ static int clog_resume(struct clog_tfr *tfr)
 	case 1000:
 		LOG_ERROR("[%s] Additional resume issued before suspend",
 			  SHORT_UUID(tfr->uuid));
+#ifdef DEBUG
+		kill(getpid(), SIGUSR1);
+#endif
 		return 0;
 	case 0:
 		lc->resume_override = 1000;
@@ -806,7 +814,7 @@ out:
 
 	lc->sync_count = count_bits32(lc->sync_bits, lc->bitset_uint32_count);
 
-	LOG_DBG("[%s] Initial sync_count = %llu",
+	LOG_SPRINT("[%s] Initial sync_count = %llu",
 		SHORT_UUID(lc->uuid), (unsigned long long)lc->sync_count);
 	lc->sync_search = 0;
 	lc->state = LOG_RESUMED;
@@ -826,6 +834,7 @@ out:
 int local_resume(struct clog_tfr *tfr)
 {
 	int r;
+	time_t t;
 	struct log_c *lc = get_log(tfr->uuid);
 
 	if (!lc) {
@@ -836,6 +845,34 @@ int local_resume(struct clog_tfr *tfr)
 		return -EINVAL;
 	}
 
+	t = time(NULL);
+	t -= lc->delay;
+	/*
+	 * This should be considered a temporary fix. It addresses
+	 * a problem that exists when nodes suspend/resume in rapid
+	 * succession. While the problem is very rare, it has been
+	 * seen to happen in real-world-like testing.
+	 *
+	 * The problem:
+	 * - Node A joins cluster
+	 * - Node B joins cluster
+	 * - Node A prepares checkpoint
+	 * - Node A gets ready to write checkpoint
+	 * - Node B leaves
+	 * - Node B joins
+	 * - Node A finishes write of checkpoint
+	 * - Node B receives checkpoint meant for previous session
+	 * -- Node B can now be non-coherent
+	 *
+	 * This timer will solve the problem for now, but could be
+	 * replaced by a generation number sent with the resume
+	 * command from the kernel. The generation number would
+	 * be included in the name of the checkpoint to prevent
+	 * reading stale data.
+	 */
+	if ((t < 3) && (t >= 0))
+		sleep(3 - t);
+
 	/* Join the CPG */
 	r = create_cluster_cpg(tfr->uuid);
 	if (r) {
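As a worked example of the back-off added above: clog_postsuspend() (earlier in this diff) stamps lc->delay with time(NULL) when the log leaves the CPG. If that suspend happened at, say, second 1000 and the matching resume reaches local_resume() at second 1001, then t = 1 and the daemon sleeps 3 - 1 = 2 seconds before rejoining via create_cluster_cpg(); if three or more seconds have already passed, or the clock moved backwards and t is negative, it rejoins immediately. The numbers are illustrative only.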
@@ -1155,6 +1192,7 @@ static int clog_get_resync_work(struct clog_tfr *tfr)
 			   (unsigned long long)lc->recovering_region);
 		pkg->r = lc->recovering_region;
 		pkg->i = 1;
+		LOG_COND(log_resend_requests, "***** RE-REQUEST *****");
 	} else {
 		LOG_SPRINT("GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
 			   "Someone already recovering (%llu)",
@@ -1233,10 +1271,30 @@ static int clog_set_region_sync(struct clog_tfr *tfr)
 		} else {
 			log_set_bit(lc, lc->sync_bits, pkg->region);
 			lc->sync_count++;
+
+			/* The rest of this section is all for debugging */
 			LOG_SPRINT("SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
 				   "Setting region (%llu)",
 				   tfr->seq, SHORT_UUID(lc->uuid), tfr->originator,
 				   (unsigned long long)pkg->region);
+			if (pkg->region == lc->skip_bit_warning)
+				lc->skip_bit_warning = lc->region_count;
+
+			if (pkg->region > (lc->skip_bit_warning + 5)) {
+				LOG_ERROR("*** Region #%llu skipped during recovery ***",
+					  (unsigned long long)lc->skip_bit_warning);
+				lc->skip_bit_warning = lc->region_count;
+#ifdef DEBUG
+				kill(getpid(), SIGUSR1);
+#endif
+			}
+
+			if (!log_test_bit(lc->sync_bits,
+					  (pkg->region) ? pkg->region - 1 : 0)) {
+				LOG_SPRINT("*** Previous bit not set ***");
+				lc->skip_bit_warning = (pkg->region) ?
+					pkg->region - 1 : 0;
+			}
 		}
 	} else if (log_test_bit(lc->sync_bits, pkg->region)) {
 		lc->sync_count--;
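To make the skipped-region tracking above concrete: lc->skip_bit_warning starts at region_count (an impossible region number), so the warning cannot fire until a gap is actually observed. If, for example, region 7 is marked in-sync while bit 6 is still clear, skip_bit_warning is parked at 6; should region 6 later be set, the sentinel is restored to region_count, but if recovery instead jumps ahead and sets region 12 (more than 5 past the remembered gap), the "Region #6 skipped during recovery" error is logged and, in DEBUG builds, SIGUSR1 is raised to capture the debug history. The specific region numbers here are only an illustration, not values from the commit.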
@@ -1254,6 +1312,9 @@ static int clog_set_region_sync(struct clog_tfr *tfr)
 			   "sync_count(%llu) != bitmap count(%llu)",
 			   tfr->seq, SHORT_UUID(lc->uuid), tfr->originator,
 			   (unsigned long long)lc->sync_count, reset);
+#ifdef DEBUG
+		kill(getpid(), SIGUSR1);
+#endif
 		lc->sync_count = reset;
 	}
 
@@ -1291,6 +1352,19 @@ static int clog_get_sync_count(struct clog_tfr *tfr)
 
 	tfr->data_size = sizeof(*sync_count);
 
+	if (lc->sync_count != count_bits32(lc->sync_bits, lc->bitset_uint32_count)) {
+		unsigned long long reset = count_bits32(lc->sync_bits, lc->bitset_uint32_count);
+
+		LOG_SPRINT("get_sync_count - SEQ#=%u, UUID=%s, nodeid = %u:: "
+			   "sync_count(%llu) != bitmap count(%llu)",
+			   tfr->seq, SHORT_UUID(lc->uuid), tfr->originator,
+			   (unsigned long long)lc->sync_count, reset);
+#ifdef DEBUG
+		kill(getpid(), SIGUSR1);
+#endif
+		lc->sync_count = reset;
+	}
+
 	return 0;
 }
 
@@ -1593,7 +1667,7 @@ static void print_bits(char *buf, int size, int print)
 }
 
 /* int store_bits(const char *uuid, const char *which, char **buf)*/
-int push_state(const char *uuid, const char *which, char **buf)
+int push_state(const char *uuid, const char *which, char **buf, uint32_t debug_who)
 {
 	int bitset_size;
 	struct log_c *lc;
@@ -1614,10 +1688,12 @@ int push_state(const char *uuid, const char *which, char **buf)
 	sprintf(*buf, "%llu %u", (unsigned long long)lc->recovering_region,
 		lc->recoverer);
 
-	LOG_SPRINT("CKPT SEND - SEQ#=X, UUID=%s, nodeid = X:: "
-		   "recovering_region=%llu, recoverer=%u",
-		   SHORT_UUID(lc->uuid),
-		   (unsigned long long)lc->recovering_region, lc->recoverer);
+	LOG_SPRINT("CKPT SEND - SEQ#=X, UUID=%s, nodeid = %u:: "
+		   "recovering_region=%llu, recoverer=%u, sync_count=%llu",
+		   SHORT_UUID(lc->uuid), debug_who,
+		   (unsigned long long)lc->recovering_region,
+		   lc->recoverer,
+		   (unsigned long long)count_bits32(lc->sync_bits, lc->bitset_uint32_count));
 	return 64;
 }
 
@@ -10,7 +10,7 @@ int local_resume(struct clog_tfr *tfr);
 int cluster_postsuspend(char *);
 
 int do_request(struct clog_tfr *tfr, int server);
-int push_state(const char *uuid, const char *which, char **buf);
+int push_state(const char *uuid, const char *which, char **buf, uint32_t debug_who);
 int pull_state(const char *uuid, const char *which, char *buf, int size);
 
 int log_get_state(struct clog_tfr *tfr);