glusterd: compare friend data within mutex

During a friend handshake, if glusterd receives more than one friend
update, two threads can end up working on two different volinfo
references, and glusterd may then update the store with an old volinfo
reference. This was observed while debugging a glusterd crash in the
validating-server-quorum.t test file during the line-coverage
regression run.

The solution is to run glusterd_compare_friend_data under a mutex
(conf->import_volumes).

Test:

As the crash was more visible in the line-coverage run (lcov adds
instrumentation, which exposes the races), 6 manual lcov runs were
triggered, starting from https://build.gluster.org/job/line-coverage/443
to https://build.gluster.org/job/line-coverage/449/, and no crash was
observed in validating-server-quorum.t.

Change-Id: I86fce473a76fd24742d51bf17a685d28b90a8941
Fixes: bz#1603063
Signed-off-by: Atin Mukherjee <amukherj@redhat.com>
This commit is contained in:
Atin Mukherjee 2018-08-10 09:12:05 +05:30
parent 48b93c292c
commit 29d5557854
3 changed files with 48 additions and 41 deletions

View File

@ -937,54 +937,59 @@ glusterd_ac_handle_friend_add_req (glusterd_friend_sm_event_t *event, void *ctx)
*/
//Build comparison logic here.
ret = glusterd_compare_friend_data (ev_ctx->vols, &status,
event->peername);
if (ret)
goto out;
if (GLUSTERD_VOL_COMP_RJT != status) {
event_type = GD_FRIEND_EVENT_LOCAL_ACC;
op_ret = 0;
} else {
event_type = GD_FRIEND_EVENT_LOCAL_RJT;
op_errno = GF_PROBE_VOLUME_CONFLICT;
op_ret = -1;
}
/* Compare missed_snapshot list with the peer *
* if volume comparison is successful */
if ((op_ret == 0) &&
(conf->op_version >= GD_OP_VERSION_3_6_0)) {
ret = glusterd_import_friend_missed_snap_list (ev_ctx->vols);
pthread_mutex_lock (&conf->import_volumes);
{
ret = glusterd_compare_friend_data (ev_ctx->vols, &status,
event->peername);
if (ret) {
gf_msg (this->name, GF_LOG_ERROR, 0,
GD_MSG_MISSED_SNAP_LIST_STORE_FAIL,
"Failed to import peer's "
"missed_snaps_list.");
pthread_mutex_unlock (&conf->import_volumes);
goto out;
}
if (GLUSTERD_VOL_COMP_RJT != status) {
event_type = GD_FRIEND_EVENT_LOCAL_ACC;
op_ret = 0;
} else {
event_type = GD_FRIEND_EVENT_LOCAL_RJT;
op_errno = GF_PROBE_MISSED_SNAP_CONFLICT;
op_errno = GF_PROBE_VOLUME_CONFLICT;
op_ret = -1;
}
/* glusterd_compare_friend_snapshots and functions only require
* a peers hostname and uuid. It also does updates, which
* require use of synchronize_rcu. So we pass the hostname and
* id from the event instead of the peerinfo object to prevent
* deadlocks as above.
*/
ret = glusterd_compare_friend_snapshots (ev_ctx->vols,
event->peername,
event->peerid);
if (ret) {
gf_msg (this->name, GF_LOG_ERROR, 0,
GD_MSG_SNAP_COMPARE_CONFLICT,
"Conflict in comparing peer's snapshots");
event_type = GD_FRIEND_EVENT_LOCAL_RJT;
op_errno = GF_PROBE_SNAP_CONFLICT;
op_ret = -1;
/* Compare missed_snapshot list with the peer *
* if volume comparison is successful */
if ((op_ret == 0) &&
(conf->op_version >= GD_OP_VERSION_3_6_0)) {
ret = glusterd_import_friend_missed_snap_list (ev_ctx->vols);
if (ret) {
gf_msg (this->name, GF_LOG_ERROR, 0,
GD_MSG_MISSED_SNAP_LIST_STORE_FAIL,
"Failed to import peer's "
"missed_snaps_list.");
event_type = GD_FRIEND_EVENT_LOCAL_RJT;
op_errno = GF_PROBE_MISSED_SNAP_CONFLICT;
op_ret = -1;
}
/* glusterd_compare_friend_snapshots and functions only require
* a peers hostname and uuid. It also does updates, which
* require use of synchronize_rcu. So we pass the hostname and
* id from the event instead of the peerinfo object to prevent
* deadlocks as above.
*/
ret = glusterd_compare_friend_snapshots (ev_ctx->vols,
event->peername,
event->peerid);
if (ret) {
gf_msg (this->name, GF_LOG_ERROR, 0,
GD_MSG_SNAP_COMPARE_CONFLICT,
"Conflict in comparing peer's snapshots");
event_type = GD_FRIEND_EVENT_LOCAL_RJT;
op_errno = GF_PROBE_SNAP_CONFLICT;
op_ret = -1;
}
}
}
pthread_mutex_unlock (&conf->import_volumes);
ret = glusterd_friend_sm_new_event (event_type, &new_event);
if (ret) {

View File

@ -1854,6 +1854,7 @@ init (xlator_t *this)
synclock_init (&conf->big_lock, SYNC_LOCK_RECURSIVE);
pthread_mutex_init (&conf->xprt_lock, NULL);
INIT_LIST_HEAD (&conf->xprt_list);
pthread_mutex_init (&conf->import_volumes, NULL);
glusterd_friend_sm_init ();
glusterd_op_sm_init ();

View File

@ -162,6 +162,7 @@ typedef struct {
struct cds_list_head brick_procs; /* List of brick processes */
pthread_mutex_t xprt_lock;
struct list_head xprt_list;
pthread_mutex_t import_volumes;
gf_store_handle_t *handle;
gf_timer_t *timer;
glusterd_sm_tr_log_t op_sm_log;