rpc: fix for race between rpc and protocol/client

It is possible that the notification thread which notifies
protocol/client layer about the disconnection is put to sleep
and meanwhile, a fuse thread or a timer thread initiates and
completes reconnection to the brick. The notification thread
is then woken up and protocol/client layer updates its flags
to indicate that network is disconnected. No reconnection is
initiated because reconnection is rpc-lib layer's responsibility
and its flags indicate that connection is connected.

Fix: Serialize connect and disconnect notify

Credit: Raghavendra Talur <rtalur@redhat.com>
Change-Id: I8ff5d1a3283b47f5c26848a42016a40bc34ffc1d
BUG: 1386626
Signed-off-by: Rajesh Joseph <rjoseph@redhat.com>
Reviewed-on: http://review.gluster.org/15916
Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
Smoke: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Raghavendra Talur <rtalur@redhat.com>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
This commit is contained in:
Rajesh Joseph 2016-12-03 01:10:51 +05:30 committed by Atin Mukherjee
parent 45f914ec9c
commit aa22f24f5d
2 changed files with 58 additions and 39 deletions

View File

@ -871,6 +871,41 @@ rpc_clnt_destroy (struct rpc_clnt *rpc);
#define RPC_THIS_RESTORE (THIS = old_THIS)
static int
rpc_clnt_handle_disconnect (struct rpc_clnt *clnt, rpc_clnt_connection_t *conn)
{
struct timespec ts = {0, };
gf_boolean_t unref_clnt = _gf_false;
rpc_clnt_connection_cleanup (conn);
pthread_mutex_lock (&conn->lock);
{
if (!conn->rpc_clnt->disabled && (conn->reconnect == NULL)) {
ts.tv_sec = 10;
ts.tv_nsec = 0;
rpc_clnt_ref (clnt);
conn->reconnect = gf_timer_call_after (clnt->ctx, ts,
rpc_clnt_reconnect, conn);
if (conn->reconnect == NULL) {
gf_log (conn->name, GF_LOG_WARNING,
"Cannot create rpc_clnt_reconnect timer");
unref_clnt = _gf_true;
}
}
}
pthread_mutex_unlock (&conn->lock);
if (clnt->notifyfn)
clnt->notifyfn (clnt, clnt->mydata, RPC_CLNT_DISCONNECT, NULL);
if (unref_clnt)
rpc_clnt_ref (clnt);
return 0;
}
int
rpc_clnt_notify (rpc_transport_t *trans, void *mydata,
rpc_transport_event_t event, void *data, ...)
@ -880,9 +915,7 @@ rpc_clnt_notify (rpc_transport_t *trans, void *mydata,
int ret = -1;
rpc_request_info_t *req_info = NULL;
rpc_transport_pollin_t *pollin = NULL;
struct timespec ts = {0, };
void *clnt_mydata = NULL;
gf_boolean_t unref_clnt = _gf_false;
DECLARE_OLD_THIS;
conn = mydata;
@ -898,35 +931,11 @@ rpc_clnt_notify (rpc_transport_t *trans, void *mydata,
switch (event) {
case RPC_TRANSPORT_DISCONNECT:
{
rpc_clnt_connection_cleanup (conn);
pthread_mutex_lock (&conn->lock);
pthread_mutex_lock (&clnt->notifylock);
{
if (!conn->rpc_clnt->disabled
&& (conn->reconnect == NULL)) {
ts.tv_sec = 10;
ts.tv_nsec = 0;
rpc_clnt_ref (clnt);
conn->reconnect =
gf_timer_call_after (clnt->ctx, ts,
rpc_clnt_reconnect,
conn);
if (conn->reconnect == NULL) {
gf_log (conn->name, GF_LOG_WARNING,
"Cannot create rpc_clnt_reconnect timer");
unref_clnt = _gf_true;
}
}
rpc_clnt_handle_disconnect (clnt, conn);
}
pthread_mutex_unlock (&conn->lock);
if (clnt->notifyfn)
ret = clnt->notifyfn (clnt, clnt->mydata,
RPC_CLNT_DISCONNECT, NULL);
if (unref_clnt)
rpc_clnt_ref (clnt);
pthread_mutex_unlock (&clnt->notifylock);
break;
}
@ -981,17 +990,21 @@ rpc_clnt_notify (rpc_transport_t *trans, void *mydata,
case RPC_TRANSPORT_CONNECT:
{
/* Every time there is a disconnection, processes
should try to connect to 'glusterd' (ie, default
port) or whichever port given as 'option remote-port'
in volume file. */
/* Below code makes sure the (re-)configured port lasts
for just one successful attempt */
conn->config.remote_port = 0;
pthread_mutex_lock (&clnt->notifylock);
{
/* Every time there is a disconnection, processes
* should try to connect to 'glusterd' (ie, default
* port) or whichever port given as 'option remote-port'
* in volume file. */
/* Below code makes sure the (re-)configured port lasts
* for just one successful attempt */
conn->config.remote_port = 0;
if (clnt->notifyfn)
ret = clnt->notifyfn (clnt, clnt->mydata,
RPC_CLNT_CONNECT, NULL);
if (clnt->notifyfn)
ret = clnt->notifyfn (clnt, clnt->mydata,
RPC_CLNT_CONNECT, NULL);
}
pthread_mutex_unlock (&clnt->notifylock);
break;
}
@ -1115,6 +1128,7 @@ rpc_clnt_new (dict_t *options, xlator_t *owner, char *name,
}
pthread_mutex_init (&rpc->lock, NULL);
pthread_mutex_init (&rpc->notifylock, NULL);
rpc->ctx = ctx;
rpc->owner = owner;
@ -1124,6 +1138,7 @@ rpc_clnt_new (dict_t *options, xlator_t *owner, char *name,
rpc->reqpool = mem_pool_new (struct rpc_req, reqpool_size);
if (rpc->reqpool == NULL) {
pthread_mutex_destroy (&rpc->lock);
pthread_mutex_destroy (&rpc->notifylock);
GF_FREE (rpc);
rpc = NULL;
goto out;
@ -1133,6 +1148,7 @@ rpc_clnt_new (dict_t *options, xlator_t *owner, char *name,
reqpool_size);
if (rpc->saved_frames_pool == NULL) {
pthread_mutex_destroy (&rpc->lock);
pthread_mutex_destroy (&rpc->notifylock);
mem_pool_destroy (rpc->reqpool);
GF_FREE (rpc);
rpc = NULL;
@ -1142,6 +1158,7 @@ rpc_clnt_new (dict_t *options, xlator_t *owner, char *name,
ret = rpc_clnt_connection_init (rpc, ctx, options, name);
if (ret == -1) {
pthread_mutex_destroy (&rpc->lock);
pthread_mutex_destroy (&rpc->notifylock);
mem_pool_destroy (rpc->reqpool);
mem_pool_destroy (rpc->saved_frames_pool);
GF_FREE (rpc);
@ -1737,6 +1754,7 @@ rpc_clnt_destroy (struct rpc_clnt *rpc)
saved_frames_destroy (rpc->conn.saved_frames);
pthread_mutex_destroy (&rpc->lock);
pthread_mutex_destroy (&rpc->conn.lock);
pthread_mutex_destroy (&rpc->notifylock);
/* mem-pool should be destroyed, otherwise,
it will cause huge memory leaks */

View File

@ -172,6 +172,7 @@ struct rpc_req {
typedef struct rpc_clnt {
pthread_mutex_t lock;
pthread_mutex_t notifylock;
rpc_clnt_notify_t notifyfn;
rpc_clnt_connection_t conn;
void *mydata;