core: glusterfsd keeping fd open in index xlator
Problem: While processing GF_EVENT_PARENT_DOWN, a brick xlator forwards the event to the next xlator only after ensuring that no stub is still in progress. At the io-threads xlator, however, the counter was decremented before the stub was processed, so the event was notified to the next xlator too early.

Solution: Introduce a new counter (stub_cnt) and decrement it only after the stub has been processed completely in the io-threads xlator. To avoid a brick crash when xlator_mem_cleanup is called, move only the brick xlator whose name matches the detached brick found in the graph.

Note: Thanks to Pranith for sharing a simple reproducer.

fixes bz#1637934
Change-Id: I1a694a001f7a5417e8771e3adf92c518969b6baa
Signed-off-by: Mohit Agrawal <moagrawal@redhat.com>
parent 777412c5e8
commit 7bf95631b5
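The io-threads part of the fix boils down to an ordering rule: the in-flight stub counter is decremented only after the stub has been resumed, and PARENT_DOWN is forwarded only once that counter has drained to zero. The standalone C sketch below illustrates that rule under stated assumptions: it uses stdatomic/pthreads instead of gluster's GF_ATOMIC wrappers and iot_conf fields, and names such as process_stub() and drain_stubs() are illustrative stand-ins, not glusterfs APIs.

/* Minimal sketch (not glusterfs code) of the ordering enforced by this
 * patch: a worker drops the in-flight counter only AFTER the stub is
 * fully processed, and the PARENT_DOWN path waits for the counter to
 * reach zero before forwarding the event. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

static atomic_int stub_cnt;                 /* stubs enqueued but not yet done */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

static void process_stub(int id)            /* stand-in for call_resume(stub) */
{
    printf("processing stub %d\n", id);
}

static void *worker(void *arg)
{
    (void)arg;
    for (int id = 0; id < 8; id++) {
        process_stub(id);                   /* finish the stub first ...      */
        atomic_fetch_sub(&stub_cnt, 1);     /* ... then decrement the counter */
        pthread_mutex_lock(&lock);
        pthread_cond_signal(&cond);         /* wake the drain waiter, if any  */
        pthread_mutex_unlock(&lock);
    }
    return NULL;
}

/* What notify() does for GF_EVENT_PARENT_DOWN before forwarding the event. */
static void drain_stubs(void)
{
    struct timespec till;

    clock_gettime(CLOCK_REALTIME, &till);
    till.tv_sec += 1;                       /* bounded wait, as in the patch  */
    pthread_mutex_lock(&lock);
    while (atomic_load(&stub_cnt) != 0)
        (void)pthread_cond_timedwait(&cond, &lock, &till);
    pthread_mutex_unlock(&lock);
}

int main(void)
{
    pthread_t t;

    atomic_store(&stub_cnt, 8);             /* pretend 8 stubs are in flight   */
    pthread_create(&t, NULL, worker, NULL);
    drain_stubs();                          /* now safe to forward PARENT_DOWN */
    printf("stubs drained, forwarding PARENT_DOWN\n");
    pthread_join(t, NULL);
    return 0;
}

Had the counter been dropped before process_stub() returned, drain_stubs() could fall through while a stub (and its fds) was still being worked on, which is exactly the window the index-xlator fd leak depended on.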
@@ -1051,6 +1051,7 @@ xlator_mem_cleanup(xlator_t *this)
         top = ctx->active->first;
         LOCK(&ctx->volfile_lock);
         /* TODO here we have leak for xlator node in a graph */
+        /* Need to move only top xlator from a graph */
         for (trav_p = &top->children; *trav_p; trav_p = &(*trav_p)->next) {
             victim = (*trav_p)->xlator;
             if (victim->call_cleanup && !strcmp(victim->name, this->name)) {
@@ -1058,19 +1059,6 @@ xlator_mem_cleanup(xlator_t *this)
                 break;
             }
         }
-        /* TODO Sometime brick xlator is not moved from graph so followed below
-           approach to move brick xlator from a graph, will move specific brick
-           xlator from graph only while inode table and mem_acct are cleaned up
-        */
-        trav_p = &top->children;
-        while (*trav_p) {
-            victim = (*trav_p)->xlator;
-            if (victim->call_cleanup && !victim->itable && !victim->mem_acct) {
-                (*trav_p) = (*trav_p)->next;
-            } else {
-                trav_p = &(*trav_p)->next;
-            }
-        }
         UNLOCK(&ctx->volfile_lock);
     }
 }
tests/bugs/glusterd/brick-mux-fd-cleanup.t (new file, +78 lines)
@@ -0,0 +1,78 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+
+#This .t tests that the fds from client are closed on brick when gluster volume
+#stop is executed in brick-mux setup.
+
+cleanup;
+TEST glusterd
+TEST pidof glusterd
+
+function keep_fd_open {
+    #This function has to be run as background job because opening the fd in
+    #foreground and running commands is leading to flush calls on these fds
+    #which is making it very difficult to create the race where fds will be left
+    #open even after the brick dies.
+    exec 5>$M1/a
+    exec 6>$M1/b
+    while [ -f $M0/a ]; do sleep 1; done
+}
+
+function count_open_files {
+    local brick_pid="$1"
+    local pattern="$2"
+    ls -l /proc/$brick_pid/fd | grep -i "$pattern" | wc -l
+}
+
+TEST $CLI volume set all cluster.brick-multiplex on
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1}
+TEST $CLI volume create $V1 replica 2 $H0:$B0/${V1}{2,3}
+#Have same configuration on both bricks so that they are multiplexed
+#Delay flush fop for a second
+TEST $CLI volume heal $V0 disable
+TEST $CLI volume heal $V1 disable
+TEST $CLI volume set $V0 delay-gen posix
+TEST $CLI volume set $V0 delay-gen.enable flush
+TEST $CLI volume set $V0 delay-gen.delay-percentage 100
+TEST $CLI volume set $V0 delay-gen.delay-duration 1000000
+TEST $CLI volume set $V1 delay-gen posix
+TEST $CLI volume set $V1 delay-gen.enable flush
+TEST $CLI volume set $V1 delay-gen.delay-percentage 100
+TEST $CLI volume set $V1 delay-gen.delay-duration 1000000
+
+TEST $CLI volume start $V0
+TEST $CLI volume start $V1
+
+TEST $GFS -s $H0 --volfile-id=$V0 --direct-io-mode=enable $M0
+TEST $GFS -s $H0 --volfile-id=$V1 --direct-io-mode=enable $M1
+
+TEST touch $M0/a
+keep_fd_open &
+TEST $CLI volume profile $V1 start
+brick_pid=$(get_brick_pid $V1 $H0 $B0/${V1}2)
+TEST count_open_files $brick_pid "$B0/${V1}2/a"
+TEST count_open_files $brick_pid "$B0/${V1}2/b"
+TEST count_open_files $brick_pid "$B0/${V1}3/a"
+TEST count_open_files $brick_pid "$B0/${V1}3/b"
+
+#If any other flush fops are introduced into the system other than the one at
+#cleanup it interferes with the race, so test for it
+EXPECT "^0$" echo "$($CLI volume profile $V1 info incremental | grep -i flush | wc -l)"
+#Stop the volume
+TEST $CLI volume stop $V1
+
+#Wait for cleanup resources or volume V1
+EXPECT_WITHIN $GRAPH_SWITCH_TIMEOUT "^0$" count_open_files $brick_pid "$B0/${V1}2/a"
+EXPECT_WITHIN $GRAPH_SWITCH_TIMEOUT "^0$" count_open_files $brick_pid "$B0/${V1}2/b"
+EXPECT_WITHIN $GRAPH_SWITCH_TIMEOUT "^0$" count_open_files $brick_pid "$B0/${V1}3/a"
+EXPECT_WITHIN $GRAPH_SWITCH_TIMEOUT "^0$" count_open_files $brick_pid "$B0/${V1}3/b"
+
+TEST rm -f $M0/a #Exit keep_fd_open()
+wait
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M1
+
+cleanup
@@ -123,7 +123,7 @@ __iot_dequeue(iot_conf_t *conf, int *pri)
     if (!stub)
         return NULL;
 
-    GF_ATOMIC_DEC(conf->queue_size);
+    conf->queue_size--;
     conf->queue_sizes[*pri]--;
 
     return stub;
@@ -155,7 +155,8 @@ __iot_enqueue(iot_conf_t *conf, call_stub_t *stub, int pri)
     }
     list_add_tail(&stub->list, &ctx->reqs);
 
-    GF_ATOMIC_INC(conf->queue_size);
+    conf->queue_size++;
+    GF_ATOMIC_INC(conf->stub_cnt);
     conf->queue_sizes[pri]++;
 }
 
@@ -183,7 +184,7 @@ iot_worker(void *data)
                 conf->ac_iot_count[pri]--;
                 pri = -1;
             }
-            while (GF_ATOMIC_GET(conf->queue_size) == 0) {
+            while (conf->queue_size == 0) {
                 if (conf->down) {
                     bye = _gf_true; /*Avoid sleep*/
                     break;
@@ -230,6 +231,7 @@ iot_worker(void *data)
             } else {
                 call_resume(stub);
             }
+            GF_ATOMIC_DEC(conf->stub_cnt);
         }
         stub = NULL;
 
@@ -837,9 +839,9 @@ __iot_workers_scale(iot_conf_t *conf)
                                thread_name);
         if (ret == 0) {
             conf->curr_count++;
-            gf_msg_debug(
-                conf->this->name, 0, "scaled threads to %d (queue_size=%ld/%d)",
-                conf->curr_count, GF_ATOMIC_GET(conf->queue_size), scale);
+            gf_msg_debug(conf->this->name, 0,
+                         "scaled threads to %d (queue_size=%d/%d)",
+                         conf->curr_count, conf->queue_size, scale);
         } else {
             break;
         }
@@ -1231,7 +1233,7 @@ init(xlator_t *this)
     GF_OPTION_INIT("pass-through", this->pass_through, bool, out);
 
     conf->this = this;
-    GF_ATOMIC_INIT(conf->queue_size, 0);
+    GF_ATOMIC_INIT(conf->stub_cnt, 0);
 
     for (i = 0; i < GF_FOP_PRI_MAX; i++) {
         INIT_LIST_HEAD(&conf->clients[i]);
@@ -1282,7 +1284,7 @@ notify(xlator_t *this, int32_t event, void *data, ...)
 {
     iot_conf_t *conf = this->private;
     xlator_t *victim = data;
-    uint64_t queue_size = 0;
+    uint64_t stub_cnt = 0;
     struct timespec sleep_till = {
         0,
     };
@@ -1292,14 +1294,14 @@ notify(xlator_t *this, int32_t event, void *data, ...)
             clock_gettime(CLOCK_REALTIME, &sleep_till);
             sleep_till.tv_sec += 1;
             /* Wait for draining stub from queue before notify PARENT_DOWN */
-            queue_size = GF_ATOMIC_GET(conf->queue_size);
+            stub_cnt = GF_ATOMIC_GET(conf->stub_cnt);
 
             pthread_mutex_lock(&conf->mutex);
             {
-                while (queue_size) {
+                while (stub_cnt) {
                     (void)pthread_cond_timedwait(&conf->cond, &conf->mutex,
                                                  &sleep_till);
-                    queue_size = GF_ATOMIC_GET(conf->queue_size);
+                    stub_cnt = GF_ATOMIC_GET(conf->stub_cnt);
                 }
             }
             pthread_mutex_unlock(&conf->mutex);
@@ -63,7 +63,8 @@ struct iot_conf {
     int32_t ac_iot_limit[GF_FOP_PRI_MAX];
     int32_t ac_iot_count[GF_FOP_PRI_MAX];
     int queue_sizes[GF_FOP_PRI_MAX];
-    gf_atomic_t queue_size;
+    int32_t queue_size;
+    gf_atomic_t stub_cnt;
     pthread_attr_t w_attr;
     gf_boolean_t least_priority; /*Enable/Disable least-priority */
 