features/shard: Ref shard inode while adding to fsync list

PROBLEM:

Lot of the earlier changes in the management of shards in lru, fsync
lists assumed that if a given shard exists in fsync list, it must be
part of lru list as well. This was found to be not true.

Consider this - a file is FALLOCATE'd to a size which would make the
number of participant shards to be greater than the lru list size.
In this case, some of the resolved shards that are to participate in
this fop will be evicted from lru list to give way to the rest of the
shards. And once FALLOCATE completes, these shards are added to fsync
list but without a ref. After the fop completes, these shard inodes
are unref'd and destroyed while their inode ctxs are still part of
fsync list. Now when an FSYNC is called on the base file and the
fsync-list traversed, the client crashes due to illegal memory access.

FIX:

Hold a ref on the shard inode when adding to fsync list as well.
And unref under following conditions:
1. when the shard is evicted from lru list
2. when the base file is fsync'd
3. when the shards are deleted.

Change-Id: Iab460667d091b8388322f59b6cb27ce69299b1b2
fixes: bz#1669077
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
This commit is contained in:
Krutika Dhananjay 2019-01-24 14:14:39 +05:30 committed by Xavi Hernandez
parent 99b3ab0cf3
commit 72922c1fd6
2 changed files with 51 additions and 8 deletions

View File

@ -0,0 +1,29 @@
#!/bin/bash
. $(dirname $0)/../../include.rc
. $(dirname $0)/../../volume.rc
SHARD_COUNT_TIME=5
cleanup
TEST glusterd
TEST pidof glusterd
TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
TEST $CLI volume set $V0 features.shard on
TEST $CLI volume set $V0 features.shard-block-size 4MB
TEST $CLI volume set $V0 features.shard-lru-limit 25
TEST $CLI volume start $V0
TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
# If the bug still exists, client should crash during fallocate below
TEST fallocate -l 200M $M0/foo
EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
TEST $CLI volume stop $V0
TEST $CLI volume delete $V0
cleanup

View File

@ -273,6 +273,7 @@ shard_inode_ctx_add_to_fsync_list(inode_t *base_inode, xlator_t *this,
* of the to_fsync_list.
*/
inode_ref(base_inode);
inode_ref(shard_inode);
LOCK(&base_inode->lock);
LOCK(&shard_inode->lock);
@ -286,8 +287,10 @@ shard_inode_ctx_add_to_fsync_list(inode_t *base_inode, xlator_t *this,
/* Unref the base inode corresponding to the ref above, if the shard is
* found to be already part of the fsync list.
*/
if (ret != 0)
if (ret != 0) {
inode_unref(base_inode);
inode_unref(shard_inode);
}
return ret;
}
@ -734,6 +737,10 @@ __shard_update_shards_inode_list(inode_t *linked_inode, xlator_t *this,
inode_unlink(lru_inode, priv->dot_shard_inode, block_bname);
inode_forget(lru_inode, 0);
} else {
/* The following unref corresponds to the ref
* held when the shard was added to fsync list.
*/
inode_unref(lru_inode);
fsync_inode = lru_inode;
if (lru_base_inode)
inode_unref(lru_base_inode);
@ -2932,8 +2939,8 @@ shard_unlink_block_inode(shard_local_t *local, int shard_block_num)
shard_priv_t *priv = NULL;
shard_inode_ctx_t *ctx = NULL;
shard_inode_ctx_t *base_ictx = NULL;
gf_boolean_t unlink_unref_forget = _gf_false;
int unref_base_inode = 0;
int unref_shard_inode = 0;
this = THIS;
priv = this->private;
@ -2958,11 +2965,12 @@ shard_unlink_block_inode(shard_local_t *local, int shard_block_num)
list_del_init(&ctx->ilist);
priv->inode_count--;
unref_base_inode++;
unref_shard_inode++;
GF_ASSERT(priv->inode_count >= 0);
unlink_unref_forget = _gf_true;
}
if (ctx->fsync_needed) {
unref_base_inode++;
unref_shard_inode++;
list_del_init(&ctx->to_fsync_list);
if (base_inode) {
__shard_inode_ctx_get(base_inode, this, &base_ictx);
@ -2973,11 +2981,11 @@ shard_unlink_block_inode(shard_local_t *local, int shard_block_num)
UNLOCK(&inode->lock);
if (base_inode)
UNLOCK(&base_inode->lock);
if (unlink_unref_forget) {
inode_unlink(inode, priv->dot_shard_inode, block_bname);
inode_unref(inode);
inode_forget(inode, 0);
}
inode_unlink(inode, priv->dot_shard_inode, block_bname);
inode_ref_reduce_by_n(inode, unref_shard_inode);
inode_forget(inode, 0);
if (base_inode && unref_base_inode)
inode_ref_reduce_by_n(base_inode, unref_base_inode);
UNLOCK(&priv->lock);
@ -5793,6 +5801,7 @@ shard_fsync_shards_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
shard_inode_ctx_t *ctx = NULL;
shard_inode_ctx_t *base_ictx = NULL;
inode_t *base_inode = NULL;
gf_boolean_t unref_shard_inode = _gf_false;
local = frame->local;
base_inode = local->fd->inode;
@ -5826,11 +5835,16 @@ out:
if (ctx->fsync_needed != 0) {
list_add_tail(&ctx->to_fsync_list, &base_ictx->to_fsync_list);
base_ictx->fsync_count++;
} else {
unref_shard_inode = _gf_true;
}
}
UNLOCK(&anon_fd->inode->lock);
UNLOCK(&base_inode->lock);
}
if (unref_shard_inode)
inode_unref(anon_fd->inode);
if (anon_fd)
fd_unref(anon_fd);