cluster/afr: Delegate metadata heal with pending xattrs to SHD

Problem:
When metadata self-heal is triggered on the mount, it blocks
the lookup until the heal completes. That can lead to hangs
when a lot of clients access a directory that needs metadata
heal: all of them trigger heals and then wait on one another
to finish.

Fix:
Trigger the metadata heal that can block lookup only when the
heal is needed but the pending xattrs are not set (e.g. a brick
was modified directly on the backend). That is the only case
where, without a heal, different clients could serve different
metadata to applications, which must be avoided. When the
pending xattrs are set, delegate the heal to the self-heal
daemon (SHD) instead.
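
In code terms the change amounts to the gate sketched below.
afr_should_heal_metadata_inline is a hypothetical condensation of
the afr_can_start_metadata_self_heal hunk in this diff, not a
function the patch adds:

/* Condensed sketch (hypothetical helper): heal inline only when no
 * brick carries pending metadata xattrs. */
static gf_boolean_t
afr_should_heal_metadata_inline (xlator_t *this, struct afr_reply *replies,
                                 int child_count)
{
        int i = 0;

        for (i = 0; i < child_count; i++) {
                if (!replies[i].valid || replies[i].op_ret == -1)
                        continue;
                if (afr_is_pending_set (this, replies[i].xdata,
                                        AFR_METADATA_TRANSACTION))
                        /* Pending xattrs exist: SHD will heal this file,
                         * so do not block the lookup on an inline heal. */
                        return _gf_false;
        }

        /* Heal needed but no pending xattrs (e.g. direct backend edits):
         * only a client-side heal restores one consistent view. */
        return _gf_true;
}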

Updates bz#1622821
Change-Id: I6089e9fda0770a83fb287941b229c882711f4e66
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Author: Pranith Kumar K <pkarampu@redhat.com>
Date:   2018-08-27 11:46:33 +05:30
parent 37f77b1242
commit ccaad48f51
5 changed files with 72 additions and 51 deletions


@@ -17,6 +17,7 @@ TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
echo "some data" > $M0/datafile
EXPECT 0 echo $?
TEST touch $M0/mdatafile
TEST touch $M0/mdatafile-backend-direct-modify
TEST mkdir $M0/dir
#Kill a brick and perform I/O to have pending heals.
@@ -29,6 +30,7 @@ EXPECT 0 echo $?
#pending metadata heal
TEST chmod +x $M0/mdatafile
TEST chmod +x $B0/${V0}0/mdatafile-backend-direct-modify
#pending entry heal. Also causes pending metadata/data heals on file{1..5}
TEST touch $M0/dir/file{1..5}
@@ -40,9 +42,12 @@ TEST $CLI volume start $V0 force
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0
#Metadata heal via explicit lookup must not happen
TEST ls $M0/mdatafile
TEST getfattr -d -m. -e hex $M0/mdatafile
TEST ls $M0/mdatafile-backend-direct-modify
#Inode refresh must not trigger data and entry heals.
TEST [[ "$(stat -c %A $B0/${V0}0/mdatafile-backend-direct-modify)" != "$(stat -c %A $B0/${V0}1/mdatafile-backend-direct-modify)" ]]
#Inode refresh must not trigger data, metadata and entry heals.
#To trigger inode refresh for sure, the volume is unmounted and mounted each time.
#Check that data heal does not happen.
EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
@@ -52,7 +57,6 @@ TEST cat $M0/datafile
EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
TEST ls $M0/dir
#No heal must have happened
EXPECT 8 get_pending_heal_count $V0
@@ -61,21 +65,25 @@ TEST $CLI volume set $V0 cluster.data-self-heal on
TEST $CLI volume set $V0 cluster.metadata-self-heal on
TEST $CLI volume set $V0 cluster.entry-self-heal on
#Metadata heal is triggered by lookup without need for inode refresh.
TEST ls $M0/mdatafile
EXPECT 7 get_pending_heal_count $V0
#Inode refresh must trigger data and entry heals.
#Inode refresh must trigger data, metadata and entry heals.
#To trigger inode refresh for sure, the volume is unmounted and mounted each time.
EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
TEST ls $M0/mdatafile-backend-direct-modify
TEST [[ "$(stat -c %A $B0/${V0}0/mdatafile-backend-direct-modify)" == "$(stat -c %A $B0/${V0}1/mdatafile-backend-direct-modify)" ]]
TEST getfattr -d -m. -e hex $M0/mdatafile
EXPECT_WITHIN $HEAL_TIMEOUT 7 get_pending_heal_count $V0
TEST cat $M0/datafile
EXPECT_WITHIN $HEAL_TIMEOUT 6 get_pending_heal_count $V0
EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
TEST ls $M0/dir
EXPECT 5 get_pending_heal_count $V0
EXPECT_WITHIN $HEAL_TIMEOUT 5 get_pending_heal_count $V0
TEST cat $M0/dir/file1
TEST cat $M0/dir/file2
@@ -83,5 +91,5 @@ TEST cat $M0/dir/file3
TEST cat $M0/dir/file4
TEST cat $M0/dir/file5
EXPECT 0 get_pending_heal_count $V0
EXPECT_WITHIN $HEAL_TIMEOUT 0 get_pending_heal_count $V0
cleanup;


@@ -13,7 +13,6 @@ TEST pidof glusterd
TEST $CLI volume create $V0 replica $REPLICA $H0:$B0/${V0}-00 $H0:$B0/${V0}-01 $H0:$B0/${V0}-10 $H0:$B0/${V0}-11
TEST $CLI volume start $V0
TEST $CLI volume set $V0 cluster.self-heal-daemon off
TEST $CLI volume set $V0 cluster.background-self-heal-count 0
## Mount FUSE with caching disabled
@@ -82,10 +81,15 @@ EXPECT 1 xattr_query_check ${backend_paths_array[1]} "trusted.name"
# restart the brick process
TEST $CLI volume start $V0 force
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 `expr $brick_id - 1`
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 3
cat $pth >/dev/null
TEST $CLI volume heal $V0
EXPECT_WITHIN $HEAL_TIMEOUT "0" get_pending_heal_count $V0
# check backends - xattr should not be present anywhere
EXPECT 1 xattr_query_check ${backend_paths_array[0]} "trusted.name"
EXPECT 1 xattr_query_check ${backend_paths_array[1]} "trusted.name"


@@ -2692,6 +2692,42 @@ out:
        return 0;
}

gf_boolean_t
afr_is_pending_set (xlator_t *this, dict_t *xdata, int type)
{
        int idx = -1;
        afr_private_t *priv = NULL;
        void *pending_raw = NULL;
        int *pending_int = NULL;
        int i = 0;

        priv = this->private;
        idx = afr_index_for_transaction_type (type);

        if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw) == 0) {
                if (pending_raw) {
                        pending_int = pending_raw;

                        if (ntoh32 (pending_int[idx]))
                                return _gf_true;
                }
        }

        for (i = 0; i < priv->child_count; i++) {
                if (dict_get_ptr (xdata, priv->pending_key[i],
                                  &pending_raw))
                        continue;
                if (!pending_raw)
                        continue;
                pending_int = pending_raw;

                if (ntoh32 (pending_int[idx]))
                        return _gf_true;
        }

        return _gf_false;
}
static gf_boolean_t
afr_can_start_metadata_self_heal(call_frame_t *frame, xlator_t *this)
{
@@ -2718,6 +2754,14 @@ afr_can_start_metadata_self_heal(call_frame_t *frame, xlator_t *this)
                        continue;
                }

                if (afr_is_pending_set (this, replies[i].xdata,
                                        AFR_METADATA_TRANSACTION)) {
                        /* Let shd do the heal so that lookup is not blocked
                         * on getting metadata lock/doing the heal */
                        start = _gf_false;
                        break;
                }

                if (gf_uuid_compare (stbuf.ia_gfid, replies[i].poststat.ia_gfid)) {
                        start = _gf_false;
                        break;
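
For context, the trusted.afr.* pending xattrs that afr_is_pending_set
walks hold, as far as I understand AFR's on-disk format, three 32-bit
counters in network byte order, indexed by transaction type (data = 0,
metadata = 1, entry = 2, matching afr_index_for_transaction_type). A
standalone sketch of the decoding; pending_count and the PENDING_*
names are hypothetical:

/* Hypothetical decoder for one pending-xattr value, assuming the
 * three-counter network-byte-order layout described above. */
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

enum { PENDING_DATA = 0, PENDING_METADATA = 1, PENDING_ENTRY = 2 };

static uint32_t
pending_count (const void *xattr_value, int idx)
{
        uint32_t counters[3];

        memcpy (counters, xattr_value, sizeof (counters));
        return ntohl (counters[idx]); /* non-zero => that heal is pending */
}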


@@ -2180,44 +2180,6 @@ afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
        return 0;
}

gf_boolean_t
afr_is_pending_set (xlator_t *this, dict_t *xdata, int type)
{
        int idx = -1;
        afr_private_t *priv = NULL;
        void *pending_raw = NULL;
        int *pending_int = NULL;
        int i = 0;

        priv = this->private;
        idx = afr_index_for_transaction_type (type);

        if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw) == 0) {
                if (pending_raw) {
                        pending_int = pending_raw;

                        if (ntoh32 (pending_int[idx]))
                                return _gf_true;
                }
        }

        for (i = 0; i < priv->child_count; i++) {
                if (dict_get_ptr (xdata, priv->pending_key[i],
                                  &pending_raw))
                        continue;
                if (!pending_raw)
                        continue;
                pending_int = pending_raw;

                if (ntoh32 (pending_int[idx]))
                        return _gf_true;
        }

        return _gf_false;
}

gf_boolean_t
afr_is_data_set (xlator_t *this, dict_t *xdata)
{


@@ -1249,4 +1249,7 @@ afr_ta_post_op_lock (xlator_t *this, loc_t *loc);
int
afr_ta_post_op_unlock (xlator_t *this, loc_t *loc);

gf_boolean_t
afr_is_pending_set (xlator_t *this, dict_t *xdata, int type);

#endif /* __AFR_H__ */