cluster/afr: GFID split brain resolution with favorite-child-policy

Problem:
Currently, automatic split-brain resolution with the favorite-child-policy
does not resolve GFID split-brains.

Fix:
When there is a GFID split-brain and the favorite-child-policy is set to
size/mtime/ctime/majority, decide on the source and the sinks based on that
policy. Delete the entry from the sinks and recreate it from the source,
marking the appropriate pending attributes, which resolves the GFID
split-brain. When the heal then takes place, it completes the pending heals
and resets those attributes. Note that all bricks of the replica must be up
for the resolution to be attempted.
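
As a usage sketch (volume name is a placeholder; these are the same CLI
calls the new test below exercises):

    gluster volume set <VOLNAME> favorite-child-policy mtime   # or size/ctime/majority
    gluster volume heal <VOLNAME>
    gluster volume heal <VOLNAME> info    # pending heal count should drop to zero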

Change-Id: Ie30e5373f94ca6f276745d9c3ad662b8acca6946
BUG: 1430719
Signed-off-by: karthik-us <ksubrahm@redhat.com>
Reviewed-on: https://review.gluster.org/16878
Smoke: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Reviewed-by: Ravishankar N <ravishankar@redhat.com>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Authored by karthik-us on 2017-03-09 18:08:28 +05:30, committed by Pranith Kumar Karampuri
parent 98dc1f08c1
commit 799a2ff829
3 changed files with 390 additions and 45 deletions


@@ -0,0 +1,228 @@
#!/bin/bash
. $(dirname $0)/../../include.rc
. $(dirname $0)/../../volume.rc
cleanup;
TEST glusterd
TEST pidof glusterd
TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1}
TEST $CLI volume start $V0
TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0
TEST $CLI volume set $V0 self-heal-daemon off
TEST $CLI volume set $V0 cluster.data-self-heal off
TEST $CLI volume set $V0 cluster.metadata-self-heal off
TEST $CLI volume set $V0 cluster.entry-self-heal off
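#All client-side heals and the shd are disabled above so that the divergent
#writes which follow are not healed the moment a brick comes back, letting
#genuine gfid split-brains form.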
##### Healing with favorite-child-policy = mtime ######
##### and self-heal-daemon ######
TEST $CLI volume set $V0 favorite-child-policy mtime
TEST kill_brick $V0 $H0 $B0/${V0}0
echo "Sink based on mtime" > $M0/f1
TEST $CLI volume start $V0 force
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
TEST kill_brick $V0 $H0 $B0/${V0}1
echo "Source based on mtime" > $M0/f1
#Gfids of file f1 on bricks 0 & 1 should differ
gfid_0=$(gf_get_gfid_xattr $B0/${V0}0/f1)
gfid_1=$(gf_get_gfid_xattr $B0/${V0}1/f1)
TEST [ "$gfid_0" != "$gfid_1" ]
TEST $CLI volume set $V0 self-heal-daemon on
TEST $CLI volume start $V0 force
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
#We know that first brick has the latest mtime
LATEST_MTIME_MD5=$(md5sum $B0/${V0}0/f1 | cut -d\ -f1)
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
TEST $CLI volume heal $V0
EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
#gfid split-brain should be resolved
gfid_1=$(gf_get_gfid_xattr $B0/${V0}1/f1)
TEST [ "$gfid_0" == "$gfid_1" ]
HEALED_MD5=$(md5sum $B0/${V0}1/f1 | cut -d\ -f1)
TEST [ "$LATEST_MTIME_MD5" == "$HEALED_MD5" ]
TEST $CLI volume set $V0 self-heal-daemon off
##### Healing with favorite-child-policy = ctime ######
##### and self-heal-daemon ######
#gfid split-brain resolution should work even when granular-entry-heal is
#enabled
TEST $CLI volume heal $V0 granular-entry-heal enable
TEST $CLI volume set $V0 favorite-child-policy ctime
TEST kill_brick $V0 $H0 $B0/${V0}1
echo "Sink based on ctime" > $M0/f2
TEST $CLI volume start $V0 force
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
TEST kill_brick $V0 $H0 $B0/${V0}0
echo "Source based on ctime" > $M0/f2
#Gfids of file f2 on bricks 0 & 1 should differ
gfid_0=$(gf_get_gfid_xattr $B0/${V0}0/f2)
gfid_1=$(gf_get_gfid_xattr $B0/${V0}1/f2)
TEST [ "$gfid_0" != "$gfid_1" ]
TEST $CLI volume set $V0 self-heal-daemon on
TEST $CLI volume start $V0 force
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
#We know that second brick has the latest ctime
LATEST_CTIME_MD5=$(md5sum $B0/${V0}1/f2 | cut -d\ -f1)
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
TEST $CLI volume heal $V0
EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
#gfid split-brain should be resolved
gfid_0=$(gf_get_gfid_xattr $B0/${V0}0/f2)
TEST [ "$gfid_0" == "$gfid_1" ]
HEALED_MD5=$(md5sum $B0/${V0}0/f2 | cut -d\ -f1)
TEST [ "$LATEST_CTIME_MD5" == "$HEALED_MD5" ]
#Add one more brick, and heal.
TEST $CLI volume add-brick $V0 replica 3 $H0:$B0/${V0}2
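#The volume is now replica 3; the majority policy exercised later needs more
#than two bricks, since no child can cross the child_count/2 vote threshold
#on a two-brick replica.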
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 2
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
TEST $CLI volume heal $V0
EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
TEST $CLI volume set $V0 self-heal-daemon off
##### Healing using favorite-child-policy = size #####
##### and client side heal #####
TEST $CLI volume set $V0 cluster.data-self-heal on
TEST $CLI volume set $V0 cluster.metadata-self-heal on
TEST $CLI volume set $V0 cluster.entry-self-heal on
#Set the quorum-type to none, and create a gfid split brain
TEST $CLI volume set $V0 cluster.quorum-type none
TEST kill_brick $V0 $H0 $B0/${V0}0
TEST kill_brick $V0 $H0 $B0/${V0}1
echo "Smallest file" > $M0/f3
TEST $CLI volume start $V0 force
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
TEST kill_brick $V0 $H0 $B0/${V0}1
TEST kill_brick $V0 $H0 $B0/${V0}2
echo "Second smallest file" > $M0/f3
TEST $CLI volume start $V0 force
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 2
TEST kill_brick $V0 $H0 $B0/${V0}0
TEST kill_brick $V0 $H0 $B0/${V0}2
echo "Biggest among the three files" > $M0/f3
#Bring back the down bricks.
TEST $CLI volume start $V0 force
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 2
#Gfids of file f3 on all the bricks should differ
gfid_0=$(gf_get_gfid_xattr $B0/${V0}0/f3)
gfid_1=$(gf_get_gfid_xattr $B0/${V0}1/f3)
gfid_2=$(gf_get_gfid_xattr $B0/${V0}2/f3)
TEST [ "$gfid_0" != "$gfid_1" ]
TEST [ "$gfid_0" != "$gfid_2" ]
TEST [ "$gfid_1" != "$gfid_2" ]
#We know that second brick has the bigger size file
BIGGER_FILE_MD5=$(md5sum $B0/${V0}1/f3 | cut -d\ -f1)
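#With the shd off, accessing the file from the mount below triggers the
#client-side heal that resolves the gfid split-brain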
TEST ls $M0/f3
TEST cat $M0/f3
EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
#gfid split-brain should be resolved
gfid_0=$(gf_get_gfid_xattr $B0/${V0}0/f3)
gfid_2=$(gf_get_gfid_xattr $B0/${V0}2/f3)
TEST [ "$gfid_0" == "$gfid_1" ]
TEST [ "$gfid_2" == "$gfid_1" ]
HEALED_MD5_1=$(md5sum $B0/${V0}0/f3 | cut -d\ -f1)
HEALED_MD5_2=$(md5sum $B0/${V0}2/f3 | cut -d\ -f1)
TEST [ "$BIGGER_FILE_MD5" == "$HEALED_MD5_1" ]
TEST [ "$BIGGER_FILE_MD5" == "$HEALED_MD5_2" ]
##### Healing using favorite-child-policy = majority #####
##### and client side heal #####
TEST kill_brick $V0 $H0 $B0/${V0}0
TEST kill_brick $V0 $H0 $B0/${V0}1
echo "Does not agree with bricks 0 & 1" > $M0/f4
TEST $CLI volume start $V0 force
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
TEST kill_brick $V0 $H0 $B0/${V0}2
echo "Agree on bricks 0 & 1" > $M0/f4
#Gfids of file f4 on bricks 0 & 1 should be same and bricks 0 & 2 should differ
gfid_0=$(gf_get_gfid_xattr $B0/${V0}0/f4)
gfid_1=$(gf_get_gfid_xattr $B0/${V0}1/f4)
gfid_2=$(gf_get_gfid_xattr $B0/${V0}2/f4)
TEST [ "$gfid_0" == "$gfid_1" ]
TEST [ "$gfid_0" != "$gfid_2" ]
#We know that first and second bricks agree with each other. Pick any one of
#them as source
MAJORITY_MD5=$(md5sum $B0/${V0}0/f4 | cut -d\ -f1)
#Bring back the down brick and heal.
TEST $CLI volume start $V0 force
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 2
TEST ls $M0/f4
TEST cat $M0/f4
EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
#gfid split-brain should be resolved
gfid_2=$(gf_get_gfid_xattr $B0/${V0}2/f4)
TEST [ "$gfid_0" == "$gfid_2" ]
HEALED_MD5=$(md5sum $B0/${V0}2/f4 | cut -d\ -f1)
TEST [ "$MAJORITY_MD5" == "$HEALED_MD5" ]
cleanup;

xlators/cluster/afr/src/afr-self-heal-entry.c

@@ -17,6 +17,105 @@
#include "syncop-utils.h"
#include "events.h"
int
afr_selfheal_gfid_mismatch_by_majority (struct afr_reply *replies,
                                        int child_count)
{
        int   j   = 0;
        int   i   = 0;
        int   src = -1;
        int   votes[child_count];

        for (i = 0; i < child_count; i++) {
                if (!replies[i].valid || replies[i].op_ret == -1)
                        continue;

                votes[i] = 1;
                for (j = i + 1; j < child_count; j++) {
                        if ((!gf_uuid_compare (replies[i].poststat.ia_gfid,
                             replies[j].poststat.ia_gfid)))
                                votes[i]++;
                        if (votes[i] > child_count / 2) {
                                src = i;
                                goto out;
                        }
                }
        }

out:
        return src;
}
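
/* Resolves a gfid split brain between src_idx and child_idx using the
 * configured favorite-child-policy. All bricks of the replica must be up
 * (and locked) so that every copy is considered; otherwise, or when the
 * policy cannot pick a winner (including the default "none" policy), the
 * mismatch is logged, a split-brain event is raised and -1 is returned so
 * that the entry heal skips the file. */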
int
afr_gfid_split_brain_source (xlator_t *this, struct afr_reply *replies,
                             inode_t *inode, uuid_t pargfid, char *bname,
                             int src_idx, int child_idx,
                             unsigned char *locked_on, int *src)
{
        afr_private_t *priv     = NULL;
        char           g1[64]   = {0,};
        char           g2[64]   = {0,};
        int            up_count = 0;

        priv = this->private;
        up_count = AFR_COUNT (locked_on, priv->child_count);
        if (up_count != priv->child_count) {
                gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
                        "All the bricks should be up to resolve the gfid split "
                        "brain");
                goto out;
        }

        switch (priv->fav_child_policy) {
        case AFR_FAV_CHILD_BY_SIZE:
                *src = afr_sh_fav_by_size (this, replies, inode);
                break;
        case AFR_FAV_CHILD_BY_MTIME:
                *src = afr_sh_fav_by_mtime (this, replies, inode);
                break;
        case AFR_FAV_CHILD_BY_CTIME:
                *src = afr_sh_fav_by_ctime (this, replies, inode);
                break;
        case AFR_FAV_CHILD_BY_MAJORITY:
                if (priv->child_count != 2)
                        *src = afr_selfheal_gfid_mismatch_by_majority (replies,
                                                          priv->child_count);
                else
                        *src = -1;

                if (*src == -1) {
                        gf_msg (this->name, GF_LOG_ERROR, 0,
                                AFR_MSG_SPLIT_BRAIN, "No majority to resolve "
                                "gfid split brain");
                }
                break;
        default:
                break;
        }

out:
        if (*src == -1) {
                gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
                        "Gfid mismatch detected for <gfid:%s>/%s>, %s on %s and"
                        " %s on %s. Skipping conservative merge on the file.",
                        uuid_utoa (pargfid), bname,
                        uuid_utoa_r (replies[child_idx].poststat.ia_gfid, g1),
                        priv->children[child_idx]->name,
                        uuid_utoa_r (replies[src_idx].poststat.ia_gfid, g2),
                        priv->children[src_idx]->name);
                gf_event (EVENT_AFR_SPLIT_BRAIN, "subvol=%s;type=gfid;file="
                          "<gfid:%s>/%s>;count=2;child-%d=%s;gfid-%d=%s;"
                          "child-%d=%s;gfid-%d=%s", this->name,
                          uuid_utoa (pargfid), bname, child_idx,
                          priv->children[child_idx]->name, child_idx,
                          uuid_utoa_r (replies[child_idx].poststat.ia_gfid, g1),
                          src_idx, priv->children[src_idx]->name, src_idx,
                          uuid_utoa_r (replies[src_idx].poststat.ia_gfid, g2));
                return -1;
        }

        return 0;
}

static int
afr_selfheal_entry_delete (xlator_t *this, inode_t *dir, const char *name,
                           inode_t *inode, int child, struct afr_reply *replies)
@@ -206,13 +305,15 @@ __afr_selfheal_heal_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,
static int
afr_selfheal_detect_gfid_and_type_mismatch (xlator_t *this,
                                            struct afr_reply *replies,
                                            inode_t *inode,
                                            uuid_t pargfid,
                                            char *bname, int src_idx,
                                            unsigned char *locked_on,
                                            int *src)
{
        int             i    = 0;
        int             ret  = -1;
        afr_private_t  *priv = NULL;

        priv = this->private;
@@ -227,46 +328,33 @@ afr_selfheal_detect_gfid_and_type_mismatch (xlator_t *this,
                        continue;

                if (gf_uuid_compare (replies[src_idx].poststat.ia_gfid,
                                     replies[i].poststat.ia_gfid)) {
                        ret = afr_gfid_split_brain_source (this, replies, inode,
                                                           pargfid, bname,
                                                           src_idx, i,
                                                           locked_on, src);
                        return ret;
                }

                if ((replies[src_idx].poststat.ia_type) !=
                    (replies[i].poststat.ia_type)) {
                        gf_msg (this->name, GF_LOG_ERROR, 0,
                                AFR_MSG_SPLIT_BRAIN, "Type mismatch detected "
                                "for <gfid:%s>/%s>, %s on %s and %s on %s. "
                                "Skipping conservative merge on the file.",
                                uuid_utoa (pargfid), bname,
                                gf_inode_type_to_str (replies[i].poststat.ia_type),
                                priv->children[i]->name,
                                gf_inode_type_to_str (replies[src_idx].poststat.ia_type),
                                priv->children[src_idx]->name);
                        gf_event (EVENT_AFR_SPLIT_BRAIN, "subvol=%s;type=file;"
                                  "file=<gfid:%s>/%s>;count=2;child-%d=%s;type-"
                                  "%d=%s;child-%d=%s;type-%d=%s",
                                  this->name, uuid_utoa (pargfid), bname, i,
                                  priv->children[i]->name, i,
                                  gf_inode_type_to_str (replies[i].poststat.ia_type),
                                  src_idx, priv->children[src_idx]->name, src_idx,
                                  gf_inode_type_to_str (replies[src_idx].poststat.ia_type));
                        return -1;
                }
        }
@@ -283,11 +371,12 @@ __afr_selfheal_merge_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,
        int             ret    = 0;
        int             i      = 0;
        int             source = -1;
        int             src    = -1;
        afr_private_t  *priv   = NULL;

        priv = this->private;

        for (i = 0; i < priv->child_count; i++) {
                if (replies[i].valid && replies[i].op_ret == 0) {
                        source = i;
                        break;
@@ -306,24 +395,41 @@ __afr_selfheal_merge_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,
                }
        }

        /* In case of type mismatch / unable to resolve gfid mismatch on the
         * entry, return -1.*/
        ret = afr_selfheal_detect_gfid_and_type_mismatch (this, replies, inode,
                                                          fd->inode->gfid,
                                                          name, source,
                                                          locked_on, &src);
        if (ret < 0)
                return ret;

        if (src != -1) {
                source = src;
                for (i = 0; i < priv->child_count; i++) {
                        if (i != src && replies[i].valid &&
                            gf_uuid_compare (replies[src].poststat.ia_gfid,
                                             replies[i].poststat.ia_gfid)) {
                                sources[i] = 0;
                        }
                }
        }

        for (i = 0; i < priv->child_count; i++) {
                if (i == source || !healed_sinks[i])
                        continue;
                if (src != -1) {
                        if (!gf_uuid_compare (replies[src].poststat.ia_gfid,
                                              replies[i].poststat.ia_gfid))
                                continue;
                } else if (replies[i].op_errno != ENOENT) {
                        continue;
                }
                ret |= afr_selfheal_recreate_entry (frame, i, source, sources,
                                                    fd->inode, name, inode,
                                                    replies);
        }

        return ret;
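
/* Note on the hunk above: when the gfid mismatch is resolvable, src overrides
 * the first-valid-reply source, every child holding a different gfid is
 * dropped from sources, and those children's entries are recreated from src;
 * ret is OR-ed so that a failure on one sink is not masked by a later
 * success. */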

xlators/cluster/afr/src/afr-self-heal.h

@@ -319,4 +319,15 @@ afr_choose_source_by_policy (afr_private_t *priv, unsigned char *sources,
int
afr_selfheal_metadata_by_stbuf (xlator_t *this, struct iatt *stbuf);

int
afr_sh_fav_by_size (xlator_t *this, struct afr_reply *replies,
                    inode_t *inode);

int
afr_sh_fav_by_mtime (xlator_t *this, struct afr_reply *replies,
                     inode_t *inode);

int
afr_sh_fav_by_ctime (xlator_t *this, struct afr_reply *replies,
                     inode_t *inode);

#endif /* !_AFR_SELFHEAL_H */