cluster/afr: Allow data/entry self heal for metadata split-brain
Problem: Currently, whenever there is a metadata split-brain, the variable sh->op_failed is set to 1 to denote that self-heal failed. If we then proceed to data self-heal, the data self-heal code path also relies on sh->op_failed, so it checks that variable and eventually fails to do the data self-heal. A mechanism is therefore needed to allow data self-heal even when the metadata is in split-brain. Fix: Some data-structure revamping was done in the http://review.gluster.com/#/c/5106/ fix, and this patch is based on that fix. We can now record which particular self-heal failed, i.e. GFID_OR_MISSING_ENTRY_SELF_HEAL, METADATA, DATA or ENTRY, and we can do two types of self-heal failure check. 1. Individual type check: we can check which of the four (metadata, data, gfid or missing entry, entry self-heal) failed. 2. In afr_self_heal_completion_cbk, the check treats the complete self-heal as a failure if any specific self-heal failed, so that the corresponding circular buffer of the event history is populated accordingly. Change-Id: Icb91e513bcc752386fc8a78812405cfabe5cac2d BUG: 977797 Signed-off-by: Venkatesh Somyajulu <vsomyaju@redhat.com> Reviewed-on: http://review.gluster.org/5253 Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com> Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Vijay Bellur <vbellur@redhat.com>
This commit is contained in:
parent
7062eda157
commit
ef8092fab7
114
tests/bugs/bug-977797.t
Executable file
114
tests/bugs/bug-977797.t
Executable file
@ -0,0 +1,114 @@
|
||||
#!/bin/bash
|
||||
|
||||
. $(dirname $0)/../include.rc
|
||||
. $(dirname $0)/../volume.rc
|
||||
|
||||
cleanup;
|
||||
|
||||
## Start and create a volume
|
||||
TEST glusterd;
|
||||
TEST pidof glusterd;
|
||||
TEST $CLI volume info;
|
||||
|
||||
TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2};
|
||||
|
||||
## Verify volume is created
|
||||
EXPECT "$V0" volinfo_field $V0 'Volume Name';
|
||||
EXPECT 'Created' volinfo_field $V0 'Status';
|
||||
|
||||
## Start volume and verify
|
||||
TEST $CLI volume start $V0;
|
||||
EXPECT 'Started' volinfo_field $V0 'Status';
|
||||
|
||||
TEST $CLI volume set $V0 self-heal-daemon off
|
||||
TEST $CLI volume set $V0 open-behind off
|
||||
TEST $CLI volume set $V0 quick-read off
|
||||
TEST $CLI volume set $V0 read-ahead off
|
||||
TEST $CLI volume set $V0 write-behind off
|
||||
TEST $CLI volume set $V0 io-cache off
|
||||
TEST $CLI volume set $V0 background-self-heal-count 0
|
||||
|
||||
TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
|
||||
|
||||
|
||||
TEST mkdir -p $M0/a
|
||||
TEST `echo "GLUSTERFS" > $M0/a/file`
|
||||
|
||||
TEST kill_brick $V0 $H0 $B0/$V0"1"
|
||||
|
||||
TEST chown root $M0/a
|
||||
TEST chown root $M0/a/file
|
||||
TEST `echo "GLUSTER-FILE-SYSTEM" > $M0/a/file`
|
||||
TEST mkdir $M0/a/b
|
||||
|
||||
TEST $CLI volume start $V0 force
|
||||
EXPECT_WITHIN 20 "1" afr_child_up_status $V0 0;
|
||||
|
||||
|
||||
|
||||
TEST kill_brick $V0 $H0 $B0/$V0"2"
|
||||
|
||||
TEST chmod 757 $M0/a
|
||||
TEST chmod 757 $M0/a/file
|
||||
|
||||
TEST $CLI volume start $V0 force
|
||||
EXPECT_WITHIN 20 "1" afr_child_up_status $V0 1;
|
||||
|
||||
TEST ls -l $M0/a/file
|
||||
|
||||
b1c0dir=$(afr_get_specific_changelog_xattr $B0/$V0"1"/a \
|
||||
trusted.afr.$V0-client-0 "entry")
|
||||
b1c1dir=$(afr_get_specific_changelog_xattr $B0/$V0"1"/a \
|
||||
trusted.afr.$V0-client-1 "entry")
|
||||
b2c0dir=$(afr_get_specific_changelog_xattr \
|
||||
$B0/$V0"2"/a trusted.afr.$V0-client-0 "entry")
|
||||
b2c1dir=$(afr_get_specific_changelog_xattr \
|
||||
$B0/$V0"2"/a trusted.afr.$V0-client-1 "entry")
|
||||
|
||||
|
||||
b1c0f=$(afr_get_specific_changelog_xattr $B0/$V0"1"/a/file \
|
||||
trusted.afr.$V0-client-0 "data")
|
||||
b1c1f=$(afr_get_specific_changelog_xattr $B0/$V0"1"/a/file \
|
||||
trusted.afr.$V0-client-1 "data")
|
||||
b2c0f=$(afr_get_specific_changelog_xattr $B0/$V0"2"/a/file \
|
||||
trusted.afr.$V0-client-0 "data")
|
||||
b2c1f=$(afr_get_specific_changelog_xattr $B0/$V0"2"/a/file \
|
||||
trusted.afr.$V0-client-1 "data")
|
||||
|
||||
EXPECT "00000000" echo $b1c0f
|
||||
EXPECT "00000000" echo $b1c1f
|
||||
EXPECT "00000000" echo $b2c0f
|
||||
EXPECT "00000000" echo $b2c1f
|
||||
|
||||
EXPECT "00000000" echo $b1c0dir
|
||||
EXPECT "00000000" echo $b1c1dir
|
||||
EXPECT "00000000" echo $b2c0dir
|
||||
EXPECT "00000000" echo $b2c1dir
|
||||
|
||||
# Print "0" when $2 occurs in $1 and "1" otherwise.
#
# Arguments:
#   $1 - string to search in
#   $2 - substring to look for; it is matched literally, glob characters
#        in it have no special meaning
#
# Note: an empty $2 is reported as "1", because stripping an empty prefix
# leaves $1 unchanged.
contains() {
        local string="$1"
        local substring="$2"

        # Removing the shortest prefix that ends in $substring changes the
        # string only when $substring is actually present.
        if [ "${string#*"$substring"}" != "$string" ]; then
                echo "0"
        else
                echo "1"
        fi
}
|
||||
|
||||
var1=$(cat $M0/a/file 2>&1)
|
||||
var2="Input/output error"
|
||||
|
||||
|
||||
EXPECT "0" contains "$var1" "$var2"
|
||||
|
||||
## Finish up
|
||||
TEST $CLI volume stop $V0;
|
||||
EXPECT 'Stopped' volinfo_field $V0 'Status';
|
||||
|
||||
TEST $CLI volume delete $V0;
|
||||
TEST ! $CLI volume info $V0;
|
||||
|
||||
cleanup;
|
@ -237,6 +237,26 @@ function dht_get_layout {
|
||||
getfattr -d -e hex -n $my_xa $1 2> /dev/null | grep "$my_xa=" | cut -d= -f2
|
||||
}
|
||||
|
||||
# Print one 8-character counter taken from an AFR changelog xattr value.
#
# Arguments:
#   $1 - path handed to afr_get_changelog_xattr
#   $2 - xattr key, e.g. trusted.afr.<volume>-client-<n>
#   $3 - counter to extract: "data", "metadata" or "entry"
#
# Prints "error" when $3 is none of the three known types.
function afr_get_specific_changelog_xattr ()
{
        local path=$1
        local key=$2
        local type=$3
        local changelog_xattr=""
        local specific_changelog=""

        changelog_xattr=$(afr_get_changelog_xattr "$path" "$key")

        # The first two characters are skipped (presumably the "0x" prefix
        # of a hex dump -- confirm against afr_get_changelog_xattr); the
        # data, metadata and entry counters follow as consecutive
        # 8-character fields.
        case "$type" in
        data)
                specific_changelog=${changelog_xattr:2:8}
                ;;
        metadata)
                specific_changelog=${changelog_xattr:10:8}
                ;;
        entry)
                specific_changelog=${changelog_xattr:18:8}
                ;;
        *)
                specific_changelog="error"
                ;;
        esac

        echo "$specific_changelog"
}
|
||||
##
|
||||
# query pathinfo xattr and extract POSIX pathname(s)
|
||||
##
|
||||
|
@ -100,7 +100,7 @@ sh_loop_driver_done (call_frame_t *sh_frame, xlator_t *this,
|
||||
}
|
||||
|
||||
sh_private_cleanup (sh_frame, this);
|
||||
if (is_self_heal_failed (sh)) {
|
||||
if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
|
||||
GF_ASSERT (!last_loop_frame);
|
||||
//loop_finish should have happened and the old_loop should be NULL
|
||||
gf_log (this->name, GF_LOG_DEBUG,
|
||||
@ -276,7 +276,7 @@ sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset,
|
||||
_gf_true, sh_loop_lock_success, sh_loop_lock_failure);
|
||||
return 0;
|
||||
out:
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
if (old_loop_frame)
|
||||
sh_loop_finish (old_loop_frame, this);
|
||||
sh_loop_return (sh_frame, this, new_loop_frame, -1, ENOMEM);
|
||||
@ -307,8 +307,9 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this,
|
||||
sh_priv->loops_running--;
|
||||
offset = sh_priv->offset;
|
||||
block_size = sh->block_size;
|
||||
while ((!sh->eof_reached) && (!is_self_heal_failed (sh)) &&
|
||||
(sh_priv->loops_running < priv->data_self_heal_window_size)
|
||||
while ((!sh->eof_reached) &&
|
||||
(!is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) &&
|
||||
(sh_priv->loops_running < priv->data_self_heal_window_size)
|
||||
&& (sh_priv->offset < sh->file_size)) {
|
||||
|
||||
loop++;
|
||||
@ -327,7 +328,8 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this,
|
||||
if (0 == loop) {
|
||||
//loop finish does unlock, but the erasing of the pending
|
||||
//xattrs needs to happen before that so do not finish the loop
|
||||
if (is_driver_done && !is_self_heal_failed (sh))
|
||||
if (is_driver_done &&
|
||||
!is_self_heal_failed (sh, AFR_CHECK_SPECIFIC))
|
||||
goto driver_done;
|
||||
if (old_loop_frame) {
|
||||
sh_loop_finish (old_loop_frame, this);
|
||||
@ -338,7 +340,7 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this,
|
||||
//If we have more loops to form we should finish previous loop after
|
||||
//the next loop lock
|
||||
while (loop--) {
|
||||
if (is_self_heal_failed (sh)) {
|
||||
if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
|
||||
// op failed in other loop, stop spawning more loops
|
||||
if (old_loop_frame) {
|
||||
sh_loop_finish (old_loop_frame, this);
|
||||
@ -384,7 +386,7 @@ sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame
|
||||
}
|
||||
|
||||
if (op_ret == -1) {
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_sh_set_error (sh, op_errno);
|
||||
if (loop_frame) {
|
||||
sh_loop_finish (loop_frame, this);
|
||||
@ -432,7 +434,7 @@ sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this,
|
||||
priv->children[child_index]->name,
|
||||
strerror (op_errno));
|
||||
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_sh_set_error (loop_sh, op_errno);
|
||||
} else if (op_ret < loop_local->cont.writev.vector->iov_len) {
|
||||
gf_log (this->name, GF_LOG_ERROR,
|
||||
@ -440,7 +442,7 @@ sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this,
|
||||
"(expected %lu, returned %d)", sh_local->loc.path,
|
||||
priv->children[child_index]->name,
|
||||
loop_local->cont.writev.vector->iov_len, op_ret);
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
}
|
||||
|
||||
call_count = afr_frame_return (loop_frame);
|
||||
@ -514,7 +516,7 @@ sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie,
|
||||
|
||||
if (op_ret <= 0) {
|
||||
if (op_ret < 0) {
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
gf_log (this->name, GF_LOG_ERROR, "read failed on %d "
|
||||
"for %s reason :%s", sh->source,
|
||||
sh_local->loc.path, strerror (errno));
|
||||
@ -624,7 +626,7 @@ sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this,
|
||||
"checksum on %s failed on subvolume %s (%s)",
|
||||
sh_local->loc.path, priv->children[child_index]->name,
|
||||
strerror (op_errno));
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
} else {
|
||||
memcpy (loop_sh->checksum + child_index * MD5_DIGEST_LENGTH,
|
||||
strong_checksum, MD5_DIGEST_LENGTH);
|
||||
@ -662,7 +664,8 @@ sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this,
|
||||
}
|
||||
UNLOCK (&sh_priv->lock);
|
||||
|
||||
if (write_needed && !is_self_heal_failed (sh)) {
|
||||
if (write_needed &&
|
||||
!is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
|
||||
sh_loop_read (loop_frame, this);
|
||||
} else {
|
||||
sh_loop_return (sh_frame, this, loop_frame,
|
||||
@ -800,7 +803,7 @@ afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this,
|
||||
ret = 0;
|
||||
out:
|
||||
if (ret) {
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
sh_loop_driver_done (sh_frame, this, NULL);
|
||||
}
|
||||
return 0;
|
||||
|
@ -1018,7 +1018,7 @@ afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this)
|
||||
local->loc.path);
|
||||
}
|
||||
|
||||
if (is_self_heal_failed (sh)) {
|
||||
if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
|
||||
sh->completion_cbk (frame, this);
|
||||
} else {
|
||||
gf_log (this->name, GF_LOG_TRACE,
|
||||
@ -1250,7 +1250,7 @@ out:
|
||||
if (ret) {
|
||||
gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, "
|
||||
"reason: %s", local->loc.path, strerror (-ret));
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
}
|
||||
afr_sh_missing_entries_finish (frame, this);
|
||||
}
|
||||
@ -1265,7 +1265,7 @@ afr_sh_create_entry_cbk (call_frame_t *frame, xlator_t *this,
|
||||
local = frame->local;
|
||||
sh = &local->self_heal;
|
||||
if (op_ret < 0)
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_sh_missing_entries_finish (frame, this);
|
||||
return 0;
|
||||
}
|
||||
@ -1386,7 +1386,7 @@ afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this,
|
||||
}
|
||||
return;
|
||||
out:
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_sh_set_error (sh, op_errno);
|
||||
afr_sh_missing_entries_finish (frame, this);
|
||||
return;
|
||||
@ -1470,7 +1470,7 @@ afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child,
|
||||
LOCK (&frame->lock);
|
||||
{
|
||||
afr_sh_set_error (sh, EIO);
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
}
|
||||
UNLOCK (&frame->lock);
|
||||
}
|
||||
@ -1552,7 +1552,7 @@ afr_sh_purge_stale_entries_done (call_frame_t *frame, xlator_t *this)
|
||||
sh = &local->self_heal;
|
||||
priv = this->private;
|
||||
|
||||
if (is_self_heal_failed (sh)) {
|
||||
if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
|
||||
afr_sh_missing_entries_finish (frame, this);
|
||||
} else {
|
||||
if (afr_gfid_missing_count (this->name, sh->fresh_children,
|
||||
@ -1766,7 +1766,7 @@ afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this,
|
||||
priv->child_count, ENOENT);
|
||||
if (fresh_child_enoents == fresh_parent_count) {
|
||||
afr_sh_set_error (sh, ENOENT);
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_sh_purge_entry (frame, this);
|
||||
} else if (!afr_conflicting_iattrs (sh->buf, sh->fresh_children,
|
||||
priv->child_count, local->loc.path,
|
||||
@ -1787,7 +1787,7 @@ afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this,
|
||||
return;
|
||||
|
||||
fail:
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_sh_set_error (sh, op_errno);
|
||||
afr_sh_missing_entries_finish (frame, this);
|
||||
return;
|
||||
@ -1858,7 +1858,7 @@ afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this,
|
||||
|
||||
out:
|
||||
afr_sh_set_error (sh, op_errno);
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_sh_missing_entries_finish (frame, this);
|
||||
return;
|
||||
}
|
||||
@ -1962,7 +1962,7 @@ afr_sh_post_nb_entrylk_missing_entry_sh_cbk (call_frame_t *frame,
|
||||
if (int_lock->lock_op_ret < 0) {
|
||||
gf_log (this->name, GF_LOG_INFO,
|
||||
"Non blocking entrylks failed.");
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_sh_missing_entries_done (frame, this);
|
||||
} else {
|
||||
|
||||
@ -2047,8 +2047,9 @@ afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this)
|
||||
local = frame->local;
|
||||
sh = &local->self_heal;
|
||||
|
||||
sh->afr_set_self_heal_status = afr_set_gfid_or_missing_entry_sh_status;
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED);
|
||||
sh->sh_type_in_action = AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY;
|
||||
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED);
|
||||
|
||||
afr_self_heal_parent_entrylk (frame, this,
|
||||
afr_sh_post_nb_entrylk_missing_entry_sh_cbk);
|
||||
@ -2176,7 +2177,7 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this)
|
||||
|
||||
afr_self_heal_type_str_get (sh, sh_type_str,
|
||||
sizeof(sh_type_str));
|
||||
if (is_self_heal_failed (sh) && !priv->shd.iamshd) {
|
||||
if (is_self_heal_failed (sh, AFR_CHECK_ALL) && !priv->shd.iamshd) {
|
||||
loglevel = GF_LOG_ERROR;
|
||||
} else {
|
||||
loglevel = GF_LOG_DEBUG;
|
||||
@ -2191,7 +2192,7 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this)
|
||||
orig_frame_sh = &orig_frame_local->self_heal;
|
||||
orig_frame_sh->actual_sh_started = _gf_true;
|
||||
sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno,
|
||||
is_self_heal_failed (sh));
|
||||
is_self_heal_failed (sh, AFR_CHECK_ALL));
|
||||
}
|
||||
|
||||
if (sh->background) {
|
||||
@ -2305,6 +2306,8 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode)
|
||||
sh->do_gfid_self_heal = _gf_false;
|
||||
}
|
||||
|
||||
sh->sh_type_in_action = AFR_SELF_HEAL_INVALID;
|
||||
|
||||
FRAME_SU_DO (sh_frame, afr_local_t);
|
||||
if (sh->do_missing_entry_self_heal || sh->do_gfid_self_heal) {
|
||||
afr_self_heal_missing_entries (sh_frame, this);
|
||||
@ -2514,7 +2517,7 @@ out:
|
||||
GF_FREE (erase_xattr);
|
||||
|
||||
if (ret < 0) {
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
finish (frame, this);
|
||||
}
|
||||
|
||||
@ -2522,59 +2525,39 @@ out:
|
||||
}
|
||||
|
||||
void
|
||||
afr_set_data_sh_status (afr_self_heal_t *sh, afr_self_heal_status status)
|
||||
afr_set_self_heal_status(afr_self_heal_t *sh, afr_self_heal_status status)
|
||||
{
|
||||
xlator_t *this = NULL;
|
||||
|
||||
xlator_t *this = NULL;
|
||||
afr_sh_status_for_all_type *sh_status = &(sh->afr_all_sh_status);
|
||||
afr_self_heal_type sh_type_in_action = sh->sh_type_in_action;
|
||||
this = THIS;
|
||||
|
||||
if (sh)
|
||||
sh->afr_all_sh_status.data_self_heal = status;
|
||||
else
|
||||
gf_log_callingfn (this->name, GF_LOG_ERROR,
|
||||
"Null self heal struct");
|
||||
}
|
||||
if (!sh) {
|
||||
gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal"
|
||||
"Structure");
|
||||
goto out;
|
||||
}
|
||||
|
||||
void
|
||||
afr_set_metadata_sh_status (afr_self_heal_t *sh, afr_self_heal_status status)
|
||||
{
|
||||
xlator_t *this = NULL;
|
||||
|
||||
this = THIS;
|
||||
|
||||
if (sh)
|
||||
sh->afr_all_sh_status.metadata_self_heal = status;
|
||||
else
|
||||
gf_log_callingfn (this->name, GF_LOG_ERROR,
|
||||
"Null self heal struct");
|
||||
}
|
||||
|
||||
void
|
||||
afr_set_entry_sh_status (afr_self_heal_t *sh, afr_self_heal_status status)
|
||||
{
|
||||
xlator_t *this = NULL;
|
||||
|
||||
this = THIS;
|
||||
|
||||
if (sh)
|
||||
sh->afr_all_sh_status.entry_self_heal = status;
|
||||
else
|
||||
gf_log_callingfn (this->name, GF_LOG_ERROR,
|
||||
"Null self heal struct");
|
||||
}
|
||||
void
|
||||
afr_set_gfid_or_missing_entry_sh_status (afr_self_heal_t *sh,
|
||||
afr_self_heal_status status)
|
||||
{
|
||||
xlator_t *this = NULL;
|
||||
|
||||
this = THIS;
|
||||
|
||||
if (sh)
|
||||
sh->afr_all_sh_status.gfid_or_missing_entry_self_heal = status;
|
||||
else
|
||||
gf_log_callingfn (this->name, GF_LOG_ERROR,
|
||||
"Null self heal struct");
|
||||
switch (sh_type_in_action) {
|
||||
case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY:
|
||||
sh_status->gfid_or_missing_entry_self_heal = status;
|
||||
break;
|
||||
case AFR_SELF_HEAL_METADATA:
|
||||
sh_status->metadata_self_heal = status;
|
||||
break;
|
||||
case AFR_SELF_HEAL_DATA:
|
||||
sh_status->data_self_heal = status;
|
||||
break;
|
||||
case AFR_SELF_HEAL_ENTRY:
|
||||
sh_status->entry_self_heal = status;
|
||||
break;
|
||||
case AFR_SELF_HEAL_INVALID:
|
||||
gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid"
|
||||
"self heal type in action");
|
||||
break;
|
||||
}
|
||||
out:
|
||||
return;
|
||||
}
|
||||
|
||||
void
|
||||
@ -2585,22 +2568,58 @@ afr_set_local_for_unhealable (afr_local_t *local)
|
||||
sh = &local->self_heal;
|
||||
|
||||
local->unhealable = 1;
|
||||
if (sh->afr_set_self_heal_status)
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
}
|
||||
|
||||
int
|
||||
is_self_heal_failed (afr_self_heal_t *sh)
|
||||
is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type)
|
||||
{
|
||||
afr_sh_status_for_all_type sh_status = sh->afr_all_sh_status;
|
||||
afr_sh_status_for_all_type sh_status = sh->afr_all_sh_status;
|
||||
afr_self_heal_type sh_type_in_action = AFR_SELF_HEAL_INVALID;
|
||||
afr_self_heal_status status = AFR_SELF_HEAL_FAILED;
|
||||
xlator_t *this = NULL;
|
||||
int sh_failed = 0;
|
||||
|
||||
int sh_failed = 0;
|
||||
if ((sh_status.gfid_or_missing_entry_self_heal == AFR_SELF_HEAL_FAILED)
|
||||
|| (sh_status.metadata_self_heal == AFR_SELF_HEAL_FAILED)
|
||||
|| (sh_status.data_self_heal == AFR_SELF_HEAL_FAILED)
|
||||
|| (sh_status.entry_self_heal == AFR_SELF_HEAL_FAILED))
|
||||
sh_failed = 1;
|
||||
this = THIS;
|
||||
|
||||
if (!sh) {
|
||||
gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal "
|
||||
"structure");
|
||||
sh_failed = 1;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (type == AFR_CHECK_ALL) {
|
||||
if ((sh_status.gfid_or_missing_entry_self_heal == AFR_SELF_HEAL_FAILED)
|
||||
|| (sh_status.metadata_self_heal == AFR_SELF_HEAL_FAILED)
|
||||
|| (sh_status.data_self_heal == AFR_SELF_HEAL_FAILED)
|
||||
|| (sh_status.entry_self_heal == AFR_SELF_HEAL_FAILED))
|
||||
sh_failed = 1;
|
||||
} else if (type == AFR_CHECK_SPECIFIC) {
|
||||
sh_type_in_action = sh->sh_type_in_action;
|
||||
switch (sh_type_in_action) {
|
||||
case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY:
|
||||
status = sh_status.gfid_or_missing_entry_self_heal;
|
||||
break;
|
||||
case AFR_SELF_HEAL_METADATA:
|
||||
status = sh_status.metadata_self_heal;
|
||||
break;
|
||||
case AFR_SELF_HEAL_ENTRY:
|
||||
status = sh_status.entry_self_heal;
|
||||
break;
|
||||
case AFR_SELF_HEAL_DATA:
|
||||
status = sh_status.data_self_heal;
|
||||
break;
|
||||
case AFR_SELF_HEAL_INVALID:
|
||||
status = AFR_SELF_HEAL_NOT_ATTEMPTED;
|
||||
break;
|
||||
}
|
||||
if (status == AFR_SELF_HEAL_FAILED)
|
||||
sh_failed = 1;
|
||||
|
||||
}
|
||||
|
||||
out:
|
||||
return sh_failed;
|
||||
}
|
||||
|
||||
|
@ -14,13 +14,6 @@
|
||||
#define FILE_HAS_HOLES(buf) (((buf)->ia_size) > ((buf)->ia_blocks * 512))
|
||||
#define AFR_SH_MIN_PARTICIPANTS 2
|
||||
|
||||
typedef enum {
|
||||
AFR_SELF_HEAL_ENTRY,
|
||||
AFR_SELF_HEAL_METADATA,
|
||||
AFR_SELF_HEAL_DATA,
|
||||
AFR_SELF_HEAL_INVALID = -1,
|
||||
} afr_self_heal_type;
|
||||
|
||||
typedef enum {
|
||||
AFR_LOOKUP_FAIL_CONFLICTS = 1,
|
||||
AFR_LOOKUP_FAIL_MISSING_GFIDS = 2,
|
||||
@ -138,20 +131,10 @@ void
|
||||
afr_set_local_for_unhealable (afr_local_t *local);
|
||||
|
||||
int
|
||||
is_self_heal_failed (afr_self_heal_t *sh);
|
||||
is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type);
|
||||
|
||||
void
|
||||
afr_set_data_sh_status (afr_self_heal_t *sh, afr_self_heal_status status);
|
||||
|
||||
void
|
||||
afr_set_metadata_sh_status (afr_self_heal_t *sh, afr_self_heal_status staus);
|
||||
|
||||
void
|
||||
afr_set_entry_sh_status (afr_self_heal_t *sh, afr_self_heal_status status);
|
||||
|
||||
void
|
||||
afr_set_gfid_or_missing_entry_sh_status (afr_self_heal_t *sh,
|
||||
afr_self_heal_status status);
|
||||
afr_set_self_heal_status (afr_self_heal_t *sh, afr_self_heal_status status);
|
||||
|
||||
void
|
||||
afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t logl);
|
||||
|
@ -335,7 +335,7 @@ afr_sh_data_fail (call_frame_t *frame, xlator_t *this)
|
||||
gf_log (this->name, GF_LOG_DEBUG,
|
||||
"finishing failed data selfheal of %s", local->loc.path);
|
||||
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
if (sh->data_lock_held)
|
||||
afr_sh_data_unlock (frame, this, afr_sh_data_close);
|
||||
else
|
||||
@ -362,13 +362,13 @@ afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie,
|
||||
"log failed on %s for subvol %s, reason: %s",
|
||||
local->loc.path, priv->children[child_index]->name,
|
||||
strerror (op_errno));
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
}
|
||||
|
||||
call_count = afr_frame_return (frame);
|
||||
|
||||
if (call_count == 0) {
|
||||
if (is_self_heal_failed (sh)) {
|
||||
if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
|
||||
if (sh->old_loop_frame)
|
||||
sh_loop_finish (sh->old_loop_frame, this);
|
||||
sh->old_loop_frame = NULL;
|
||||
@ -418,7 +418,7 @@ afr_sh_data_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
||||
priv->children[child_index]->name, strerror (op_errno));
|
||||
LOCK (&frame->lock);
|
||||
{
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
}
|
||||
UNLOCK (&frame->lock);
|
||||
if (sh->old_loop_frame)
|
||||
@ -428,7 +428,7 @@ afr_sh_data_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
||||
|
||||
call_count = afr_frame_return (frame);
|
||||
if (call_count == 0) {
|
||||
if (is_self_heal_failed (sh))
|
||||
if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC))
|
||||
afr_sh_data_fail (frame, this);
|
||||
else
|
||||
afr_sh_data_erase_pending (frame, this);
|
||||
@ -604,7 +604,7 @@ afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
||||
local->loc.path,
|
||||
priv->children[child_index]->name,
|
||||
strerror (op_errno));
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
} else {
|
||||
gf_log (this->name, GF_LOG_DEBUG,
|
||||
"ftruncate of %s on subvolume %s completed",
|
||||
@ -617,7 +617,7 @@ afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
||||
call_count = afr_frame_return (frame);
|
||||
|
||||
if (call_count == 0) {
|
||||
if (is_self_heal_failed (sh))
|
||||
if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC))
|
||||
afr_sh_data_fail (frame, this);
|
||||
else
|
||||
afr_sh_data_sync_prepare (frame, this);
|
||||
@ -718,7 +718,7 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this)
|
||||
|
||||
if (sh->background && sh->unwind && !sh->unwound) {
|
||||
sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno,
|
||||
is_self_heal_failed (sh));
|
||||
is_self_heal_failed (sh, AFR_CHECK_SPECIFIC));
|
||||
sh->unwound = _gf_true;
|
||||
}
|
||||
|
||||
@ -1342,7 +1342,7 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
||||
local->loc.path,
|
||||
priv->children[child_index]->name,
|
||||
strerror (op_errno));
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
} else {
|
||||
gf_log (this->name, GF_LOG_TRACE,
|
||||
"open of %s succeeded on child %s",
|
||||
@ -1355,7 +1355,7 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
||||
call_count = afr_frame_return (frame);
|
||||
|
||||
if (call_count == 0) {
|
||||
if (is_self_heal_failed (sh)) {
|
||||
if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
|
||||
afr_sh_data_fail (frame, this);
|
||||
return 0;
|
||||
}
|
||||
@ -1485,10 +1485,10 @@ afr_self_heal_data (call_frame_t *frame, xlator_t *this)
|
||||
local = frame->local;
|
||||
sh = &local->self_heal;
|
||||
|
||||
sh->afr_set_self_heal_status = afr_set_data_sh_status;
|
||||
sh->sh_type_in_action = AFR_SELF_HEAL_DATA;
|
||||
|
||||
if (afr_can_start_data_self_heal (sh, priv)) {
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED);
|
||||
if (IA_ISREG (sh->type)) {
|
||||
afr_sh_data_open (frame, this);
|
||||
} else {
|
||||
|
@ -162,7 +162,7 @@ afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this)
|
||||
sh = &local->self_heal;
|
||||
|
||||
if (sh->entries_skipped) {
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
goto out;
|
||||
}
|
||||
afr_sh_erase_pending (frame, this, AFR_ENTRY_TRANSACTION,
|
||||
@ -799,7 +799,7 @@ afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this)
|
||||
active_src = next_active_sink (frame, this, sh->active_source);
|
||||
sh->active_source = active_src;
|
||||
|
||||
if (is_self_heal_failed (sh)) {
|
||||
if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -1946,7 +1946,7 @@ afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie,
|
||||
local->loc.path,
|
||||
priv->children[active_src]->name,
|
||||
strerror (op_errno));
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
} else {
|
||||
gf_log (this->name, GF_LOG_TRACE,
|
||||
"readdir of %s on subvolume %s complete",
|
||||
@ -2019,7 +2019,7 @@ afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this)
|
||||
active_src = next_active_source (frame, this, sh->active_source);
|
||||
sh->active_source = active_src;
|
||||
|
||||
if (is_self_heal_failed (sh)) {
|
||||
if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
|
||||
afr_sh_entry_finish (frame, this);
|
||||
return 0;
|
||||
}
|
||||
@ -2068,7 +2068,7 @@ afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
||||
local->loc.path,
|
||||
priv->children[child_index]->name,
|
||||
strerror (op_errno));
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
}
|
||||
}
|
||||
UNLOCK (&frame->lock);
|
||||
@ -2076,7 +2076,7 @@ afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
||||
call_count = afr_frame_return (frame);
|
||||
|
||||
if (call_count == 0) {
|
||||
if (is_self_heal_failed (sh)) {
|
||||
if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) {
|
||||
afr_sh_entry_finish (frame, this);
|
||||
return 0;
|
||||
}
|
||||
@ -2231,7 +2231,7 @@ afr_sh_entry_fix (call_frame_t *frame, xlator_t *this,
|
||||
priv = this->private;
|
||||
|
||||
if (op_ret < 0) {
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_sh_set_error (sh, op_errno);
|
||||
afr_sh_entry_finish (frame, this);
|
||||
goto out;
|
||||
@ -2294,7 +2294,7 @@ afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this)
|
||||
if (int_lock->lock_op_ret < 0) {
|
||||
gf_log (this->name, GF_LOG_ERROR, "Non Blocking entrylks "
|
||||
"failed for %s.", local->loc.path);
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_sh_entry_done (frame, this);
|
||||
} else {
|
||||
|
||||
@ -2321,9 +2321,10 @@ afr_self_heal_entry (call_frame_t *frame, xlator_t *this)
|
||||
local = frame->local;
|
||||
sh = &local->self_heal;
|
||||
|
||||
sh->afr_set_self_heal_status = afr_set_entry_sh_status;
|
||||
sh->sh_type_in_action = AFR_SELF_HEAL_ENTRY;
|
||||
|
||||
if (local->self_heal.do_entry_self_heal && priv->entry_self_heal) {
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED);
|
||||
afr_sh_entrylk (frame, this, &local->loc, NULL,
|
||||
afr_sh_post_nonblocking_entry_cbk);
|
||||
} else {
|
||||
|
@ -97,7 +97,7 @@ afr_sh_metadata_fail (call_frame_t *frame, xlator_t *this)
|
||||
local = frame->local;
|
||||
sh = &local->self_heal;
|
||||
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_sh_metadata_finish (frame, this);
|
||||
return 0;
|
||||
}
|
||||
@ -461,7 +461,7 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this,
|
||||
priv = this->private;
|
||||
|
||||
if (op_ret < 0) {
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED);
|
||||
afr_sh_set_error (sh, op_errno);
|
||||
afr_sh_metadata_finish (frame, this);
|
||||
goto out;
|
||||
@ -618,10 +618,10 @@ afr_self_heal_metadata (call_frame_t *frame, xlator_t *this)
|
||||
|
||||
local = frame->local;
|
||||
sh = &local->self_heal;
|
||||
sh->afr_set_self_heal_status = afr_set_metadata_sh_status;
|
||||
sh->sh_type_in_action = AFR_SELF_HEAL_METADATA;
|
||||
|
||||
if (afr_can_start_metadata_self_heal (sh, priv)) {
|
||||
sh->afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED);
|
||||
afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED);
|
||||
afr_sh_metadata_lock (frame, this);
|
||||
} else {
|
||||
afr_sh_metadata_done (frame, this);
|
||||
|
@ -185,6 +185,18 @@ typedef struct {
|
||||
afr_self_heal_status entry_self_heal;
|
||||
} afr_sh_status_for_all_type;
|
||||
|
||||
/*
 * Kind of self-heal. Stored in afr_self_heal_::sh_type_in_action to name
 * the self-heal that is currently being carried out.
 */
typedef enum {
        AFR_SELF_HEAL_INVALID               = -1, /* no type selected */
        AFR_SELF_HEAL_ENTRY                 = 0,
        AFR_SELF_HEAL_METADATA              = 1,
        AFR_SELF_HEAL_DATA                  = 2,
        AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY = 3
} afr_self_heal_type;
|
||||
|
||||
/* Scope of the failure check requested from is_self_heal_failed(). */
typedef enum {
        AFR_CHECK_ALL      = 0, /* any self-heal type marked as failed */
        AFR_CHECK_SPECIFIC = 1  /* only the self-heal type in action */
} afr_sh_fail_check_type;
|
||||
|
||||
struct afr_self_heal_ {
|
||||
/* External interface: These are variables (some optional) that
|
||||
@ -283,9 +295,8 @@ struct afr_self_heal_ {
|
||||
|
||||
afr_sh_algo_private_t *private;
|
||||
afr_sh_status_for_all_type afr_all_sh_status;
|
||||
afr_self_heal_type sh_type_in_action;
|
||||
|
||||
void (*afr_set_self_heal_status) (struct afr_self_heal_ *sh,
|
||||
afr_self_heal_status status);
|
||||
struct afr_sh_algorithm *algo;
|
||||
afr_lock_cbk_t data_lock_success_handler;
|
||||
afr_lock_cbk_t data_lock_failure_handler;
|
||||
|
Loading…
x
Reference in New Issue
Block a user