cluster/afr: Handle afr data self-heal failures gracefully
Change-Id: I5f91098111a8ca29982f3b4292e2109e4a36cce1 BUG: 765373 Signed-off-by: Pranith Kumar K <pranithk@gluster.com> Reviewed-on: http://review.gluster.com/2662 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Vijay Bellur <vijay@gluster.com>
This commit is contained in:
parent
243f02a5ef
commit
1f7692fecd
@ -21,6 +21,7 @@
|
||||
#define __AFR_SELF_HEAL_COMMON_H__
|
||||
|
||||
#define FILE_HAS_HOLES(buf) (((buf)->ia_size) > ((buf)->ia_blocks * 512))
|
||||
#define AFR_SH_MIN_PARTICIPANTS 2
|
||||
|
||||
typedef enum {
|
||||
AFR_SELF_HEAL_ENTRY,
|
||||
|
@ -49,6 +49,14 @@
|
||||
#include "afr-self-heal-common.h"
|
||||
#include "afr-self-heal-algorithm.h"
|
||||
|
||||
int
|
||||
afr_sh_data_fail (call_frame_t *frame, xlator_t *this);
|
||||
|
||||
static inline gf_boolean_t
|
||||
afr_sh_data_proceed (unsigned int success_count)
|
||||
{
|
||||
return (success_count >= AFR_SH_MIN_PARTICIPANTS);
|
||||
}
|
||||
|
||||
extern int
|
||||
sh_loop_finish (call_frame_t *loop_frame, xlator_t *this);
|
||||
@ -259,9 +267,14 @@ afr_sh_data_setattr_fstat_cbk (call_frame_t *frame, void *cookie,
|
||||
sh = &local->self_heal;
|
||||
|
||||
GF_ASSERT (sh->source == child_index);
|
||||
if (op_ret != -1)
|
||||
if (op_ret != -1) {
|
||||
sh->buf[child_index] = *buf;
|
||||
afr_sh_data_setattr (frame, this);
|
||||
afr_sh_data_setattr (frame, this);
|
||||
} else {
|
||||
gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set "
|
||||
"time-stamps after self-heal", local->loc.path);
|
||||
afr_sh_data_fail (frame, this);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -360,12 +373,27 @@ afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie,
|
||||
int call_count = 0;
|
||||
afr_local_t *local = NULL;
|
||||
afr_self_heal_t *sh = NULL;
|
||||
afr_private_t *priv = NULL;
|
||||
int32_t child_index = (long) cookie;
|
||||
|
||||
priv = this->private;
|
||||
local = frame->local;
|
||||
sh = &local->self_heal;
|
||||
if (op_ret < 0) {
|
||||
gf_log (this->name, GF_LOG_ERROR, "Erasing of pending change "
|
||||
"log failed on %s for subvol %s, reason: %s",
|
||||
local->loc.path, priv->children[child_index]->name,
|
||||
strerror (op_errno));
|
||||
sh->op_failed = 1;
|
||||
}
|
||||
|
||||
call_count = afr_frame_return (frame);
|
||||
|
||||
if (call_count == 0) {
|
||||
local = frame->local;
|
||||
sh = &local->self_heal;
|
||||
if (sh->op_failed) {
|
||||
afr_sh_data_fail (frame, this);
|
||||
goto out;
|
||||
}
|
||||
if (!IA_ISREG (sh->type)) {
|
||||
afr_sh_data_finish (frame, this);
|
||||
goto out;
|
||||
@ -550,36 +578,44 @@ afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
||||
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
|
||||
struct iatt *postbuf, dict_t *xdata)
|
||||
{
|
||||
afr_private_t * priv = NULL;
|
||||
afr_local_t * local = NULL;
|
||||
int call_count = 0;
|
||||
int child_index = 0;
|
||||
afr_private_t *priv = NULL;
|
||||
afr_local_t *local = NULL;
|
||||
afr_self_heal_t *sh = NULL;
|
||||
|
||||
priv = this->private;
|
||||
priv = this->private;
|
||||
local = frame->local;
|
||||
sh = &local->self_heal;
|
||||
|
||||
child_index = (long) cookie;
|
||||
|
||||
LOCK (&frame->lock);
|
||||
{
|
||||
if (op_ret == -1)
|
||||
gf_log (this->name, GF_LOG_INFO,
|
||||
if (op_ret == -1) {
|
||||
gf_log (this->name, GF_LOG_ERROR,
|
||||
"ftruncate of %s on subvolume %s failed (%s)",
|
||||
local->loc.path,
|
||||
priv->children[child_index]->name,
|
||||
strerror (op_errno));
|
||||
else
|
||||
sh->op_failed = 1;
|
||||
} else {
|
||||
gf_log (this->name, GF_LOG_DEBUG,
|
||||
"ftruncate of %s on subvolume %s completed",
|
||||
local->loc.path,
|
||||
priv->children[child_index]->name);
|
||||
}
|
||||
}
|
||||
UNLOCK (&frame->lock);
|
||||
|
||||
call_count = afr_frame_return (frame);
|
||||
|
||||
if (call_count == 0)
|
||||
afr_sh_data_sync_prepare (frame, this);
|
||||
if (call_count == 0) {
|
||||
if (sh->op_failed)
|
||||
afr_sh_data_fail (frame, this);
|
||||
else
|
||||
afr_sh_data_sync_prepare (frame, this);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -849,6 +885,12 @@ afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie,
|
||||
sh->buf[child_index] = *buf;
|
||||
sh->success_children[sh->success_count] = child_index;
|
||||
sh->success_count++;
|
||||
} else {
|
||||
gf_log (this->name, GF_LOG_ERROR, "%s: fstat failed "
|
||||
"on %s, reason %s", local->loc.path,
|
||||
priv->children[child_index]->name,
|
||||
strerror (op_errno));
|
||||
sh->child_errno[child_index] = op_errno;
|
||||
}
|
||||
}
|
||||
UNLOCK (&frame->lock);
|
||||
@ -859,13 +901,20 @@ afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie,
|
||||
/* Previous versions of glusterfs might have set
|
||||
* the pending data xattrs which need to be erased
|
||||
*/
|
||||
if (!afr_sh_data_proceed (sh->success_count)) {
|
||||
gf_log (this->name, GF_LOG_ERROR, "inspecting metadata "
|
||||
"succeeded on < %d children, aborting "
|
||||
"self-heal for %s", AFR_SH_MIN_PARTICIPANTS,
|
||||
local->loc.path);
|
||||
afr_sh_data_fail (frame, this);
|
||||
goto out;
|
||||
}
|
||||
if (IA_ISREG (buf->ia_type))
|
||||
afr_sh_data_fix (frame, this);
|
||||
else
|
||||
afr_sh_data_special_file_fix (frame, this);
|
||||
|
||||
}
|
||||
|
||||
out:
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -876,33 +925,43 @@ afr_sh_data_fstat (call_frame_t *frame, xlator_t *this)
|
||||
afr_self_heal_t *sh = NULL;
|
||||
afr_local_t *local = NULL;
|
||||
afr_private_t *priv = NULL;
|
||||
int call_count = 0;
|
||||
int i = 0;
|
||||
int call_count = 0;
|
||||
int i = 0;
|
||||
int child = 0;
|
||||
int32_t *fstat_children = NULL;
|
||||
|
||||
priv = this->private;
|
||||
local = frame->local;
|
||||
sh = &local->self_heal;
|
||||
|
||||
call_count = afr_up_children_count (local->child_up,
|
||||
priv->child_count);
|
||||
|
||||
fstat_children = memdup (sh->success_children,
|
||||
sizeof (*fstat_children) * priv->child_count);
|
||||
if (!fstat_children) {
|
||||
afr_sh_data_fail (frame, this);
|
||||
goto out;
|
||||
}
|
||||
call_count = sh->success_count;
|
||||
local->call_count = call_count;
|
||||
|
||||
afr_reset_children (sh->success_children, priv->child_count);
|
||||
memset (sh->child_errno, 0,
|
||||
sizeof (*sh->child_errno) * priv->child_count);
|
||||
sh->success_count = 0;
|
||||
for (i = 0; i < priv->child_count; i++) {
|
||||
if (local->child_up[i]) {
|
||||
STACK_WIND_COOKIE (frame, afr_sh_data_fstat_cbk,
|
||||
(void *) (long) i,
|
||||
priv->children[i],
|
||||
priv->children[i]->fops->fstat,
|
||||
sh->healing_fd, NULL);
|
||||
|
||||
if (!--call_count)
|
||||
break;
|
||||
}
|
||||
child = fstat_children[i];
|
||||
if (child == -1)
|
||||
break;
|
||||
STACK_WIND_COOKIE (frame, afr_sh_data_fstat_cbk,
|
||||
(void *) (long) child,
|
||||
priv->children[child],
|
||||
priv->children[child]->fops->fstat,
|
||||
sh->healing_fd, NULL);
|
||||
--call_count;
|
||||
}
|
||||
|
||||
GF_ASSERT (!call_count);
|
||||
out:
|
||||
if (fstat_children)
|
||||
GF_FREE (fstat_children);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -931,6 +990,12 @@ afr_sh_common_fxattrop_resp_handler (call_frame_t *frame, void *cookie,
|
||||
sh->xattr[child_index] = dict_ref (xattr);
|
||||
sh->success_children[sh->success_count] = child_index;
|
||||
sh->success_count++;
|
||||
} else {
|
||||
gf_log (this->name, GF_LOG_ERROR, "fxattrop of %s "
|
||||
"failed on %s, reason %s", local->loc.path,
|
||||
priv->children[child_index]->name,
|
||||
strerror (op_errno));
|
||||
sh->child_errno[child_index] = op_errno;
|
||||
}
|
||||
}
|
||||
UNLOCK (&frame->lock);
|
||||
@ -952,19 +1017,27 @@ afr_post_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie,
|
||||
local = frame->local;
|
||||
sh = &local->self_heal;
|
||||
call_count = afr_frame_return (frame);
|
||||
if (call_count == 0) {
|
||||
(void) afr_build_sources (this, sh->xattr, NULL,
|
||||
sh->pending_matrix,
|
||||
sh->sources, sh->success_children,
|
||||
AFR_DATA_TRANSACTION, NULL, _gf_false);
|
||||
ret = afr_sh_inode_set_read_ctx (sh, this);
|
||||
if (ret)
|
||||
afr_sh_data_fail (frame, this);
|
||||
else
|
||||
afr_sh_set_timestamps (frame, this);
|
||||
}
|
||||
if (call_count)
|
||||
goto out;
|
||||
|
||||
return 0;
|
||||
if (!afr_sh_data_proceed (sh->success_count)) {
|
||||
gf_log (this->name, GF_LOG_ERROR, "%s, inspecting change log "
|
||||
"succeeded on < %d children", local->loc.path,
|
||||
AFR_SH_MIN_PARTICIPANTS);
|
||||
afr_sh_data_fail (frame, this);
|
||||
goto out;
|
||||
}
|
||||
(void) afr_build_sources (this, sh->xattr, NULL,
|
||||
sh->pending_matrix,
|
||||
sh->sources, sh->success_children,
|
||||
AFR_DATA_TRANSACTION, NULL, _gf_false);
|
||||
ret = afr_sh_inode_set_read_ctx (sh, this);
|
||||
if (ret)
|
||||
afr_sh_data_fail (frame, this);
|
||||
else
|
||||
afr_sh_set_timestamps (frame, this);
|
||||
out:
|
||||
return 0;
|
||||
}
|
||||
|
||||
int
|
||||
@ -972,16 +1045,28 @@ afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie,
|
||||
xlator_t *this, int32_t op_ret, int32_t op_errno,
|
||||
dict_t *xattr, dict_t *xdata)
|
||||
{
|
||||
int call_count = -1;
|
||||
int call_count = -1;
|
||||
afr_local_t *local = NULL;
|
||||
afr_self_heal_t *sh = NULL;
|
||||
|
||||
local = frame->local;
|
||||
sh = &local->self_heal;
|
||||
|
||||
afr_sh_common_fxattrop_resp_handler (frame, cookie, this, op_ret,
|
||||
op_errno, xattr);
|
||||
|
||||
call_count = afr_frame_return (frame);
|
||||
if (call_count == 0) {
|
||||
if (!afr_sh_data_proceed (sh->success_count)) {
|
||||
gf_log (this->name, GF_LOG_ERROR, "%s, inspecting "
|
||||
"change log succeeded on < %d children",
|
||||
local->loc.path, AFR_SH_MIN_PARTICIPANTS);
|
||||
afr_sh_data_fail (frame, this);
|
||||
goto out;
|
||||
}
|
||||
afr_sh_data_fstat (frame, this);
|
||||
}
|
||||
|
||||
out:
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1035,6 +1120,8 @@ afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this,
|
||||
|
||||
afr_reset_xattr (sh->xattr, priv->child_count);
|
||||
afr_reset_children (sh->success_children, priv->child_count);
|
||||
memset (sh->child_errno, 0,
|
||||
sizeof (*sh->child_errno) * priv->child_count);
|
||||
sh->success_count = 0;
|
||||
for (i = 0; i < priv->child_count; i++) {
|
||||
if (local->child_up[i]) {
|
||||
@ -1057,8 +1144,7 @@ out:
|
||||
if (ret) {
|
||||
if (zero_pending)
|
||||
GF_FREE (zero_pending);
|
||||
sh->op_failed = 1;
|
||||
afr_sh_data_done (frame, this);
|
||||
afr_sh_data_fail (frame, this);
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -1240,12 +1326,12 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
||||
priv->children[child_index]->name,
|
||||
strerror (op_errno));
|
||||
sh->op_failed = 1;
|
||||
} else {
|
||||
gf_log (this->name, GF_LOG_TRACE,
|
||||
"open of %s succeeded on child %s",
|
||||
local->loc.path,
|
||||
priv->children[child_index]->name);
|
||||
}
|
||||
|
||||
gf_log (this->name, GF_LOG_TRACE,
|
||||
"open of %s succeeded on child %s",
|
||||
local->loc.path,
|
||||
priv->children[child_index]->name);
|
||||
}
|
||||
UNLOCK (&frame->lock);
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user