cluster/afr : Prevent inode-evict during split-brain resolution

1) Provided setfattr command to set timeout for split-brain
choice.
2) If split-brain inspection/resolution is being done
from the mount for a file, ref the inode when
split-brain-choice is set.
This inode will be unconditionally unref-ed after timeout
seconds set by the user/default otherwise.
3) Updated the doc and testcase to reflect the changes.

Change-Id: I15c9037dee28855f21e680e7e3632e1f48dba4e1
BUG: 1209104
Signed-off-by: Anuradha <atalur@redhat.com>
Reviewed-on: http://review.gluster.org/10134
Reviewed-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-by: Ravishankar N <ravishankar@redhat.com>
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Tested-by: NetBSD Build System
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
This commit is contained in:
Anuradha 2015-04-30 15:31:13 +05:30 committed by Pranith Kumar Karampuri
parent d68a2dbb3a
commit 6c578c03f0
8 changed files with 316 additions and 58 deletions

View File

@ -426,6 +426,15 @@ Now performing cat operation on the file will again result in input/output error
cat: file1: Input/output error
~~~
The user can access each file for a timeout amount of period every time replica.split-brain-choice is set. This timeout is configurable by user, with a default value of 5 minutes.
### To set split-brain-choice timeout
A setfattr command from the mount allows the user set this timeout, to be specified in minutes.
~~~
# setfattr -n replica.split-brain-choice-timeout -v <timeout-in-minutes> <mount_point/file>
~~~
This is a global timeout, i.e. applicable to all files as long as the mount exists. So, the timeout need not be set each time a file needs to be inspected but for a new mount it will have to be set again for the first time. This option also needs to be set every time there is a client graph switch (_See note #3_).
### Resolving the split-brain
Once the choice for resolving split-brain is made, source brick is supposed to be set for the healing to be done.
This is done using the following command:
@ -446,3 +455,5 @@ NOTE:
~~~
2) The above mentioned process for split-brain resolution from mount will not work on nfs mounts as it doesn't provide xattrs support.
3) Client graph switch occurs when there is a change in the client side translator graph; typically during addition of new translators to the graph on client side and add-brick/remove-brick operations.

View File

@ -167,6 +167,7 @@
#define GF_AFR_HEAL_SBRAIN "glusterfs.heal-sbrain"
#define GF_AFR_SBRAIN_STATUS "replica.split-brain-status"
#define GF_AFR_SBRAIN_CHOICE "replica.split-brain-choice"
#define GF_AFR_SPB_CHOICE_TIMEOUT "replica.split-brain-choice-timeout"
#define GF_AFR_SBRAIN_RESOLVE "replica.split-brain-heal-finalize"
#define GF_GFIDLESS_LOOKUP "gfidless-lookup"

View File

@ -50,6 +50,7 @@ TEST setfattr -n replica.split-brain-choice -v $V0-client-0 $M0/data-split-brain
#Should now be able to read the contents of data-split-brain.txt
EXPECT "brick0_alive" cat $M0/data-split-brain.txt
TEST setfattr -n replica.split-brain-choice-timeout -v 10 $M0/
TEST setfattr -n replica.split-brain-choice -v $V0-client-1 $M0/data-split-brain.txt
#Should now be able to read the contents of data-split-brain.txt

View File

@ -413,6 +413,142 @@ out:
return ret;
}
int
afr_spb_choice_timeout_cancel (xlator_t *this, inode_t *inode)
{
afr_inode_ctx_t *ctx = NULL;
int ret = -1;
if (!inode)
return ret;
LOCK(&inode->lock);
{
__afr_inode_ctx_get (this, inode, &ctx);
if (!ctx) {
gf_log (this->name, GF_LOG_WARNING, "Failed to cancel"
" split-brain choice timer.");
goto out;
}
ctx->spb_choice = -1;
if (ctx->timer) {
gf_timer_call_cancel (this->ctx, ctx->timer);
ctx->timer = NULL;
}
ret = 0;
}
out:
UNLOCK(&inode->lock);
return ret;
}
void
afr_set_split_brain_choice_cbk (void *data)
{
inode_t *inode = data;
xlator_t *this = THIS;
afr_spb_choice_timeout_cancel (this, inode);
inode_unref (inode);
return;
}
int
afr_set_split_brain_choice (int ret, call_frame_t *frame, void *opaque)
{
int op_errno = ENOMEM;
afr_private_t *priv = NULL;
afr_inode_ctx_t *ctx = NULL;
inode_t *inode = NULL;
loc_t *loc = NULL;
xlator_t *this = NULL;
afr_spbc_timeout_t *data = opaque;
struct timespec delta = {0, };
if (ret)
goto out;
frame = data->frame;
loc = data->loc;
this = frame->this;
priv = this->private;
delta.tv_sec = priv->spb_choice_timeout;
delta.tv_nsec = 0;
inode = loc->inode;
if (!inode)
goto out;
if (!(data->d_spb || data->m_spb)) {
gf_log (this->name, GF_LOG_WARNING, "Cannot set "
"replica.split-brain-choice on %s. File is"
" not in data/metadata split-brain.",
uuid_utoa (loc->gfid));
ret = -1;
op_errno = EINVAL;
goto out;
}
LOCK(&inode->lock);
{
ret = __afr_inode_ctx_get (this, inode, &ctx);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Failed to get"
"inode_ctx for %s", loc->name);
goto unlock;
}
ctx->spb_choice = data->spb_child_index;
/* Possible changes in spb-choice :
* -1 to valid : ref and inject timer
*
* valid to valid : cancel timer and inject new one
*
* valid to -1 : cancel timer and unref
*
* -1 to -1 : do not do anything
*/
/* ctx->timer is NULL iff previous value of
* ctx->spb_choice is -1
*/
if (ctx->timer) {
if (ctx->spb_choice == -1) {
gf_timer_call_cancel (this->ctx, ctx->timer);
ctx->timer = NULL;
inode_unref (inode);
goto unlock;
}
goto reset_timer;
} else {
if (ctx->spb_choice == -1)
goto unlock;
}
inode = inode_ref (loc->inode);
goto set_timer;
reset_timer:
gf_timer_call_cancel (this->ctx, ctx->timer);
ctx->timer = NULL;
set_timer:
ctx->timer = gf_timer_call_after (this->ctx, delta,
afr_set_split_brain_choice_cbk,
inode);
}
unlock:
UNLOCK(&inode->lock);
inode_invalidate (inode);
out:
if (data)
GF_FREE (data);
AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL);
return 0;
}
int
afr_accused_fill (xlator_t *this, dict_t *xdata, unsigned char *accused,
@ -3589,6 +3725,7 @@ afr_forget (xlator_t *this, inode_t *inode)
uint64_t ctx_int = 0;
afr_inode_ctx_t *ctx = NULL;
afr_spb_choice_timeout_cancel (this, inode);
inode_ctx_del (inode, this, &ctx_int);
if (!ctx_int)
return 0;
@ -4552,10 +4689,10 @@ out:
}
int
afr_set_split_brain_status (call_frame_t *frame, xlator_t *this,
struct afr_reply *replies,
afr_transaction_type type,
gf_boolean_t *spb)
_afr_is_split_brain (call_frame_t *frame, xlator_t *this,
struct afr_reply *replies,
afr_transaction_type type,
gf_boolean_t *spb)
{
afr_private_t *priv = NULL;
uint64_t *witness = NULL;
@ -4583,6 +4720,37 @@ afr_set_split_brain_status (call_frame_t *frame, xlator_t *this,
return ret;
}
int
afr_is_split_brain (call_frame_t *frame, xlator_t *this, inode_t *inode,
uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb)
{
int ret = -1;
afr_private_t *priv = NULL;
struct afr_reply *replies = NULL;
priv = this->private;
replies = alloca0 (sizeof (*replies) * priv->child_count);
ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies);
if (ret)
goto out;
ret = _afr_is_split_brain (frame, this, replies,
AFR_DATA_TRANSACTION, d_spb);
if (ret)
goto out;
ret = _afr_is_split_brain (frame, this, replies,
AFR_METADATA_TRANSACTION, m_spb);
out:
if (replies) {
afr_replies_wipe (replies, priv->child_count);
replies = NULL;
}
return ret;
}
int
afr_get_split_brain_status (call_frame_t *frame, xlator_t *this, loc_t *loc)
{
@ -4594,7 +4762,6 @@ afr_get_split_brain_status (call_frame_t *frame, xlator_t *this, loc_t *loc)
char *choices = NULL;
char *status = NULL;
dict_t *dict = NULL;
struct afr_reply *replies = NULL;
inode_t *inode = NULL;
afr_private_t *priv = NULL;
xlator_t **children = NULL;
@ -4605,7 +4772,6 @@ afr_get_split_brain_status (call_frame_t *frame, xlator_t *this, loc_t *loc)
inode = afr_inode_find (this, loc->gfid);
if (!inode)
goto out;
replies = alloca0 (sizeof (*replies) * priv->child_count);
/* Calculation for string length :
* (child_count X length of child-name) + strlen (" Choices :")
@ -4615,23 +4781,9 @@ afr_get_split_brain_status (call_frame_t *frame, xlator_t *this, loc_t *loc)
*/
choices = alloca0 (priv->child_count * (256 + strlen ("-client-00,")) +
strlen (" Choices:"));
ret = afr_selfheal_unlocked_discover (frame, inode, loc->gfid, replies);
if (ret) {
op_errno = -ret;
ret = -1;
goto out;
}
ret = afr_set_split_brain_status (frame, this, replies,
AFR_DATA_TRANSACTION, &d_spb);
if (ret) {
op_errno = -ret;
ret = -1;
goto out;
}
ret = afr_set_split_brain_status (frame, this, replies,
AFR_METADATA_TRANSACTION, &m_spb);
ret = afr_is_split_brain (frame, this, inode, loc->gfid, &d_spb,
&m_spb);
if (ret) {
op_errno = -ret;
ret = -1;
@ -4678,8 +4830,6 @@ out:
AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL);
if (dict)
dict_unref (dict);
if (replies)
afr_replies_wipe (replies, priv->child_count);
if (inode)
inode_unref (inode);
return ret;

View File

@ -979,12 +979,7 @@ afr_split_brain_resolve_do (call_frame_t *frame, xlator_t *this, loc_t *loc,
int ret = -1;
int op_errno = EINVAL;
local = AFR_FRAME_INIT (frame, op_errno);
if (!local)
goto out;
local->op = GF_FOP_SETXATTR;
local = frame->local;
local->xdata_req = dict_new ();
if (!local->xdata_req) {
@ -1005,35 +1000,27 @@ afr_split_brain_resolve_do (call_frame_t *frame, xlator_t *this, loc_t *loc,
ret = -1;
goto out;
}
/* set spb choice to -1 whether heal succeeds or not:
* If heal succeeds : spb-choice should be set to -1 as
* it is no longer valid; file is not
* in split-brain anymore.
* If heal doesn't succeed:
* spb-choice should be set to -1
* otherwise reads will be served
* from spb-choice which is misleading.
*/
ret = afr_inode_split_brain_choice_set (loc->inode, this, -1);
if (ret)
gf_log (this->name, GF_LOG_WARNING, "Failed to set"
"split-brain choice to -1");
afr_heal_splitbrain_file (frame, this, loc);
ret = 0;
out:
if (ret < 0)
AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
return 0;
}
int
afr_set_split_brain_choice (call_frame_t *frame, xlator_t *this, loc_t *loc,
int spb_choice)
{
int ret = -1;
int op_errno = ENOMEM;
afr_private_t *priv = NULL;
priv = this->private;
ret = afr_inode_split_brain_choice_set (loc->inode, this, spb_choice);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Failed to set"
"split-brain choice as %s for %s",
priv->children[spb_choice]->name,
loc->name);
}
inode_invalidate (loc->inode);
AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL);
return ret;
}
int
afr_get_split_brain_child_index (xlator_t *this, void *value, size_t len)
{
@ -1055,19 +1042,53 @@ afr_get_split_brain_child_index (xlator_t *this, void *value, size_t len)
return spb_child_index;
}
int
afr_can_set_split_brain_choice (void *opaque)
{
afr_spbc_timeout_t *data = opaque;
call_frame_t *frame = NULL;
xlator_t *this = NULL;
loc_t *loc = NULL;
int ret = -1;
frame = data->frame;
loc = data->loc;
this = frame->this;
ret = afr_is_split_brain (frame, this, loc->inode, loc->gfid,
&data->d_spb, &data->m_spb);
if (ret)
gf_log (this->name, GF_LOG_ERROR, "Failed to determine if %s"
" is in split-brain. "
"Aborting split-brain-choice set.",
uuid_utoa (loc->gfid));
return ret;
}
int
afr_handle_split_brain_commands (xlator_t *this, call_frame_t *frame,
loc_t *loc, dict_t *dict)
{
int len = 0;
void *value = NULL;
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
afr_spbc_timeout_t *data = NULL;
int len = 0;
int spb_child_index = -1;
int ret = -1;
int op_errno = EINVAL;
afr_private_t *priv = NULL;
priv = this->private;
local = AFR_FRAME_INIT (frame, op_errno);
if (!local) {
ret = 1;
goto out;
}
local->op = GF_FOP_SETXATTR;
ret = dict_get_ptr_and_len (dict, GF_AFR_SBRAIN_CHOICE, &value,
&len);
if (value) {
@ -1079,12 +1100,29 @@ afr_handle_split_brain_commands (xlator_t *this, call_frame_t *frame,
spb_child_index = -1;
else {
ret = 1;
op_errno = EINVAL;
goto out;
}
}
afr_set_split_brain_choice (frame, this, loc,
spb_child_index);
data = GF_CALLOC (1, sizeof (*data), gf_afr_mt_spbc_timeout_t);
if (!data) {
ret = 1;
goto out;
}
data->spb_child_index = spb_child_index;
data->frame = frame;
data->loc = loc;
ret = synctask_new (this->ctx->env,
afr_can_set_split_brain_choice,
afr_set_split_brain_choice, NULL, data);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Failed to create"
" synctask. Aborting split-brain choice set"
" for %s", loc->name);
ret = 1;
goto out;
}
ret = 0;
goto out;
}
@ -1111,6 +1149,41 @@ out:
return ret;
}
int
afr_handle_spb_choice_timeout (xlator_t *this, call_frame_t *frame,
dict_t *dict)
{
int ret = -1;
int op_errno = 0;
uint64_t timeout = 0;
afr_private_t *priv = NULL;
priv = this->private;
ret = dict_get_uint64 (dict, GF_AFR_SPB_CHOICE_TIMEOUT, &timeout);
if (!ret) {
priv->spb_choice_timeout = timeout * 60;
AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL);
}
return ret;
}
static int
afr_handle_special_xattr (xlator_t *this, call_frame_t *frame, loc_t *loc,
dict_t *dict)
{
int ret = -1;
ret = afr_handle_split_brain_commands (this, frame, loc, dict);
if (ret == 0)
goto out;
ret = afr_handle_spb_choice_timeout (this, frame, dict);
out:
return ret;
}
int
afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
int32_t flags, dict_t *xdata)
@ -1126,8 +1199,7 @@ afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict,
op_errno, out);
ret = afr_handle_split_brain_commands (this, frame, loc, dict);
ret = afr_handle_special_xattr (this, frame, loc, dict);
if (ret == 0)
return 0;

View File

@ -43,6 +43,7 @@ enum gf_afr_mem_types_ {
gf_afr_mt_pos_data_t,
gf_afr_mt_reply_t,
gf_afr_mt_subvol_healer_t,
gf_afr_mt_spbc_timeout_t,
gf_afr_mt_end
};
#endif

View File

@ -276,6 +276,8 @@ init (xlator_t *this)
GF_OPTION_INIT ("arbiter-count", priv->arbiter_count, uint32, out);
priv->spb_choice_timeout = AFR_DEFAULT_SPB_CHOICE_TIMEOUT;
GF_OPTION_INIT ("afr-dirty-xattr", priv->afr_dirty, str, out);
GF_OPTION_INIT ("metadata-splitbrain-forced-heal",

View File

@ -38,6 +38,7 @@
#define AFR_LOCKEE_COUNT_MAX 3
#define AFR_DOM_COUNT_MAX 3
#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/
#define AFR_DEFAULT_SPB_CHOICE_TIMEOUT 300 /*in seconds*/
#define ARBITER_BRICK_INDEX 2
@ -130,6 +131,7 @@ typedef struct _afr_private {
void *pump_private;
gf_boolean_t use_afr_in_pump;
gf_boolean_t consistent_metadata;
uint64_t spb_choice_timeout;
} afr_private_t;
@ -742,8 +744,17 @@ typedef struct _afr_local {
typedef struct _afr_inode_ctx {
uint64_t read_subvol;
int spb_choice;
gf_timer_t *timer;
} afr_inode_ctx_t;
typedef struct afr_spbc_timeout {
call_frame_t *frame;
gf_boolean_t d_spb;
gf_boolean_t m_spb;
loc_t *loc;
int spb_child_index;
} afr_spbc_timeout_t;
/* did a call fail due to a child failing? */
#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \
((op_errno == ENOTCONN) || \
@ -1046,4 +1057,13 @@ afr_inode_split_brain_choice_get (inode_t *inode, xlator_t *this,
int *spb_choice);
int
afr_get_child_index_from_name (xlator_t *this, char *name);
int
afr_is_split_brain (call_frame_t *frame, xlator_t *this, inode_t *inode,
uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb);
int
afr_spb_choice_timeout_cancel (xlator_t *this, inode_t *inode);
int
afr_set_split_brain_choice (int ret, call_frame_t *frame, void *opaque);
#endif /* __AFR_H__ */