afr: thin-arbiter 2 domain locking and in-memory state
2 domain locking + xattrop for write-txn failures: -------------------------------------------------- - A post-op wound on TA takes AFR_TA_DOM_NOTIFY range lock and AFR_TA_DOM_MODIFY full lock, does xattrop on TA and releases AFR_TA_DOM_MODIFY lock and stores in-memory which brick is bad. - All further write txn failures are handled based on this in-memory value without querying the TA. - When shd heals the files, it does so by requesting full lock on AFR_TA_DOM_NOTIFY domain. Client uses this as a cue (via upcall), releases AFR_TA_DOM_NOTIFY range lock and invalidates its in-memory notion of which brick is bad. The next write txn failure is wound on TA to again update the in-memory state. - Any incomplete write txns that arrived before the AFR_TA_DOM_NOTIFY upcall release request is received are completed before the lock is released. - Any write txns received after the release request are maintained in a ta_waitq. - After the release is complete, the ta_waitq elements are spliced to a separate queue which is then processed one by one. - For fops that come in parallel when the in-memory bad brick is still unknown, only one is wound to TA on wire. The other ones are maintained in a ta_onwireq which is then processed after we get the response from TA. Change-Id: I32c7b61a61776663601ab0040e2f0767eca1fd64 updates: bz#1579788 Signed-off-by: Ravishankar N <ravishankar@redhat.com> Signed-off-by: Ashish Pandey <aspandey@redhat.com>
This commit is contained in:
parent
aae1c402b7
commit
053b1309dc
@ -32,11 +32,11 @@ EXPECT "000000010000000200000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/bri
|
||||
TEST ! ls $B0/brick0/b.txt
|
||||
TEST ls $B0/brick1/b.txt
|
||||
|
||||
#This is not working as expected
|
||||
#TEST ta_kill_brick ta
|
||||
#TEST touch $M0/c.txt
|
||||
#TEST ! ls $B0/brick0/c.txt
|
||||
#TEST ! ls $B0/brick1/c.txt
|
||||
TEST ta_kill_brick ta
|
||||
# Entry create must fail since only 1 brick is up.
|
||||
TEST ! touch $M0/c.txt
|
||||
TEST ! ls $B0/brick0/c.txt
|
||||
TEST ! ls $B0/brick1/c.txt
|
||||
|
||||
TEST ta_start_brick_process brick0
|
||||
EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0
|
||||
|
@ -4893,6 +4893,13 @@ afr_priv_dump(xlator_t *this)
|
||||
gf_proc_dump_write("quorum-count", "%d", priv->quorum_count);
|
||||
}
|
||||
gf_proc_dump_write("up", "%u", afr_has_quorum(priv->child_up, this));
|
||||
if (priv->thin_arbiter_count) {
|
||||
gf_proc_dump_write("ta_child_up", "%d", priv->ta_child_up);
|
||||
gf_proc_dump_write("ta_bad_child_index", "%d",
|
||||
priv->ta_bad_child_index);
|
||||
gf_proc_dump_write("ta_notify_dom_lock_offset", "%lld",
|
||||
priv->ta_notify_dom_lock_offset);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -4904,14 +4911,19 @@ afr_priv_dump(xlator_t *this)
|
||||
*/
|
||||
|
||||
static int
|
||||
find_child_index(xlator_t *this, xlator_t *child)
|
||||
afr_find_child_index(xlator_t *this, xlator_t *child)
|
||||
{
|
||||
afr_private_t *priv = NULL;
|
||||
int child_count = -1;
|
||||
int i = -1;
|
||||
|
||||
priv = this->private;
|
||||
child_count = priv->child_count;
|
||||
if (priv->thin_arbiter_count) {
|
||||
child_count++;
|
||||
}
|
||||
|
||||
for (i = 0; i < priv->child_count; i++) {
|
||||
for (i = 0; i < child_count; i++) {
|
||||
if ((xlator_t *)child == priv->children[i])
|
||||
break;
|
||||
}
|
||||
@ -5310,6 +5322,103 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,
|
||||
priv->last_event[idx] = *event;
|
||||
}
|
||||
|
||||
void
|
||||
afr_ta_lock_release_synctask(xlator_t *this)
|
||||
{
|
||||
call_frame_t *ta_frame = NULL;
|
||||
int ret = 0;
|
||||
|
||||
ta_frame = afr_ta_frame_create(this);
|
||||
if (!ta_frame) {
|
||||
gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
|
||||
"Failed to create ta_frame");
|
||||
return;
|
||||
}
|
||||
|
||||
ret = synctask_new(this->ctx->env, afr_release_notify_lock_for_ta,
|
||||
afr_ta_lock_release_done, ta_frame, this);
|
||||
if (ret) {
|
||||
STACK_DESTROY(ta_frame->root);
|
||||
gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
|
||||
"Failed to release "
|
||||
"AFR_TA_DOM_NOTIFY lock.");
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
afr_handle_inodelk_contention(xlator_t *this, struct gf_upcall *upcall)
|
||||
{
|
||||
struct gf_upcall_inodelk_contention *lc = NULL;
|
||||
unsigned int inmem_count = 0;
|
||||
unsigned int onwire_count = 0;
|
||||
afr_private_t *priv = this->private;
|
||||
|
||||
lc = upcall->data;
|
||||
|
||||
if (strcmp(lc->domain, AFR_TA_DOM_NOTIFY) != 0)
|
||||
return;
|
||||
|
||||
if (priv->shd.iamshd) {
|
||||
/* shd should ignore AFR_TA_DOM_NOTIFY release requests. */
|
||||
return;
|
||||
}
|
||||
LOCK(&priv->lock);
|
||||
{
|
||||
priv->release_ta_notify_dom_lock = _gf_true;
|
||||
inmem_count = priv->ta_in_mem_txn_count;
|
||||
onwire_count = priv->ta_on_wire_txn_count;
|
||||
}
|
||||
UNLOCK(&priv->lock);
|
||||
if (inmem_count || onwire_count)
|
||||
/* lock release will happen in txn code path after
|
||||
* inflight or on-wire txns are over.*/
|
||||
return;
|
||||
|
||||
afr_ta_lock_release_synctask(this);
|
||||
}
|
||||
|
||||
static void
|
||||
afr_handle_upcall_event(xlator_t *this, struct gf_upcall *upcall)
|
||||
{
|
||||
struct gf_upcall_cache_invalidation *up_ci = NULL;
|
||||
afr_private_t *priv = this->private;
|
||||
inode_t *inode = NULL;
|
||||
inode_table_t *itable = NULL;
|
||||
int i = 0;
|
||||
|
||||
switch (upcall->event_type) {
|
||||
case GF_UPCALL_INODELK_CONTENTION:
|
||||
afr_handle_inodelk_contention(this, upcall);
|
||||
break;
|
||||
case GF_UPCALL_CACHE_INVALIDATION:
|
||||
up_ci = (struct gf_upcall_cache_invalidation *)upcall->data;
|
||||
|
||||
/* Since md-cache will be aggressively filtering
|
||||
* lookups, the stale read issue will be more
|
||||
* pronounced. Hence when a pending xattr is set notify
|
||||
* all the md-cache clients to invalidate the existing
|
||||
* stat cache and send the lookup next time */
|
||||
if (!up_ci->dict)
|
||||
break;
|
||||
for (i = 0; i < priv->child_count; i++) {
|
||||
if (!dict_get(up_ci->dict, priv->pending_key[i]))
|
||||
continue;
|
||||
up_ci->flags |= UP_INVAL_ATTR;
|
||||
itable = ((xlator_t *)this->graph->top)->itable;
|
||||
/*Internal processes may not have itable for
|
||||
*top xlator*/
|
||||
if (itable)
|
||||
inode = inode_find(itable, upcall->gfid);
|
||||
if (inode)
|
||||
afr_inode_need_refresh_set(inode, this);
|
||||
break;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
int32_t
|
||||
afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
|
||||
{
|
||||
@ -5327,10 +5436,6 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
|
||||
dict_t *output = NULL;
|
||||
gf_boolean_t had_quorum = _gf_false;
|
||||
gf_boolean_t has_quorum = _gf_false;
|
||||
struct gf_upcall *up_data = NULL;
|
||||
struct gf_upcall_cache_invalidation *up_ci = NULL;
|
||||
inode_table_t *itable = NULL;
|
||||
inode_t *inode = NULL;
|
||||
int64_t halo_max_latency_msec = 0;
|
||||
int64_t child_latency_msec = -1;
|
||||
|
||||
@ -5358,7 +5463,7 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
|
||||
* subsequent revalidate lookup happens on all the dht's subvolumes
|
||||
* which triggers afr self-heals if any.
|
||||
*/
|
||||
idx = find_child_index(this, child_xlator);
|
||||
idx = afr_find_child_index(this, child_xlator);
|
||||
if (idx < 0) {
|
||||
gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP,
|
||||
"Received child_up from invalid subvolume");
|
||||
@ -5407,6 +5512,10 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (event == GF_EVENT_UPCALL) {
|
||||
afr_handle_upcall_event(this, data);
|
||||
}
|
||||
|
||||
LOCK(&priv->lock);
|
||||
{
|
||||
had_heard_from_all = __get_heard_from_all_status(this);
|
||||
@ -5416,12 +5525,22 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
|
||||
propagate = 1;
|
||||
break;
|
||||
case GF_EVENT_CHILD_UP:
|
||||
if (priv->thin_arbiter_count &&
|
||||
(idx == AFR_CHILD_THIN_ARBITER)) {
|
||||
priv->ta_child_up = 1;
|
||||
break;
|
||||
}
|
||||
__afr_handle_child_up_event(this, child_xlator, idx,
|
||||
child_latency_msec, &event,
|
||||
&call_psh, &up_child);
|
||||
break;
|
||||
|
||||
case GF_EVENT_CHILD_DOWN:
|
||||
if (priv->thin_arbiter_count &&
|
||||
(idx == AFR_CHILD_THIN_ARBITER)) {
|
||||
priv->ta_child_up = 0;
|
||||
break;
|
||||
}
|
||||
__afr_handle_child_down_event(this, child_xlator, idx,
|
||||
child_latency_msec, &event,
|
||||
&call_psh, &up_child);
|
||||
@ -5435,34 +5554,6 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
|
||||
case GF_EVENT_SOME_DESCENDENT_DOWN:
|
||||
priv->last_event[idx] = event;
|
||||
break;
|
||||
case GF_EVENT_UPCALL:
|
||||
up_data = (struct gf_upcall *)data;
|
||||
if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION)
|
||||
break;
|
||||
up_ci = (struct gf_upcall_cache_invalidation *)up_data->data;
|
||||
|
||||
/* Since md-cache will be aggressively filtering
|
||||
* lookups, the stale read issue will be more
|
||||
* pronounced. Hence when a pending xattr is set notify
|
||||
* all the md-cache clients to invalidate the existing
|
||||
* stat cache and send the lookup next time */
|
||||
if (!up_ci->dict)
|
||||
break;
|
||||
for (i = 0; i < priv->child_count; i++) {
|
||||
if (dict_get(up_ci->dict, priv->pending_key[i])) {
|
||||
up_ci->flags |= UP_INVAL_ATTR;
|
||||
itable = ((xlator_t *)this->graph->top)->itable;
|
||||
/*Internal processes may not have itable for top
|
||||
* xlator*/
|
||||
if (itable)
|
||||
inode = inode_find(itable, up_data->gfid);
|
||||
if (inode)
|
||||
afr_inode_need_refresh_set(inode, this);
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
propagate = 1;
|
||||
break;
|
||||
@ -5602,6 +5693,10 @@ afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
|
||||
}
|
||||
|
||||
local->need_full_crawl = _gf_false;
|
||||
if (priv->thin_arbiter_count) {
|
||||
local->ta_child_up = priv->ta_child_up;
|
||||
local->ta_failed_subvol = AFR_CHILD_UNKNOWN;
|
||||
}
|
||||
|
||||
INIT_LIST_HEAD(&local->healer);
|
||||
return 0;
|
||||
@ -5715,6 +5810,8 @@ afr_transaction_local_init(afr_local_t *local, xlator_t *this)
|
||||
ret = 0;
|
||||
INIT_LIST_HEAD(&local->transaction.wait_list);
|
||||
INIT_LIST_HEAD(&local->transaction.owner_list);
|
||||
INIT_LIST_HEAD(&local->ta_waitq);
|
||||
INIT_LIST_HEAD(&local->ta_onwireq);
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
@ -6703,9 +6800,6 @@ afr_ta_is_fop_called_from_synctask(xlator_t *this)
|
||||
int
|
||||
afr_ta_post_op_lock(xlator_t *this, loc_t *loc)
|
||||
{
|
||||
/*Note: At any given time, only one instance of this function must
|
||||
* be in progress.*/
|
||||
|
||||
int ret = 0;
|
||||
uuid_t gfid = {
|
||||
0,
|
||||
@ -6720,6 +6814,11 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc)
|
||||
};
|
||||
int32_t cmd = 0;
|
||||
|
||||
/* Clients must take AFR_TA_DOM_NOTIFY lock only when the previous lock
|
||||
* has been released in afr_notify due to upcall notification from shd.
|
||||
*/
|
||||
GF_ASSERT(priv->ta_notify_dom_lock_offset == 0);
|
||||
|
||||
if (!priv->shd.iamshd)
|
||||
GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));
|
||||
flock1.l_type = F_WRLCK;
|
||||
@ -6731,14 +6830,10 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc)
|
||||
flock1.l_len = 0;
|
||||
} else {
|
||||
cmd = F_SETLK;
|
||||
if (priv->ta_notify_dom_lock_offset) {
|
||||
flock1.l_start = priv->ta_notify_dom_lock_offset;
|
||||
} else {
|
||||
gf_uuid_generate(gfid);
|
||||
flock1.l_start = gfid_to_ino(gfid);
|
||||
if (flock1.l_start < 0)
|
||||
flock1.l_start = -flock1.l_start;
|
||||
}
|
||||
gf_uuid_generate(gfid);
|
||||
flock1.l_start = gfid_to_ino(gfid);
|
||||
if (flock1.l_start < 0)
|
||||
flock1.l_start = -flock1.l_start;
|
||||
flock1.l_len = 1;
|
||||
}
|
||||
ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
|
||||
@ -6764,7 +6859,7 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc)
|
||||
AFR_TA_DOM_MODIFY, loc, F_SETLKW, &flock2, NULL, NULL);
|
||||
if (ret) {
|
||||
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
|
||||
"Failed to get AFR_TA_DOM_MODIFY lock.");
|
||||
"Failed to get AFR_TA_DOM_MODIFY lock on %s.", loc->name);
|
||||
flock1.l_type = F_UNLCK;
|
||||
ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
|
||||
AFR_TA_DOM_NOTIFY, loc, F_SETLK, &flock1, NULL,
|
||||
@ -6829,3 +6924,18 @@ afr_ta_frame_create(xlator_t *this)
|
||||
afr_set_lk_owner(frame, this, lk_owner);
|
||||
return frame;
|
||||
}
|
||||
|
||||
gf_boolean_t
|
||||
afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local)
|
||||
{
|
||||
int data_count = 0;
|
||||
|
||||
data_count = AFR_COUNT(local->child_up, priv->child_count);
|
||||
if (data_count == 2) {
|
||||
return _gf_true;
|
||||
} else if (data_count == 1 && local->ta_child_up) {
|
||||
return _gf_true;
|
||||
}
|
||||
|
||||
return _gf_false;
|
||||
}
|
||||
|
@ -261,6 +261,8 @@ afr_ta_read_txn_synctask(call_frame_t *frame, xlator_t *this)
|
||||
if (!ta_frame) {
|
||||
local->op_ret = -1;
|
||||
local->op_errno = ENOMEM;
|
||||
gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
|
||||
"Failed to create ta_frame");
|
||||
goto out;
|
||||
}
|
||||
ret = synctask_new(this->ctx->env, afr_ta_read_txn, afr_ta_read_txn_done,
|
||||
@ -440,6 +442,12 @@ afr_read_txn(call_frame_t *frame, xlator_t *this, inode_t *inode,
|
||||
goto read;
|
||||
}
|
||||
|
||||
if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) {
|
||||
local->op_ret = -1;
|
||||
local->op_errno = -afr_quorum_errno(priv);
|
||||
goto read;
|
||||
}
|
||||
|
||||
if (priv->thin_arbiter_count &&
|
||||
AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) {
|
||||
afr_ta_read_txn_synctask(frame, this);
|
||||
|
@ -32,7 +32,7 @@ void
|
||||
__afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared);
|
||||
|
||||
void
|
||||
afr_changelog_post_op(call_frame_t *frame, xlator_t *this);
|
||||
afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this);
|
||||
|
||||
int
|
||||
afr_changelog_post_op_safe(call_frame_t *frame, xlator_t *this);
|
||||
@ -53,6 +53,90 @@ afr_changelog_do(call_frame_t *frame, xlator_t *this, dict_t *xattr,
|
||||
afr_changelog_resume_t changelog_resume,
|
||||
afr_xattrop_type_t op);
|
||||
|
||||
static void
|
||||
afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this);
|
||||
|
||||
static int
|
||||
afr_ta_post_op_do(void *opaque);
|
||||
|
||||
static int
|
||||
afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local);
|
||||
|
||||
static int
|
||||
afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this);
|
||||
|
||||
static void
|
||||
afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno);
|
||||
|
||||
static void
|
||||
afr_ta_process_waitq(xlator_t *this)
|
||||
{
|
||||
afr_local_t *entry = NULL;
|
||||
afr_private_t *priv = this->private;
|
||||
struct list_head waitq = {
|
||||
0,
|
||||
};
|
||||
|
||||
INIT_LIST_HEAD(&waitq);
|
||||
LOCK(&priv->lock);
|
||||
list_splice_init(&priv->ta_waitq, &waitq);
|
||||
UNLOCK(&priv->lock);
|
||||
list_for_each_entry(entry, &waitq, ta_waitq)
|
||||
{
|
||||
afr_ta_decide_post_op_state(entry->transaction.frame, this);
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
afr_ta_lock_release_done(int ret, call_frame_t *ta_frame, void *opaque)
|
||||
{
|
||||
afr_ta_process_waitq(ta_frame->this);
|
||||
STACK_DESTROY(ta_frame->root);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int
|
||||
afr_release_notify_lock_for_ta(void *opaque)
|
||||
{
|
||||
xlator_t *this = NULL;
|
||||
afr_private_t *priv = NULL;
|
||||
loc_t loc = {
|
||||
0,
|
||||
};
|
||||
struct gf_flock flock = {
|
||||
0,
|
||||
};
|
||||
int ret = -1;
|
||||
|
||||
this = (xlator_t *)opaque;
|
||||
priv = this->private;
|
||||
ret = afr_fill_ta_loc(this, &loc);
|
||||
if (ret) {
|
||||
gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
|
||||
"Failed to populate loc for thin-arbiter.");
|
||||
goto out;
|
||||
}
|
||||
flock.l_type = F_UNLCK;
|
||||
flock.l_start = priv->ta_notify_dom_lock_offset;
|
||||
flock.l_len = 1;
|
||||
ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
|
||||
AFR_TA_DOM_NOTIFY, &loc, F_SETLK, &flock, NULL, NULL);
|
||||
if (!ret) {
|
||||
LOCK(&priv->lock);
|
||||
priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
|
||||
priv->release_ta_notify_dom_lock = _gf_false;
|
||||
priv->ta_notify_dom_lock_offset = 0;
|
||||
UNLOCK(&priv->lock);
|
||||
} else {
|
||||
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
|
||||
"Failed to unlock AFR_TA_DOM_NOTIFY lock.");
|
||||
}
|
||||
|
||||
out:
|
||||
loc_wipe(&loc);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void
|
||||
afr_zero_fill_stat(afr_local_t *local)
|
||||
{
|
||||
@ -576,18 +660,109 @@ afr_set_pending_dict(afr_private_t *priv, dict_t *xattr, int **pending)
|
||||
|
||||
return ret;
|
||||
}
|
||||
static void
|
||||
afr_ta_dom_lock_check_and_release(afr_local_t *local, xlator_t *this)
|
||||
{
|
||||
afr_private_t *priv = this->private;
|
||||
unsigned int inmem_count = 0;
|
||||
unsigned int onwire_count = 0;
|
||||
gf_boolean_t release = _gf_false;
|
||||
|
||||
/* {{{ pending */
|
||||
LOCK(&priv->lock);
|
||||
{
|
||||
/*Once we get notify lock release upcall notification,
|
||||
if two fop states are non empty/non zero, we will
|
||||
not release lock.
|
||||
1 - If anything in memory txn
|
||||
2 - If anything in onwire or onwireq
|
||||
*/
|
||||
if (local->fop_state == TA_INFO_IN_MEMORY_SUCCESS) {
|
||||
inmem_count = --priv->ta_in_mem_txn_count;
|
||||
} else {
|
||||
inmem_count = priv->ta_in_mem_txn_count;
|
||||
}
|
||||
onwire_count = priv->ta_on_wire_txn_count;
|
||||
release = priv->release_ta_notify_dom_lock;
|
||||
}
|
||||
UNLOCK(&priv->lock);
|
||||
|
||||
if (inmem_count != 0 || release == _gf_false || onwire_count != 0)
|
||||
return;
|
||||
|
||||
afr_ta_lock_release_synctask(this);
|
||||
}
|
||||
|
||||
static void
|
||||
afr_ta_process_onwireq(afr_local_t *local, xlator_t *this)
|
||||
{
|
||||
afr_private_t *priv = this->private;
|
||||
afr_local_t *entry = NULL;
|
||||
int bad_child = AFR_CHILD_UNKNOWN;
|
||||
|
||||
struct list_head onwireq = {
|
||||
0,
|
||||
};
|
||||
INIT_LIST_HEAD(&onwireq);
|
||||
|
||||
LOCK(&priv->lock);
|
||||
{
|
||||
if (--priv->ta_on_wire_txn_count == 0) {
|
||||
UNLOCK(&priv->lock);
|
||||
/*Only one write fop came and after taking notify
|
||||
*lock and before doing xattrop, it has received
|
||||
*lock contention upcall, so this is the only place
|
||||
*to find this out and release the lock*/
|
||||
afr_ta_dom_lock_check_and_release(local, this);
|
||||
return;
|
||||
}
|
||||
bad_child = priv->ta_bad_child_index;
|
||||
if (bad_child == AFR_CHILD_UNKNOWN) {
|
||||
/*The previous on-wire ta_post_op was a failure. Just dequeue
|
||||
*one element to wind on-wire again. */
|
||||
entry = list_entry(priv->ta_onwireq.next, afr_local_t, ta_onwireq);
|
||||
list_del_init(&entry->ta_onwireq);
|
||||
} else {
|
||||
/* Prepare to process all fops based on bad_child_index. */
|
||||
list_splice_init(&priv->ta_onwireq, &onwireq);
|
||||
}
|
||||
}
|
||||
UNLOCK(&priv->lock);
|
||||
|
||||
if (entry) {
|
||||
afr_ta_post_op_synctask(this, entry);
|
||||
return;
|
||||
} else {
|
||||
while (!list_empty(&onwireq)) {
|
||||
entry = list_entry(onwireq.next, afr_local_t, ta_onwireq);
|
||||
list_del_init(&entry->ta_onwireq);
|
||||
LOCK(&priv->lock);
|
||||
--priv->ta_on_wire_txn_count;
|
||||
UNLOCK(&priv->lock);
|
||||
if (entry->ta_failed_subvol == bad_child) {
|
||||
afr_changelog_post_op_do(entry->transaction.frame, this);
|
||||
} else {
|
||||
afr_changelog_post_op_fail(entry->transaction.frame, this, EIO);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this)
|
||||
{
|
||||
afr_local_t *local = NULL;
|
||||
afr_internal_lock_t *int_lock = NULL;
|
||||
afr_private_t *priv = NULL;
|
||||
|
||||
local = frame->local;
|
||||
priv = this->private;
|
||||
int_lock = &local->internal_lock;
|
||||
|
||||
if (priv->thin_arbiter_count) {
|
||||
/*fop should not come here with TA_WAIT_FOR_NOTIFY_LOCK_REL state */
|
||||
afr_ta_dom_lock_check_and_release(frame->local, this);
|
||||
}
|
||||
|
||||
/* Fail the FOP if post-op did not succeed on quorum no. of bricks. */
|
||||
if (!afr_changelog_has_quorum(local, this)) {
|
||||
local->op_ret = -1;
|
||||
@ -605,6 +780,20 @@ afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void
|
||||
afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno)
|
||||
{
|
||||
afr_local_t *local = frame->local;
|
||||
local->op_ret = -1;
|
||||
local->op_errno = op_errno;
|
||||
|
||||
gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_THIN_ARB,
|
||||
"Failing %s for gfid %s. Fop state is:%d", gf_fop_list[local->op],
|
||||
uuid_utoa(local->inode->gfid), local->fop_state);
|
||||
|
||||
afr_changelog_post_op_done(frame, this);
|
||||
}
|
||||
|
||||
unsigned char *
|
||||
afr_locked_nodes_get(afr_transaction_type type, afr_internal_lock_t *int_lock)
|
||||
{
|
||||
@ -983,8 +1172,240 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
int
|
||||
afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this)
|
||||
static int
|
||||
afr_ta_post_op_done(int ret, call_frame_t *frame, void *opaque)
|
||||
{
|
||||
xlator_t *this = NULL;
|
||||
afr_local_t *local = NULL;
|
||||
|
||||
local = (afr_local_t *)opaque;
|
||||
this = frame->this;
|
||||
|
||||
STACK_DESTROY(frame->root);
|
||||
afr_ta_process_onwireq(local, this);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
afr_ta_post_op_do(void *opaque)
|
||||
{
|
||||
afr_local_t *local = NULL;
|
||||
afr_private_t *priv = NULL;
|
||||
xlator_t *this = NULL;
|
||||
call_frame_t *txn_frame = NULL;
|
||||
dict_t *xattr = NULL;
|
||||
int **pending = NULL;
|
||||
int failed_subvol = -1;
|
||||
int success_subvol = -1;
|
||||
loc_t loc = {
|
||||
0,
|
||||
};
|
||||
int idx = 0;
|
||||
int i = 0;
|
||||
int ret = 0;
|
||||
|
||||
local = (afr_local_t *)opaque;
|
||||
txn_frame = local->transaction.frame;
|
||||
this = txn_frame->this;
|
||||
priv = this->private;
|
||||
idx = afr_index_for_transaction_type(local->transaction.type);
|
||||
|
||||
ret = afr_fill_ta_loc(this, &loc);
|
||||
if (ret) {
|
||||
gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
|
||||
"Failed to populate loc for thin-arbiter.");
|
||||
goto out;
|
||||
}
|
||||
|
||||
xattr = dict_new();
|
||||
if (!xattr) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
|
||||
if (!pending) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
for (i = 0; i < priv->child_count; i++) {
|
||||
if (local->transaction.failed_subvols[i]) {
|
||||
pending[i][idx] = hton32(1);
|
||||
failed_subvol = i;
|
||||
} else {
|
||||
success_subvol = i;
|
||||
}
|
||||
}
|
||||
|
||||
ret = afr_set_pending_dict(priv, xattr, pending);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = afr_ta_post_op_lock(this, &loc);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
|
||||
GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL);
|
||||
LOCK(&priv->lock);
|
||||
{
|
||||
if (ret == 0) {
|
||||
priv->ta_bad_child_index = failed_subvol;
|
||||
} else if (ret == -EINVAL) {
|
||||
priv->ta_bad_child_index = success_subvol;
|
||||
}
|
||||
}
|
||||
UNLOCK(&priv->lock);
|
||||
if (ret) {
|
||||
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
|
||||
"Post-op on thin-arbiter id file %s failed for gfid %s.",
|
||||
priv->pending_key[THIN_ARBITER_BRICK_INDEX],
|
||||
uuid_utoa(local->inode->gfid));
|
||||
if (ret == -EINVAL)
|
||||
ret = -EIO; /* TA failed the fop. Return EIO to application. */
|
||||
}
|
||||
|
||||
afr_ta_post_op_unlock(this, &loc);
|
||||
out:
|
||||
if (xattr)
|
||||
dict_unref(xattr);
|
||||
|
||||
if (pending)
|
||||
afr_matrix_cleanup(pending, priv->child_count);
|
||||
|
||||
loc_wipe(&loc);
|
||||
|
||||
if (ret == 0) {
|
||||
/*Mark pending xattrs on the up data brick.*/
|
||||
afr_changelog_post_op_do(local->transaction.frame, this);
|
||||
} else {
|
||||
afr_changelog_post_op_fail(local->transaction.frame, this, -ret);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int
|
||||
afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local)
|
||||
{
|
||||
call_frame_t *ta_frame = NULL;
|
||||
int ret = 0;
|
||||
|
||||
ta_frame = afr_ta_frame_create(this);
|
||||
if (!ta_frame) {
|
||||
gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
|
||||
"Failed to create ta_frame");
|
||||
goto err;
|
||||
}
|
||||
ret = synctask_new(this->ctx->env, afr_ta_post_op_do, afr_ta_post_op_done,
|
||||
ta_frame, local);
|
||||
if (ret) {
|
||||
gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
|
||||
"Failed to launch post-op on thin arbiter for gfid %s",
|
||||
uuid_utoa(local->inode->gfid));
|
||||
STACK_DESTROY(ta_frame->root);
|
||||
goto err;
|
||||
}
|
||||
|
||||
return ret;
|
||||
err:
|
||||
afr_changelog_post_op_fail(local->transaction.frame, this, ENOMEM);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void
|
||||
afr_ta_set_fop_state(afr_private_t *priv, afr_local_t *local,
|
||||
int *on_wire_count)
|
||||
{
|
||||
LOCK(&priv->lock);
|
||||
{
|
||||
if (priv->release_ta_notify_dom_lock == _gf_true) {
|
||||
/* Put the fop in waitq until notify dom lock is released.*/
|
||||
local->fop_state = TA_WAIT_FOR_NOTIFY_LOCK_REL;
|
||||
list_add_tail(&local->ta_waitq, &priv->ta_waitq);
|
||||
} else if (priv->ta_bad_child_index == AFR_CHILD_UNKNOWN) {
|
||||
/* Post-op on thin-arbiter to decide success/failure. */
|
||||
local->fop_state = TA_GET_INFO_FROM_TA_FILE;
|
||||
*on_wire_count = ++priv->ta_on_wire_txn_count;
|
||||
if (*on_wire_count > 1) {
|
||||
/*Avoid sending multiple on-wire post-ops on TA*/
|
||||
list_add_tail(&local->ta_onwireq, &priv->ta_onwireq);
|
||||
}
|
||||
} else if (local->ta_failed_subvol == priv->ta_bad_child_index) {
|
||||
/* Post-op on TA not needed as the fop failed on the in-memory bad
|
||||
* brick. Just mark pending xattrs on the good data brick.*/
|
||||
local->fop_state = TA_INFO_IN_MEMORY_SUCCESS;
|
||||
priv->ta_in_mem_txn_count++;
|
||||
} else {
|
||||
/* Post-op on TA not needed as the fop succeeded only on the
|
||||
* in-memory bad data brick and not the good one. Fail the fop.*/
|
||||
local->fop_state = TA_INFO_IN_MEMORY_FAILED;
|
||||
}
|
||||
}
|
||||
UNLOCK(&priv->lock);
|
||||
}
|
||||
|
||||
static void
|
||||
afr_ta_fill_failed_subvol(afr_private_t *priv, afr_local_t *local)
|
||||
{
|
||||
int i = 0;
|
||||
|
||||
for (i = 0; i < priv->child_count; i++) {
|
||||
if (local->transaction.failed_subvols[i]) {
|
||||
local->ta_failed_subvol = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this)
|
||||
{
|
||||
afr_private_t *priv = NULL;
|
||||
afr_local_t *local = NULL;
|
||||
int on_wire_count = 0;
|
||||
|
||||
priv = this->private;
|
||||
local = frame->local;
|
||||
|
||||
afr_ta_set_fop_state(priv, local, &on_wire_count);
|
||||
|
||||
switch (local->fop_state) {
|
||||
case TA_GET_INFO_FROM_TA_FILE:
|
||||
if (on_wire_count == 1)
|
||||
afr_ta_post_op_synctask(this, local);
|
||||
/*else, fop is queued in ta_onwireq.*/
|
||||
break;
|
||||
case TA_WAIT_FOR_NOTIFY_LOCK_REL:
|
||||
/*Post releasing the notify lock, we will act on this queue*/
|
||||
break;
|
||||
case TA_INFO_IN_MEMORY_SUCCESS:
|
||||
afr_changelog_post_op_do(frame, this);
|
||||
break;
|
||||
case TA_INFO_IN_MEMORY_FAILED:
|
||||
afr_changelog_post_op_fail(frame, this, EIO);
|
||||
break;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
static void
|
||||
afr_handle_failure_using_thin_arbiter(call_frame_t *frame, xlator_t *this)
|
||||
{
|
||||
afr_private_t *priv = this->private;
|
||||
afr_local_t *local = frame->local;
|
||||
|
||||
afr_ta_fill_failed_subvol(priv, local);
|
||||
gf_msg_debug(this->name, 0,
|
||||
"Fop failed on data brick (%s) for gfid=%s. "
|
||||
"ta info needed to decide fop result.",
|
||||
priv->children[local->ta_failed_subvol]->name,
|
||||
uuid_utoa(local->inode->gfid));
|
||||
afr_ta_decide_post_op_state(frame, this);
|
||||
}
|
||||
|
||||
void
|
||||
afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this)
|
||||
{
|
||||
afr_private_t *priv = this->private;
|
||||
afr_local_t *local = NULL;
|
||||
@ -1001,9 +1422,7 @@ afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this)
|
||||
|
||||
xattr = dict_new();
|
||||
if (!xattr) {
|
||||
local->op_ret = -1;
|
||||
local->op_errno = ENOMEM;
|
||||
afr_changelog_post_op_done(frame, this);
|
||||
afr_changelog_post_op_fail(frame, this, ENOMEM);
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -1030,9 +1449,8 @@ afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this)
|
||||
}
|
||||
|
||||
if (local->transaction.in_flight_sb) {
|
||||
local->op_ret = -1;
|
||||
local->op_errno = local->transaction.in_flight_sb_errno;
|
||||
afr_changelog_post_op_done(frame, this);
|
||||
afr_changelog_post_op_fail(frame, this,
|
||||
local->transaction.in_flight_sb_errno);
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -1043,17 +1461,7 @@ afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this)
|
||||
|
||||
ret = afr_set_pending_dict(priv, xattr, local->pending);
|
||||
if (ret < 0) {
|
||||
local->op_ret = -1;
|
||||
local->op_errno = ENOMEM;
|
||||
afr_changelog_post_op_done(frame, this);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = afr_changelog_thin_arbiter_post_op(this, local);
|
||||
if (ret < 0) {
|
||||
local->op_ret = -1;
|
||||
local->op_errno = -ret;
|
||||
afr_changelog_post_op_done(frame, this);
|
||||
afr_changelog_post_op_fail(frame, this, ENOMEM);
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -1066,9 +1474,7 @@ set_dirty:
|
||||
ret = dict_set_static_bin(xattr, AFR_DIRTY, local->dirty,
|
||||
sizeof(int) * AFR_NUM_CHANGE_LOGS);
|
||||
if (ret) {
|
||||
local->op_ret = -1;
|
||||
local->op_errno = ENOMEM;
|
||||
afr_changelog_post_op_done(frame, this);
|
||||
afr_changelog_post_op_fail(frame, this, ENOMEM);
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -1078,6 +1484,32 @@ out:
|
||||
if (xattr)
|
||||
dict_unref(xattr);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
static int
|
||||
afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this)
|
||||
{
|
||||
afr_private_t *priv = NULL;
|
||||
afr_local_t *local = NULL;
|
||||
int failed_count = 0;
|
||||
|
||||
priv = this->private;
|
||||
local = frame->local;
|
||||
|
||||
if (priv->thin_arbiter_count) {
|
||||
failed_count = AFR_COUNT(local->transaction.failed_subvols,
|
||||
priv->child_count);
|
||||
if (failed_count == 1) {
|
||||
afr_handle_failure_using_thin_arbiter(frame, this);
|
||||
return 0;
|
||||
} else {
|
||||
/* Txn either succeeded or failed on both data bricks. Let
|
||||
* post_op_do handle it as the case might be. */
|
||||
}
|
||||
}
|
||||
|
||||
afr_changelog_post_op_do(frame, this);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -2457,6 +2889,11 @@ afr_transaction(call_frame_t *frame, xlator_t *this, afr_transaction_type type)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) {
|
||||
ret = -afr_quorum_errno(priv);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = afr_transaction_local_init(local, this);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
@ -67,4 +67,9 @@ afr_lock(call_frame_t *frame, xlator_t *this);
|
||||
void
|
||||
afr_delayed_changelog_wake_up_cbk(void *data);
|
||||
|
||||
int
|
||||
afr_release_notify_lock_for_ta(void *opaque);
|
||||
|
||||
int
|
||||
afr_ta_lock_release_done(int ret, call_frame_t *ta_frame, void *opaque);
|
||||
#endif /* __TRANSACTION_H__ */
|
||||
|
@ -336,6 +336,22 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
void
|
||||
afr_ta_init(afr_private_t *priv)
|
||||
{
|
||||
priv->thin_arbiter_count = 1;
|
||||
priv->child_count--;
|
||||
priv->ta_child_up = 0;
|
||||
priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
|
||||
priv->ta_notify_dom_lock_offset = 0;
|
||||
priv->ta_in_mem_txn_count = 0;
|
||||
priv->ta_on_wire_txn_count = 0;
|
||||
priv->release_ta_notify_dom_lock = _gf_false;
|
||||
INIT_LIST_HEAD(&priv->ta_waitq);
|
||||
INIT_LIST_HEAD(&priv->ta_onwireq);
|
||||
*priv->ta_gfid = 0;
|
||||
}
|
||||
|
||||
int32_t
|
||||
init(xlator_t *this)
|
||||
{
|
||||
@ -380,11 +396,7 @@ init(xlator_t *this)
|
||||
GF_OPTION_INIT("arbiter-count", priv->arbiter_count, uint32, out);
|
||||
GF_OPTION_INIT("thin-arbiter", thin_arbiter, str, out);
|
||||
if (thin_arbiter && strlen(thin_arbiter) > 0) {
|
||||
priv->thin_arbiter_count = 1;
|
||||
priv->child_count--;
|
||||
priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
|
||||
priv->ta_notify_dom_lock_offset = 0;
|
||||
*priv->ta_gfid = 0;
|
||||
afr_ta_init(priv);
|
||||
}
|
||||
INIT_LIST_HEAD(&priv->healing);
|
||||
INIT_LIST_HEAD(&priv->heal_waiting);
|
||||
|
@ -107,8 +107,20 @@ typedef enum {
|
||||
AFR_CHILD_UNKNOWN = -1,
|
||||
AFR_CHILD_ZERO,
|
||||
AFR_CHILD_ONE,
|
||||
AFR_CHILD_THIN_ARBITER,
|
||||
} afr_child_index;
|
||||
|
||||
typedef enum {
    /* FOP arrived after the notify-domain lock contention upcall and is
     * parked until that lock is released. */
    TA_WAIT_FOR_NOTIFY_LOCK_REL,
    /* Bad-brick info unknown; an on-wire post-op (xattrop) on the TA
     * file must decide it. */
    TA_GET_INFO_FROM_TA_FILE,
    /* In-memory info says the failed brick is the bad one: the fop
     * succeeds. */
    TA_INFO_IN_MEMORY_SUCCESS,
    /* In-memory info says the failed brick is the good one: the fop
     * fails. */
    TA_INFO_IN_MEMORY_FAILED,
} afr_ta_fop_state_t;
|
||||
|
||||
struct afr_nfsd {
|
||||
gf_boolean_t iamnfsd;
|
||||
uint32_t halo_max_latency_msec;
|
||||
@ -127,8 +139,14 @@ typedef struct _afr_private {
|
||||
/* For thin-arbiter. */
|
||||
unsigned int thin_arbiter_count; /* 0 or 1 at the moment.*/
|
||||
uuid_t ta_gfid;
|
||||
unsigned char ta_child_up;
|
||||
int ta_bad_child_index;
|
||||
off_t ta_notify_dom_lock_offset;
|
||||
gf_boolean_t release_ta_notify_dom_lock;
|
||||
unsigned int ta_in_mem_txn_count;
|
||||
unsigned int ta_on_wire_txn_count;
|
||||
struct list_head ta_waitq;
|
||||
struct list_head ta_onwireq;
|
||||
|
||||
unsigned char *child_up;
|
||||
int64_t *child_latency;
|
||||
@ -855,6 +873,13 @@ typedef struct _afr_local {
|
||||
|
||||
gf_boolean_t is_read_txn;
|
||||
afr_inode_ctx_t *inode_ctx;
|
||||
|
||||
/*For thin-arbiter transactions.*/
|
||||
unsigned char ta_child_up;
|
||||
struct list_head ta_waitq;
|
||||
struct list_head ta_onwireq;
|
||||
afr_ta_fop_state_t fop_state;
|
||||
int ta_failed_subvol;
|
||||
} afr_local_t;
|
||||
|
||||
typedef struct afr_spbc_timeout {
|
||||
@ -1289,4 +1314,10 @@ __afr_get_up_children_count(afr_private_t *priv);
|
||||
|
||||
call_frame_t *
|
||||
afr_ta_frame_create(xlator_t *this);
|
||||
|
||||
gf_boolean_t
|
||||
afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local);
|
||||
|
||||
void
|
||||
afr_ta_lock_release_synctask(xlator_t *this);
|
||||
#endif /* __AFR_H__ */
|
||||
|
Loading…
x
Reference in New Issue
Block a user