afr: thin-arbiter read txn changes
If both data bricks are up, the read subvol is chosen based on read_subvols.

If only one data brick is up:
- First query the data brick that is up. If it blames the other brick,
  allow the reads.
- If it doesn't, query the TA to obtain the source of truth.

TODO: See if in-memory state can be maintained for read txns (BZ 1624358).

updates: bz#1579788
Change-Id: I61eec35592af3a1aaf9f90846d9a358b2e4b2fcc
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
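The decision flow described above is sketched below for quick reference. This is an illustrative outline only, not code from the patch: the two query_* helpers are hypothetical stand-ins for the syncop_(f)xattrop queries that afr_ta_read_txn() (further down in this diff) performs against the up data brick and the thin-arbiter.

/* Hypothetical sketch of the read-subvol decision when one data brick is down. */
static int query_up_brick_blames (int down_child); /* stand-in: up brick's pending xattr for down_child */
static int query_ta_blames (int child);            /* stand-in: thin-arbiter's pending xattr for child */

static int
afr_ta_pick_read_subvol_sketch (int up_child, int down_child)
{
        if (query_up_brick_blames (down_child))
                return up_child;   /* up brick blames the down one: allow reads */
        if (!query_ta_blames (up_child))
                return up_child;   /* TA does not blame the up brick: allow reads */
        return -1;                 /* up brick may be stale: fail the read (EIO) */
}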
This commit is contained in:
parent 560bd8671f
commit 69532c141b

tests/basic/afr/ta-read.t (new file, 60 lines)
@@ -0,0 +1,60 @@
#!/bin/bash

# Test read transaction logic for thin-arbiter.

. $(dirname $0)/../../include.rc
. $(dirname $0)/../../volume.rc
. $(dirname $0)/../../thin-arbiter.rc
cleanup;

TEST ta_create_brick_and_volfile brick0
TEST ta_create_brick_and_volfile brick1
TEST ta_create_ta_and_volfile ta
TEST ta_start_brick_process brick0
TEST ta_start_brick_process brick1
TEST ta_start_ta_process ta

TEST ta_create_mount_volfile brick0 brick1 ta
TEST ta_start_mount_process $M0
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" ta_up_status $V0 $M0 0
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "trusted.afr.patchy-ta-2" ls $B0/ta

TEST touch $M0/FILE
TEST ls $B0/brick0/FILE
TEST ls $B0/brick1/FILE
TEST ! ls $B0/ta/FILE

# Kill one brick and write to FILE.
TEST ta_kill_brick brick0
EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 0
echo "brick0 down" >> $M0/FILE
TEST [ $? -eq 0 ]
EXPECT "000000010000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/brick1/FILE
EXPECT "000000010000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/ta/trusted.afr.patchy-ta-2

# Unmount and mount again to remove cached data.
TEST umount $M0
TEST ta_start_mount_process $M0
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" ta_up_status $V0 $M0 0
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1
# Read must be allowed since the good brick is up.
TEST cat $M0/FILE

# Toggle good and bad data brick processes.
TEST ta_start_brick_process brick0
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0
TEST ta_kill_brick brick1
EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 1
# Read must now fail.
TEST ! cat $M0/FILE

# Bring all data bricks up, and kill TA.
TEST ta_start_brick_process brick1
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1
TA_PID=$(ta_get_pid_by_brick_name ta)
TEST [ -n "$TA_PID" ]
TEST ta_kill_brick ta
TA_PID=$(ta_get_pid_by_brick_name ta)
TEST [ -z "$TA_PID" ]
# Read must now succeed.
TEST cat $M0/FILE
cleanup;
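A brief usage note, not part of the commit: assuming the standard GlusterFS regression setup (a built source tree with the test prerequisites installed), an individual .t file such as this one is typically run through prove from the source root:

prove -vf tests/basic/afr/ta-read.t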
@@ -419,6 +419,11 @@ function ta_kill_brick()
        kill -9 $p
}

function ta_get_pid_by_brick_name()
{
        cat $B0/${1}.pid
}

function ta_up_status()
{
        local v=$1
@@ -1310,6 +1310,14 @@ afr_inode_refresh_done (call_frame_t *frame, xlator_t *this, int error)
                goto refresh_done;
        }

        if (priv->thin_arbiter_count && local->is_read_txn &&
            AFR_COUNT (success_replies, priv->child_count) !=
            priv->child_count) {
                /* We need to query the good bricks and/or thin-arbiter.*/
                error = EINVAL;
                goto refresh_done;
        }

        ret = afr_replies_interpret (frame, this, local->refreshinode,
                                     &start_heal);
@@ -6977,3 +6985,17 @@ afr_ta_post_op_unlock (xlator_t *this, loc_t *loc)
out:
        return ret;
}

call_frame_t *
afr_ta_frame_create (xlator_t *this)
{
        call_frame_t *frame = NULL;
        void *lk_owner = NULL;

        frame = create_frame (this, this->ctx->pool);
        if (!frame)
                return NULL;
        lk_owner = (void *)this;
        afr_set_lk_owner (frame, this, lk_owner);
        return frame;
}
@@ -30,6 +30,28 @@ afr_pending_read_decrement (afr_private_t *priv, int child_index)
        GF_ATOMIC_DEC(priv->pending_reads[child_index]);
}

static gf_boolean_t
afr_ta_dict_contains_pending_xattr (dict_t *dict, afr_private_t *priv,
                                    int child)
{
        int *pending = NULL;
        int ret = 0;
        int i = 0;

        ret = dict_get_ptr (dict, priv->pending_key[child], (void *)&pending);
        if (ret == 0) {
                for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
                        /* Not doing a ntoh32(pending) as we just want to check
                         * if it is non-zero or not. */
                        if (pending[i]) {
                                return _gf_true;
                        }
                }
        }

        return _gf_false;
}

void
afr_read_txn_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
@@ -81,21 +103,207 @@ afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this)
        return 0;
}

static int
afr_ta_read_txn_done (int ret, call_frame_t *ta_frame, void *opaque)
{
        STACK_DESTROY(ta_frame->root);
        return 0;
}

static int
afr_ta_read_txn (void *opaque)
{
        call_frame_t *frame = NULL;
        xlator_t *this = NULL;
        int read_subvol = -1;
        int up_child = AFR_CHILD_UNKNOWN;
        int possible_bad_child = AFR_CHILD_UNKNOWN;
        int ret = 0;
        int op_errno = ENOMEM;
        afr_local_t *local = NULL;
        afr_private_t *priv = NULL;
        struct gf_flock flock = {0, };
        dict_t *xdata_req = NULL;
        dict_t *xdata_rsp = NULL;
        int **pending = NULL;
        loc_t loc = {0,};

        frame = (call_frame_t *)opaque;
        this = frame->this;
        local = frame->local;
        priv = this->private;

        if (local->child_up[AFR_CHILD_ZERO]) {
                up_child = AFR_CHILD_ZERO;
                possible_bad_child = AFR_CHILD_ONE;
        } else if (local->child_up[AFR_CHILD_ONE]) {
                up_child = AFR_CHILD_ONE;
                possible_bad_child = AFR_CHILD_ZERO;
        }

        GF_ASSERT (up_child != AFR_CHILD_UNKNOWN);

        /* Query the up_child to see if it blames the down one. */
        xdata_req = dict_new();
        if (!xdata_req)
                goto out;

        pending = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS);
        if (!pending)
                goto out;

        ret = afr_set_pending_dict (priv, xdata_req, pending);
        if (ret < 0)
                goto out;

        if (local->fd) {
                ret = syncop_fxattrop (priv->children[up_child], local->fd,
                                       GF_XATTROP_ADD_ARRAY, xdata_req, NULL,
                                       &xdata_rsp, NULL);
        } else {
                ret = syncop_xattrop (priv->children[up_child], &local->loc,
                                      GF_XATTROP_ADD_ARRAY, xdata_req, NULL,
                                      &xdata_rsp, NULL);
        }
        if (ret || !xdata_rsp) {
                gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
                        "Failed xattrop for gfid %s on %s",
                        uuid_utoa (local->inode->gfid),
                        priv->children[up_child]->name);
                op_errno = -ret;
                goto out;
        }

        if (afr_ta_dict_contains_pending_xattr (xdata_rsp, priv,
                                                possible_bad_child)) {
                read_subvol = up_child;
                goto out;
        }
        dict_unref (xdata_rsp);
        /* Query thin-arbiter to see if it blames any data brick. */
        ret = afr_fill_ta_loc (this, &loc);
        if (ret) {
                gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
                        "Failed to populate thin-arbiter loc for: %s.",
                        loc.name);
                goto out;
        }
        flock.l_type = F_WRLCK; /* start and length are already zero. */
        ret = syncop_inodelk (priv->children[THIN_ARBITER_BRICK_INDEX],
                              AFR_TA_DOM_MODIFY, &loc, F_SETLKW, &flock,
                              NULL, NULL);
        if (ret) {
                gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
                        "gfid:%s: Failed to get AFR_TA_DOM_MODIFY lock on %s.",
                        uuid_utoa (local->inode->gfid),
                        priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
                op_errno = -ret;
                goto out;
        }

        ret = syncop_xattrop (priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
                              GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp,
                              NULL);
        if (ret || !xdata_rsp) {
                gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
                        "gfid:%s: Failed xattrop on %s.",
                        uuid_utoa (local->inode->gfid),
                        priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
                op_errno = -ret;
                goto unlock;
        }

        if (!afr_ta_dict_contains_pending_xattr(xdata_rsp, priv, up_child)) {
                read_subvol = up_child;
        } else {
                gf_msg (this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB,
                        "Failing read for gfid %s since good brick %s is down",
                        uuid_utoa (local->inode->gfid),
                        priv->children[possible_bad_child]->name);
                op_errno = EIO;
        }

unlock:
        flock.l_type = F_UNLCK;
        ret = syncop_inodelk (priv->children[THIN_ARBITER_BRICK_INDEX],
                              AFR_TA_DOM_MODIFY, &loc, F_SETLK, &flock,
                              NULL, NULL);
        if (ret) {
                gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
                        "gfid:%s: Failed to unlock AFR_TA_DOM_MODIFY lock on "
                        "%s.", uuid_utoa (local->inode->gfid),
                        priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
        }
out:
        if (xdata_req)
                dict_unref(xdata_req);
        if (xdata_rsp)
                dict_unref(xdata_rsp);
        if (pending)
                afr_matrix_cleanup (pending, priv->child_count);
        loc_wipe (&loc);

        if (read_subvol == -1) {
                local->op_ret = -1;
                local->op_errno = op_errno;
        }
        afr_read_txn_wind (frame, this, read_subvol);
        return ret;
}

void
afr_ta_read_txn_synctask (call_frame_t *frame, xlator_t *this)
{
        call_frame_t *ta_frame = NULL;
        afr_local_t *local = NULL;
        int ret = 0;

        local = frame->local;
        ta_frame = afr_ta_frame_create(this);
        if (!ta_frame) {
                local->op_ret = -1;
                local->op_errno = ENOMEM;
                goto out;
        }
        ret = synctask_new (this->ctx->env, afr_ta_read_txn,
                            afr_ta_read_txn_done, ta_frame, frame);
        if (ret) {
                gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
                        AFR_MSG_THIN_ARB, "Failed to launch "
                        "afr_ta_read_txn synctask for gfid %s.",
                        uuid_utoa(local->inode->gfid));
                local->op_ret = -1;
                local->op_errno = ENOMEM;
                STACK_DESTROY(ta_frame->root);
                goto out;
        }
        return;
out:
        afr_read_txn_wind (frame, this, -1);
}

int
afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
{
        afr_private_t *priv = NULL;
        afr_local_t *local = NULL;
        int read_subvol = -1;
        inode_t *inode = NULL;
        int ret = -1;
        int spb_choice = -1;

        local = frame->local;
        inode = local->inode;
        priv = this->private;

        if (err) {
                if (!priv->thin_arbiter_count)
                        goto readfn;
                if (err != EINVAL)
                        goto readfn;
                /* We need to query the good bricks and/or thin-arbiter.*/
                afr_ta_read_txn_synctask (frame, this);
                return 0;
        }

        read_subvol = afr_read_subvol_select_by_policy (inode, this,
@@ -127,7 +335,6 @@ readfn:
        return 0;
}

int
afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol)
{
@@ -175,7 +382,6 @@ afr_read_txn_wipe (call_frame_t *frame, xlator_t *this)
        }
}

/*
  afr_read_txn:

@@ -207,13 +413,13 @@ int
afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
              afr_read_txn_wind_t readfn, afr_transaction_type type)
{
        afr_local_t *local = NULL;
        afr_private_t *priv = NULL;
        unsigned char *data = NULL;
        unsigned char *metadata = NULL;
        int read_subvol = -1;
        int event_generation = 0;
        int ret = -1;

        priv = this->private;
        local = frame->local;
@@ -225,21 +431,26 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
        local->readfn = readfn;
        local->inode = inode_ref (inode);
        local->is_read_txn = _gf_true;

        if (priv->quorum_count && !afr_has_quorum (local->child_up, this)) {
                local->op_ret = -1;
                local->op_errno = afr_quorum_errno(priv);
                read_subvol = -1;
                goto read;
        }

        if (!afr_is_consistent_io_possible (local, priv, &local->op_errno)) {
                local->op_ret = -1;
                read_subvol = -1;
                goto read;
        }

        local->transaction.type = type;
        if (priv->thin_arbiter_count &&
            AFR_COUNT (local->child_up, priv->child_count) !=
            priv->child_count) {
                afr_ta_read_txn_synctask (frame, this);
                return 0;
        }

        ret = afr_inode_read_subvol_get (inode, this, data, metadata,
                                         &event_generation);
        if (ret == -1)
@@ -1252,4 +1252,7 @@ afr_ta_post_op_unlock (xlator_t *this, loc_t *loc);

gf_boolean_t
afr_is_pending_set (xlator_t *this, dict_t *xdata, int type);

call_frame_t*
afr_ta_frame_create (xlator_t *this);
#endif /* __AFR_H__ */