afr: thin-arbiter read txn changes

If both data bricks are up, the read subvol is chosen based on read_subvols.

If only one data brick is up:
- First query the data brick that is up. If it blames the other brick,
allow the reads.

- If it doesn't, query the TA to obtain the source of truth.
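
A minimal sketch of that decision flow (hypothetical names; the real logic
lives in afr_ta_read_txn() in the afr-read-txn.c hunk below):

/* Sketch only: read-path decision when exactly one data brick is up.
 * choose_read_child(), up_blames_down and ta_blames_up are hypothetical
 * stand-ins for the xattrop queries done in afr_ta_read_txn(). */
static int
choose_read_child (int up_child, int up_blames_down, int ta_blames_up)
{
        if (up_blames_down)
                return up_child;   /* down brick is stale; reads are safe */
        if (!ta_blames_up)
                return up_child;   /* nobody blames the up brick */
        return -1;                 /* good copy is on the down brick: EIO */
}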

TODO: See if in-memory state can be maintained for read txns (BZ 1624358).

updates: bz#1579788
Change-Id: I61eec35592af3a1aaf9f90846d9a358b2e4b2fcc
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
commit 69532c141b (parent 560bd8671f)
Ravishankar N, 2018-08-31 10:32:20 +05:30
5 changed files, 320 insertions(+), 19 deletions(-)

tests/basic/afr/ta-read.t (new file, 60 lines)

@@ -0,0 +1,60 @@
#!/bin/bash
# Test read transaction logic for thin-arbiter.
. $(dirname $0)/../../include.rc
. $(dirname $0)/../../volume.rc
. $(dirname $0)/../../thin-arbiter.rc
cleanup;
TEST ta_create_brick_and_volfile brick0
TEST ta_create_brick_and_volfile brick1
TEST ta_create_ta_and_volfile ta
TEST ta_start_brick_process brick0
TEST ta_start_brick_process brick1
TEST ta_start_ta_process ta
TEST ta_create_mount_volfile brick0 brick1 ta
TEST ta_start_mount_process $M0
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" ta_up_status $V0 $M0 0
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "trusted.afr.patchy-ta-2" ls $B0/ta
TEST touch $M0/FILE
TEST ls $B0/brick0/FILE
TEST ls $B0/brick1/FILE
TEST ! ls $B0/ta/FILE
# Kill one brick and write to FILE.
TEST ta_kill_brick brick0
EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 0
echo "brick0 down" >> $M0/FILE
TEST [ $? -eq 0 ]
EXPECT "000000010000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/brick1/FILE
EXPECT "000000010000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/ta/trusted.afr.patchy-ta-2
# Unmount and remount to drop cached data.
TEST umount $M0
TEST ta_start_mount_process $M0
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" ta_up_status $V0 $M0 0
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1
# Read must be allowed since good brick is up.
TEST cat $M0/FILE
# Toggle good and bad data brick processes.
TEST ta_start_brick_process brick0
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0
TEST ta_kill_brick brick1
EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 1
# Read must now fail.
TEST ! cat $M0/FILE
# Bring all data bricks up, and kill TA.
TEST ta_start_brick_process brick1
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1
TA_PID=$(ta_get_pid_by_brick_name ta)
TEST [ -n "$TA_PID" ]
TEST ta_kill_brick ta
TA_PID=$(ta_get_pid_by_brick_name ta)
TEST [ -z "$TA_PID" ]
# Read must now succeed.
TEST cat $M0/FILE
cleanup;

tests/thin-arbiter.rc

@@ -419,6 +419,11 @@ function ta_kill_brick()
kill -9 $p
}
function ta_get_pid_by_brick_name()
{
cat $B0/${1}.pid
}
function ta_up_status()
{
local v=$1

xlators/cluster/afr/src/afr-common.c

@@ -1310,6 +1310,14 @@ afr_inode_refresh_done (call_frame_t *frame, xlator_t *this, int error)
goto refresh_done;
}
if (priv->thin_arbiter_count && local->is_read_txn &&
AFR_COUNT (success_replies, priv->child_count) !=
priv->child_count) {
/* We need to query the good bricks and/or thin-arbiter.*/
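/* EINVAL is used as a sentinel here: afr_read_txn_refresh_done()
 * checks for it and launches the thin-arbiter read txn. */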
error = EINVAL;
goto refresh_done;
}
ret = afr_replies_interpret (frame, this, local->refreshinode,
&start_heal);
@@ -6977,3 +6985,17 @@ afr_ta_post_op_unlock (xlator_t *this, loc_t *loc)
out:
return ret;
}
call_frame_t *
afr_ta_frame_create (xlator_t *this)
{
call_frame_t *frame = NULL;
void *lk_owner = NULL;
frame = create_frame (this, this->ctx->pool);
if (!frame)
return NULL;
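/* Use this xlator's address as the lk_owner, so every TA frame from
 * this client carries the same lock owner. */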
lk_owner = (void *)this;
afr_set_lk_owner (frame, this, lk_owner);
return frame;
}

xlators/cluster/afr/src/afr-read-txn.c

@@ -30,6 +30,28 @@ afr_pending_read_decrement (afr_private_t *priv, int child_index)
GF_ATOMIC_DEC(priv->pending_reads[child_index]);
}
static gf_boolean_t
afr_ta_dict_contains_pending_xattr (dict_t *dict, afr_private_t *priv,
int child)
{
int *pending = NULL;
int ret = 0;
int i = 0;
ret = dict_get_ptr (dict, priv->pending_key[child], (void *)&pending);
if (ret == 0) {
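/* pending holds AFR_NUM_CHANGE_LOGS (data, metadata, entry)
 * counters; any non-zero value means 'child' is blamed. */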
for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
/* Not doing a ntoh32(pending) as we just want to check
* if it is non-zero or not. */
if (pending[i]) {
return _gf_true;
}
}
}
return _gf_false;
}
void
afr_read_txn_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
@@ -81,21 +103,207 @@ afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this)
return 0;
}
static int
afr_ta_read_txn_done (int ret, call_frame_t *ta_frame, void *opaque)
{
STACK_DESTROY(ta_frame->root);
return 0;
}
static int
afr_ta_read_txn (void *opaque)
{
call_frame_t *frame = NULL;
xlator_t *this = NULL;
int read_subvol = -1;
int up_child = AFR_CHILD_UNKNOWN;
int possible_bad_child = AFR_CHILD_UNKNOWN;
int ret = 0;
int op_errno = ENOMEM;
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
struct gf_flock flock = {0, };
dict_t *xdata_req = NULL;
dict_t *xdata_rsp = NULL;
int **pending = NULL;
loc_t loc = {0,};
frame = (call_frame_t *)opaque;
this = frame->this;
local = frame->local;
priv = this->private;
if (local->child_up[AFR_CHILD_ZERO]) {
up_child = AFR_CHILD_ZERO;
possible_bad_child = AFR_CHILD_ONE;
} else if (local->child_up[AFR_CHILD_ONE]) {
up_child = AFR_CHILD_ONE;
possible_bad_child = AFR_CHILD_ZERO;
}
GF_ASSERT (up_child != AFR_CHILD_UNKNOWN);
/* Query the up_child to see if it blames the down one. */
xdata_req = dict_new();
if (!xdata_req)
goto out;
pending = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS);
if (!pending)
goto out;
ret = afr_set_pending_dict (priv, xdata_req, pending);
if (ret < 0)
goto out;
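/* The pending matrix is all zeroes, so GF_XATTROP_ADD_ARRAY leaves the
 * on-disk values unchanged; the xattrop is effectively an atomic read
 * of the current pending xattrs. */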
if (local->fd) {
ret = syncop_fxattrop (priv->children[up_child], local->fd,
GF_XATTROP_ADD_ARRAY, xdata_req, NULL,
&xdata_rsp, NULL);
} else {
ret = syncop_xattrop (priv->children[up_child], &local->loc,
GF_XATTROP_ADD_ARRAY, xdata_req, NULL,
&xdata_rsp, NULL);
}
if (ret || !xdata_rsp) {
gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
"Failed xattrop for gfid %s on %s",
uuid_utoa (local->inode->gfid),
priv->children[up_child]->name);
op_errno = -ret;
goto out;
}
if (afr_ta_dict_contains_pending_xattr (xdata_rsp, priv,
possible_bad_child)) {
read_subvol = up_child;
goto out;
}
dict_unref (xdata_rsp);
/* Query thin-arbiter to see if it blames any data brick. */
ret = afr_fill_ta_loc (this, &loc);
if (ret) {
gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
"Failed to populate thin-arbiter loc for: %s.",
loc.name);
goto out;
}
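/* Take the TA inodelk in the MODIFY domain so the query below cannot
 * race a concurrent update of the TA's pending xattrs. */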
flock.l_type = F_WRLCK; /* start and length are already zero. */
ret = syncop_inodelk (priv->children[THIN_ARBITER_BRICK_INDEX],
AFR_TA_DOM_MODIFY, &loc, F_SETLKW, &flock,
NULL, NULL);
if (ret) {
gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
"gfid:%s: Failed to get AFR_TA_DOM_MODIFY lock on %s.",
uuid_utoa (local->inode->gfid),
priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
op_errno = -ret;
goto out;
}
ret = syncop_xattrop (priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp,
NULL);
if (ret || !xdata_rsp) {
gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
"gfid:%s: Failed xattrop on %s.",
uuid_utoa (local->inode->gfid),
priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
op_errno = -ret;
goto unlock;
}
if (!afr_ta_dict_contains_pending_xattr(xdata_rsp, priv, up_child)) {
read_subvol = up_child;
} else {
gf_msg (this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB,
"Failing read for gfid %s since good brick %s is down",
uuid_utoa (local->inode->gfid),
priv->children[possible_bad_child]->name);
op_errno = EIO;
}
unlock:
flock.l_type = F_UNLCK;
ret = syncop_inodelk (priv->children[THIN_ARBITER_BRICK_INDEX],
AFR_TA_DOM_MODIFY, &loc, F_SETLK, &flock,
NULL, NULL);
if (ret) {
gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
"gfid:%s: Failed to unlock AFR_TA_DOM_MODIFY lock on "
"%s.", uuid_utoa (local->inode->gfid),
priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
}
out:
if (xdata_req)
dict_unref(xdata_req);
if (xdata_rsp)
dict_unref(xdata_rsp);
if (pending)
afr_matrix_cleanup (pending, priv->child_count);
loc_wipe (&loc);
if (read_subvol == -1) {
local->op_ret = -1;
local->op_errno = op_errno;
}
afr_read_txn_wind (frame, this, read_subvol);
return ret;
}
void
afr_ta_read_txn_synctask (call_frame_t *frame, xlator_t *this)
{
call_frame_t *ta_frame = NULL;
afr_local_t *local = NULL;
int ret = 0;
local = frame->local;
ta_frame = afr_ta_frame_create(this);
if (!ta_frame) {
local->op_ret = -1;
local->op_errno = ENOMEM;
goto out;
}
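/* ta_frame hosts the synctask and is destroyed in
 * afr_ta_read_txn_done(); the original read frame travels as the
 * opaque argument. */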
ret = synctask_new (this->ctx->env, afr_ta_read_txn,
afr_ta_read_txn_done, ta_frame, frame);
if (ret) {
gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
AFR_MSG_THIN_ARB, "Failed to launch "
"afr_ta_read_txn synctask for gfid %s.",
uuid_utoa(local->inode->gfid));
local->op_ret = -1;
local->op_errno = ENOMEM;
STACK_DESTROY(ta_frame->root);
goto out;
}
return;
out:
afr_read_txn_wind (frame, this, -1);
}
int
afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
int read_subvol = -1;
inode_t *inode = NULL;
int ret = -1;
int spb_choice = -1;
local = frame->local;
inode = local->inode;
priv = this->private;
if (err) {
        if (!priv->thin_arbiter_count)
                goto readfn;
        if (err != EINVAL)
                goto readfn;
        /* We need to query the good bricks and/or thin-arbiter.*/
        afr_ta_read_txn_synctask (frame, this);
        return 0;
}
read_subvol = afr_read_subvol_select_by_policy (inode, this,
@@ -127,7 +335,6 @@ readfn:
return 0;
}
int
afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol)
{
@@ -175,7 +382,6 @@ afr_read_txn_wipe (call_frame_t *frame, xlator_t *this)
}
}
/*
afr_read_txn:
@@ -207,13 +413,13 @@ int
afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
afr_read_txn_wind_t readfn, afr_transaction_type type)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
unsigned char *data = NULL;
unsigned char *metadata = NULL;
int read_subvol = -1;
int event_generation = 0;
int ret = -1;
priv = this->private;
local = frame->local;
@@ -225,21 +431,26 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
local->readfn = readfn;
local->inode = inode_ref (inode);
local->is_read_txn = _gf_true;
if (priv->quorum_count && !afr_has_quorum (local->child_up, this)) {
local->op_ret = -1;
local->op_errno = afr_quorum_errno(priv);
read_subvol = -1;
goto read;
}
if (!afr_is_consistent_io_possible (local, priv, &local->op_errno)) {
local->op_ret = -1;
read_subvol = -1;
goto read;
}
local->transaction.type = type;
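/* With a data brick down in a thin-arbiter volume, the read subvol
 * cannot be chosen from cached state; hand off to the TA read txn. */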
if (priv->thin_arbiter_count &&
AFR_COUNT (local->child_up, priv->child_count) !=
priv->child_count) {
afr_ta_read_txn_synctask (frame, this);
return 0;
}
ret = afr_inode_read_subvol_get (inode, this, data, metadata,
&event_generation);
if (ret == -1)

xlators/cluster/afr/src/afr.h

@@ -1252,4 +1252,7 @@ afr_ta_post_op_unlock (xlator_t *this, loc_t *loc);
gf_boolean_t
afr_is_pending_set (xlator_t *this, dict_t *xdata, int type);
call_frame_t*
afr_ta_frame_create (xlator_t *this);
#endif /* __AFR_H__ */