From 69532c141be160b3fea03c1579ae4ac13018dcdf Mon Sep 17 00:00:00 2001 From: Ravishankar N Date: Fri, 31 Aug 2018 10:32:20 +0530 Subject: [PATCH] afr: thin-arbiter read txn changes If both data bricks are up, read subvol will be based on read_subvols. If only one data brick is up: - First qeury the data-brick that is up. If it blames the other brick, allow the reads. - If if doesn't, query the TA to obtain the source of truth. TODO: See if in-memory state can be maintained for read txns (BZ 1624358). updates: bz#1579788 Change-Id: I61eec35592af3a1aaf9f90846d9a358b2e4b2fcc Signed-off-by: Ravishankar N --- tests/basic/afr/ta-read.t | 60 ++++++ tests/thin-arbiter.rc | 5 + xlators/cluster/afr/src/afr-common.c | 22 +++ xlators/cluster/afr/src/afr-read-txn.c | 249 +++++++++++++++++++++++-- xlators/cluster/afr/src/afr.h | 3 + 5 files changed, 320 insertions(+), 19 deletions(-) create mode 100644 tests/basic/afr/ta-read.t diff --git a/tests/basic/afr/ta-read.t b/tests/basic/afr/ta-read.t new file mode 100644 index 000000000..f2b3c38e0 --- /dev/null +++ b/tests/basic/afr/ta-read.t @@ -0,0 +1,60 @@ +#!/bin/bash + +# Test read transaction logic for thin-arbiter. + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +. $(dirname $0)/../../thin-arbiter.rc +cleanup; +TEST ta_create_brick_and_volfile brick0 +TEST ta_create_brick_and_volfile brick1 +TEST ta_create_ta_and_volfile ta +TEST ta_start_brick_process brick0 +TEST ta_start_brick_process brick1 +TEST ta_start_ta_process ta + +TEST ta_create_mount_volfile brick0 brick1 ta +TEST ta_start_mount_process $M0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" ta_up_status $V0 $M0 0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "trusted.afr.patchy-ta-2" ls $B0/ta + +TEST touch $M0/FILE +TEST ls $B0/brick0/FILE +TEST ls $B0/brick1/FILE +TEST ! ls $B0/ta/FILE + +# Kill one brick and write to FILE. +TEST ta_kill_brick brick0 +EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 0 +echo "brick0 down">> $M0/FILE +TEST [ $? -eq 0 ] +EXPECT "000000010000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/brick1/FILE +EXPECT "000000010000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/ta/trusted.afr.patchy-ta-2 + +#Umount and mount to remove cached data. +TEST umount $M0 +TEST ta_start_mount_process $M0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" ta_up_status $V0 $M0 0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1 +# Read must be allowed since good brick is up. +TEST cat $M0/FILE + +# Toggle good and bad data brick processes. +TEST ta_start_brick_process brick0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0 +TEST ta_kill_brick brick1 +EXPECT_WITHIN $PROCESS_DOWN_TIMEOUT "0" afr_child_up_status_meta $M0 $V0-replicate-0 1 +# Read must now fail. +TEST ! cat $M0/FILE + +# Bring all data bricks up, and kill TA. +TEST ta_start_brick_process brick1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1 +TA_PID=$(ta_get_pid_by_brick_name ta) +TEST [ -n $TA_PID ] +TEST ta_kill_brick ta +TA_PID=$(ta_get_pid_by_brick_name ta) +TEST [ -z $TA_PID ] +# Read must now succeed. +TEST cat $M0/FILE +cleanup; diff --git a/tests/thin-arbiter.rc b/tests/thin-arbiter.rc index 8bb888a74..36d11cea6 100644 --- a/tests/thin-arbiter.rc +++ b/tests/thin-arbiter.rc @@ -419,6 +419,11 @@ function ta_kill_brick() kill -9 $p } +function ta_get_pid_by_brick_name() +{ + cat $B0/${1}.pid +} + function ta_up_status() { local v=$1 diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 702973a6e..641485b1e 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -1310,6 +1310,14 @@ afr_inode_refresh_done (call_frame_t *frame, xlator_t *this, int error) goto refresh_done; } + if (priv->thin_arbiter_count && local->is_read_txn && + AFR_COUNT (success_replies, priv->child_count) != + priv->child_count) { + /* We need to query the good bricks and/or thin-arbiter.*/ + error = EINVAL; + goto refresh_done; + } + ret = afr_replies_interpret (frame, this, local->refreshinode, &start_heal); @@ -6977,3 +6985,17 @@ afr_ta_post_op_unlock (xlator_t *this, loc_t *loc) out: return ret; } + +call_frame_t * +afr_ta_frame_create (xlator_t *this) +{ + call_frame_t *frame = NULL; + void *lk_owner = NULL; + + frame = create_frame (this, this->ctx->pool); + if (!frame) + return NULL; + lk_owner = (void *)this; + afr_set_lk_owner (frame, this, lk_owner); + return frame; +} diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c index a66a6660b..945c050da 100644 --- a/xlators/cluster/afr/src/afr-read-txn.c +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -30,6 +30,28 @@ afr_pending_read_decrement (afr_private_t *priv, int child_index) GF_ATOMIC_DEC(priv->pending_reads[child_index]); } +static gf_boolean_t +afr_ta_dict_contains_pending_xattr (dict_t *dict, afr_private_t *priv, + int child) +{ + int *pending = NULL; + int ret = 0; + int i = 0; + + ret = dict_get_ptr (dict, priv->pending_key[child], (void *)&pending); + if (ret == 0) { + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { + /* Not doing a ntoh32(pending) as we just want to check + * if it is non-zero or not. */ + if (pending[i]) { + return _gf_true; + } + } + } + + return _gf_false; +} + void afr_read_txn_wind (call_frame_t *frame, xlator_t *this, int subvol) { @@ -81,21 +103,207 @@ afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this) return 0; } +static int +afr_ta_read_txn_done (int ret, call_frame_t *ta_frame, void *opaque) +{ + STACK_DESTROY(ta_frame->root); + return 0; +} + +static int +afr_ta_read_txn (void *opaque) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + int read_subvol = -1; + int up_child = AFR_CHILD_UNKNOWN; + int possible_bad_child = AFR_CHILD_UNKNOWN; + int ret = 0; + int op_errno = ENOMEM; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct gf_flock flock = {0, }; + dict_t *xdata_req = NULL; + dict_t *xdata_rsp = NULL; + int **pending = NULL; + loc_t loc = {0,}; + + frame = (call_frame_t *)opaque; + this = frame->this; + local = frame->local; + priv = this->private; + + if (local->child_up[AFR_CHILD_ZERO]) { + up_child = AFR_CHILD_ZERO; + possible_bad_child = AFR_CHILD_ONE; + } else if (local->child_up[AFR_CHILD_ONE]) { + up_child = AFR_CHILD_ONE; + possible_bad_child = AFR_CHILD_ZERO; + } + + GF_ASSERT (up_child != AFR_CHILD_UNKNOWN); + + /* Query the up_child to see if it blames the down one. */ + xdata_req = dict_new(); + if (!xdata_req) + goto out; + + pending = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS); + if (!pending) + goto out; + + ret = afr_set_pending_dict (priv, xdata_req, pending); + if (ret < 0) + goto out; + + if (local->fd) { + ret = syncop_fxattrop (priv->children[up_child], local->fd, + GF_XATTROP_ADD_ARRAY, xdata_req, NULL, + &xdata_rsp, NULL); + } else { + ret = syncop_xattrop (priv->children[up_child], &local->loc, + GF_XATTROP_ADD_ARRAY, xdata_req, NULL, + &xdata_rsp, NULL); + } + if (ret || !xdata_rsp) { + gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed xattrop for gfid %s on %s", + uuid_utoa (local->inode->gfid), + priv->children[up_child]->name); + op_errno = -ret; + goto out; + } + + if (afr_ta_dict_contains_pending_xattr (xdata_rsp, priv, + possible_bad_child)) { + read_subvol = up_child; + goto out; + } + dict_unref (xdata_rsp); + /* Query thin-arbiter to see if it blames any data brick. */ + ret = afr_fill_ta_loc (this, &loc); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to populate thin-arbiter loc for: %s.", + loc.name); + goto out; + } + flock.l_type = F_WRLCK;/*start and length are already zero. */ + ret = syncop_inodelk (priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_MODIFY, &loc, F_SETLKW, &flock, + NULL, NULL); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "gfid:%s: Failed to get AFR_TA_DOM_MODIFY lock on %s.", + uuid_utoa (local->inode->gfid), + priv->pending_key[THIN_ARBITER_BRICK_INDEX]); + op_errno = -ret; + goto out; + } + + ret = syncop_xattrop (priv->children[THIN_ARBITER_BRICK_INDEX], &loc, + GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp, + NULL); + if (ret || !xdata_rsp) { + gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "gfid:%s: Failed xattrop on %s.", + uuid_utoa (local->inode->gfid), + priv->pending_key[THIN_ARBITER_BRICK_INDEX]); + op_errno = -ret; + goto unlock; + } + + if (!afr_ta_dict_contains_pending_xattr(xdata_rsp, priv, up_child)) { + read_subvol = up_child; + } else { + gf_msg (this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB, + "Failing read for gfid %s since good brick %s is down", + uuid_utoa (local->inode->gfid), + priv->children[possible_bad_child]->name); + op_errno = EIO; + } + +unlock: + flock.l_type = F_UNLCK; + ret = syncop_inodelk (priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_MODIFY, &loc, F_SETLK, &flock, + NULL, NULL); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "gfid:%s: Failed to unlock AFR_TA_DOM_MODIFY lock on " + "%s.", uuid_utoa (local->inode->gfid), + priv->pending_key[THIN_ARBITER_BRICK_INDEX]); + } +out: + if (xdata_req) + dict_unref(xdata_req); + if (xdata_rsp) + dict_unref(xdata_rsp); + if (pending) + afr_matrix_cleanup (pending, priv->child_count); + loc_wipe (&loc); + + if (read_subvol == -1) { + local->op_ret = -1; + local->op_errno = op_errno; + } + afr_read_txn_wind (frame, this, read_subvol); + return ret; +} + +void +afr_ta_read_txn_synctask (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *ta_frame = NULL; + afr_local_t *local = NULL; + int ret = 0; + + local = frame->local; + ta_frame = afr_ta_frame_create(this); + if (!ta_frame) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto out; + } + ret = synctask_new (this->ctx->env, afr_ta_read_txn, + afr_ta_read_txn_done, ta_frame, frame); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, ENOMEM, + AFR_MSG_THIN_ARB, "Failed to launch " + "afr_ta_read_txn synctask for gfid %s.", + uuid_utoa(local->inode->gfid)); + local->op_ret = -1; + local->op_errno = ENOMEM; + STACK_DESTROY(ta_frame->root); + goto out; + } + return; +out: + afr_read_txn_wind (frame, this, -1); +} + int afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err) { - afr_local_t *local = NULL; - int read_subvol = 0; - inode_t *inode = NULL; - int ret = -1; - int spb_choice = -1; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int read_subvol = -1; + inode_t *inode = NULL; + int ret = -1; + int spb_choice = -1; local = frame->local; inode = local->inode; + priv = this->private; if (err) { - read_subvol = -1; - goto readfn; + if (!priv->thin_arbiter_count) + goto readfn; + if (err != EINVAL) + goto readfn; + /* We need to query the good bricks and/or thin-arbiter.*/ + afr_ta_read_txn_synctask (frame, this); + return 0; } read_subvol = afr_read_subvol_select_by_policy (inode, this, @@ -127,7 +335,6 @@ readfn: return 0; } - int afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol) { @@ -175,7 +382,6 @@ afr_read_txn_wipe (call_frame_t *frame, xlator_t *this) } } - /* afr_read_txn: @@ -207,13 +413,13 @@ int afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode, afr_read_txn_wind_t readfn, afr_transaction_type type) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - unsigned char *data = NULL; - unsigned char *metadata = NULL; - int read_subvol = -1; - int event_generation = 0; - int ret = -1; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + unsigned char *data = NULL; + unsigned char *metadata = NULL; + int read_subvol = -1; + int event_generation = 0; + int ret = -1; priv = this->private; local = frame->local; @@ -225,21 +431,26 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode, local->readfn = readfn; local->inode = inode_ref (inode); local->is_read_txn = _gf_true; + local->transaction.type = type; if (priv->quorum_count && !afr_has_quorum (local->child_up, this)) { local->op_ret = -1; local->op_errno = afr_quorum_errno(priv); - read_subvol = -1; goto read; } if (!afr_is_consistent_io_possible (local, priv, &local->op_errno)) { local->op_ret = -1; - read_subvol = -1; goto read; } - local->transaction.type = type; + if (priv->thin_arbiter_count && + AFR_COUNT (local->child_up, priv->child_count) != + priv->child_count) { + afr_ta_read_txn_synctask (frame, this); + return 0; + } + ret = afr_inode_read_subvol_get (inode, this, data, metadata, &event_generation); if (ret == -1) diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 155dc1d96..f7b636cb3 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -1252,4 +1252,7 @@ afr_ta_post_op_unlock (xlator_t *this, loc_t *loc); gf_boolean_t afr_is_pending_set (xlator_t *this, dict_t *xdata, int type); + +call_frame_t* +afr_ta_frame_create (xlator_t *this); #endif /* __AFR_H__ */