afr: add new value for read-hash-mode volume option
Updates: #363 This new value (3) will try to wind read requests to the child of AFR having the fewest pending requests in its queue. Change-Id: If6bda2aac9bf7aec3fc39622f78659313c4b6508 Signed-off-by: Ravishankar N <ravishankar@redhat.com>
This commit is contained in:
parent
89577d8b0a
commit
c87bd439ef
56
tests/basic/afr/afr-read-hash-mode.t
Normal file
56
tests/basic/afr/afr-read-hash-mode.t
Normal file
@ -0,0 +1,56 @@
|
||||
#!/bin/bash
|
||||
|
||||
. $(dirname $0)/../../include.rc
|
||||
. $(dirname $0)/../../volume.rc
|
||||
|
||||
cleanup
|
||||
|
||||
# Print how many lines of the incremental profile output for $V0 contain
# READ as a whole word. The test below treats this number as the count of
# bricks that served read fops since the previous profile sample.
reads_brick_count () {
        $CLI volume profile $V0 info incremental \
                | grep -w READ \
                | wc -l
}
|
||||
|
||||
TEST glusterd
|
||||
TEST pidof glusterd
|
||||
TEST $CLI volume create $V0 replica 3 arbiter 1 $H0:$B0/${V0}{0..2}
|
||||
|
||||
TEST $CLI volume set $V0 cluster.choose-local off
|
||||
TEST $CLI volume set $V0 performance.quick-read off
|
||||
TEST $CLI volume set $V0 performance.io-cache off
|
||||
TEST $CLI volume set $V0 performance.write-behind off
|
||||
TEST $CLI volume set $V0 performance.stat-prefetch off
|
||||
TEST $CLI volume set $V0 performance.read-ahead off
|
||||
TEST $CLI volume start $V0
|
||||
|
||||
# Disable all caching
|
||||
TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0
|
||||
TEST dd if=/dev/urandom of=$M0/FILE bs=1M count=8
|
||||
EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
|
||||
|
||||
# TEST if the option gives the intended behavior. The way we perform this test
|
||||
# is by performing reads from the mount and write to /dev/null. If the
|
||||
# read-hash-mode is 3, then for a given file, more than 1 brick should serve the
|
||||
# read-fops, whereas with the default read-hash-mode (i.e. 1), only 1 brick will.
|
||||
|
||||
# read-hash-mode=1
|
||||
TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0
|
||||
EXPECT "1" mount_get_option_value $M0 $V0-replicate-0 read-hash-mode
|
||||
TEST $CLI volume profile $V0 start
|
||||
TEST dd if=$M0/FILE of=/dev/null bs=1M
|
||||
count=`reads_brick_count`
|
||||
TEST [ $count -eq 1 ]
|
||||
EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
|
||||
|
||||
# read-hash-mode=3
|
||||
TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0
|
||||
TEST $CLI volume set $V0 cluster.read-hash-mode 3
|
||||
EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "3" mount_get_option_value $M0 $V0-replicate-0 read-hash-mode
|
||||
TEST $CLI volume profile $V0 info clear
|
||||
TEST dd if=$M0/FILE of=/dev/null bs=1M
|
||||
count=`reads_brick_count`
|
||||
TEST [ $count -eq 2 ]
|
||||
|
||||
# Check that the arbiter did not serve any reads
|
||||
arbiter_reads=$($CLI volume top $V0 read brick $H0:$B0/${V0}2|grep FILE|awk '{print $1}')
|
||||
TEST [ -z $arbiter_reads ]
|
||||
|
||||
cleanup;
|
@ -1630,38 +1630,69 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
int
|
||||
afr_least_pending_reads_child (afr_private_t *priv)
|
||||
{
|
||||
int i = 0;
|
||||
int child = 0;
|
||||
int64_t read_iter = -1;
|
||||
int64_t pending_read = -1;
|
||||
|
||||
pending_read = GF_ATOMIC_GET (priv->pending_reads[0]);
|
||||
for (i = 1; i < priv->child_count; i++) {
|
||||
if (AFR_IS_ARBITER_BRICK(priv, i))
|
||||
continue;
|
||||
read_iter = GF_ATOMIC_GET(priv->pending_reads[i]);
|
||||
if (read_iter < pending_read) {
|
||||
pending_read = read_iter;
|
||||
child = i;
|
||||
}
|
||||
}
|
||||
|
||||
return child;
|
||||
}
|
||||
|
||||
int
|
||||
afr_hash_child (afr_read_subvol_args_t *args, int32_t child_count, int hashmode)
|
||||
afr_hash_child (afr_read_subvol_args_t *args, afr_private_t *priv)
|
||||
{
|
||||
uuid_t gfid_copy = {0,};
|
||||
pid_t pid;
|
||||
int child = -1;
|
||||
|
||||
if (!hashmode) {
|
||||
return -1;
|
||||
switch (priv->hash_mode) {
|
||||
case 0:
|
||||
break;
|
||||
case 1:
|
||||
gf_uuid_copy (gfid_copy, args->gfid);
|
||||
child = SuperFastHash((char *)gfid_copy,
|
||||
sizeof(gfid_copy)) % priv->child_count;
|
||||
break;
|
||||
case 2:
|
||||
if (args->ia_type != IA_IFDIR) {
|
||||
/*
|
||||
* Why getpid? Because it's one of the cheapest calls
|
||||
* available - faster than gethostname etc. - and
|
||||
* returns a constant-length value that's sure to be
|
||||
* shorter than a UUID. It's still very unlikely to be
|
||||
* the same across clients, so it still provides good
|
||||
* mixing. We're not trying for perfection here. All we
|
||||
* need is a low probability that multiple clients
|
||||
* won't converge on the same subvolume.
|
||||
*/
|
||||
pid = getpid();
|
||||
memcpy (gfid_copy, &pid, sizeof(pid));
|
||||
}
|
||||
child = SuperFastHash((char *)gfid_copy,
|
||||
sizeof(gfid_copy)) % priv->child_count;
|
||||
break;
|
||||
case 3:
|
||||
child = afr_least_pending_reads_child (priv);
|
||||
break;
|
||||
}
|
||||
|
||||
gf_uuid_copy (gfid_copy, args->gfid);
|
||||
|
||||
if ((hashmode > 1) && (args->ia_type != IA_IFDIR)) {
|
||||
/*
|
||||
* Why getpid? Because it's one of the cheapest calls
|
||||
* available - faster than gethostname etc. - and returns a
|
||||
* constant-length value that's sure to be shorter than a UUID.
|
||||
* It's still very unlikely to be the same across clients, so
|
||||
* it still provides good mixing. We're not trying for
|
||||
* perfection here. All we need is a low probability that
|
||||
* multiple clients won't converge on the same subvolume.
|
||||
*/
|
||||
pid = getpid();
|
||||
memcpy (gfid_copy, &pid, sizeof(pid));
|
||||
}
|
||||
|
||||
return SuperFastHash((char *)gfid_copy,
|
||||
sizeof(gfid_copy)) % child_count;
|
||||
return child;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
|
||||
unsigned char *readable,
|
||||
@ -1686,8 +1717,7 @@ afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
|
||||
}
|
||||
|
||||
/* second preference - use hashed mode */
|
||||
read_subvol = afr_hash_child (&local_args, priv->child_count,
|
||||
priv->hash_mode);
|
||||
read_subvol = afr_hash_child (&local_args, priv);
|
||||
if (read_subvol >= 0 && readable[read_subvol])
|
||||
return read_subvol;
|
||||
|
||||
@ -4611,6 +4641,8 @@ afr_priv_dump (xlator_t *this)
|
||||
gf_proc_dump_write(key, "%d", priv->child_up[i]);
|
||||
sprintf (key, "pending_key[%d]", i);
|
||||
gf_proc_dump_write(key, "%s", priv->pending_key[i]);
|
||||
sprintf (key, "pending_reads[%d]", i);
|
||||
gf_proc_dump_write(key, "%"PRId64, GF_ATOMIC_GET(priv->pending_reads[i]));
|
||||
}
|
||||
gf_proc_dump_write("data_self_heal", "%s", priv->data_self_heal);
|
||||
gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal);
|
||||
@ -4623,6 +4655,7 @@ afr_priv_dump (xlator_t *this)
|
||||
gf_proc_dump_write("background-self-heal-count", "%d",
|
||||
priv->background_self_heal_count);
|
||||
gf_proc_dump_write("healers", "%d", priv->healers);
|
||||
gf_proc_dump_write("read-hash-mode", "%d", priv->hash_mode);
|
||||
if (priv->quorum_count == AFR_QUORUM_AUTO) {
|
||||
gf_proc_dump_write ("quorum-type", "auto");
|
||||
} else if (priv->quorum_count == 0) {
|
||||
@ -5325,6 +5358,8 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
|
||||
goto out;
|
||||
}
|
||||
|
||||
local->read_subvol = -1;
|
||||
|
||||
local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies),
|
||||
gf_afr_mt_reply_t);
|
||||
if (!local->replies) {
|
||||
@ -5474,9 +5509,12 @@ afr_priv_destroy (afr_private_t *priv)
|
||||
for (i = 0; i < priv->child_count; i++)
|
||||
GF_FREE (priv->pending_key[i]);
|
||||
}
|
||||
GF_FREE (priv->pending_reads);
|
||||
GF_FREE (priv->local);
|
||||
GF_FREE (priv->pending_key);
|
||||
GF_FREE (priv->children);
|
||||
GF_FREE (priv->child_up);
|
||||
GF_FREE (priv->child_latency);
|
||||
LOCK_DESTROY (&priv->lock);
|
||||
|
||||
GF_FREE (priv);
|
||||
|
@ -47,6 +47,7 @@ enum gf_afr_mem_types_ {
|
||||
gf_afr_mt_spb_status_t,
|
||||
gf_afr_mt_empty_brick_t,
|
||||
gf_afr_mt_child_latency_t,
|
||||
gf_afr_mt_atomic_t,
|
||||
gf_afr_mt_end
|
||||
};
|
||||
#endif
|
||||
|
@ -12,6 +12,39 @@
|
||||
#include "afr-transaction.h"
|
||||
#include "afr-messages.h"
|
||||
|
||||
void
|
||||
afr_pending_read_increment (afr_private_t *priv, int child_index)
|
||||
{
|
||||
if (child_index < 0 || child_index > priv->child_count)
|
||||
return;
|
||||
|
||||
GF_ATOMIC_INC(priv->pending_reads[child_index]);
|
||||
}
|
||||
|
||||
void
|
||||
afr_pending_read_decrement (afr_private_t *priv, int child_index)
|
||||
{
|
||||
if (child_index < 0 || child_index > priv->child_count)
|
||||
return;
|
||||
|
||||
GF_ATOMIC_DEC(priv->pending_reads[child_index]);
|
||||
}
|
||||
|
||||
void
|
||||
afr_read_txn_wind (call_frame_t *frame, xlator_t *this, int subvol)
|
||||
{
|
||||
afr_local_t *local = NULL;
|
||||
afr_private_t *priv = NULL;
|
||||
|
||||
local = frame->local;
|
||||
priv = this->private;
|
||||
|
||||
afr_pending_read_decrement (priv, local->read_subvol);
|
||||
local->read_subvol = subvol;
|
||||
afr_pending_read_increment (priv, subvol);
|
||||
local->readfn (frame, this, subvol);
|
||||
}
|
||||
|
||||
int
|
||||
afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this)
|
||||
{
|
||||
@ -43,7 +76,7 @@ afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this)
|
||||
readable subvols. */
|
||||
if (subvol != -1)
|
||||
local->read_attempted[subvol] = 1;
|
||||
local->readfn (frame, this, subvol);
|
||||
afr_read_txn_wind (frame, this, subvol);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -89,7 +122,7 @@ readfn:
|
||||
if (read_subvol == -1) {
|
||||
AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN (-1, -err);
|
||||
}
|
||||
local->readfn (frame, this, read_subvol);
|
||||
afr_read_txn_wind (frame, this, read_subvol);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -246,7 +279,7 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
|
||||
local->read_attempted[read_subvol] = 1;
|
||||
|
||||
read:
|
||||
local->readfn (frame, this, read_subvol);
|
||||
afr_read_txn_wind (frame, this, read_subvol);
|
||||
|
||||
return 0;
|
||||
|
||||
|
@ -37,6 +37,12 @@ int afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
|
||||
|
||||
int afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol);
|
||||
|
||||
void
|
||||
afr_pending_read_increment (afr_private_t *priv, int child_index);
|
||||
|
||||
void
|
||||
afr_pending_read_decrement (afr_private_t *priv, int child_index);
|
||||
|
||||
call_frame_t *afr_transaction_detach_fop_frame (call_frame_t *frame);
|
||||
gf_boolean_t afr_has_quorum (unsigned char *subvols, xlator_t *this);
|
||||
gf_boolean_t afr_needs_changelog_update (afr_local_t *local);
|
||||
|
@ -429,6 +429,9 @@ init (xlator_t *this)
|
||||
}
|
||||
GF_OPTION_INIT ("choose-local", priv->choose_local, bool, out);
|
||||
|
||||
priv->pending_reads = GF_CALLOC (sizeof(*priv->pending_reads),
|
||||
priv->child_count, gf_afr_mt_atomic_t);
|
||||
|
||||
GF_OPTION_INIT ("read-hash-mode", priv->hash_mode, uint32, out);
|
||||
|
||||
priv->favorite_child = -1;
|
||||
@ -703,18 +706,19 @@ struct volume_options options[] = {
|
||||
{ .key = {"read-hash-mode" },
|
||||
.type = GF_OPTION_TYPE_INT,
|
||||
.min = 0,
|
||||
.max = 2,
|
||||
.max = 3,
|
||||
.default_value = "1",
|
||||
.op_version = {2},
|
||||
.flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
|
||||
.tags = {"replicate"},
|
||||
.description = "inode-read fops happen only on one of the bricks in "
|
||||
"replicate. AFR will prefer the one computed using "
|
||||
"the method specified using this option"
|
||||
"0 = first up server, "
|
||||
"the method specified using this option.\n"
|
||||
"0 = first readable child of AFR, starting from 1st child.\n"
|
||||
"1 = hash by GFID of file (all clients use "
|
||||
"same subvolume), "
|
||||
"2 = hash by GFID of file and client PID",
|
||||
"same subvolume).\n"
|
||||
"2 = hash by GFID of file and client PID.\n"
|
||||
"3 = brick having the least outstanding read requests."
|
||||
},
|
||||
{ .key = {"choose-local" },
|
||||
.type = GF_OPTION_TYPE_BOOL,
|
||||
|
@ -113,6 +113,7 @@ typedef struct _afr_private {
|
||||
gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */
|
||||
int read_child; /* read-subvolume */
|
||||
unsigned int hash_mode; /* for when read_child is not set */
|
||||
gf_atomic_t *pending_reads; /*No. of pending read cbks per child.*/
|
||||
int favorite_child; /* subvolume to be preferred in resolving
|
||||
split-brain cases */
|
||||
|
||||
@ -425,6 +426,8 @@ typedef struct _afr_local {
|
||||
unsigned char *readable;
|
||||
unsigned char *readable2; /*For rename transaction*/
|
||||
|
||||
int read_subvol; /* Current read subvolume */
|
||||
|
||||
afr_inode_refresh_cbk_t refreshfn;
|
||||
|
||||
/* @refreshinode:
|
||||
@ -974,6 +977,8 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);
|
||||
__this = frame->this; \
|
||||
afr_handle_inconsistent_fop (frame, &__op_ret,\
|
||||
&__op_errno);\
|
||||
if (__local && __local->is_read_txn) \
|
||||
afr_pending_read_decrement (__this->private, __local->read_subvol); \
|
||||
frame->local = NULL; \
|
||||
} \
|
||||
\
|
||||
|
Loading…
x
Reference in New Issue
Block a user