afr: add new value for read-hash-mode volume option

Updates: #363

This new value (3) will try to wind read requests to the child of AFR
having the fewest pending read requests in its queue.

Change-Id: If6bda2aac9bf7aec3fc39622f78659313c4b6508
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
This commit is contained in:
Ravishankar N 2018-03-22 17:55:15 +05:30
parent 89577d8b0a
commit c87bd439ef
7 changed files with 175 additions and 32 deletions

View File

@ -0,0 +1,56 @@
#!/bin/bash
# Regression test for the cluster.read-hash-mode volume option.
#
# Reads the same file once with read-hash-mode=1 and once with
# read-hash-mode=3 and compares how many bricks report READ fops in the
# volume profile output.
. $(dirname $0)/../../include.rc
. $(dirname $0)/../../volume.rc
cleanup
# Print the number of lines in the incremental profile output that contain
# the word READ. The test treats this as the number of bricks that served
# READ fops since the previous profile sample.
function reads_brick_count {
$CLI volume profile $V0 info incremental | grep -w READ | wc -l
}
TEST glusterd
TEST pidof glusterd
# Three bricks, the last of which is an arbiter.
TEST $CLI volume create $V0 replica 3 arbiter 1 $H0:$B0/${V0}{0..2}
# Turn off choose-local and the performance translators listed below so that
# the reads issued by dd are not influenced by them.
TEST $CLI volume set $V0 cluster.choose-local off
TEST $CLI volume set $V0 performance.quick-read off
TEST $CLI volume set $V0 performance.io-cache off
TEST $CLI volume set $V0 performance.write-behind off
TEST $CLI volume set $V0 performance.stat-prefetch off
TEST $CLI volume set $V0 performance.read-ahead off
TEST $CLI volume start $V0
# Disable all caching: mount with zero entry and attribute timeouts.
TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0
# Create the 8 MiB file that both read passes below will read back.
TEST dd if=/dev/urandom of=$M0/FILE bs=1M count=8
EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
# Test whether the option gives the intended behavior. The way we perform this
# test is by reading the file from the mount and writing it to /dev/null. If
# read-hash-mode is 3, then for a given file, more than 1 brick should serve
# the read fops, whereas with the default read-hash-mode (i.e. 1), only 1
# brick will.
# read-hash-mode=1
TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0
EXPECT "1" mount_get_option_value $M0 $V0-replicate-0 read-hash-mode
TEST $CLI volume profile $V0 start
TEST dd if=$M0/FILE of=/dev/null bs=1M
count=`reads_brick_count`
# Exactly one brick is expected to have served the reads.
TEST [ $count -eq 1 ]
EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
# read-hash-mode=3
TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0
TEST $CLI volume set $V0 cluster.read-hash-mode 3
# Wait for the mounted client to pick up the new option value.
EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "3" mount_get_option_value $M0 $V0-replicate-0 read-hash-mode
# Clear the profile counters so the reads from the mode-1 pass are not counted.
TEST $CLI volume profile $V0 info clear
TEST dd if=$M0/FILE of=/dev/null bs=1M
count=`reads_brick_count`
# Both data bricks, and only those two, are expected to have served reads.
TEST [ $count -eq 2 ]
# Check that the arbiter did not serve any reads
# (brick index 2 is the arbiter; FILE must not appear in its "top read" output).
arbiter_reads=$($CLI volume top $V0 read brick $H0:$B0/${V0}2|grep FILE|awk '{print $1}')
TEST [ -z $arbiter_reads ]
cleanup;

View File

@ -1630,38 +1630,69 @@ out:
return ret;
}
/*
 * Return the index of the child that currently has the fewest pending reads.
 *
 * @priv: AFR private state; priv->pending_reads[] holds one atomic counter
 *        per child and priv->child_count is the number of children.
 *
 * Child 0 is the initial candidate. Every later child for which
 * AFR_IS_ARBITER_BRICK() is true is skipped. A later child replaces the
 * candidate only when its counter is strictly lower, so ties go to the
 * lowest index.
 *
 * The counters are sampled one at a time, so the result is a best-effort
 * snapshot rather than a consistent view across all children.
 */
int
afr_least_pending_reads_child (afr_private_t *priv)
{
        int     idx        = 0;
        int     best_child = 0;
        int64_t best_count = 0;
        int64_t cur_count  = 0;

        best_count = GF_ATOMIC_GET (priv->pending_reads[0]);

        for (idx = 1; idx < priv->child_count; idx++) {
                if (!AFR_IS_ARBITER_BRICK(priv, idx)) {
                        cur_count = GF_ATOMIC_GET(priv->pending_reads[idx]);
                        if (cur_count < best_count) {
                                best_count = cur_count;
                                best_child = idx;
                        }
                }
        }

        return best_child;
}
int
afr_hash_child (afr_read_subvol_args_t *args, int32_t child_count, int hashmode)
afr_hash_child (afr_read_subvol_args_t *args, afr_private_t *priv)
{
uuid_t gfid_copy = {0,};
pid_t pid;
int child = -1;
if (!hashmode) {
return -1;
switch (priv->hash_mode) {
case 0:
break;
case 1:
gf_uuid_copy (gfid_copy, args->gfid);
child = SuperFastHash((char *)gfid_copy,
sizeof(gfid_copy)) % priv->child_count;
break;
case 2:
if (args->ia_type != IA_IFDIR) {
/*
* Why getpid? Because it's one of the cheapest calls
* available - faster than gethostname etc. - and
* returns a constant-length value that's sure to be
* shorter than a UUID. It's still very unlikely to be
* the same across clients, so it still provides good
* mixing. We're not trying for perfection here. All we
* need is a low probability that multiple clients
* won't converge on the same subvolume.
*/
pid = getpid();
memcpy (gfid_copy, &pid, sizeof(pid));
}
child = SuperFastHash((char *)gfid_copy,
sizeof(gfid_copy)) % priv->child_count;
break;
case 3:
child = afr_least_pending_reads_child (priv);
break;
}
gf_uuid_copy (gfid_copy, args->gfid);
if ((hashmode > 1) && (args->ia_type != IA_IFDIR)) {
/*
* Why getpid? Because it's one of the cheapest calls
* available - faster than gethostname etc. - and returns a
* constant-length value that's sure to be shorter than a UUID.
* It's still very unlikely to be the same across clients, so
* it still provides good mixing. We're not trying for
* perfection here. All we need is a low probability that
* multiple clients won't converge on the same subvolume.
*/
pid = getpid();
memcpy (gfid_copy, &pid, sizeof(pid));
}
return SuperFastHash((char *)gfid_copy,
sizeof(gfid_copy)) % child_count;
return child;
}
int
afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
unsigned char *readable,
@ -1686,8 +1717,7 @@ afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
}
/* second preference - use hashed mode */
read_subvol = afr_hash_child (&local_args, priv->child_count,
priv->hash_mode);
read_subvol = afr_hash_child (&local_args, priv);
if (read_subvol >= 0 && readable[read_subvol])
return read_subvol;
@ -4611,6 +4641,8 @@ afr_priv_dump (xlator_t *this)
gf_proc_dump_write(key, "%d", priv->child_up[i]);
sprintf (key, "pending_key[%d]", i);
gf_proc_dump_write(key, "%s", priv->pending_key[i]);
sprintf (key, "pending_reads[%d]", i);
gf_proc_dump_write(key, "%"PRId64, GF_ATOMIC_GET(priv->pending_reads[i]));
}
gf_proc_dump_write("data_self_heal", "%s", priv->data_self_heal);
gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal);
@ -4623,6 +4655,7 @@ afr_priv_dump (xlator_t *this)
gf_proc_dump_write("background-self-heal-count", "%d",
priv->background_self_heal_count);
gf_proc_dump_write("healers", "%d", priv->healers);
gf_proc_dump_write("read-hash-mode", "%d", priv->hash_mode);
if (priv->quorum_count == AFR_QUORUM_AUTO) {
gf_proc_dump_write ("quorum-type", "auto");
} else if (priv->quorum_count == 0) {
@ -5325,6 +5358,8 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
goto out;
}
local->read_subvol = -1;
local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies),
gf_afr_mt_reply_t);
if (!local->replies) {
@ -5474,9 +5509,12 @@ afr_priv_destroy (afr_private_t *priv)
for (i = 0; i < priv->child_count; i++)
GF_FREE (priv->pending_key[i]);
}
GF_FREE (priv->pending_reads);
GF_FREE (priv->local);
GF_FREE (priv->pending_key);
GF_FREE (priv->children);
GF_FREE (priv->child_up);
GF_FREE (priv->child_latency);
LOCK_DESTROY (&priv->lock);
GF_FREE (priv);

View File

@ -47,6 +47,7 @@ enum gf_afr_mem_types_ {
gf_afr_mt_spb_status_t,
gf_afr_mt_empty_brick_t,
gf_afr_mt_child_latency_t,
gf_afr_mt_atomic_t,
gf_afr_mt_end
};
#endif

View File

@ -12,6 +12,39 @@
#include "afr-transaction.h"
#include "afr-messages.h"
/*
 * Record one more read in flight on the given child.
 *
 * @priv:        AFR private state holding the per-child pending_reads[]
 *               counters.
 * @child_index: index of the child the read is being wound to.
 *
 * An index outside [0, child_count) is ignored, so callers may pass -1
 * (no read subvolume) without checking first.
 */
void
afr_pending_read_increment (afr_private_t *priv, int child_index)
{
        /* pending_reads[] has child_count entries, so child_count itself is
         * already one past the end and must be rejected too. */
        if (child_index < 0 || child_index >= priv->child_count)
                return;

        GF_ATOMIC_INC(priv->pending_reads[child_index]);
}
/*
 * Record that one read on the given child is no longer in flight.
 *
 * @priv:        AFR private state holding the per-child pending_reads[]
 *               counters.
 * @child_index: index of the child the read had been wound to.
 *
 * An index outside [0, child_count) is ignored, so callers may pass -1
 * (no read subvolume) without checking first.
 */
void
afr_pending_read_decrement (afr_private_t *priv, int child_index)
{
        /* pending_reads[] has child_count entries, so child_count itself is
         * already one past the end and must be rejected too. */
        if (child_index < 0 || child_index >= priv->child_count)
                return;

        GF_ATOMIC_DEC(priv->pending_reads[child_index]);
}
/*
 * Wind the read fop of this transaction to @subvol, keeping the per-child
 * pending-read counters in step.
 *
 * @frame:  call frame whose ->local is the afr_local_t of the read txn.
 * @this:   the AFR xlator; ->private is its afr_private_t.
 * @subvol: index of the child to read from.
 *
 * The counter of the previously recorded read subvolume is dropped first,
 * then @subvol is recorded in local->read_subvol and its counter is bumped.
 * Only after that is local->readfn invoked.
 */
void
afr_read_txn_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
        afr_local_t   *local = frame->local;
        afr_private_t *priv  = this->private;

        /* Stop counting the read against the child used previously. */
        afr_pending_read_decrement (priv, local->read_subvol);

        /* Count it against the new child instead. */
        local->read_subvol = subvol;
        afr_pending_read_increment (priv, subvol);

        local->readfn (frame, this, subvol);
}
int
afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this)
{
@ -43,7 +76,7 @@ afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this)
readable subvols. */
if (subvol != -1)
local->read_attempted[subvol] = 1;
local->readfn (frame, this, subvol);
afr_read_txn_wind (frame, this, subvol);
return 0;
}
@ -89,7 +122,7 @@ readfn:
if (read_subvol == -1) {
AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN (-1, -err);
}
local->readfn (frame, this, read_subvol);
afr_read_txn_wind (frame, this, read_subvol);
return 0;
}
@ -246,7 +279,7 @@ afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
local->read_attempted[read_subvol] = 1;
read:
local->readfn (frame, this, read_subvol);
afr_read_txn_wind (frame, this, read_subvol);
return 0;

View File

@ -37,6 +37,12 @@ int afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
int afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol);
void
afr_pending_read_increment (afr_private_t *priv, int child_index);
void
afr_pending_read_decrement (afr_private_t *priv, int child_index);
call_frame_t *afr_transaction_detach_fop_frame (call_frame_t *frame);
gf_boolean_t afr_has_quorum (unsigned char *subvols, xlator_t *this);
gf_boolean_t afr_needs_changelog_update (afr_local_t *local);

View File

@ -429,6 +429,9 @@ init (xlator_t *this)
}
GF_OPTION_INIT ("choose-local", priv->choose_local, bool, out);
priv->pending_reads = GF_CALLOC (sizeof(*priv->pending_reads),
priv->child_count, gf_afr_mt_atomic_t);
GF_OPTION_INIT ("read-hash-mode", priv->hash_mode, uint32, out);
priv->favorite_child = -1;
@ -703,18 +706,19 @@ struct volume_options options[] = {
{ .key = {"read-hash-mode" },
.type = GF_OPTION_TYPE_INT,
.min = 0,
.max = 2,
.max = 3,
.default_value = "1",
.op_version = {2},
.flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
.tags = {"replicate"},
.description = "inode-read fops happen only on one of the bricks in "
"replicate. AFR will prefer the one computed using "
"the method specified using this option"
"0 = first up server, "
"the method specified using this option.\n"
"0 = first readable child of AFR, starting from 1st child.\n"
"1 = hash by GFID of file (all clients use "
"same subvolume), "
"2 = hash by GFID of file and client PID",
"same subvolume).\n"
"2 = hash by GFID of file and client PID.\n"
"3 = brick having the least outstanding read requests."
},
{ .key = {"choose-local" },
.type = GF_OPTION_TYPE_BOOL,

View File

@ -113,6 +113,7 @@ typedef struct _afr_private {
gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */
int read_child; /* read-subvolume */
unsigned int hash_mode; /* for when read_child is not set */
gf_atomic_t *pending_reads; /*No. of pending read cbks per child.*/
int favorite_child; /* subvolume to be preferred in resolving
split-brain cases */
@ -425,6 +426,8 @@ typedef struct _afr_local {
unsigned char *readable;
unsigned char *readable2; /*For rename transaction*/
int read_subvol; /* Current read subvolume */
afr_inode_refresh_cbk_t refreshfn;
/* @refreshinode:
@ -974,6 +977,8 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);
__this = frame->this; \
afr_handle_inconsistent_fop (frame, &__op_ret,\
&__op_errno);\
if (__local && __local->is_read_txn) \
afr_pending_read_decrement (__this->private, __local->read_subvol); \
frame->local = NULL; \
} \
\