cluster/ec: Change [f]getxattr to parallel-dispatch-one

At the moment in EC, [f]getxattr operations wait to acquire a lock
while other operations are in progress, even when those operations come from
the same mount, which already holds a lock on the file/directory. This happens
because [f]getxattr operations follow the model where the operation is wound on
'k' of the bricks and the replies are matched to make sure the data returned is
the same on all of them. This consistency
check requires that no other operations are on-going while [f]getxattr
operations are wound to the bricks. We can perform [f]getxattr in
another way as well, where we find the good_mask from the lock that is already
granted and wind the operation on any one of the good bricks and unwind the
answer after adjusting size/blocks to the parent xlator. Since we are taking
into account good_mask, the reply we get will either be before or after a
possible on-going operation. Using this method, the operation doesn't need to
depend on completion of on-going operations, which could take a long time (for
example when disks are slow and writes are in progress). Thus we reduce the
time to serve [f]getxattr requests.

I changed [f]getxattr to dispatch-one and added extra logic in
ec_link_has_lock_conflict() to not have any conflicts for fops with
EC_MINIMUM_ONE as fop->minimum to achieve the effect described above.
Modified scripts to make sure READ fop is received in EC to trigger heals.

Updates gluster/glusterfs#368
Change-Id: I3b4ebf89181c336b7b8d5471b0454f016cdaf296
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
This commit is contained in:
Pranith Kumar K 2017-12-06 07:59:53 +05:30
parent 85d321b21c
commit c96a1338fe
7 changed files with 199 additions and 5 deletions

View File

@ -72,6 +72,9 @@
#define FNM_EXTMATCH 0
#endif
/* Largest positive value of a signed integer that is as wide as off_t.
 * The width is taken from sizeof(off_t), so the same definition works
 * whether off_t is 32 or 64 bits. Assumes 8-bit bytes and a signed off_t.
 * The expression has type unsigned long long. */
#define GF_OFF_MAX ((1ULL << (sizeof(off_t) * 8 - 1)) - 1ULL)
#define GLUSTERD_MAX_SNAP_NAME 255
#define GLUSTERFS_SOCKET_LISTEN_BACKLOG 10
#define ZR_MOUNTPOINT_OPT "mountpoint"

View File

@ -0,0 +1,133 @@
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
#include <limits.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <glusterfs/api/glfs.h>
#include <glusterfs/api/glfs-handles.h>
/* Set to 1 by write_async_cbk() once the asynchronous write has finished;
 * main() polls it to learn when the write is done.
 * NOTE(review): read and written without any synchronization. If the
 * callback is invoked from another thread this is formally a data race —
 * confirm and consider an atomic type. */
int cbk_complete = 0;
/* Result handed to write_async_cbk(); a negative value means the write
 * failed. Written before cbk_complete is set. */
ssize_t cbk_ret_val = 0;
/*
 * Attach to 'iov' a freshly allocated, NUL-terminated buffer holding 'count'
 * copies of 'fillchar'.
 *
 * On success returns 0; iov->iov_base points to a (count + 1)-byte allocation
 * that the caller must free(), and iov->iov_len is 'count' (the terminator is
 * not counted).
 *
 * Returns -1 and leaves 'iov' unmodified if 'iov' is NULL, 'count' is
 * negative, or the allocation fails.
 */
int
fill_iov (struct iovec *iov, char fillchar, int count)
{
        char *buf = NULL;

        /* A negative count would be converted to a huge size_t by memset. */
        if (iov == NULL || count < 0) {
                return -1;
        }

        /* Widen before adding 1 so count == INT_MAX cannot overflow.
         * calloc zero-fills, so the final byte is already the terminator. */
        buf = calloc ((size_t)count + 1, sizeof (*buf));
        if (buf == NULL) {
                return -1;
        }

        memset (buf, fillchar, (size_t)count);

        iov->iov_base = buf;
        iov->iov_len = (size_t)count;
        return 0;
}
/*
 * Completion callback handed to glfs_pwritev_async().
 *
 * Reports a failed write on stderr, records the write's result in
 * cbk_ret_val and then raises cbk_complete so that the poller can see
 * the write is over. 'fd' and 'cookie' are not used.
 */
void
write_async_cbk (glfs_fd_t *fd, ssize_t ret, void *cookie)
{
        if (ret < 0)
                fputs ("glfs_write failed", stderr);

        /* Publish the result first, the completion flag last. */
        cbk_ret_val = ret;
        cbk_complete = 1;
}
/*
 * Submit an asynchronous write of 'char_count' bytes of 'a' at offset 0
 * of 'glfd'; completion is reported through write_async_cbk().
 *
 * Returns 0 when the write was submitted, non-zero when the buffer could
 * not be built or the submission failed. 'fs' is not used.
 *
 * The buffer is intentionally not freed here: the write is still in flight
 * when this function returns.
 */
int
write_async (glfs_t *fs, glfs_fd_t *glfd, int char_count)
{
        struct iovec payload = {0};
        ssize_t      rc      = -1;

        rc = fill_iov (&payload, 'a', char_count);
        if (rc != 0) {
                fprintf (stderr, "failed to create iov");
        } else {
                rc = glfs_pwritev_async (glfd, &payload, 1, 0, O_RDWR,
                                         write_async_cbk, NULL);
        }

        if (rc < 0) {
                fprintf (stderr, "glfs_pwritev async failed");
        }

        return rc;
}
/*
 * Usage: <prog> <host> <volname> <file>
 *
 * Opens <file> on the volume, starts an asynchronous write on it and, one
 * second later, issues an fgetxattr on the same fd. The program exits with
 * 0 only if the fgetxattr returned while the write was still in flight and
 * the write itself eventually succeeded; any other outcome yields a
 * non-zero exit status.
 */
int
main (int argc, char *argv[])
{
        glfs_t    *fs  = NULL;
        glfs_fd_t *fd  = NULL;
        int        ret = 1;
        char       buf[1024] = {0};

        if (argc != 4) {
                fprintf (stderr, "Syntax: %s <host> <volname> <file>\n",
                         argv[0]);
                return 1;
        }

        fs = glfs_new (argv[2]);
        if (!fs) {
                fprintf (stderr, "glfs_new: returned NULL\n");
                return 1;
        }

        ret = glfs_set_volfile_server (fs, "tcp", argv[1], 24007);
        if (ret != 0) {
                fprintf (stderr, "glfs_set_volfile_server: returned %d\n",
                         ret);
                goto out;
        }

        ret = glfs_set_logging (fs, "/tmp/ec-fgetxattr.log", 7);
        if (ret != 0) {
                fprintf (stderr, "glfs_set_logging: returned %d\n", ret);
                goto out;
        }

        ret = glfs_init (fs);
        if (ret != 0) {
                fprintf (stderr, "glfs_init: returned %d\n", ret);
                goto out;
        }

        fd = glfs_open (fs, argv[3], O_RDWR | O_TRUNC);
        if (fd == NULL) {
                fprintf (stderr, "glfs_open: returned NULL\n");
                /* ret still holds glfs_init's 0; make the failure visible
                 * in the exit status. */
                ret = -1;
                goto out;
        }

        ret = write_async (fs, fd, 16);
        if (ret) {
                fprintf (stderr, "write_async failed\n");
                /* No write was submitted, so write_async_cbk() will never
                 * run and the wait loop below would spin forever. */
                goto close_fd;
        }

        sleep (1);

        /* The xattr does not exist, so this call is expected to fail; only
         * its timing relative to the write matters. */
        ret = glfs_fgetxattr (fd, "trusted.glusterfs.abc", buf, sizeof buf);

        while (cbk_complete != 1) {
                /* Getting here means fgetxattr returned before the write
                 * completed, which is the behaviour under test. If the loop
                 * body never runs, ret keeps fgetxattr's return value. */
                ret = 0;
                sleep (1);
        }

        if (cbk_ret_val < 0) {
                fprintf (stderr, "cbk_ret_val is -ve\n");
                ret = -1;
        }

close_fd:
        glfs_close (fd);
out:
        unlink ("/tmp/ec-fgetxattr.log");
        glfs_fini (fs);
        return ret;
}

View File

@ -0,0 +1,40 @@
#!/bin/bash
# Regression test: builds and runs the ec-fast-fgetxattr gfapi program against
# a disperse volume whose bricks delay read and write fops. The program's exit
# status is the test verdict.
. $(dirname $0)/../../include.rc
. $(dirname $0)/../../volume.rc
cleanup;
TEST glusterd
TEST pidof glusterd
# 6-brick disperse volume with redundancy 2.
TEST $CLI volume create $V0 disperse 6 redundancy 2 $H0:$B0/${V0}{1..6}
# Switch off the performance options below.
# NOTE(review): presumably so that fops reach EC without being cached or
# reordered on the way — confirm.
TEST $CLI volume set $V0 performance.quick-read off
TEST $CLI volume set $V0 performance.write-behind off
TEST $CLI volume set $V0 performance.io-cache off
TEST $CLI volume set $V0 performance.stat-prefetch off
TEST $CLI volume set $V0 performance.client-io-threads off
TEST $CLI volume set $V0 brick-log-level DEBUG
# Configure delay-gen (placed relative to posix) to delay 100% of read and
# write fops.
# NOTE(review): delay-duration is presumably in microseconds (10s) — confirm.
TEST $CLI volume set $V0 delay-gen posix
TEST $CLI volume set $V0 delay-gen.delay-duration 10000000
TEST $CLI volume set $V0 delay-gen.delay-percentage 100
TEST $CLI volume set $V0 delay-gen.enable read,write
TEST $CLI volume start $V0
EXPECT 'Started' volinfo_field $V0 'Status'
# Mount the volume and wait until all 6 EC children are reported up.
TEST $GFS -s $H0 --volfile-id $V0 $M0
EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
# Create the file the tester program opens as /file.
TEST touch $M0/file
# Perform two writes to make sure io-threads have enough threads to perform
# things in parallel when the test execution happens.
echo abc > $M0/file1 &
echo abc > $M0/file2 &
wait
# Build the gfapi tester, run it against /file, then remove the binary.
TEST build_tester $(dirname $0)/ec-fast-fgetxattr.c -lgfapi -Wall -O2
TEST $(dirname $0)/ec-fast-fgetxattr $H0 $V0 /file
cleanup_tester $(dirname ${0})/ec-fast-fgetxattr
cleanup;

View File

@ -53,6 +53,13 @@ ec_is_range_conflict (ec_lock_link_t *l1, ec_lock_link_t *l2)
static gf_boolean_t
ec_lock_conflict (ec_lock_link_t *l1, ec_lock_link_t *l2)
{
/* Fops like access/stat won't have to worry what the other fops are
* modifying as the fop is wound only to one brick. So it can be
* executed in parallel*/
if (l1->fop->minimum == EC_MINIMUM_ONE ||
l2->fop->minimum == EC_MINIMUM_ONE)
return _gf_false;
if ((l1->fop->flags & EC_FLAG_LOCK_SHARED) &&
(l2->fop->flags & EC_FLAG_LOCK_SHARED))
return _gf_false;

View File

@ -152,7 +152,7 @@ ec_adjust_offset_up(ec_t *ec, off_t *value, gf_boolean_t scale)
} else {
/* Check if there has been an overflow. */
if ((off_t)tmp < 0) {
tmp = (1ULL << (sizeof(off_t) * 8 - 1)) - 1ULL;
tmp = GF_OFF_MAX;
tail = -tail;
}
}

View File

@ -324,13 +324,23 @@ int32_t ec_manager_getxattr(ec_fop_data_t * fop, int32_t state)
return EC_STATE_DISPATCH;
case EC_STATE_DISPATCH:
ec_dispatch_all(fop);
if (fop->minimum == EC_MINIMUM_ALL) {
ec_dispatch_all(fop);
} else {
ec_dispatch_one(fop);
}
return EC_STATE_PREPARE_ANSWER;
case EC_STATE_PREPARE_ANSWER:
ec_handle_special_xattrs (fop);
cbk = ec_fop_prepare_answer(fop, _gf_true);
if (fop->minimum == EC_MINIMUM_ALL) {
cbk = ec_fop_prepare_answer(fop, _gf_true);
} else {
if (ec_dispatch_one_retry (fop, &cbk)) {
return EC_STATE_DISPATCH;
}
}
if (cbk != NULL) {
int32_t err;
@ -1809,6 +1819,7 @@ int32_t ec_manager_stat(ec_fop_data_t * fop, int32_t state)
case EC_STATE_PREPARE_ANSWER:
cbk = ec_fop_prepare_answer(fop, _gf_true);
if (cbk != NULL) {
if (cbk->iatt[0].ia_type == IA_IFREG) {
ec_iatt_rebuild(fop->xl->private, cbk->iatt, 1,

View File

@ -864,7 +864,7 @@ ec_gf_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
{
int error = 0;
ec_t *ec = this->private;
int32_t minimum = EC_MINIMUM_MIN;
int32_t minimum = EC_MINIMUM_ONE;
if (name && strcmp (name, EC_XATTR_HEAL) != 0) {
EC_INTERNAL_XATTR_OR_GOTO(name, NULL, error, out);
@ -901,7 +901,7 @@ ec_gf_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
EC_INTERNAL_XATTR_OR_GOTO(name, NULL, error, out);
ec_fgetxattr (frame, this, -1, EC_MINIMUM_MIN, default_fgetxattr_cbk,
ec_fgetxattr (frame, this, -1, EC_MINIMUM_ONE, default_fgetxattr_cbk,
NULL, fd, name, xdata);
return 0;
out: