features/shard: Handle offset in appending writes

When a file is opened with O_APPEND, every write is appended at the end of the
file irrespective of the offset given in the write syscall. This has to be
taken into account both in shard's size-update function and when choosing
which shard to write to.
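
As an illustration (a standalone sketch with made-up numbers and helper names,
not the xlator code): the shard a write lands in is derived from the write
offset and the shard block size, so an appending write has to be resolved
against the current end of file rather than the offset carried by the syscall.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical sketch of the block/shard arithmetic referred to above. */
static uint64_t
target_shard (uint64_t offset, uint64_t block_size)
{
        return offset / block_size;
}

int
main (void)
{
        uint64_t block_size = 4 * 1024 * 1024;   /* shard-block-size of 4MB */
        uint64_t eof        = 6 * 1024 * 1024;   /* current file size */
        uint64_t offset     = 0;                 /* offset passed in the write call */

        /* With O_APPEND the data actually lands at EOF, so the shard must be
         * chosen from EOF (shard 1 here), not from the syscall offset
         * (shard 0), and the size update is simply old size + bytes written. */
        printf ("shard from syscall offset: %" PRIu64 "\n",
                target_shard (offset, block_size));
        printf ("shard for appending write: %" PRIu64 "\n",
                target_shard (eof, block_size));
        return 0;
}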

At the moment shard piggybacks on the queuing done by the write-behind
xlator to order these operations. So if write-behind is disabled and two
parallel appending writes arrive, each of which can push the file size beyond
the shard size, the file will be corrupted.
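
A rough standalone sketch of that ordering problem (plain pthreads,
hypothetical names and sizes, not gluster code): without queuing, two
appending writers can observe the same cached file size, target the same
shard for the bytes that cross the boundary, and publish conflicting size
updates.

#include <inttypes.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define SHARD_SIZE   (4ULL * 1024 * 1024)
#define WRITE_SIZE   1024ULL

/* Cached file size sitting just below the first shard boundary. */
static uint64_t cached_size = SHARD_SIZE - 512;

static void *
appending_writer (void *arg)
{
        (void)arg;
        /* Unsynchronized read-modify-write: both threads may observe the same
         * size, pick the same target shard for their appended bytes and then
         * publish conflicting size updates, which is the race described
         * above. */
        uint64_t observed = cached_size;
        uint64_t shard    = (observed + WRITE_SIZE - 1) / SHARD_SIZE;

        printf ("writer sees size %" PRIu64 ", last byte goes to shard %" PRIu64 "\n",
                observed, shard);
        cached_size = observed + WRITE_SIZE;
        return NULL;
}

int
main (void)
{
        pthread_t t[2];

        for (int i = 0; i < 2; i++)
                pthread_create (&t[i], NULL, appending_writer, NULL);
        for (int i = 0; i < 2; i++)
                pthread_join (t[i], NULL);

        printf ("final size %" PRIu64 ", expected %" PRIu64 "\n", cached_size,
                (uint64_t) (SHARD_SIZE - 512 + 2 * WRITE_SIZE));
        return 0;
}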

BUG: 1455301
Change-Id: I9007e6a39098ab0b5d5386367bd07eb5f89cb09e
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: https://review.gluster.org/17387
Smoke: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Krutika Dhananjay <kdhananj@redhat.com>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Author: Pranith Kumar K, 2017-05-24 22:30:29 +05:30 (committed by Pranith Kumar Karampuri)
parent 3ca5ae2f3b
commit bea02e26a3
4 changed files with 296 additions and 61 deletions


@@ -2,7 +2,6 @@ performance.quick-read=off
 performance.read-ahead=off
 performance.io-cache=off
 performance.stat-prefetch=off
-performance.write-behind=off
 performance.open-behind=off
 performance.readdir-ahead=off
 network.remote-dio=enable


@@ -0,0 +1,179 @@
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
#include <limits.h>
#include <string.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <glusterfs/api/glfs.h>
#include <glusterfs/api/glfs-handles.h>
#define LOG_ERR(msg) do { \
        fprintf (stderr, "%s : Error (%s)\n", msg, strerror (errno)); \
        } while (0)
/* This test checks that the shard xlator handles the offset in appending
 * writes correctly. Five threads each perform 1025 writes of 1025 bytes,
 * with every thread writing a buffer filled with its own repeating
 * character. At the end the file is read back 1025 bytes at a time, and the
 * test verifies that each chunk contains a single repeated character and
 * that the total amount read is 5*1025*1025 bytes. 1025 bytes is chosen
 * because, as the file grows past the initial shard, some writes will span
 * more than one shard. */
pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
int thread_data = '1';

glfs_t *
init_glfs (const char *hostname, const char *volname,
           const char *logfile)
{
        int     ret = -1;
        glfs_t *fs = NULL;

        fs = glfs_new (volname);
        if (!fs) {
                LOG_ERR ("glfs_new failed");
                return NULL;
        }

        ret = glfs_set_volfile_server (fs, "tcp", hostname, 24007);
        if (ret < 0) {
                LOG_ERR ("glfs_set_volfile_server failed");
                goto out;
        }

        ret = glfs_set_logging (fs, logfile, 7);
        if (ret < 0) {
                LOG_ERR ("glfs_set_logging failed");
                goto out;
        }

        ret = glfs_init (fs);
        if (ret < 0) {
                LOG_ERR ("glfs_init failed");
                goto out;
        }

        ret = 0;
out:
        if (ret) {
                glfs_fini (fs);
                fs = NULL;
        }

        return fs;
}
void *
write_data (void *data)
{
        char        buf[1025] = {0};
        glfs_fd_t  *glfd = NULL;
        glfs_t     *fs = data;
        int         i = 0;

        pthread_mutex_lock (&lock);
        {
                memset (buf, thread_data, sizeof (buf));
                thread_data++;
        }
        pthread_mutex_unlock (&lock);

        for (i = 0; i < 1025; i++) {
                /* O_SYNC is an open flag, not a permission bit, so it belongs
                 * in the flags argument rather than in the mode argument. */
                glfd = glfs_creat (fs, "parallel-write.txt",
                                   O_WRONLY | O_APPEND | O_SYNC,
                                   S_IRUSR | S_IWUSR);
                if (!glfd) {
                        LOG_ERR ("Failed to create file");
                        exit (1);
                }

                if (glfs_write (glfd, buf, sizeof (buf), 0) < 0) {
                        LOG_ERR ("Failed to write to file");
                        exit (1);
                }

                if (glfs_close (glfd) != 0) {
                        LOG_ERR ("Failed to close file");
                        exit (1);
                }
        }

        return NULL;
}
int
main (int argc, char *argv[])
{
        pthread_t   tid[5] = {0};
        char        buf[1025] = {0};
        char        cmp_buf[1025] = {0};
        int         ret = 0;
        char       *hostname = NULL;
        char       *volname = NULL;
        char       *logfile = NULL;
        glfs_t     *fs = NULL;
        glfs_fd_t  *glfd = NULL;
        ssize_t     bytes_read = 0;
        ssize_t     total_bytes_read = 0;
        int         i = 0;

        if (argc != 4) {
                fprintf (stderr, "Usage: %s <hostname> <volname> <logfile>\n",
                         argv[0]);
                exit (1);
        }

        hostname = argv[1];
        volname = argv[2];
        logfile = argv[3];

        fs = init_glfs (hostname, volname, logfile);
        if (fs == NULL) {
                LOG_ERR ("init_glfs failed");
                return -1;
        }

        for (i = 0; i < 5; i++) {
                pthread_create (&tid[i], NULL, write_data, fs);
        }

        for (i = 0; i < 5; i++) {
                pthread_join (tid[i], NULL);
        }

        glfd = glfs_open (fs, "parallel-write.txt", O_RDONLY);
        if (!glfd) {
                LOG_ERR ("Failed to open file for reading");
                exit (1);
        }

        while ((bytes_read = glfs_read (glfd, buf, sizeof (buf), 0)) > 0) {
                if (bytes_read != (ssize_t) sizeof (buf)) {
                        fprintf (stderr, "Didn't read complete data, read: %zd "
                                 "expected: %zu\n", bytes_read, sizeof (buf));
                        exit (1);
                }

                total_bytes_read += bytes_read;
                if (buf[0] < '1' || buf[0] >= thread_data) {
                        fprintf (stderr, "Invalid character found: %c\n",
                                 buf[0]);
                        exit (1);
                }

                memset (cmp_buf, buf[0], sizeof (cmp_buf));
                if (memcmp (cmp_buf, buf, sizeof (cmp_buf))) {
                        LOG_ERR ("Data corrupted");
                        exit (1);
                }
                memset (cmp_buf, 0, sizeof (cmp_buf));
        }

        if (total_bytes_read != 5*1025*1025) {
                fprintf (stderr, "Failed to read what was written, read: %zd, "
                         "expected: %d\n", total_bytes_read, 5*1025*1025);
                exit (1);
        }

        if (glfs_close (glfd) != 0) {
                LOG_ERR ("Failed to close");
                exit (1);
        }

        return 0;
}


@@ -0,0 +1,32 @@
#!/bin/bash
. $(dirname $0)/../../include.rc
. $(dirname $0)/../../volume.rc
cleanup;
TEST glusterd
TEST $CLI volume create $V0 replica 3 ${H0}:$B0/brick{1,2,3};
TEST $CLI volume set $V0 features.shard on
TEST $CLI volume set $V0 features.shard-block-size 4MB
TEST $CLI volume set $V0 performance.quick-read off
TEST $CLI volume set $V0 performance.io-cache off
#Uncomment the following line after shard-queuing is implemented
#TEST $CLI volume set $V0 performance.write-behind off
TEST $CLI volume set $V0 performance.strict-o-direct on
TEST $CLI volume set $V0 performance.stat-prefetch off
TEST $CLI volume set $V0 performance.read-ahead off
TEST $CLI volume start $V0;
logdir=`gluster --print-logdir`
TEST build_tester $(dirname $0)/shard-append-test.c -lgfapi -lpthread
TEST ./$(dirname $0)/shard-append-test ${H0} $V0 $logdir/shard-append-test.log
cleanup_tester $(dirname $0)/shard-append-test
cleanup;


@@ -3631,6 +3631,18 @@ shard_common_inode_write_post_update_size_handler (call_frame_t *frame,
return 0;
}
static gf_boolean_t
shard_is_appending_write (shard_local_t *local)
{
if (local->fop != GF_FOP_WRITE)
return _gf_false;
if (local->flags & O_APPEND)
return _gf_true;
if (local->fd->flags & O_APPEND)
return _gf_true;
return _gf_false;
}
int
__shard_get_delta_size_from_inode_ctx (shard_local_t *local, inode_t *inode,
xlator_t *this)
@@ -3645,13 +3657,15 @@ __shard_get_delta_size_from_inode_ctx (shard_local_t *local, inode_t *inode,
         ctx = (shard_inode_ctx_t *) ctx_uint;
-        if (local->offset + local->total_size > ctx->stat.ia_size) {
+        if (shard_is_appending_write (local)) {
+                local->delta_size = local->total_size;
+        } else if (local->offset + local->total_size > ctx->stat.ia_size) {
                 local->delta_size = (local->offset + local->total_size) -
                                     ctx->stat.ia_size;
-                ctx->stat.ia_size += (local->delta_size);
         } else {
                 local->delta_size = 0;
         }
+        ctx->stat.ia_size += (local->delta_size);
         local->postbuf = ctx->stat;
         return 0;
@@ -3957,33 +3971,8 @@ shard_common_inode_write_post_mknod_handler (call_frame_t *frame,
}
int
shard_common_inode_write_post_lookup_handler (call_frame_t *frame,
xlator_t *this)
{
shard_local_t *local = NULL;
local = frame->local;
if (local->op_ret < 0) {
shard_common_inode_write_failure_unwind (local->fop, frame,
local->op_ret,
local->op_errno);
return 0;
}
local->postbuf = local->prebuf;
if (local->call_count) {
shard_common_lookup_shards (frame, this,
local->resolver_base_inode,
shard_common_inode_write_post_lookup_shards_handler);
} else {
shard_common_inode_write_do (frame, this);
}
return 0;
}
shard_mkdir_dot_shard (call_frame_t *frame, xlator_t *this,
shard_post_resolve_fop_handler_t handler);
int
shard_common_inode_write_post_resolve_handler (call_frame_t *frame,
xlator_t *this)
@@ -3999,8 +3988,71 @@ shard_common_inode_write_post_resolve_handler (call_frame_t *frame,
return 0;
}
shard_lookup_base_file (frame, this, &local->loc,
shard_common_inode_write_post_lookup_handler);
if (local->call_count) {
shard_common_lookup_shards (frame, this,
local->resolver_base_inode,
shard_common_inode_write_post_lookup_shards_handler);
} else {
shard_common_inode_write_do (frame, this);
}
return 0;
}
int
shard_common_inode_write_post_lookup_handler (call_frame_t *frame,
xlator_t *this)
{
shard_local_t *local = frame->local;
shard_priv_t *priv = this->private;
if (local->op_ret < 0) {
shard_common_inode_write_failure_unwind (local->fop, frame,
local->op_ret,
local->op_errno);
return 0;
}
local->postbuf = local->prebuf;
/*Adjust offset to EOF so that correct shard is chosen for append*/
if (shard_is_appending_write (local))
local->offset = local->prebuf.ia_size;
local->first_block = get_lowest_block (local->offset,
local->block_size);
local->last_block = get_highest_block (local->offset, local->total_size,
local->block_size);
local->num_blocks = local->last_block - local->first_block + 1;
local->inode_list = GF_CALLOC (local->num_blocks, sizeof (inode_t *),
gf_shard_mt_inode_list);
if (!local->inode_list) {
shard_common_inode_write_failure_unwind (local->fop, frame,
-1, ENOMEM);
return 0;
}
gf_msg_trace (this->name, 0, "%s: gfid=%s first_block=%"PRIu32" "
"last_block=%"PRIu32" num_blocks=%"PRIu32" offset=%"PRId64
" total_size=%zu flags=%"PRId32"",
gf_fop_list[local->fop],
uuid_utoa (local->resolver_base_inode->gfid),
local->first_block, local->last_block, local->num_blocks,
local->offset, local->total_size, local->flags);
local->dot_shard_loc.inode = inode_find (this->itable,
priv->dot_shard_gfid);
if (!local->dot_shard_loc.inode) {
/*change handler*/
shard_mkdir_dot_shard (frame, this,
shard_common_inode_write_post_resolve_handler);
} else {
/*change handler*/
local->post_res_handler =
shard_common_inode_write_post_resolve_handler;
shard_refresh_dot_shard (frame, this);
}
return 0;
}
@@ -4699,9 +4751,6 @@ shard_common_inode_write_begin (call_frame_t *frame, xlator_t *this,
int i = 0;
uint64_t block_size = 0;
shard_local_t *local = NULL;
shard_priv_t *priv = NULL;
priv = this->private;
ret = shard_inode_ctx_get_block_size (fd->inode, this, &block_size);
if (ret) {
@@ -4777,37 +4826,13 @@ shard_common_inode_write_begin (call_frame_t *frame, xlator_t *this,
local->iobref = iobref_ref (iobref);
local->fd = fd_ref (fd);
local->block_size = block_size;
local->first_block = get_lowest_block (offset, local->block_size);
local->last_block = get_highest_block (offset, local->total_size,
local->block_size);
local->num_blocks = local->last_block - local->first_block + 1;
local->resolver_base_inode = local->fd->inode;
local->inode_list = GF_CALLOC (local->num_blocks, sizeof (inode_t *),
gf_shard_mt_inode_list);
if (!local->inode_list)
goto out;
local->loc.inode = inode_ref (fd->inode);
gf_uuid_copy (local->loc.gfid, fd->inode->gfid);
gf_msg_trace (this->name, 0, "%s: gfid=%s first_block=%"PRIu32" "
"last_block=%"PRIu32" num_blocks=%"PRIu32" offset=%"PRId64""
" total_size=%zu flags=%"PRId32"", gf_fop_list[fop],
uuid_utoa (fd->inode->gfid), local->first_block,
local->last_block, local->num_blocks, offset,
local->total_size, local->flags);
local->dot_shard_loc.inode = inode_find (this->itable,
priv->dot_shard_gfid);
if (!local->dot_shard_loc.inode) {
shard_mkdir_dot_shard (frame, this,
shard_common_inode_write_post_resolve_handler);
} else {
local->post_res_handler = shard_common_inode_write_post_resolve_handler;
shard_refresh_dot_shard (frame, this);
}
shard_lookup_base_file (frame, this, &local->loc,
shard_common_inode_write_post_lookup_handler);
return 0;
out:
shard_common_inode_write_failure_unwind (fop, frame, -1, ENOMEM);