storage/posix: implement native linux AIO support

Configurable via the CLI through the settable option "storage.linux-aio"

Change-Id: I9929e0d6fc1bbc2a0fe1fb67bfc8d15d8a483d3f
BUG: 837495
Signed-off-by: Anand Avati <avati@redhat.com>
Reviewed-on: http://review.gluster.com/3627
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Amar Tumballi <amarts@redhat.com>
This commit is contained in:
Anand Avati 2011-09-30 13:29:18 +05:30 committed by Anand Avati
parent ab44480749
commit fb8efa4c6a
9 changed files with 668 additions and 6 deletions

View File

@ -519,6 +519,15 @@ if test "x$RLLIBS" != "x"; then
BUILD_READLINE=yes
fi
# Probe for libaio by looking for io_setup in -laio. When it is found,
# LIBAIO is set to "-laio", HAVE_LIBAIO is defined for the C sources and
# BUILD_LIBAIO (reported in the configure summary) becomes "yes".
BUILD_LIBAIO=no
AC_CHECK_LIB([aio],[io_setup],[LIBAIO="-laio"])
if test "x$LIBAIO" != "x"; then
AC_DEFINE(HAVE_LIBAIO, 1, [libaio based POSIX enabled])
BUILD_LIBAIO=yes
fi
AC_SUBST(GF_HOST_OS)
AC_SUBST(GF_GLUSTERFS_LDFLAGS)
AC_SUBST(GF_GLUSTERFS_CFLAGS)
@ -528,6 +537,7 @@ AC_SUBST(GF_LDADD)
AC_SUBST(GF_FUSE_LDADD)
AC_SUBST(GF_FUSE_CFLAGS)
AC_SUBST(RLLIBS)
AC_SUBST(LIBAIO)
AC_SUBST(AM_MAKEFLAGS)
AC_SUBST(AM_LIBTOOLFLAGS)
@ -553,4 +563,5 @@ echo "argp-standalone : $BUILD_ARGP_STANDALONE"
echo "fusermount : $BUILD_FUSERMOUNT"
echo "readline : $BUILD_READLINE"
echo "georeplication : $BUILD_SYNCDAEMON"
echo "Linux-AIO : $BUILD_LIBAIO"
echo

View File

@ -237,6 +237,7 @@ static struct volopt_map_entry glusterd_volopt_map[] = {
{"features.grace-timeout", "protocol/server", "grace-timeout", NULL, DOC, 0},
{"features.read-only", "features/read-only", "!read-only", "off", DOC, 0},
{"features.worm", "features/worm", "!worm", "off", DOC, 0},
{"storage.linux-aio", "storage/posix", NULL, NULL, DOC, 0},
{NULL, }
};

View File

@ -4,10 +4,10 @@ xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage
posix_la_LDFLAGS = -module -avoidversion
posix_la_SOURCES = posix.c posix-helpers.c posix-handle.c
posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
posix_la_SOURCES = posix.c posix-helpers.c posix-handle.c posix-aio.c
posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBAIO)
noinst_HEADERS = posix.h posix-mem-types.h posix-handle.h
noinst_HEADERS = posix.h posix-mem-types.h posix-handle.h posix-aio.h
AM_CFLAGS = -fPIC -fno-strict-aliasing -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE \
-D$(GF_HOST_OS) -Wall -I$(top_srcdir)/libglusterfs/src -shared \

View File

@ -0,0 +1,519 @@
/*
Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
GlusterFS is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published
by the Free Software Foundation; either version 3 of the License,
or (at your option) any later version.
GlusterFS is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see
<http://www.gnu.org/licenses/>.
*/
#ifndef _CONFIG_H
#define _CONFIG_H
#include "config.h"
#endif
#include "xlator.h"
#include "glusterfs.h"
#include "posix.h"
#include <sys/uio.h>
#ifdef HAVE_LIBAIO
#include <libaio.h>
/*
 * State carried by one asynchronous read or write from submission
 * (posix_aio_readv / posix_aio_writev) to its completion handler.
 * The iocb's data pointer is set to the enclosing structure, which is how
 * posix_aio_thread gets back to it from a completion event.
 */
struct posix_aio_cb {
struct iocb iocb; /* libaio control block passed to io_submit() */
call_frame_t *frame; /* frame that is unwound when the request completes */
struct iobuf *iobuf; /* destination buffer; set for reads only */
struct iobref *iobref; /* reference taken on the caller's iobref; set for writes only */
struct iatt prebuf; /* fstat result taken before a write is submitted */
int fd; /* descriptor taken from the posix_fd context (pfd->fd) */
int op; /* GF_FOP_READ or GF_FOP_WRITE; selects the completion handler */
off_t offset; /* file offset of the request */
};
/*
 * Completion handler for an asynchronous readv request.
 *
 * @paiocb: request state allocated by posix_aio_readv(); freed here.
 * @res:    result of the request: the number of bytes read, or a negated
 *          errno value when the read failed.
 * @res2:   secondary result of the completion event (not used).
 *
 * Unwinds the readv frame with either the data read or an error, then
 * releases the iobuf/iobref references and frees @paiocb.
 *
 * Always returns 0.
 */
int
posix_aio_readv_complete (struct posix_aio_cb *paiocb, int res, int res2)
{
        call_frame_t         *frame    = NULL;
        xlator_t             *this     = NULL;
        struct iobuf         *iobuf    = NULL;
        struct iatt           postbuf  = {0,};
        int                   _fd      = -1;
        int                   op_ret   = -1;
        int                   op_errno = 0;
        /*
         * Zero-initialised: the failure paths jump to "out" before the
         * vector is filled in, and its address is still handed to
         * STACK_UNWIND_STRICT there.
         */
        struct iovec          iov      = {0,};
        struct iobref        *iobref   = NULL;
        int                   ret      = 0;
        off_t                 offset   = 0;
        struct posix_private *priv     = NULL;

        frame  = paiocb->frame;
        this   = frame->this;
        priv   = this->private;
        iobuf  = paiocb->iobuf;
        _fd    = paiocb->fd;
        offset = paiocb->offset;

        if (res < 0) {
                op_ret = -1;
                op_errno = -res;
                gf_log (this->name, GF_LOG_ERROR,
                        "readv(async) failed fd=%d,size=%lu,offset=%llu (%d/%s)",
                        _fd, paiocb->iocb.u.c.nbytes,
                        (unsigned long long) paiocb->offset,
                        res, strerror (op_errno));
                goto out;
        }

        ret = posix_fdstat (this, _fd, &postbuf);
        if (ret != 0) {
                op_ret = -1;
                op_errno = errno;
                gf_log (this->name, GF_LOG_ERROR,
                        "fstat failed on fd=%d: %s", _fd,
                        strerror (op_errno));
                goto out;
        }

        op_ret = res;
        op_errno = 0;

        iobref = iobref_new ();
        if (!iobref) {
                op_ret = -1;
                op_errno = ENOMEM;
                goto out;
        }

        iobref_add (iobref, iobuf);

        iov.iov_base = iobuf_ptr (iobuf);
        iov.iov_len = op_ret;

        /* Hack to notify higher layers of EOF. */
        if (postbuf.ia_size == 0)
                op_errno = ENOENT;
        else if ((offset + iov.iov_len) == postbuf.ia_size)
                op_errno = ENOENT;
        else if (offset > postbuf.ia_size)
                op_errno = ENOENT;

        /* account the bytes read in the translator's counter */
        LOCK (&priv->lock);
        {
                priv->read_value += op_ret;
        }
        UNLOCK (&priv->lock);

out:
        STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, &iov, 1,
                             &postbuf, iobref, NULL);

        if (iobuf)
                iobuf_unref (iobuf);
        if (iobref)
                iobref_unref (iobref);

        GF_FREE (paiocb);

        return 0;
}
/*
 * readv fop implemented on top of Linux AIO.
 *
 * Allocates an iobuf of @size bytes and a posix_aio_cb, fills in an
 * IO_CMD_PREAD control block and submits it with io_submit(). On success
 * the frame is unwound later from the completion path; on any failure it
 * is unwound here with op_ret -1 and the resources acquired so far are
 * released.
 *
 * Always returns 0.
 */
int
posix_aio_readv (call_frame_t *frame, xlator_t *this, fd_t *fd,
                 size_t size, off_t offset, uint32_t flags, dict_t *xdata)
{
        struct posix_private *priv     = NULL;
        struct posix_fd      *pfd      = NULL;
        struct posix_aio_cb  *request  = NULL;
        struct iocb          *queued   = NULL;
        struct iobuf         *buffer   = NULL;
        int32_t               op_errno = EINVAL;
        int                   status   = -1;
        int                   sysfd    = -1;

        VALIDATE_OR_GOTO (frame, err);
        VALIDATE_OR_GOTO (this, err);
        VALIDATE_OR_GOTO (fd, err);

        priv = this->private;

        status = posix_fd_ctx_get_off (fd, this, &pfd, offset);
        if (status < 0) {
                /* the lookup reports failures as a negated errno */
                op_errno = -status;
                gf_log (this->name, GF_LOG_WARNING,
                        "pfd is NULL from fd=%p", fd);
                goto err;
        }
        sysfd = pfd->fd;

        if (size == 0) {
                op_errno = EINVAL;
                gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size);
                goto err;
        }

        buffer = iobuf_get2 (this->ctx->iobuf_pool, size);
        if (buffer == NULL) {
                op_errno = ENOMEM;
                goto err;
        }

        request = GF_CALLOC (1, sizeof (*request), gf_posix_mt_paiocb);
        if (request == NULL) {
                op_errno = ENOMEM;
                goto err;
        }

        /* bookkeeping read back by the completion handler */
        request->op     = GF_FOP_READ;
        request->frame  = frame;
        request->iobuf  = buffer;
        request->fd     = sysfd;
        request->offset = offset;

        /* the control block itself; data points back at the request */
        queued = &request->iocb;
        queued->data           = request;
        queued->aio_fildes     = sysfd;
        queued->aio_lio_opcode = IO_CMD_PREAD;
        queued->aio_reqprio    = 0;
        queued->u.c.buf        = iobuf_ptr (buffer);
        queued->u.c.nbytes     = size;
        queued->u.c.offset     = offset;

        status = io_submit (priv->ctxp, 1, &queued);
        if (status != 1) {
                gf_log (this->name, GF_LOG_ERROR,
                        "io_submit() returned %d", status);
                op_errno = -status;
                goto err;
        }

        return 0;

err:
        STACK_UNWIND_STRICT (readv, frame, -1, op_errno, 0, 0, 0, 0, 0);

        if (buffer)
                iobuf_unref (buffer);
        if (request)
                GF_FREE (request);

        return 0;
}
/*
 * Completion handler for an asynchronous writev request.
 *
 * @paiocb: request state allocated by posix_aio_writev(); freed here.
 * @res:    number of bytes written, or a negated errno value on failure.
 * @res2:   secondary result of the completion event (not used).
 *
 * Unwinds the writev frame with the pre-write stat saved at submission and
 * a fresh post-write stat, then drops the iobref reference and frees
 * @paiocb.
 *
 * Always returns 0.
 */
int
posix_aio_writev_complete (struct posix_aio_cb *paiocb, int res, int res2)
{
        call_frame_t         *frame    = paiocb->frame;
        xlator_t             *this     = frame->this;
        struct posix_private *priv     = this->private;
        struct iatt           prebuf   = paiocb->prebuf;
        struct iatt           postbuf  = {0,};
        int                   sysfd    = paiocb->fd;
        int                   op_ret   = -1;
        int                   op_errno = 0;

        if (res < 0) {
                /* the write itself failed */
                op_errno = -res;
                gf_log (this->name, GF_LOG_ERROR,
                        "writev(async) failed fd=%d,offset=%llu (%d/%s)",
                        sysfd, (unsigned long long) paiocb->offset, res,
                        strerror (op_errno));
        } else if (posix_fdstat (this, sysfd, &postbuf) != 0) {
                /* the write went through but the post-op stat did not */
                op_errno = errno;
                gf_log (this->name, GF_LOG_ERROR,
                        "fstat failed on fd=%d: %s", sysfd,
                        strerror (op_errno));
        } else {
                op_ret   = res;
                op_errno = 0;

                /* account the bytes written in the translator's counter */
                LOCK (&priv->lock);
                {
                        priv->write_value += op_ret;
                }
                UNLOCK (&priv->lock);
        }

        STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &prebuf, &postbuf,
                             NULL);

        if (paiocb->iobref)
                iobref_unref (paiocb->iobref);
        GF_FREE (paiocb);

        return 0;
}
/*
 * writev fop implemented on top of Linux AIO.
 *
 * Allocates a posix_aio_cb, takes a reference on @iobref, records the
 * pre-write stat and submits an IO_CMD_PWRITEV control block for the
 * @count vectors in @iov at @offset. On success the frame is unwound later
 * from the completion path; on any failure it is unwound here with op_ret
 * -1 and the request is torn down.
 *
 * Always returns 0.
 */
int
posix_aio_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
                  struct iovec *iov, int count, off_t offset, uint32_t flags,
                  struct iobref *iobref, dict_t *xdata)
{
        struct posix_private *priv     = NULL;
        struct posix_fd      *pfd      = NULL;
        struct posix_aio_cb  *request  = NULL;
        struct iocb          *queued   = NULL;
        int32_t               op_errno = EINVAL;
        int                   status   = -1;
        int                   sysfd    = -1;

        VALIDATE_OR_GOTO (frame, err);
        VALIDATE_OR_GOTO (this, err);
        VALIDATE_OR_GOTO (fd, err);

        priv = this->private;

        status = posix_fd_ctx_get_off (fd, this, &pfd, offset);
        if (status < 0) {
                /* the lookup reports failures as a negated errno */
                op_errno = -status;
                gf_log (this->name, GF_LOG_WARNING,
                        "pfd is NULL from fd=%p", fd);
                goto err;
        }
        sysfd = pfd->fd;

        request = GF_CALLOC (1, sizeof (*request), gf_posix_mt_paiocb);
        if (request == NULL) {
                op_errno = ENOMEM;
                goto err;
        }

        /* bookkeeping read back by the completion handler */
        request->op     = GF_FOP_WRITE;
        request->frame  = frame;
        request->fd     = sysfd;
        request->offset = offset;
        request->iobref = iobref_ref (iobref);

        /* the control block itself; data points back at the request */
        queued = &request->iocb;
        queued->data           = request;
        queued->aio_fildes     = sysfd;
        queued->aio_lio_opcode = IO_CMD_PWRITEV;
        queued->aio_reqprio    = 0;
        queued->u.v.vec        = iov;
        queued->u.v.nr         = count;
        queued->u.v.offset     = offset;

        /* the pre-write attributes are returned to the caller on completion */
        status = posix_fdstat (this, sysfd, &request->prebuf);
        if (status != 0) {
                op_errno = errno;
                gf_log (this->name, GF_LOG_ERROR,
                        "fstat failed on fd=%p: %s", fd,
                        strerror (op_errno));
                goto err;
        }

        status = io_submit (priv->ctxp, 1, &queued);
        if (status != 1) {
                gf_log (this->name, GF_LOG_ERROR,
                        "io_submit() returned %d", status);
                op_errno = -status;
                goto err;
        }

        return 0;

err:
        STACK_UNWIND_STRICT (writev, frame, -1, op_errno, 0, 0, 0);

        if (request) {
                if (request->iobref)
                        iobref_unref (request->iobref);
                GF_FREE (request);
        }

        return 0;
}
/*
 * Body of the AIO completion thread started by posix_aio_init().
 *
 * @data: the posix xlator (xlator_t *).
 *
 * Blocks in io_getevents() for at least one and at most
 * POSIX_AIO_MAX_NR_GETEVENTS completions, then dispatches each one to the
 * read or write completion handler according to the op stored in the
 * request. The loop ends, and the thread returns NULL, when io_getevents()
 * reports anything other than completed events or an interrupted wait.
 */
void *
posix_aio_thread (void *data)
{
        xlator_t             *this   = NULL;
        struct posix_private *priv   = NULL;
        int                   ret    = 0;
        int                   i      = 0;
        struct io_event       events[POSIX_AIO_MAX_NR_GETEVENTS];
        struct io_event      *event  = NULL;
        struct posix_aio_cb  *paiocb = NULL;

        this = data;
        THIS = this;
        priv = this->private;

        for (;;) {
                memset (&events[0], 0, sizeof (events));
                ret = io_getevents (priv->ctxp, 1, POSIX_AIO_MAX_NR_GETEVENTS,
                                    &events[0], NULL);

                if (ret == -EINTR) {
                        /*
                         * An interrupted wait is simply retried; it is not
                         * a failure, so it is not reported at ERROR level.
                         */
                        continue;
                }

                if (ret <= 0) {
                        gf_log (this->name, GF_LOG_ERROR,
                                "io_getevents() returned %d", ret);
                        break;
                }

                for (i = 0; i < ret; i++) {
                        event = &events[i];

                        /* set to the owning request at submission time */
                        paiocb = event->data;

                        switch (paiocb->op) {
                        case GF_FOP_READ:
                                posix_aio_readv_complete (paiocb, event->res,
                                                          event->res2);
                                break;
                        case GF_FOP_WRITE:
                                posix_aio_writev_complete (paiocb, event->res,
                                                           event->res2);
                                break;
                        default:
                                /*
                                 * NOTE(review): the request is only logged
                                 * here; it is neither unwound nor freed.
                                 */
                                gf_log (this->name, GF_LOG_ERROR,
                                        "unknown op %d found in piocb",
                                        paiocb->op);
                                break;
                        }
                }
        }

        return NULL;
}
/*
 * Create the AIO context and start the completion thread.
 *
 * priv->aio_capable is set to true only when both steps succeed; in that
 * case the readv/writev fops are also switched to their AIO variants.
 *
 * Returns 0 on success, and also when io_setup() reports ENOSYS (AIO is
 * not available at run time and synchronous IO stays in place). Otherwise
 * returns the failing call's result: the negative value from io_setup() or
 * the non-zero value from pthread_create().
 */
int
posix_aio_init (xlator_t *this)
{
        struct posix_private *priv = NULL;
        int                   ret  = 0;

        priv = this->private;

        /*
         * A zero return value alone does not mean AIO is usable (see the
         * ENOSYS case), so capability is tracked explicitly.
         */
        priv->aio_capable = _gf_false;

        ret = io_setup (POSIX_AIO_MAX_NR_EVENTS, &priv->ctxp);
        if ((ret == -1 && errno == ENOSYS) || ret == -ENOSYS) {
                gf_log (this->name, GF_LOG_WARNING,
                        "Linux AIO not availble at run-time."
                        " Continuing with synchronous IO");
                ret = 0;
                goto out;
        }

        if (ret < 0) {
                gf_log (this->name, GF_LOG_WARNING,
                        "io_setup() failed. ret=%d, errno=%d",
                        ret, errno);
                goto out;
        }

        ret = pthread_create (&priv->aiothread, NULL,
                              posix_aio_thread, this);
        if (ret != 0) {
                gf_log (this->name, GF_LOG_WARNING,
                        "pthread_create() for the AIO thread failed. ret=%d",
                        ret);
                /* without a reaper thread the context is of no use */
                io_destroy (priv->ctxp);
                goto out;
        }

        this->fops->readv  = posix_aio_readv;
        this->fops->writev = posix_aio_writev;
        priv->aio_capable  = _gf_true;
out:
        return ret;
}

/*
 * Switch the translator to AIO-based readv/writev.
 *
 * The AIO machinery is initialised on the first call only; later calls
 * reuse the recorded outcome. The fops are replaced only when that
 * initialisation actually produced a working context and thread, so a
 * kernel without AIO keeps the synchronous fops.
 *
 * Returns the result of posix_aio_init() on the first call and 0 on
 * subsequent calls.
 */
int
posix_aio_on (xlator_t *this)
{
        struct posix_private *priv = NULL;
        int                   ret  = 0;

        priv = this->private;

        if (!priv->aio_init_done) {
                /* posix_aio_init() raises aio_capable on full success only */
                priv->aio_capable = _gf_false;
                ret = posix_aio_init (this);
                priv->aio_init_done = _gf_true;
        }

        if (priv->aio_capable) {
                this->fops->readv  = posix_aio_readv;
                this->fops->writev = posix_aio_writev;
        }

        return ret;
}
/*
 * Switch the translator back to the synchronous readv/writev fops.
 * Always returns 0.
 */
int
posix_aio_off (xlator_t *this)
{
        this->fops->writev = posix_writev;
        this->fops->readv  = posix_readv;

        return 0;
}
#else
/*
 * Variant compiled when HAVE_LIBAIO is not defined: there is nothing to
 * switch on, so it only logs that synchronous IO remains in use.
 * Always returns 0.
 */
int
posix_aio_on (xlator_t *this)
{
        int ret = 0;

        gf_log (this->name, GF_LOG_INFO,
                "Linux AIO not availble at build-time. "
                "Continuing with synchronous IO");

        return ret;
}
/*
 * Variant compiled when HAVE_LIBAIO is not defined: the fops are left
 * untouched and the call only logs that synchronous IO remains in use.
 * Always returns 0.
 */
int
posix_aio_off (xlator_t *this)
{
        int ret = 0;

        gf_log (this->name, GF_LOG_INFO,
                "Linux AIO not availble at build-time. "
                "Continuing with synchronous IO");

        return ret;
}
#endif

View File

@ -0,0 +1,49 @@
/*
Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
GlusterFS is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published
by the Free Software Foundation; either version 3 of the License,
or (at your option) any later version.
GlusterFS is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see
<http://www.gnu.org/licenses/>.
*/
/*
 * Interface of the Linux AIO support in the posix translator: the two
 * switches used to turn AIO-based readv/writev on and off, the tunables
 * for the AIO context, and the synchronous fops that are restored when
 * AIO is turned off.
 */
#ifndef _POSIX_AIO_H
#define _POSIX_AIO_H
#ifndef _CONFIG_H
#define _CONFIG_H
#include "config.h"
#endif
#include "xlator.h"
#include "glusterfs.h"
// Maximum number of concurrently submitted IO events; this is the value
// handed to io_setup(). The heaviest load GlusterFS has been able to
// handle had 60-80 concurrent calls.
#define POSIX_AIO_MAX_NR_EVENTS 256
// Maximum number of completed IO operations to reap per getevents syscall
#define POSIX_AIO_MAX_NR_GETEVENTS 16
/* Install the AIO readv/writev fops if AIO could be initialised. */
int posix_aio_on (xlator_t *this);
/* Restore the synchronous readv/writev fops. */
int posix_aio_off (xlator_t *this);
/* Synchronous readv fop, restored by posix_aio_off(). */
int posix_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
off_t offset, uint32_t flags, dict_t *xdata);
/* Synchronous writev fop, restored by posix_aio_off(). */
int posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iovec *vector, int32_t count, off_t offset,
uint32_t flags, struct iobref *iobref, dict_t *xdata);
#endif /* !_POSIX_AIO_H */

View File

@ -1058,5 +1058,29 @@ int
posix_fd_ctx_get_off (fd_t *fd, xlator_t *this, struct posix_fd **pfd,
                      off_t offset)
{
        /*
         * Look up the posix_fd context of fd and, under the inode lock,
         * make the O_DIRECT flag of the underlying descriptor follow the
         * alignment of offset: the flag is set when the low 12 bits of
         * offset are zero (a multiple of 4096) and cleared otherwise.
         * (*pfd)->odirect caches the current setting so fcntl() is only
         * called on a transition.
         *
         * Returns 0 on success, the non-zero result of __posix_fd_ctx_get()
         * when the lookup fails, or a negated errno when fcntl() fails.
         * Callers convert a negative result into op_errno with "-ret", so
         * the real errno is returned rather than fcntl()'s bare -1.
         */
        int ret         = 0;
        int flags       = 0;
        int want_direct = 0;

        LOCK (&fd->inode->lock);
        {
                ret = __posix_fd_ctx_get (fd, this, pfd);
                if (ret)
                        goto unlock;

                want_direct = ((offset & 0xfff) == 0);
                if (want_direct == ((*pfd)->odirect != 0))
                        goto unlock;

                flags = fcntl ((*pfd)->fd, F_GETFL);
                if (flags == -1) {
                        /* do not feed an invalid flag word to F_SETFL */
                        ret = -errno;
                        goto unlock;
                }

                if (want_direct)
                        flags |= O_DIRECT;
                else
                        flags &= ~O_DIRECT;

                if (fcntl ((*pfd)->fd, F_SETFL, flags) == -1)
                        ret = -errno;

                /*
                 * As before, the cached state is updated whether or not
                 * F_SETFL succeeded, so a failing transition is reported
                 * once instead of being retried on every request.
                 */
                (*pfd)->odirect = want_direct;
        }
unlock:
        UNLOCK (&fd->inode->lock);

        return ret;
}

View File

@ -30,6 +30,7 @@ enum gf_posix_mem_types_ {
gf_posix_mt_int32_t,
gf_posix_mt_posix_dev_t,
gf_posix_mt_trash_path,
gf_posix_mt_paiocb,
gf_posix_mt_end
};
#endif

View File

@ -59,6 +59,7 @@
#include "timer.h"
#include "glusterfs3-xdr.h"
#include "hashfn.h"
#include "posix-aio.h"
extern char *marker_xattrs[];
@ -3918,6 +3919,29 @@ mem_acct_init (xlator_t *this)
return ret;
}
/*
 * Apply a changed "linux-aio" option at run time.
 *
 * Reads the option into priv->aio_configured and switches the readv/writev
 * fops to the AIO or the synchronous implementation accordingly.
 *
 * Returns 0 on success, -1 when the option could not be read.
 */
int
reconfigure (xlator_t *this, dict_t *options)
{
        struct posix_private *priv   = this->private;
        int                   status = -1;

        GF_OPTION_RECONF ("linux-aio", priv->aio_configured,
                          options, bool, out);

        if (!priv->aio_configured)
                posix_aio_off (this);
        else
                posix_aio_on (this);

        status = 0;
out:
        return status;
}
/**
* init -
*/
@ -4248,7 +4272,23 @@ init (xlator_t *this)
"Posix landfill setup failed");
ret = -1;
goto out;
}
}
_private->aio_init_done = _gf_false;
_private->aio_capable = _gf_false;
GF_OPTION_INIT ("linux-aio", _private->aio_configured, bool, out);
if (_private->aio_configured) {
op_ret = posix_aio_on (this);
if (op_ret == -1) {
gf_log (this->name, GF_LOG_ERROR,
"Posix AIO init failed");
ret = -1;
goto out;
}
}
pthread_mutex_init (&_private->janitor_lock, NULL);
pthread_cond_init (&_private->janitor_cond, NULL);
@ -4347,5 +4387,11 @@ struct volume_options options[] = {
.type = GF_OPTION_TYPE_ANY },
{ .key = {"glusterd-uuid"},
.type = GF_OPTION_TYPE_STR },
{
.key = {"linux-aio"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
.description = "Support for native Linux AIO"
},
{ .key = {NULL} }
};

View File

@ -54,6 +54,11 @@
#include "posix-mem-types.h"
#include "posix-handle.h"
#ifdef HAVE_LIBAIO
#include <libaio.h>
#include "posix-aio.h"
#endif
/**
* posix_fd - internal structure common to file and directory fd's
*/
@ -64,7 +69,6 @@ struct posix_fd {
DIR * dir; /* handle returned by the kernel */
int flushwrites;
int odirect;
int op_performed;
struct list_head list; /* to add to the janitor list */
};
@ -124,6 +128,13 @@ struct posix_private {
/* uuid of glusterd that swapned the brick process */
uuid_t glusterd_uuid;
gf_boolean_t aio_configured;
gf_boolean_t aio_init_done;
gf_boolean_t aio_capable;
#ifdef HAVE_LIBAIO
io_context_t ctxp;
pthread_t aiothread;
#endif
};
#define POSIX_BASE_PATH(this) (((struct posix_private *)this->private)->base_path)