1
0
mirror of https://github.com/samba-team/samba.git synced 2025-08-04 08:22:08 +03:00

s3:modules: add vfs_io_uring module

The module makes use of the new io_uring infrastructure
(introduced in Linux 5.1), see https://lwn.net/Articles/778411/ and
http://git.kernel.dk/cgit/liburing/

Currently this only implements SMB_VFS_{PREAD,PWRITE,FSYNC}_SEND/RECV
and avoids the overhead of our userspace threadpool.

In the future we'll hopefully make use of more advanced io_uring
features.

For now we don't have automated tests as our test infrastructure
doesn't use a recent kernel. At least we're able to do compile tests
on fedora31.

Signed-off-by: Stefan Metzmacher <metze@samba.org>
Reviewed-by: Jeremy Allison <jra@samba.org>

Autobuild-User(master): Stefan Metzmacher <metze@samba.org>
Autobuild-Date(master): Sat Feb 15 11:37:45 UTC 2020 on sn-devel-184
This commit is contained in:
Stefan Metzmacher
2019-06-05 17:01:49 +02:00
parent fb5a99fa02
commit 195e88cea3
5 changed files with 670 additions and 0 deletions

View File

@ -0,0 +1,107 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!DOCTYPE refentry PUBLIC "-//Samba-Team//DTD DocBook V4.2-Based Variant V1.0//EN" "http://www.samba.org/samba/DTD/samba-doc">
<refentry id="vfs_io_uring.8">
<refmeta>
<refentrytitle>vfs_io_uring</refentrytitle>
<manvolnum>8</manvolnum>
<refmiscinfo class="source">Samba</refmiscinfo>
<refmiscinfo class="manual">System Administration tools</refmiscinfo>
<refmiscinfo class="version">&doc.version;</refmiscinfo>
</refmeta>
<refnamediv>
<refname>vfs_io_uring</refname>
<refpurpose>Implement async io in Samba vfs using io_uring of Linux (>= 5.1).</refpurpose>
</refnamediv>
<refsynopsisdiv>
<cmdsynopsis>
<command>vfs objects = io_uring</command>
</cmdsynopsis>
</refsynopsisdiv>
<refsect1>
<title>DESCRIPTION</title>
<para>This VFS module is part of the
<citerefentry><refentrytitle>samba</refentrytitle>
<manvolnum>7</manvolnum></citerefentry> suite.</para>
<para>The <command>io_uring</command> VFS module enables asynchronous
pread, pwrite and fsync using the io_uring infrastructure of Linux (>= 5.1).
This provides much less overhead compared to the usage of the pthreadpool for
async io.</para>
<para>This module SHOULD be listed last in any module stack as
it requires real kernel file descriptors.</para>
</refsect1>
<refsect1>
<title>EXAMPLES</title>
<para>Straightforward use:</para>
<programlisting>
<smbconfsection name="[cooldata]"/>
<smbconfoption name="path">/data/ice</smbconfoption>
<smbconfoption name="vfs objects">io_uring</smbconfoption>
</programlisting>
</refsect1>
<refsect1>
<title>OPTIONS</title>
<variablelist>
<varlistentry>
<term>io_uring:num_entries = NUMBER_OF_QUEUE_ENTRIES</term>
<listitem>
<para>The number of entries in the submission queue.
The maximum allowed value depends on the kernel version
and the kernel will round up the value to a power of 2.
</para>
<para>The default is '128'.</para>
</listitem>
</varlistentry>
<varlistentry>
<term>io_uring:sqpoll = BOOL</term>
<listitem>
<para>Use the IORING_SETUP_SQPOLL feature.
</para>
<para>The default is 'no'.</para>
</listitem>
</varlistentry>
</variablelist>
</refsect1>
<refsect1>
<title>SEE ALSO</title>
<para>
<citerefentry><refentrytitle>io_uring_setup</refentrytitle><manvolnum>2</manvolnum></citerefentry>.
</para>
</refsect1>
<refsect1>
<title>VERSION</title>
<para>This man page is part of version &doc.version; of the Samba suite.
</para>
</refsect1>
<refsect1>
<title>AUTHOR</title>
<para>The original Samba software and related utilities
were created by Andrew Tridgell. Samba is now developed
by the Samba Team as an Open Source project similar
to the way the Linux kernel is developed.</para>
</refsect1>
</refentry>

View File

@ -68,6 +68,7 @@ vfs_module_manpages = ['vfs_acl_tdb',
'vfs_acl_xattr', 'vfs_acl_xattr',
'vfs_aio_fork', 'vfs_aio_fork',
'vfs_aio_pthread', 'vfs_aio_pthread',
'vfs_io_uring',
'vfs_audit', 'vfs_audit',
'vfs_btrfs', 'vfs_btrfs',
'vfs_cacheprime', 'vfs_cacheprime',

View File

@ -0,0 +1,543 @@
/*
* Use the io_uring of Linux (>= 5.1)
*
* Copyright (C) Volker Lendecke 2008
* Copyright (C) Jeremy Allison 2010
* Copyright (C) Stefan Metzmacher 2019
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "includes.h"
#include "system/filesys.h"
#include "smbd/smbd.h"
#include "smbd/globals.h"
#include "lib/util/tevent_unix.h"
#include "smbprofile.h"
#include <liburing.h>
/*
 * Forward declaration: the config below keeps lists of requests, and each
 * request points back to its config.
 */
struct vfs_io_uring_request;
/*
 * Per-connection module state, allocated in vfs_io_uring_connect().
 */
struct vfs_io_uring_config {
/* the ring itself; uring.ring_fd is set to -1 once the ring was shut down */
struct io_uring uring;
/* tevent fd event watching uring.ring_fd for readability */
struct tevent_fd *fde;
/* requests that have not been copied into a submission queue entry yet */
struct vfs_io_uring_request *queue;
/* requests that were copied into the ring and await their completion */
struct vfs_io_uring_request *pending;
};
/*
 * Bookkeeping for a single pread/pwrite/fsync operation.
 *
 * It is embedded as the first member ("ur") of every per-operation tevent
 * state struct; vfs_io_uring_request_state_deny_destructor() relies on that
 * placement when it casts a generic state pointer.
 */
struct vfs_io_uring_request {
/* DLIST linkage within config->queue or config->pending */
struct vfs_io_uring_request *prev, *next;
/* head of the list this request is currently on, NULL when on none */
struct vfs_io_uring_request **list_head;
/* the module state this request belongs to */
struct vfs_io_uring_config *config;
/* the tevent request to complete once the operation has finished */
struct tevent_req *req;
/* the tevent state struct that embeds this request */
void *state;
/* prepared submission entry, copied into the ring by vfs_io_uring_queue_run() */
struct io_uring_sqe sqe;
/* copy of the completion entry, filled in by vfs_io_uring_finish_req() */
struct io_uring_cqe cqe;
/* set when the request is handed to the ring (or failed while still queued) */
struct timespec start_time;
/* set by vfs_io_uring_finish_req() */
struct timespec end_time;
SMBPROFILE_BYTES_ASYNC_STATE(profile_bytes);
};
/*
 * Complete one request: record the result, detach the request from the
 * module's bookkeeping and mark the tevent request as done.
 *
 * cur:      the request to complete
 * cqe:      completion entry, copied into cur->cqe
 * end_time: timestamp stored in cur->end_time
 * location: caller location, handed to _tevent_req_done()
 */
static void vfs_io_uring_finish_req(struct vfs_io_uring_request *cur,
				    const struct io_uring_cqe *cqe,
				    struct timespec end_time,
				    const char *location)
{
	struct tevent_req *treq = NULL;

	/* aborts if cur->req is not a talloc'ed tevent_req */
	treq = talloc_get_type_abort(cur->req, struct tevent_req);

	/* keep the outcome for the matching _recv() function */
	cur->cqe = *cqe;
	cur->end_time = end_time;
	SMBPROFILE_BYTES_ASYNC_SET_IDLE(cur->profile_bytes);

	/* the state may be freed again without tearing down the ring */
	talloc_set_destructor(cur->state, NULL);

	/* drop the request from whichever list still holds it */
	if (cur->list_head != NULL) {
		DLIST_REMOVE((*cur->list_head), cur);
		cur->list_head = NULL;
	}

	/*
	 * The caller guarantees that we are either still inside
	 * the _send() function or that tevent_req_defer_callback()
	 * was called on the request already.
	 */
	_tevent_req_done(treq, location);
}
static void vfs_io_uring_config_destroy(struct vfs_io_uring_config *config,
int ret,
const char *location)
{
struct vfs_io_uring_request *cur = NULL, *next = NULL;
struct timespec start_time;
struct timespec end_time;
struct io_uring_cqe err_cqe = {
.res = ret,
};
PROFILE_TIMESTAMP(&start_time);
if (config->uring.ring_fd != -1) {
/* TODO: cancel queued and pending requests */
TALLOC_FREE(config->fde);
io_uring_queue_exit(&config->uring);
config->uring.ring_fd = -1;
}
PROFILE_TIMESTAMP(&end_time);
for (cur = config->pending; cur != NULL; cur = next) {
next = cur->next;
err_cqe.user_data = (uintptr_t)(void *)cur;
vfs_io_uring_finish_req(cur, &err_cqe, end_time, location);
}
for (cur = config->queue; cur != NULL; cur = next) {
next = cur->next;
err_cqe.user_data = (uintptr_t)(void *)cur;
cur->start_time = start_time;
vfs_io_uring_finish_req(cur, &err_cqe, end_time, location);
}
}
/*
 * talloc destructor for the module state (installed in
 * vfs_io_uring_connect()): shuts the ring down and completes all
 * outstanding requests with -EUCLEAN. Returns 0.
 */
static int vfs_io_uring_config_destructor(struct vfs_io_uring_config *config)
{
vfs_io_uring_config_destroy(config, -EUCLEAN, __location__);
return 0;
}
/*
 * talloc destructor installed on a request's state while the request is
 * on the pending list (see vfs_io_uring_queue_run()).
 *
 * _state: the per-operation tevent state struct; it must start with its
 *         struct vfs_io_uring_request member.
 *
 * Unlinks the request and shuts down the whole ring, completing all other
 * outstanding requests with -ESHUTDOWN. Always returns 0.
 */
static int vfs_io_uring_request_state_deny_destructor(void *_state)
{
	/*
	 * Generic view onto any of the per-operation state structs,
	 * which all begin with their vfs_io_uring_request.
	 */
	struct vfs_io_uring_generic_state {
		struct vfs_io_uring_request ur;
	} *state = (struct vfs_io_uring_generic_state *)_state;
	struct vfs_io_uring_request *cur = &state->ur;

	/* our parent is gone */
	cur->req = NULL;

	/*
	 * remove ourself from any list, using the same NULL guard
	 * as vfs_io_uring_finish_req()
	 */
	if (cur->list_head != NULL) {
		DLIST_REMOVE((*cur->list_head), cur);
		cur->list_head = NULL;
	}

	/*
	 * Our state is about to go away,
	 * all we can do is shutting down the whole uring.
	 * But that's ok as we're most likely called from exit_server()
	 */
	vfs_io_uring_config_destroy(cur->config, -ESHUTDOWN, __location__);

	return 0;
}
/*
 * Forward declaration: vfs_io_uring_connect() registers this handler
 * with tevent_add_fd() before the handler is defined further down.
 */
static void vfs_io_uring_fd_handler(struct tevent_context *ev,
struct tevent_fd *fde,
uint16_t flags,
void *private_data);
/*
 * connect_fn: allocate the per-connection module state, set up the
 * io_uring and register its fd with the connection's tevent context.
 *
 * The ring is configured by the parametric options documented in the
 * vfs_io_uring manpage:
 *   io_uring:num_entries  (default 128, at least 1 is used)
 *   io_uring:sqpoll       (default no, enables IORING_SETUP_SQPOLL)
 *
 * Returns 0 on success. On failure a negative value is returned; for
 * failures after SMB_VFS_NEXT_CONNECT() succeeded the next module is
 * disconnected again and errno describes the error.
 */
static int vfs_io_uring_connect(vfs_handle_struct *handle, const char *service,
				const char *user)
{
	int ret;
	struct vfs_io_uring_config *config;
	unsigned num_entries;
	bool sqpoll;
	unsigned flags = 0;

	config = talloc_zero(handle->conn, struct vfs_io_uring_config);
	if (config == NULL) {
		DEBUG(0, ("talloc_zero() failed\n"));
		return -1;
	}

	SMB_VFS_HANDLE_SET_DATA(handle, config,
				NULL, struct vfs_io_uring_config,
				return -1);

	ret = SMB_VFS_NEXT_CONNECT(handle, service, user);
	if (ret < 0) {
		return ret;
	}

	/*
	 * The option prefix has to be the module name "io_uring"
	 * (as registered in vfs_io_uring_init() and documented in
	 * the manpage), otherwise the documented settings are ignored.
	 */
	num_entries = lp_parm_ulong(SNUM(handle->conn),
				    "io_uring",
				    "num_entries",
				    128);
	num_entries = MAX(num_entries, 1);

	sqpoll = lp_parm_bool(SNUM(handle->conn),
			      "io_uring",
			      "sqpoll",
			      false);
	if (sqpoll) {
		flags |= IORING_SETUP_SQPOLL;
	}

	/* liburing reports errors as negative errno values */
	ret = io_uring_queue_init(num_entries, &config->uring, flags);
	if (ret < 0) {
		SMB_VFS_NEXT_DISCONNECT(handle);
		errno = -ret;
		return -1;
	}

	/* from now on freeing the config shuts down the ring */
	talloc_set_destructor(config, vfs_io_uring_config_destructor);

#ifdef HAVE_IO_URING_RING_DONTFORK
	ret = io_uring_ring_dontfork(&config->uring);
	if (ret < 0) {
		SMB_VFS_NEXT_DISCONNECT(handle);
		errno = -ret;
		return -1;
	}
#endif /* HAVE_IO_URING_RING_DONTFORK */

	config->fde = tevent_add_fd(handle->conn->sconn->ev_ctx,
				    config,
				    config->uring.ring_fd,
				    TEVENT_FD_READ,
				    vfs_io_uring_fd_handler,
				    handle);
	if (config->fde == NULL) {
		/* keep errno intact across the disconnect call */
		ret = errno;
		SMB_VFS_NEXT_DISCONNECT(handle);
		errno = ret;
		return -1;
	}

	return 0;
}
/*
 * Drive the ring once:
 *  1. move as many queued requests as there are free submission
 *     entries onto the pending list,
 *  2. submit them to the kernel,
 *  3. complete every request for which a completion entry is available.
 *
 * If the ring was already shut down, or the submit fails with anything
 * other than -EAGAIN/-EBUSY, all outstanding requests are failed via
 * vfs_io_uring_config_destroy().
 */
static void vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
{
	struct vfs_io_uring_request *ur = NULL;
	struct vfs_io_uring_request *following = NULL;
	struct io_uring_cqe *cqe = NULL;
	unsigned cqhead;
	unsigned num_completed = 0;
	struct timespec submit_time;
	struct timespec complete_time;
	int rc;

	PROFILE_TIMESTAMP(&submit_time);

	if (config->uring.ring_fd == -1) {
		vfs_io_uring_config_destroy(config, -ESTALE, __location__);
		return;
	}

	ur = config->queue;
	while (ur != NULL) {
		struct io_uring_sqe *slot = io_uring_get_sqe(&config->uring);

		if (slot == NULL) {
			/* no free entry, the rest stays queued */
			break;
		}

		following = ur->next;

		/* hand the prepared entry over to the ring */
		*slot = ur->sqe;

		/*
		 * From here on the state must not silently disappear
		 * while the request sits on the pending list.
		 */
		talloc_set_destructor(ur->state,
			vfs_io_uring_request_state_deny_destructor);

		DLIST_REMOVE(config->queue, ur);
		DLIST_ADD_END(config->pending, ur);
		ur->list_head = &config->pending;

		SMBPROFILE_BYTES_ASYNC_SET_BUSY(ur->profile_bytes);
		ur->start_time = submit_time;

		ur = following;
	}

	rc = io_uring_submit(&config->uring);
	if (rc < 0 && rc != -EAGAIN && rc != -EBUSY) {
		vfs_io_uring_config_destroy(config, rc, __location__);
		return;
	}
	/* on -EAGAIN and -EBUSY we just retry later */

	PROFILE_TIMESTAMP(&complete_time);

	io_uring_for_each_cqe(&config->uring, cqhead, cqe) {
		ur = (struct vfs_io_uring_request *)io_uring_cqe_get_data(cqe);
		vfs_io_uring_finish_req(ur, cqe, complete_time, __location__);
		num_completed++;
	}

	/* tell the ring how many completion entries were consumed */
	io_uring_cq_advance(&config->uring, num_completed);
}
/*
 * tevent fd handler for the ring fd, registered in vfs_io_uring_connect().
 *
 * private_data is the vfs_handle_struct of this module instance. Panics
 * if the handle carries no module state, otherwise runs the queue once.
 */
static void vfs_io_uring_fd_handler(struct tevent_context *ev,
				    struct tevent_fd *fde,
				    uint16_t flags,
				    void *private_data)
{
	struct vfs_io_uring_config *cfg = NULL;
	vfs_handle_struct *vfs_handle = (vfs_handle_struct *)private_data;

	SMB_VFS_HANDLE_GET_DATA(vfs_handle, cfg,
				struct vfs_io_uring_config,
				smb_panic(__location__));

	vfs_io_uring_queue_run(cfg);
}
/* tevent state of an async pread */
struct vfs_io_uring_pread_state {
/* generic request bookkeeping, kept as the first member */
struct vfs_io_uring_request ur;
/* single-element io vector describing the caller's buffer */
struct iovec iov;
};
/*
 * pread_send_fn: start an async read of n bytes at offset from fsp
 * into data using a readv submission entry.
 *
 * Returns NULL if tevent_req_create() fails. If the request already
 * finished while running the queue it is returned via tevent_req_post(),
 * otherwise its callback is deferred to ev and the request is returned
 * in progress.
 */
static struct tevent_req *vfs_io_uring_pread_send(struct vfs_handle_struct *handle,
						  TALLOC_CTX *mem_ctx,
						  struct tevent_context *ev,
						  struct files_struct *fsp,
						  void *data,
						  size_t n, off_t offset)
{
	struct vfs_io_uring_config *config = NULL;
	struct vfs_io_uring_pread_state *state = NULL;
	struct vfs_io_uring_request *ur = NULL;
	struct tevent_req *req = NULL;

	SMB_VFS_HANDLE_GET_DATA(handle, config,
				struct vfs_io_uring_config,
				smb_panic(__location__));

	req = tevent_req_create(mem_ctx, &state,
				struct vfs_io_uring_pread_state);
	if (req == NULL) {
		return NULL;
	}

	ur = &state->ur;
	ur->config = config;
	ur->req = req;
	ur->state = state;

	SMBPROFILE_BYTES_ASYNC_START(syscall_asys_pread, profile_p,
				     ur->profile_bytes, n);
	SMBPROFILE_BYTES_ASYNC_SET_IDLE(ur->profile_bytes);

	/* the iovec lives in the state, so it stays valid while pending */
	state->iov.iov_base = data;
	state->iov.iov_len = n;

	io_uring_prep_readv(&ur->sqe,
			    fsp->fh->fd,
			    &state->iov, 1,
			    offset);
	io_uring_sqe_set_data(&ur->sqe, ur);

	DLIST_ADD_END(config->queue, ur);
	ur->list_head = &config->queue;

	vfs_io_uring_queue_run(config);

	if (tevent_req_is_in_progress(req)) {
		tevent_req_defer_callback(req, ev);
		return req;
	}

	/* already completed (or failed) while running the queue */
	return tevent_req_post(req, ev);
}
static ssize_t vfs_io_uring_pread_recv(struct tevent_req *req,
struct vfs_aio_state *vfs_aio_state)
{
struct vfs_io_uring_pread_state *state = tevent_req_data(
req, struct vfs_io_uring_pread_state);
int ret;
SMBPROFILE_BYTES_ASYNC_END(state->ur.profile_bytes);
vfs_aio_state->duration = nsec_time_diff(&state->ur.end_time,
&state->ur.start_time);
if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
return -1;
}
if (state->ur.cqe.res < 0) {
vfs_aio_state->error = -state->ur.cqe.res;
ret = -1;
} else {
vfs_aio_state->error = 0;
ret = state->ur.cqe.res;
}
tevent_req_received(req);
return ret;
}
/* tevent state of an async pwrite */
struct vfs_io_uring_pwrite_state {
/* generic request bookkeeping, kept as the first member */
struct vfs_io_uring_request ur;
/* single-element io vector describing the caller's buffer */
struct iovec iov;
};
/*
 * pwrite_send_fn: start an async write of n bytes from data to fsp at
 * offset using a writev submission entry.
 *
 * Returns NULL if tevent_req_create() fails. If the request already
 * finished while running the queue it is returned via tevent_req_post(),
 * otherwise its callback is deferred to ev and the request is returned
 * in progress.
 */
static struct tevent_req *vfs_io_uring_pwrite_send(struct vfs_handle_struct *handle,
						   TALLOC_CTX *mem_ctx,
						   struct tevent_context *ev,
						   struct files_struct *fsp,
						   const void *data,
						   size_t n, off_t offset)
{
	struct vfs_io_uring_config *config = NULL;
	struct vfs_io_uring_pwrite_state *state = NULL;
	struct vfs_io_uring_request *ur = NULL;
	struct tevent_req *req = NULL;

	SMB_VFS_HANDLE_GET_DATA(handle, config,
				struct vfs_io_uring_config,
				smb_panic(__location__));

	req = tevent_req_create(mem_ctx, &state,
				struct vfs_io_uring_pwrite_state);
	if (req == NULL) {
		return NULL;
	}

	ur = &state->ur;
	ur->config = config;
	ur->req = req;
	ur->state = state;

	SMBPROFILE_BYTES_ASYNC_START(syscall_asys_pwrite, profile_p,
				     ur->profile_bytes, n);
	SMBPROFILE_BYTES_ASYNC_SET_IDLE(ur->profile_bytes);

	/* struct iovec has no const variant, the buffer is only read */
	state->iov.iov_base = discard_const(data);
	state->iov.iov_len = n;

	io_uring_prep_writev(&ur->sqe,
			     fsp->fh->fd,
			     &state->iov, 1,
			     offset);
	io_uring_sqe_set_data(&ur->sqe, ur);

	DLIST_ADD_END(config->queue, ur);
	ur->list_head = &config->queue;

	vfs_io_uring_queue_run(config);

	if (tevent_req_is_in_progress(req)) {
		tevent_req_defer_callback(req, ev);
		return req;
	}

	/* already completed (or failed) while running the queue */
	return tevent_req_post(req, ev);
}
static ssize_t vfs_io_uring_pwrite_recv(struct tevent_req *req,
struct vfs_aio_state *vfs_aio_state)
{
struct vfs_io_uring_pwrite_state *state = tevent_req_data(
req, struct vfs_io_uring_pwrite_state);
int ret;
SMBPROFILE_BYTES_ASYNC_END(state->ur.profile_bytes);
vfs_aio_state->duration = nsec_time_diff(&state->ur.end_time,
&state->ur.start_time);
if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
return -1;
}
if (state->ur.cqe.res < 0) {
vfs_aio_state->error = -state->ur.cqe.res;
ret = -1;
} else {
vfs_aio_state->error = 0;
ret = state->ur.cqe.res;
}
tevent_req_received(req);
return ret;
}
/* tevent state of an async fsync */
struct vfs_io_uring_fsync_state {
/* generic request bookkeeping, kept as the first member */
struct vfs_io_uring_request ur;
};
/*
 * fsync_send_fn: start an async fsync of fsp (with fsync_flags 0).
 *
 * Returns NULL if tevent_req_create() fails. If the request already
 * finished while running the queue it is returned via tevent_req_post(),
 * otherwise its callback is deferred to ev and the request is returned
 * in progress.
 */
static struct tevent_req *vfs_io_uring_fsync_send(struct vfs_handle_struct *handle,
						  TALLOC_CTX *mem_ctx,
						  struct tevent_context *ev,
						  struct files_struct *fsp)
{
	struct vfs_io_uring_config *config = NULL;
	struct vfs_io_uring_fsync_state *state = NULL;
	struct vfs_io_uring_request *ur = NULL;
	struct tevent_req *req = NULL;

	SMB_VFS_HANDLE_GET_DATA(handle, config,
				struct vfs_io_uring_config,
				smb_panic(__location__));

	req = tevent_req_create(mem_ctx, &state,
				struct vfs_io_uring_fsync_state);
	if (req == NULL) {
		return NULL;
	}

	ur = &state->ur;
	ur->config = config;
	ur->req = req;
	ur->state = state;

	SMBPROFILE_BYTES_ASYNC_START(syscall_asys_fsync, profile_p,
				     ur->profile_bytes, 0);
	SMBPROFILE_BYTES_ASYNC_SET_IDLE(ur->profile_bytes);

	io_uring_prep_fsync(&ur->sqe,
			    fsp->fh->fd,
			    0); /* fsync_flags */
	io_uring_sqe_set_data(&ur->sqe, ur);

	DLIST_ADD_END(config->queue, ur);
	ur->list_head = &config->queue;

	vfs_io_uring_queue_run(config);

	if (tevent_req_is_in_progress(req)) {
		tevent_req_defer_callback(req, ev);
		return req;
	}

	/* already completed (or failed) while running the queue */
	return tevent_req_post(req, ev);
}
static int vfs_io_uring_fsync_recv(struct tevent_req *req,
struct vfs_aio_state *vfs_aio_state)
{
struct vfs_io_uring_fsync_state *state = tevent_req_data(
req, struct vfs_io_uring_fsync_state);
int ret;
SMBPROFILE_BYTES_ASYNC_END(state->ur.profile_bytes);
vfs_aio_state->duration = nsec_time_diff(&state->ur.end_time,
&state->ur.start_time);
if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
return -1;
}
if (state->ur.cqe.res < 0) {
vfs_aio_state->error = -state->ur.cqe.res;
ret = -1;
} else {
vfs_aio_state->error = 0;
ret = state->ur.cqe.res;
}
tevent_req_received(req);
return ret;
}
/*
 * VFS operations implemented by this module: connection setup plus the
 * async send/recv pairs for pread, pwrite and fsync.
 */
static struct vfs_fn_pointers vfs_io_uring_fns = {
.connect_fn = vfs_io_uring_connect,
.pread_send_fn = vfs_io_uring_pread_send,
.pread_recv_fn = vfs_io_uring_pread_recv,
.pwrite_send_fn = vfs_io_uring_pwrite_send,
.pwrite_recv_fn = vfs_io_uring_pwrite_recv,
.fsync_send_fn = vfs_io_uring_fsync_send,
.fsync_recv_fn = vfs_io_uring_fsync_recv,
};
static_decl_vfs;
/*
 * Module entry point: registers the operations table under the
 * module name "io_uring" and returns the registration status.
 */
NTSTATUS vfs_io_uring_init(TALLOC_CTX *ctx)
{
	NTSTATUS status;

	status = smb_register_vfs(SMB_VFS_INTERFACE_VERSION,
				  "io_uring", &vfs_io_uring_fns);
	return status;
}

View File

@ -396,6 +396,14 @@ bld.SAMBA3_MODULE('vfs_aio_pthread',
internal_module=bld.SAMBA3_IS_STATIC_MODULE('vfs_aio_pthread'), internal_module=bld.SAMBA3_IS_STATIC_MODULE('vfs_aio_pthread'),
enabled=bld.SAMBA3_IS_ENABLED_MODULE('vfs_aio_pthread')) enabled=bld.SAMBA3_IS_ENABLED_MODULE('vfs_aio_pthread'))
bld.SAMBA3_MODULE('vfs_io_uring',
subsystem='vfs',
source='vfs_io_uring.c',
deps='samba-util tevent uring',
init_function='',
internal_module=bld.SAMBA3_IS_STATIC_MODULE('vfs_io_uring'),
enabled=bld.SAMBA3_IS_ENABLED_MODULE('vfs_io_uring'))
bld.SAMBA3_MODULE('vfs_preopen', bld.SAMBA3_MODULE('vfs_preopen',
subsystem='vfs', subsystem='vfs',
source='vfs_preopen.c', source='vfs_preopen.c',

View File

@ -1754,6 +1754,14 @@ main() {
and conf.CHECK_LIB('dbus-1', shlib=True)): and conf.CHECK_LIB('dbus-1', shlib=True)):
conf.DEFINE('HAVE_DBUS', '1') conf.DEFINE('HAVE_DBUS', '1')
if conf.CHECK_CFG(package='liburing', args='--cflags --libs',
msg='Checking for liburing package', uselib_store="URING"):
if (conf.CHECK_HEADERS('liburing.h', lib='uring')
and conf.CHECK_LIB('uring', shlib=True)):
conf.CHECK_FUNCS_IN('io_uring_ring_dontfork', 'uring',
headers='liburing.h')
conf.DEFINE('HAVE_LIBURING', '1')
conf.env.build_regedit = False conf.env.build_regedit = False
if not Options.options.with_regedit == False: if not Options.options.with_regedit == False:
conf.PROCESS_SEPARATE_RULE('system_ncurses') conf.PROCESS_SEPARATE_RULE('system_ncurses')
@ -1932,6 +1940,9 @@ main() {
if (conf.CONFIG_SET('HAVE_STRUCT_MSGHDR_MSG_CONTROL') or conf.CONFIG_SET('HAVE_STRUCT_MSGHDR_MSG_ACCRIGHTS')): if (conf.CONFIG_SET('HAVE_STRUCT_MSGHDR_MSG_CONTROL') or conf.CONFIG_SET('HAVE_STRUCT_MSGHDR_MSG_ACCRIGHTS')):
default_shared_modules.extend(TO_LIST('vfs_aio_fork')) default_shared_modules.extend(TO_LIST('vfs_aio_fork'))
if conf.CONFIG_SET('HAVE_LIBURING'):
default_shared_modules.extend(TO_LIST('vfs_io_uring'))
if Options.options.with_pthreadpool: if Options.options.with_pthreadpool:
default_shared_modules.extend(TO_LIST('vfs_aio_pthread')) default_shared_modules.extend(TO_LIST('vfs_aio_pthread'))