5cf4f52e6d
mmap_lock nests under uring_lock out of necessity, as we may be doing user copies with uring_lock held. However, for mmap of provided buffer rings, we attempt to grab uring_lock with mmap_lock already held from do_mmap(). This makes lockdep, rightfully, complain: WARNING: possible circular locking dependency detected 6.7.0-rc1-00009-gff3337ebaf94-dirty #4438 Not tainted ------------------------------------------------------ buf-ring.t/442 is trying to acquire lock: ffff00020e1480a8 (&ctx->uring_lock){+.+.}-{3:3}, at: io_uring_validate_mmap_request.isra.0+0x4c/0x140 but task is already holding lock: ffff0000dc226190 (&mm->mmap_lock){++++}-{3:3}, at: vm_mmap_pgoff+0x124/0x264 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&mm->mmap_lock){++++}-{3:3}: __might_fault+0x90/0xbc io_register_pbuf_ring+0x94/0x488 __arm64_sys_io_uring_register+0x8dc/0x1318 invoke_syscall+0x5c/0x17c el0_svc_common.constprop.0+0x108/0x130 do_el0_svc+0x2c/0x38 el0_svc+0x4c/0x94 el0t_64_sync_handler+0x118/0x124 el0t_64_sync+0x168/0x16c -> #0 (&ctx->uring_lock){+.+.}-{3:3}: __lock_acquire+0x19a0/0x2d14 lock_acquire+0x2e0/0x44c __mutex_lock+0x118/0x564 mutex_lock_nested+0x20/0x28 io_uring_validate_mmap_request.isra.0+0x4c/0x140 io_uring_mmu_get_unmapped_area+0x3c/0x98 get_unmapped_area+0xa4/0x158 do_mmap+0xec/0x5b4 vm_mmap_pgoff+0x158/0x264 ksys_mmap_pgoff+0x1d4/0x254 __arm64_sys_mmap+0x80/0x9c invoke_syscall+0x5c/0x17c el0_svc_common.constprop.0+0x108/0x130 do_el0_svc+0x2c/0x38 el0_svc+0x4c/0x94 el0t_64_sync_handler+0x118/0x124 el0t_64_sync+0x168/0x16c From that mmap(2) path, we really just need to ensure that the buffer list doesn't go away from underneath us. For the lower indexed entries, they never go away until the ring is freed and we can always sanely reference those as long as the caller has a file reference. For the higher indexed ones in our xarray, we just need to ensure that the buffer list remains valid while we return the address of it. Free the higher indexed io_buffer_list entries via RCU. With that we can avoid needing ->uring_lock inside mmap(2), and simply hold the RCU read lock around the buffer list lookup and address check. To ensure that the arrayed lookup either returns a valid fully formulated entry via RCU lookup, add an 'is_ready' flag that we access with store and release memory ordering. This isn't needed for the xarray lookups, but doesn't hurt either. Since this isn't a fast path, retain it across both types. Similarly, for the allocated array inside the ctx, ensure we use the proper load/acquire as setup could in theory be running in parallel with mmap. While in there, add a few lockdep checks for documentation purposes. Cc: stable@vger.kernel.org Fixes: c56e022c0a27 ("io_uring: add support for user mapped provided buffer ring") Signed-off-by: Jens Axboe <axboe@kernel.dk>
148 lines
4.0 KiB
C
148 lines
4.0 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
#ifndef IOU_KBUF_H
|
|
#define IOU_KBUF_H
|
|
|
|
#include <uapi/linux/io_uring.h>
|
|
|
|
struct io_buffer_list {
|
|
/*
|
|
* If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not,
|
|
* then these are classic provided buffers and ->buf_list is used.
|
|
*/
|
|
union {
|
|
struct list_head buf_list;
|
|
struct {
|
|
struct page **buf_pages;
|
|
struct io_uring_buf_ring *buf_ring;
|
|
};
|
|
struct rcu_head rcu;
|
|
};
|
|
__u16 bgid;
|
|
|
|
/* below is for ring provided buffers */
|
|
__u16 buf_nr_pages;
|
|
__u16 nr_entries;
|
|
__u16 head;
|
|
__u16 mask;
|
|
|
|
/* ring mapped provided buffers */
|
|
__u8 is_mapped;
|
|
/* ring mapped provided buffers, but mmap'ed by application */
|
|
__u8 is_mmap;
|
|
/* bl is visible from an RCU point of view for lookup */
|
|
__u8 is_ready;
|
|
};
|
|
|
|
struct io_buffer {
|
|
struct list_head list;
|
|
__u64 addr;
|
|
__u32 len;
|
|
__u16 bid;
|
|
__u16 bgid;
|
|
};
|
|
|
|
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
|
|
unsigned int issue_flags);
|
|
void io_destroy_buffers(struct io_ring_ctx *ctx);
|
|
|
|
int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
|
int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags);
|
|
|
|
int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
|
int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags);
|
|
|
|
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
|
|
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
|
|
|
|
void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx);
|
|
|
|
unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
|
|
|
|
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
|
|
|
|
void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid);
|
|
|
|
static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
|
|
{
|
|
/*
|
|
* We don't need to recycle for REQ_F_BUFFER_RING, we can just clear
|
|
* the flag and hence ensure that bl->head doesn't get incremented.
|
|
* If the tail has already been incremented, hang on to it.
|
|
* The exception is partial io, that case we should increment bl->head
|
|
* to monopolize the buffer.
|
|
*/
|
|
if (req->buf_list) {
|
|
if (req->flags & REQ_F_PARTIAL_IO) {
|
|
/*
|
|
* If we end up here, then the io_uring_lock has
|
|
* been kept held since we retrieved the buffer.
|
|
* For the io-wq case, we already cleared
|
|
* req->buf_list when the buffer was retrieved,
|
|
* hence it cannot be set here for that case.
|
|
*/
|
|
req->buf_list->head++;
|
|
req->buf_list = NULL;
|
|
} else {
|
|
req->buf_index = req->buf_list->bgid;
|
|
req->flags &= ~REQ_F_BUFFER_RING;
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
static inline bool io_do_buffer_select(struct io_kiocb *req)
|
|
{
|
|
if (!(req->flags & REQ_F_BUFFER_SELECT))
|
|
return false;
|
|
return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING));
|
|
}
|
|
|
|
static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
|
|
{
|
|
if (req->flags & REQ_F_BUFFER_SELECTED)
|
|
return io_kbuf_recycle_legacy(req, issue_flags);
|
|
if (req->flags & REQ_F_BUFFER_RING)
|
|
return io_kbuf_recycle_ring(req);
|
|
return false;
|
|
}
|
|
|
|
static inline unsigned int __io_put_kbuf_list(struct io_kiocb *req,
|
|
struct list_head *list)
|
|
{
|
|
unsigned int ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
|
|
|
|
if (req->flags & REQ_F_BUFFER_RING) {
|
|
if (req->buf_list) {
|
|
req->buf_index = req->buf_list->bgid;
|
|
req->buf_list->head++;
|
|
}
|
|
req->flags &= ~REQ_F_BUFFER_RING;
|
|
} else {
|
|
req->buf_index = req->kbuf->bgid;
|
|
list_add(&req->kbuf->list, list);
|
|
req->flags &= ~REQ_F_BUFFER_SELECTED;
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
|
|
{
|
|
lockdep_assert_held(&req->ctx->completion_lock);
|
|
|
|
if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
|
|
return 0;
|
|
return __io_put_kbuf_list(req, &req->ctx->io_buffers_comp);
|
|
}
|
|
|
|
static inline unsigned int io_put_kbuf(struct io_kiocb *req,
|
|
unsigned issue_flags)
|
|
{
|
|
|
|
if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
|
|
return 0;
|
|
return __io_put_kbuf(req, issue_flags);
|
|
}
|
|
#endif
|