for-5.19/io_uring-passthrough-2022-05-22
Merge tag 'for-5.19/io_uring-passthrough-2022-05-22' of git://git.kernel.dk/linux-block

Pull io_uring NVMe command passthrough from Jens Axboe:
 "On top of everything else, this adds support for passthrough for
  io_uring. The initial feature for this is NVMe passthrough support,
  which allows non-filesystem based IO commands and admin commands.

  To support this, io_uring grows support for SQE and CQE members that
  are twice as big, allowing to pass in a full NVMe command without
  having to copy data around. And to complete with more than just a
  single 32-bit value as the output"

* tag 'for-5.19/io_uring-passthrough-2022-05-22' of git://git.kernel.dk/linux-block: (22 commits)
  io_uring: cleanup handling of the two task_work lists
  nvme: enable uring-passthrough for admin commands
  nvme: helper for uring-passthrough checks
  blk-mq: fix passthrough plugging
  nvme: add vectored-io support for uring-cmd
  nvme: wire-up uring-cmd support for io-passthru on char-device.
  nvme: refactor nvme_submit_user_cmd()
  block: wire-up support for passthrough plugging
  fs,io_uring: add infrastructure for uring-cmd
  io_uring: support CQE32 for nop operation
  io_uring: enable CQE32
  io_uring: support CQE32 in /proc info
  io_uring: add tracing for additional CQE32 fields
  io_uring: overflow processing for CQE32
  io_uring: flush completions for CQE32
  io_uring: modify io_get_cqe for CQE32
  io_uring: add CQE32 completion processing
  io_uring: add CQE32 setup processing
  io_uring: change ring size calculation for CQE32
  io_uring: store add. return values for CQE32
  ...
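Before the diff, a minimal userspace sketch of how this interface is meant to be driven may help: a ring set up with the new IORING_SETUP_SQE128 and IORING_SETUP_CQE32 flags, an IORING_OP_URING_CMD SQE whose second half carries a struct nvme_uring_cmd, and a 32-byte CQE whose big_cqe[0] carries the 64-bit NVMe result. This sketch is not part of the commit: it assumes liburing 2.2+ for the ring helpers, a hypothetical /dev/ng0n1 namespace char device formatted with 4096-byte blocks, and it omits most error handling.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <liburing.h>
#include <linux/nvme_ioctl.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_params p = { 0 };
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct nvme_uring_cmd *cmd;
	void *buf;
	int fd;

	/* NVMe passthrough needs the double-sized SQEs and CQEs */
	p.flags = IORING_SETUP_SQE128 | IORING_SETUP_CQE32;
	if (io_uring_queue_init_params(8, &ring, &p) < 0)
		return 1;

	fd = open("/dev/ng0n1", O_RDONLY);	/* NVMe namespace char device */
	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return 1;

	sqe = io_uring_get_sqe(&ring);
	memset(sqe, 0, 2 * sizeof(*sqe));	/* 128-byte SQE, clear all of it */
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = fd;
	sqe->cmd_op = NVME_URING_CMD_IO;
	sqe->user_data = 0x1234;

	/* the NVMe command occupies the second half of the big SQE */
	cmd = (struct nvme_uring_cmd *)sqe->cmd;
	cmd->opcode = 0x02;			/* NVMe read */
	cmd->nsid = 1;
	cmd->addr = (uint64_t)(uintptr_t)buf;
	cmd->data_len = 4096;
	cmd->cdw10 = 0;				/* starting LBA, low 32 bits */
	cmd->cdw12 = 0;				/* number of blocks, 0's based */

	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);

	/* res carries the status, big_cqe[0] the 64-bit NVMe result */
	printf("status %d result %llu\n", cqe->res,
	       (unsigned long long)cqe->big_cqe[0]);
	io_uring_cqe_seen(&ring, cqe);
	return 0;
}

The kernel-side counterpart of each step is in the diff below: ->uring_cmd() on the nvme char devices builds the passthrough request, blk_execute_rq_nowait() submits it (now able to use the plug), and io_uring_cmd_done() relays status and result into the 32-byte CQE.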
commit 9836e93c0a

block/blk-mq.c | 109
@@ -1169,6 +1169,62 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error)
	complete(waiting);
}

/*
 * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
 * queues. This is important for md arrays to benefit from merging
 * requests.
 */
static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
{
	if (plug->multiple_queues)
		return BLK_MAX_REQUEST_COUNT * 2;
	return BLK_MAX_REQUEST_COUNT;
}

static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
{
	struct request *last = rq_list_peek(&plug->mq_list);

	if (!plug->rq_count) {
		trace_block_plug(rq->q);
	} else if (plug->rq_count >= blk_plug_max_rq_count(plug) ||
		   (!blk_queue_nomerges(rq->q) &&
		    blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
		blk_mq_flush_plug_list(plug, false);
		trace_block_plug(rq->q);
	}

	if (!plug->multiple_queues && last && last->q != rq->q)
		plug->multiple_queues = true;
	if (!plug->has_elevator && (rq->rq_flags & RQF_ELV))
		plug->has_elevator = true;
	rq->rq_next = NULL;
	rq_list_add(&plug->mq_list, rq);
	plug->rq_count++;
}

static void __blk_execute_rq_nowait(struct request *rq, bool at_head,
				    rq_end_io_fn *done, bool use_plug)
{
	WARN_ON(irqs_disabled());
	WARN_ON(!blk_rq_is_passthrough(rq));

	rq->end_io = done;

	blk_account_io_start(rq);

	if (use_plug && current->plug) {
		blk_add_rq_to_plug(current->plug, rq);
		return;
	}
	/*
	 * don't check dying flag for MQ because the request won't
	 * be reused after dying flag is set
	 */
	blk_mq_sched_insert_request(rq, at_head, true, false);
}


/**
 * blk_execute_rq_nowait - insert a request to I/O scheduler for execution
 * @rq: request to insert
@@ -1184,18 +1240,8 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error)
 */
void blk_execute_rq_nowait(struct request *rq, bool at_head, rq_end_io_fn *done)
{
	WARN_ON(irqs_disabled());
	WARN_ON(!blk_rq_is_passthrough(rq));
	__blk_execute_rq_nowait(rq, at_head, done, true);

	rq->end_io = done;

	blk_account_io_start(rq);

	/*
	 * don't check dying flag for MQ because the request won't
	 * be reused after dying flag is set
	 */
	blk_mq_sched_insert_request(rq, at_head, true, false);
}
EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);

@@ -1233,8 +1279,13 @@ blk_status_t blk_execute_rq(struct request *rq, bool at_head)
	DECLARE_COMPLETION_ONSTACK(wait);
	unsigned long hang_check;

	/*
	 * iopoll requires request to be submitted to driver, so can't
	 * use plug
	 */
	rq->end_io_data = &wait;
	blk_execute_rq_nowait(rq, at_head, blk_end_sync_rq);
	__blk_execute_rq_nowait(rq, at_head, blk_end_sync_rq,
			!blk_rq_is_poll(rq));

	/* Prevent hang_check timer from firing at us during very long I/O */
	hang_check = sysctl_hung_task_timeout_secs;
@ -2676,40 +2727,6 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
|
||||
hctx->queue->mq_ops->commit_rqs(hctx);
|
||||
}
|
||||
|
||||
/*
|
||||
* Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
|
||||
* queues. This is important for md arrays to benefit from merging
|
||||
* requests.
|
||||
*/
|
||||
static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
|
||||
{
|
||||
if (plug->multiple_queues)
|
||||
return BLK_MAX_REQUEST_COUNT * 2;
|
||||
return BLK_MAX_REQUEST_COUNT;
|
||||
}
|
||||
|
||||
static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
|
||||
{
|
||||
struct request *last = rq_list_peek(&plug->mq_list);
|
||||
|
||||
if (!plug->rq_count) {
|
||||
trace_block_plug(rq->q);
|
||||
} else if (plug->rq_count >= blk_plug_max_rq_count(plug) ||
|
||||
(!blk_queue_nomerges(rq->q) &&
|
||||
blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
|
||||
blk_mq_flush_plug_list(plug, false);
|
||||
trace_block_plug(rq->q);
|
||||
}
|
||||
|
||||
if (!plug->multiple_queues && last && last->q != rq->q)
|
||||
plug->multiple_queues = true;
|
||||
if (!plug->has_elevator && (rq->rq_flags & RQF_ELV))
|
||||
plug->has_elevator = true;
|
||||
rq->rq_next = NULL;
|
||||
rq_list_add(&plug->mq_list, rq);
|
||||
plug->rq_count++;
|
||||
}
|
||||
|
||||
static bool blk_mq_attempt_bio_merge(struct request_queue *q,
|
||||
struct bio *bio, unsigned int nr_segs)
|
||||
{
|
||||
|
@@ -3146,6 +3146,7 @@ static const struct file_operations nvme_dev_fops = {
	.release	= nvme_dev_release,
	.unlocked_ioctl	= nvme_dev_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.uring_cmd	= nvme_dev_uring_cmd,
};

static ssize_t nvme_sysfs_reset(struct device *dev,
@@ -3699,6 +3700,7 @@ static const struct file_operations nvme_ns_chr_fops = {
	.release	= nvme_ns_chr_release,
	.unlocked_ioctl	= nvme_ns_chr_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.uring_cmd	= nvme_ns_chr_uring_cmd,
};

static int nvme_add_ns_cdev(struct nvme_ns *ns)
@ -5,6 +5,7 @@
|
||||
*/
|
||||
#include <linux/ptrace.h> /* for force_successful_syscall_return */
|
||||
#include <linux/nvme_ioctl.h>
|
||||
#include <linux/io_uring.h>
|
||||
#include "nvme.h"
|
||||
|
||||
/*
|
||||
@ -53,10 +54,21 @@ out:
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
static int nvme_submit_user_cmd(struct request_queue *q,
|
||||
static int nvme_finish_user_metadata(struct request *req, void __user *ubuf,
|
||||
void *meta, unsigned len, int ret)
|
||||
{
|
||||
if (!ret && req_op(req) == REQ_OP_DRV_IN &&
|
||||
copy_to_user(ubuf, meta, len))
|
||||
ret = -EFAULT;
|
||||
kfree(meta);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct request *nvme_alloc_user_request(struct request_queue *q,
|
||||
struct nvme_command *cmd, void __user *ubuffer,
|
||||
unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
|
||||
u32 meta_seed, u64 *result, unsigned timeout, bool vec)
|
||||
u32 meta_seed, void **metap, unsigned timeout, bool vec,
|
||||
unsigned int rq_flags, blk_mq_req_flags_t blk_flags)
|
||||
{
|
||||
bool write = nvme_is_write(cmd);
|
||||
struct nvme_ns *ns = q->queuedata;
|
||||
@ -66,9 +78,9 @@ static int nvme_submit_user_cmd(struct request_queue *q,
|
||||
void *meta = NULL;
|
||||
int ret;
|
||||
|
||||
req = blk_mq_alloc_request(q, nvme_req_op(cmd), 0);
|
||||
req = blk_mq_alloc_request(q, nvme_req_op(cmd) | rq_flags, blk_flags);
|
||||
if (IS_ERR(req))
|
||||
return PTR_ERR(req);
|
||||
return req;
|
||||
nvme_init_request(req, cmd);
|
||||
|
||||
if (timeout)
|
||||
@ -105,26 +117,50 @@ static int nvme_submit_user_cmd(struct request_queue *q,
|
||||
goto out_unmap;
|
||||
}
|
||||
req->cmd_flags |= REQ_INTEGRITY;
|
||||
*metap = meta;
|
||||
}
|
||||
}
|
||||
|
||||
ret = nvme_execute_passthru_rq(req);
|
||||
if (result)
|
||||
*result = le64_to_cpu(nvme_req(req)->result.u64);
|
||||
if (meta && !ret && !write) {
|
||||
if (copy_to_user(meta_buffer, meta, meta_len))
|
||||
ret = -EFAULT;
|
||||
}
|
||||
kfree(meta);
|
||||
out_unmap:
|
||||
return req;
|
||||
|
||||
out_unmap:
|
||||
if (bio)
|
||||
blk_rq_unmap_user(bio);
|
||||
out:
|
||||
blk_mq_free_request(req);
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
static int nvme_submit_user_cmd(struct request_queue *q,
|
||||
struct nvme_command *cmd, void __user *ubuffer,
|
||||
unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
|
||||
u32 meta_seed, u64 *result, unsigned timeout, bool vec)
|
||||
{
|
||||
struct request *req;
|
||||
void *meta = NULL;
|
||||
struct bio *bio;
|
||||
int ret;
|
||||
|
||||
req = nvme_alloc_user_request(q, cmd, ubuffer, bufflen, meta_buffer,
|
||||
meta_len, meta_seed, &meta, timeout, vec, 0, 0);
|
||||
if (IS_ERR(req))
|
||||
return PTR_ERR(req);
|
||||
|
||||
bio = req->bio;
|
||||
|
||||
ret = nvme_execute_passthru_rq(req);
|
||||
|
||||
if (result)
|
||||
*result = le64_to_cpu(nvme_req(req)->result.u64);
|
||||
if (meta)
|
||||
ret = nvme_finish_user_metadata(req, meta_buffer, meta,
|
||||
meta_len, ret);
|
||||
if (bio)
|
||||
blk_rq_unmap_user(bio);
|
||||
out:
|
||||
blk_mq_free_request(req);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
|
||||
{
|
||||
struct nvme_user_io io;
|
||||
@ -296,6 +332,139 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
|
||||
return status;
|
||||
}
|
||||
|
||||
struct nvme_uring_data {
|
||||
__u64 metadata;
|
||||
__u64 addr;
|
||||
__u32 data_len;
|
||||
__u32 metadata_len;
|
||||
__u32 timeout_ms;
|
||||
};
|
||||
|
||||
/*
|
||||
* This overlays struct io_uring_cmd pdu.
|
||||
* Expect build errors if this grows larger than that.
|
||||
*/
|
||||
struct nvme_uring_cmd_pdu {
|
||||
union {
|
||||
struct bio *bio;
|
||||
struct request *req;
|
||||
};
|
||||
void *meta; /* kernel-resident buffer */
|
||||
void __user *meta_buffer;
|
||||
u32 meta_len;
|
||||
};
|
||||
|
||||
static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
|
||||
struct io_uring_cmd *ioucmd)
|
||||
{
|
||||
return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
|
||||
}
|
||||
|
||||
static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd)
|
||||
{
|
||||
struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
|
||||
struct request *req = pdu->req;
|
||||
struct bio *bio = req->bio;
|
||||
int status;
|
||||
u64 result;
|
||||
|
||||
if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
|
||||
status = -EINTR;
|
||||
else
|
||||
status = nvme_req(req)->status;
|
||||
|
||||
result = le64_to_cpu(nvme_req(req)->result.u64);
|
||||
|
||||
if (pdu->meta)
|
||||
status = nvme_finish_user_metadata(req, pdu->meta_buffer,
|
||||
pdu->meta, pdu->meta_len, status);
|
||||
if (bio)
|
||||
blk_rq_unmap_user(bio);
|
||||
blk_mq_free_request(req);
|
||||
|
||||
io_uring_cmd_done(ioucmd, status, result);
|
||||
}
|
||||
|
||||
static void nvme_uring_cmd_end_io(struct request *req, blk_status_t err)
|
||||
{
|
||||
struct io_uring_cmd *ioucmd = req->end_io_data;
|
||||
struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
|
||||
/* extract bio before reusing the same field for request */
|
||||
struct bio *bio = pdu->bio;
|
||||
|
||||
pdu->req = req;
|
||||
req->bio = bio;
|
||||
/* this takes care of moving rest of completion-work to task context */
|
||||
io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb);
|
||||
}
|
||||
|
||||
static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
|
||||
struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec)
|
||||
{
|
||||
struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
|
||||
const struct nvme_uring_cmd *cmd = ioucmd->cmd;
|
||||
struct request_queue *q = ns ? ns->queue : ctrl->admin_q;
|
||||
struct nvme_uring_data d;
|
||||
struct nvme_command c;
|
||||
struct request *req;
|
||||
unsigned int rq_flags = 0;
|
||||
blk_mq_req_flags_t blk_flags = 0;
|
||||
void *meta = NULL;
|
||||
|
||||
if (!capable(CAP_SYS_ADMIN))
|
||||
return -EACCES;
|
||||
|
||||
c.common.opcode = READ_ONCE(cmd->opcode);
|
||||
c.common.flags = READ_ONCE(cmd->flags);
|
||||
if (c.common.flags)
|
||||
return -EINVAL;
|
||||
|
||||
c.common.command_id = 0;
|
||||
c.common.nsid = cpu_to_le32(cmd->nsid);
|
||||
if (!nvme_validate_passthru_nsid(ctrl, ns, le32_to_cpu(c.common.nsid)))
|
||||
return -EINVAL;
|
||||
|
||||
c.common.cdw2[0] = cpu_to_le32(READ_ONCE(cmd->cdw2));
|
||||
c.common.cdw2[1] = cpu_to_le32(READ_ONCE(cmd->cdw3));
|
||||
c.common.metadata = 0;
|
||||
c.common.dptr.prp1 = c.common.dptr.prp2 = 0;
|
||||
c.common.cdw10 = cpu_to_le32(READ_ONCE(cmd->cdw10));
|
||||
c.common.cdw11 = cpu_to_le32(READ_ONCE(cmd->cdw11));
|
||||
c.common.cdw12 = cpu_to_le32(READ_ONCE(cmd->cdw12));
|
||||
c.common.cdw13 = cpu_to_le32(READ_ONCE(cmd->cdw13));
|
||||
c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14));
|
||||
c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15));
|
||||
|
||||
d.metadata = READ_ONCE(cmd->metadata);
|
||||
d.addr = READ_ONCE(cmd->addr);
|
||||
d.data_len = READ_ONCE(cmd->data_len);
|
||||
d.metadata_len = READ_ONCE(cmd->metadata_len);
|
||||
d.timeout_ms = READ_ONCE(cmd->timeout_ms);
|
||||
|
||||
if (issue_flags & IO_URING_F_NONBLOCK) {
|
||||
rq_flags = REQ_NOWAIT;
|
||||
blk_flags = BLK_MQ_REQ_NOWAIT;
|
||||
}
|
||||
|
||||
req = nvme_alloc_user_request(q, &c, nvme_to_user_ptr(d.addr),
|
||||
d.data_len, nvme_to_user_ptr(d.metadata),
|
||||
d.metadata_len, 0, &meta, d.timeout_ms ?
|
||||
msecs_to_jiffies(d.timeout_ms) : 0, vec, rq_flags,
|
||||
blk_flags);
|
||||
if (IS_ERR(req))
|
||||
return PTR_ERR(req);
|
||||
req->end_io_data = ioucmd;
|
||||
|
||||
/* to free bio on completion, as req->bio will be null at that time */
|
||||
pdu->bio = req->bio;
|
||||
pdu->meta = meta;
|
||||
pdu->meta_buffer = nvme_to_user_ptr(d.metadata);
|
||||
pdu->meta_len = d.metadata_len;
|
||||
|
||||
blk_execute_rq_nowait(req, 0, nvme_uring_cmd_end_io);
|
||||
return -EIOCBQUEUED;
|
||||
}
|
||||
|
||||
static bool is_ctrl_ioctl(unsigned int cmd)
|
||||
{
|
||||
if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD)
|
||||
@ -387,6 +556,53 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
|
||||
return __nvme_ioctl(ns, cmd, (void __user *)arg);
|
||||
}
|
||||
|
||||
static int nvme_uring_cmd_checks(unsigned int issue_flags)
|
||||
{
|
||||
/* IOPOLL not supported yet */
|
||||
if (issue_flags & IO_URING_F_IOPOLL)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
/* NVMe passthrough requires big SQE/CQE support */
|
||||
if ((issue_flags & (IO_URING_F_SQE128|IO_URING_F_CQE32)) !=
|
||||
(IO_URING_F_SQE128|IO_URING_F_CQE32))
|
||||
return -EOPNOTSUPP;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd,
|
||||
unsigned int issue_flags)
|
||||
{
|
||||
struct nvme_ctrl *ctrl = ns->ctrl;
|
||||
int ret;
|
||||
|
||||
BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu));
|
||||
|
||||
ret = nvme_uring_cmd_checks(issue_flags);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
switch (ioucmd->cmd_op) {
|
||||
case NVME_URING_CMD_IO:
|
||||
ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, false);
|
||||
break;
|
||||
case NVME_URING_CMD_IO_VEC:
|
||||
ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, true);
|
||||
break;
|
||||
default:
|
||||
ret = -ENOTTY;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
|
||||
{
|
||||
struct nvme_ns *ns = container_of(file_inode(ioucmd->file)->i_cdev,
|
||||
struct nvme_ns, cdev);
|
||||
|
||||
return nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NVME_MULTIPATH
|
||||
static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
|
||||
void __user *argp, struct nvme_ns_head *head, int srcu_idx)
|
||||
@ -453,8 +669,46 @@ out_unlock:
|
||||
srcu_read_unlock(&head->srcu, srcu_idx);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
|
||||
unsigned int issue_flags)
|
||||
{
|
||||
struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
|
||||
struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev);
|
||||
int srcu_idx = srcu_read_lock(&head->srcu);
|
||||
struct nvme_ns *ns = nvme_find_path(head);
|
||||
int ret = -EINVAL;
|
||||
|
||||
if (ns)
|
||||
ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
|
||||
srcu_read_unlock(&head->srcu, srcu_idx);
|
||||
return ret;
|
||||
}
|
||||
#endif /* CONFIG_NVME_MULTIPATH */
|
||||
|
||||
int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
|
||||
{
|
||||
struct nvme_ctrl *ctrl = ioucmd->file->private_data;
|
||||
int ret;
|
||||
|
||||
ret = nvme_uring_cmd_checks(issue_flags);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
switch (ioucmd->cmd_op) {
|
||||
case NVME_URING_CMD_ADMIN:
|
||||
ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, false);
|
||||
break;
|
||||
case NVME_URING_CMD_ADMIN_VEC:
|
||||
ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, true);
|
||||
break;
|
||||
default:
|
||||
ret = -ENOTTY;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
|
||||
{
|
||||
struct nvme_ns *ns;
|
||||
|
@@ -437,6 +437,7 @@ static const struct file_operations nvme_ns_head_chr_fops = {
	.release	= nvme_ns_head_chr_release,
	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.uring_cmd	= nvme_ns_head_chr_uring_cmd,
};

static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
@@ -782,7 +782,12 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
		unsigned long arg);
long nvme_dev_ioctl(struct file *file, unsigned int cmd,
		unsigned long arg);
int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd,
		unsigned int issue_flags);
int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
		unsigned int issue_flags);
int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags);

extern const struct attribute_group *nvme_ns_id_attr_groups[];
extern const struct pr_ops nvme_pr_ops;
fs/io_uring.c | 444
@ -204,13 +204,6 @@ struct io_rings {
|
||||
struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
|
||||
};
|
||||
|
||||
enum io_uring_cmd_flags {
|
||||
IO_URING_F_COMPLETE_DEFER = 1,
|
||||
IO_URING_F_UNLOCKED = 2,
|
||||
/* int's last bit, sign checks are usually faster than a bit test */
|
||||
IO_URING_F_NONBLOCK = INT_MIN,
|
||||
};
|
||||
|
||||
struct io_mapped_ubuf {
|
||||
u64 ubuf;
|
||||
u64 ubuf_end;
|
||||
@ -222,8 +215,8 @@ struct io_mapped_ubuf {
|
||||
struct io_ring_ctx;
|
||||
|
||||
struct io_overflow_cqe {
|
||||
struct io_uring_cqe cqe;
|
||||
struct list_head list;
|
||||
struct io_uring_cqe cqe;
|
||||
};
|
||||
|
||||
/*
|
||||
@ -551,7 +544,7 @@ struct io_uring_task {
|
||||
|
||||
spinlock_t task_lock;
|
||||
struct io_wq_work_list task_list;
|
||||
struct io_wq_work_list prior_task_list;
|
||||
struct io_wq_work_list prio_task_list;
|
||||
struct callback_head task_work;
|
||||
struct file **registered_rings;
|
||||
bool task_running;
|
||||
@ -788,6 +781,12 @@ struct io_msg {
|
||||
u32 len;
|
||||
};
|
||||
|
||||
struct io_nop {
|
||||
struct file *file;
|
||||
u64 extra1;
|
||||
u64 extra2;
|
||||
};
|
||||
|
||||
struct io_async_connect {
|
||||
struct sockaddr_storage address;
|
||||
};
|
||||
@ -992,6 +991,8 @@ struct io_kiocb {
|
||||
struct io_msg msg;
|
||||
struct io_xattr xattr;
|
||||
struct io_socket sock;
|
||||
struct io_nop nop;
|
||||
struct io_uring_cmd uring_cmd;
|
||||
};
|
||||
|
||||
u8 opcode;
|
||||
@ -1036,7 +1037,13 @@ struct io_kiocb {
|
||||
atomic_t poll_refs;
|
||||
struct io_task_work io_task_work;
|
||||
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
|
||||
struct hlist_node hash_node;
|
||||
union {
|
||||
struct hlist_node hash_node;
|
||||
struct {
|
||||
u64 extra1;
|
||||
u64 extra2;
|
||||
};
|
||||
};
|
||||
/* internal polling, see IORING_FEAT_FAST_POLL */
|
||||
struct async_poll *apoll;
|
||||
/* opcode allocated if it needs to store data for async defer */
|
||||
@@ -1070,6 +1077,14 @@ struct io_cancel_data {
	int seq;
};

/*
 * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into
 * the following sqe if SQE128 is used.
 */
#define uring_cmd_pdu_size(is_sqe128)				\
	((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) -	\
		offsetof(struct io_uring_sqe, cmd))

struct io_op_def {
	/* needs req->file assigned */
	unsigned needs_file : 1;
@@ -1311,6 +1326,12 @@ static const struct io_op_def io_op_defs[] = {
	[IORING_OP_SOCKET] = {
		.audit_skip = 1,
	},
	[IORING_OP_URING_CMD] = {
		.needs_file = 1,
		.plug = 1,
		.needs_async_setup = 1,
		.async_size = uring_cmd_pdu_size(1),
	},
};

/* requests with any of those set should undergo io_disarm_next() */
|
||||
@ -1450,6 +1471,8 @@ const char *io_uring_get_opcode(u8 opcode)
|
||||
return "GETXATTR";
|
||||
case IORING_OP_SOCKET:
|
||||
return "SOCKET";
|
||||
case IORING_OP_URING_CMD:
|
||||
return "URING_CMD";
|
||||
case IORING_OP_LAST:
|
||||
return "INVALID";
|
||||
}
|
||||
@ -2119,8 +2142,12 @@ static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_rings *rings = ctx->rings;
|
||||
unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
|
||||
unsigned int shift = 0;
|
||||
unsigned int free, queued, len;
|
||||
|
||||
if (ctx->flags & IORING_SETUP_CQE32)
|
||||
shift = 1;
|
||||
|
||||
/* userspace may cheat modifying the tail, be safe and do min */
|
||||
queued = min(__io_cqring_events(ctx), ctx->cq_entries);
|
||||
free = ctx->cq_entries - queued;
|
||||
@ -2132,15 +2159,26 @@ static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
|
||||
ctx->cached_cq_tail++;
|
||||
ctx->cqe_cached = &rings->cqes[off];
|
||||
ctx->cqe_sentinel = ctx->cqe_cached + len;
|
||||
return ctx->cqe_cached++;
|
||||
ctx->cqe_cached++;
|
||||
return &rings->cqes[off << shift];
|
||||
}
|
||||
|
||||
static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
|
||||
{
|
||||
if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) {
|
||||
struct io_uring_cqe *cqe = ctx->cqe_cached;
|
||||
|
||||
if (ctx->flags & IORING_SETUP_CQE32) {
|
||||
unsigned int off = ctx->cqe_cached - ctx->rings->cqes;
|
||||
|
||||
cqe += off;
|
||||
}
|
||||
|
||||
ctx->cached_cq_tail++;
|
||||
return ctx->cqe_cached++;
|
||||
ctx->cqe_cached++;
|
||||
return cqe;
|
||||
}
|
||||
|
||||
return __io_get_cqe(ctx);
|
||||
}
|
||||
|
||||
@ -2212,10 +2250,14 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
|
||||
static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
|
||||
{
|
||||
bool all_flushed, posted;
|
||||
size_t cqe_size = sizeof(struct io_uring_cqe);
|
||||
|
||||
if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
|
||||
return false;
|
||||
|
||||
if (ctx->flags & IORING_SETUP_CQE32)
|
||||
cqe_size <<= 1;
|
||||
|
||||
posted = false;
|
||||
spin_lock(&ctx->completion_lock);
|
||||
while (!list_empty(&ctx->cq_overflow_list)) {
|
||||
@ -2227,7 +2269,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
|
||||
ocqe = list_first_entry(&ctx->cq_overflow_list,
|
||||
struct io_overflow_cqe, list);
|
||||
if (cqe)
|
||||
memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
|
||||
memcpy(cqe, &ocqe->cqe, cqe_size);
|
||||
else
|
||||
io_account_cq_overflow(ctx);
|
||||
|
||||
@ -2315,11 +2357,17 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
|
||||
}
|
||||
|
||||
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
|
||||
s32 res, u32 cflags)
|
||||
s32 res, u32 cflags, u64 extra1,
|
||||
u64 extra2)
|
||||
{
|
||||
struct io_overflow_cqe *ocqe;
|
||||
size_t ocq_size = sizeof(struct io_overflow_cqe);
|
||||
bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
|
||||
|
||||
ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
|
||||
if (is_cqe32)
|
||||
ocq_size += sizeof(struct io_uring_cqe);
|
||||
|
||||
ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
|
||||
trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
|
||||
if (!ocqe) {
|
||||
/*
|
||||
@ -2339,6 +2387,10 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
|
||||
ocqe->cqe.user_data = user_data;
|
||||
ocqe->cqe.res = res;
|
||||
ocqe->cqe.flags = cflags;
|
||||
if (is_cqe32) {
|
||||
ocqe->cqe.big_cqe[0] = extra1;
|
||||
ocqe->cqe.big_cqe[1] = extra2;
|
||||
}
|
||||
list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
|
||||
return true;
|
||||
}
|
||||
@ -2360,7 +2412,7 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
|
||||
WRITE_ONCE(cqe->flags, cflags);
|
||||
return true;
|
||||
}
|
||||
return io_cqring_event_overflow(ctx, user_data, res, cflags);
|
||||
return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
|
||||
}
|
||||
|
||||
static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx,
|
||||
@ -2369,7 +2421,7 @@ static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx,
|
||||
struct io_uring_cqe *cqe;
|
||||
|
||||
trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
|
||||
req->cqe.res, req->cqe.flags);
|
||||
req->cqe.res, req->cqe.flags, 0, 0);
|
||||
|
||||
/*
|
||||
* If we can't get a cq entry, userspace overflowed the
|
||||
@ -2382,35 +2434,91 @@ static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx,
|
||||
return true;
|
||||
}
|
||||
return io_cqring_event_overflow(ctx, req->cqe.user_data,
|
||||
req->cqe.res, req->cqe.flags);
|
||||
req->cqe.res, req->cqe.flags, 0, 0);
|
||||
}
|
||||
|
||||
static inline bool __io_fill_cqe32_req_filled(struct io_ring_ctx *ctx,
|
||||
struct io_kiocb *req)
|
||||
{
|
||||
struct io_uring_cqe *cqe;
|
||||
u64 extra1 = req->extra1;
|
||||
u64 extra2 = req->extra2;
|
||||
|
||||
trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
|
||||
req->cqe.res, req->cqe.flags, extra1, extra2);
|
||||
|
||||
/*
|
||||
* If we can't get a cq entry, userspace overflowed the
|
||||
* submission (by quite a lot). Increment the overflow count in
|
||||
* the ring.
|
||||
*/
|
||||
cqe = io_get_cqe(ctx);
|
||||
if (likely(cqe)) {
|
||||
memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe));
|
||||
cqe->big_cqe[0] = extra1;
|
||||
cqe->big_cqe[1] = extra2;
|
||||
return true;
|
||||
}
|
||||
|
||||
return io_cqring_event_overflow(ctx, req->cqe.user_data, req->cqe.res,
|
||||
req->cqe.flags, extra1, extra2);
|
||||
}
|
||||
|
||||
static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
|
||||
{
|
||||
trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags);
|
||||
trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags, 0, 0);
|
||||
return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags);
|
||||
}
|
||||
|
||||
static inline void __io_fill_cqe32_req(struct io_kiocb *req, s32 res, u32 cflags,
|
||||
u64 extra1, u64 extra2)
|
||||
{
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
struct io_uring_cqe *cqe;
|
||||
|
||||
if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32)))
|
||||
return;
|
||||
if (req->flags & REQ_F_CQE_SKIP)
|
||||
return;
|
||||
|
||||
trace_io_uring_complete(ctx, req, req->cqe.user_data, res, cflags,
|
||||
extra1, extra2);
|
||||
|
||||
/*
|
||||
* If we can't get a cq entry, userspace overflowed the
|
||||
* submission (by quite a lot). Increment the overflow count in
|
||||
* the ring.
|
||||
*/
|
||||
cqe = io_get_cqe(ctx);
|
||||
if (likely(cqe)) {
|
||||
WRITE_ONCE(cqe->user_data, req->cqe.user_data);
|
||||
WRITE_ONCE(cqe->res, res);
|
||||
WRITE_ONCE(cqe->flags, cflags);
|
||||
WRITE_ONCE(cqe->big_cqe[0], extra1);
|
||||
WRITE_ONCE(cqe->big_cqe[1], extra2);
|
||||
return;
|
||||
}
|
||||
|
||||
io_cqring_event_overflow(ctx, req->cqe.user_data, res, cflags, extra1, extra2);
|
||||
}
|
||||
|
||||
static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
|
||||
s32 res, u32 cflags)
|
||||
{
|
||||
ctx->cq_extra++;
|
||||
trace_io_uring_complete(ctx, NULL, user_data, res, cflags);
|
||||
trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
|
||||
return __io_fill_cqe(ctx, user_data, res, cflags);
|
||||
}
|
||||
|
||||
static void __io_req_complete_post(struct io_kiocb *req, s32 res,
|
||||
u32 cflags)
|
||||
static void __io_req_complete_put(struct io_kiocb *req)
|
||||
{
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
|
||||
if (!(req->flags & REQ_F_CQE_SKIP))
|
||||
__io_fill_cqe_req(req, res, cflags);
|
||||
/*
|
||||
* If we're the last reference to this request, add to our locked
|
||||
* free_list cache.
|
||||
*/
|
||||
if (req_ref_put_and_test(req)) {
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
|
||||
if (req->flags & IO_REQ_LINK_FLAGS) {
|
||||
if (req->flags & IO_DISARM_MASK)
|
||||
io_disarm_next(req);
|
||||
@ -2433,8 +2541,23 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res,
|
||||
}
|
||||
}
|
||||
|
||||
static void io_req_complete_post(struct io_kiocb *req, s32 res,
|
||||
u32 cflags)
|
||||
static void __io_req_complete_post(struct io_kiocb *req, s32 res,
|
||||
u32 cflags)
|
||||
{
|
||||
if (!(req->flags & REQ_F_CQE_SKIP))
|
||||
__io_fill_cqe_req(req, res, cflags);
|
||||
__io_req_complete_put(req);
|
||||
}
|
||||
|
||||
static void __io_req_complete_post32(struct io_kiocb *req, s32 res,
|
||||
u32 cflags, u64 extra1, u64 extra2)
|
||||
{
|
||||
if (!(req->flags & REQ_F_CQE_SKIP))
|
||||
__io_fill_cqe32_req(req, res, cflags, extra1, extra2);
|
||||
__io_req_complete_put(req);
|
||||
}
|
||||
|
||||
static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags)
|
||||
{
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
|
||||
@ -2445,6 +2568,18 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res,
|
||||
io_cqring_ev_posted(ctx);
|
||||
}
|
||||
|
||||
static void io_req_complete_post32(struct io_kiocb *req, s32 res,
|
||||
u32 cflags, u64 extra1, u64 extra2)
|
||||
{
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
|
||||
spin_lock(&ctx->completion_lock);
|
||||
__io_req_complete_post32(req, res, cflags, extra1, extra2);
|
||||
io_commit_cqring(ctx);
|
||||
spin_unlock(&ctx->completion_lock);
|
||||
io_cqring_ev_posted(ctx);
|
||||
}
|
||||
|
||||
static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
|
||||
u32 cflags)
|
||||
{
|
||||
@ -2462,6 +2597,19 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
|
||||
io_req_complete_post(req, res, cflags);
|
||||
}
|
||||
|
||||
static inline void __io_req_complete32(struct io_kiocb *req,
|
||||
unsigned int issue_flags, s32 res,
|
||||
u32 cflags, u64 extra1, u64 extra2)
|
||||
{
|
||||
if (issue_flags & IO_URING_F_COMPLETE_DEFER) {
|
||||
io_req_complete_state(req, res, cflags);
|
||||
req->extra1 = extra1;
|
||||
req->extra2 = extra2;
|
||||
} else {
|
||||
io_req_complete_post32(req, res, cflags, extra1, extra2);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void io_req_complete(struct io_kiocb *req, s32 res)
|
||||
{
|
||||
if (res < 0)
|
||||
@ -2803,10 +2951,10 @@ static void tctx_task_work(struct callback_head *cb)
|
||||
struct io_wq_work_node *node1, *node2;
|
||||
|
||||
spin_lock_irq(&tctx->task_lock);
|
||||
node1 = tctx->prior_task_list.first;
|
||||
node1 = tctx->prio_task_list.first;
|
||||
node2 = tctx->task_list.first;
|
||||
INIT_WQ_LIST(&tctx->task_list);
|
||||
INIT_WQ_LIST(&tctx->prior_task_list);
|
||||
INIT_WQ_LIST(&tctx->prio_task_list);
|
||||
if (!node2 && !node1)
|
||||
tctx->task_running = false;
|
||||
spin_unlock_irq(&tctx->task_lock);
|
||||
@ -2820,7 +2968,7 @@ static void tctx_task_work(struct callback_head *cb)
|
||||
cond_resched();
|
||||
|
||||
if (data_race(!tctx->task_list.first) &&
|
||||
data_race(!tctx->prior_task_list.first) && uring_locked)
|
||||
data_race(!tctx->prio_task_list.first) && uring_locked)
|
||||
io_submit_flush_completions(ctx);
|
||||
}
|
||||
|
||||
@ -2831,24 +2979,19 @@ static void tctx_task_work(struct callback_head *cb)
|
||||
io_uring_drop_tctx_refs(current);
|
||||
}
|
||||
|
||||
static void io_req_task_work_add(struct io_kiocb *req, bool priority)
|
||||
static void __io_req_task_work_add(struct io_kiocb *req,
|
||||
struct io_uring_task *tctx,
|
||||
struct io_wq_work_list *list)
|
||||
{
|
||||
struct task_struct *tsk = req->task;
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
struct io_uring_task *tctx = tsk->io_uring;
|
||||
struct io_wq_work_node *node;
|
||||
unsigned long flags;
|
||||
bool running;
|
||||
|
||||
WARN_ON_ONCE(!tctx);
|
||||
|
||||
io_drop_inflight_file(req);
|
||||
|
||||
spin_lock_irqsave(&tctx->task_lock, flags);
|
||||
if (priority)
|
||||
wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list);
|
||||
else
|
||||
wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
|
||||
wq_list_add_tail(&req->io_task_work.node, list);
|
||||
running = tctx->task_running;
|
||||
if (!running)
|
||||
tctx->task_running = true;
|
||||
@ -2861,12 +3004,12 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority)
|
||||
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
|
||||
atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
|
||||
|
||||
if (likely(!task_work_add(tsk, &tctx->task_work, ctx->notify_method)))
|
||||
if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
|
||||
return;
|
||||
|
||||
spin_lock_irqsave(&tctx->task_lock, flags);
|
||||
tctx->task_running = false;
|
||||
node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list);
|
||||
node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list);
|
||||
spin_unlock_irqrestore(&tctx->task_lock, flags);
|
||||
|
||||
while (node) {
|
||||
@ -2878,6 +3021,23 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority)
|
||||
}
|
||||
}
|
||||
|
||||
static void io_req_task_work_add(struct io_kiocb *req)
|
||||
{
|
||||
struct io_uring_task *tctx = req->task->io_uring;
|
||||
|
||||
__io_req_task_work_add(req, tctx, &tctx->task_list);
|
||||
}
|
||||
|
||||
static void io_req_task_prio_work_add(struct io_kiocb *req)
|
||||
{
|
||||
struct io_uring_task *tctx = req->task->io_uring;
|
||||
|
||||
if (req->ctx->flags & IORING_SETUP_SQPOLL)
|
||||
__io_req_task_work_add(req, tctx, &tctx->prio_task_list);
|
||||
else
|
||||
__io_req_task_work_add(req, tctx, &tctx->task_list);
|
||||
}
|
||||
|
||||
static void io_req_tw_post(struct io_kiocb *req, bool *locked)
|
||||
{
|
||||
io_req_complete_post(req, req->cqe.res, req->cqe.flags);
|
||||
@ -2888,7 +3048,7 @@ static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags)
|
||||
req->cqe.res = res;
|
||||
req->cqe.flags = cflags;
|
||||
req->io_task_work.func = io_req_tw_post;
|
||||
io_req_task_work_add(req, false);
|
||||
io_req_task_work_add(req);
|
||||
}
|
||||
|
||||
static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
|
||||
@ -2912,19 +3072,19 @@ static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
|
||||
{
|
||||
req->cqe.res = ret;
|
||||
req->io_task_work.func = io_req_task_cancel;
|
||||
io_req_task_work_add(req, false);
|
||||
io_req_task_work_add(req);
|
||||
}
|
||||
|
||||
static void io_req_task_queue(struct io_kiocb *req)
|
||||
{
|
||||
req->io_task_work.func = io_req_task_submit;
|
||||
io_req_task_work_add(req, false);
|
||||
io_req_task_work_add(req);
|
||||
}
|
||||
|
||||
static void io_req_task_queue_reissue(struct io_kiocb *req)
|
||||
{
|
||||
req->io_task_work.func = io_queue_iowq;
|
||||
io_req_task_work_add(req, false);
|
||||
io_req_task_work_add(req);
|
||||
}
|
||||
|
||||
static void io_queue_next(struct io_kiocb *req)
|
||||
@ -2998,8 +3158,12 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
|
||||
struct io_kiocb *req = container_of(node, struct io_kiocb,
|
||||
comp_list);
|
||||
|
||||
if (!(req->flags & REQ_F_CQE_SKIP))
|
||||
__io_fill_cqe_req_filled(ctx, req);
|
||||
if (!(req->flags & REQ_F_CQE_SKIP)) {
|
||||
if (!(ctx->flags & IORING_SETUP_CQE32))
|
||||
__io_fill_cqe_req_filled(ctx, req);
|
||||
else
|
||||
__io_fill_cqe32_req_filled(ctx, req);
|
||||
}
|
||||
}
|
||||
|
||||
io_commit_cqring(ctx);
|
||||
@ -3328,7 +3492,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
|
||||
return;
|
||||
req->cqe.res = res;
|
||||
req->io_task_work.func = io_req_task_complete;
|
||||
io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL));
|
||||
io_req_task_prio_work_add(req);
|
||||
}
|
||||
|
||||
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
|
||||
@ -4462,10 +4626,6 @@ static int __io_getxattr_prep(struct io_kiocb *req,
|
||||
const char __user *name;
|
||||
int ret;
|
||||
|
||||
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
|
||||
return -EINVAL;
|
||||
if (unlikely(sqe->ioprio))
|
||||
return -EINVAL;
|
||||
if (unlikely(req->flags & REQ_F_FIXED_FILE))
|
||||
return -EBADF;
|
||||
|
||||
@ -4575,10 +4735,6 @@ static int __io_setxattr_prep(struct io_kiocb *req,
|
||||
const char __user *name;
|
||||
int ret;
|
||||
|
||||
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
|
||||
return -EINVAL;
|
||||
if (unlikely(sqe->ioprio))
|
||||
return -EINVAL;
|
||||
if (unlikely(req->flags & REQ_F_FIXED_FILE))
|
||||
return -EBADF;
|
||||
|
||||
@ -4857,6 +5013,96 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void io_uring_cmd_work(struct io_kiocb *req, bool *locked)
|
||||
{
|
||||
req->uring_cmd.task_work_cb(&req->uring_cmd);
|
||||
}
|
||||
|
||||
void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
|
||||
void (*task_work_cb)(struct io_uring_cmd *))
|
||||
{
|
||||
struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
|
||||
|
||||
req->uring_cmd.task_work_cb = task_work_cb;
|
||||
req->io_task_work.func = io_uring_cmd_work;
|
||||
io_req_task_prio_work_add(req);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task);
|
||||
|
||||
/*
|
||||
* Called by consumers of io_uring_cmd, if they originally returned
|
||||
* -EIOCBQUEUED upon receiving the command.
|
||||
*/
|
||||
void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2)
|
||||
{
|
||||
struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
|
||||
|
||||
if (ret < 0)
|
||||
req_set_fail(req);
|
||||
if (req->ctx->flags & IORING_SETUP_CQE32)
|
||||
__io_req_complete32(req, 0, ret, 0, res2, 0);
|
||||
else
|
||||
io_req_complete(req, ret);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(io_uring_cmd_done);
|
||||
|
||||
static int io_uring_cmd_prep_async(struct io_kiocb *req)
|
||||
{
|
||||
size_t cmd_size;
|
||||
|
||||
cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128);
|
||||
|
||||
memcpy(req->async_data, req->uring_cmd.cmd, cmd_size);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int io_uring_cmd_prep(struct io_kiocb *req,
|
||||
const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_uring_cmd *ioucmd = &req->uring_cmd;
|
||||
|
||||
if (sqe->rw_flags)
|
||||
return -EINVAL;
|
||||
ioucmd->cmd = sqe->cmd;
|
||||
ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_uring_cmd *ioucmd = &req->uring_cmd;
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
struct file *file = req->file;
|
||||
int ret;
|
||||
|
||||
if (!req->file->f_op->uring_cmd)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
if (ctx->flags & IORING_SETUP_SQE128)
|
||||
issue_flags |= IO_URING_F_SQE128;
|
||||
if (ctx->flags & IORING_SETUP_CQE32)
|
||||
issue_flags |= IO_URING_F_CQE32;
|
||||
if (ctx->flags & IORING_SETUP_IOPOLL)
|
||||
issue_flags |= IO_URING_F_IOPOLL;
|
||||
|
||||
if (req_has_async_data(req))
|
||||
ioucmd->cmd = req->async_data;
|
||||
|
||||
ret = file->f_op->uring_cmd(ioucmd, issue_flags);
|
||||
if (ret == -EAGAIN) {
|
||||
if (!req_has_async_data(req)) {
|
||||
if (io_alloc_async_data(req))
|
||||
return -ENOMEM;
|
||||
io_uring_cmd_prep_async(req);
|
||||
}
|
||||
return -EAGAIN;
|
||||
}
|
||||
|
||||
if (ret != -EIOCBQUEUED)
|
||||
io_uring_cmd_done(ioucmd, ret, 0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int io_shutdown_prep(struct io_kiocb *req,
|
||||
const struct io_uring_sqe *sqe)
|
||||
{
|
||||
@ -4992,11 +5238,25 @@ done:
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
/*
|
||||
* If the ring is setup with CQE32, relay back addr/addr
|
||||
*/
|
||||
if (req->ctx->flags & IORING_SETUP_CQE32) {
|
||||
req->nop.extra1 = READ_ONCE(sqe->addr);
|
||||
req->nop.extra2 = READ_ONCE(sqe->addr2);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* IORING_OP_NOP just posts a completion event, nothing else.
|
||||
*/
|
||||
static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
unsigned int cflags;
|
||||
void __user *buf;
|
||||
|
||||
if (req->flags & REQ_F_BUFFER_SELECT) {
|
||||
@ -5007,7 +5267,12 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
|
||||
return -ENOBUFS;
|
||||
}
|
||||
|
||||
__io_req_complete(req, issue_flags, 0, io_put_kbuf(req, issue_flags));
|
||||
cflags = io_put_kbuf(req, issue_flags);
|
||||
if (!(req->ctx->flags & IORING_SETUP_CQE32))
|
||||
__io_req_complete(req, issue_flags, 0, cflags);
|
||||
else
|
||||
__io_req_complete32(req, issue_flags, 0, cflags,
|
||||
req->nop.extra1, req->nop.extra2);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -6366,9 +6631,7 @@ static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_socket *sock = &req->sock;
|
||||
|
||||
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
|
||||
return -EINVAL;
|
||||
if (sqe->ioprio || sqe->addr || sqe->rw_flags || sqe->buf_index)
|
||||
if (sqe->addr || sqe->rw_flags || sqe->buf_index)
|
||||
return -EINVAL;
|
||||
|
||||
sock->domain = READ_ONCE(sqe->fd);
|
||||
@ -6750,7 +7013,7 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events)
|
||||
req->io_task_work.func = io_apoll_task_func;
|
||||
|
||||
trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask);
|
||||
io_req_task_work_add(req, false);
|
||||
io_req_task_work_add(req);
|
||||
}
|
||||
|
||||
static inline void io_poll_execute(struct io_kiocb *req, int res,
|
||||
@ -7255,7 +7518,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
|
||||
|
||||
req->cqe.res = -ETIME;
|
||||
req->io_task_work.func = io_req_task_complete;
|
||||
io_req_task_work_add(req, false);
|
||||
io_req_task_work_add(req);
|
||||
return HRTIMER_NORESTART;
|
||||
}
|
||||
|
||||
@ -7751,7 +8014,7 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
switch (req->opcode) {
|
||||
case IORING_OP_NOP:
|
||||
return 0;
|
||||
return io_nop_prep(req, sqe);
|
||||
case IORING_OP_READV:
|
||||
case IORING_OP_READ_FIXED:
|
||||
case IORING_OP_READ:
|
||||
@ -7835,6 +8098,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
return io_getxattr_prep(req, sqe);
|
||||
case IORING_OP_SOCKET:
|
||||
return io_socket_prep(req, sqe);
|
||||
case IORING_OP_URING_CMD:
|
||||
return io_uring_cmd_prep(req, sqe);
|
||||
}
|
||||
|
||||
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
|
||||
@ -7867,6 +8132,8 @@ static int io_req_prep_async(struct io_kiocb *req)
|
||||
return io_recvmsg_prep_async(req);
|
||||
case IORING_OP_CONNECT:
|
||||
return io_connect_prep_async(req);
|
||||
case IORING_OP_URING_CMD:
|
||||
return io_uring_cmd_prep_async(req);
|
||||
}
|
||||
printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
|
||||
req->opcode);
|
||||
@ -8161,6 +8428,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
|
||||
case IORING_OP_SOCKET:
|
||||
ret = io_socket(req, issue_flags);
|
||||
break;
|
||||
case IORING_OP_URING_CMD:
|
||||
ret = io_uring_cmd(req, issue_flags);
|
||||
break;
|
||||
default:
|
||||
ret = -EINVAL;
|
||||
break;
|
||||
@ -8371,7 +8641,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
|
||||
spin_unlock_irqrestore(&ctx->timeout_lock, flags);
|
||||
|
||||
req->io_task_work.func = io_req_task_link_timeout;
|
||||
io_req_task_work_add(req, false);
|
||||
io_req_task_work_add(req);
|
||||
return HRTIMER_NORESTART;
|
||||
}
|
||||
|
||||
@ -8761,8 +9031,12 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
|
||||
* though the application is the one updating it.
|
||||
*/
|
||||
head = READ_ONCE(ctx->sq_array[sq_idx]);
|
||||
if (likely(head < ctx->sq_entries))
|
||||
if (likely(head < ctx->sq_entries)) {
|
||||
/* double index for 128-byte SQEs, twice as long */
|
||||
if (ctx->flags & IORING_SETUP_SQE128)
|
||||
head <<= 1;
|
||||
return &ctx->sq_sqes[head];
|
||||
}
|
||||
|
||||
/* drop invalid entries */
|
||||
ctx->cq_extra--;
|
||||
@ -10080,7 +10354,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
|
||||
task->io_uring = tctx;
|
||||
spin_lock_init(&tctx->task_lock);
|
||||
INIT_WQ_LIST(&tctx->task_list);
|
||||
INIT_WQ_LIST(&tctx->prior_task_list);
|
||||
INIT_WQ_LIST(&tctx->prio_task_list);
|
||||
init_task_work(&tctx->task_work, tctx_task_work);
|
||||
return 0;
|
||||
}
|
||||
@ -10258,8 +10532,8 @@ static void *io_mem_alloc(size_t size)
|
||||
return (void *) __get_free_pages(gfp, get_order(size));
|
||||
}
|
||||
|
||||
static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
|
||||
size_t *sq_offset)
|
||||
static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
|
||||
unsigned int cq_entries, size_t *sq_offset)
|
||||
{
|
||||
struct io_rings *rings;
|
||||
size_t off, sq_array_size;
|
||||
@ -10267,6 +10541,10 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
|
||||
off = struct_size(rings, cqes, cq_entries);
|
||||
if (off == SIZE_MAX)
|
||||
return SIZE_MAX;
|
||||
if (ctx->flags & IORING_SETUP_CQE32) {
|
||||
if (check_shl_overflow(off, 1, &off))
|
||||
return SIZE_MAX;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
off = ALIGN(off, SMP_CACHE_BYTES);
|
||||
@ -11833,10 +12111,15 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
|
||||
unsigned int sq_tail = READ_ONCE(r->sq.tail);
|
||||
unsigned int cq_head = READ_ONCE(r->cq.head);
|
||||
unsigned int cq_tail = READ_ONCE(r->cq.tail);
|
||||
unsigned int cq_shift = 0;
|
||||
unsigned int sq_entries, cq_entries;
|
||||
bool has_lock;
|
||||
bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
|
||||
unsigned int i;
|
||||
|
||||
if (is_cqe32)
|
||||
cq_shift = 1;
|
||||
|
||||
/*
|
||||
* we may get imprecise sqe and cqe info if uring is actively running
|
||||
* since we get cached_sq_head and cached_cq_tail without uring_lock
|
||||
@ -11869,11 +12152,18 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
|
||||
cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
|
||||
for (i = 0; i < cq_entries; i++) {
|
||||
unsigned int entry = i + cq_head;
|
||||
struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask];
|
||||
struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift];
|
||||
|
||||
seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
|
||||
if (!is_cqe32) {
|
||||
seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
|
||||
entry & cq_mask, cqe->user_data, cqe->res,
|
||||
cqe->flags);
|
||||
} else {
|
||||
seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, "
|
||||
"extra1:%llu, extra2:%llu\n",
|
||||
entry & cq_mask, cqe->user_data, cqe->res,
|
||||
cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@ -11976,7 +12266,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
|
||||
ctx->sq_entries = p->sq_entries;
|
||||
ctx->cq_entries = p->cq_entries;
|
||||
|
||||
size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
|
||||
size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset);
|
||||
if (size == SIZE_MAX)
|
||||
return -EOVERFLOW;
|
||||
|
||||
@ -11991,7 +12281,10 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
|
||||
rings->sq_ring_entries = p->sq_entries;
|
||||
rings->cq_ring_entries = p->cq_entries;
|
||||
|
||||
size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
|
||||
if (p->flags & IORING_SETUP_SQE128)
|
||||
size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries);
|
||||
else
|
||||
size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
|
||||
if (size == SIZE_MAX) {
|
||||
io_mem_free(ctx->rings);
|
||||
ctx->rings = NULL;
|
||||
@ -12235,7 +12528,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
|
||||
IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
|
||||
IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
|
||||
IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
|
||||
IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG))
|
||||
IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
|
||||
IORING_SETUP_SQE128 | IORING_SETUP_CQE32))
|
||||
return -EINVAL;
|
||||
|
||||
return io_uring_create(entries, &p, params);
|
||||
@ -12924,6 +13218,8 @@ static int __init io_uring_init(void)
|
||||
|
||||
BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
|
||||
|
||||
BUILD_BUG_ON(sizeof(struct io_uring_cmd) > 64);
|
||||
|
||||
req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
|
||||
SLAB_ACCOUNT);
|
||||
return 0;
|
||||
|
@@ -1953,6 +1953,7 @@ struct dir_context {
#define REMAP_FILE_ADVISORY		(REMAP_FILE_CAN_SHORTEN)

struct iov_iter;
struct io_uring_cmd;

struct file_operations {
	struct module *owner;
@@ -1995,6 +1996,7 @@ struct file_operations {
			struct file *file_out, loff_t pos_out,
			loff_t len, unsigned int remap_flags);
	int (*fadvise)(struct file *, loff_t, loff_t, int);
	int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
} __randomize_layout;

struct inode_operations {
@@ -5,7 +5,32 @@
#include <linux/sched.h>
#include <linux/xarray.h>

enum io_uring_cmd_flags {
	IO_URING_F_COMPLETE_DEFER	= 1,
	IO_URING_F_UNLOCKED		= 2,
	/* int's last bit, sign checks are usually faster than a bit test */
	IO_URING_F_NONBLOCK		= INT_MIN,

	/* ctx state flags, for URING_CMD */
	IO_URING_F_SQE128		= 4,
	IO_URING_F_CQE32		= 8,
	IO_URING_F_IOPOLL		= 16,
};

struct io_uring_cmd {
	struct file	*file;
	const void	*cmd;
	/* callback to defer completions to task context */
	void (*task_work_cb)(struct io_uring_cmd *cmd);
	u32		cmd_op;
	u32		pad;
	u8		pdu[32]; /* available inline for free use */
};

#if defined(CONFIG_IO_URING)
void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2);
void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
			void (*task_work_cb)(struct io_uring_cmd *));
struct sock *io_uring_get_socket(struct file *file);
void __io_uring_cancel(bool cancel_all);
void __io_uring_free(struct task_struct *tsk);
@@ -30,6 +55,14 @@ static inline void io_uring_free(struct task_struct *tsk)
		__io_uring_free(tsk);
}
#else
static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
		ssize_t ret2)
{
}
static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
			void (*task_work_cb)(struct io_uring_cmd *))
{
}
static inline struct sock *io_uring_get_socket(struct file *file)
{
	return NULL;
@ -321,13 +321,16 @@ TRACE_EVENT(io_uring_fail_link,
|
||||
* @user_data: user data associated with the request
|
||||
* @res: result of the request
|
||||
* @cflags: completion flags
|
||||
* @extra1: extra 64-bit data for CQE32
|
||||
* @extra2: extra 64-bit data for CQE32
|
||||
*
|
||||
*/
|
||||
TRACE_EVENT(io_uring_complete,
|
||||
|
||||
TP_PROTO(void *ctx, void *req, u64 user_data, int res, unsigned cflags),
|
||||
TP_PROTO(void *ctx, void *req, u64 user_data, int res, unsigned cflags,
|
||||
u64 extra1, u64 extra2),
|
||||
|
||||
TP_ARGS(ctx, req, user_data, res, cflags),
|
||||
TP_ARGS(ctx, req, user_data, res, cflags, extra1, extra2),
|
||||
|
||||
TP_STRUCT__entry (
|
||||
__field( void *, ctx )
|
||||
@ -335,6 +338,8 @@ TRACE_EVENT(io_uring_complete,
|
||||
__field( u64, user_data )
|
||||
__field( int, res )
|
||||
__field( unsigned, cflags )
|
||||
__field( u64, extra1 )
|
||||
__field( u64, extra2 )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
@ -343,12 +348,17 @@ TRACE_EVENT(io_uring_complete,
|
||||
__entry->user_data = user_data;
|
||||
__entry->res = res;
|
||||
__entry->cflags = cflags;
|
||||
__entry->extra1 = extra1;
|
||||
__entry->extra2 = extra2;
|
||||
),
|
||||
|
||||
TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x",
|
||||
TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x "
|
||||
"extra1 %llu extra2 %llu ",
|
||||
__entry->ctx, __entry->req,
|
||||
__entry->user_data,
|
||||
__entry->res, __entry->cflags)
|
||||
__entry->res, __entry->cflags,
|
||||
(unsigned long long) __entry->extra1,
|
||||
(unsigned long long) __entry->extra2)
|
||||
);
|
||||
|
||||
/**
|
||||
|
@@ -22,6 +22,7 @@ struct io_uring_sqe {
	union {
		__u64	off;	/* offset into file */
		__u64	addr2;
		__u32	cmd_op;
	};
	union {
		__u64	addr;	/* pointer to buffer or iovecs */
@@ -61,8 +62,17 @@ struct io_uring_sqe {
		__s32	splice_fd_in;
		__u32	file_index;
	};
	__u64	addr3;
	__u64	__pad2[1];
	union {
		struct {
			__u64	addr3;
			__u64	__pad2[1];
		};
		/*
		 * If the ring is initialized with IORING_SETUP_SQE128, then
		 * this field is used for 80 bytes of arbitrary command data
		 */
		__u8	cmd[0];
	};
};

/*
@@ -128,6 +138,9 @@ enum {
 */
#define IORING_SETUP_TASKRUN_FLAG	(1U << 9)

#define IORING_SETUP_SQE128		(1U << 10) /* SQEs are 128 byte */
#define IORING_SETUP_CQE32		(1U << 11) /* CQEs are 32 byte */

enum io_uring_op {
	IORING_OP_NOP,
	IORING_OP_READV,
@@ -175,6 +188,7 @@ enum io_uring_op {
	IORING_OP_FGETXATTR,
	IORING_OP_GETXATTR,
	IORING_OP_SOCKET,
	IORING_OP_URING_CMD,

	/* this goes last, obviously */
	IORING_OP_LAST,
@@ -251,6 +265,12 @@ struct io_uring_cqe {
	__u64	user_data;	/* sqe->data submission passed back */
	__s32	res;		/* result code for this event */
	__u32	flags;

	/*
	 * If the ring is initialized with IORING_SETUP_CQE32, then this field
	 * contains 16-bytes of padding, doubling the size of the CQE.
	 */
	__u64 big_cqe[];
};

/*
@@ -70,6 +70,28 @@ struct nvme_passthru_cmd64 {
	__u64	result;
};

/* same as struct nvme_passthru_cmd64, minus the 8b result field */
struct nvme_uring_cmd {
	__u8	opcode;
	__u8	flags;
	__u16	rsvd1;
	__u32	nsid;
	__u32	cdw2;
	__u32	cdw3;
	__u64	metadata;
	__u64	addr;
	__u32	metadata_len;
	__u32	data_len;
	__u32	cdw10;
	__u32	cdw11;
	__u32	cdw12;
	__u32	cdw13;
	__u32	cdw14;
	__u32	cdw15;
	__u32	timeout_ms;
	__u32	rsvd2;
};

#define nvme_admin_cmd nvme_passthru_cmd

#define NVME_IOCTL_ID		_IO('N', 0x40)
@@ -83,4 +105,10 @@ struct nvme_passthru_cmd64 {
#define NVME_IOCTL_IO64_CMD	_IOWR('N', 0x48, struct nvme_passthru_cmd64)
#define NVME_IOCTL_IO64_CMD_VEC	_IOWR('N', 0x49, struct nvme_passthru_cmd64)

/* io_uring async commands: */
#define NVME_URING_CMD_IO	_IOWR('N', 0x80, struct nvme_uring_cmd)
#define NVME_URING_CMD_IO_VEC	_IOWR('N', 0x81, struct nvme_uring_cmd)
#define NVME_URING_CMD_ADMIN	_IOWR('N', 0x82, struct nvme_uring_cmd)
#define NVME_URING_CMD_ADMIN_VEC _IOWR('N', 0x83, struct nvme_uring_cmd)

#endif /* _UAPI_LINUX_NVME_IOCTL_H */
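As a closing sanity check (simple arithmetic over the structures above, not something taken from the commit): a 128-byte SQE leaves 80 bytes of command payload after the 48-byte offset of 'cmd', which is exactly what the uring_cmd_pdu_size() macro computes, and struct nvme_uring_cmd is 72 bytes, so the full NVMe command fits inline. With kernel headers that already carry these definitions, that can be expressed as:

#include <stddef.h>
#include <linux/io_uring.h>
#include <linux/nvme_ioctl.h>

/* Layout assumptions behind the SQE128/CQE32 passthrough path. */
_Static_assert(sizeof(struct io_uring_sqe) == 64,
	       "base SQE is 64 bytes");
_Static_assert(2 * sizeof(struct io_uring_sqe) -
	       offsetof(struct io_uring_sqe, cmd) == 80,
	       "SQE128 leaves 80 bytes of command payload");
_Static_assert(sizeof(struct nvme_uring_cmd) == 72,
	       "nvme_uring_cmd fits in the SQE128 payload");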