From 40c6f9c28ef03f2f2c3ee58c2447a6e6b9a713f2 Mon Sep 17 00:00:00 2001 From: Revanth Rajashekar Date: Fri, 15 Jun 2018 12:39:27 -0600 Subject: [PATCH 01/19] nvme.h: resync with nvme-cli Added some feature IDs that are present in nvme-cli but not in the kernel. Signed-off-by: Revanth Rajashekar Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 2950ce957656..80dfedcf0bf7 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -749,6 +749,11 @@ enum { NVME_FEAT_HOST_MEM_BUF = 0x0d, NVME_FEAT_TIMESTAMP = 0x0e, NVME_FEAT_KATO = 0x0f, + NVME_FEAT_HCTM = 0x10, + NVME_FEAT_NOPSC = 0x11, + NVME_FEAT_RRL = 0x12, + NVME_FEAT_PLM_CONFIG = 0x13, + NVME_FEAT_PLM_WINDOW = 0x14, NVME_FEAT_SW_PROGRESS = 0x80, NVME_FEAT_HOST_ID = 0x81, NVME_FEAT_RESV_MASK = 0x82, From 230f1f9e04e2abee34b1478b3bcc2d947b7cc2a0 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 12 Jun 2018 16:28:24 -0700 Subject: [PATCH 02/19] nvme: move init of keep_alive work item to controller initialization Currently, the code initializes the keep alive work item whenever nvme_start_keep_alive() is called. However, this routine is called several times while reconnecting, etc. Although keep alive is expected to be disabled and not scheduled when start is called, re-initializing a work item that is still scheduled or completing can have very bad side effects. There's no need for re-initialization, so move the keep_alive work item and cmd struct initialization to controller init. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 46df030b2c3f..e541fe268bcf 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -848,9 +848,6 @@ static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) if (unlikely(ctrl->kato == 0)) return; - INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); - memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); - ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); } @@ -3484,6 +3481,10 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work); + INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); + memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); + ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; + ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); if (ret < 0) goto out; From 0866bf0c3778661e65f68a5c93df8e0a1e9e43cc Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 11 Jun 2018 13:40:07 -0400 Subject: [PATCH 03/19] nvmet: add commands supported and effects log page This patch adds support for the Commands Supported and Effects log page (Log Identifier 05h) for NVMeOF. This also makes it easier to find which commands are supported, e.g.: subnqn : testnqn1 Admin Command Set ACS2 [Get Log Page ] 00000001 ACS6 [Identify ] 00000001 ACS8 [Abort ] 00000001 ACS9 [Set Features ] 00000001 ACS10 [Get Features ] 00000001 ACS12 [Asynchronous Event Request ] 00000001 ACS24 [Keep Alive ] 00000001 NVM Command Set IOCS0 [Flush ] 00000001 IOCS1 [Write ] 00000001 IOCS2 [Read ] 00000001 IOCS8 [Write Zeroes ] 00000001 IOCS9 [Dataset Management ] 00000001 This particular functionality can be used from the host side to examine the NVMeOF ctrl commands supported.
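As a usage illustration (not part of the patch): once the target exports this log, a host can fetch it with nvme-cli ("nvme get-log /dev/nvme0 --log-id=5 --log-len=4096") or through the admin passthru ioctl. A minimal userspace sketch, assuming a hypothetical /dev/nvme0 controller node and decoding only the CSUPP bit of each Admin Command Set entry:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvme_ioctl.h>

int main(void)
{
        unsigned char log[4096];                /* effects log is 4KB */
        struct nvme_admin_cmd cmd;
        int fd, i;

        fd = open("/dev/nvme0", O_RDONLY);      /* illustrative path */
        if (fd < 0)
                return 1;
        memset(&cmd, 0, sizeof(cmd));
        cmd.opcode = 0x02;                      /* Get Log Page */
        cmd.nsid = 0xffffffff;
        cmd.addr = (unsigned long)log;
        cmd.data_len = sizeof(log);
        /* cdw10: NUMDL (0's based dword count) in bits 31:16, LID in 7:0 */
        cmd.cdw10 = ((sizeof(log) / 4 - 1) << 16) | 0x05;
        if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd) < 0)
                return 1;
        for (i = 0; i < 256; i++)               /* ACS entries, 32-bit LE */
                if (log[i * 4] & 1)
                        printf("ACS%d supported\n", i);
        return 0;
}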
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 35 ++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 38803576d5e1..e2c6f8b39388 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -128,6 +128,36 @@ out: nvmet_req_complete(req, status); } +static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req) +{ + u16 status = NVME_SC_INTERNAL; + struct nvme_effects_log *log; + + log = kzalloc(sizeof(*log), GFP_KERNEL); + if (!log) + goto out; + + log->acs[nvme_admin_get_log_page] = cpu_to_le32(1 << 0); + log->acs[nvme_admin_identify] = cpu_to_le32(1 << 0); + log->acs[nvme_admin_abort_cmd] = cpu_to_le32(1 << 0); + log->acs[nvme_admin_set_features] = cpu_to_le32(1 << 0); + log->acs[nvme_admin_get_features] = cpu_to_le32(1 << 0); + log->acs[nvme_admin_async_event] = cpu_to_le32(1 << 0); + log->acs[nvme_admin_keep_alive] = cpu_to_le32(1 << 0); + + log->iocs[nvme_cmd_read] = cpu_to_le32(1 << 0); + log->iocs[nvme_cmd_write] = cpu_to_le32(1 << 0); + log->iocs[nvme_cmd_flush] = cpu_to_le32(1 << 0); + log->iocs[nvme_cmd_dsm] = cpu_to_le32(1 << 0); + log->iocs[nvme_cmd_write_zeroes] = cpu_to_le32(1 << 0); + + status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log)); + + kfree(log); +out: + nvmet_req_complete(req, status); +} + static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; @@ -208,7 +238,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) /* first slot is read-only, only one slot supported */ id->frmw = (1 << 0) | (1 << 1); - id->lpa = (1 << 0) | (1 << 2); + id->lpa = (1 << 0) | (1 << 1) | (1 << 2); id->elpe = NVMET_ERROR_LOG_SLOTS - 1; id->npss = 0; @@ -586,6 +616,9 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) case NVME_LOG_CHANGED_NS: req->execute = nvmet_execute_get_log_changed_ns; return 0; + case NVME_LOG_CMD_EFFECTS: + req->execute = nvmet_execute_get_log_cmd_effects_ns; + return 0; } break; case nvme_admin_identify: From 55eb942eda2ccbbbea61db4c1a774ba22b618046 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 20 Jun 2018 00:01:41 -0400 Subject: [PATCH 04/19] nvmet: add buffered I/O support for file backed ns Add a new "buffered_io" attribute which, when set, disables direct I/O and thus enables page cache based buffering. The attribute can only be changed while the namespace is disabled, as the file has to be reopened for the change to take effect. The possibly blocking reads/writes are deferred to a newly introduced global workqueue.
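For example, under the nvmet configfs layout (paths illustrative, for a namespace whose device_path is already set), switching to buffered I/O requires the disable/enable round trip that forces the file to be reopened without O_DIRECT:

cd /sys/kernel/config/nvmet/subsystems/testnqn1/namespaces/1
echo 0 > enable
echo 1 > buffered_io
echo 1 > enable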
Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 29 +++++++++++++++++++++++++++++ drivers/nvme/target/core.c | 9 +++++++++ drivers/nvme/target/io-cmd-file.c | 31 ++++++++++++++++++++++++++----- drivers/nvme/target/nvmet.h | 3 +++ 4 files changed, 67 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index d3f3b3ec4d1a..fee56b3a23bc 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -407,11 +407,40 @@ static ssize_t nvmet_ns_enable_store(struct config_item *item, CONFIGFS_ATTR(nvmet_ns_, enable); +static ssize_t nvmet_ns_buffered_io_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", to_nvmet_ns(item)->buffered_io); +} + +static ssize_t nvmet_ns_buffered_io_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + bool val; + + if (strtobool(page, &val)) + return -EINVAL; + + mutex_lock(&ns->subsys->lock); + if (ns->enabled) { + pr_err("disable ns before setting buffered_io value.\n"); + mutex_unlock(&ns->subsys->lock); + return -EINVAL; + } + + ns->buffered_io = val; + mutex_unlock(&ns->subsys->lock); + return count; +} + +CONFIGFS_ATTR(nvmet_ns_, buffered_io); + static struct configfs_attribute *nvmet_ns_attrs[] = { &nvmet_ns_attr_device_path, &nvmet_ns_attr_device_nguid, &nvmet_ns_attr_device_uuid, &nvmet_ns_attr_enable, + &nvmet_ns_attr_buffered_io, NULL, }; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 74d4b785d2da..96eafbd419e7 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -18,6 +18,7 @@ #include "nvmet.h" +struct workqueue_struct *buffered_io_wq; static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX]; static DEFINE_IDA(cntlid_ida); @@ -437,6 +438,7 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) ns->nsid = nsid; ns->subsys = subsys; uuid_gen(&ns->uuid); + ns->buffered_io = false; return ns; } @@ -1109,6 +1111,12 @@ static int __init nvmet_init(void) { int error; + buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq", + WQ_MEM_RECLAIM, 0); + if (!buffered_io_wq) { + error = -ENOMEM; + goto out; + } error = nvmet_init_discovery(); if (error) goto out; @@ -1129,6 +1137,7 @@ static void __exit nvmet_exit(void) nvmet_exit_configfs(); nvmet_exit_discovery(); ida_destroy(&cntlid_ida); + destroy_workqueue(buffered_io_wq); BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024); BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024); diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 8c42b3a8c420..57c660e3245d 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -16,6 +16,8 @@ void nvmet_file_ns_disable(struct nvmet_ns *ns) { if (ns->file) { + if (ns->buffered_io) + flush_workqueue(buffered_io_wq); mempool_destroy(ns->bvec_pool); ns->bvec_pool = NULL; kmem_cache_destroy(ns->bvec_cache); @@ -27,11 +29,14 @@ void nvmet_file_ns_disable(struct nvmet_ns *ns) int nvmet_file_ns_enable(struct nvmet_ns *ns) { - int ret; + int flags = O_RDWR | O_LARGEFILE; struct kstat stat; + int ret; - ns->file = filp_open(ns->device_path, - O_RDWR | O_LARGEFILE | O_DIRECT, 0); + if (!ns->buffered_io) + flags |= O_DIRECT; + + ns->file = filp_open(ns->device_path, flags, 0); if (IS_ERR(ns->file)) { pr_err("failed to open file %s: (%ld)\n", ns->device_path, PTR_ERR(ns->file)); @@ -100,7 +105,7 @@ static 
ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos, iocb->ki_pos = pos; iocb->ki_filp = req->ns->file; - iocb->ki_flags = IOCB_DIRECT | ki_flags; + iocb->ki_flags = ki_flags | iocb_flags(req->ns->file); ret = call_iter(iocb, &iter); @@ -189,6 +194,19 @@ out: nvmet_file_submit_bvec(req, pos, bv_cnt, total_len); } +static void nvmet_file_buffered_io_work(struct work_struct *w) +{ + struct nvmet_req *req = container_of(w, struct nvmet_req, f.work); + + nvmet_file_execute_rw(req); +} + +static void nvmet_file_execute_rw_buffered_io(struct nvmet_req *req) +{ + INIT_WORK(&req->f.work, nvmet_file_buffered_io_work); + queue_work(buffered_io_wq, &req->f.work); +} + static void nvmet_file_flush_work(struct work_struct *w) { struct nvmet_req *req = container_of(w, struct nvmet_req, f.work); @@ -280,7 +298,10 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req) switch (cmd->common.opcode) { case nvme_cmd_read: case nvme_cmd_write: - req->execute = nvmet_file_execute_rw; + if (req->ns->buffered_io) + req->execute = nvmet_file_execute_rw_buffered_io; + else + req->execute = nvmet_file_execute_rw; req->data_len = nvmet_rw_len(req); return 0; case nvme_cmd_flush: diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 480dfe10fad9..5efb98ec95df 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -65,6 +65,7 @@ struct nvmet_ns { u8 nguid[16]; uuid_t uuid; + bool buffered_io; bool enabled; struct nvmet_subsys *subsys; const char *device_path; @@ -269,6 +270,8 @@ struct nvmet_req { const struct nvmet_fabrics_ops *ops; }; +extern struct workqueue_struct *buffered_io_wq; + static inline void nvmet_set_status(struct nvmet_req *req, u16 status) { req->rsp->status = cpu_to_le16(status << 1); From 64a741c1eaa83e34a8846c7196feb8e45785bebc Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 20 Jun 2018 07:15:05 -0700 Subject: [PATCH 05/19] nvme-rdma: support up to 4 segments of inline data Allow up to 4 segments of inline data for NVMF WRITE operations. This reduces latency for small WRITEs by removing the need for the target to issue a READ WR for IB, or a REG_MR + READ WR chain for iWarp. Also cap the inline segments used based on the limitations of the device. 
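The resulting mapping decision can be condensed into the following sketch (illustration only; the names are the driver's own, and nvme_rdma_map_sg_fr() is the existing memory-registration fallback):

/*
 * Condensed from nvme_rdma_map_data() after this patch: inline data
 * is now tried for up to num_inline_segments SG entries, not just one.
 */
if (count <= dev->num_inline_segments) {
        if (rq_data_dir(rq) == WRITE &&
            nvme_rdma_queue_idx(queue) &&       /* never the admin queue */
            queue->ctrl->use_inline_data &&     /* target set SGLS bit 20 */
            blk_rq_payload_bytes(rq) <= nvme_rdma_inline_data_size(queue))
                return nvme_rdma_map_sg_inline(queue, req, c, count);
        if (count == 1 && (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY))
                return nvme_rdma_map_sg_single(queue, req, c);
}
return nvme_rdma_map_sg_fr(queue, req, c, count);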
Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Steve Wise Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 518c5b09038c..363f73fe549c 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -40,13 +40,14 @@ #define NVME_RDMA_MAX_SEGMENTS 256 -#define NVME_RDMA_MAX_INLINE_SEGMENTS 1 +#define NVME_RDMA_MAX_INLINE_SEGMENTS 4 struct nvme_rdma_device { struct ib_device *dev; struct ib_pd *pd; struct kref ref; struct list_head entry; + unsigned int num_inline_segments; }; struct nvme_rdma_qe { @@ -117,6 +118,7 @@ struct nvme_rdma_ctrl { struct sockaddr_storage src_addr; struct nvme_ctrl ctrl; + bool use_inline_data; }; static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl) @@ -249,7 +251,7 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor) /* +1 for drain */ init_attr.cap.max_recv_wr = queue->queue_size + 1; init_attr.cap.max_recv_sge = 1; - init_attr.cap.max_send_sge = 1 + NVME_RDMA_MAX_INLINE_SEGMENTS; + init_attr.cap.max_send_sge = 1 + dev->num_inline_segments; init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; init_attr.qp_type = IB_QPT_RC; init_attr.send_cq = queue->ib_cq; @@ -374,6 +376,8 @@ nvme_rdma_find_get_device(struct rdma_cm_id *cm_id) goto out_free_pd; } + ndev->num_inline_segments = min(NVME_RDMA_MAX_INLINE_SEGMENTS, + ndev->dev->attrs.max_sge - 1); list_add(&ndev->entry, &device_list); out_unlock: mutex_unlock(&device_list_mutex); @@ -925,6 +929,9 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) if (ret) goto requeue; + if (ctrl->ctrl.sgls & (1 << 20)) + ctrl->use_inline_data = true; + if (ctrl->ctrl.queue_count > 1) { ret = nvme_rdma_configure_io_queues(ctrl, false); if (ret) @@ -1090,19 +1097,27 @@ static int nvme_rdma_set_sg_null(struct nvme_command *c) } static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue, - struct nvme_rdma_request *req, struct nvme_command *c) + struct nvme_rdma_request *req, struct nvme_command *c, + int count) { struct nvme_sgl_desc *sg = &c->common.dptr.sgl; + struct scatterlist *sgl = req->sg_table.sgl; + struct ib_sge *sge = &req->sge[1]; + u32 len = 0; + int i; - req->sge[1].addr = sg_dma_address(req->sg_table.sgl); - req->sge[1].length = sg_dma_len(req->sg_table.sgl); - req->sge[1].lkey = queue->device->pd->local_dma_lkey; + for (i = 0; i < count; i++, sgl++, sge++) { + sge->addr = sg_dma_address(sgl); + sge->length = sg_dma_len(sgl); + sge->lkey = queue->device->pd->local_dma_lkey; + len += sge->length; + } sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff); - sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl)); + sg->length = cpu_to_le32(len); sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; - req->num_sge++; + req->num_sge += count; return 0; } @@ -1195,15 +1210,16 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, goto out_free_table; } - if (count == 1) { + if (count <= dev->num_inline_segments) { if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) && + queue->ctrl->use_inline_data && blk_rq_payload_bytes(rq) <= nvme_rdma_inline_data_size(queue)) { - ret = nvme_rdma_map_sg_inline(queue, req, c); + ret = nvme_rdma_map_sg_inline(queue, req, c, count); goto out; } - if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) { + if (count == 1 && dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) { ret = nvme_rdma_map_sg_single(queue, req, c); 
goto out; } From 0d5ee2b2ab4f6776c361bc975c2323bc8b5cf349 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 20 Jun 2018 07:15:10 -0700 Subject: [PATCH 06/19] nvmet-rdma: support max(16KB, PAGE_SIZE) inline data The patch enables inline data sizes using up to 4 recv SGEs, capping the size at 16KB or at least one page. So on a 4K page system, up to 16KB is supported, and on a 64K page system one page of 64KB is supported. We avoid > 0 order page allocations for the inline buffers by using multiple recv SGEs, one for each page. If the device cannot support the configured inline data size due to a lack of enough recv SGEs, then log a warning and reduce the inline size. Add a new configfs port attribute, called param_inline_data_size, to allow configuring the size of inline data for a given nvmf port. The maximum size allowed is still enforced by nvmet-rdma with NVMET_RDMA_MAX_INLINE_DATA_SIZE, which is now max(16KB, PAGE_SIZE). And the default size, if not specified via configfs, is still PAGE_SIZE. This preserves the existing behavior, but allows larger inline sizes for small page systems. If the configured inline data size exceeds NVMET_RDMA_MAX_INLINE_DATA_SIZE, a warning is logged and the size is reduced. If param_inline_data_size is set to 0, then inline data is disabled for that nvmf port. Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Steve Wise Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 4 +- drivers/nvme/target/configfs.c | 31 ++++++ drivers/nvme/target/core.c | 4 + drivers/nvme/target/discovery.c | 2 +- drivers/nvme/target/nvmet.h | 2 +- drivers/nvme/target/rdma.c | 169 ++++++++++++++++++++++++-------- 6 files changed, 169 insertions(+), 43 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index e2c6f8b39388..837bbdbfaa4b 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -268,14 +268,14 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ if (ctrl->ops->has_keyed_sgls) id->sgls |= cpu_to_le32(1 << 2); - if (ctrl->ops->sqe_inline_size) + if (req->port->inline_data_size) id->sgls |= cpu_to_le32(1 << 20); strcpy(id->subnqn, ctrl->subsys->subsysnqn); /* Max command capsule size is sqe + single page of in-capsule data */ id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) + - ctrl->ops->sqe_inline_size) / 16); + req->port->inline_data_size) / 16); /* Max response capsule size is cqe */ id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16); diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index fee56b3a23bc..3ba5ea5c4376 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -218,6 +218,35 @@ static ssize_t nvmet_addr_trsvcid_store(struct config_item *item, CONFIGFS_ATTR(nvmet_, addr_trsvcid); +static ssize_t nvmet_param_inline_data_size_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%d\n", port->inline_data_size); +} + +static ssize_t nvmet_param_inline_data_size_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + int ret; + + if (port->enabled) { + pr_err("Cannot modify inline_data_size while port enabled\n"); + pr_err("Disable the port before modifying\n"); + return -EACCES; + } + ret = kstrtoint(page, 0, &port->inline_data_size); + if (ret) { + pr_err("Invalid
value '%s' for inline_data_size\n", page); + return -EINVAL; + } + return count; +} + +CONFIGFS_ATTR(nvmet_, param_inline_data_size); + static ssize_t nvmet_addr_trtype_show(struct config_item *item, char *page) { @@ -903,6 +932,7 @@ static struct configfs_attribute *nvmet_port_attrs[] = { &nvmet_attr_addr_traddr, &nvmet_attr_addr_trsvcid, &nvmet_attr_addr_trtype, + &nvmet_attr_param_inline_data_size, NULL, }; @@ -932,6 +962,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group, INIT_LIST_HEAD(&port->entry); INIT_LIST_HEAD(&port->subsystems); INIT_LIST_HEAD(&port->referrals); + port->inline_data_size = -1; /* < 0 == let the transport choose */ port->disc_addr.portid = cpu_to_le16(portid); config_group_init_type_name(&port->group, name, &nvmet_port_type); diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 96eafbd419e7..ddd85715a00a 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -242,6 +242,10 @@ int nvmet_enable_port(struct nvmet_port *port) return ret; } + /* If the transport didn't set inline_data_size, then disable it. */ + if (port->inline_data_size < 0) + port->inline_data_size = 0; + port->enabled = true; return 0; } diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 08656b849bd6..eae29f493a07 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -171,7 +171,7 @@ static void nvmet_execute_identify_disc_ctrl(struct nvmet_req *req) id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ if (ctrl->ops->has_keyed_sgls) id->sgls |= cpu_to_le32(1 << 2); - if (ctrl->ops->sqe_inline_size) + if (req->port->inline_data_size) id->sgls |= cpu_to_le32(1 << 20); strcpy(id->subnqn, ctrl->subsys->subsysnqn); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 5efb98ec95df..688993855402 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -117,6 +117,7 @@ struct nvmet_port { struct list_head referrals; void *priv; bool enabled; + int inline_data_size; }; static inline struct nvmet_port *to_nvmet_port(struct config_item *item) @@ -226,7 +227,6 @@ struct nvmet_req; struct nvmet_fabrics_ops { struct module *owner; unsigned int type; - unsigned int sqe_inline_size; unsigned int msdbd; bool has_keyed_sgls : 1; void (*queue_response)(struct nvmet_req *req); diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 52e0c5d579a7..2106ae2ec177 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -33,16 +33,17 @@ #include "nvmet.h" /* - * We allow up to a page of inline data to go with the SQE + * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data */ -#define NVMET_RDMA_INLINE_DATA_SIZE PAGE_SIZE +#define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE PAGE_SIZE +#define NVMET_RDMA_MAX_INLINE_SGE 4 +#define NVMET_RDMA_MAX_INLINE_DATA_SIZE max_t(int, SZ_16K, PAGE_SIZE) struct nvmet_rdma_cmd { - struct ib_sge sge[2]; + struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1]; struct ib_cqe cqe; struct ib_recv_wr wr; - struct scatterlist inline_sg; - struct page *inline_page; + struct scatterlist inline_sg[NVMET_RDMA_MAX_INLINE_SGE]; struct nvme_command *nvme_cmd; struct nvmet_rdma_queue *queue; }; @@ -116,6 +117,8 @@ struct nvmet_rdma_device { size_t srq_size; struct kref ref; struct list_head entry; + int inline_data_size; + int inline_page_count; }; static bool nvmet_rdma_use_srq; @@ -138,6 +141,11 @@ static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue); static 
const struct nvmet_fabrics_ops nvmet_rdma_ops; +static int num_pages(int len) +{ + return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT); +} + /* XXX: really should move to a generic header sooner or later.. */ static inline u32 get_unaligned_le24(const u8 *p) { @@ -184,6 +192,71 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp) spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags); } +static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *c) +{ + struct scatterlist *sg; + struct ib_sge *sge; + int i; + + if (!ndev->inline_data_size) + return; + + sg = c->inline_sg; + sge = &c->sge[1]; + + for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) { + if (sge->length) + ib_dma_unmap_page(ndev->device, sge->addr, + sge->length, DMA_FROM_DEVICE); + if (sg_page(sg)) + __free_page(sg_page(sg)); + } +} + +static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *c) +{ + struct scatterlist *sg; + struct ib_sge *sge; + struct page *pg; + int len; + int i; + + if (!ndev->inline_data_size) + return 0; + + sg = c->inline_sg; + sg_init_table(sg, ndev->inline_page_count); + sge = &c->sge[1]; + len = ndev->inline_data_size; + + for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) { + pg = alloc_page(GFP_KERNEL); + if (!pg) + goto out_err; + sg_assign_page(sg, pg); + sge->addr = ib_dma_map_page(ndev->device, + pg, 0, PAGE_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ndev->device, sge->addr)) + goto out_err; + sge->length = min_t(int, len, PAGE_SIZE); + sge->lkey = ndev->pd->local_dma_lkey; + len -= sge->length; + } + + return 0; +out_err: + for (; i >= 0; i--, sg--, sge--) { + if (sge->length) + ib_dma_unmap_page(ndev->device, sge->addr, + sge->length, DMA_FROM_DEVICE); + if (sg_page(sg)) + __free_page(sg_page(sg)); + } + return -ENOMEM; +} + static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, struct nvmet_rdma_cmd *c, bool admin) { @@ -200,33 +273,17 @@ static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, c->sge[0].length = sizeof(*c->nvme_cmd); c->sge[0].lkey = ndev->pd->local_dma_lkey; - if (!admin) { - c->inline_page = alloc_pages(GFP_KERNEL, - get_order(NVMET_RDMA_INLINE_DATA_SIZE)); - if (!c->inline_page) - goto out_unmap_cmd; - c->sge[1].addr = ib_dma_map_page(ndev->device, - c->inline_page, 0, NVMET_RDMA_INLINE_DATA_SIZE, - DMA_FROM_DEVICE); - if (ib_dma_mapping_error(ndev->device, c->sge[1].addr)) - goto out_free_inline_page; - c->sge[1].length = NVMET_RDMA_INLINE_DATA_SIZE; - c->sge[1].lkey = ndev->pd->local_dma_lkey; - } + if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c)) + goto out_unmap_cmd; c->cqe.done = nvmet_rdma_recv_done; c->wr.wr_cqe = &c->cqe; c->wr.sg_list = c->sge; - c->wr.num_sge = admin ? 1 : 2; + c->wr.num_sge = admin ? 
1 : ndev->inline_page_count + 1; return 0; -out_free_inline_page: - if (!admin) { - __free_pages(c->inline_page, - get_order(NVMET_RDMA_INLINE_DATA_SIZE)); - } out_unmap_cmd: ib_dma_unmap_single(ndev->device, c->sge[0].addr, sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); @@ -240,12 +297,8 @@ out: static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev, struct nvmet_rdma_cmd *c, bool admin) { - if (!admin) { - ib_dma_unmap_page(ndev->device, c->sge[1].addr, - NVMET_RDMA_INLINE_DATA_SIZE, DMA_FROM_DEVICE); - __free_pages(c->inline_page, - get_order(NVMET_RDMA_INLINE_DATA_SIZE)); - } + if (!admin) + nvmet_rdma_free_inline_pages(ndev, c); ib_dma_unmap_single(ndev->device, c->sge[0].addr, sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); kfree(c->nvme_cmd); @@ -429,7 +482,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp) rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); } - if (rsp->req.sg != &rsp->cmd->inline_sg) + if (rsp->req.sg != rsp->cmd->inline_sg) sgl_free(rsp->req.sg); if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list))) @@ -529,10 +582,25 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len, u64 off) { - sg_init_table(&rsp->cmd->inline_sg, 1); - sg_set_page(&rsp->cmd->inline_sg, rsp->cmd->inline_page, len, off); - rsp->req.sg = &rsp->cmd->inline_sg; - rsp->req.sg_cnt = 1; + int sg_count = num_pages(len); + struct scatterlist *sg; + int i; + + sg = rsp->cmd->inline_sg; + for (i = 0; i < sg_count; i++, sg++) { + if (i < sg_count - 1) + sg_unmark_end(sg); + else + sg_mark_end(sg); + sg->offset = off; + sg->length = min_t(int, len, PAGE_SIZE - off); + len -= sg->length; + if (!i) + off = 0; + } + + rsp->req.sg = rsp->cmd->inline_sg; + rsp->req.sg_cnt = sg_count; } static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) @@ -544,7 +612,7 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) if (!nvme_is_write(rsp->req.cmd)) return NVME_SC_INVALID_FIELD | NVME_SC_DNR; - if (off + len > NVMET_RDMA_INLINE_DATA_SIZE) { + if (off + len > rsp->queue->dev->inline_data_size) { pr_err("invalid inline data offset!\n"); return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR; } @@ -743,7 +811,7 @@ static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev) srq_size = 4095; /* XXX: tune */ srq_attr.attr.max_wr = srq_size; - srq_attr.attr.max_sge = 2; + srq_attr.attr.max_sge = 1 + ndev->inline_page_count; srq_attr.attr.srq_limit = 0; srq_attr.srq_type = IB_SRQT_BASIC; srq = ib_create_srq(ndev->pd, &srq_attr); @@ -793,7 +861,10 @@ static void nvmet_rdma_free_dev(struct kref *ref) static struct nvmet_rdma_device * nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) { + struct nvmet_port *port = cm_id->context; struct nvmet_rdma_device *ndev; + int inline_page_count; + int inline_sge_count; int ret; mutex_lock(&device_list_mutex); @@ -807,6 +878,18 @@ nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) if (!ndev) goto out_err; + inline_page_count = num_pages(port->inline_data_size); + inline_sge_count = max(cm_id->device->attrs.max_sge_rd, + cm_id->device->attrs.max_sge) - 1; + if (inline_page_count > inline_sge_count) { + pr_warn("inline_data_size %d cannot be supported by device %s. 
Reducing to %lu.\n", + port->inline_data_size, cm_id->device->name, + inline_sge_count * PAGE_SIZE); + port->inline_data_size = inline_sge_count * PAGE_SIZE; + inline_page_count = inline_sge_count; + } + ndev->inline_data_size = port->inline_data_size; + ndev->inline_page_count = inline_page_count; ndev->device = cm_id->device; kref_init(&ndev->ref); @@ -881,7 +964,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) } else { /* +1 for drain */ qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size; - qp_attr.cap.max_recv_sge = 2; + qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count; } ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr); @@ -1379,6 +1462,15 @@ static int nvmet_rdma_add_port(struct nvmet_port *port) return -EINVAL; } + if (port->inline_data_size < 0) { + port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE; + } else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) { + pr_warn("inline_data_size %u is too large, reducing to %u\n", + port->inline_data_size, + NVMET_RDMA_MAX_INLINE_DATA_SIZE); + port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE; + } + ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr, port->disc_addr.trsvcid, &addr); if (ret) { @@ -1456,7 +1548,6 @@ static void nvmet_rdma_disc_port_addr(struct nvmet_req *req, static const struct nvmet_fabrics_ops nvmet_rdma_ops = { .owner = THIS_MODULE, .type = NVMF_TRTYPE_RDMA, - .sqe_inline_size = NVMET_RDMA_INLINE_DATA_SIZE, .msdbd = 1, .has_keyed_sgls = 1, .add_port = nvmet_rdma_add_port, From 2fc464e2162c2b2f7faf7404fa9c35d1cf70aa00 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Wed, 27 Jun 2018 14:58:02 +0300 Subject: [PATCH 07/19] nvmet-rdma: add unlikely check in the fast path The ib_post_send operation should succeed unless something unusual happened to the ib device. Signed-off-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 2106ae2ec177..4ca09456bbbb 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -546,7 +546,7 @@ static void nvmet_rdma_queue_response(struct nvmet_req *req) rsp->send_sge.addr, rsp->send_sge.length, DMA_TO_DEVICE); - if (ib_post_send(cm_id->qp, first_wr, &bad_wr)) { + if (unlikely(ib_post_send(cm_id->qp, first_wr, &bad_wr))) { pr_err("sending cmd response failed\n"); nvmet_rdma_release_rsp(rsp); } From 202093848cac2da7d92ae666b51b7109bbab633c Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 1 Jul 2018 12:20:24 +0300 Subject: [PATCH 08/19] nvmet-rdma: add an error flow for post_recv failures Posting a receive buffer can fail, thus we should make sure to have an error flow during the initialization phase. While we're here, add a debug print in case of a failure.
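The fix follows the usual kernel pattern for initialization loops: stop at the first failure and unwind whatever was set up so far. A generic sketch of the idiom with hypothetical post_one()/undo_one() helpers (the patch below applies it to the SRQ and per-queue recv buffers):

static int post_all(struct ctx *c, int n)
{
        int i, ret;

        for (i = 0; i < n; i++) {
                ret = post_one(c, i);   /* can fail, e.g. QP in error state */
                if (ret)
                        goto out_unwind;
        }
        return 0;

out_unwind:
        while (--i >= 0)
                undo_one(c, i);
        return ret;
}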
Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/target/rdma.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 4ca09456bbbb..e7f43d1e1779 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -436,14 +436,21 @@ static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev, struct nvmet_rdma_cmd *cmd) { struct ib_recv_wr *bad_wr; + int ret; ib_dma_sync_single_for_device(ndev->device, cmd->sge[0].addr, cmd->sge[0].length, DMA_FROM_DEVICE); if (ndev->srq) - return ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr); - return ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr); + ret = ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr); + else + ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr); + + if (unlikely(ret)) + pr_err("post_recv cmd failed\n"); + + return ret; } static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue) @@ -833,11 +840,16 @@ static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev) ndev->srq = srq; ndev->srq_size = srq_size; - for (i = 0; i < srq_size; i++) - nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]); + for (i = 0; i < srq_size; i++) { + ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]); + if (ret) + goto out_free_cmds; + } return 0; +out_free_cmds: + nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false); out_destroy_srq: ib_destroy_srq(srq); return ret; @@ -982,13 +994,17 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) if (!ndev->srq) { for (i = 0; i < queue->recv_queue_size; i++) { queue->cmds[i].queue = queue; - nvmet_rdma_post_recv(ndev, &queue->cmds[i]); + ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]); + if (ret) + goto err_destroy_qp; } } out: return ret; +err_destroy_qp: + rdma_destroy_qp(queue->cm_id); err_destroy_cq: ib_free_cq(queue->cq); goto out; From 59e29ce66bc52ebd6d0cb450f13079c7e913430d Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 29 Jun 2018 16:50:00 -0600 Subject: [PATCH 09/19] nvme: cache struct nvme_ctrl reference to struct nvme_request We will need to reference the controller at setup and completion time for tracing and future traffic-based keep alive support. Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 1 + drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/pci.c | 2 ++ drivers/nvme/host/rdma.c | 1 + drivers/nvme/target/loop.c | 1 + 5 files changed, 6 insertions(+) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 41d45a1b5c62..9cc33752539a 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1737,6 +1737,7 @@ nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq, int queue_idx = (set == &ctrl->tag_set) ?
hctx_idx + 1 : 0; struct nvme_fc_queue *queue = &ctrl->queues[queue_idx]; + nvme_req(rq)->ctrl = &ctrl->ctrl; return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 0c4a33df3b2f..f2249387b60d 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -102,6 +102,7 @@ struct nvme_request { u8 retries; u8 flags; u16 status; + struct nvme_ctrl *ctrl; }; /* diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ba943f211687..8dcae11bbf3a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -418,6 +418,8 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, BUG_ON(!nvmeq); iod->nvmeq = nvmeq; + + nvme_req(req)->ctrl = &dev->ctrl; return 0; } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 363f73fe549c..bb6e26fa8331 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -288,6 +288,7 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set, struct ib_device *ibdev = dev->dev; int ret; + nvme_req(rq)->ctrl = &ctrl->ctrl; ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command), DMA_TO_DEVICE); if (ret) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index d8d91f04bd7e..af7fbf4132b0 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -227,6 +227,7 @@ static int nvme_loop_init_request(struct blk_mq_tag_set *set, { struct nvme_loop_ctrl *ctrl = set->driver_data; + nvme_req(req)->ctrl = &ctrl->ctrl; return nvme_loop_init_iod(ctrl, blk_mq_rq_to_pdu(req), (set == &ctrl->tag_set) ? hctx_idx + 1 : 0); } From 5d87eb94d9ba13e5e2d5ceb56ac6fe0948259ffa Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 29 Jun 2018 16:50:01 -0600 Subject: [PATCH 10/19] nvme: use hw qid in trace events We cannot match a command to its completion based on the command id alone. We need the submitting queue identifier to pair with the completion, so this patch adds that to the trace buffer. This patch also collapses the admin and IO submission traces into a single one, so we don't need to duplicate the code or create unnecessary branches: we know whether the command is admin vs IO based on the qid. And since we're here, the patch fixes code formatting in the area.
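nvme_req_qid(), added to nvme.h below, derives the hardware queue from blk-mq's unique tag. A short sketch of the encoding it relies on (these are the real blk-mq helpers; BLK_MQ_UNIQUE_TAG_BITS is 16):

u32 unique = blk_mq_unique_tag(req);
u16 hwq = blk_mq_unique_tag_to_hwq(unique);     /* unique >> 16 */
u16 tag = blk_mq_unique_tag_to_tag(unique);     /* unique & 0xffff */
/* NVMe qid 0 is the admin queue, hence the +1 for I/O requests */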
Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn [hch: move the qid helper to nvme.h and made it an inline function] Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 5 +- drivers/nvme/host/nvme.h | 7 +++ drivers/nvme/host/trace.h | 114 +++++++++++++++----------------------- 3 files changed, 54 insertions(+), 72 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e541fe268bcf..e77e6418a21c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -652,10 +652,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, } cmd->common.command_id = req->tag; - if (ns) - trace_nvme_setup_nvm_cmd(req->q->id, cmd); - else - trace_nvme_setup_admin_cmd(cmd); + trace_nvme_setup_cmd(req, cmd); return ret; } EXPORT_SYMBOL_GPL(nvme_setup_cmd); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index f2249387b60d..4ad0c8ad2a27 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -120,6 +120,13 @@ static inline struct nvme_request *nvme_req(struct request *req) return blk_mq_rq_to_pdu(req); } +static inline u16 nvme_req_qid(struct request *req) +{ + if (!req->rq_disk) + return 0; + return blk_mq_unique_tag_to_hwq(blk_mq_unique_tag(req)) + 1; +} + /* The below value is the specific amount of delay needed before checking * readiness in case of the PCI_DEVICE(0x1c58, 0x0003), which needs the * NVME_QUIRK_DELAY_BEFORE_CHK_RDY quirk enabled. The value (in ms) was diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h index 01390f0e1671..e6362677447d 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/host/trace.h @@ -50,13 +50,8 @@ nvme_admin_opcode_name(nvme_admin_security_recv), \ nvme_admin_opcode_name(nvme_admin_sanitize_nvm)) -const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, - u8 *cdw10); -#define __parse_nvme_admin_cmd(opcode, cdw10) \ - nvme_trace_parse_admin_cmd(p, opcode, cdw10) - #define nvme_opcode_name(opcode) { opcode, #opcode } -#define show_opcode_name(val) \ +#define show_nvm_opcode_name(val) \ __print_symbolic(val, \ nvme_opcode_name(nvme_cmd_flush), \ nvme_opcode_name(nvme_cmd_write), \ @@ -70,83 +65,66 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, nvme_opcode_name(nvme_cmd_resv_acquire), \ nvme_opcode_name(nvme_cmd_resv_release)) +#define show_opcode_name(qid, opcode) \ + (qid ? show_nvm_opcode_name(opcode) : show_admin_opcode_name(opcode)) + +const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, + u8 *cdw10); const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, - u8 *cdw10); -#define __parse_nvme_cmd(opcode, cdw10) \ - nvme_trace_parse_nvm_cmd(p, opcode, cdw10) + u8 *cdw10); -TRACE_EVENT(nvme_setup_admin_cmd, - TP_PROTO(struct nvme_command *cmd), - TP_ARGS(cmd), +#define parse_nvme_cmd(qid, opcode, cdw10) \ + (qid ? 
\ + nvme_trace_parse_nvm_cmd(p, opcode, cdw10) : \ + nvme_trace_parse_admin_cmd(p, opcode, cdw10)) + +TRACE_EVENT(nvme_setup_cmd, + TP_PROTO(struct request *req, struct nvme_command *cmd), + TP_ARGS(req, cmd), TP_STRUCT__entry( - __field(u8, opcode) - __field(u8, flags) - __field(u16, cid) - __field(u64, metadata) - __array(u8, cdw10, 24) + __field(int, qid) + __field(u8, opcode) + __field(u8, flags) + __field(u16, cid) + __field(u32, nsid) + __field(u64, metadata) + __array(u8, cdw10, 24) ), TP_fast_assign( - __entry->opcode = cmd->common.opcode; - __entry->flags = cmd->common.flags; - __entry->cid = cmd->common.command_id; - __entry->metadata = le64_to_cpu(cmd->common.metadata); - memcpy(__entry->cdw10, cmd->common.cdw10, - sizeof(__entry->cdw10)); + __entry->qid = nvme_req_qid(req); + __entry->opcode = cmd->common.opcode; + __entry->flags = cmd->common.flags; + __entry->cid = cmd->common.command_id; + __entry->nsid = le32_to_cpu(cmd->common.nsid); + __entry->metadata = le64_to_cpu(cmd->common.metadata); + memcpy(__entry->cdw10, cmd->common.cdw10, + sizeof(__entry->cdw10)); ), - TP_printk(" cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", - __entry->cid, __entry->flags, __entry->metadata, - show_admin_opcode_name(__entry->opcode), - __parse_nvme_admin_cmd(__entry->opcode, __entry->cdw10)) -); - - -TRACE_EVENT(nvme_setup_nvm_cmd, - TP_PROTO(int qid, struct nvme_command *cmd), - TP_ARGS(qid, cmd), - TP_STRUCT__entry( - __field(int, qid) - __field(u8, opcode) - __field(u8, flags) - __field(u16, cid) - __field(u32, nsid) - __field(u64, metadata) - __array(u8, cdw10, 24) - ), - TP_fast_assign( - __entry->qid = qid; - __entry->opcode = cmd->common.opcode; - __entry->flags = cmd->common.flags; - __entry->cid = cmd->common.command_id; - __entry->nsid = le32_to_cpu(cmd->common.nsid); - __entry->metadata = le64_to_cpu(cmd->common.metadata); - memcpy(__entry->cdw10, cmd->common.cdw10, - sizeof(__entry->cdw10)); - ), - TP_printk("qid=%d, nsid=%u, cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", - __entry->qid, __entry->nsid, __entry->cid, + TP_printk("qid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", + __entry->qid, __entry->cid, __entry->nsid, __entry->flags, __entry->metadata, - show_opcode_name(__entry->opcode), - __parse_nvme_cmd(__entry->opcode, __entry->cdw10)) + show_opcode_name(__entry->qid, __entry->opcode), + parse_nvme_cmd(__entry->qid, __entry->opcode, __entry->cdw10)) ); TRACE_EVENT(nvme_complete_rq, TP_PROTO(struct request *req), TP_ARGS(req), TP_STRUCT__entry( - __field(int, qid) - __field(int, cid) - __field(u64, result) - __field(u8, retries) - __field(u8, flags) - __field(u16, status) + __field(int, qid) + __field(int, cid) + __field(u64, result) + __field(u8, retries) + __field(u8, flags) + __field(u16, status) ), TP_fast_assign( - __entry->qid = req->q->id; - __entry->cid = req->tag; - __entry->result = le64_to_cpu(nvme_req(req)->result.u64); - __entry->retries = nvme_req(req)->retries; - __entry->flags = nvme_req(req)->flags; - __entry->status = nvme_req(req)->status; + __entry->qid = nvme_req_qid(req); + __entry->cid = req->tag; + __entry->result = le64_to_cpu(nvme_req(req)->result.u64); + __entry->retries = nvme_req(req)->retries; + __entry->flags = nvme_req(req)->flags; + __entry->status = nvme_req(req)->status; ), TP_printk("qid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u", __entry->qid, __entry->cid, __entry->result, From b80a55e246a1b817cb254d79d077f364a2419578 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 2 Jul 2018 09:15:03 -0600 
Subject: [PATCH 11/19] nvme: add controller name to trace events This appends the controller instance to the nvme trace buffer to distinguish which controller is dispatching and completing a command. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/trace.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h index e6362677447d..35b8c72478d5 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/host/trace.h @@ -82,6 +82,7 @@ TRACE_EVENT(nvme_setup_cmd, TP_PROTO(struct request *req, struct nvme_command *cmd), TP_ARGS(req, cmd), TP_STRUCT__entry( + __field(int, ctrl_id) __field(int, qid) __field(u8, opcode) __field(u8, flags) @@ -91,6 +92,7 @@ TRACE_EVENT(nvme_setup_cmd, __array(u8, cdw10, 24) ), TP_fast_assign( + __entry->ctrl_id = nvme_req(req)->ctrl->instance; __entry->qid = nvme_req_qid(req); __entry->opcode = cmd->common.opcode; __entry->flags = cmd->common.flags; @@ -100,9 +102,9 @@ TRACE_EVENT(nvme_setup_cmd, memcpy(__entry->cdw10, cmd->common.cdw10, sizeof(__entry->cdw10)); ), - TP_printk("qid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", - __entry->qid, __entry->cid, __entry->nsid, - __entry->flags, __entry->metadata, + TP_printk("nvme%d: qid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", + __entry->ctrl_id, __entry->qid, __entry->cid, + __entry->nsid, __entry->flags, __entry->metadata, show_opcode_name(__entry->qid, __entry->opcode), parse_nvme_cmd(__entry->qid, __entry->opcode, __entry->cdw10)) ); @@ -111,6 +113,7 @@ TRACE_EVENT(nvme_complete_rq, TP_PROTO(struct request *req), TP_ARGS(req), TP_STRUCT__entry( + __field(int, ctrl_id) __field(int, qid) __field(int, cid) __field(u64, result) @@ -119,6 +122,7 @@ TRACE_EVENT(nvme_complete_rq, __field(u16, status) ), TP_fast_assign( + __entry->ctrl_id = nvme_req(req)->ctrl->instance; __entry->qid = nvme_req_qid(req); __entry->cid = req->tag; __entry->result = le64_to_cpu(nvme_req(req)->result.u64); @@ -126,9 +130,10 @@ TRACE_EVENT(nvme_complete_rq, __entry->flags = nvme_req(req)->flags; __entry->status = nvme_req(req)->status; ), - TP_printk("qid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u", - __entry->qid, __entry->cid, __entry->result, - __entry->retries, __entry->flags, __entry->status) + TP_printk("nvme%d: qid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u", + __entry->ctrl_id, __entry->qid, __entry->cid, + __entry->result, __entry->retries, __entry->flags, + __entry->status) ); From 6268953e8977a23ca7512a2921e82a5d9252ec01 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 29 Jun 2018 16:50:03 -0600 Subject: [PATCH 12/19] nvme: add disk name to trace events This will print the disk name to the nvme event trace for io requests so a user can better distinguish traffic to different disks. This can be used to create disk based filters. 
For example, to see only nvme0n2 traffic: echo "disk == \"nvme0n2\"" > /sys/kernel/debug/tracing/events/nvme/filter Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn [hch: turned __assign_disk_name into an inline function] Signed-off-by: Christoph Hellwig --- drivers/nvme/host/trace.c | 11 +++++++++++ drivers/nvme/host/trace.h | 33 ++++++++++++++++++++++++++------- 2 files changed, 37 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 41944bbef835..25b0e310f4a8 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -128,3 +128,14 @@ const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, return nvme_trace_common(p, cdw10); } } + +const char *nvme_trace_disk_name(struct trace_seq *p, char *name) +{ + const char *ret = trace_seq_buffer_ptr(p); + + if (*name) + trace_seq_printf(p, "disk=%s, ", name); + trace_seq_putc(p, 0); + + return ret; +} diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h index 35b8c72478d5..a490790d6691 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/host/trace.h @@ -78,10 +78,25 @@ const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, nvme_trace_parse_nvm_cmd(p, opcode, cdw10) : \ nvme_trace_parse_admin_cmd(p, opcode, cdw10)) +const char *nvme_trace_disk_name(struct trace_seq *p, char *name); +#define __print_disk_name(name) \ + nvme_trace_disk_name(p, name) + +#ifndef TRACE_HEADER_MULTI_READ +static inline void __assign_disk_name(char *name, struct gendisk *disk) +{ + if (disk) + memcpy(name, disk->disk_name, DISK_NAME_LEN); + else + memset(name, 0, DISK_NAME_LEN); +} +#endif + TRACE_EVENT(nvme_setup_cmd, TP_PROTO(struct request *req, struct nvme_command *cmd), TP_ARGS(req, cmd), TP_STRUCT__entry( + __array(char, disk, DISK_NAME_LEN) __field(int, ctrl_id) __field(int, qid) __field(u8, opcode) @@ -99,12 +114,14 @@ TRACE_EVENT(nvme_setup_cmd, __entry->cid = cmd->common.command_id; __entry->nsid = le32_to_cpu(cmd->common.nsid); __entry->metadata = le64_to_cpu(cmd->common.metadata); + __assign_disk_name(__entry->disk, req->rq_disk); memcpy(__entry->cdw10, cmd->common.cdw10, sizeof(__entry->cdw10)); ), - TP_printk("nvme%d: qid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", - __entry->ctrl_id, __entry->qid, __entry->cid, - __entry->nsid, __entry->flags, __entry->metadata, + TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", + __entry->ctrl_id, __print_disk_name(__entry->disk), + __entry->qid, __entry->cid, __entry->nsid, + __entry->flags, __entry->metadata, show_opcode_name(__entry->qid, __entry->opcode), parse_nvme_cmd(__entry->qid, __entry->opcode, __entry->cdw10)) ); @@ -113,6 +130,7 @@ TRACE_EVENT(nvme_complete_rq, TP_PROTO(struct request *req), TP_ARGS(req), TP_STRUCT__entry( + __array(char, disk, DISK_NAME_LEN) __field(int, ctrl_id) __field(int, qid) __field(int, cid) @@ -129,11 +147,12 @@ TRACE_EVENT(nvme_complete_rq, __entry->retries = nvme_req(req)->retries; __entry->flags = nvme_req(req)->flags; __entry->status = nvme_req(req)->status; + __assign_disk_name(__entry->disk, req->rq_disk); ), - TP_printk("nvme%d: qid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u", - __entry->ctrl_id, __entry->qid, __entry->cid, - __entry->result, __entry->retries, __entry->flags, - __entry->status) + TP_printk("nvme%d: %sqid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u", + __entry->ctrl_id, __print_disk_name(__entry->disk), + __entry->qid, __entry->cid, 
__entry->result, + __entry->retries, __entry->flags, __entry->status) ); From 249090f9016b7d68a18fc4c79c42accca18d6961 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 5 Jul 2018 08:14:00 -0500 Subject: [PATCH 13/19] nvme-rdma: mark expected switch fall-through In preparation for enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index bb6e26fa8331..2d4a51a80e8f 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1591,6 +1591,7 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: nvme_rdma_destroy_queue_ib(queue); + /* fall through */ case RDMA_CM_EVENT_ADDR_ERROR: dev_dbg(queue->ctrl->ctrl.device, "CM error event %d\n", ev->event); From 90140624e8face94207003ac9a9d2a329b309d68 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 9 Jul 2018 12:49:05 +0300 Subject: [PATCH 14/19] nvme-rdma: unquiesce queues when deleting the controller If the controller is going away, we need to unquiesce the IO queues so that all pending requests can fail gracefully before moving forward with controller deletion. Do that before we destroy the IO queues so blk_cleanup_queue won't block in freeze. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 2d4a51a80e8f..2b683b8d4763 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1759,6 +1759,8 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) nvme_rdma_stop_io_queues(ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, &ctrl->ctrl); + if (shutdown) + nvme_start_queues(&ctrl->ctrl); nvme_rdma_destroy_io_queues(ctrl, shutdown); } From c66e2998c8ca4d5da85d4915612dca29e054ad21 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 9 Jul 2018 12:49:06 +0300 Subject: [PATCH 15/19] nvme-rdma: centralize controller setup sequence Centralize the controller setup sequence into a single routine that correctly cleans up after failures, instead of having multiple appearances in several flows (create, reset, reconnect). One thing we also gain here is that the sanity/boundary checks now also run when connecting back to a dynamic controller.
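The resulting flow can be summarized by the following condensed sketch (not the literal patch; error labels unwind in reverse order of setup):

static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
{
        int ret;

        ret = nvme_rdma_configure_admin_queue(ctrl, new);
        if (ret)
                return ret;

        ret = -EINVAL;
        /* boundary checks now run on every path, reconnect included */
        if (ctrl->ctrl.icdoff || !(ctrl->ctrl.sgls & (1 << 2)))
                goto destroy_admin;

        if (ctrl->ctrl.queue_count > 1) {
                ret = nvme_rdma_configure_io_queues(ctrl, new);
                if (ret)
                        goto destroy_admin;
        }

        ret = -EINVAL;
        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE))
                goto destroy_io;

        nvme_start_ctrl(&ctrl->ctrl);
        return 0;

destroy_io:
        if (ctrl->ctrl.queue_count > 1)
                nvme_rdma_destroy_io_queues(ctrl, new);
destroy_admin:
        nvme_rdma_stop_queue(&ctrl->queues[0]);
        nvme_rdma_destroy_admin_queue(ctrl, new);
        return ret;
}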
Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 130 ++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 77 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 2b683b8d4763..c22125c5661b 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -917,24 +917,44 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl) } } -static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) +static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) { - struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work), - struct nvme_rdma_ctrl, reconnect_work); + int ret = -EINVAL; bool changed; - int ret; - ++ctrl->ctrl.nr_reconnects; - - ret = nvme_rdma_configure_admin_queue(ctrl, false); + ret = nvme_rdma_configure_admin_queue(ctrl, new); if (ret) - goto requeue; + return ret; + + if (ctrl->ctrl.icdoff) { + dev_err(ctrl->ctrl.device, "icdoff is not supported!\n"); + goto destroy_admin; + } + + if (!(ctrl->ctrl.sgls & (1 << 2))) { + dev_err(ctrl->ctrl.device, + "Mandatory keyed sgls are not supported!\n"); + goto destroy_admin; + } + + if (ctrl->ctrl.opts->queue_size > ctrl->ctrl.sqsize + 1) { + dev_warn(ctrl->ctrl.device, + "queue_size %zu > ctrl sqsize %u, clamping down\n", + ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1); + } + + if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) { + dev_warn(ctrl->ctrl.device, + "sqsize %u > ctrl maxcmd %u, clamping down\n", + ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd); + ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1; + } if (ctrl->ctrl.sgls & (1 << 20)) ctrl->use_inline_data = true; if (ctrl->ctrl.queue_count > 1) { - ret = nvme_rdma_configure_io_queues(ctrl, false); + ret = nvme_rdma_configure_io_queues(ctrl, new); if (ret) goto destroy_admin; } @@ -943,10 +963,31 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) if (!changed) { /* state change failure is ok if we're in DELETING state */ WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); - return; + ret = -EINVAL; + goto destroy_io; } nvme_start_ctrl(&ctrl->ctrl); + return 0; + +destroy_io: + if (ctrl->ctrl.queue_count > 1) + nvme_rdma_destroy_io_queues(ctrl, new); +destroy_admin: + nvme_rdma_stop_queue(&ctrl->queues[0]); + nvme_rdma_destroy_admin_queue(ctrl, new); + return ret; +} + +static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvme_rdma_ctrl, reconnect_work); + + ++ctrl->ctrl.nr_reconnects; + + if (nvme_rdma_setup_ctrl(ctrl, false)) + goto requeue; dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n", ctrl->ctrl.nr_reconnects); @@ -955,9 +996,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) return; -destroy_admin: - nvme_rdma_stop_queue(&ctrl->queues[0]); - nvme_rdma_destroy_admin_queue(ctrl, false); requeue: dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n", ctrl->ctrl.nr_reconnects); @@ -1786,8 +1824,6 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) { struct nvme_rdma_ctrl *ctrl = container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work); - int ret; - bool changed; nvme_stop_ctrl(&ctrl->ctrl); nvme_rdma_shutdown_ctrl(ctrl, false); @@ -1798,25 +1834,9 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) return; } - ret = nvme_rdma_configure_admin_queue(ctrl, false); - if (ret) + if (nvme_rdma_setup_ctrl(ctrl, false)) goto out_fail; - if 
(ctrl->ctrl.queue_count > 1) { - ret = nvme_rdma_configure_io_queues(ctrl, false); - if (ret) - goto out_fail; - } - - changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - if (!changed) { - /* state change failure is ok if we're in DELETING state */ - WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); - return; - } - - nvme_start_ctrl(&ctrl->ctrl); - return; out_fail: @@ -1979,49 +1999,10 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING); WARN_ON_ONCE(!changed); - ret = nvme_rdma_configure_admin_queue(ctrl, true); + ret = nvme_rdma_setup_ctrl(ctrl, true); if (ret) goto out_uninit_ctrl; - /* sanity check icdoff */ - if (ctrl->ctrl.icdoff) { - dev_err(ctrl->ctrl.device, "icdoff is not supported!\n"); - ret = -EINVAL; - goto out_remove_admin_queue; - } - - /* sanity check keyed sgls */ - if (!(ctrl->ctrl.sgls & (1 << 2))) { - dev_err(ctrl->ctrl.device, - "Mandatory keyed sgls are not supported!\n"); - ret = -EINVAL; - goto out_remove_admin_queue; - } - - /* only warn if argument is too large here, will clamp later */ - if (opts->queue_size > ctrl->ctrl.sqsize + 1) { - dev_warn(ctrl->ctrl.device, - "queue_size %zu > ctrl sqsize %u, clamping down\n", - opts->queue_size, ctrl->ctrl.sqsize + 1); - } - - /* warn if maxcmd is lower than sqsize+1 */ - if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) { - dev_warn(ctrl->ctrl.device, - "sqsize %u > ctrl maxcmd %u, clamping down\n", - ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd); - ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1; - } - - if (opts->nr_io_queues) { - ret = nvme_rdma_configure_io_queues(ctrl, true); - if (ret) - goto out_remove_admin_queue; - } - - changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - WARN_ON_ONCE(!changed); - dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n", ctrl->ctrl.opts->subsysnqn, &ctrl->addr); @@ -2031,13 +2012,8 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); mutex_unlock(&nvme_rdma_ctrl_mutex); - nvme_start_ctrl(&ctrl->ctrl); - return &ctrl->ctrl; -out_remove_admin_queue: - nvme_rdma_stop_queue(&ctrl->queues[0]); - nvme_rdma_destroy_admin_queue(ctrl, true); out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); nvme_put_ctrl(&ctrl->ctrl); From 75862c72323e222656792370e2f240bc4029ff96 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 9 Jul 2018 12:49:07 +0300 Subject: [PATCH 16/19] nvme-rdma: centralize admin/io queue teardown sequence We follow the same queue teardown sequence in delete, reset and error recovery. Centralize the logic. This patch does not change any functionality. 
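The shared order matters; annotated as a sketch (condensed from the new helpers in this patch, where "remove" is true on delete/shutdown):

nvme_stop_queues(&ctrl->ctrl);          /* quiesce: block new submissions */
nvme_rdma_stop_io_queues(ctrl);         /* disconnect and drain the QPs */
blk_mq_tagset_busy_iter(&ctrl->tag_set,
                nvme_cancel_request, &ctrl->ctrl);      /* fail inflight I/O */
if (remove)
        nvme_start_queues(&ctrl->ctrl); /* unquiesce so queued I/O fails
                                         * fast instead of blocking
                                         * blk_cleanup_queue() later */
nvme_rdma_destroy_io_queues(ctrl, remove);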
Signed-off-by: Sagi Grimberg
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/rdma.c | 66 ++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 37 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index c22125c5661b..13a6064e4794 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -873,6 +873,31 @@ out_free_io_queues:
 	return ret;
 }
 
+static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
+		bool remove)
+{
+	blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+	nvme_rdma_stop_queue(&ctrl->queues[0]);
+	blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request,
+			&ctrl->ctrl);
+	blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+	nvme_rdma_destroy_admin_queue(ctrl, remove);
+}
+
+static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
+		bool remove)
+{
+	if (ctrl->ctrl.queue_count > 1) {
+		nvme_stop_queues(&ctrl->ctrl);
+		nvme_rdma_stop_io_queues(ctrl);
+		blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request,
+				&ctrl->ctrl);
+		if (remove)
+			nvme_start_queues(&ctrl->ctrl);
+		nvme_rdma_destroy_io_queues(ctrl, remove);
+	}
+}
+
 static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
 {
 	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
@@ -1008,27 +1033,9 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
 			struct nvme_rdma_ctrl, err_work);
 
 	nvme_stop_keep_alive(&ctrl->ctrl);
-
-	if (ctrl->ctrl.queue_count > 1) {
-		nvme_stop_queues(&ctrl->ctrl);
-		nvme_rdma_stop_io_queues(ctrl);
-		blk_mq_tagset_busy_iter(&ctrl->tag_set,
-				nvme_cancel_request, &ctrl->ctrl);
-		nvme_rdma_destroy_io_queues(ctrl, false);
-	}
-
-	blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
-	nvme_rdma_stop_queue(&ctrl->queues[0]);
-	blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
-			nvme_cancel_request, &ctrl->ctrl);
-	nvme_rdma_destroy_admin_queue(ctrl, false);
-
-	/*
-	 * queues are not a live anymore, so restart the queues to fail fast
-	 * new IO
-	 */
-	blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+	nvme_rdma_teardown_io_queues(ctrl, false);
 	nvme_start_queues(&ctrl->ctrl);
+	nvme_rdma_teardown_admin_queue(ctrl, false);
 
 	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
 		/* state change failure is ok if we're in DELETING state */
@@ -1792,27 +1799,12 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
 
 static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
 {
-	if (ctrl->ctrl.queue_count > 1) {
-		nvme_stop_queues(&ctrl->ctrl);
-		nvme_rdma_stop_io_queues(ctrl);
-		blk_mq_tagset_busy_iter(&ctrl->tag_set,
-				nvme_cancel_request, &ctrl->ctrl);
-		if (shutdown)
-			nvme_start_queues(&ctrl->ctrl);
-		nvme_rdma_destroy_io_queues(ctrl, shutdown);
-	}
-
+	nvme_rdma_teardown_io_queues(ctrl, shutdown);
 	if (shutdown)
 		nvme_shutdown_ctrl(&ctrl->ctrl);
 	else
 		nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
-
-	blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
-	nvme_rdma_stop_queue(&ctrl->queues[0]);
-	blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
-			nvme_cancel_request, &ctrl->ctrl);
-	blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
-	nvme_rdma_destroy_admin_queue(ctrl, shutdown);
+	nvme_rdma_teardown_admin_queue(ctrl, shutdown);
 }
 
 static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl)

From 1b72b71faccee986e2128a271125177dfe91f7b7 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Wed, 11 Jul 2018 12:43:16 +0300
Subject: [PATCH 17/19] nvmet: fix file discard return status

If nvmet_copy_from_sgl fails, we falsely return a successful
completion status.
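The corrected loop has roughly this shape (a standalone sketch with
made-up stubs, not the nvmet code itself; the status values mirror the
kernel's NVME_SC_INTERNAL and NVME_SC_DNR definitions):

	#include <stdint.h>
	#include <stdio.h>

	#define SC_INTERNAL	0x0006
	#define SC_DNR		0x4000

	/* Stubs: copy_range() returns an NVMe status, punch_hole() an errno-style rc. */
	static uint16_t copy_range(int i) { return i == 2 ? SC_INTERNAL : 0; }
	static int punch_hole(int i)      { return 0; }

	int main(void)
	{
		uint16_t status = 0;
		int i;

		for (i = 0; i < 4; i++) {
			status = copy_range(i);	/* already an NVMe status code */
			if (status)
				break;		/* propagate it, don't drop it */
			if (punch_hole(i)) {
				status = SC_INTERNAL | SC_DNR;
				break;
			}
		}
		printf("complete with status 0x%04x\n", (unsigned)status);
		return 0;
	}

Keeping the accumulator as a u16 NVMe status means the completion call
reports exactly what went wrong, instead of masking a copy failure as
success.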
Fixes: d5eff33ee6f8 ("nvmet: add simple file backed ns support")
Signed-off-by: Sagi Grimberg
Reviewed-by: Chaitanya Kulkarni
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/target/io-cmd-file.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
index 57c660e3245d..dad8d44bf90e 100644
--- a/drivers/nvme/target/io-cmd-file.c
+++ b/drivers/nvme/target/io-cmd-file.c
@@ -227,22 +227,24 @@ static void nvmet_file_execute_discard(struct nvmet_req *req)
 {
 	int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
 	struct nvme_dsm_range range;
-	loff_t offset;
-	loff_t len;
-	int i, ret;
+	loff_t offset, len;
+	u16 ret;
+	int i;
 
 	for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) {
-		if (nvmet_copy_from_sgl(req, i * sizeof(range), &range,
-				sizeof(range)))
+		ret = nvmet_copy_from_sgl(req, i * sizeof(range), &range,
+				sizeof(range));
+		if (ret)
 			break;
 		offset = le64_to_cpu(range.slba) << req->ns->blksize_shift;
 		len = le32_to_cpu(range.nlb) << req->ns->blksize_shift;
-		ret = vfs_fallocate(req->ns->file, mode, offset, len);
-		if (ret)
+		if (vfs_fallocate(req->ns->file, mode, offset, len)) {
+			ret = NVME_SC_INTERNAL | NVME_SC_DNR;
 			break;
+		}
 	}
 
-	nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
+	nvmet_req_complete(req, ret);
 }
 
 static void nvmet_file_dsm_work(struct work_struct *w)

From 9c891c139894ce2ec0ca585a00e0bec5e6b4ccab Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Wed, 11 Jul 2018 16:13:04 +0300
Subject: [PATCH 18/19] nvmet: check fileio lba range access boundaries

Fail out-of-bounds LBA range accesses with a proper status code.

Fixes: d5eff33ee6f8 ("nvmet: add simple file backed ns support")
Signed-off-by: Sagi Grimberg
Reviewed-by: Chaitanya Kulkarni
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/target/io-cmd-file.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
index dad8d44bf90e..c2d0d08b59c8 100644
--- a/drivers/nvme/target/io-cmd-file.c
+++ b/drivers/nvme/target/io-cmd-file.c
@@ -145,6 +145,12 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)
 		return;
 	}
 
+	pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift;
+	if (unlikely(pos + req->data_len > req->ns->size)) {
+		nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR);
+		return;
+	}
+
 	if (nr_bvec > NVMET_MAX_INLINE_BIOVEC)
 		req->f.bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
 				GFP_KERNEL);
@@ -160,8 +166,6 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)
 		is_sync = true;
 	}
 
-	pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift;
-
 	memset(&req->f.iocb, 0, sizeof(struct kiocb));
 	for_each_sg_page(req->sg, &sg_pg_iter, req->sg_cnt, 0) {
 		nvmet_file_init_bvec(&req->f.bvec[bv_cnt], &sg_pg_iter);
@@ -236,8 +240,14 @@ static void nvmet_file_execute_discard(struct nvmet_req *req)
 				sizeof(range));
 		if (ret)
 			break;
+
 		offset = le64_to_cpu(range.slba) << req->ns->blksize_shift;
 		len = le32_to_cpu(range.nlb) << req->ns->blksize_shift;
+		if (offset + len > req->ns->size) {
+			ret = NVME_SC_LBA_RANGE | NVME_SC_DNR;
+			break;
+		}
+
 		if (vfs_fallocate(req->ns->file, mode, offset, len)) {
 			ret = NVME_SC_INTERNAL | NVME_SC_DNR;
 			break;
@@ -283,6 +293,11 @@ static void nvmet_file_write_zeroes_work(struct work_struct *w)
 	len = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) <<
 			req->ns->blksize_shift);
 
+	if (unlikely(offset + len > req->ns->size)) {
+		nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR);
+		return;
+	}
+
 	ret = vfs_fallocate(req->ns->file, mode, offset, len);
 	nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
 }

From 1b0d274523df5ef1caedc834da055ff721e4d4f0 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Tue, 17 Jul 2018 17:17:36 +0300
Subject: [PATCH 19/19] nvmet: don't use uuid_le type

Don't use sizeof(uuid_le) where none of the parameters has the type
uuid_le. Since both arguments are u8[16], use the size of the
destination instead. Moreover, uuid_le is a deprecated type, and nvmet
already uses uuid_t.

Signed-off-by: Andy Shevchenko
Reviewed-by: Johannes Thumshirn
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/target/admin-cmd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 837bbdbfaa4b..16a9b24270f9 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -338,7 +338,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
 	 */
 	id->nmic = (1 << 0);
 
-	memcpy(&id->nguid, &ns->nguid, sizeof(uuid_le));
+	memcpy(&id->nguid, &ns->nguid, sizeof(id->nguid));
 
 	id->lbaf[0].ds = ns->blksize_shift;
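For reference, a standalone sketch of why sizing the copy off the
destination is the safer idiom (the struct and field names below are
invented for illustration, not the nvmet definitions):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	struct id_ns { uint8_t nguid[16]; };
	struct ns    { uint8_t nguid[16]; };

	int main(void)
	{
		struct ns ns = { .nguid = { 0xde, 0xad, 0xbe, 0xef } };
		struct id_ns id;

		/*
		 * The length is tied to the destination buffer, so the copy
		 * can never overrun id.nguid, even if an unrelated type
		 * (like uuid_le) changes size one day.
		 */
		memcpy(id.nguid, ns.nguid, sizeof(id.nguid));
		printf("copied %zu bytes, nguid[0]=0x%02x\n",
		       sizeof(id.nguid), id.nguid[0]);
		return 0;
	}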