From 9b4b7c1f9f54120940e243251e2b1407767b3381 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 28 Apr 2023 12:13:22 -0500 Subject: [PATCH 01/71] RDMA/rxe: Add workqueue support for rxe tasks Replace tasklets by work queues for the three main rxe tasklets: rxe_requester, rxe_completer and rxe_responder. work queues are a more modern way to process work from an IRQ and provide more control over how that work is run for future patches. Link: https://lore.kernel.org/r/20230428171321.5774-1-rpearsonhpe@gmail.com Signed-off-by: Ian Ziemba Signed-off-by: Bob Pearson Reviewed-by: Daisuke Matsuda Tested-by: Daisuke Matsuda Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.c | 9 ++- drivers/infiniband/sw/rxe/rxe_task.c | 110 +++++++++++++++------------ drivers/infiniband/sw/rxe/rxe_task.h | 6 +- 3 files changed, 76 insertions(+), 49 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 7a7e713de52d..54c723a6edda 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -212,10 +212,16 @@ static int __init rxe_module_init(void) { int err; - err = rxe_net_init(); + err = rxe_alloc_wq(); if (err) return err; + err = rxe_net_init(); + if (err) { + rxe_destroy_wq(); + return err; + } + rdma_link_register(&rxe_link_ops); pr_info("loaded\n"); return 0; @@ -226,6 +232,7 @@ static void __exit rxe_module_exit(void) rdma_link_unregister(&rxe_link_ops); ib_unregister_driver(RDMA_DRIVER_RXE); rxe_net_exit(); + rxe_destroy_wq(); pr_info("unloaded\n"); } diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index fb9a6bc8e620..1501120d4f52 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -6,8 +6,24 @@ #include "rxe.h" +static struct workqueue_struct *rxe_wq; + +int rxe_alloc_wq(void) +{ + rxe_wq = alloc_workqueue("rxe_wq", WQ_UNBOUND, WQ_MAX_ACTIVE); + if (!rxe_wq) + return -ENOMEM; + + return 0; +} + +void rxe_destroy_wq(void) +{ + destroy_workqueue(rxe_wq); +} + /* Check if task is idle i.e. not running, not scheduled in - * tasklet queue and not draining. If so move to busy to + * work queue and not draining. If so move to busy to * reserve a slot in do_task() by setting to busy and taking * a qp reference to cover the gap from now until the task finishes. * state will move out of busy if task returns a non zero value @@ -21,9 +37,6 @@ static bool __reserve_if_idle(struct rxe_task *task) { WARN_ON(rxe_read(task->qp) <= 0); - if (task->tasklet.state & BIT(TASKLET_STATE_SCHED)) - return false; - if (task->state == TASK_STATE_IDLE) { rxe_get(task->qp); task->state = TASK_STATE_BUSY; @@ -38,7 +51,7 @@ static bool __reserve_if_idle(struct rxe_task *task) } /* check if task is idle or drained and not currently - * scheduled in the tasklet queue. This routine is + * scheduled in the work queue. This routine is * called by rxe_cleanup_task or rxe_disable_task to * see if the queue is empty. * Context: caller should hold task->lock. @@ -46,7 +59,7 @@ static bool __reserve_if_idle(struct rxe_task *task) */ static bool __is_done(struct rxe_task *task) { - if (task->tasklet.state & BIT(TASKLET_STATE_SCHED)) + if (work_pending(&task->work)) return false; if (task->state == TASK_STATE_IDLE || @@ -77,23 +90,23 @@ static bool is_done(struct rxe_task *task) * schedules the task. They must call __reserve_if_idle to * move the task to busy before calling or scheduling. 
* The task can also be moved to drained or invalid - * by calls to rxe-cleanup_task or rxe_disable_task. + * by calls to rxe_cleanup_task or rxe_disable_task. * In that case tasks which get here are not executed but * just flushed. The tasks are designed to look to see if - * there is work to do and do part of it before returning + * there is work to do and then do part of it before returning * here with a return value of zero until all the work - * has been consumed then it retuens a non-zero value. + * has been consumed then it returns a non-zero value. * The number of times the task can be run is limited by * max iterations so one task cannot hold the cpu forever. + * If the limit is hit and work remains the task is rescheduled. */ -static void do_task(struct tasklet_struct *t) +static void do_task(struct rxe_task *task) { - int cont; - int ret; - struct rxe_task *task = from_tasklet(task, t, tasklet); unsigned int iterations; unsigned long flags; int resched = 0; + int cont; + int ret; WARN_ON(rxe_read(task->qp) <= 0); @@ -115,25 +128,22 @@ static void do_task(struct tasklet_struct *t) } while (ret == 0 && iterations-- > 0); spin_lock_irqsave(&task->lock, flags); + /* we're not done yet but we ran out of iterations. + * yield the cpu and reschedule the task + */ + if (!ret) { + task->state = TASK_STATE_IDLE; + resched = 1; + goto exit; + } + switch (task->state) { case TASK_STATE_BUSY: - if (ret) { - task->state = TASK_STATE_IDLE; - } else { - /* This can happen if the client - * can add work faster than the - * tasklet can finish it. - * Reschedule the tasklet and exit - * the loop to give up the cpu - */ - task->state = TASK_STATE_IDLE; - resched = 1; - } + task->state = TASK_STATE_IDLE; break; - /* someone tried to run the task since the last time we called - * func, so we will call one more time regardless of the - * return value + /* someone tried to schedule the task while we + * were running, keep going */ case TASK_STATE_ARMED: task->state = TASK_STATE_BUSY; @@ -141,22 +151,24 @@ static void do_task(struct tasklet_struct *t) break; case TASK_STATE_DRAINING: - if (ret) - task->state = TASK_STATE_DRAINED; - else - cont = 1; + task->state = TASK_STATE_DRAINED; break; default: WARN_ON(1); - rxe_info_qp(task->qp, "unexpected task state = %d", task->state); + rxe_dbg_qp(task->qp, "unexpected task state = %d", + task->state); + task->state = TASK_STATE_IDLE; } +exit: if (!cont) { task->num_done++; if (WARN_ON(task->num_done != task->num_sched)) - rxe_err_qp(task->qp, "%ld tasks scheduled, %ld tasks done", - task->num_sched, task->num_done); + rxe_dbg_qp( + task->qp, + "%ld tasks scheduled, %ld tasks done", + task->num_sched, task->num_done); } spin_unlock_irqrestore(&task->lock, flags); } while (cont); @@ -169,6 +181,12 @@ static void do_task(struct tasklet_struct *t) rxe_put(task->qp); } +/* wrapper around do_task to fix argument for work queue */ +static void do_work(struct work_struct *work) +{ + do_task(container_of(work, struct rxe_task, work)); +} + int rxe_init_task(struct rxe_task *task, struct rxe_qp *qp, int (*func)(struct rxe_qp *)) { @@ -176,11 +194,9 @@ int rxe_init_task(struct rxe_task *task, struct rxe_qp *qp, task->qp = qp; task->func = func; - - tasklet_setup(&task->tasklet, do_task); - task->state = TASK_STATE_IDLE; spin_lock_init(&task->lock); + INIT_WORK(&task->work, do_work); return 0; } @@ -213,8 +229,6 @@ void rxe_cleanup_task(struct rxe_task *task) while (!is_done(task)) cond_resched(); - tasklet_kill(&task->tasklet); - spin_lock_irqsave(&task->lock, flags); 
task->state = TASK_STATE_INVALID; spin_unlock_irqrestore(&task->lock, flags); @@ -226,7 +240,7 @@ void rxe_cleanup_task(struct rxe_task *task) void rxe_run_task(struct rxe_task *task) { unsigned long flags; - int run; + bool run; WARN_ON(rxe_read(task->qp) <= 0); @@ -235,11 +249,11 @@ void rxe_run_task(struct rxe_task *task) spin_unlock_irqrestore(&task->lock, flags); if (run) - do_task(&task->tasklet); + do_task(task); } -/* schedule the task to run later as a tasklet. - * the tasklet)schedule call can be called holding +/* schedule the task to run later as a work queue entry. + * the queue_work call can be called holding * the lock. */ void rxe_sched_task(struct rxe_task *task) @@ -250,7 +264,7 @@ void rxe_sched_task(struct rxe_task *task) spin_lock_irqsave(&task->lock, flags); if (__reserve_if_idle(task)) - tasklet_schedule(&task->tasklet); + queue_work(rxe_wq, &task->work); spin_unlock_irqrestore(&task->lock, flags); } @@ -277,7 +291,9 @@ void rxe_disable_task(struct rxe_task *task) while (!is_done(task)) cond_resched(); - tasklet_disable(&task->tasklet); + spin_lock_irqsave(&task->lock, flags); + task->state = TASK_STATE_DRAINED; + spin_unlock_irqrestore(&task->lock, flags); } void rxe_enable_task(struct rxe_task *task) @@ -291,7 +307,7 @@ void rxe_enable_task(struct rxe_task *task) spin_unlock_irqrestore(&task->lock, flags); return; } + task->state = TASK_STATE_IDLE; - tasklet_enable(&task->tasklet); spin_unlock_irqrestore(&task->lock, flags); } diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h index facb7c8e3729..a63e258b3d66 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.h +++ b/drivers/infiniband/sw/rxe/rxe_task.h @@ -22,7 +22,7 @@ enum { * called again. */ struct rxe_task { - struct tasklet_struct tasklet; + struct work_struct work; int state; spinlock_t lock; struct rxe_qp *qp; @@ -32,6 +32,10 @@ struct rxe_task { long num_done; }; +int rxe_alloc_wq(void); + +void rxe_destroy_wq(void); + /* * init rxe_task structure * qp => parameter to pass to func From ab4e8fc1746ffef46c6e0a1c254eeb1618d5ffb1 Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Mon, 15 May 2023 15:11:40 -0400 Subject: [PATCH 02/71] RDMA/irdma: Return void from irdma_init_iw_device() The return value from irdma_init_iw_device() is always 0 - change it to be void. 
Link: https://lore.kernel.org/r/20230515191142.413633-2-kheib@redhat.com Signed-off-by: Kamal Heib Reviewed-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/verbs.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index ab5cdf782785..baaef6ce195c 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -4515,7 +4515,7 @@ static void irdma_init_roce_device(struct irdma_device *iwdev) * irdma_init_iw_device - initialization of iwarp rdma device * @iwdev: irdma device */ -static int irdma_init_iw_device(struct irdma_device *iwdev) +static void irdma_init_iw_device(struct irdma_device *iwdev) { struct net_device *netdev = iwdev->netdev; @@ -4533,8 +4533,6 @@ static int irdma_init_iw_device(struct irdma_device *iwdev) memcpy(iwdev->ibdev.iw_ifname, netdev->name, sizeof(iwdev->ibdev.iw_ifname)); ib_set_device_ops(&iwdev->ibdev, &irdma_iw_dev_ops); - - return 0; } /** @@ -4544,15 +4542,12 @@ static int irdma_init_iw_device(struct irdma_device *iwdev) static int irdma_init_rdma_device(struct irdma_device *iwdev) { struct pci_dev *pcidev = iwdev->rf->pcidev; - int ret; - if (iwdev->roce_mode) { + if (iwdev->roce_mode) irdma_init_roce_device(iwdev); - } else { - ret = irdma_init_iw_device(iwdev); - if (ret) - return ret; - } + else + irdma_init_iw_device(iwdev); + iwdev->ibdev.phys_port_cnt = 1; iwdev->ibdev.num_comp_vectors = iwdev->rf->ceqs_count; iwdev->ibdev.dev.parent = &pcidev->dev; From bc89be9443afb2190dddfabce87f390cf521eceb Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Mon, 15 May 2023 15:11:41 -0400 Subject: [PATCH 03/71] RDMA/irdma: Return void from irdma_init_rdma_device() The return value from irdma_init_rdma_device() is always 0 - change it to be void. Link: https://lore.kernel.org/r/20230515191142.413633-3-kheib@redhat.com Signed-off-by: Kamal Heib Reviewed-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/verbs.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index baaef6ce195c..9ff06feda872 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -4539,7 +4539,7 @@ static void irdma_init_iw_device(struct irdma_device *iwdev) * irdma_init_rdma_device - initialization of rdma device * @iwdev: irdma device */ -static int irdma_init_rdma_device(struct irdma_device *iwdev) +static void irdma_init_rdma_device(struct irdma_device *iwdev) { struct pci_dev *pcidev = iwdev->rf->pcidev; @@ -4552,8 +4552,6 @@ static int irdma_init_rdma_device(struct irdma_device *iwdev) iwdev->ibdev.num_comp_vectors = iwdev->rf->ceqs_count; iwdev->ibdev.dev.parent = &pcidev->dev; ib_set_device_ops(&iwdev->ibdev, &irdma_dev_ops); - - return 0; } /** @@ -4591,9 +4589,7 @@ int irdma_ib_register_device(struct irdma_device *iwdev) { int ret; - ret = irdma_init_rdma_device(iwdev); - if (ret) - return ret; + irdma_init_rdma_device(iwdev); ret = ib_device_set_netdev(&iwdev->ibdev, iwdev->netdev, 1); if (ret) From a7dae5daf4bf50de01ebdd192bf52c2e8cd80c75 Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Mon, 15 May 2023 15:11:42 -0400 Subject: [PATCH 04/71] RDMA/irdma: Move iw device ops initialization Move the initialization of the iw device ops to be under the declaration of the irdma_iw_dev_ops. 
Link: https://lore.kernel.org/r/20230515191142.413633-4-kheib@redhat.com Signed-off-by: Kamal Heib Reviewed-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/verbs.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 9ff06feda872..6242ab6af77f 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -4450,8 +4450,16 @@ static const struct ib_device_ops irdma_roce_dev_ops = { }; static const struct ib_device_ops irdma_iw_dev_ops = { - .modify_qp = irdma_modify_qp, .get_port_immutable = irdma_iw_port_immutable, + .iw_accept = irdma_accept, + .iw_add_ref = irdma_qp_add_ref, + .iw_connect = irdma_connect, + .iw_create_listen = irdma_create_listen, + .iw_destroy_listen = irdma_destroy_listen, + .iw_get_qp = irdma_get_qp, + .iw_reject = irdma_reject, + .iw_rem_ref = irdma_qp_rem_ref, + .modify_qp = irdma_modify_qp, .query_gid = irdma_query_gid, }; @@ -4522,14 +4530,6 @@ static void irdma_init_iw_device(struct irdma_device *iwdev) iwdev->ibdev.node_type = RDMA_NODE_RNIC; addrconf_addr_eui48((u8 *)&iwdev->ibdev.node_guid, netdev->dev_addr); - iwdev->ibdev.ops.iw_add_ref = irdma_qp_add_ref; - iwdev->ibdev.ops.iw_rem_ref = irdma_qp_rem_ref; - iwdev->ibdev.ops.iw_get_qp = irdma_get_qp; - iwdev->ibdev.ops.iw_connect = irdma_connect; - iwdev->ibdev.ops.iw_accept = irdma_accept; - iwdev->ibdev.ops.iw_reject = irdma_reject; - iwdev->ibdev.ops.iw_create_listen = irdma_create_listen; - iwdev->ibdev.ops.iw_destroy_listen = irdma_destroy_listen; memcpy(iwdev->ibdev.iw_ifname, netdev->name, sizeof(iwdev->ibdev.iw_ifname)); ib_set_device_ops(&iwdev->ibdev, &irdma_iw_dev_ops); From 42b0a5e691087c7e7ccbb8da87d51b301a9ddeb1 Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Thu, 18 May 2023 16:00:27 +0900 Subject: [PATCH 05/71] RDMA/rxe: Fix comments about removed tasklets The commit 9b4b7c1f9f54 ("RDMA/rxe: Add workqueue support for rxe tasks") removed tasklets and replaced them with a workqueue, but relevant comments are still remaining in the source code. Fixes: 9b4b7c1f9f54 ("RDMA/rxe: Add workqueue support for rxe tasks") Link: https://lore.kernel.org/r/20230518070027.942715-1-matsuda-daisuke@fujitsu.com Signed-off-by: Daisuke Matsuda Reviewed-by: Bob Pearson Acked-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 2 +- drivers/infiniband/sw/rxe/rxe_param.h | 2 +- drivers/infiniband/sw/rxe/rxe_req.c | 2 +- drivers/infiniband/sw/rxe/rxe_resp.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index db18ace74d2b..0c0ae214c3a9 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -826,7 +826,7 @@ int rxe_completer(struct rxe_qp *qp) } /* A non-zero return value will cause rxe_do_task to - * exit its loop and end the tasklet. A zero return + * exit its loop and end the work item. 
A zero return * will continue looping and return to rxe_completer */ done: diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h index 7b41d79e72b2..d2f57ead78ad 100644 --- a/drivers/infiniband/sw/rxe/rxe_param.h +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -112,7 +112,7 @@ enum rxe_device_param { RXE_INFLIGHT_SKBS_PER_QP_HIGH = 64, RXE_INFLIGHT_SKBS_PER_QP_LOW = 16, - /* Max number of interations of each tasklet + /* Max number of interations of each work item * before yielding the cpu to let other * work make progress */ diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 65134a9aefe7..400840c913a9 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -853,7 +853,7 @@ int rxe_requester(struct rxe_qp *qp) update_state(qp, &pkt); /* A non-zero return value will cause rxe_do_task to - * exit its loop and end the tasklet. A zero return + * exit its loop and end the work item. A zero return * will continue looping and return to rxe_requester */ done: diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 68f6cd188d8e..b92c41cdb620 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -1654,7 +1654,7 @@ int rxe_responder(struct rxe_qp *qp) } /* A non-zero return value will cause rxe_do_task to - * exit its loop and end the tasklet. A zero return + * exit its loop and end the work item. A zero return * will continue looping and return to rxe_responder */ done: From ab112ee7899d6171da5acd77a7ed7ae103f488de Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Thu, 18 May 2023 23:48:11 -0700 Subject: [PATCH 06/71] RDMA/bnxt_re: Disable/kill tasklet only if it is enabled When the ulp hook to start the IRQ fails because the rings are not available, tasklets are not enabled. In this case when the driver is unloaded, driver calls CREQ tasklet_kill. This causes an indefinite hang as the tasklet is not enabled. Driver shouldn't call tasklet_kill if it is not enabled. So using the creq->requested and nq->requested flags to identify if both tasklets/irqs are registered. Checking this flag while scheduling the tasklet from ISR. Also, added a cleanup for disabling tasklet, in case request_irq fails during start_irq. Check for return value for bnxt_qplib_rcfw_start_irq and in case the bnxt_qplib_rcfw_start_irq fails, return bnxt_re_start_irq without attempting to start NQ IRQs. 
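Condensed into a minimal standalone sketch, the guard pattern this change applies looks roughly like the following (the foo_* struct and function names are illustrative only, not code from this patch; tasklet_setup/tasklet_enable/tasklet_disable/tasklet_kill, request_irq, synchronize_irq and free_irq are the real kernel APIs):

#include <linux/interrupt.h>

struct foo_irq_ctx {
        struct tasklet_struct tl;
        int msix_vec;
        bool requested;         /* set only after request_irq() succeeds */
};

static void foo_tasklet_fn(struct tasklet_struct *t) { /* service the queue */ }

static int foo_start_irq(struct foo_irq_ctx *ctx, irq_handler_t handler, bool need_init)
{
        int rc;

        if (need_init)
                tasklet_setup(&ctx->tl, foo_tasklet_fn);
        else
                tasklet_enable(&ctx->tl);

        rc = request_irq(ctx->msix_vec, handler, 0, "foo", ctx);
        if (rc) {
                tasklet_disable(&ctx->tl);      /* undo, so a later stop/kill is skipped */
                return rc;
        }
        ctx->requested = true;
        return 0;
}

static void foo_stop_irq(struct foo_irq_ctx *ctx, bool kill)
{
        if (!ctx->requested)    /* IRQ/tasklet never came up: nothing to disable or kill */
                return;

        tasklet_disable(&ctx->tl);
        synchronize_irq(ctx->msix_vec);
        if (kill)
                tasklet_kill(&ctx->tl);

        free_irq(ctx->msix_vec, ctx);
        ctx->requested = false;
}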
Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Link: https://lore.kernel.org/r/1684478897-12247-2-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/main.c | 12 +++++++++--- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 16 ++++++++++------ drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 14 +++++++++----- 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index b9e2f89337e8..a44f2900e343 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -283,15 +283,21 @@ static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent) for (indx = 0; indx < rdev->num_msix; indx++) rdev->en_dev->msix_entries[indx].vector = ent[indx].vector; - bnxt_qplib_rcfw_start_irq(rcfw, msix_ent[BNXT_RE_AEQ_IDX].vector, - false); + rc = bnxt_qplib_rcfw_start_irq(rcfw, msix_ent[BNXT_RE_AEQ_IDX].vector, + false); + if (rc) { + ibdev_warn(&rdev->ibdev, "Failed to reinit CREQ\n"); + return; + } for (indx = BNXT_RE_NQ_IDX ; indx < rdev->num_msix; indx++) { nq = &rdev->nq[indx - 1]; rc = bnxt_qplib_nq_start_irq(nq, indx - 1, msix_ent[indx].vector, false); - if (rc) + if (rc) { ibdev_warn(&rdev->ibdev, "Failed to reinit NQ index %d\n", indx - 1); + return; + } } } diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index f139d4cd1712..7d756d8c6699 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -399,6 +399,9 @@ static irqreturn_t bnxt_qplib_nq_irq(int irq, void *dev_instance) void bnxt_qplib_nq_stop_irq(struct bnxt_qplib_nq *nq, bool kill) { + if (!nq->requested) + return; + tasklet_disable(&nq->nq_tasklet); /* Mask h/w interrupt */ bnxt_qplib_ring_nq_db(&nq->nq_db.dbinfo, nq->res->cctx, false); @@ -406,11 +409,10 @@ void bnxt_qplib_nq_stop_irq(struct bnxt_qplib_nq *nq, bool kill) synchronize_irq(nq->msix_vec); if (kill) tasklet_kill(&nq->nq_tasklet); - if (nq->requested) { - irq_set_affinity_hint(nq->msix_vec, NULL); - free_irq(nq->msix_vec, nq); - nq->requested = false; - } + + irq_set_affinity_hint(nq->msix_vec, NULL); + free_irq(nq->msix_vec, nq); + nq->requested = false; } void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq) @@ -449,8 +451,10 @@ int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx, snprintf(nq->name, sizeof(nq->name), "bnxt_qplib_nq-%d", nq_indx); rc = request_irq(nq->msix_vec, bnxt_qplib_nq_irq, 0, nq->name, nq); - if (rc) + if (rc) { + tasklet_disable(&nq->nq_tasklet); return rc; + } cpumask_clear(&nq->mask); cpumask_set_cpu(nq_indx, &nq->mask); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index de9069103177..a668f877a7bf 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -636,6 +636,10 @@ void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill) struct bnxt_qplib_creq_ctx *creq; creq = &rcfw->creq; + + if (!creq->requested) + return; + tasklet_disable(&creq->creq_tasklet); /* Mask h/w interrupts */ bnxt_qplib_ring_nq_db(&creq->creq_db.dbinfo, rcfw->res->cctx, false); @@ -644,10 +648,8 @@ void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill) if (kill) tasklet_kill(&creq->creq_tasklet); - if (creq->requested) { - free_irq(creq->msix_vec, rcfw); - creq->requested = false; - } + 
free_irq(creq->msix_vec, rcfw); + creq->requested = false; } void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) @@ -693,8 +695,10 @@ int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector, tasklet_enable(&creq->creq_tasklet); rc = request_irq(creq->msix_vec, bnxt_qplib_creq_irq, 0, "bnxt_qplib_creq", rcfw); - if (rc) + if (rc) { + tasklet_disable(&creq->creq_tasklet); return rc; + } creq->requested = true; bnxt_qplib_ring_nq_db(&creq->creq_db.dbinfo, rcfw->res->cctx, true); From 9b3ee47796f529e5bc31a355d6cb756d68a7079a Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Thu, 18 May 2023 23:48:12 -0700 Subject: [PATCH 07/71] RDMA/bnxt_re: Fix to remove unnecessary return labels If there is no cleanup needed then just return directly. This cleans up the code and improve readability. Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Link: https://lore.kernel.org/r/1684478897-12247-3-git-send-email-selvin.xavier@broadcom.com Reviewed-by: Kashyap Desai Reviewed-by: Saravanan Vajravel Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 7d756d8c6699..298d89390e66 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1618,7 +1618,7 @@ static int bnxt_qplib_put_inline(struct bnxt_qplib_qp *qp, il_src = (void *)wqe->sg_list[indx].addr; t_len += len; if (t_len > qp->max_inline_data) - goto bad; + return -ENOMEM; while (len) { if (pull_dst) { pull_dst = false; @@ -1642,8 +1642,6 @@ static int bnxt_qplib_put_inline(struct bnxt_qplib_qp *qp, } return t_len; -bad: - return -ENOMEM; } static u32 bnxt_qplib_put_sges(struct bnxt_qplib_hwq *hwq, @@ -2067,7 +2065,7 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) hwq_attr.sginfo = &cq->sg_info; rc = bnxt_qplib_alloc_init_hwq(&cq->hwq, &hwq_attr); if (rc) - goto exit; + return rc; bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, CMDQ_BASE_OPCODE_CREATE_CQ, @@ -2115,7 +2113,6 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) fail: bnxt_qplib_free_hwq(res, &cq->hwq); -exit: return rc; } From ff2e4bfd162cf66a112a81509e419805add44d64 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Thu, 18 May 2023 23:48:13 -0700 Subject: [PATCH 08/71] RDMA/bnxt_re: Use unique names while registering interrupts bnxt_re currently uses the names "bnxt_qplib_creq" and "bnxt_qplib_nq-0" while registering IRQs. There is no way to distinguish the IRQs of different device ports when there are multiple IB devices registered. This could make the scenarios worse where one want to pin IRQs of a device port to certain CPUs. Fixed the code to use unique names which has PCI BDF information while registering interrupts like: "bnxt_re-nq-0@pci:0000:65:00.0" and "bnxt_re-creq@pci:0000:65:00.1". 
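The allocation/teardown ordering matters because request_irq() stores the name pointer rather than copying it (this is the string shown in /proc/interrupts), so it must stay allocated until free_irq(). A minimal sketch of that lifetime, with illustrative foo_* names (kasprintf(), pci_name(), request_irq() and free_irq() are the real APIs):

#include <linux/interrupt.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/string.h>

struct foo_nq {
        struct pci_dev *pdev;
        int msix_vec;
        char *name;             /* kasprintf()-allocated, freed only after free_irq() */
        bool requested;
};

static int foo_nq_start_irq(struct foo_nq *nq, int idx, irq_handler_t handler)
{
        int rc;

        nq->name = kasprintf(GFP_KERNEL, "foo-nq-%d@pci:%s", idx, pci_name(nq->pdev));
        if (!nq->name)
                return -ENOMEM;

        rc = request_irq(nq->msix_vec, handler, 0, nq->name, nq);
        if (rc) {
                kfree(nq->name);        /* undo on failure */
                nq->name = NULL;
                return rc;
        }
        nq->requested = true;
        return 0;
}

static void foo_nq_stop_irq(struct foo_nq *nq)
{
        if (!nq->requested)
                return;
        free_irq(nq->msix_vec, nq);
        kfree(nq->name);                /* safe only after free_irq() */
        nq->name = NULL;
        nq->requested = false;
}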
Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Link: https://lore.kernel.org/r/1684478897-12247-4-git-send-email-selvin.xavier@broadcom.com Reviewed-by: Bhargava Chenna Marreddy Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 12 ++++++++++-- drivers/infiniband/hw/bnxt_re/qplib_fp.h | 2 +- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 15 +++++++++++++-- drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 1 + 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 298d89390e66..ff33d7733369 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -412,6 +412,8 @@ void bnxt_qplib_nq_stop_irq(struct bnxt_qplib_nq *nq, bool kill) irq_set_affinity_hint(nq->msix_vec, NULL); free_irq(nq->msix_vec, nq); + kfree(nq->name); + nq->name = NULL; nq->requested = false; } @@ -438,6 +440,7 @@ void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq) int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx, int msix_vector, bool need_init) { + struct bnxt_qplib_res *res = nq->res; int rc; if (nq->requested) @@ -449,9 +452,14 @@ int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx, else tasklet_enable(&nq->nq_tasklet); - snprintf(nq->name, sizeof(nq->name), "bnxt_qplib_nq-%d", nq_indx); + nq->name = kasprintf(GFP_KERNEL, "bnxt_re-nq-%d@pci:%s", + nq_indx, pci_name(res->pdev)); + if (!nq->name) + return -ENOMEM; rc = request_irq(nq->msix_vec, bnxt_qplib_nq_irq, 0, nq->name, nq); if (rc) { + kfree(nq->name); + nq->name = NULL; tasklet_disable(&nq->nq_tasklet); return rc; } @@ -465,7 +473,7 @@ int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx, nq->msix_vec, nq_indx); } nq->requested = true; - bnxt_qplib_ring_nq_db(&nq->nq_db.dbinfo, nq->res->cctx, true); + bnxt_qplib_ring_nq_db(&nq->nq_db.dbinfo, res->cctx, true); return rc; } diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index d74d5ead2e32..a42820821c47 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -472,7 +472,7 @@ typedef int (*srqn_handler_t)(struct bnxt_qplib_nq *nq, struct bnxt_qplib_nq { struct pci_dev *pdev; struct bnxt_qplib_res *res; - char name[32]; + char *name; struct bnxt_qplib_hwq hwq; struct bnxt_qplib_nq_db nq_db; u16 ring_id; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index a668f877a7bf..688eaa01db64 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -649,6 +649,8 @@ void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill) tasklet_kill(&creq->creq_tasklet); free_irq(creq->msix_vec, rcfw); + kfree(creq->irq_name); + creq->irq_name = NULL; creq->requested = false; } @@ -681,9 +683,11 @@ int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector, bool need_init) { struct bnxt_qplib_creq_ctx *creq; + struct bnxt_qplib_res *res; int rc; creq = &rcfw->creq; + res = rcfw->res; if (creq->requested) return -EFAULT; @@ -693,15 +697,22 @@ int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector, tasklet_setup(&creq->creq_tasklet, bnxt_qplib_service_creq); else tasklet_enable(&creq->creq_tasklet); + + creq->irq_name = kasprintf(GFP_KERNEL, "bnxt_re-creq@pci:%s", + pci_name(res->pdev)); + if (!creq->irq_name) + return 
-ENOMEM; rc = request_irq(creq->msix_vec, bnxt_qplib_creq_irq, 0, - "bnxt_qplib_creq", rcfw); + creq->irq_name, rcfw); if (rc) { + kfree(creq->irq_name); + creq->irq_name = NULL; tasklet_disable(&creq->creq_tasklet); return rc; } creq->requested = true; - bnxt_qplib_ring_nq_db(&creq->creq_db.dbinfo, rcfw->res->cctx, true); + bnxt_qplib_ring_nq_db(&creq->creq_db.dbinfo, res->cctx, true); return 0; } diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index dd5651478bbb..92f7a25533d3 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -186,6 +186,7 @@ struct bnxt_qplib_creq_ctx { u16 ring_id; int msix_vec; bool requested; /*irq handler installed */ + char *irq_name; }; /* RCFW Communication Channels */ From b989f90cef0af48aa5679b6a75476371705ec53c Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Thu, 18 May 2023 23:48:14 -0700 Subject: [PATCH 09/71] RDMA/bnxt_re: Remove a redundant check inside bnxt_re_update_gid The NULL check inside bnxt_re_update_gid() always return false. If sgid_tbl->tbl is not allocated, then dev_init would have failed. Fixes: 5fac5b1b297f ("RDMA/bnxt_re: Add vlan tag for untagged RoCE traffic when PFC is configured") Link: https://lore.kernel.org/r/1684478897-12247-5-git-send-email-selvin.xavier@broadcom.com Reviewed-by: Saravanan Vajravel Reviewed-by: Damodharam Ammepalli Reviewed-by: Ajit Khaparde Signed-off-by: Selvin Xavier Signed-off-by: Kalesh AP Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/main.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index a44f2900e343..4718af6c4c47 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -969,12 +969,6 @@ static int bnxt_re_update_gid(struct bnxt_re_dev *rdev) if (!ib_device_try_get(&rdev->ibdev)) return 0; - if (!sgid_tbl) { - ibdev_err(&rdev->ibdev, "QPLIB: SGID table not allocated"); - rc = -EINVAL; - goto out; - } - for (index = 0; index < sgid_tbl->active; index++) { gid_idx = sgid_tbl->hw_id[index]; @@ -992,7 +986,7 @@ static int bnxt_re_update_gid(struct bnxt_re_dev *rdev) rc = bnxt_qplib_update_sgid(sgid_tbl, &gid, gid_idx, rdev->qplib_res.netdev->dev_addr); } -out: + ib_device_put(&rdev->ibdev); return rc; } From 43774bc156614346fe5dacabc8e8c229167f2536 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Thu, 18 May 2023 23:48:15 -0700 Subject: [PATCH 10/71] RDMA/bnxt_re: Fix to remove an unnecessary log During destroy_qp, driver sets the qp handle in the existing CQEs belonging to the QP being destroyed to NULL. As a result, a poll_cq after destroy_qp can report unnecessary messages. Remove this noise from system logs. 
Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Link: https://lore.kernel.org/r/1684478897-12247-6-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index ff33d7733369..e695abaecaec 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -2798,11 +2798,8 @@ static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq, qp = (struct bnxt_qplib_qp *)((unsigned long) le64_to_cpu(hwcqe->qp_handle)); - if (!qp) { - dev_err(&cq->hwq.pdev->dev, - "FP: CQ Process terminal qp is NULL\n"); + if (!qp) return -EINVAL; - } /* Must block new posting of SQ and RQ */ qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; From 07d5ce14b2aa22dc6a92f9769035281b68250d3e Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Thu, 18 May 2023 23:48:16 -0700 Subject: [PATCH 11/71] RDMA/bnxt_re: Return directly without goto jumps When there is no cleanup to be done, return directly. This will help eliminating unnecessary local variables and goto labels. This patch fixes such occurrences in qplib_fp.c file. Fixes: 37cb11acf1f7 ("RDMA/bnxt_re: Add SRQ support for Broadcom adapters") Fixes: 159fb4ceacd7 ("RDMA/bnxt_re: introduce a function to allocate swq") Link: https://lore.kernel.org/r/1684478897-12247-7-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 60 +++++++++--------------- 1 file changed, 23 insertions(+), 37 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index e695abaecaec..d48a26e89b10 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -483,7 +483,6 @@ static int bnxt_qplib_map_nq_db(struct bnxt_qplib_nq *nq, u32 reg_offt) resource_size_t reg_base; struct bnxt_qplib_nq_db *nq_db; struct pci_dev *pdev; - int rc = 0; pdev = nq->pdev; nq_db = &nq->nq_db; @@ -493,8 +492,7 @@ static int bnxt_qplib_map_nq_db(struct bnxt_qplib_nq *nq, u32 reg_offt) if (!nq_db->reg.bar_base) { dev_err(&pdev->dev, "QPLIB: NQ BAR region %d resc start is 0!", nq_db->reg.bar_id); - rc = -ENOMEM; - goto fail; + return -ENOMEM; } reg_base = nq_db->reg.bar_base + reg_offt; @@ -504,15 +502,14 @@ static int bnxt_qplib_map_nq_db(struct bnxt_qplib_nq *nq, u32 reg_offt) if (!nq_db->reg.bar_reg) { dev_err(&pdev->dev, "QPLIB: NQ BAR region %d mapping failed", nq_db->reg.bar_id); - rc = -ENOMEM; - goto fail; + return -ENOMEM; } nq_db->dbinfo.db = nq_db->reg.bar_reg; nq_db->dbinfo.hwq = &nq->hwq; nq_db->dbinfo.xid = nq->ring_id; -fail: - return rc; + + return 0; } int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq, @@ -626,7 +623,7 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, hwq_attr.type = HWQ_TYPE_QUEUE; rc = bnxt_qplib_alloc_init_hwq(&srq->hwq, &hwq_attr); if (rc) - goto exit; + return rc; srq->swq = kcalloc(srq->hwq.max_elements, sizeof(*srq->swq), GFP_KERNEL); @@ -680,7 +677,7 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, fail: bnxt_qplib_free_hwq(res, &srq->hwq); kfree(srq->swq); -exit: + return rc; } @@ -744,15 +741,14 @@ int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq, struct rq_wqe *srqe; struct sq_sge *hw_sge; u32 sw_prod, 
sw_cons, count = 0; - int i, rc = 0, next; + int i, next; spin_lock(&srq_hwq->lock); if (srq->start_idx == srq->last_idx) { dev_err(&srq_hwq->pdev->dev, "FP: SRQ (0x%x) is full!\n", srq->id); - rc = -EINVAL; spin_unlock(&srq_hwq->lock); - goto done; + return -EINVAL; } next = srq->start_idx; srq->start_idx = srq->swq[next].next_idx; @@ -793,22 +789,19 @@ int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq, srq->arm_req = false; bnxt_qplib_srq_arm_db(&srq->dbinfo, srq->threshold); } -done: - return rc; + + return 0; } /* QP */ static int bnxt_qplib_alloc_init_swq(struct bnxt_qplib_q *que) { - int rc = 0; int indx; que->swq = kcalloc(que->max_wqe, sizeof(*que->swq), GFP_KERNEL); - if (!que->swq) { - rc = -ENOMEM; - goto out; - } + if (!que->swq) + return -ENOMEM; que->swq_start = 0; que->swq_last = que->max_wqe - 1; @@ -816,8 +809,8 @@ static int bnxt_qplib_alloc_init_swq(struct bnxt_qplib_q *que) que->swq[indx].next_idx = indx + 1; que->swq[que->swq_last].next_idx = 0; /* Make it circular */ que->swq_last = 0; -out: - return rc; + + return 0; } int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) @@ -851,7 +844,7 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.type = HWQ_TYPE_QUEUE; rc = bnxt_qplib_alloc_init_hwq(&sq->hwq, &hwq_attr); if (rc) - goto exit; + return rc; rc = bnxt_qplib_alloc_init_swq(sq); if (rc) @@ -939,7 +932,6 @@ sq_swq: kfree(sq->swq); fail_sq: bnxt_qplib_free_hwq(res, &sq->hwq); -exit: return rc; } @@ -1004,7 +996,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.type = HWQ_TYPE_QUEUE; rc = bnxt_qplib_alloc_init_hwq(&sq->hwq, &hwq_attr); if (rc) - goto exit; + return rc; rc = bnxt_qplib_alloc_init_swq(sq); if (rc) @@ -1152,7 +1144,6 @@ sq_swq: kfree(sq->swq); fail_sq: bnxt_qplib_free_hwq(res, &sq->hwq); -exit: return rc; } @@ -2513,7 +2504,6 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, struct bnxt_qplib_qp *qp; struct bnxt_qplib_q *rq; u32 wr_id_idx; - int rc = 0; qp = (struct bnxt_qplib_qp *)((unsigned long) le64_to_cpu(hwcqe->qp_handle)); @@ -2524,7 +2514,7 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, if (qp->rq.flushed) { dev_dbg(&cq->hwq.pdev->dev, "%s: QP in Flush QP = %p\n", __func__, qp); - goto done; + return 0; } cqe = *pcqe; @@ -2580,8 +2570,7 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, } } -done: - return rc; + return 0; } static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, @@ -2594,7 +2583,6 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, struct bnxt_qplib_qp *qp; struct bnxt_qplib_q *rq; u32 wr_id_idx; - int rc = 0; qp = (struct bnxt_qplib_qp *)((unsigned long) le64_to_cpu(hwcqe->qp_handle)); @@ -2605,7 +2593,7 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, if (qp->rq.flushed) { dev_dbg(&cq->hwq.pdev->dev, "%s: QP in Flush QP = %p\n", __func__, qp); - goto done; + return 0; } cqe = *pcqe; cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK; @@ -2667,8 +2655,8 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, bnxt_qplib_add_flush_qp(qp); } } -done: - return rc; + + return 0; } bool bnxt_qplib_is_cq_empty(struct bnxt_qplib_cq *cq) @@ -2695,7 +2683,6 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq, struct bnxt_qplib_srq *srq; struct bnxt_qplib_cqe *cqe; u32 wr_id_idx; - int rc = 0; qp = (struct bnxt_qplib_qp *)((unsigned long) le64_to_cpu(hwcqe->qp_handle)); @@ 
-2706,7 +2693,7 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq, if (qp->rq.flushed) { dev_dbg(&cq->hwq.pdev->dev, "%s: QP in Flush QP = %p\n", __func__, qp); - goto done; + return 0; } cqe = *pcqe; cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK; @@ -2775,8 +2762,7 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq, } } -done: - return rc; + return 0; } static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq, From 8c1ee346da583718fb0a7791a1f84bdafb103caf Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Thu, 18 May 2023 23:48:17 -0700 Subject: [PATCH 12/71] RDMA/bnxt_re: Remove unnecessary checks The NULL check inside bnxt_qplib_del_sgid() and bnxt_qplib_add_sgid() always return false as the "sgid_tbl" inside "rdev->qplib_res" is a static memory. Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Link: https://lore.kernel.org/r/1684478897-12247-8-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/qplib_sp.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 1714a1e23113..dbb0e4e9254d 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -233,10 +233,6 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, struct bnxt_qplib_rcfw *rcfw = res->rcfw; int index; - if (!sgid_tbl) { - dev_err(&res->pdev->dev, "SGID table not allocated\n"); - return -EINVAL; - } /* Do we need a sgid_lock here? */ if (!sgid_tbl->active) { dev_err(&res->pdev->dev, "SGID table has no active entries\n"); @@ -297,10 +293,6 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, struct bnxt_qplib_rcfw *rcfw = res->rcfw; int i, free_idx; - if (!sgid_tbl) { - dev_err(&res->pdev->dev, "SGID table not allocated\n"); - return -EINVAL; - } /* Do we need a sgid_lock here? */ if (sgid_tbl->active == sgid_tbl->max) { dev_err(&res->pdev->dev, "SGID table is full\n"); From 2145328515c8fa9b8a9f7889250bc6c032f2a0e6 Mon Sep 17 00:00:00 2001 From: Long Li Date: Sat, 13 May 2023 23:18:15 -0700 Subject: [PATCH 13/71] RDMA/mana_ib: Use v2 version of cfg_rx_steer_req to enable RX coalescing With RX coalescing, one CQE entry can be used to indicate multiple packets on the receive queue. This saves processing time and PCI bandwidth over the CQ. The MANA Ethernet driver also uses the v2 version of the protocol. It doesn't use RX coalescing and its behavior is not changed. 
Link: https://lore.kernel.org/r/1684045095-31228-1-git-send-email-longli@linuxonhyperv.com Signed-off-by: Long Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mana/qp.c | 5 ++++- drivers/net/ethernet/microsoft/mana/mana_en.c | 5 ++++- include/net/mana/mana.h | 4 +++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index 54b61930a7fd..4b3b5b274e84 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -13,7 +13,7 @@ static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev, u8 *rx_hash_key) { struct mana_port_context *mpc = netdev_priv(ndev); - struct mana_cfg_rx_steer_req *req = NULL; + struct mana_cfg_rx_steer_req_v2 *req; struct mana_cfg_rx_steer_resp resp = {}; mana_handle_t *req_indir_tab; struct gdma_context *gc; @@ -33,6 +33,8 @@ static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev, mana_gd_init_req_hdr(&req->hdr, MANA_CONFIG_VPORT_RX, req_buf_size, sizeof(resp)); + req->hdr.req.msg_version = GDMA_MESSAGE_V2; + req->vport = mpc->port_handle; req->rx_enable = 1; req->update_default_rxobj = 1; @@ -46,6 +48,7 @@ static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev, req->num_indir_entries = MANA_INDIRECT_TABLE_SIZE; req->indir_tab_offset = sizeof(*req); req->update_indir_tab = true; + req->cqe_coalescing_enable = 1; req_indir_tab = (mana_handle_t *)(req + 1); /* The ind table passed to the hardware must have diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 06d6292e09b3..b3fcb767b9ab 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -972,7 +972,7 @@ static int mana_cfg_vport_steering(struct mana_port_context *apc, bool update_tab) { u16 num_entries = MANA_INDIRECT_TABLE_SIZE; - struct mana_cfg_rx_steer_req *req = NULL; + struct mana_cfg_rx_steer_req_v2 *req; struct mana_cfg_rx_steer_resp resp = {}; struct net_device *ndev = apc->ndev; mana_handle_t *req_indir_tab; @@ -987,6 +987,8 @@ static int mana_cfg_vport_steering(struct mana_port_context *apc, mana_gd_init_req_hdr(&req->hdr, MANA_CONFIG_VPORT_RX, req_buf_size, sizeof(resp)); + req->hdr.req.msg_version = GDMA_MESSAGE_V2; + req->vport = apc->port_handle; req->num_indir_entries = num_entries; req->indir_tab_offset = sizeof(*req); @@ -996,6 +998,7 @@ static int mana_cfg_vport_steering(struct mana_port_context *apc, req->update_hashkey = update_key; req->update_indir_tab = update_tab; req->default_rxobj = apc->default_rxobj; + req->cqe_coalescing_enable = 0; if (update_key) memcpy(&req->hashkey, apc->hashkey, MANA_HASH_KEY_SIZE); diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index cd386aa7c7cc..1512bd48df81 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -581,7 +581,7 @@ struct mana_fence_rq_resp { }; /* HW DATA */ /* Configure vPort Rx Steering */ -struct mana_cfg_rx_steer_req { +struct mana_cfg_rx_steer_req_v2 { struct gdma_req_hdr hdr; mana_handle_t vport; u16 num_indir_entries; @@ -594,6 +594,8 @@ struct mana_cfg_rx_steer_req { u8 reserved; mana_handle_t default_rxobj; u8 hashkey[MANA_HASH_KEY_SIZE]; + u8 cqe_coalescing_enable; + u8 reserved2[7]; }; /* HW DATA */ struct mana_cfg_rx_steer_resp { From b002760f877c0d91ecd3c78565b52f4bbac379dd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 23 May 2023 13:18:45 +0200 Subject: [PATCH 14/71] RDMA/irdma: avoid fortify-string warning in irdma_clr_wqes Commit df8fc4e934c1 
("kbuild: Enable -fstrict-flex-arrays=3") triggers a warning for fortified memset(): In function 'fortify_memset_chk', inlined from 'irdma_clr_wqes' at drivers/infiniband/hw/irdma/uk.c:103:4: include/linux/fortify-string.h:493:25: error: call to '__write_overflow_field' declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Werror=attribute-warning] 493 | __write_overflow_field(p_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The problem here isthat the inner array only has four 8-byte elements, so clearing 4096 bytes overflows that. As this structure is part of an outer array, change the code to pass a pointer to the irdma_qp_quanta instead, and change the size argument for readability, matching the comment above it. Fixes: 551c46edc769 ("RDMA/irdma: Add user/kernel shared libraries") Link: https://lore.kernel.org/r/20230523111859.2197825-1-arnd@kernel.org Signed-off-by: Arnd Bergmann Acked-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/uk.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c index 16183e894da7..dd428d915c17 100644 --- a/drivers/infiniband/hw/irdma/uk.c +++ b/drivers/infiniband/hw/irdma/uk.c @@ -93,16 +93,18 @@ static int irdma_nop_1(struct irdma_qp_uk *qp) */ void irdma_clr_wqes(struct irdma_qp_uk *qp, u32 qp_wqe_idx) { - __le64 *wqe; + struct irdma_qp_quanta *sq; u32 wqe_idx; if (!(qp_wqe_idx & 0x7F)) { wqe_idx = (qp_wqe_idx + 128) % qp->sq_ring.size; - wqe = qp->sq_base[wqe_idx].elem; + sq = qp->sq_base + wqe_idx; if (wqe_idx) - memset(wqe, qp->swqe_polarity ? 0 : 0xFF, 0x1000); + memset(sq, qp->swqe_polarity ? 0 : 0xFF, + 128 * sizeof(*sq)); else - memset(wqe, qp->swqe_polarity ? 0xFF : 0, 0x1000); + memset(sq, qp->swqe_polarity ? 0xFF : 0, + 128 * sizeof(*sq)); } } From 84510a61ef0974ec765cc68d04ddab905b9b6c7c Mon Sep 17 00:00:00 2001 From: Nicolas Morey Date: Wed, 24 May 2023 10:52:20 +0200 Subject: [PATCH 15/71] RDMA/rxe: Remove dangling declaration of rxe_cq_disable() rxe_cq_disable() has been removed but not its declaration. 
Fixes: 78b26a335310 ("RDMA/rxe: Remove tasklet call from rxe_cq.c") Link: https://lore.kernel.org/r/4f20ffc5-b2c4-0c11-2883-a835caf01a94@suse.com Signed-off-by: Nicolas Morey Acked-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_loc.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 804b15e929dd..666e06a82bc9 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -31,8 +31,6 @@ int rxe_cq_resize_queue(struct rxe_cq *cq, int new_cqe, int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited); -void rxe_cq_disable(struct rxe_cq *cq); - void rxe_cq_cleanup(struct rxe_pool_elem *elem); /* rxe_mcast.c */ From 1cc625cecce9cbbe7898c88cc9e6c06021d7d8f0 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 25 Apr 2023 01:02:41 +0000 Subject: [PATCH 16/71] RDMA/rtrs: Remove duplicate cq_num assignment line 1701 and 1713 are duplicate: > 1701 cq_num = max_send_wr + max_recv_wr; 1702 /* alloc iu to recv new rkey reply when server reports flags set */ 1703 if (clt_path->flags & RTRS_MSG_NEW_RKEY_F || con->c.cid == 0) { 1704 con->rsp_ius = rtrs_iu_alloc(cq_num, sizeof(*rsp), 1705 GFP_KERNEL, 1706 clt_path->s.dev->ib_dev, 1707 DMA_FROM_DEVICE, 1708 rtrs_clt_rdma_done); 1709 if (!con->rsp_ius) 1710 return -ENOMEM; 1711 con->queue_num = cq_num; 1712 } > 1713 cq_num = max_send_wr + max_recv_wr; Remove the duplicate. Link: https://lore.kernel.org/r/1682384563-2-2-git-send-email-lizhijian@fujitsu.com Acked-by: Guoqing Jiang Acked-by: Jack Wang Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index edb2e3a25880..3738aaefbe13 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -1710,7 +1710,6 @@ static int create_con_cq_qp(struct rtrs_clt_con *con) return -ENOMEM; con->queue_num = cq_num; } - cq_num = max_send_wr + max_recv_wr; cq_vector = con->cpu % clt_path->s.dev->ib_dev->num_comp_vectors; if (con->c.cid >= clt_path->s.irq_con_num) err = rtrs_cq_qp_create(&clt_path->s, &con->c, max_send_sge, From c9358de193ecfb360c3ce75f27ce839ca0b0bc8c Mon Sep 17 00:00:00 2001 From: Brendan Cunningham Date: Fri, 19 May 2023 12:32:16 -0400 Subject: [PATCH 17/71] IB/hfi1: Fix wrong mmu_node used for user SDMA packet after invalidate The hfi1 user SDMA pinned-page cache will leave a stale cache entry when the cache-entry's virtual address range is invalidated but that cache entry is in-use by an outstanding SDMA request. Subsequent user SDMA requests with buffers in or spanning the virtual address range of the stale cache entry will result in packets constructed from the wrong memory, the physical pages pointed to by the stale cache entry. To fix this, remove mmu_rb_node cache entries from the mmu_rb_handler cache independent of the cache entry's refcount. Add 'struct kref refcount' to struct mmu_rb_node and manage mmu_rb_node lifetime with kref_get() and kref_put(). mmu_rb_node.refcount makes sdma_mmu_node.refcount redundant. Remove 'atomic_t refcount' from struct sdma_mmu_node and change sdma_mmu_node code to use mmu_rb_node.refcount. 
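The kref pattern being adopted here, reduced to a self-contained sketch (struct foo_node and the helpers are hypothetical, not code from this patch; kref_init/kref_get/kref_put and container_of are the real primitives — the release callback runs exactly once, when the last reference is dropped):

#include <linux/container_of.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct foo_node {
        struct kref refcount;
        /* ... rb_node, pinned pages, etc. ... */
};

static void foo_release(struct kref *kref)
{
        struct foo_node *node = container_of(kref, struct foo_node, refcount);

        kfree(node);            /* or queue further teardown on a workqueue */
}

static struct foo_node *foo_alloc(void)
{
        struct foo_node *node = kzalloc(sizeof(*node), GFP_KERNEL);

        if (node)
                kref_init(&node->refcount);     /* starts at 1: the cache's reference */
        return node;
}

static void foo_request_start(struct foo_node *node)
{
        kref_get(&node->refcount);              /* in-flight request holds a reference */
}

static void foo_request_done(struct foo_node *node)
{
        kref_put(&node->refcount, foo_release); /* last put invokes foo_release() */
}

With the cache's reference and the per-request references going through the same kref, an invalidate can drop the cache's reference immediately while in-flight requests keep the node alive until their final kref_put().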
Move the mmu_rb_handler destructor call after a wait-for-SDMA-request-completion call so mmu_rb_nodes that need mmu_rb_handler's workqueue to queue themselves up for destruction from an interrupt context may do so. Fixes: f48ad614c100 ("IB/hfi1: Move driver out of staging") Fixes: 00cbce5cbf88 ("IB/hfi1: Fix bugs with non-PAGE_SIZE-end multi-iovec user SDMA requests") Link: https://lore.kernel.org/r/168451393605.3700681.13493776139032178861.stgit@awfm-02.cornelisnetworks.com Reviewed-by: Dean Luick Signed-off-by: Brendan Cunningham Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/ipoib_tx.c | 4 +- drivers/infiniband/hw/hfi1/mmu_rb.c | 101 ++++++++++------- drivers/infiniband/hw/hfi1/mmu_rb.h | 3 + drivers/infiniband/hw/hfi1/sdma.c | 23 +++- drivers/infiniband/hw/hfi1/sdma.h | 47 +++++--- drivers/infiniband/hw/hfi1/sdma_txreq.h | 2 + drivers/infiniband/hw/hfi1/user_sdma.c | 137 ++++++++++-------------- drivers/infiniband/hw/hfi1/user_sdma.h | 1 - drivers/infiniband/hw/hfi1/vnic_sdma.c | 4 +- 9 files changed, 177 insertions(+), 145 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/ipoib_tx.c b/drivers/infiniband/hw/hfi1/ipoib_tx.c index 8973a081d641..e7d831330278 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_tx.c +++ b/drivers/infiniband/hw/hfi1/ipoib_tx.c @@ -215,11 +215,11 @@ static int hfi1_ipoib_build_ulp_payload(struct ipoib_txreq *tx, const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ret = sdma_txadd_page(dd, - NULL, txreq, skb_frag_page(frag), frag->bv_offset, - skb_frag_size(frag)); + skb_frag_size(frag), + NULL, NULL, NULL); if (unlikely(ret)) break; } diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index 1cea8b0c78e0..a864423c256d 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -19,8 +19,7 @@ static int mmu_notifier_range_start(struct mmu_notifier *, const struct mmu_notifier_range *); static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *, unsigned long, unsigned long); -static void do_remove(struct mmu_rb_handler *handler, - struct list_head *del_list); +static void release_immediate(struct kref *refcount); static void handle_remove(struct work_struct *work); static const struct mmu_notifier_ops mn_opts = { @@ -106,7 +105,11 @@ void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler) } spin_unlock_irqrestore(&handler->lock, flags); - do_remove(handler, &del_list); + while (!list_empty(&del_list)) { + rbnode = list_first_entry(&del_list, struct mmu_rb_node, list); + list_del(&rbnode->list); + kref_put(&rbnode->refcount, release_immediate); + } /* Now the mm may be freed. */ mmdrop(handler->mn.mm); @@ -134,12 +137,6 @@ int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler, } __mmu_int_rb_insert(mnode, &handler->root); list_add_tail(&mnode->list, &handler->lru_list); - - ret = handler->ops->insert(handler->ops_arg, mnode); - if (ret) { - __mmu_int_rb_remove(mnode, &handler->root); - list_del(&mnode->list); /* remove from LRU list */ - } mnode->handler = handler; unlock: spin_unlock_irqrestore(&handler->lock, flags); @@ -183,6 +180,48 @@ static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler, return node; } +/* + * Must NOT call while holding mnode->handler->lock. + * mnode->handler->ops->remove() may sleep and mnode->handler->lock is a + * spinlock. 
+ */ +static void release_immediate(struct kref *refcount) +{ + struct mmu_rb_node *mnode = + container_of(refcount, struct mmu_rb_node, refcount); + mnode->handler->ops->remove(mnode->handler->ops_arg, mnode); +} + +/* Caller must hold mnode->handler->lock */ +static void release_nolock(struct kref *refcount) +{ + struct mmu_rb_node *mnode = + container_of(refcount, struct mmu_rb_node, refcount); + list_move(&mnode->list, &mnode->handler->del_list); + queue_work(mnode->handler->wq, &mnode->handler->del_work); +} + +/* + * struct mmu_rb_node->refcount kref_put() callback. + * Adds mmu_rb_node to mmu_rb_node->handler->del_list and queues + * handler->del_work on handler->wq. + * Does not remove mmu_rb_node from handler->lru_list or handler->rb_root. + * Acquires mmu_rb_node->handler->lock; do not call while already holding + * handler->lock. + */ +void hfi1_mmu_rb_release(struct kref *refcount) +{ + struct mmu_rb_node *mnode = + container_of(refcount, struct mmu_rb_node, refcount); + struct mmu_rb_handler *handler = mnode->handler; + unsigned long flags; + + spin_lock_irqsave(&handler->lock, flags); + list_move(&mnode->list, &mnode->handler->del_list); + spin_unlock_irqrestore(&handler->lock, flags); + queue_work(handler->wq, &handler->del_work); +} + void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg) { struct mmu_rb_node *rbnode, *ptr; @@ -197,6 +236,10 @@ void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg) spin_lock_irqsave(&handler->lock, flags); list_for_each_entry_safe(rbnode, ptr, &handler->lru_list, list) { + /* refcount == 1 implies mmu_rb_handler has only rbnode ref */ + if (kref_read(&rbnode->refcount) > 1) + continue; + if (handler->ops->evict(handler->ops_arg, rbnode, evict_arg, &stop)) { __mmu_int_rb_remove(rbnode, &handler->root); @@ -209,7 +252,7 @@ void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg) spin_unlock_irqrestore(&handler->lock, flags); list_for_each_entry_safe(rbnode, ptr, &del_list, list) { - handler->ops->remove(handler->ops_arg, rbnode); + kref_put(&rbnode->refcount, release_immediate); } } @@ -221,7 +264,6 @@ static int mmu_notifier_range_start(struct mmu_notifier *mn, struct rb_root_cached *root = &handler->root; struct mmu_rb_node *node, *ptr = NULL; unsigned long flags; - bool added = false; spin_lock_irqsave(&handler->lock, flags); for (node = __mmu_int_rb_iter_first(root, range->start, range->end-1); @@ -230,38 +272,16 @@ static int mmu_notifier_range_start(struct mmu_notifier *mn, ptr = __mmu_int_rb_iter_next(node, range->start, range->end - 1); trace_hfi1_mmu_mem_invalidate(node->addr, node->len); - if (handler->ops->invalidate(handler->ops_arg, node)) { - __mmu_int_rb_remove(node, root); - /* move from LRU list to delete list */ - list_move(&node->list, &handler->del_list); - added = true; - } + /* Remove from rb tree and lru_list. */ + __mmu_int_rb_remove(node, root); + list_del_init(&node->list); + kref_put(&node->refcount, release_nolock); } spin_unlock_irqrestore(&handler->lock, flags); - if (added) - queue_work(handler->wq, &handler->del_work); - return 0; } -/* - * Call the remove function for the given handler and the list. This - * is expected to be called with a delete list extracted from handler. - * The caller should not be holding the handler lock. 
- */ -static void do_remove(struct mmu_rb_handler *handler, - struct list_head *del_list) -{ - struct mmu_rb_node *node; - - while (!list_empty(del_list)) { - node = list_first_entry(del_list, struct mmu_rb_node, list); - list_del(&node->list); - handler->ops->remove(handler->ops_arg, node); - } -} - /* * Work queue function to remove all nodes that have been queued up to * be removed. The key feature is that mm->mmap_lock is not being held @@ -274,11 +294,16 @@ static void handle_remove(struct work_struct *work) del_work); struct list_head del_list; unsigned long flags; + struct mmu_rb_node *node; /* remove anything that is queued to get removed */ spin_lock_irqsave(&handler->lock, flags); list_replace_init(&handler->del_list, &del_list); spin_unlock_irqrestore(&handler->lock, flags); - do_remove(handler, &del_list); + while (!list_empty(&del_list)) { + node = list_first_entry(&del_list, struct mmu_rb_node, list); + list_del(&node->list); + handler->ops->remove(handler->ops_arg, node); + } } diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h index c4da064188c9..82c505a04fc6 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.h +++ b/drivers/infiniband/hw/hfi1/mmu_rb.h @@ -16,6 +16,7 @@ struct mmu_rb_node { struct rb_node node; struct mmu_rb_handler *handler; struct list_head list; + struct kref refcount; }; /* @@ -61,6 +62,8 @@ int hfi1_mmu_rb_register(void *ops_arg, void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler); int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler, struct mmu_rb_node *mnode); +void hfi1_mmu_rb_release(struct kref *refcount); + void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg); struct mmu_rb_node *hfi1_mmu_rb_get_first(struct mmu_rb_handler *handler, unsigned long addr, diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index bb2552dd29c1..26c62162759b 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -1593,7 +1593,20 @@ static inline void sdma_unmap_desc( struct hfi1_devdata *dd, struct sdma_desc *descp) { - system_descriptor_complete(dd, descp); + switch (sdma_mapping_type(descp)) { + case SDMA_MAP_SINGLE: + dma_unmap_single(&dd->pcidev->dev, sdma_mapping_addr(descp), + sdma_mapping_len(descp), DMA_TO_DEVICE); + break; + case SDMA_MAP_PAGE: + dma_unmap_page(&dd->pcidev->dev, sdma_mapping_addr(descp), + sdma_mapping_len(descp), DMA_TO_DEVICE); + break; + } + + if (descp->pinning_ctx && descp->ctx_put) + descp->ctx_put(descp->pinning_ctx); + descp->pinning_ctx = NULL; } /* @@ -3113,8 +3126,8 @@ int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx, /* Add descriptor for coalesce buffer */ tx->desc_limit = MAX_DESC; - return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, NULL, tx, - addr, tx->tlen); + return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, tx, + addr, tx->tlen, NULL, NULL, NULL); } return 1; @@ -3157,9 +3170,9 @@ int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx) make_tx_sdma_desc( tx, SDMA_MAP_NONE, - NULL, dd->sdma_pad_phys, - sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1))); + sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1)), + NULL, NULL, NULL); tx->num_desc++; _sdma_close_tx(dd, tx); return rval; diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h index 95aaec14c6c2..7fdebab202c4 100644 --- a/drivers/infiniband/hw/hfi1/sdma.h +++ b/drivers/infiniband/hw/hfi1/sdma.h @@ -594,9 +594,11 @@ static inline dma_addr_t sdma_mapping_addr(struct sdma_desc *d) static 
inline void make_tx_sdma_desc( struct sdma_txreq *tx, int type, - void *pinning_ctx, dma_addr_t addr, - size_t len) + size_t len, + void *pinning_ctx, + void (*ctx_get)(void *), + void (*ctx_put)(void *)) { struct sdma_desc *desc = &tx->descp[tx->num_desc]; @@ -613,7 +615,11 @@ static inline void make_tx_sdma_desc( << SDMA_DESC0_PHY_ADDR_SHIFT) | (((u64)len & SDMA_DESC0_BYTE_COUNT_MASK) << SDMA_DESC0_BYTE_COUNT_SHIFT); + desc->pinning_ctx = pinning_ctx; + desc->ctx_put = ctx_put; + if (pinning_ctx && ctx_get) + ctx_get(pinning_ctx); } /* helper to extend txreq */ @@ -645,18 +651,20 @@ static inline void _sdma_close_tx(struct hfi1_devdata *dd, static inline int _sdma_txadd_daddr( struct hfi1_devdata *dd, int type, - void *pinning_ctx, struct sdma_txreq *tx, dma_addr_t addr, - u16 len) + u16 len, + void *pinning_ctx, + void (*ctx_get)(void *), + void (*ctx_put)(void *)) { int rval = 0; make_tx_sdma_desc( tx, type, - pinning_ctx, - addr, len); + addr, len, + pinning_ctx, ctx_get, ctx_put); WARN_ON(len > tx->tlen); tx->num_desc++; tx->tlen -= len; @@ -676,11 +684,18 @@ static inline int _sdma_txadd_daddr( /** * sdma_txadd_page() - add a page to the sdma_txreq * @dd: the device to use for mapping - * @pinning_ctx: context to be released at descriptor retirement * @tx: tx request to which the page is added * @page: page to map * @offset: offset within the page * @len: length in bytes + * @pinning_ctx: context to be stored on struct sdma_desc .pinning_ctx. Not + * added if coalesce buffer is used. E.g. pointer to pinned-page + * cache entry for the sdma_desc. + * @ctx_get: optional function to take reference to @pinning_ctx. Not called if + * @pinning_ctx is NULL. + * @ctx_put: optional function to release reference to @pinning_ctx after + * sdma_desc completes. May be called in interrupt context so must + * not sleep. Not called if @pinning_ctx is NULL. * * This is used to add a page/offset/length descriptor. 
* @@ -692,11 +707,13 @@ static inline int _sdma_txadd_daddr( */ static inline int sdma_txadd_page( struct hfi1_devdata *dd, - void *pinning_ctx, struct sdma_txreq *tx, struct page *page, unsigned long offset, - u16 len) + u16 len, + void *pinning_ctx, + void (*ctx_get)(void *), + void (*ctx_put)(void *)) { dma_addr_t addr; int rval; @@ -720,7 +737,8 @@ static inline int sdma_txadd_page( return -ENOSPC; } - return _sdma_txadd_daddr(dd, SDMA_MAP_PAGE, pinning_ctx, tx, addr, len); + return _sdma_txadd_daddr(dd, SDMA_MAP_PAGE, tx, addr, len, + pinning_ctx, ctx_get, ctx_put); } /** @@ -754,8 +772,8 @@ static inline int sdma_txadd_daddr( return rval; } - return _sdma_txadd_daddr(dd, SDMA_MAP_NONE, NULL, tx, - addr, len); + return _sdma_txadd_daddr(dd, SDMA_MAP_NONE, tx, addr, len, + NULL, NULL, NULL); } /** @@ -801,7 +819,8 @@ static inline int sdma_txadd_kvaddr( return -ENOSPC; } - return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, NULL, tx, addr, len); + return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, tx, addr, len, + NULL, NULL, NULL); } struct iowait_work; @@ -1034,6 +1053,4 @@ u16 sdma_get_descq_cnt(void); extern uint mod_num_sdma; void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid); - -void system_descriptor_complete(struct hfi1_devdata *dd, struct sdma_desc *descp); #endif diff --git a/drivers/infiniband/hw/hfi1/sdma_txreq.h b/drivers/infiniband/hw/hfi1/sdma_txreq.h index fad946cb5e0d..85ae7293c274 100644 --- a/drivers/infiniband/hw/hfi1/sdma_txreq.h +++ b/drivers/infiniband/hw/hfi1/sdma_txreq.h @@ -20,6 +20,8 @@ struct sdma_desc { /* private: don't use directly */ u64 qw[2]; void *pinning_ctx; + /* Release reference to @pinning_ctx. May be called in interrupt context. Must not sleep. */ + void (*ctx_put)(void *ctx); }; /** diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index ae58b48afe07..02bd62b857b7 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -62,18 +62,14 @@ static int defer_packet_queue( static void activate_packet_queue(struct iowait *wait, int reason); static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, unsigned long len); -static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode); static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, void *arg2, bool *stop); static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode); -static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode); static struct mmu_rb_ops sdma_rb_ops = { .filter = sdma_rb_filter, - .insert = sdma_rb_insert, .evict = sdma_rb_evict, .remove = sdma_rb_remove, - .invalidate = sdma_rb_invalidate }; static int add_system_pages_to_sdma_packet(struct user_sdma_request *req, @@ -247,14 +243,14 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd, spin_unlock(&fd->pq_rcu_lock); synchronize_srcu(&fd->pq_srcu); /* at this point there can be no more new requests */ - if (pq->handler) - hfi1_mmu_rb_unregister(pq->handler); iowait_sdma_drain(&pq->busy); /* Wait until all requests have been freed. 
*/ wait_event_interruptible( pq->wait, !atomic_read(&pq->n_reqs)); kfree(pq->reqs); + if (pq->handler) + hfi1_mmu_rb_unregister(pq->handler); bitmap_free(pq->req_in_use); kmem_cache_destroy(pq->txreq_cache); flush_pq_iowait(pq); @@ -1275,25 +1271,17 @@ static void free_system_node(struct sdma_mmu_node *node) kfree(node); } -static inline void acquire_node(struct sdma_mmu_node *node) -{ - atomic_inc(&node->refcount); - WARN_ON(atomic_read(&node->refcount) < 0); -} - -static inline void release_node(struct mmu_rb_handler *handler, - struct sdma_mmu_node *node) -{ - atomic_dec(&node->refcount); - WARN_ON(atomic_read(&node->refcount) < 0); -} - +/* + * kref_get()'s an additional kref on the returned rb_node to prevent rb_node + * from being released until after rb_node is assigned to an SDMA descriptor + * (struct sdma_desc) under add_system_iovec_to_sdma_packet(), even if the + * virtual address range for rb_node is invalidated between now and then. + */ static struct sdma_mmu_node *find_system_node(struct mmu_rb_handler *handler, unsigned long start, unsigned long end) { struct mmu_rb_node *rb_node; - struct sdma_mmu_node *node; unsigned long flags; spin_lock_irqsave(&handler->lock, flags); @@ -1302,11 +1290,12 @@ static struct sdma_mmu_node *find_system_node(struct mmu_rb_handler *handler, spin_unlock_irqrestore(&handler->lock, flags); return NULL; } - node = container_of(rb_node, struct sdma_mmu_node, rb); - acquire_node(node); + + /* "safety" kref to prevent release before add_system_iovec_to_sdma_packet() */ + kref_get(&rb_node->refcount); spin_unlock_irqrestore(&handler->lock, flags); - return node; + return container_of(rb_node, struct sdma_mmu_node, rb); } static int pin_system_pages(struct user_sdma_request *req, @@ -1355,6 +1344,13 @@ retry: return 0; } +/* + * kref refcount on *node_p will be 2 on successful addition: one kref from + * kref_init() for mmu_rb_handler and one kref to prevent *node_p from being + * released until after *node_p is assigned to an SDMA descriptor (struct + * sdma_desc) under add_system_iovec_to_sdma_packet(), even if the virtual + * address range for *node_p is invalidated between now and then. + */ static int add_system_pinning(struct user_sdma_request *req, struct sdma_mmu_node **node_p, unsigned long start, unsigned long len) @@ -1368,6 +1364,12 @@ static int add_system_pinning(struct user_sdma_request *req, if (!node) return -ENOMEM; + /* First kref "moves" to mmu_rb_handler */ + kref_init(&node->rb.refcount); + + /* "safety" kref to prevent release before add_system_iovec_to_sdma_packet() */ + kref_get(&node->rb.refcount); + node->pq = pq; ret = pin_system_pages(req, start, len, node, PFN_DOWN(len)); if (ret == 0) { @@ -1431,15 +1433,15 @@ static int get_system_cache_entry(struct user_sdma_request *req, return 0; } - SDMA_DBG(req, "prepend: node->rb.addr %lx, node->refcount %d", - node->rb.addr, atomic_read(&node->refcount)); + SDMA_DBG(req, "prepend: node->rb.addr %lx, node->rb.refcount %d", + node->rb.addr, kref_read(&node->rb.refcount)); prepend_len = node->rb.addr - start; /* * This node will not be returned, instead a new node * will be. So release the reference. 
*/ - release_node(handler, node); + kref_put(&node->rb.refcount, hfi1_mmu_rb_release); /* Prepend a node to cover the beginning of the allocation */ ret = add_system_pinning(req, node_p, start, prepend_len); @@ -1451,6 +1453,20 @@ static int get_system_cache_entry(struct user_sdma_request *req, } } +static void sdma_mmu_rb_node_get(void *ctx) +{ + struct mmu_rb_node *node = ctx; + + kref_get(&node->refcount); +} + +static void sdma_mmu_rb_node_put(void *ctx) +{ + struct sdma_mmu_node *node = ctx; + + kref_put(&node->rb.refcount, hfi1_mmu_rb_release); +} + static int add_mapping_to_sdma_packet(struct user_sdma_request *req, struct user_sdma_txreq *tx, struct sdma_mmu_node *cache_entry, @@ -1494,9 +1510,12 @@ static int add_mapping_to_sdma_packet(struct user_sdma_request *req, ctx = cache_entry; } - ret = sdma_txadd_page(pq->dd, ctx, &tx->txreq, + ret = sdma_txadd_page(pq->dd, &tx->txreq, cache_entry->pages[page_index], - page_offset, from_this_page); + page_offset, from_this_page, + ctx, + sdma_mmu_rb_node_get, + sdma_mmu_rb_node_put); if (ret) { /* * When there's a failure, the entire request is freed by @@ -1518,8 +1537,6 @@ static int add_system_iovec_to_sdma_packet(struct user_sdma_request *req, struct user_sdma_iovec *iovec, size_t from_this_iovec) { - struct mmu_rb_handler *handler = req->pq->handler; - while (from_this_iovec > 0) { struct sdma_mmu_node *cache_entry; size_t from_this_cache_entry; @@ -1540,15 +1557,15 @@ static int add_system_iovec_to_sdma_packet(struct user_sdma_request *req, ret = add_mapping_to_sdma_packet(req, tx, cache_entry, start, from_this_cache_entry); + + /* + * Done adding cache_entry to zero or more sdma_desc. Can + * kref_put() the "safety" kref taken under + * get_system_cache_entry(). + */ + kref_put(&cache_entry->rb.refcount, hfi1_mmu_rb_release); + if (ret) { - /* - * We're guaranteed that there will be no descriptor - * completion callback that releases this node - * because only the last descriptor referencing it - * has a context attached, and a failure means the - * last descriptor was never added. - */ - release_node(handler, cache_entry); SDMA_DBG(req, "add system segment failed %d", ret); return ret; } @@ -1599,42 +1616,12 @@ static int add_system_pages_to_sdma_packet(struct user_sdma_request *req, return 0; } -void system_descriptor_complete(struct hfi1_devdata *dd, - struct sdma_desc *descp) -{ - switch (sdma_mapping_type(descp)) { - case SDMA_MAP_SINGLE: - dma_unmap_single(&dd->pcidev->dev, sdma_mapping_addr(descp), - sdma_mapping_len(descp), DMA_TO_DEVICE); - break; - case SDMA_MAP_PAGE: - dma_unmap_page(&dd->pcidev->dev, sdma_mapping_addr(descp), - sdma_mapping_len(descp), DMA_TO_DEVICE); - break; - } - - if (descp->pinning_ctx) { - struct sdma_mmu_node *node = descp->pinning_ctx; - - release_node(node->rb.handler, node); - } -} - static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, unsigned long len) { return (bool)(node->addr == addr); } -static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode) -{ - struct sdma_mmu_node *node = - container_of(mnode, struct sdma_mmu_node, rb); - - atomic_inc(&node->refcount); - return 0; -} - /* * Return 1 to remove the node from the rb tree and call the remove op. * @@ -1647,10 +1634,6 @@ static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, container_of(mnode, struct sdma_mmu_node, rb); struct evict_data *evict_data = evict_arg; - /* is this node still being used? 
*/ - if (atomic_read(&node->refcount)) - return 0; /* keep this node */ - /* this node will be evicted, add its pages to our count */ evict_data->cleared += node->npages; @@ -1668,13 +1651,3 @@ static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode) free_system_node(node); } - -static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode) -{ - struct sdma_mmu_node *node = - container_of(mnode, struct sdma_mmu_node, rb); - - if (!atomic_read(&node->refcount)) - return 1; - return 0; -} diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h index a241836371dc..548347d4c5bc 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.h +++ b/drivers/infiniband/hw/hfi1/user_sdma.h @@ -104,7 +104,6 @@ struct hfi1_user_sdma_comp_q { struct sdma_mmu_node { struct mmu_rb_node rb; struct hfi1_user_sdma_pkt_q *pq; - atomic_t refcount; struct page **pages; unsigned int npages; }; diff --git a/drivers/infiniband/hw/hfi1/vnic_sdma.c b/drivers/infiniband/hw/hfi1/vnic_sdma.c index 727eedfba332..cc6324d2d1dd 100644 --- a/drivers/infiniband/hw/hfi1/vnic_sdma.c +++ b/drivers/infiniband/hw/hfi1/vnic_sdma.c @@ -64,11 +64,11 @@ static noinline int build_vnic_ulp_payload(struct sdma_engine *sde, /* combine physically continuous fragments later? */ ret = sdma_txadd_page(sde->dd, - NULL, &tx->txreq, skb_frag_page(frag), skb_frag_off(frag), - skb_frag_size(frag)); + skb_frag_size(frag), + NULL, NULL, NULL); if (unlikely(ret)) goto bail_txadd; } From e236e2eae52e1e7fb0d38b8d921b04e527151b73 Mon Sep 17 00:00:00 2001 From: Brendan Cunningham Date: Fri, 19 May 2023 12:54:19 -0400 Subject: [PATCH 18/71] IB/hfi1: Add mmu_rb_node refcount to hfi1_mmu_rb_template tracepoints Add kref_read() of mmu_rb_node.refcount in hfi1_mmu_rb_template-type tracepoint output. Change hfi1_mmu_rb_template tracepoint to take a struct mmu_rb_node* and record the values it needs from that. This makes the trace_hfi1_mmu*() calls shorter and easier to read. Add hfi1_mmu_release_node() tracepoint before all mmu_rb_node->handler->ops->remove() calls. Make hfi1_mmu_rb_search() tracepoint its own tracepoint type separate from hfi1_mmu_rb_template since hfi1_mmu_rb_search() does not take a struct mmu_rb_node*. 
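
After this change a call site only hands the node to the tracepoint and the
shared event class pulls addr, len and the kref value out of it. Roughly
(condensed sketch of the resulting call pattern, not an additional hunk):

    trace_hfi1_mmu_rb_insert(mnode);

    /* emitted just before the node is handed back to ops->remove() */
    trace_hfi1_mmu_release_node(mnode);
    mnode->handler->ops->remove(mnode->handler->ops_arg, mnode);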
Link: https://lore.kernel.org/r/168451525987.3702129.12824880387615916700.stgit@awfm-02.cornelisnetworks.com Reviewed-by: Dean Luick Signed-off-by: Brendan Cunningham Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/mmu_rb.c | 7 ++-- drivers/infiniband/hw/hfi1/trace_mmu.h | 48 +++++++++++++++++++------- 2 files changed, 40 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index a864423c256d..7a51f7d73b61 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -124,7 +124,7 @@ int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler, unsigned long flags; int ret = 0; - trace_hfi1_mmu_rb_insert(mnode->addr, mnode->len); + trace_hfi1_mmu_rb_insert(mnode); if (current->mm != handler->mn.mm) return -EPERM; @@ -189,6 +189,7 @@ static void release_immediate(struct kref *refcount) { struct mmu_rb_node *mnode = container_of(refcount, struct mmu_rb_node, refcount); + trace_hfi1_mmu_release_node(mnode); mnode->handler->ops->remove(mnode->handler->ops_arg, mnode); } @@ -252,6 +253,7 @@ void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg) spin_unlock_irqrestore(&handler->lock, flags); list_for_each_entry_safe(rbnode, ptr, &del_list, list) { + trace_hfi1_mmu_rb_evict(rbnode); kref_put(&rbnode->refcount, release_immediate); } } @@ -271,7 +273,7 @@ static int mmu_notifier_range_start(struct mmu_notifier *mn, /* Guard against node removal. */ ptr = __mmu_int_rb_iter_next(node, range->start, range->end - 1); - trace_hfi1_mmu_mem_invalidate(node->addr, node->len); + trace_hfi1_mmu_mem_invalidate(node); /* Remove from rb tree and lru_list. */ __mmu_int_rb_remove(node, root); list_del_init(&node->list); @@ -304,6 +306,7 @@ static void handle_remove(struct work_struct *work) while (!list_empty(&del_list)) { node = list_first_entry(&del_list, struct mmu_rb_node, list); list_del(&node->list); + trace_hfi1_mmu_release_node(node); handler->ops->remove(handler->ops_arg, node); } } diff --git a/drivers/infiniband/hw/hfi1/trace_mmu.h b/drivers/infiniband/hw/hfi1/trace_mmu.h index 57900ebb7702..82cc12aa3fb8 100644 --- a/drivers/infiniband/hw/hfi1/trace_mmu.h +++ b/drivers/infiniband/hw/hfi1/trace_mmu.h @@ -15,31 +15,53 @@ #define TRACE_SYSTEM hfi1_mmu DECLARE_EVENT_CLASS(hfi1_mmu_rb_template, - TP_PROTO(unsigned long addr, unsigned long len), - TP_ARGS(addr, len), + TP_PROTO(struct mmu_rb_node *node), + TP_ARGS(node), TP_STRUCT__entry(__field(unsigned long, addr) __field(unsigned long, len) + __field(unsigned int, refcount) ), - TP_fast_assign(__entry->addr = addr; - __entry->len = len; + TP_fast_assign(__entry->addr = node->addr; + __entry->len = node->len; + __entry->refcount = kref_read(&node->refcount); ), - TP_printk("MMU node addr 0x%lx, len %lu", + TP_printk("MMU node addr 0x%lx, len %lu, refcount %u", __entry->addr, - __entry->len + __entry->len, + __entry->refcount ) ); DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_rb_insert, - TP_PROTO(unsigned long addr, unsigned long len), - TP_ARGS(addr, len)); + TP_PROTO(struct mmu_rb_node *node), + TP_ARGS(node)); -DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_rb_search, - TP_PROTO(unsigned long addr, unsigned long len), - TP_ARGS(addr, len)); +TRACE_EVENT(hfi1_mmu_rb_search, + TP_PROTO(unsigned long addr, unsigned long len), + TP_ARGS(addr, len), + TP_STRUCT__entry(__field(unsigned long, addr) + __field(unsigned long, len) + ), + TP_fast_assign(__entry->addr = addr; + __entry->len = len; + ), + 
TP_printk("MMU node addr 0x%lx, len %lu", + __entry->addr, + __entry->len + ) +); DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_mem_invalidate, - TP_PROTO(unsigned long addr, unsigned long len), - TP_ARGS(addr, len)); + TP_PROTO(struct mmu_rb_node *node), + TP_ARGS(node)); + +DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_rb_evict, + TP_PROTO(struct mmu_rb_node *node), + TP_ARGS(node)); + +DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_release_node, + TP_PROTO(struct mmu_rb_node *node), + TP_ARGS(node)); #endif /* __HFI1_TRACE_RC_H */ From 95ea2efbd66fb9e85238bd8d59341f8ce7a31065 Mon Sep 17 00:00:00 2001 From: Brendan Cunningham Date: Fri, 19 May 2023 12:54:25 -0400 Subject: [PATCH 19/71] IB/hfi1: Remove unused struct mmu_rb_ops fields .insert, .invalidate The struct mmu_rb_ops function pointers .insert, .invalidate were only used to increment and decrement struct sdma_mmu_node.refcount. With the deletion of struct sdma_mmu_node.refcount and the addition of struct mmu_rb_node.refcount these function pointers are not called and there are no implementations of them. So it is safe to delete these from struct mmu_rb_ops. Link: https://lore.kernel.org/r/168451526508.3702129.8677714753157495310.stgit@awfm-02.cornelisnetworks.com Signed-off-by: Brendan Cunningham Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/mmu_rb.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h index 82c505a04fc6..751dc3fe1e02 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.h +++ b/drivers/infiniband/hw/hfi1/mmu_rb.h @@ -19,16 +19,11 @@ struct mmu_rb_node { struct kref refcount; }; -/* - * NOTE: filter, insert, invalidate, and evict must not sleep. Only remove is - * allowed to sleep. - */ +/* filter and evict must not sleep. Only remove is allowed to sleep. */ struct mmu_rb_ops { bool (*filter)(struct mmu_rb_node *node, unsigned long addr, unsigned long len); - int (*insert)(void *ops_arg, struct mmu_rb_node *mnode); void (*remove)(void *ops_arg, struct mmu_rb_node *mnode); - int (*invalidate)(void *ops_arg, struct mmu_rb_node *node); int (*evict)(void *ops_arg, struct mmu_rb_node *mnode, void *evict_arg, bool *stop); }; From b9989ab3f61ec459cbaf0a492fea3168bbfa4c7a Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Tue, 23 May 2023 20:16:39 +0800 Subject: [PATCH 20/71] RDMA/hns: Remove unnecessary QP type checks It is not necessary to check the type of the queue on IO path because unsupported QP type cannot be created. 
Link: https://lore.kernel.org/r/20230523121641.3132102-2-huangjunxian6@hisilicon.com Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 24 +++------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 84f1167de1d9..eba5fdc10703 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -373,17 +373,10 @@ static int check_send_valid(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) { struct ib_device *ibdev = &hr_dev->ib_dev; - struct ib_qp *ibqp = &hr_qp->ibqp; - if (unlikely(ibqp->qp_type != IB_QPT_RC && - ibqp->qp_type != IB_QPT_GSI && - ibqp->qp_type != IB_QPT_UD)) { - ibdev_err(ibdev, "not supported QP(0x%x)type!\n", - ibqp->qp_type); - return -EOPNOTSUPP; - } else if (unlikely(hr_qp->state == IB_QPS_RESET || - hr_qp->state == IB_QPS_INIT || - hr_qp->state == IB_QPS_RTR)) { + if (unlikely(hr_qp->state == IB_QPS_RESET || + hr_qp->state == IB_QPS_INIT || + hr_qp->state == IB_QPS_RTR)) { ibdev_err(ibdev, "failed to post WQE, QP state %u!\n", hr_qp->state); return -EINVAL; @@ -771,17 +764,6 @@ out: static int check_recv_valid(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) { - struct ib_device *ibdev = &hr_dev->ib_dev; - struct ib_qp *ibqp = &hr_qp->ibqp; - - if (unlikely(ibqp->qp_type != IB_QPT_RC && - ibqp->qp_type != IB_QPT_GSI && - ibqp->qp_type != IB_QPT_UD)) { - ibdev_err(ibdev, "unsupported qp type, qp_type = %d.\n", - ibqp->qp_type); - return -EOPNOTSUPP; - } - if (unlikely(hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN)) return -EIO; From cf5b608fb0e369c473a8303cad6ddb386505e5b8 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Tue, 23 May 2023 20:16:40 +0800 Subject: [PATCH 21/71] RDMA/hns: Fix hns_roce_table_get return value The return value of set_hem has been fixed to ENODEV, which will lead a diagnostic information missing. Fixes: 9a4435375cd1 ("IB/hns: Add driver files for hns RoCE driver") Link: https://lore.kernel.org/r/20230523121641.3132102-3-huangjunxian6@hisilicon.com Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hem.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index aa8a08d1c014..f30274986c0d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -595,11 +595,12 @@ int hns_roce_table_get(struct hns_roce_dev *hr_dev, } /* Set HEM base address(128K/page, pa) to Hardware */ - if (hr_dev->hw->set_hem(hr_dev, table, obj, HEM_HOP_STEP_DIRECT)) { + ret = hr_dev->hw->set_hem(hr_dev, table, obj, HEM_HOP_STEP_DIRECT); + if (ret) { hns_roce_free_hem(hr_dev, table->hem[i]); table->hem[i] = NULL; - ret = -ENODEV; - dev_err(dev, "set HEM base address to HW failed.\n"); + dev_err(dev, "set HEM base address to HW failed, ret = %d.\n", + ret); goto out; } From a519a612a71848b69b70b18b4d14d165b2d8aaf7 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Tue, 23 May 2023 20:16:41 +0800 Subject: [PATCH 22/71] RDMA/hns: Add clear_hem return value to log Log return value of clear_hem() to help diagnose. 
Link: https://lore.kernel.org/r/20230523121641.3132102-4-huangjunxian6@hisilicon.com Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hem.c | 42 ++++++++++++++++-------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index f30274986c0d..47c0efed1821 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -619,6 +619,7 @@ static void clear_mhop_hem(struct hns_roce_dev *hr_dev, u32 hop_num = mhop->hop_num; u32 chunk_ba_num; u32 step_idx; + int ret; index->inited = HEM_INDEX_BUF; chunk_ba_num = mhop->bt_chunk_size / BA_BYTE_LEN; @@ -642,16 +643,24 @@ static void clear_mhop_hem(struct hns_roce_dev *hr_dev, else step_idx = hop_num; - if (hr_dev->hw->clear_hem(hr_dev, table, obj, step_idx)) - ibdev_warn(ibdev, "failed to clear hop%u HEM.\n", hop_num); + ret = hr_dev->hw->clear_hem(hr_dev, table, obj, step_idx); + if (ret) + ibdev_warn(ibdev, "failed to clear hop%u HEM, ret = %d.\n", + hop_num, ret); - if (index->inited & HEM_INDEX_L1) - if (hr_dev->hw->clear_hem(hr_dev, table, obj, 1)) - ibdev_warn(ibdev, "failed to clear HEM step 1.\n"); + if (index->inited & HEM_INDEX_L1) { + ret = hr_dev->hw->clear_hem(hr_dev, table, obj, 1); + if (ret) + ibdev_warn(ibdev, "failed to clear HEM step 1, ret = %d.\n", + ret); + } - if (index->inited & HEM_INDEX_L0) - if (hr_dev->hw->clear_hem(hr_dev, table, obj, 0)) - ibdev_warn(ibdev, "failed to clear HEM step 0.\n"); + if (index->inited & HEM_INDEX_L0) { + ret = hr_dev->hw->clear_hem(hr_dev, table, obj, 0); + if (ret) + ibdev_warn(ibdev, "failed to clear HEM step 0, ret = %d.\n", + ret); + } } } @@ -688,6 +697,7 @@ void hns_roce_table_put(struct hns_roce_dev *hr_dev, { struct device *dev = hr_dev->dev; unsigned long i; + int ret; if (hns_roce_check_whether_mhop(hr_dev, table->type)) { hns_roce_table_mhop_put(hr_dev, table, obj, 1); @@ -700,8 +710,10 @@ void hns_roce_table_put(struct hns_roce_dev *hr_dev, &table->mutex)) return; - if (hr_dev->hw->clear_hem(hr_dev, table, obj, HEM_HOP_STEP_DIRECT)) - dev_warn(dev, "failed to clear HEM base address.\n"); + ret = hr_dev->hw->clear_hem(hr_dev, table, obj, HEM_HOP_STEP_DIRECT); + if (ret) + dev_warn(dev, "failed to clear HEM base address, ret = %d.\n", + ret); hns_roce_free_hem(hr_dev, table->hem[i]); table->hem[i] = NULL; @@ -917,6 +929,8 @@ void hns_roce_cleanup_hem_table(struct hns_roce_dev *hr_dev, { struct device *dev = hr_dev->dev; unsigned long i; + int obj; + int ret; if (hns_roce_check_whether_mhop(hr_dev, table->type)) { hns_roce_cleanup_mhop_hem_table(hr_dev, table); @@ -925,9 +939,11 @@ void hns_roce_cleanup_hem_table(struct hns_roce_dev *hr_dev, for (i = 0; i < table->num_hem; ++i) if (table->hem[i]) { - if (hr_dev->hw->clear_hem(hr_dev, table, - i * table->table_chunk_size / table->obj_size, 0)) - dev_err(dev, "clear HEM base address failed.\n"); + obj = i * table->table_chunk_size / table->obj_size; + ret = hr_dev->hw->clear_hem(hr_dev, table, obj, 0); + if (ret) + dev_err(dev, "clear HEM base address failed, ret = %d.\n", + ret); hns_roce_free_hem(hr_dev, table->hem[i]); } From d11442c6bde7aad0948b39c3e47f375fed98c776 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 30 May 2023 17:13:30 -0500 Subject: [PATCH 23/71] RDMA/rxe: Rename IB_ACCESS_REMOTE Rename IB_ACCESS_REMOTE to RXE_ACCESS_REMOTE and move to rxe_verbs.h as an enum instead of a #define. 
Shouldn't use IB_xxx for rxe symbols. Link: https://lore.kernel.org/r/20230530221334.89432-2-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mr.c | 10 +++------- drivers/infiniband/sw/rxe/rxe_verbs.h | 6 ++++++ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 0e538fafcc20..b3bc4ac5fedd 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -45,14 +45,10 @@ int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) } } -#define IB_ACCESS_REMOTE (IB_ACCESS_REMOTE_READ \ - | IB_ACCESS_REMOTE_WRITE \ - | IB_ACCESS_REMOTE_ATOMIC) - static void rxe_mr_init(int access, struct rxe_mr *mr) { u32 lkey = mr->elem.index << 8 | rxe_get_next_key(-1); - u32 rkey = (access & IB_ACCESS_REMOTE) ? lkey : 0; + u32 rkey = (access & RXE_ACCESS_REMOTE) ? lkey : 0; /* set ibmr->l/rkey and also copy into private l/rkey * for user MRs these will always be the same @@ -195,7 +191,7 @@ int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr) int err; /* always allow remote access for FMRs */ - rxe_mr_init(IB_ACCESS_REMOTE, mr); + rxe_mr_init(RXE_ACCESS_REMOTE, mr); err = rxe_mr_alloc(mr, max_pages); if (err) @@ -715,7 +711,7 @@ int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe) mr->access = access; mr->lkey = key; - mr->rkey = (access & IB_ACCESS_REMOTE) ? key : 0; + mr->rkey = (access & RXE_ACCESS_REMOTE) ? key : 0; mr->ibmr.iova = wqe->wr.wr.reg.mr->iova; mr->state = RXE_MR_STATE_VALID; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 26a20f088692..0a2b7343e38f 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -253,6 +253,12 @@ struct rxe_qp { struct execute_work cleanup_work; }; +enum rxe_access { + RXE_ACCESS_REMOTE = (IB_ACCESS_REMOTE_READ + | IB_ACCESS_REMOTE_WRITE + | IB_ACCESS_REMOTE_ATOMIC), +}; + enum rxe_mr_state { RXE_MR_STATE_INVALID, RXE_MR_STATE_FREE, From 2a129958bdf045ba5eec20d3b6a1c20614ea05a7 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 30 May 2023 17:13:31 -0500 Subject: [PATCH 24/71] RDMA//rxe: Optimize send path in rxe_resp.c Bypass calling check_rkey() in rxe_resp.c for non-rdma messages. 
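
Roughly, the end of rxe_resp_check_length() becomes (condensed sketch of the
change below):

    if (pkt->mask & RXE_RDMA_OP_MASK)   /* READ/WRITE/ATOMIC/FLUSH/ATOMIC WRITE */
        return RESPST_CHK_RKEY;
    else                                /* plain SEND: no rkey to validate */
        return RESPST_EXECUTE;

Plain send packets therefore skip check_rkey() entirely.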
Link: https://lore.kernel.org/r/20230530221334.89432-3-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_opcode.h | 3 +++ drivers/infiniband/sw/rxe/rxe_resp.c | 12 ++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.h b/drivers/infiniband/sw/rxe/rxe_opcode.h index cea4e0a63919..5686b691d6b8 100644 --- a/drivers/infiniband/sw/rxe/rxe_opcode.h +++ b/drivers/infiniband/sw/rxe/rxe_opcode.h @@ -91,6 +91,9 @@ enum rxe_hdr_mask { RXE_READ_OR_ATOMIC_MASK = (RXE_READ_MASK | RXE_ATOMIC_MASK), RXE_WRITE_OR_SEND_MASK = (RXE_WRITE_MASK | RXE_SEND_MASK), RXE_READ_OR_WRITE_MASK = (RXE_READ_MASK | RXE_WRITE_MASK), + RXE_RDMA_OP_MASK = (RXE_READ_MASK | RXE_WRITE_MASK | + RXE_ATOMIC_WRITE_MASK | RXE_FLUSH_MASK | + RXE_ATOMIC_MASK), }; #define OPCODE_NONE (-1) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index b92c41cdb620..07299205242e 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -387,7 +387,10 @@ static enum resp_states rxe_resp_check_length(struct rxe_qp *qp, } } - return RESPST_CHK_RKEY; + if (pkt->mask & RXE_RDMA_OP_MASK) + return RESPST_CHK_RKEY; + else + return RESPST_EXECUTE; } /* if the reth length field is zero we can assume nothing @@ -434,6 +437,10 @@ static enum resp_states check_rkey(struct rxe_qp *qp, enum resp_states state; int access = 0; + /* parse RETH or ATMETH header for first/only packets + * for va, length, rkey, etc. or use current value for + * middle/last packets. + */ if (pkt->mask & (RXE_READ_OR_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) { if (pkt->mask & RXE_RETH_MASK) qp_resp_from_reth(qp, pkt); @@ -454,7 +461,8 @@ static enum resp_states check_rkey(struct rxe_qp *qp, qp_resp_from_atmeth(qp, pkt); access = IB_ACCESS_REMOTE_ATOMIC; } else { - return RESPST_EXECUTE; + /* shouldn't happen */ + WARN_ON(1); } /* A zero-byte read or write op is not required to From 425e1c9018fdf25cb4531606cc92d9d01a55534f Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 30 May 2023 17:13:32 -0500 Subject: [PATCH 25/71] RDMA/rxe: Fix access checks in rxe_check_bind_mw The subroutine rxe_check_bind_mw() in rxe_mw.c performs checks on the mw access flags before they are set so they always succeed. This patch instead checks the access flags passed in the send wqe. 
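
Condensed sketch of the corrected flow (illustrative only): the flags are
taken from the WQE before the checks run and are only copied into mw->access
once the bind has been validated.

    int access = wqe->wr.wr.mw.access;          /* check what was requested... */

    ret = rxe_check_bind_mw(qp, wqe, mw, mr, access);
    if (!ret)
        rxe_do_bind_mw(qp, wqe, mw, mr, access); /* ...then commit it */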
Fixes: 32a577b4c3a9 ("RDMA/rxe: Add support for bind MW work requests") Link: https://lore.kernel.org/r/20230530221334.89432-4-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mw.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mw.c b/drivers/infiniband/sw/rxe/rxe_mw.c index afa5ce1a7116..a7ec57ab8fad 100644 --- a/drivers/infiniband/sw/rxe/rxe_mw.c +++ b/drivers/infiniband/sw/rxe/rxe_mw.c @@ -48,7 +48,7 @@ int rxe_dealloc_mw(struct ib_mw *ibmw) } static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, - struct rxe_mw *mw, struct rxe_mr *mr) + struct rxe_mw *mw, struct rxe_mr *mr, int access) { if (mw->ibmw.type == IB_MW_TYPE_1) { if (unlikely(mw->state != RXE_MW_STATE_VALID)) { @@ -58,7 +58,7 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, } /* o10-36.2.2 */ - if (unlikely((mw->access & IB_ZERO_BASED))) { + if (unlikely((access & IB_ZERO_BASED))) { rxe_dbg_mw(mw, "attempt to bind a zero based type 1 MW\n"); return -EINVAL; } @@ -104,7 +104,7 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, } /* C10-74 */ - if (unlikely((mw->access & + if (unlikely((access & (IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC)) && !(mr->access & IB_ACCESS_LOCAL_WRITE))) { rxe_dbg_mw(mw, @@ -113,7 +113,7 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, } /* C10-75 */ - if (mw->access & IB_ZERO_BASED) { + if (access & IB_ZERO_BASED) { if (unlikely(wqe->wr.wr.mw.length > mr->ibmr.length)) { rxe_dbg_mw(mw, "attempt to bind a ZB MW outside of the MR\n"); @@ -133,12 +133,12 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, } static void rxe_do_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, - struct rxe_mw *mw, struct rxe_mr *mr) + struct rxe_mw *mw, struct rxe_mr *mr, int access) { u32 key = wqe->wr.wr.mw.rkey & 0xff; mw->rkey = (mw->rkey & ~0xff) | key; - mw->access = wqe->wr.wr.mw.access; + mw->access = access; mw->state = RXE_MW_STATE_VALID; mw->addr = wqe->wr.wr.mw.addr; mw->length = wqe->wr.wr.mw.length; @@ -169,6 +169,7 @@ int rxe_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe) struct rxe_dev *rxe = to_rdev(qp->ibqp.device); u32 mw_rkey = wqe->wr.wr.mw.mw_rkey; u32 mr_lkey = wqe->wr.wr.mw.mr_lkey; + int access = wqe->wr.wr.mw.access; mw = rxe_pool_get_index(&rxe->mw_pool, mw_rkey >> 8); if (unlikely(!mw)) { @@ -198,11 +199,11 @@ int rxe_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe) spin_lock_bh(&mw->lock); - ret = rxe_check_bind_mw(qp, wqe, mw, mr); + ret = rxe_check_bind_mw(qp, wqe, mw, mr, access); if (ret) goto err_unlock; - rxe_do_bind_mw(qp, wqe, mw, mr); + rxe_do_bind_mw(qp, wqe, mw, mr, access); err_unlock: spin_unlock_bh(&mw->lock); err_drop_mr: From 02ed253770fb27380c51af8e7e0903e6905469b6 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 30 May 2023 17:13:33 -0500 Subject: [PATCH 26/71] RDMA/rxe: Introduce rxe access supported flags Introduce supported bit masks for setting the access attributes of MWs, MRs, and QPs. Check these when attributes are set. 
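
The checks themselves are one-liners; for example the user MR registration
path (sketched from the hunk below) rejects any bit outside the supported
mask:

    if (access & ~RXE_ACCESS_SUPPORTED_MR) {
        rxe_err_pd(pd, "access = %#x not supported (%#x)", access,
                   RXE_ACCESS_SUPPORTED_MR);
        return ERR_PTR(-EOPNOTSUPP);
    }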
Link: https://lore.kernel.org/r/20230530221334.89432-5-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mw.c | 5 +++++ drivers/infiniband/sw/rxe/rxe_qp.c | 7 +++++++ drivers/infiniband/sw/rxe/rxe_verbs.c | 6 ++++++ drivers/infiniband/sw/rxe/rxe_verbs.h | 15 ++++++++++++--- 4 files changed, 30 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mw.c b/drivers/infiniband/sw/rxe/rxe_mw.c index a7ec57ab8fad..d8a43d87de93 100644 --- a/drivers/infiniband/sw/rxe/rxe_mw.c +++ b/drivers/infiniband/sw/rxe/rxe_mw.c @@ -197,6 +197,11 @@ int rxe_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe) mr = NULL; } + if (access & ~RXE_ACCESS_SUPPORTED_MW) { + rxe_err_mw(mw, "access %#x not supported", access); + return -EOPNOTSUPP; + } + spin_lock_bh(&mw->lock); ret = rxe_check_bind_mw(qp, wqe, mw, mr, access); diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index c5451a4488ca..95d4a6760c33 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -392,6 +392,13 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, if (mask & IB_QP_CAP && rxe_qp_chk_cap(rxe, &attr->cap, !!qp->srq)) goto err1; + if (mask & IB_QP_ACCESS_FLAGS) { + if (!(qp_type(qp) == IB_QPT_RC || qp_type(qp) == IB_QPT_UC)) + goto err1; + if (attr->qp_access_flags & ~RXE_ACCESS_SUPPORTED_QP) + goto err1; + } + if (mask & IB_QP_AV && rxe_av_chk_attr(qp, &attr->ah_attr)) goto err1; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index dea605b7f683..bb2b9d40e242 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1260,6 +1260,12 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start, struct rxe_mr *mr; int err, cleanup_err; + if (access & ~RXE_ACCESS_SUPPORTED_MR) { + rxe_err_pd(pd, "access = %#x not supported (%#x)", access, + RXE_ACCESS_SUPPORTED_MR); + return ERR_PTR(-EOPNOTSUPP); + } + mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 0a2b7343e38f..2f2dc67f03dd 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -253,10 +253,19 @@ struct rxe_qp { struct execute_work cleanup_work; }; -enum rxe_access { - RXE_ACCESS_REMOTE = (IB_ACCESS_REMOTE_READ +enum { + RXE_ACCESS_REMOTE = IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE - | IB_ACCESS_REMOTE_ATOMIC), + | IB_ACCESS_REMOTE_ATOMIC, + RXE_ACCESS_SUPPORTED_MR = RXE_ACCESS_REMOTE + | IB_ACCESS_LOCAL_WRITE + | IB_ACCESS_MW_BIND + | IB_ACCESS_ON_DEMAND + | IB_ACCESS_FLUSH_GLOBAL + | IB_ACCESS_FLUSH_PERSISTENT, + RXE_ACCESS_SUPPORTED_QP = RXE_ACCESS_SUPPORTED_MR, + RXE_ACCESS_SUPPORTED_MW = RXE_ACCESS_SUPPORTED_MR + | IB_ZERO_BASED, }; enum rxe_mr_state { From 86a3fb55bc4fb2ea0c276d56f03335574a694ee4 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 30 May 2023 17:13:34 -0500 Subject: [PATCH 27/71] RDMA/rxe: Let rkey == lkey for local access In order to conform to other drivers stop using rkey == 0 as an indication that there are no remote access flags set. Set rkey == lkey by default for all MRs. 
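
Since a non-zero rkey no longer implies remote access, consumers must look at
mr->access instead. A minimal sketch of the invalidate path after this change:

    int remote = mr->access & RXE_ACCESS_REMOTE;

    /* with rkey == lkey, the access flags decide which key to match */
    if (remote ? (key != mr->rkey) : (key != mr->lkey))
        return -EINVAL;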
Link: https://lore.kernel.org/r/20230530221334.89432-6-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mr.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index b3bc4ac5fedd..f54042e9aeb2 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -47,16 +47,15 @@ int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) static void rxe_mr_init(int access, struct rxe_mr *mr) { - u32 lkey = mr->elem.index << 8 | rxe_get_next_key(-1); - u32 rkey = (access & RXE_ACCESS_REMOTE) ? lkey : 0; + u32 key = mr->elem.index << 8 | rxe_get_next_key(-1); /* set ibmr->l/rkey and also copy into private l/rkey * for user MRs these will always be the same * for cases where caller 'owns' the key portion * they may be different until REG_MR WQE is executed. */ - mr->lkey = mr->ibmr.lkey = lkey; - mr->rkey = mr->ibmr.rkey = rkey; + mr->lkey = mr->ibmr.lkey = key; + mr->rkey = mr->ibmr.rkey = key; mr->access = access; mr->ibmr.page_size = PAGE_SIZE; @@ -640,6 +639,7 @@ int rxe_invalidate_mr(struct rxe_qp *qp, u32 key) { struct rxe_dev *rxe = to_rdev(qp->ibqp.device); struct rxe_mr *mr; + int remote; int ret; mr = rxe_pool_get_index(&rxe->mr_pool, key >> 8); @@ -649,9 +649,10 @@ int rxe_invalidate_mr(struct rxe_qp *qp, u32 key) goto err; } - if (mr->rkey ? (key != mr->rkey) : (key != mr->lkey)) { + remote = mr->access & RXE_ACCESS_REMOTE; + if (remote ? (key != mr->rkey) : (key != mr->lkey)) { rxe_dbg_mr(mr, "wr key (%#x) doesn't match mr key (%#x)\n", - key, (mr->rkey ? mr->rkey : mr->lkey)); + key, (remote ? mr->rkey : mr->lkey)); ret = -EINVAL; goto err_drop_ref; } @@ -711,7 +712,7 @@ int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe) mr->access = access; mr->lkey = key; - mr->rkey = (access & RXE_ACCESS_REMOTE) ? key : 0; + mr->rkey = key; mr->ibmr.iova = wqe->wr.wr.reg.mr->iova; mr->state = RXE_MR_STATE_VALID; From 544c7f62cf32db2bd358f1e8a40a98bf98fa2a5c Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 30 May 2023 17:13:35 -0500 Subject: [PATCH 28/71] RDMA/rxe: Implement rereg_user_mr Implement the two easy cases of ib_rereg_user_mr. 
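
The two supported cases are IB_MR_REREG_PD and IB_MR_REREG_ACCESS; a request
that includes IB_MR_REREG_TRANS is still rejected with -EOPNOTSUPP. From user
space this corresponds roughly to the following libibverbs call (hedged
example; mr and new_pd are placeholders):

    ret = ibv_rereg_mr(mr, IBV_REREG_MR_CHANGE_PD | IBV_REREG_MR_CHANGE_ACCESS,
                       new_pd, NULL, 0, IBV_ACCESS_LOCAL_WRITE);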
Link: https://lore.kernel.org/r/20230530221334.89432-7-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_verbs.c | 35 +++++++++++++++++++++++++++ drivers/infiniband/sw/rxe/rxe_verbs.h | 5 ++++ 2 files changed, 40 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index bb2b9d40e242..f6396333bcef 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1299,6 +1299,40 @@ err_free: return ERR_PTR(err); } +static struct ib_mr *rxe_rereg_user_mr(struct ib_mr *ibmr, int flags, + u64 start, u64 length, u64 iova, + int access, struct ib_pd *ibpd, + struct ib_udata *udata) +{ + struct rxe_mr *mr = to_rmr(ibmr); + struct rxe_pd *old_pd = to_rpd(ibmr->pd); + struct rxe_pd *pd = to_rpd(ibpd); + + /* for now only support the two easy cases: + * rereg_pd and rereg_access + */ + if (flags & ~RXE_MR_REREG_SUPPORTED) { + rxe_err_mr(mr, "flags = %#x not supported", flags); + return ERR_PTR(-EOPNOTSUPP); + } + + if (flags & IB_MR_REREG_PD) { + rxe_put(old_pd); + rxe_get(pd); + mr->ibmr.pd = ibpd; + } + + if (flags & IB_MR_REREG_ACCESS) { + if (access & ~RXE_ACCESS_SUPPORTED_MR) { + rxe_err_mr(mr, "access = %#x not supported", access); + return ERR_PTR(-EOPNOTSUPP); + } + mr->access = access; + } + + return NULL; +} + static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, u32 max_num_sg) { @@ -1451,6 +1485,7 @@ static const struct ib_device_ops rxe_dev_ops = { .query_srq = rxe_query_srq, .reg_user_mr = rxe_reg_user_mr, .req_notify_cq = rxe_req_notify_cq, + .rereg_user_mr = rxe_rereg_user_mr, .resize_cq = rxe_resize_cq, INIT_RDMA_OBJ_SIZE(ib_ah, rxe_ah, ibah), diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 2f2dc67f03dd..cb18b83b73c1 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -284,6 +284,11 @@ enum rxe_mr_lookup_type { RXE_LOOKUP_REMOTE, }; +enum rxe_rereg { + RXE_MR_REREG_SUPPORTED = IB_MR_REREG_PD + | IB_MR_REREG_ACCESS, +}; + static inline int rkey_is_mw(u32 rkey) { u32 index = rkey >> 8; From c3e1bf626eb3c53ea60a0c64b441fb3015e1652e Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 2 Jun 2023 11:42:29 -0500 Subject: [PATCH 29/71] RDMA/rxe: Send last wqe reached event on qp cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The IBA requires: o11-5.2.5: If the HCA supports SRQ, for RC and UD service, the CI shall generate a Last WQE Reached Affiliated Asynchronous Event on a QP that is in the Error State and is associated with an SRQ when either: • a CQE is generated for the last WQE, or • the QP gets in the Error State and there are no more WQEs on the RQ. This patch implements this behavior in flush_recv_queue() which is called as a result of rxe_qp_error() being called whenever the qp is put into the error state. The rxe responder executes SRQ WQEs directly from the SRQ so there are never more WQES on the RQ. 
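
Condensed from the hunk below, the SRQ branch of flush_recv_queue() now raises
the affiliated asynchronous event instead of returning silently (sketch):

    if (qp->srq) {
        if (notify && qp->ibqp.event_handler) {
            struct ib_event ev = {
                .device     = qp->ibqp.device,
                .element.qp = &qp->ibqp,
                .event      = IB_EVENT_QP_LAST_WQE_REACHED,
            };

            qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
        }
        return;
    }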
Link: https://lore.kernel.org/r/20230602164229.9277-1-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_resp.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 07299205242e..8a3c9c2c5a2d 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -1455,8 +1455,17 @@ static void flush_recv_queue(struct rxe_qp *qp, bool notify) struct rxe_recv_wqe *wqe; int err; - if (qp->srq) + if (qp->srq) { + if (notify && qp->ibqp.event_handler) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } return; + } while ((wqe = queue_head(q, q->type))) { if (notify) { From 7ad697cdd31b9d5e59e25b035bad4cdd8e76aca1 Mon Sep 17 00:00:00 2001 From: Bryan Tan Date: Mon, 5 Jun 2023 11:37:28 -0700 Subject: [PATCH 30/71] RDMA/vmw_pvrdma: Remove unnecessary check on wr->opcode wr->opcode is unsigned; checking if it is negative is unnecessary. Fix this issue by removing the check. Fixes: 29c8d9eba550 ("IB: Add vmw_pvrdma driver") Link: https://lore.kernel.org/r/20230605183728.47021-1-bryantan@vmware.com Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/07c48eee-0aca-48ee-897a-38588c341c41@kili.mountain Signed-off-by: Bryan Tan Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c index f83cd4a9d992..98b2a0090bf2 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c @@ -709,14 +709,6 @@ int pvrdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, goto out; } - if (unlikely(wr->opcode < 0)) { - dev_warn_ratelimited(&dev->pdev->dev, - "invalid send opcode\n"); - *bad_wr = wr; - ret = -EINVAL; - goto out; - } - /* * Only support UD, RC. * Need to check opcode table for thorough checking. From c023b61ac8285dc6b2b2f275bf9d97cfd36b56fb Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Mon, 5 Jun 2023 13:14:04 +0300 Subject: [PATCH 31/71] net/mlx5: Nullify qp->dbg pointer post destruction Nullifying qp->dbg is a preparation for the next patches from the series in which mlx5_core_destroy_qp() could actually fail, and then it can be called again which causes a kernel crash, since qp->dbg was not nullified in previous call. 
Signed-off-by: Patrisious Haddad Link: https://lore.kernel.org/r/1677e52bb642fd8d6062d73a5aa69083c0283dc9.1685953497.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/debugfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index bb95b40d25eb..b08b5695ee45 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -513,11 +513,11 @@ EXPORT_SYMBOL(mlx5_debug_qp_add); void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp) { - if (!mlx5_debugfs_root) + if (!mlx5_debugfs_root || !qp->dbg) return; - if (qp->dbg) - rem_res_tree(qp->dbg); + rem_res_tree(qp->dbg); + qp->dbg = NULL; } EXPORT_SYMBOL(mlx5_debug_qp_remove); From 2ecfd946169e7f56534db2a5f6935858be3005ba Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 5 Jun 2023 13:14:05 +0300 Subject: [PATCH 32/71] RDMA/mlx5: Reduce QP table exposure driver.h is common header to whole mlx5 code base, but struct mlx5_qp_table is used in mlx5_ib driver only. So move that struct to be under sole responsibility of mlx5_ib. Link: https://lore.kernel.org/r/bec0dc1158e795813b135d1143147977f26bf668.1685953497.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 + drivers/infiniband/hw/mlx5/qp.h | 11 ++++++++++- include/linux/mlx5/driver.h | 9 --------- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index efa4dc6e7dee..4bceef878c43 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -25,6 +25,7 @@ #include #include "srq.h" +#include "qp.h" #define mlx5_ib_dbg(_dev, format, arg...) \ dev_dbg(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__, \ diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h index 77f9b4a54816..f5130873dd52 100644 --- a/drivers/infiniband/hw/mlx5/qp.h +++ b/drivers/infiniband/hw/mlx5/qp.h @@ -6,7 +6,16 @@ #ifndef _MLX5_IB_QP_H #define _MLX5_IB_QP_H -#include "mlx5_ib.h" +struct mlx5_ib_dev; + +struct mlx5_qp_table { + struct notifier_block nb; + + /* protect radix tree + */ + spinlock_t lock; + struct radix_tree_root tree; +}; int mlx5_init_qp_table(struct mlx5_ib_dev *dev); void mlx5_cleanup_qp_table(struct mlx5_ib_dev *dev); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index a4c4f737f9c1..e3b616388b18 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -443,15 +443,6 @@ struct mlx5_core_health { struct delayed_work update_fw_log_ts_work; }; -struct mlx5_qp_table { - struct notifier_block nb; - - /* protect radix tree - */ - spinlock_t lock; - struct radix_tree_root tree; -}; - enum { MLX5_PF_NOTIFY_DISABLE_VF, MLX5_PF_NOTIFY_ENABLE_VF, From afff24899846ffca0c25c75b24893c90aef82603 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Mon, 5 Jun 2023 13:14:06 +0300 Subject: [PATCH 33/71] RDMA/mlx5: Handle DCT QP logic separately from low level QP interface Previously when destroying a DCT, if the firmware function for the destruction failed, the common resource would have been destroyed either way, since it was destroyed before the firmware object. Which leads to kernel warning "refcount_t: underflow" which indicates possible use-after-free. 
Which is triggered when we try to destroy the common resource for the second time and execute refcount_dec_and_test(&common->refcount). So, let's fix the destruction order by factoring out the DCT QP logic to be in separate XArray database. refcount_t: underflow; use-after-free. WARNING: CPU: 8 PID: 1002 at lib/refcount.c:28 refcount_warn_saturate+0xd8/0xe0 Modules linked in: xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_ib ib_uverbs ib_core overlay mlx5_core fuse CPU: 8 PID: 1002 Comm: python3 Not tainted 5.16.0-rc5+ #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:refcount_warn_saturate+0xd8/0xe0 Code: ff 48 c7 c7 18 f5 23 82 c6 05 60 70 ff 00 01 e8 d0 0a 45 00 0f 0b c3 48 c7 c7 c0 f4 23 82 c6 05 4c 70 ff 00 01 e8 ba 0a 45 00 <0f> 0b c3 0f 1f 44 00 00 8b 07 3d 00 00 00 c0 74 12 83 f8 01 74 13 RSP: 0018:ffff8881221d3aa8 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff8881313e8d40 RCX: ffff88852cc1b5c8 RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffff88852cc1b5c0 RBP: ffff888100f70000 R08: ffff88853ffd1ba8 R09: 0000000000000003 R10: 00000000fffff000 R11: 3fffffffffffffff R12: 0000000000000246 R13: ffff888100f71fa0 R14: ffff8881221d3c68 R15: 0000000000000020 FS: 00007efebbb13740(0000) GS:ffff88852cc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005611aac29f80 CR3: 00000001313de004 CR4: 0000000000370ea0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: destroy_resource_common+0x6e/0x95 [mlx5_ib] mlx5_core_destroy_rq_tracked+0x38/0xbe [mlx5_ib] mlx5_ib_destroy_wq+0x22/0x80 [mlx5_ib] ib_destroy_wq_user+0x1f/0x40 [ib_core] uverbs_free_wq+0x19/0x40 [ib_uverbs] destroy_hw_idr_uobject+0x18/0x50 [ib_uverbs] uverbs_destroy_uobject+0x2f/0x190 [ib_uverbs] uobj_destroy+0x3c/0x80 [ib_uverbs] ib_uverbs_cmd_verbs+0x3e4/0xb80 [ib_uverbs] ? uverbs_free_wq+0x40/0x40 [ib_uverbs] ? ip_list_rcv+0xf7/0x120 ? netif_receive_skb_list_internal+0x1b6/0x2d0 ? task_tick_fair+0xbf/0x450 ? __handle_mm_fault+0x11fc/0x1450 ib_uverbs_ioctl+0xa4/0x110 [ib_uverbs] __x64_sys_ioctl+0x3e4/0x8e0 ? 
handle_mm_fault+0xb9/0x210 do_syscall_64+0x3d/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7efebc0be17b Code: 0f 1e fa 48 8b 05 1d ad 0c 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ed ac 0c 00 f7 d8 64 89 01 48 RSP: 002b:00007ffe71813e78 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007ffe71813fb8 RCX: 00007efebc0be17b RDX: 00007ffe71813fa0 RSI: 00000000c0181b01 RDI: 0000000000000005 RBP: 00007ffe71813f80 R08: 00005611aae96020 R09: 000000000000004f R10: 00007efebbf9ffa0 R11: 0000000000000246 R12: 00007ffe71813f80 R13: 00007ffe71813f4c R14: 00005611aae2eca0 R15: 00007efeae6c89d0 Signed-off-by: Patrisious Haddad Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4470888466c8a898edc9833286967529cc5f3c0d.1685953497.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/qp.h | 1 + drivers/infiniband/hw/mlx5/qpc.c | 83 +++++++++++++++++++------------- include/linux/mlx5/driver.h | 1 - 3 files changed, 51 insertions(+), 34 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h index f5130873dd52..b6ee7c3ee1ca 100644 --- a/drivers/infiniband/hw/mlx5/qp.h +++ b/drivers/infiniband/hw/mlx5/qp.h @@ -10,6 +10,7 @@ struct mlx5_ib_dev; struct mlx5_qp_table { struct notifier_block nb; + struct xarray dct_xa; /* protect radix tree */ diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c index bae0334d6e7f..3a2f755d4985 100644 --- a/drivers/infiniband/hw/mlx5/qpc.c +++ b/drivers/infiniband/hw/mlx5/qpc.c @@ -88,23 +88,35 @@ static bool is_event_type_allowed(int rsc_type, int event_type) } } +static int dct_event_notifier(struct mlx5_ib_dev *dev, struct mlx5_eqe *eqe) +{ + struct mlx5_core_dct *dct; + unsigned long flags; + u32 qpn; + + qpn = be32_to_cpu(eqe->data.dct.dctn) & 0xFFFFFF; + xa_lock_irqsave(&dev->qp_table.dct_xa, flags); + dct = xa_load(&dev->qp_table.dct_xa, qpn); + if (dct) + complete(&dct->drained); + xa_unlock_irqrestore(&dev->qp_table.dct_xa, flags); + return NOTIFY_OK; +} + static int rsc_event_notifier(struct notifier_block *nb, unsigned long type, void *data) { + struct mlx5_ib_dev *dev = + container_of(nb, struct mlx5_ib_dev, qp_table.nb); struct mlx5_core_rsc_common *common; - struct mlx5_qp_table *table; - struct mlx5_core_dct *dct; + struct mlx5_eqe *eqe = data; u8 event_type = (u8)type; struct mlx5_core_qp *qp; - struct mlx5_eqe *eqe; u32 rsn; switch (event_type) { case MLX5_EVENT_TYPE_DCT_DRAINED: - eqe = data; - rsn = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff; - rsn |= (MLX5_RES_DCT << MLX5_USER_INDEX_LEN); - break; + return dct_event_notifier(dev, eqe); case MLX5_EVENT_TYPE_PATH_MIG: case MLX5_EVENT_TYPE_COMM_EST: case MLX5_EVENT_TYPE_SQ_DRAINED: @@ -113,7 +125,6 @@ static int rsc_event_notifier(struct notifier_block *nb, case MLX5_EVENT_TYPE_PATH_MIG_FAILED: case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: - eqe = data; rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; rsn |= (eqe->data.qp_srq.type << MLX5_USER_INDEX_LEN); break; @@ -121,8 +132,7 @@ static int rsc_event_notifier(struct notifier_block *nb, return NOTIFY_DONE; } - table = container_of(nb, struct mlx5_qp_table, nb); - common = mlx5_get_rsc(table, rsn); + common = mlx5_get_rsc(&dev->qp_table, rsn); if (!common) return NOTIFY_OK; @@ -137,11 +147,6 @@ static int rsc_event_notifier(struct notifier_block *nb, qp->event(qp, event_type); /* Need to put 
resource in event handler */ return NOTIFY_OK; - case MLX5_RES_DCT: - dct = (struct mlx5_core_dct *)common; - if (event_type == MLX5_EVENT_TYPE_DCT_DRAINED) - complete(&dct->drained); - break; default: break; } @@ -188,28 +193,15 @@ static void destroy_resource_common(struct mlx5_ib_dev *dev, } static int _mlx5_core_destroy_dct(struct mlx5_ib_dev *dev, - struct mlx5_core_dct *dct, bool need_cleanup) + struct mlx5_core_dct *dct) { u32 in[MLX5_ST_SZ_DW(destroy_dct_in)] = {}; struct mlx5_core_qp *qp = &dct->mqp; - int err; - err = mlx5_core_drain_dct(dev, dct); - if (err) { - if (dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) - goto destroy; - - return err; - } - wait_for_completion(&dct->drained); -destroy: - if (need_cleanup) - destroy_resource_common(dev, &dct->mqp); MLX5_SET(destroy_dct_in, in, opcode, MLX5_CMD_OP_DESTROY_DCT); MLX5_SET(destroy_dct_in, in, dctn, qp->qpn); MLX5_SET(destroy_dct_in, in, uid, qp->uid); - err = mlx5_cmd_exec_in(dev->mdev, destroy_dct, in); - return err; + return mlx5_cmd_exec_in(dev->mdev, destroy_dct, in); } int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct, @@ -227,13 +219,13 @@ int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct, qp->qpn = MLX5_GET(create_dct_out, out, dctn); qp->uid = MLX5_GET(create_dct_in, in, uid); - err = create_resource_common(dev, qp, MLX5_RES_DCT); + err = xa_err(xa_store_irq(&dev->qp_table.dct_xa, qp->qpn, dct, GFP_KERNEL)); if (err) goto err_cmd; return 0; err_cmd: - _mlx5_core_destroy_dct(dev, dct, false); + _mlx5_core_destroy_dct(dev, dct); return err; } @@ -284,7 +276,31 @@ static int mlx5_core_drain_dct(struct mlx5_ib_dev *dev, int mlx5_core_destroy_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct) { - return _mlx5_core_destroy_dct(dev, dct, true); + struct mlx5_qp_table *table = &dev->qp_table; + struct mlx5_core_dct *tmp; + int err; + + err = mlx5_core_drain_dct(dev, dct); + if (err) { + if (dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + goto destroy; + + return err; + } + wait_for_completion(&dct->drained); + +destroy: + tmp = xa_cmpxchg_irq(&table->dct_xa, dct->mqp.qpn, dct, XA_ZERO_ENTRY, GFP_KERNEL); + if (WARN_ON(tmp != dct)) + return xa_err(tmp) ?: -EINVAL; + + err = _mlx5_core_destroy_dct(dev, dct); + if (err) { + xa_cmpxchg_irq(&table->dct_xa, dct->mqp.qpn, XA_ZERO_ENTRY, dct, 0); + return err; + } + xa_erase_irq(&table->dct_xa, dct->mqp.qpn); + return 0; } int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp) @@ -488,6 +504,7 @@ int mlx5_init_qp_table(struct mlx5_ib_dev *dev) spin_lock_init(&table->lock); INIT_RADIX_TREE(&table->tree, GFP_ATOMIC); + xa_init(&table->dct_xa); mlx5_qp_debugfs_init(dev->mdev); table->nb.notifier_call = rsc_event_notifier; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index e3b616388b18..e67c603d507b 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -382,7 +382,6 @@ enum mlx5_res_type { MLX5_RES_SRQ = 3, MLX5_RES_XSRQ = 4, MLX5_RES_XRQ = 5, - MLX5_RES_DCT = MLX5_EVENT_QUEUE_TYPE_DCT, }; struct mlx5_core_rsc_common { From 22664c06e997087fe37f9ba208008c948571214a Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Mon, 5 Jun 2023 13:14:07 +0300 Subject: [PATCH 34/71] RDMA/mlx5: Return the firmware result upon destroying QP/RQ Previously when destroying a QP/RQ, the result of the firmware destruction function was ignored and upper layers weren't informed about the failure. 
Which in turn could lead to various problems since when upper layer isn't aware of the failure it continues its operation thinking that the related QP/RQ was successfully destroyed while it actually wasn't, which could lead to the below kernel WARN. Currently, we return the correct firmware destruction status to upper layers which in case of the RQ would be mlx5_ib_destroy_wq() which was already capable of handling RQ destruction failure or in case of a QP to destroy_qp_common(), which now would actually warn upon qp destruction failure. WARNING: CPU: 3 PID: 995 at drivers/infiniband/core/rdma_core.c:940 uverbs_destroy_ufile_hw+0xcb/0xe0 [ib_uverbs] Modules linked in: xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi rdma_cm ib_umad ib_ipoib iw_cm ib_cm mlx5_ib ib_uverbs ib_core overlay mlx5_core fuse CPU: 3 PID: 995 Comm: python3 Not tainted 5.16.0-rc5+ #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:uverbs_destroy_ufile_hw+0xcb/0xe0 [ib_uverbs] Code: 41 5c 41 5d 41 5e e9 44 34 f0 e0 48 89 df e8 4c 77 ff ff 49 8b 86 10 01 00 00 48 85 c0 74 a1 4c 89 e7 ff d0 eb 9a 0f 0b eb c1 <0f> 0b be 04 00 00 00 48 89 df e8 b6 f6 ff ff e9 75 ff ff ff 90 0f RSP: 0018:ffff8881533e3e78 EFLAGS: 00010287 RAX: ffff88811b2cf3e0 RBX: ffff888106209700 RCX: 0000000000000000 RDX: ffff888106209780 RSI: ffff8881533e3d30 RDI: ffff888109b101a0 RBP: 0000000000000001 R08: ffff888127cb381c R09: 0de9890000000009 R10: ffff888127cb3800 R11: 0000000000000000 R12: ffff888106209780 R13: ffff888106209750 R14: ffff888100f20660 R15: 0000000000000000 FS: 00007f8be353b740(0000) GS:ffff88852c980000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f8bd5b117c0 CR3: 000000012cd8a004 CR4: 0000000000370ea0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ib_uverbs_close+0x1a/0x90 [ib_uverbs] __fput+0x82/0x230 task_work_run+0x59/0x90 exit_to_user_mode_prepare+0x138/0x140 syscall_exit_to_user_mode+0x1d/0x50 ? 
__x64_sys_close+0xe/0x40 do_syscall_64+0x4a/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f8be3ae0abb Code: 03 00 00 00 0f 05 48 3d 00 f0 ff ff 77 41 c3 48 83 ec 18 89 7c 24 0c e8 83 43 f9 ff 8b 7c 24 0c 41 89 c0 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 35 44 89 c7 89 44 24 0c e8 c1 43 f9 ff 8b 44 RSP: 002b:00007ffdb51909c0 EFLAGS: 00000293 ORIG_RAX: 0000000000000003 RAX: 0000000000000000 RBX: 0000557bb7f7c020 RCX: 00007f8be3ae0abb RDX: 0000557bb7c74010 RSI: 0000557bb7f14ca0 RDI: 0000000000000005 RBP: 0000557bb7fbd598 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000293 R12: 0000557bb7fbd5b8 R13: 0000557bb7fbd5a8 R14: 0000000000001000 R15: 0000557bb7f7c020 Signed-off-by: Patrisious Haddad Link: https://lore.kernel.org/r/c6df677f931d18090bafbe7f7dbb9524047b7d9b.1685953497.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/qpc.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c index 3a2f755d4985..d9cf6982d645 100644 --- a/drivers/infiniband/hw/mlx5/qpc.c +++ b/drivers/infiniband/hw/mlx5/qpc.c @@ -314,8 +314,7 @@ int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp) MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); MLX5_SET(destroy_qp_in, in, uid, qp->uid); - mlx5_cmd_exec_in(dev->mdev, destroy_qp, in); - return 0; + return mlx5_cmd_exec_in(dev->mdev, destroy_qp, in); } int mlx5_core_set_delay_drop(struct mlx5_ib_dev *dev, @@ -568,14 +567,14 @@ int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn) return mlx5_cmd_exec_in(dev->mdev, dealloc_xrcd, in); } -static void destroy_rq_tracked(struct mlx5_ib_dev *dev, u32 rqn, u16 uid) +static int destroy_rq_tracked(struct mlx5_ib_dev *dev, u32 rqn, u16 uid) { u32 in[MLX5_ST_SZ_DW(destroy_rq_in)] = {}; MLX5_SET(destroy_rq_in, in, opcode, MLX5_CMD_OP_DESTROY_RQ); MLX5_SET(destroy_rq_in, in, rqn, rqn); MLX5_SET(destroy_rq_in, in, uid, uid); - mlx5_cmd_exec_in(dev->mdev, destroy_rq, in); + return mlx5_cmd_exec_in(dev->mdev, destroy_rq, in); } int mlx5_core_create_rq_tracked(struct mlx5_ib_dev *dev, u32 *in, int inlen, @@ -606,8 +605,7 @@ int mlx5_core_destroy_rq_tracked(struct mlx5_ib_dev *dev, struct mlx5_core_qp *rq) { destroy_resource_common(dev, rq); - destroy_rq_tracked(dev, rq->qpn, rq->uid); - return 0; + return destroy_rq_tracked(dev, rq->qpn, rq->uid); } static void destroy_sq_tracked(struct mlx5_ib_dev *dev, u32 sqn, u16 uid) From 128f8404306d4299f4b962ba1161f14a794ddb13 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Tue, 6 Jun 2023 13:50:02 +0800 Subject: [PATCH 35/71] RDMA/erdma: Configure PAGE_SIZE to hardware Add a new CMDQ message to configure hardware. Initially the page size (in the format of shift) will be passed to hardware, so that hardware can organize the mmio space properly. It's called only if hardware supports it. 
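As a rough illustration (not part of the patch), the cfg word built by erdma_device_config() below packs the page shift into bits 4:0 and an enable flag into bit 31. A minimal standalone sketch of the same packing, assuming 4 KiB pages:

  #include <stdio.h>

  /* Mirrors ERDMA_CMD_CONFIG_DEVICE_PGSHIFT_MASK (bits 4:0) and
   * ERDMA_CMD_CONFIG_DEVICE_PS_EN_MASK (bit 31) from the patch below.
   */
  int main(void)
  {
          unsigned int page_shift = 12;                   /* 4 KiB pages */
          unsigned int cfg = (page_shift & 0x1f) |        /* PGSHIFT field */
                             (1u << 31);                  /* PS_EN flag */

          printf("cfg = 0x%08x\n", cfg);                  /* prints 0x8000000c */
          return 0;
  }

With PAGE_SHIFT = 12 the request therefore advertises 4 KiB pages to the device.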
Signed-off-by: Cheng Xu Link: https://lore.kernel.org/r/20230606055005.80729-2-chengyou@linux.alibaba.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma_hw.h | 12 ++++++++++++ drivers/infiniband/hw/erdma/erdma_main.c | 20 ++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index 76ce2856be28..670796c22bcc 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -159,6 +159,7 @@ enum CMDQ_COMMON_OPCODE { CMDQ_OPCODE_DESTROY_EQ = 1, CMDQ_OPCODE_QUERY_FW_INFO = 2, CMDQ_OPCODE_CONF_MTU = 3, + CMDQ_OPCODE_CONF_DEVICE = 5, }; /* cmdq-SQE HDR */ @@ -196,6 +197,16 @@ struct erdma_cmdq_destroy_eq_req { u8 qtype; }; +/* config device cfg */ +#define ERDMA_CMD_CONFIG_DEVICE_PS_EN_MASK BIT(31) +#define ERDMA_CMD_CONFIG_DEVICE_PGSHIFT_MASK GENMASK(4, 0) + +struct erdma_cmdq_config_device_req { + u64 hdr; + u32 cfg; + u32 rsvd[5]; +}; + struct erdma_cmdq_config_mtu_req { u64 hdr; u32 mtu; @@ -329,6 +340,7 @@ struct erdma_cmdq_reflush_req { enum { ERDMA_DEV_CAP_FLAGS_ATOMIC = 1 << 7, + ERDMA_DEV_CAP_FLAGS_EXTEND_DB = 1 << 3, }; #define ERDMA_CMD_INFO0_FW_VER_MASK GENMASK_ULL(31, 0) diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index 7c74abeee864..525edea987b2 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -426,6 +426,22 @@ static int erdma_dev_attrs_init(struct erdma_dev *dev) return err; } +static int erdma_device_config(struct erdma_dev *dev) +{ + struct erdma_cmdq_config_device_req req = {}; + + if (!(dev->attrs.cap_flags & ERDMA_DEV_CAP_FLAGS_EXTEND_DB)) + return 0; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON, + CMDQ_OPCODE_CONF_DEVICE); + + req.cfg = FIELD_PREP(ERDMA_CMD_CONFIG_DEVICE_PGSHIFT_MASK, PAGE_SHIFT) | + FIELD_PREP(ERDMA_CMD_CONFIG_DEVICE_PS_EN_MASK, 1); + + return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); +} + static int erdma_res_cb_init(struct erdma_dev *dev) { int i, j; @@ -512,6 +528,10 @@ static int erdma_ib_device_add(struct pci_dev *pdev) if (ret) return ret; + ret = erdma_device_config(dev); + if (ret) + return ret; + ibdev->node_type = RDMA_NODE_RNIC; memcpy(ibdev->node_desc, ERDMA_NODE_DESC, sizeof(ERDMA_NODE_DESC)); From 7e9a1dada2266c1ef777eba123b5515859779eb9 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Tue, 6 Jun 2023 13:50:03 +0800 Subject: [PATCH 36/71] RDMA/erdma: Allocate doorbell resources from hardware Each ucontext will try to allocate doorbell resources in the extended bar space from hardware. For compatibility, we change nothing for the original bar space, and it will be used only for applications with CAP_SYS_RAWIO authority in the older HW/FW environments. 
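For orientation (not part of the patch), the ALLOC_DB response defined below returns the three doorbell offsets in one 64-bit qword: SDB in bits 15:0, CDB in bits 47:32 and RDB in bits 63:48; alloc_ext_db_resources() then shifts each offset left by PAGE_SHIFT to form the BAR address. A small standalone sketch of the extraction, using a made-up response value:

  #include <stdio.h>

  int main(void)
  {
          /* hypothetical qword-0 response: sdb 0x10, cdb 0x20, rdb 0x30 */
          unsigned long long val0 = 0x0030002000000010ULL;
          unsigned int sdb_off = val0 & 0xffff;            /* bits 15:0  */
          unsigned int cdb_off = (val0 >> 32) & 0xffff;    /* bits 47:32 */
          unsigned int rdb_off = (val0 >> 48) & 0xffff;    /* bits 63:48 */

          printf("sdb 0x%x cdb 0x%x rdb 0x%x\n", sdb_off, cdb_off, rdb_off);
          return 0;
  }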
Signed-off-by: Cheng Xu Link: https://lore.kernel.org/r/20230606055005.80729-3-chengyou@linux.alibaba.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma.h | 2 + drivers/infiniband/hw/erdma/erdma_hw.h | 22 ++++ drivers/infiniband/hw/erdma/erdma_verbs.c | 119 ++++++++++++++++++---- drivers/infiniband/hw/erdma/erdma_verbs.h | 9 ++ 4 files changed, 132 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h index e819e4032490..a361d4bcd714 100644 --- a/drivers/infiniband/hw/erdma/erdma.h +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -268,6 +268,8 @@ static inline u32 erdma_reg_read32_filed(struct erdma_dev *dev, u32 reg, return FIELD_GET(filed_mask, val); } +#define ERDMA_GET(val, name) FIELD_GET(ERDMA_CMD_##name##_MASK, val) + int erdma_cmdq_init(struct erdma_dev *dev); void erdma_finish_cmdq_init(struct erdma_dev *dev); void erdma_cmdq_destroy(struct erdma_dev *dev); diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index 670796c22bcc..812fc40de64b 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -160,6 +160,8 @@ enum CMDQ_COMMON_OPCODE { CMDQ_OPCODE_QUERY_FW_INFO = 2, CMDQ_OPCODE_CONF_MTU = 3, CMDQ_OPCODE_CONF_DEVICE = 5, + CMDQ_OPCODE_ALLOC_DB = 8, + CMDQ_OPCODE_FREE_DB = 9, }; /* cmdq-SQE HDR */ @@ -212,6 +214,26 @@ struct erdma_cmdq_config_mtu_req { u32 mtu; }; +/* ext db requests(alloc and free) cfg */ +#define ERDMA_CMD_EXT_DB_CQ_EN_MASK BIT(2) +#define ERDMA_CMD_EXT_DB_RQ_EN_MASK BIT(1) +#define ERDMA_CMD_EXT_DB_SQ_EN_MASK BIT(0) + +struct erdma_cmdq_ext_db_req { + u64 hdr; + u32 cfg; + u16 rdb_off; + u16 sdb_off; + u16 rsvd0; + u16 cdb_off; + u32 rsvd1[3]; +}; + +/* alloc db response qword 0 definition */ +#define ERDMA_CMD_ALLOC_DB_RESP_RDB_MASK GENMASK_ULL(63, 48) +#define ERDMA_CMD_ALLOC_DB_RESP_CDB_MASK GENMASK_ULL(47, 32) +#define ERDMA_CMD_ALLOC_DB_RESP_SDB_MASK GENMASK_ULL(15, 0) + /* create_cq cfg0 */ #define ERDMA_CMD_CREATE_CQ_DEPTH_MASK GENMASK(31, 24) #define ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK GENMASK(23, 20) diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index 83e1b0d55977..376f70219ecd 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -1188,6 +1188,60 @@ alloc_normal_db: ctx->sdb = dev->func_bar_addr + (ctx->sdb_page_idx << PAGE_SHIFT); } +static int alloc_ext_db_resources(struct erdma_dev *dev, + struct erdma_ucontext *ctx) +{ + struct erdma_cmdq_ext_db_req req = {}; + u64 val0, val1; + int ret; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON, + CMDQ_OPCODE_ALLOC_DB); + + req.cfg = FIELD_PREP(ERDMA_CMD_EXT_DB_CQ_EN_MASK, 1) | + FIELD_PREP(ERDMA_CMD_EXT_DB_RQ_EN_MASK, 1) | + FIELD_PREP(ERDMA_CMD_EXT_DB_SQ_EN_MASK, 1); + + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &val0, &val1); + if (ret) + return ret; + + ctx->ext_db.enable = true; + ctx->ext_db.sdb_off = ERDMA_GET(val0, ALLOC_DB_RESP_SDB); + ctx->ext_db.rdb_off = ERDMA_GET(val0, ALLOC_DB_RESP_RDB); + ctx->ext_db.cdb_off = ERDMA_GET(val0, ALLOC_DB_RESP_CDB); + + ctx->sdb_type = ERDMA_SDB_PAGE; + ctx->sdb = dev->func_bar_addr + (ctx->ext_db.sdb_off << PAGE_SHIFT); + ctx->cdb = dev->func_bar_addr + (ctx->ext_db.rdb_off << PAGE_SHIFT); + ctx->rdb = dev->func_bar_addr + (ctx->ext_db.cdb_off << PAGE_SHIFT); + + return 0; +} + +static void free_ext_db_resources(struct erdma_dev *dev, + struct erdma_ucontext *ctx) +{ + 
struct erdma_cmdq_ext_db_req req = {}; + int ret; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON, + CMDQ_OPCODE_FREE_DB); + + req.cfg = FIELD_PREP(ERDMA_CMD_EXT_DB_CQ_EN_MASK, 1) | + FIELD_PREP(ERDMA_CMD_EXT_DB_RQ_EN_MASK, 1) | + FIELD_PREP(ERDMA_CMD_EXT_DB_SQ_EN_MASK, 1); + + req.sdb_off = ctx->ext_db.sdb_off; + req.rdb_off = ctx->ext_db.rdb_off; + req.cdb_off = ctx->ext_db.cdb_off; + + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + if (ret) + ibdev_err_ratelimited(&dev->ibdev, + "free db resources failed %d", ret); +} + static void erdma_uctx_user_mmap_entries_remove(struct erdma_ucontext *uctx) { rdma_user_mmap_entry_remove(uctx->sq_db_mmap_entry); @@ -1201,44 +1255,60 @@ int erdma_alloc_ucontext(struct ib_ucontext *ibctx, struct ib_udata *udata) struct erdma_dev *dev = to_edev(ibctx->device); int ret; struct erdma_uresp_alloc_ctx uresp = {}; + bool ext_db_en; if (atomic_inc_return(&dev->num_ctx) > ERDMA_MAX_CONTEXT) { ret = -ENOMEM; goto err_out; } - INIT_LIST_HEAD(&ctx->dbrecords_page_list); - mutex_init(&ctx->dbrecords_page_mutex); - - alloc_db_resources(dev, ctx); - - ctx->rdb = dev->func_bar_addr + ERDMA_BAR_RQDB_SPACE_OFFSET; - ctx->cdb = dev->func_bar_addr + ERDMA_BAR_CQDB_SPACE_OFFSET; - if (udata->outlen < sizeof(uresp)) { ret = -EINVAL; goto err_out; } + INIT_LIST_HEAD(&ctx->dbrecords_page_list); + mutex_init(&ctx->dbrecords_page_mutex); + + /* + * CAP_SYS_RAWIO is required if hardware does not support extend + * doorbell mechanism. + */ + ext_db_en = !!(dev->attrs.cap_flags & ERDMA_DEV_CAP_FLAGS_EXTEND_DB); + if (!ext_db_en && !capable(CAP_SYS_RAWIO)) { + ret = -EPERM; + goto err_out; + } + + if (ext_db_en) { + ret = alloc_ext_db_resources(dev, ctx); + if (ret) + goto err_out; + } else { + alloc_db_resources(dev, ctx); + ctx->rdb = dev->func_bar_addr + ERDMA_BAR_RQDB_SPACE_OFFSET; + ctx->cdb = dev->func_bar_addr + ERDMA_BAR_CQDB_SPACE_OFFSET; + } + ctx->sq_db_mmap_entry = erdma_user_mmap_entry_insert( ctx, (void *)ctx->sdb, PAGE_SIZE, ERDMA_MMAP_IO_NC, &uresp.sdb); if (!ctx->sq_db_mmap_entry) { ret = -ENOMEM; - goto err_out; + goto err_free_ext_db; } ctx->rq_db_mmap_entry = erdma_user_mmap_entry_insert( ctx, (void *)ctx->rdb, PAGE_SIZE, ERDMA_MMAP_IO_NC, &uresp.rdb); if (!ctx->rq_db_mmap_entry) { ret = -EINVAL; - goto err_out; + goto err_put_mmap_entries; } ctx->cq_db_mmap_entry = erdma_user_mmap_entry_insert( ctx, (void *)ctx->cdb, PAGE_SIZE, ERDMA_MMAP_IO_NC, &uresp.cdb); if (!ctx->cq_db_mmap_entry) { ret = -EINVAL; - goto err_out; + goto err_put_mmap_entries; } uresp.dev_id = dev->pdev->device; @@ -1247,12 +1317,18 @@ int erdma_alloc_ucontext(struct ib_ucontext *ibctx, struct ib_udata *udata) ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); if (ret) - goto err_out; + goto err_put_mmap_entries; return 0; -err_out: +err_put_mmap_entries: erdma_uctx_user_mmap_entries_remove(ctx); + +err_free_ext_db: + if (ext_db_en) + free_ext_db_resources(dev, ctx); + +err_out: atomic_dec(&dev->num_ctx); return ret; } @@ -1262,15 +1338,18 @@ void erdma_dealloc_ucontext(struct ib_ucontext *ibctx) struct erdma_ucontext *ctx = to_ectx(ibctx); struct erdma_dev *dev = to_edev(ibctx->device); - spin_lock(&dev->db_bitmap_lock); - if (ctx->sdb_type == ERDMA_SDB_PAGE) - clear_bit(ctx->sdb_idx, dev->sdb_page); - else if (ctx->sdb_type == ERDMA_SDB_ENTRY) - clear_bit(ctx->sdb_idx, dev->sdb_entry); - erdma_uctx_user_mmap_entries_remove(ctx); - spin_unlock(&dev->db_bitmap_lock); + if (ctx->ext_db.enable) { + free_ext_db_resources(dev, ctx); + } else { + 
spin_lock(&dev->db_bitmap_lock); + if (ctx->sdb_type == ERDMA_SDB_PAGE) + clear_bit(ctx->sdb_idx, dev->sdb_page); + else if (ctx->sdb_type == ERDMA_SDB_ENTRY) + clear_bit(ctx->sdb_idx, dev->sdb_entry); + spin_unlock(&dev->db_bitmap_lock); + } atomic_dec(&dev->num_ctx); } diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index 131cf5f40982..252106679d36 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -31,9 +31,18 @@ struct erdma_user_mmap_entry { u8 mmap_flag; }; +struct erdma_ext_db_info { + bool enable; + u16 sdb_off; + u16 rdb_off; + u16 cdb_off; +}; + struct erdma_ucontext { struct ib_ucontext ibucontext; + struct erdma_ext_db_info ext_db; + u32 sdb_type; u32 sdb_idx; u32 sdb_page_idx; From 6534de1fe385145d256cf5f37b01ccbd63e23405 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Tue, 6 Jun 2023 13:50:04 +0800 Subject: [PATCH 37/71] RDMA/erdma: Associate QPs/CQs with doorbells for authorization For the isolation requirement, each QP/CQ can only issue doorbells from the allocated mmio space. Configure the relationship between QPs/CQs and mmio doorbell spaces to hardware in create_qp/create_cq interfaces. Signed-off-by: Cheng Xu Link: https://lore.kernel.org/r/20230606055005.80729-4-chengyou@linux.alibaba.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma_hw.h | 17 ++++++++++++- drivers/infiniband/hw/erdma/erdma_verbs.c | 31 ++++++++++++++++++----- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index 812fc40de64b..cf7629bfe534 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -134,7 +134,7 @@ /* CMDQ related. */ #define ERDMA_CMDQ_MAX_OUTSTANDING 128 -#define ERDMA_CMDQ_SQE_SIZE 64 +#define ERDMA_CMDQ_SQE_SIZE 128 /* cmdq sub module definition. 
*/ enum CMDQ_WQE_SUB_MOD { @@ -242,8 +242,12 @@ struct erdma_cmdq_ext_db_req { /* create_cq cfg1 */ #define ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK GENMASK(31, 16) #define ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK BIT(15) +#define ERDMA_CMD_CREATE_CQ_MTT_DB_CFG_MASK BIT(11) #define ERDMA_CMD_CREATE_CQ_EQN_MASK GENMASK(9, 0) +/* create_cq cfg2 */ +#define ERDMA_CMD_CREATE_CQ_DB_CFG_MASK GENMASK(15, 0) + struct erdma_cmdq_create_cq_req { u64 hdr; u32 cfg0; @@ -252,6 +256,7 @@ struct erdma_cmdq_create_cq_req { u32 cfg1; u64 cq_db_info_addr; u32 first_page_offset; + u32 cfg2; }; /* regmr/deregmr cfg0 */ @@ -311,6 +316,7 @@ struct erdma_cmdq_modify_qp_req { /* create qp cqn_mtt_cfg */ #define ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK GENMASK(31, 28) +#define ERDMA_CMD_CREATE_QP_DB_CFG_MASK BIT(25) #define ERDMA_CMD_CREATE_QP_CQN_MASK GENMASK(23, 0) /* create qp mtt_cfg */ @@ -318,6 +324,10 @@ struct erdma_cmdq_modify_qp_req { #define ERDMA_CMD_CREATE_QP_MTT_CNT_MASK GENMASK(11, 1) #define ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK BIT(0) +/* create qp db cfg */ +#define ERDMA_CMD_CREATE_QP_SQDB_CFG_MASK GENMASK(31, 16) +#define ERDMA_CMD_CREATE_QP_RQDB_CFG_MASK GENMASK(15, 0) + #define ERDMA_CMDQ_CREATE_QP_RESP_COOKIE_MASK GENMASK_ULL(31, 0) struct erdma_cmdq_create_qp_req { @@ -332,6 +342,11 @@ struct erdma_cmdq_create_qp_req { u32 rq_mtt_cfg; u64 sq_db_info_dma_addr; u64 rq_db_info_dma_addr; + + u64 sq_mtt_entry[3]; + u64 rq_mtt_entry[3]; + + u32 db_cfg; }; struct erdma_cmdq_destroy_qp_req { diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index 376f70219ecd..ffc05ddc98ae 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -19,10 +19,11 @@ #include "erdma_cm.h" #include "erdma_verbs.h" -static int create_qp_cmd(struct erdma_dev *dev, struct erdma_qp *qp) +static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp) { - struct erdma_cmdq_create_qp_req req; + struct erdma_dev *dev = to_edev(qp->ibqp.device); struct erdma_pd *pd = to_epd(qp->ibqp.pd); + struct erdma_cmdq_create_qp_req req; struct erdma_uqp *user_qp; u64 resp0, resp1; int err; @@ -93,6 +94,16 @@ static int create_qp_cmd(struct erdma_dev *dev, struct erdma_qp *qp) req.sq_db_info_dma_addr = user_qp->sq_db_info_dma_addr; req.rq_db_info_dma_addr = user_qp->rq_db_info_dma_addr; + + if (uctx->ext_db.enable) { + req.sq_cqn_mtt_cfg |= + FIELD_PREP(ERDMA_CMD_CREATE_QP_DB_CFG_MASK, 1); + req.db_cfg = + FIELD_PREP(ERDMA_CMD_CREATE_QP_SQDB_CFG_MASK, + uctx->ext_db.sdb_off) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_RQDB_CFG_MASK, + uctx->ext_db.rdb_off); + } } err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &resp0, @@ -146,11 +157,12 @@ post_cmd: return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); } -static int create_cq_cmd(struct erdma_dev *dev, struct erdma_cq *cq) +static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq) { + struct erdma_dev *dev = to_edev(cq->ibcq.device); struct erdma_cmdq_create_cq_req req; - u32 page_size; struct erdma_mem *mtt; + u32 page_size; erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, CMDQ_OPCODE_CREATE_CQ); @@ -192,6 +204,13 @@ static int create_cq_cmd(struct erdma_dev *dev, struct erdma_cq *cq) req.first_page_offset = mtt->page_offset; req.cq_db_info_addr = cq->user_cq.db_info_dma_addr; + + if (uctx->ext_db.enable) { + req.cfg1 |= FIELD_PREP( + ERDMA_CMD_CREATE_CQ_MTT_DB_CFG_MASK, 1); + req.cfg2 = FIELD_PREP(ERDMA_CMD_CREATE_CQ_DB_CFG_MASK, + uctx->ext_db.cdb_off); + } } return 
erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); @@ -753,7 +772,7 @@ int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs, qp->attrs.state = ERDMA_QP_STATE_IDLE; INIT_DELAYED_WORK(&qp->reflush_dwork, erdma_flush_worker); - ret = create_qp_cmd(dev, qp); + ret = create_qp_cmd(uctx, qp); if (ret) goto err_out_cmd; @@ -1517,7 +1536,7 @@ int erdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, goto err_out_xa; } - ret = create_cq_cmd(dev, cq); + ret = create_cq_cmd(ctx, cq); if (ret) goto err_free_res; From 3b3dfd58bace12e8348e5863e05867afd2ead28b Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Tue, 6 Jun 2023 13:50:05 +0800 Subject: [PATCH 38/71] RDMA/erdma: Refactor the original doorbell allocation mechanism The original doorbell allocation mechanism is complex and does not meet the isolation requirement. So we introduce a new doorbell mechanism and the original mechanism (only be used with CAP_SYS_RAWIO if hardware does not support the new mechanism) needs to be kept as simple as possible for compatibility. Signed-off-by: Cheng Xu Link: https://lore.kernel.org/r/20230606055005.80729-5-chengyou@linux.alibaba.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma.h | 14 --- drivers/infiniband/hw/erdma/erdma_hw.h | 13 --- drivers/infiniband/hw/erdma/erdma_main.c | 33 ------ drivers/infiniband/hw/erdma/erdma_verbs.c | 126 +++++----------------- drivers/infiniband/hw/erdma/erdma_verbs.h | 4 - 5 files changed, 27 insertions(+), 163 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h index a361d4bcd714..f190111840e9 100644 --- a/drivers/infiniband/hw/erdma/erdma.h +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -128,13 +128,8 @@ struct erdma_devattr { int numa_node; enum erdma_cc_alg cc; - u32 grp_num; u32 irq_num; - bool disable_dwqe; - u16 dwqe_pages; - u16 dwqe_entries; - u32 max_qp; u32 max_send_wr; u32 max_recv_wr; @@ -215,15 +210,6 @@ struct erdma_dev { u32 next_alloc_qpn; u32 next_alloc_cqn; - spinlock_t db_bitmap_lock; - /* We provide max 64 uContexts that each has one SQ doorbell Page. */ - DECLARE_BITMAP(sdb_page, ERDMA_DWQE_TYPE0_CNT); - /* - * We provide max 496 uContexts that each has one SQ normal Db, - * and one directWQE db. - */ - DECLARE_BITMAP(sdb_entry, ERDMA_DWQE_TYPE1_CNT); - atomic_t num_ctx; struct list_head cep_list; }; diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index cf7629bfe534..a882b57aa118 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -82,19 +82,6 @@ #define ERDMA_BAR_CQDB_SPACE_OFFSET \ (ERDMA_BAR_RQDB_SPACE_OFFSET + ERDMA_BAR_RQDB_SPACE_SIZE) -/* Doorbell page resources related. */ -/* - * Max # of parallelly issued directSQE is 3072 per device, - * hardware organizes this into 24 group, per group has 128 credits. - */ -#define ERDMA_DWQE_MAX_GRP_CNT 24 -#define ERDMA_DWQE_NUM_PER_GRP 128 - -#define ERDMA_DWQE_TYPE0_CNT 64 -#define ERDMA_DWQE_TYPE1_CNT 496 -/* type1 DB contains 2 DBs, takes 256Byte. */ -#define ERDMA_DWQE_TYPE1_CNT_PER_PAGE 16 - #define ERDMA_SDB_SHARED_PAGE_INDEX 95 /* Doorbell related. 
*/ diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index 525edea987b2..0880c79a978c 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -130,33 +130,6 @@ static irqreturn_t erdma_comm_irq_handler(int irq, void *data) return IRQ_HANDLED; } -static void erdma_dwqe_resource_init(struct erdma_dev *dev) -{ - int total_pages, type0, type1; - - dev->attrs.grp_num = erdma_reg_read32(dev, ERDMA_REGS_GRP_NUM_REG); - - if (dev->attrs.grp_num < 4) - dev->attrs.disable_dwqe = true; - else - dev->attrs.disable_dwqe = false; - - /* One page contains 4 goups. */ - total_pages = dev->attrs.grp_num * 4; - - if (dev->attrs.grp_num >= ERDMA_DWQE_MAX_GRP_CNT) { - dev->attrs.grp_num = ERDMA_DWQE_MAX_GRP_CNT; - type0 = ERDMA_DWQE_TYPE0_CNT; - type1 = ERDMA_DWQE_TYPE1_CNT / ERDMA_DWQE_TYPE1_CNT_PER_PAGE; - } else { - type1 = total_pages / 3; - type0 = total_pages - type1 - 1; - } - - dev->attrs.dwqe_pages = type0; - dev->attrs.dwqe_entries = type1 * ERDMA_DWQE_TYPE1_CNT_PER_PAGE; -} - static int erdma_request_vectors(struct erdma_dev *dev) { int expect_irq_num = min(num_possible_cpus() + 1, ERDMA_NUM_MSIX_VEC); @@ -199,8 +172,6 @@ static int erdma_device_init(struct erdma_dev *dev, struct pci_dev *pdev) { int ret; - erdma_dwqe_resource_init(dev); - ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(ERDMA_PCI_WIDTH)); if (ret) @@ -557,10 +528,6 @@ static int erdma_ib_device_add(struct pci_dev *pdev) if (ret) return ret; - spin_lock_init(&dev->db_bitmap_lock); - bitmap_zero(dev->sdb_page, ERDMA_DWQE_TYPE0_CNT); - bitmap_zero(dev->sdb_entry, ERDMA_DWQE_TYPE1_CNT); - atomic_set(&dev->num_ctx, 0); mac = erdma_reg_read32(dev, ERDMA_REGS_NETDEV_MAC_L_REG); diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index ffc05ddc98ae..517676fbb8b1 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -1149,71 +1149,27 @@ void erdma_mmap_free(struct rdma_user_mmap_entry *rdma_entry) kfree(entry); } -#define ERDMA_SDB_PAGE 0 -#define ERDMA_SDB_ENTRY 1 -#define ERDMA_SDB_SHARED 2 - -static void alloc_db_resources(struct erdma_dev *dev, - struct erdma_ucontext *ctx) -{ - u32 bitmap_idx; - struct erdma_devattr *attrs = &dev->attrs; - - if (attrs->disable_dwqe) - goto alloc_normal_db; - - /* Try to alloc independent SDB page. 
*/ - spin_lock(&dev->db_bitmap_lock); - bitmap_idx = find_first_zero_bit(dev->sdb_page, attrs->dwqe_pages); - if (bitmap_idx != attrs->dwqe_pages) { - set_bit(bitmap_idx, dev->sdb_page); - spin_unlock(&dev->db_bitmap_lock); - - ctx->sdb_type = ERDMA_SDB_PAGE; - ctx->sdb_idx = bitmap_idx; - ctx->sdb_page_idx = bitmap_idx; - ctx->sdb = dev->func_bar_addr + ERDMA_BAR_SQDB_SPACE_OFFSET + - (bitmap_idx << PAGE_SHIFT); - ctx->sdb_page_off = 0; - - return; - } - - bitmap_idx = find_first_zero_bit(dev->sdb_entry, attrs->dwqe_entries); - if (bitmap_idx != attrs->dwqe_entries) { - set_bit(bitmap_idx, dev->sdb_entry); - spin_unlock(&dev->db_bitmap_lock); - - ctx->sdb_type = ERDMA_SDB_ENTRY; - ctx->sdb_idx = bitmap_idx; - ctx->sdb_page_idx = attrs->dwqe_pages + - bitmap_idx / ERDMA_DWQE_TYPE1_CNT_PER_PAGE; - ctx->sdb_page_off = bitmap_idx % ERDMA_DWQE_TYPE1_CNT_PER_PAGE; - - ctx->sdb = dev->func_bar_addr + ERDMA_BAR_SQDB_SPACE_OFFSET + - (ctx->sdb_page_idx << PAGE_SHIFT); - - return; - } - - spin_unlock(&dev->db_bitmap_lock); - -alloc_normal_db: - ctx->sdb_type = ERDMA_SDB_SHARED; - ctx->sdb_idx = 0; - ctx->sdb_page_idx = ERDMA_SDB_SHARED_PAGE_INDEX; - ctx->sdb_page_off = 0; - - ctx->sdb = dev->func_bar_addr + (ctx->sdb_page_idx << PAGE_SHIFT); -} - -static int alloc_ext_db_resources(struct erdma_dev *dev, - struct erdma_ucontext *ctx) +static int alloc_db_resources(struct erdma_dev *dev, struct erdma_ucontext *ctx, + bool ext_db_en) { struct erdma_cmdq_ext_db_req req = {}; u64 val0, val1; int ret; + /* + * CAP_SYS_RAWIO is required if hardware does not support extend + * doorbell mechanism. + */ + if (!ext_db_en && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + if (!ext_db_en) { + ctx->sdb = dev->func_bar_addr + ERDMA_BAR_SQDB_SPACE_OFFSET; + ctx->rdb = dev->func_bar_addr + ERDMA_BAR_RQDB_SPACE_OFFSET; + ctx->cdb = dev->func_bar_addr + ERDMA_BAR_CQDB_SPACE_OFFSET; + return 0; + } + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON, CMDQ_OPCODE_ALLOC_DB); @@ -1230,7 +1186,6 @@ static int alloc_ext_db_resources(struct erdma_dev *dev, ctx->ext_db.rdb_off = ERDMA_GET(val0, ALLOC_DB_RESP_RDB); ctx->ext_db.cdb_off = ERDMA_GET(val0, ALLOC_DB_RESP_CDB); - ctx->sdb_type = ERDMA_SDB_PAGE; ctx->sdb = dev->func_bar_addr + (ctx->ext_db.sdb_off << PAGE_SHIFT); ctx->cdb = dev->func_bar_addr + (ctx->ext_db.rdb_off << PAGE_SHIFT); ctx->rdb = dev->func_bar_addr + (ctx->ext_db.cdb_off << PAGE_SHIFT); @@ -1238,12 +1193,14 @@ static int alloc_ext_db_resources(struct erdma_dev *dev, return 0; } -static void free_ext_db_resources(struct erdma_dev *dev, - struct erdma_ucontext *ctx) +static void free_db_resources(struct erdma_dev *dev, struct erdma_ucontext *ctx) { struct erdma_cmdq_ext_db_req req = {}; int ret; + if (!ctx->ext_db.enable) + return; + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON, CMDQ_OPCODE_FREE_DB); @@ -1274,7 +1231,6 @@ int erdma_alloc_ucontext(struct ib_ucontext *ibctx, struct ib_udata *udata) struct erdma_dev *dev = to_edev(ibctx->device); int ret; struct erdma_uresp_alloc_ctx uresp = {}; - bool ext_db_en; if (atomic_inc_return(&dev->num_ctx) > ERDMA_MAX_CONTEXT) { ret = -ENOMEM; @@ -1289,25 +1245,11 @@ int erdma_alloc_ucontext(struct ib_ucontext *ibctx, struct ib_udata *udata) INIT_LIST_HEAD(&ctx->dbrecords_page_list); mutex_init(&ctx->dbrecords_page_mutex); - /* - * CAP_SYS_RAWIO is required if hardware does not support extend - * doorbell mechanism. 
- */ - ext_db_en = !!(dev->attrs.cap_flags & ERDMA_DEV_CAP_FLAGS_EXTEND_DB); - if (!ext_db_en && !capable(CAP_SYS_RAWIO)) { - ret = -EPERM; + ret = alloc_db_resources(dev, ctx, + !!(dev->attrs.cap_flags & + ERDMA_DEV_CAP_FLAGS_EXTEND_DB)); + if (ret) goto err_out; - } - - if (ext_db_en) { - ret = alloc_ext_db_resources(dev, ctx); - if (ret) - goto err_out; - } else { - alloc_db_resources(dev, ctx); - ctx->rdb = dev->func_bar_addr + ERDMA_BAR_RQDB_SPACE_OFFSET; - ctx->cdb = dev->func_bar_addr + ERDMA_BAR_CQDB_SPACE_OFFSET; - } ctx->sq_db_mmap_entry = erdma_user_mmap_entry_insert( ctx, (void *)ctx->sdb, PAGE_SIZE, ERDMA_MMAP_IO_NC, &uresp.sdb); @@ -1331,8 +1273,6 @@ int erdma_alloc_ucontext(struct ib_ucontext *ibctx, struct ib_udata *udata) } uresp.dev_id = dev->pdev->device; - uresp.sdb_type = ctx->sdb_type; - uresp.sdb_offset = ctx->sdb_page_off; ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); if (ret) @@ -1344,8 +1284,7 @@ err_put_mmap_entries: erdma_uctx_user_mmap_entries_remove(ctx); err_free_ext_db: - if (ext_db_en) - free_ext_db_resources(dev, ctx); + free_db_resources(dev, ctx); err_out: atomic_dec(&dev->num_ctx); @@ -1354,22 +1293,11 @@ err_out: void erdma_dealloc_ucontext(struct ib_ucontext *ibctx) { - struct erdma_ucontext *ctx = to_ectx(ibctx); struct erdma_dev *dev = to_edev(ibctx->device); + struct erdma_ucontext *ctx = to_ectx(ibctx); erdma_uctx_user_mmap_entries_remove(ctx); - - if (ctx->ext_db.enable) { - free_ext_db_resources(dev, ctx); - } else { - spin_lock(&dev->db_bitmap_lock); - if (ctx->sdb_type == ERDMA_SDB_PAGE) - clear_bit(ctx->sdb_idx, dev->sdb_page); - else if (ctx->sdb_type == ERDMA_SDB_ENTRY) - clear_bit(ctx->sdb_idx, dev->sdb_entry); - spin_unlock(&dev->db_bitmap_lock); - } - + free_db_resources(dev, ctx); atomic_dec(&dev->num_ctx); } diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index 252106679d36..429fc3063f98 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -43,10 +43,6 @@ struct erdma_ucontext { struct erdma_ext_db_info ext_db; - u32 sdb_type; - u32 sdb_idx; - u32 sdb_page_idx; - u32 sdb_page_off; u64 sdb; u64 rdb; u64 cdb; From 0af91306e17ef3d18e5f100aa58aa787869118af Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Fri, 9 Jun 2023 04:01:38 -0700 Subject: [PATCH 39/71] RDMA/bnxt_re: wraparound mbox producer index Driver is not handling the wraparound of the mbox producer index correctly. Currently the wraparound happens once u32 max is reached. Bit 31 of the producer index register is special and should be set only once for the first command. Because the producer index overflow setting bit31 after a long time, FW goes to initialization sequence and this causes FW hang. Fix is to wraparound the mbox producer index once it reaches u16 max. 
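As a quick sanity check (not part of the patch), masking the producer index to 16 bits keeps bit 31 clear no matter how far the index advances, so only the explicit FIRMWARE_FIRST_FLAG path ever sets the special bit. A standalone sketch with a hypothetical index value:

  #include <stdio.h>

  int main(void)
  {
          /* hypothetical hwq->prod after roughly 2^31 commands */
          unsigned int prod = 0x80000005;
          unsigned int cmdq_prod = prod & 0xffff;  /* wraps at u16 max */

          printf("raw 0x%x masked 0x%x bit31 %u\n",
                 prod, cmdq_prod, cmdq_prod >> 31);
          return 0;
  }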
Fixes: cee0c7bba486 ("RDMA/bnxt_re: Refactor command queue management code") Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1686308514-11996-2-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 688eaa01db64..d4ce82bebb0a 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -180,7 +180,7 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, } while (bsize > 0); cmdq->seq_num++; - cmdq_prod = hwq->prod; + cmdq_prod = hwq->prod & 0xFFFF; if (test_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags)) { /* The very first doorbell write * is required to set this flag @@ -599,7 +599,7 @@ int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res, rcfw->cmdq_depth = BNXT_QPLIB_CMDQE_MAX_CNT_8192; sginfo.pgsize = bnxt_qplib_cmdqe_page_size(rcfw->cmdq_depth); - hwq_attr.depth = rcfw->cmdq_depth; + hwq_attr.depth = rcfw->cmdq_depth & 0x7FFFFFFF; hwq_attr.stride = BNXT_QPLIB_CMDQE_UNITS; hwq_attr.type = HWQ_TYPE_CTX; if (bnxt_qplib_alloc_init_hwq(&cmdq->hwq, &hwq_attr)) { From 3099bcdc19b701f732f638ee45679858c08559bb Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Fri, 9 Jun 2023 04:01:39 -0700 Subject: [PATCH 40/71] RDMA/bnxt_re: Avoid calling wake_up threads from spin_lock context bnxt_qplib_service_creq can be called from interrupt or tasklet or process context. So the function take irq variant of spin_lock. But when wake_up is invoked with the lock held, it is putting the calling context to sleep. [exception RIP: __wake_up_common+190] RIP: ffffffffb7539d7e RSP: ffffa73300207ad8 RFLAGS: 00000083 RAX: 0000000000000001 RBX: ffff91fa295f69b8 RCX: dead000000000200 RDX: ffffa733344af940 RSI: ffffa73336527940 RDI: ffffa73336527940 RBP: 000000000000001c R8: 0000000000000002 R9: 00000000000299c0 R10: 0000017230de82c5 R11: 0000000000000002 R12: ffffa73300207b28 R13: 0000000000000000 R14: ffffa733341bf928 R15: 0000000000000000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 Call the wakeup after releasing the lock. 
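The shape of the fix, as a simplified kernel-style sketch (names are illustrative, not the driver's actual code): count the required wakeups while the lock is held and call wake_up_nr() only after the lock is dropped.

  #include <linux/spinlock.h>
  #include <linux/wait.h>

  /* Illustrative only: wakeups are counted under the lock and issued
   * after the lock is released, never from inside the locked region.
   */
  static void service_completions(spinlock_t *lock, wait_queue_head_t *waitq)
  {
          unsigned long flags;
          unsigned int num_wakeup = 0;

          spin_lock_irqsave(lock, flags);
          /* ... process completions; bump num_wakeup for each waiter
           * instead of calling wake_up() here ...
           */
          num_wakeup++;
          spin_unlock_irqrestore(lock, flags);

          if (num_wakeup)
                  wake_up_nr(waitq, num_wakeup);  /* no spinlock held here */
  }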
Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1686308514-11996-3-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index d4ce82bebb0a..c11b8e708844 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -295,7 +295,8 @@ static int bnxt_qplib_process_func_event(struct bnxt_qplib_rcfw *rcfw, } static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, - struct creq_qp_event *qp_event) + struct creq_qp_event *qp_event, + u32 *num_wait) { struct creq_qp_error_notification *err_event; struct bnxt_qplib_hwq *hwq = &rcfw->cmdq.hwq; @@ -304,6 +305,7 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, u16 cbit, blocked = 0; struct pci_dev *pdev; unsigned long flags; + u32 wait_cmds = 0; __le16 mcookie; u16 cookie; int rc = 0; @@ -363,9 +365,10 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, crsqe->req_size = 0; if (!blocked) - wake_up(&rcfw->cmdq.waitq); + wait_cmds++; spin_unlock_irqrestore(&hwq->lock, flags); } + *num_wait += wait_cmds; return rc; } @@ -379,6 +382,7 @@ static void bnxt_qplib_service_creq(struct tasklet_struct *t) struct creq_base *creqe; u32 sw_cons, raw_cons; unsigned long flags; + u32 num_wakeup = 0; /* Service the CREQ until budget is over */ spin_lock_irqsave(&hwq->lock, flags); @@ -397,7 +401,8 @@ static void bnxt_qplib_service_creq(struct tasklet_struct *t) switch (type) { case CREQ_BASE_TYPE_QP_EVENT: bnxt_qplib_process_qp_event - (rcfw, (struct creq_qp_event *)creqe); + (rcfw, (struct creq_qp_event *)creqe, + &num_wakeup); creq->stats.creq_qp_event_processed++; break; case CREQ_BASE_TYPE_FUNC_EVENT: @@ -425,6 +430,8 @@ static void bnxt_qplib_service_creq(struct tasklet_struct *t) rcfw->res->cctx, true); } spin_unlock_irqrestore(&hwq->lock, flags); + if (num_wakeup) + wake_up_nr(&rcfw->cmdq.waitq, num_wakeup); } static irqreturn_t bnxt_qplib_creq_irq(int irq, void *dev_instance) From b021186bca9d6b3d6d16ea8b97c4a598f62de12c Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Fri, 9 Jun 2023 04:01:40 -0700 Subject: [PATCH 41/71] RDMA/bnxt_re: remove virt_func check while creating RoCE FW channel There is a common FW communication offset for both PF and VF. Removed code around virt_fn check while creating FW channel. 
Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1686308514-11996-4-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 2 +- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 15 ++++++--------- drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 5 ++--- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 4718af6c4c47..51372de19e22 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1202,7 +1202,7 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) db_offt = bnxt_re_get_nqdb_offset(rdev, BNXT_RE_AEQ_IDX); vid = rdev->en_dev->msix_entries[BNXT_RE_AEQ_IDX].vector; rc = bnxt_qplib_enable_rcfw_channel(&rdev->rcfw, - vid, db_offt, rdev->is_virtfn, + vid, db_offt, &bnxt_re_aeq_handler); if (rc) { ibdev_err(&rdev->ibdev, "Failed to enable RCFW channel: %#x\n", diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index c11b8e708844..67140eb02c4d 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -724,13 +724,11 @@ int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector, return 0; } -static int bnxt_qplib_map_cmdq_mbox(struct bnxt_qplib_rcfw *rcfw, bool is_vf) +static int bnxt_qplib_map_cmdq_mbox(struct bnxt_qplib_rcfw *rcfw) { struct bnxt_qplib_cmdq_mbox *mbox; resource_size_t bar_reg; struct pci_dev *pdev; - u16 prod_offt; - int rc = 0; pdev = rcfw->pdev; mbox = &rcfw->cmdq.cmdq_mbox; @@ -755,11 +753,10 @@ static int bnxt_qplib_map_cmdq_mbox(struct bnxt_qplib_rcfw *rcfw, bool is_vf) return -ENOMEM; } - prod_offt = is_vf ? 
RCFW_VF_COMM_PROD_OFFSET : - RCFW_PF_COMM_PROD_OFFSET; - mbox->prod = (void __iomem *)(mbox->reg.bar_reg + prod_offt); + mbox->prod = (void __iomem *)(mbox->reg.bar_reg + + RCFW_PF_VF_COMM_PROD_OFFSET); mbox->db = (void __iomem *)(mbox->reg.bar_reg + RCFW_COMM_TRIG_OFFSET); - return rc; + return 0; } static int bnxt_qplib_map_creq_db(struct bnxt_qplib_rcfw *rcfw, u32 reg_offt) @@ -820,7 +817,7 @@ static void bnxt_qplib_start_rcfw(struct bnxt_qplib_rcfw *rcfw) int bnxt_qplib_enable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw, int msix_vector, - int cp_bar_reg_off, int virt_fn, + int cp_bar_reg_off, aeq_handler_t aeq_handler) { struct bnxt_qplib_cmdq_ctx *cmdq; @@ -840,7 +837,7 @@ int bnxt_qplib_enable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw, creq->stats.creq_func_event_processed = 0; creq->aeq_handler = aeq_handler; - rc = bnxt_qplib_map_cmdq_mbox(rcfw, virt_fn); + rc = bnxt_qplib_map_cmdq_mbox(rcfw); if (rc) return rc; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 92f7a25533d3..0dff56e3db88 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -45,8 +45,7 @@ #define RCFW_COMM_PCI_BAR_REGION 0 #define RCFW_COMM_CONS_PCI_BAR_REGION 2 #define RCFW_COMM_BASE_OFFSET 0x600 -#define RCFW_PF_COMM_PROD_OFFSET 0xc -#define RCFW_VF_COMM_PROD_OFFSET 0xc +#define RCFW_PF_VF_COMM_PROD_OFFSET 0xc #define RCFW_COMM_TRIG_OFFSET 0x100 #define RCFW_COMM_SIZE 0x104 @@ -235,7 +234,7 @@ int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector, bool need_init); int bnxt_qplib_enable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw, int msix_vector, - int cp_bar_reg_off, int virt_fn, + int cp_bar_reg_off, aeq_handler_t aeq_handler); struct bnxt_qplib_rcfw_sbuf *bnxt_qplib_rcfw_alloc_sbuf( From 258ee04317dacf789da420e2791fde2eb0da6946 Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Fri, 9 Jun 2023 04:01:41 -0700 Subject: [PATCH 42/71] RDMA/bnxt_re: set fixed command queue depth There is no need to set the maximum command queue entries based on a firmware version check. Remove the deprecated code.
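For context (not part of the patch), with the depth fixed at BNXT_QPLIB_CMDQE_MAX_CNT (8192), RCFW_MAX_COOKIE_VALUE below becomes 8191, so cookies wrap in step with the command-queue slots. A tiny standalone sketch assuming those values:

  #include <stdio.h>

  #define CMDQE_MAX_CNT     8192                  /* BNXT_QPLIB_CMDQE_MAX_CNT */
  #define MAX_COOKIE_VALUE  (CMDQE_MAX_CNT - 1)   /* 8191 */

  int main(void)
  {
          unsigned int seq_num = 8195;  /* hypothetical running sequence number */
          unsigned int cookie = seq_num & MAX_COOKIE_VALUE;  /* wraps to 3 */
          unsigned int cbit = cookie % CMDQE_MAX_CNT;        /* slot 3 in cmdq_bitmap */

          printf("cookie %u slot %u\n", cookie, cbit);
          return 0;
  }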
Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1686308514-11996-5-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 6 ++---- drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 7 ++----- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 67140eb02c4d..66121fb898b3 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -600,10 +600,8 @@ int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res, "HW channel CREQ allocation failed\n"); goto fail; } - if (ctx->hwrm_intf_ver < HWRM_VERSION_RCFW_CMDQ_DEPTH_CHECK) - rcfw->cmdq_depth = BNXT_QPLIB_CMDQE_MAX_CNT_256; - else - rcfw->cmdq_depth = BNXT_QPLIB_CMDQE_MAX_CNT_8192; + + rcfw->cmdq_depth = BNXT_QPLIB_CMDQE_MAX_CNT; sginfo.pgsize = bnxt_qplib_cmdqe_page_size(rcfw->cmdq_depth); hwq_attr.depth = rcfw->cmdq_depth & 0x7FFFFFFF; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 0dff56e3db88..32e5b6767cc8 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -69,8 +69,7 @@ static inline void bnxt_qplib_rcfw_cmd_prep(struct cmdq_base *req, #define RCFW_CMD_WAIT_TIME_MS 20000 /* 20 Seconds timeout */ /* CMDQ elements */ -#define BNXT_QPLIB_CMDQE_MAX_CNT_256 256 -#define BNXT_QPLIB_CMDQE_MAX_CNT_8192 8192 +#define BNXT_QPLIB_CMDQE_MAX_CNT 8192 #define BNXT_QPLIB_CMDQE_BYTES(depth) ((depth) * BNXT_QPLIB_CMDQE_UNITS) static inline u32 bnxt_qplib_cmdqe_npages(u32 depth) @@ -105,12 +104,10 @@ static inline u32 bnxt_qplib_set_cmd_slots(struct cmdq_base *req) return cmd_byte; } -#define RCFW_MAX_COOKIE_VALUE 0x7FFF +#define RCFW_MAX_COOKIE_VALUE (BNXT_QPLIB_CMDQE_MAX_CNT - 1) #define RCFW_CMD_IS_BLOCKING 0x8000 #define RCFW_BLOCKED_CMD_WAIT_COUNT 20000000UL /* 20 sec */ -#define HWRM_VERSION_RCFW_CMDQ_DEPTH_CHECK 0x1000900020011ULL - /* Crsq buf is 1024-Byte */ struct bnxt_qplib_crsbe { u8 data[1024]; From 8cf1d12ad56beb73d2439ccf334b7148e71de58e Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Fri, 9 Jun 2023 04:01:42 -0700 Subject: [PATCH 43/71] RDMA/bnxt_re: Enhance the existing functions that wait for FW responses Use jiffies based timewait instead of counting iteration for commands that block for FW response. Also add a poll routine for control path commands. This is for polling completion if the waiting commands timeout. This avoids cases where the driver misses completion interrupts. Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1686308514-11996-6-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 65 +++++++++++++++++----- 1 file changed, 51 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 66121fb898b3..3b242cca940f 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -53,37 +53,74 @@ static void bnxt_qplib_service_creq(struct tasklet_struct *t); -/* Hardware communication channel */ +/** + * __wait_for_resp - Don't hold the cpu context and wait for response + * @rcfw - rcfw channel instance of rdev + * @cookie - cookie to track the command + * + * Wait for command completion in sleepable context. 
+ * + * Returns: + * 0 if command is completed by firmware. + * Non zero error code for rest of the case. + */ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) { struct bnxt_qplib_cmdq_ctx *cmdq; u16 cbit; - int rc; + int ret; cmdq = &rcfw->cmdq; cbit = cookie % rcfw->cmdq_depth; - rc = wait_event_timeout(cmdq->waitq, - !test_bit(cbit, cmdq->cmdq_bitmap), - msecs_to_jiffies(RCFW_CMD_WAIT_TIME_MS)); - return rc ? 0 : -ETIMEDOUT; + + do { + /* Non zero means command completed */ + ret = wait_event_timeout(cmdq->waitq, + !test_bit(cbit, cmdq->cmdq_bitmap), + msecs_to_jiffies(10000)); + + if (!test_bit(cbit, cmdq->cmdq_bitmap)) + return 0; + + bnxt_qplib_service_creq(&rcfw->creq.creq_tasklet); + + if (!test_bit(cbit, cmdq->cmdq_bitmap)) + return 0; + + } while (true); }; +/** + * __block_for_resp - hold the cpu context and wait for response + * @rcfw - rcfw channel instance of rdev + * @cookie - cookie to track the command + * + * This function will hold the cpu (non-sleepable context) and + * wait for command completion. Maximum holding interval is 8 second. + * + * Returns: + * -ETIMEOUT if command is not completed in specific time interval. + * 0 if command is completed by firmware. + */ static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) { - u32 count = RCFW_BLOCKED_CMD_WAIT_COUNT; - struct bnxt_qplib_cmdq_ctx *cmdq; + struct bnxt_qplib_cmdq_ctx *cmdq = &rcfw->cmdq; + unsigned long issue_time = 0; u16 cbit; - cmdq = &rcfw->cmdq; cbit = cookie % rcfw->cmdq_depth; - if (!test_bit(cbit, cmdq->cmdq_bitmap)) - goto done; + issue_time = jiffies; + do { udelay(1); + bnxt_qplib_service_creq(&rcfw->creq.creq_tasklet); - } while (test_bit(cbit, cmdq->cmdq_bitmap) && --count); -done: - return count ? 0 : -ETIMEDOUT; + if (!test_bit(cbit, cmdq->cmdq_bitmap)) + return 0; + + } while (time_before(jiffies, issue_time + (8 * HZ))); + + return -ETIMEDOUT; }; static int __send_message(struct bnxt_qplib_rcfw *rcfw, From 3022cc15119733cebaef05feddb5d87b9e401c0e Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Fri, 9 Jun 2023 04:01:43 -0700 Subject: [PATCH 44/71] RDMA/bnxt_re: Avoid the command wait if firmware is inactive Add a check to avoid waiting if driver already detects a FW timeout. Return success for resource destroy in case the device is detached. Add helper function to map timeout error code to success. Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1686308514-11996-7-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 52 ++++++++++++++++++++-- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 3b242cca940f..3c4f72aa5fee 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -53,10 +53,47 @@ static void bnxt_qplib_service_creq(struct tasklet_struct *t); +/** + * bnxt_qplib_map_rc - map return type based on opcode + * @opcode - roce slow path opcode + * + * In some cases like firmware halt is detected, the driver is supposed to + * remap the error code of the timed out command. + * + * It is not safe to assume hardware is really inactive so certain opcodes + * like destroy qp etc are not safe to be returned success, but this function + * will be called when FW already reports a timeout. This would be possible + * only when FW crashes and resets. This will clear all the HW resources. 
+ * + * Returns: + * 0 to communicate success to caller. + * Non zero error code to communicate failure to caller. + */ +static int bnxt_qplib_map_rc(u8 opcode) +{ + switch (opcode) { + case CMDQ_BASE_OPCODE_DESTROY_QP: + case CMDQ_BASE_OPCODE_DESTROY_SRQ: + case CMDQ_BASE_OPCODE_DESTROY_CQ: + case CMDQ_BASE_OPCODE_DEALLOCATE_KEY: + case CMDQ_BASE_OPCODE_DEREGISTER_MR: + case CMDQ_BASE_OPCODE_DELETE_GID: + case CMDQ_BASE_OPCODE_DESTROY_QP1: + case CMDQ_BASE_OPCODE_DESTROY_AH: + case CMDQ_BASE_OPCODE_DEINITIALIZE_FW: + case CMDQ_BASE_OPCODE_MODIFY_ROCE_CC: + case CMDQ_BASE_OPCODE_SET_LINK_AGGR_MODE: + return 0; + default: + return -ETIMEDOUT; + } +} + /** * __wait_for_resp - Don't hold the cpu context and wait for response * @rcfw - rcfw channel instance of rdev * @cookie - cookie to track the command + * @opcode - rcfw submitted for given opcode * * Wait for command completion in sleepable context. * @@ -64,7 +101,7 @@ static void bnxt_qplib_service_creq(struct tasklet_struct *t); * 0 if command is completed by firmware. * Non zero error code for rest of the case. */ -static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) +static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) { struct bnxt_qplib_cmdq_ctx *cmdq; u16 cbit; @@ -74,6 +111,9 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) cbit = cookie % rcfw->cmdq_depth; do { + if (test_bit(ERR_DEVICE_DETACHED, &cmdq->flags)) + return bnxt_qplib_map_rc(opcode); + /* Non zero means command completed */ ret = wait_event_timeout(cmdq->waitq, !test_bit(cbit, cmdq->cmdq_bitmap), @@ -94,6 +134,7 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) * __block_for_resp - hold the cpu context and wait for response * @rcfw - rcfw channel instance of rdev * @cookie - cookie to track the command + * @opcode - rcfw submitted for given opcode * * This function will hold the cpu (non-sleepable context) and * wait for command completion. Maximum holding interval is 8 second. @@ -102,7 +143,7 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) * -ETIMEOUT if command is not completed in specific time interval. * 0 if command is completed by firmware. */ -static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) +static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) { struct bnxt_qplib_cmdq_ctx *cmdq = &rcfw->cmdq; unsigned long issue_time = 0; @@ -112,6 +153,9 @@ static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) issue_time = jiffies; do { + if (test_bit(ERR_DEVICE_DETACHED, &cmdq->flags)) + return bnxt_qplib_map_rc(opcode); + udelay(1); bnxt_qplib_service_creq(&rcfw->creq.creq_tasklet); @@ -267,9 +311,9 @@ int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, } while (retry_cnt--); if (msg->block) - rc = __block_for_resp(rcfw, cookie); + rc = __block_for_resp(rcfw, cookie, opcode); else - rc = __wait_for_resp(rcfw, cookie); + rc = __wait_for_resp(rcfw, cookie, opcode); if (rc) { /* timed out */ dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x timedout (%d)msec\n", From 65288a22ddd81422a2a2a10c15df976a5332e41b Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Fri, 9 Jun 2023 04:01:44 -0700 Subject: [PATCH 45/71] RDMA/bnxt_re: use shadow qd while posting non blocking rcfw command Whenever there is a fast path IO and create/destroy resources from the slow path is happening in parallel, we may notice high latency of slow path command completion. 
Introduces a shadow queue depth to prevent the outstanding requests to the FW. Driver will not allow more than #RCFW_CMD_NON_BLOCKING_SHADOW_QD non-blocking commands to the Firmware. Shadow queue depth is a soft limit only for non-blocking commands. Blocking commands will be posted to the firmware as long as there is a free slot. Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1686308514-11996-8-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 60 +++++++++++++++++++++- drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 3 ++ 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 3c4f72aa5fee..f7d12388ef31 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -281,8 +281,21 @@ done: return 0; } -int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, - struct bnxt_qplib_cmdqmsg *msg) +/** + * __bnxt_qplib_rcfw_send_message - qplib interface to send + * and complete rcfw command. + * @rcfw - rcfw channel instance of rdev + * @msg - qplib message internal + * + * This function does not account shadow queue depth. It will send + * all the command unconditionally as long as send queue is not full. + * + * Returns: + * 0 if command completed by firmware. + * Non zero if the command is not completed by firmware. + */ +static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_cmdqmsg *msg) { struct creq_qp_event *evnt = (struct creq_qp_event *)msg->resp; u16 cookie; @@ -331,6 +344,48 @@ int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, return rc; } + +/** + * bnxt_qplib_rcfw_send_message - qplib interface to send + * and complete rcfw command. + * @rcfw - rcfw channel instance of rdev + * @msg - qplib message internal + * + * Driver interact with Firmware through rcfw channel/slow path in two ways. + * a. Blocking rcfw command send. In this path, driver cannot hold + * the context for longer period since it is holding cpu until + * command is not completed. + * b. Non-blocking rcfw command send. In this path, driver can hold the + * context for longer period. There may be many pending command waiting + * for completion because of non-blocking nature. + * + * Driver will use shadow queue depth. Current queue depth of 8K + * (due to size of rcfw message there can be actual ~4K rcfw outstanding) + * is not optimal for rcfw command processing in firmware. + * + * Restrict at max #RCFW_CMD_NON_BLOCKING_SHADOW_QD Non-Blocking rcfw commands. + * Allow all blocking commands until there is no queue full. + * + * Returns: + * 0 if command completed by firmware. + * Non zero if the command is not completed by firmware. 
+ */ +int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_cmdqmsg *msg) +{ + int ret; + + if (!msg->block) { + down(&rcfw->rcfw_inflight); + ret = __bnxt_qplib_rcfw_send_message(rcfw, msg); + up(&rcfw->rcfw_inflight); + } else { + ret = __bnxt_qplib_rcfw_send_message(rcfw, msg); + } + + return ret; +} + /* Completions */ static int bnxt_qplib_process_func_event(struct bnxt_qplib_rcfw *rcfw, struct creq_func_event *func_event) @@ -932,6 +987,7 @@ int bnxt_qplib_enable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw, return rc; } + sema_init(&rcfw->rcfw_inflight, RCFW_CMD_NON_BLOCKING_SHADOW_QD); bnxt_qplib_start_rcfw(rcfw); return 0; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 32e5b6767cc8..862bfbfa7884 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -66,6 +66,8 @@ static inline void bnxt_qplib_rcfw_cmd_prep(struct cmdq_base *req, req->cmd_size = cmd_size; } +/* Shadow queue depth for non blocking command */ +#define RCFW_CMD_NON_BLOCKING_SHADOW_QD 64 #define RCFW_CMD_WAIT_TIME_MS 20000 /* 20 Seconds timeout */ /* CMDQ elements */ @@ -197,6 +199,7 @@ struct bnxt_qplib_rcfw { u64 oos_prev; u32 init_oos_stats; u32 cmdq_depth; + struct semaphore rcfw_inflight; }; struct bnxt_qplib_cmdqmsg { From 159cf95e42a7ca7375646fab82c0056cbb71f9e9 Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Fri, 9 Jun 2023 04:01:45 -0700 Subject: [PATCH 46/71] RDMA/bnxt_re: Simplify the function that sends the FW commands - Use __send_message_basic_sanity helper function. - Do not retry posting same command if there is a queue full detection. - ENXIO is used to indicate controller recovery. - In the case of ERR_DEVICE_DETACHED state, the driver should not post commands to the firmware, but also return fabricated written code. 
Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1686308514-11996-9-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 125 +++++++++++---------- drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 22 ++++ 2 files changed, 86 insertions(+), 61 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index f7d12388ef31..15f67939602d 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -170,34 +170,22 @@ static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct bnxt_qplib_cmdqmsg *msg) { - struct bnxt_qplib_cmdq_ctx *cmdq = &rcfw->cmdq; - struct bnxt_qplib_hwq *hwq = &cmdq->hwq; + u32 bsize, opcode, free_slots, required_slots; + struct bnxt_qplib_cmdq_ctx *cmdq; struct bnxt_qplib_crsqe *crsqe; struct bnxt_qplib_cmdqe *cmdqe; + struct bnxt_qplib_hwq *hwq; u32 sw_prod, cmdq_prod; struct pci_dev *pdev; unsigned long flags; - u32 bsize, opcode; u16 cookie, cbit; u8 *preq; + cmdq = &rcfw->cmdq; + hwq = &cmdq->hwq; pdev = rcfw->pdev; opcode = __get_cmdq_base_opcode(msg->req, msg->req_sz); - if (!test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) && - (opcode != CMDQ_BASE_OPCODE_QUERY_FUNC && - opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW && - opcode != CMDQ_BASE_OPCODE_QUERY_VERSION)) { - dev_err(&pdev->dev, - "RCFW not initialized, reject opcode 0x%x\n", opcode); - return -EINVAL; - } - - if (test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) && - opcode == CMDQ_BASE_OPCODE_INITIALIZE_FW) { - dev_err(&pdev->dev, "RCFW already initialized!\n"); - return -EINVAL; - } if (test_bit(FIRMWARE_TIMED_OUT, &cmdq->flags)) return -ETIMEDOUT; @@ -206,40 +194,37 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, * cmdqe */ spin_lock_irqsave(&hwq->lock, flags); - if (msg->req->cmd_size >= HWQ_FREE_SLOTS(hwq)) { - dev_err(&pdev->dev, "RCFW: CMDQ is full!\n"); + required_slots = bnxt_qplib_get_cmd_slots(msg->req); + free_slots = HWQ_FREE_SLOTS(hwq); + cookie = cmdq->seq_num & RCFW_MAX_COOKIE_VALUE; + cbit = cookie % rcfw->cmdq_depth; + + if (required_slots >= free_slots || + test_bit(cbit, cmdq->cmdq_bitmap)) { + dev_info_ratelimited(&pdev->dev, + "CMDQ is full req/free %d/%d!", + required_slots, free_slots); spin_unlock_irqrestore(&hwq->lock, flags); return -EAGAIN; } - - - cookie = cmdq->seq_num & RCFW_MAX_COOKIE_VALUE; - cbit = cookie % rcfw->cmdq_depth; if (msg->block) cookie |= RCFW_CMD_IS_BLOCKING; - set_bit(cbit, cmdq->cmdq_bitmap); __set_cmdq_base_cookie(msg->req, msg->req_sz, cpu_to_le16(cookie)); crsqe = &rcfw->crsqe_tbl[cbit]; - if (crsqe->resp) { - spin_unlock_irqrestore(&hwq->lock, flags); - return -EBUSY; - } - - /* change the cmd_size to the number of 16byte cmdq unit. 
- * req->cmd_size is modified here - */ bsize = bnxt_qplib_set_cmd_slots(msg->req); - - memset(msg->resp, 0, sizeof(*msg->resp)); + crsqe->free_slots = free_slots; crsqe->resp = (struct creq_qp_event *)msg->resp; crsqe->resp->cookie = cpu_to_le16(cookie); crsqe->req_size = __get_cmdq_base_cmd_size(msg->req, msg->req_sz); if (__get_cmdq_base_resp_size(msg->req, msg->req_sz) && msg->sb) { struct bnxt_qplib_rcfw_sbuf *sbuf = msg->sb; - __set_cmdq_base_resp_addr(msg->req, msg->req_sz, cpu_to_le64(sbuf->dma_addr)); + + __set_cmdq_base_resp_addr(msg->req, msg->req_sz, + cpu_to_le64(sbuf->dma_addr)); __set_cmdq_base_resp_size(msg->req, msg->req_sz, - ALIGN(sbuf->size, BNXT_QPLIB_CMDQE_UNITS)); + ALIGN(sbuf->size, + BNXT_QPLIB_CMDQE_UNITS)); } preq = (u8 *)msg->req; @@ -247,11 +232,6 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, /* Locate the next cmdq slot */ sw_prod = HWQ_CMP(hwq->prod, hwq); cmdqe = bnxt_qplib_get_qe(hwq, sw_prod, NULL); - if (!cmdqe) { - dev_err(&pdev->dev, - "RCFW request failed with no cmdqe!\n"); - goto done; - } /* Copy a segment of the req cmd to the cmdq */ memset(cmdqe, 0, sizeof(*cmdqe)); memcpy(cmdqe, preq, min_t(u32, bsize, sizeof(*cmdqe))); @@ -275,12 +255,43 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, wmb(); writel(cmdq_prod, cmdq->cmdq_mbox.prod); writel(RCFW_CMDQ_TRIG_VAL, cmdq->cmdq_mbox.db); -done: spin_unlock_irqrestore(&hwq->lock, flags); /* Return the CREQ response pointer */ return 0; } +static int __send_message_basic_sanity(struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_cmdqmsg *msg) +{ + struct bnxt_qplib_cmdq_ctx *cmdq; + u32 opcode; + + cmdq = &rcfw->cmdq; + opcode = __get_cmdq_base_opcode(msg->req, msg->req_sz); + + /* Prevent posting if f/w is not in a state to process */ + if (test_bit(ERR_DEVICE_DETACHED, &rcfw->cmdq.flags)) + return -ENXIO; + + if (test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) && + opcode == CMDQ_BASE_OPCODE_INITIALIZE_FW) { + dev_err(&rcfw->pdev->dev, "QPLIB: RCFW already initialized!"); + return -EINVAL; + } + + if (!test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) && + (opcode != CMDQ_BASE_OPCODE_QUERY_FUNC && + opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW && + opcode != CMDQ_BASE_OPCODE_QUERY_VERSION)) { + dev_err(&rcfw->pdev->dev, + "QPLIB: RCFW not initialized, reject opcode 0x%x", + opcode); + return -EOPNOTSUPP; + } + + return 0; +} + /** * __bnxt_qplib_rcfw_send_message - qplib interface to send * and complete rcfw command. @@ -299,29 +310,21 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, { struct creq_qp_event *evnt = (struct creq_qp_event *)msg->resp; u16 cookie; - u8 opcode, retry_cnt = 0xFF; int rc = 0; + u8 opcode; - /* Prevent posting if f/w is not in a state to process */ - if (test_bit(ERR_DEVICE_DETACHED, &rcfw->cmdq.flags)) - return 0; + opcode = __get_cmdq_base_opcode(msg->req, msg->req_sz); - do { - opcode = __get_cmdq_base_opcode(msg->req, msg->req_sz); - rc = __send_message(rcfw, msg); - cookie = le16_to_cpu(__get_cmdq_base_cookie(msg->req, msg->req_sz)) & - RCFW_MAX_COOKIE_VALUE; - if (!rc) - break; - if (!retry_cnt || (rc != -EAGAIN && rc != -EBUSY)) { - /* send failed */ - dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x send failed\n", - cookie, opcode); - return rc; - } - msg->block ? mdelay(1) : usleep_range(500, 1000); + rc = __send_message_basic_sanity(rcfw, msg); + if (rc) + return rc == -ENXIO ? 
bnxt_qplib_map_rc(opcode) : rc; - } while (retry_cnt--); + rc = __send_message(rcfw, msg); + if (rc) + return rc; + + cookie = le16_to_cpu(__get_cmdq_base_cookie(msg->req, msg->req_sz)) + & RCFW_MAX_COOKIE_VALUE; if (msg->block) rc = __block_for_resp(rcfw, cookie, opcode); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 862bfbfa7884..b7bbbae897d1 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -89,6 +89,26 @@ static inline u32 bnxt_qplib_cmdqe_page_size(u32 depth) return (bnxt_qplib_cmdqe_npages(depth) * PAGE_SIZE); } +/* Get the number of command units required for the req. The + * function returns correct value only if called before + * setting using bnxt_qplib_set_cmd_slots + */ +static inline u32 bnxt_qplib_get_cmd_slots(struct cmdq_base *req) +{ + u32 cmd_units = 0; + + if (HAS_TLV_HEADER(req)) { + struct roce_tlv *tlv_req = (struct roce_tlv *)req; + + cmd_units = tlv_req->total_size; + } else { + cmd_units = (req->cmd_size + BNXT_QPLIB_CMDQE_UNITS - 1) / + BNXT_QPLIB_CMDQE_UNITS; + } + + return cmd_units; +} + static inline u32 bnxt_qplib_set_cmd_slots(struct cmdq_base *req) { u32 cmd_byte = 0; @@ -130,6 +150,8 @@ typedef int (*aeq_handler_t)(struct bnxt_qplib_rcfw *, void *, void *); struct bnxt_qplib_crsqe { struct creq_qp_event *resp; u32 req_size; + /* Free slots at the time of submission */ + u32 free_slots; }; struct bnxt_qplib_rcfw_sbuf { From 354f5bd985af9515190828bc642ebdf59acea121 Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Fri, 9 Jun 2023 04:01:46 -0700 Subject: [PATCH 47/71] RDMA/bnxt_re: add helper function __poll_for_resp This interface will be used if the driver has not enabled interrupt and/or interrupt is disabled for a short period of time. Completion is not possible from interrupt so this interface does self-polling. Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1686308514-11996-10-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 44 +++++++++++++++++++++- drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 1 + 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 15f67939602d..3215f8a99afb 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -260,6 +260,44 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, return 0; } +/** + * __poll_for_resp - self poll completion for rcfw command + * @rcfw - rcfw channel instance of rdev + * @cookie - cookie to track the command + * @opcode - rcfw submitted for given opcode + * + * It works same as __wait_for_resp except this function will + * do self polling in sort interval since interrupt is disabled. + * This function can not be called from non-sleepable context. + * + * Returns: + * -ETIMEOUT if command is not completed in specific time interval. + * 0 if command is completed by firmware. 
+ */ +static int __poll_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, + u8 opcode) +{ + struct bnxt_qplib_cmdq_ctx *cmdq = &rcfw->cmdq; + unsigned long issue_time; + u16 cbit; + + cbit = cookie % rcfw->cmdq_depth; + issue_time = jiffies; + + do { + if (test_bit(ERR_DEVICE_DETACHED, &cmdq->flags)) + return bnxt_qplib_map_rc(opcode); + + usleep_range(1000, 1001); + + bnxt_qplib_service_creq(&rcfw->creq.creq_tasklet); + if (!test_bit(cbit, cmdq->cmdq_bitmap)) + return 0; + if (jiffies_to_msecs(jiffies - issue_time) > 10000) + return -ETIMEDOUT; + } while (true); +}; + static int __send_message_basic_sanity(struct bnxt_qplib_rcfw *rcfw, struct bnxt_qplib_cmdqmsg *msg) { @@ -328,8 +366,10 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, if (msg->block) rc = __block_for_resp(rcfw, cookie, opcode); - else + else if (atomic_read(&rcfw->rcfw_intr_enabled)) rc = __wait_for_resp(rcfw, cookie, opcode); + else + rc = __poll_for_resp(rcfw, cookie, opcode); if (rc) { /* timed out */ dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x timedout (%d)msec\n", @@ -796,6 +836,7 @@ void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill) kfree(creq->irq_name); creq->irq_name = NULL; creq->requested = false; + atomic_set(&rcfw->rcfw_intr_enabled, 0); } void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) @@ -857,6 +898,7 @@ int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector, creq->requested = true; bnxt_qplib_ring_nq_db(&creq->creq_db.dbinfo, res->cctx, true); + atomic_inc(&rcfw->rcfw_intr_enabled); return 0; } diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index b7bbbae897d1..089e61677e90 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -221,6 +221,7 @@ struct bnxt_qplib_rcfw { u64 oos_prev; u32 init_oos_stats; u32 cmdq_depth; + atomic_t rcfw_intr_enabled; struct semaphore rcfw_inflight; }; From 691eb7c6110fe04e38bf0f60d713482508b3d199 Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Fri, 9 Jun 2023 04:01:47 -0700 Subject: [PATCH 48/71] RDMA/bnxt_re: handle command completions after driver detect a timedout If calling context detect command timeout, associated memory stored on stack will not be valid. If firmware complete the same command later, this causes incorrect memory access by driver. Added is_waiter_alive to handle delayed completion by firmware. is_waiter_alive is set and reset under command queue lock. 
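For illustration only (condensed from the hunks below), the two sides interact as:

	/* submitter, after a timeout, under the command queue lock */
	crsqe = &rcfw->crsqe_tbl[cbit];
	crsqe->is_waiter_alive = false;

	/* completion path in bnxt_qplib_process_qp_event() */
	if (crsqe->is_waiter_alive) {
		if (crsqe->resp)
			memcpy(crsqe->resp, qp_event, sizeof(*qp_event));
		if (!blocked)
			wait_cmds++;
	}

so a completion that arrives after the waiter has given up no longer writes into stack memory that is no longer valid.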
Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1686308514-11996-11-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 61 ++++++++++++---------- drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 1 + 2 files changed, 35 insertions(+), 27 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 3215f8a99afb..4e5f66e5c714 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -105,7 +105,6 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) { struct bnxt_qplib_cmdq_ctx *cmdq; u16 cbit; - int ret; cmdq = &rcfw->cmdq; cbit = cookie % rcfw->cmdq_depth; @@ -115,9 +114,9 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) return bnxt_qplib_map_rc(opcode); /* Non zero means command completed */ - ret = wait_event_timeout(cmdq->waitq, - !test_bit(cbit, cmdq->cmdq_bitmap), - msecs_to_jiffies(10000)); + wait_event_timeout(cmdq->waitq, + !test_bit(cbit, cmdq->cmdq_bitmap), + msecs_to_jiffies(10000)); if (!test_bit(cbit, cmdq->cmdq_bitmap)) return 0; @@ -170,7 +169,7 @@ static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct bnxt_qplib_cmdqmsg *msg) { - u32 bsize, opcode, free_slots, required_slots; + u32 bsize, free_slots, required_slots; struct bnxt_qplib_cmdq_ctx *cmdq; struct bnxt_qplib_crsqe *crsqe; struct bnxt_qplib_cmdqe *cmdqe; @@ -185,8 +184,6 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, hwq = &cmdq->hwq; pdev = rcfw->pdev; - opcode = __get_cmdq_base_opcode(msg->req, msg->req_sz); - if (test_bit(FIRMWARE_TIMED_OUT, &cmdq->flags)) return -ETIMEDOUT; @@ -216,6 +213,7 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, crsqe->free_slots = free_slots; crsqe->resp = (struct creq_qp_event *)msg->resp; crsqe->resp->cookie = cpu_to_le16(cookie); + crsqe->is_waiter_alive = true; crsqe->req_size = __get_cmdq_base_cmd_size(msg->req, msg->req_sz); if (__get_cmdq_base_resp_size(msg->req, msg->req_sz) && msg->sb) { struct bnxt_qplib_rcfw_sbuf *sbuf = msg->sb; @@ -347,7 +345,9 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, struct bnxt_qplib_cmdqmsg *msg) { struct creq_qp_event *evnt = (struct creq_qp_event *)msg->resp; - u16 cookie; + struct bnxt_qplib_crsqe *crsqe; + unsigned long flags; + u16 cookie, cbit; int rc = 0; u8 opcode; @@ -363,6 +363,7 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, cookie = le16_to_cpu(__get_cmdq_base_cookie(msg->req, msg->req_sz)) & RCFW_MAX_COOKIE_VALUE; + cbit = cookie % rcfw->cmdq_depth; if (msg->block) rc = __block_for_resp(rcfw, cookie, opcode); @@ -378,6 +379,14 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, return rc; } + if (rc) { + spin_lock_irqsave(&rcfw->cmdq.hwq.lock, flags); + crsqe = &rcfw->crsqe_tbl[cbit]; + crsqe->is_waiter_alive = false; + spin_unlock_irqrestore(&rcfw->cmdq.hwq.lock, flags); + return -ETIMEDOUT; + } + if (evnt->status) { /* failed with status */ dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x status %#x\n", @@ -480,15 +489,15 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, struct creq_qp_error_notification *err_event; struct bnxt_qplib_hwq *hwq = &rcfw->cmdq.hwq; struct bnxt_qplib_crsqe *crsqe; + u32 qp_id, tbl_indx, req_size; struct bnxt_qplib_qp *qp; u16 
cbit, blocked = 0; + bool is_waiter_alive; struct pci_dev *pdev; unsigned long flags; u32 wait_cmds = 0; - __le16 mcookie; u16 cookie; int rc = 0; - u32 qp_id, tbl_indx; pdev = rcfw->pdev; switch (qp_event->event) { @@ -520,31 +529,29 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, spin_lock_irqsave_nested(&hwq->lock, flags, SINGLE_DEPTH_NESTING); cookie = le16_to_cpu(qp_event->cookie); - mcookie = qp_event->cookie; blocked = cookie & RCFW_CMD_IS_BLOCKING; cookie &= RCFW_MAX_COOKIE_VALUE; cbit = cookie % rcfw->cmdq_depth; crsqe = &rcfw->crsqe_tbl[cbit]; - if (crsqe->resp && - crsqe->resp->cookie == mcookie) { - memcpy(crsqe->resp, qp_event, sizeof(*qp_event)); - crsqe->resp = NULL; - } else { - if (crsqe->resp && crsqe->resp->cookie) - dev_err(&pdev->dev, - "CMD %s cookie sent=%#x, recd=%#x\n", - crsqe->resp ? "mismatch" : "collision", - crsqe->resp ? crsqe->resp->cookie : 0, - mcookie); - } if (!test_and_clear_bit(cbit, rcfw->cmdq.cmdq_bitmap)) dev_warn(&pdev->dev, "CMD bit %d was not requested\n", cbit); - hwq->cons += crsqe->req_size; - crsqe->req_size = 0; - if (!blocked) - wait_cmds++; + if (crsqe->is_waiter_alive) { + if (crsqe->resp) + memcpy(crsqe->resp, qp_event, sizeof(*qp_event)); + if (!blocked) + wait_cmds++; + } + + req_size = crsqe->req_size; + is_waiter_alive = crsqe->is_waiter_alive; + + crsqe->req_size = 0; + if (!is_waiter_alive) + crsqe->resp = NULL; + + hwq->cons += req_size; spin_unlock_irqrestore(&hwq->lock, flags); } *num_wait += wait_cmds; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 089e61677e90..6ed81c180821 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -152,6 +152,7 @@ struct bnxt_qplib_crsqe { u32 req_size; /* Free slots at the time of submission */ u32 free_slots; + bool is_waiter_alive; }; struct bnxt_qplib_rcfw_sbuf { From b6c7256688264f5096a2cfaaaa56d01ea686b367 Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Fri, 9 Jun 2023 04:01:48 -0700 Subject: [PATCH 49/71] RDMA/bnxt_re: Add firmware stall check detection Every completion will update last_seen value in the unit of jiffies. last_seen field will be used to know if firmware is alive and is useful to detect firmware stall. Non blocking interface __wait_for_resp will have logic to detect firmware stall. After every 10 second interval if __wait_for_resp has not received completion for a given command it will check for firmware stall condition. If current jiffies is greater than last_seen jiffies + RCFW_FW_STALL_TIMEOUT_SEC * HZ, it is a firmware stall. 
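For illustration only (condensed from the hunks below), the stall check is:

	/* every processed CREQ entry refreshes the heartbeat */
	rcfw->cmdq.last_seen = jiffies;

	/* waiters, once the per-interval wait expires */
	if (time_after(jiffies, cmdq->last_seen +
		       (RCFW_FW_STALL_TIMEOUT_SEC * HZ)))
		return -ENODEV;		/* firmware stall */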
Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1686308514-11996-12-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 44 ++++++++++++++++++---- drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 5 ++- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 4e5f66e5c714..349fbed003d3 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -112,11 +112,13 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) do { if (test_bit(ERR_DEVICE_DETACHED, &cmdq->flags)) return bnxt_qplib_map_rc(opcode); + if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) + return -ETIMEDOUT; - /* Non zero means command completed */ wait_event_timeout(cmdq->waitq, !test_bit(cbit, cmdq->cmdq_bitmap), - msecs_to_jiffies(10000)); + msecs_to_jiffies(RCFW_FW_STALL_TIMEOUT_SEC + * 1000)); if (!test_bit(cbit, cmdq->cmdq_bitmap)) return 0; @@ -126,6 +128,11 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) if (!test_bit(cbit, cmdq->cmdq_bitmap)) return 0; + /* Firmware stall is detected */ + if (time_after(jiffies, cmdq->last_seen + + (RCFW_FW_STALL_TIMEOUT_SEC * HZ))) + return -ENODEV; + } while (true); }; @@ -154,6 +161,8 @@ static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) do { if (test_bit(ERR_DEVICE_DETACHED, &cmdq->flags)) return bnxt_qplib_map_rc(opcode); + if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) + return -ETIMEDOUT; udelay(1); @@ -184,9 +193,6 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, hwq = &cmdq->hwq; pdev = rcfw->pdev; - if (test_bit(FIRMWARE_TIMED_OUT, &cmdq->flags)) - return -ETIMEDOUT; - /* Cmdq are in 16-byte units, each request can consume 1 or more * cmdqe */ @@ -285,14 +291,21 @@ static int __poll_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, do { if (test_bit(ERR_DEVICE_DETACHED, &cmdq->flags)) return bnxt_qplib_map_rc(opcode); + if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) + return -ETIMEDOUT; usleep_range(1000, 1001); bnxt_qplib_service_creq(&rcfw->creq.creq_tasklet); if (!test_bit(cbit, cmdq->cmdq_bitmap)) return 0; - if (jiffies_to_msecs(jiffies - issue_time) > 10000) - return -ETIMEDOUT; + if (jiffies_to_msecs(jiffies - issue_time) > + (RCFW_FW_STALL_TIMEOUT_SEC * 1000)) { + /* Firmware stall is detected */ + if (time_after(jiffies, cmdq->last_seen + + (RCFW_FW_STALL_TIMEOUT_SEC * HZ))) + return -ENODEV; + } } while (true); }; @@ -308,6 +321,8 @@ static int __send_message_basic_sanity(struct bnxt_qplib_rcfw *rcfw, /* Prevent posting if f/w is not in a state to process */ if (test_bit(ERR_DEVICE_DETACHED, &rcfw->cmdq.flags)) return -ENXIO; + if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) + return -ETIMEDOUT; if (test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) && opcode == CMDQ_BASE_OPCODE_INITIALIZE_FW) { @@ -375,7 +390,6 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, /* timed out */ dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x timedout (%d)msec\n", cookie, opcode, RCFW_CMD_WAIT_TIME_MS); - set_bit(FIRMWARE_TIMED_OUT, &rcfw->cmdq.flags); return rc; } @@ -383,6 +397,8 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, spin_lock_irqsave(&rcfw->cmdq.hwq.lock, flags); crsqe = &rcfw->crsqe_tbl[cbit]; crsqe->is_waiter_alive = false; + if (rc == -ENODEV) + 
set_bit(FIRMWARE_STALL_DETECTED, &rcfw->cmdq.flags); spin_unlock_irqrestore(&rcfw->cmdq.hwq.lock, flags); return -ETIMEDOUT; } @@ -533,6 +549,17 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, cookie &= RCFW_MAX_COOKIE_VALUE; cbit = cookie % rcfw->cmdq_depth; crsqe = &rcfw->crsqe_tbl[cbit]; + + if (WARN_ONCE(test_bit(FIRMWARE_STALL_DETECTED, + &rcfw->cmdq.flags), + "QPLIB: Unreponsive rcfw channel detected.!!")) { + dev_info(&pdev->dev, + "rcfw timedout: cookie = %#x, free_slots = %d", + cookie, crsqe->free_slots); + spin_unlock_irqrestore(&hwq->lock, flags); + return rc; + } + if (!test_and_clear_bit(cbit, rcfw->cmdq.cmdq_bitmap)) dev_warn(&pdev->dev, "CMD bit %d was not requested\n", cbit); @@ -582,6 +609,7 @@ static void bnxt_qplib_service_creq(struct tasklet_struct *t) * reading any further. */ dma_rmb(); + rcfw->cmdq.last_seen = jiffies; type = creqe->type & CREQ_BASE_TYPE_MASK; switch (type) { diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 6ed81c180821..54576f12c8da 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -51,6 +51,7 @@ #define RCFW_DBR_PCI_BAR_REGION 2 #define RCFW_DBR_BASE_PAGE_SHIFT 12 +#define RCFW_FW_STALL_TIMEOUT_SEC 40 /* Cmdq contains a fix number of a 16-Byte slots */ struct bnxt_qplib_cmdqe { @@ -128,7 +129,6 @@ static inline u32 bnxt_qplib_set_cmd_slots(struct cmdq_base *req) #define RCFW_MAX_COOKIE_VALUE (BNXT_QPLIB_CMDQE_MAX_CNT - 1) #define RCFW_CMD_IS_BLOCKING 0x8000 -#define RCFW_BLOCKED_CMD_WAIT_COUNT 20000000UL /* 20 sec */ /* Crsq buf is 1024-Byte */ struct bnxt_qplib_crsbe { @@ -170,7 +170,7 @@ struct bnxt_qplib_qp_node { #define FIRMWARE_INITIALIZED_FLAG (0) #define FIRMWARE_FIRST_FLAG (31) -#define FIRMWARE_TIMED_OUT (3) +#define FIRMWARE_STALL_DETECTED (3) #define ERR_DEVICE_DETACHED (4) struct bnxt_qplib_cmdq_mbox { @@ -185,6 +185,7 @@ struct bnxt_qplib_cmdq_ctx { wait_queue_head_t waitq; unsigned long flags; unsigned long *cmdq_bitmap; + unsigned long last_seen; u32 seq_num; }; From 84911cf3b2aa8d0a1e563d346c852131582cb871 Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Fri, 9 Jun 2023 04:01:49 -0700 Subject: [PATCH 50/71] RDMA/bnxt_re: post destroy_ah for delayed completion of AH creation AH create may be called from interrpt context and driver has a special timeout (8 sec) for this command. This is to avoid soft lockups when the FW command takes more time. Driver returns -ETIMEOUT and fail create AH, without waiting for actual completion from firmware. When FW completion is received, use is_waiter_alive flag to avoid a regular completion path. If create_ah opcode is detected in completion path which does not have waiter alive, driver will fetch ah_id from successful firmware completion in the interrupt context and sends destroy_ah command for same ah_id. This special post is done in quick manner using helper function __send_message_no_waiter. timeout_send is only used for debugging purposes. If timeout_send value keeps incrementing, it indicates out of sync active ah counter between driver and firmware. This is a limitation but graceful handling is possible in future. 
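For illustration only (condensed from the hunks below), the completion-path trigger is:

	if (!is_waiter_alive && !qp_event->status &&
	    qp_event->event == CREQ_QP_EVENT_EVENT_CREATE_AH)
		__destroy_timedout_ah(rcfw,
				      (struct creq_create_ah_resp *)qp_event);

__send_message_no_waiter() then posts the destroy_ah request without reserving a waiter or checking for queue full, since the caller already holds the completion queue lock and has just consumed a completion.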
Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1686308514-11996-13-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 108 +++++++++++++++++++++ drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 2 + 2 files changed, 110 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 349fbed003d3..8dd82163ab7a 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -175,6 +175,73 @@ static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) return -ETIMEDOUT; }; +/* __send_message_no_waiter - get cookie and post the message. + * @rcfw - rcfw channel instance of rdev + * @msg - qplib message internal + * + * This function will just post and don't bother about completion. + * Current design of this function is - + * user must hold the completion queue hwq->lock. + * user must have used existing completion and free the resources. + * this function will not check queue full condition. + * this function will explicitly set is_waiter_alive=false. + * current use case is - send destroy_ah if create_ah is return + * after waiter of create_ah is lost. It can be extended for other + * use case as well. + * + * Returns: Nothing + * + */ +static void __send_message_no_waiter(struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_cmdqmsg *msg) +{ + struct bnxt_qplib_cmdq_ctx *cmdq = &rcfw->cmdq; + struct bnxt_qplib_hwq *hwq = &cmdq->hwq; + struct bnxt_qplib_crsqe *crsqe; + struct bnxt_qplib_cmdqe *cmdqe; + u32 sw_prod, cmdq_prod; + u16 cookie, cbit; + u32 bsize; + u8 *preq; + + cookie = cmdq->seq_num & RCFW_MAX_COOKIE_VALUE; + cbit = cookie % rcfw->cmdq_depth; + + set_bit(cbit, cmdq->cmdq_bitmap); + __set_cmdq_base_cookie(msg->req, msg->req_sz, cpu_to_le16(cookie)); + crsqe = &rcfw->crsqe_tbl[cbit]; + + /* Set cmd_size in terms of 16B slots in req. 
*/ + bsize = bnxt_qplib_set_cmd_slots(msg->req); + /* GET_CMD_SIZE would return number of slots in either case of tlv + * and non-tlv commands after call to bnxt_qplib_set_cmd_slots() + */ + crsqe->is_internal_cmd = true; + crsqe->is_waiter_alive = false; + crsqe->req_size = __get_cmdq_base_cmd_size(msg->req, msg->req_sz); + + preq = (u8 *)msg->req; + do { + /* Locate the next cmdq slot */ + sw_prod = HWQ_CMP(hwq->prod, hwq); + cmdqe = bnxt_qplib_get_qe(hwq, sw_prod, NULL); + /* Copy a segment of the req cmd to the cmdq */ + memset(cmdqe, 0, sizeof(*cmdqe)); + memcpy(cmdqe, preq, min_t(u32, bsize, sizeof(*cmdqe))); + preq += min_t(u32, bsize, sizeof(*cmdqe)); + bsize -= min_t(u32, bsize, sizeof(*cmdqe)); + hwq->prod++; + } while (bsize > 0); + cmdq->seq_num++; + + cmdq_prod = hwq->prod; + atomic_inc(&rcfw->timeout_send); + /* ring CMDQ DB */ + wmb(); + writel(cmdq_prod, cmdq->cmdq_mbox.prod); + writel(RCFW_CMDQ_TRIG_VAL, cmdq->cmdq_mbox.db); +} + static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct bnxt_qplib_cmdqmsg *msg) { @@ -219,6 +286,7 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, crsqe->free_slots = free_slots; crsqe->resp = (struct creq_qp_event *)msg->resp; crsqe->resp->cookie = cpu_to_le16(cookie); + crsqe->is_internal_cmd = false; crsqe->is_waiter_alive = true; crsqe->req_size = __get_cmdq_base_cmd_size(msg->req, msg->req_sz); if (__get_cmdq_base_resp_size(msg->req, msg->req_sz) && msg->sb) { @@ -343,6 +411,26 @@ static int __send_message_basic_sanity(struct bnxt_qplib_rcfw *rcfw, return 0; } +/* This function will just post and do not bother about completion */ +static void __destroy_timedout_ah(struct bnxt_qplib_rcfw *rcfw, + struct creq_create_ah_resp *create_ah_resp) +{ + struct bnxt_qplib_cmdqmsg msg = {}; + struct cmdq_destroy_ah req = {}; + + bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, + CMDQ_BASE_OPCODE_DESTROY_AH, + sizeof(req)); + req.ah_cid = create_ah_resp->xid; + msg.req = (struct cmdq_base *)&req; + msg.req_sz = sizeof(req); + __send_message_no_waiter(rcfw, &msg); + dev_info_ratelimited(&rcfw->pdev->dev, + "From %s: ah_cid = %d timeout_send %d\n", + __func__, req.ah_cid, + atomic_read(&rcfw->timeout_send)); +} + /** * __bnxt_qplib_rcfw_send_message - qplib interface to send * and complete rcfw command. @@ -563,6 +651,8 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, if (!test_and_clear_bit(cbit, rcfw->cmdq.cmdq_bitmap)) dev_warn(&pdev->dev, "CMD bit %d was not requested\n", cbit); + if (crsqe->is_internal_cmd && !qp_event->status) + atomic_dec(&rcfw->timeout_send); if (crsqe->is_waiter_alive) { if (crsqe->resp) @@ -579,6 +669,24 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, crsqe->resp = NULL; hwq->cons += req_size; + + /* This is a case to handle below scenario - + * Create AH is completed successfully by firmware, + * but completion took more time and driver already lost + * the context of create_ah from caller. + * We have already return failure for create_ah verbs, + * so let's destroy the same address vector since it is + * no more used in stack. We don't care about completion + * in __send_message_no_waiter. + * If destroy_ah is failued by firmware, there will be AH + * resource leak and relatively not critical + unlikely + * scenario. Current design is not to handle such case. 
+ */ + if (!is_waiter_alive && !qp_event->status && + qp_event->event == CREQ_QP_EVENT_EVENT_CREATE_AH) + __destroy_timedout_ah(rcfw, + (struct creq_create_ah_resp *) + qp_event); spin_unlock_irqrestore(&hwq->lock, flags); } *num_wait += wait_cmds; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 54576f12c8da..338bf6a9699f 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -153,6 +153,7 @@ struct bnxt_qplib_crsqe { /* Free slots at the time of submission */ u32 free_slots; bool is_waiter_alive; + bool is_internal_cmd; }; struct bnxt_qplib_rcfw_sbuf { @@ -225,6 +226,7 @@ struct bnxt_qplib_rcfw { u32 cmdq_depth; atomic_t rcfw_intr_enabled; struct semaphore rcfw_inflight; + atomic_t timeout_send; }; struct bnxt_qplib_cmdqmsg { From bb8c93618fb0b8567d309f1aebc6df0cd31da1a2 Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Fri, 9 Jun 2023 04:01:50 -0700 Subject: [PATCH 51/71] RDMA/bnxt_re: consider timeout of destroy ah as success. If destroy_ah is timed out, it is likely to be destroyed by firmware but it is taking longer time due to temporary slowness in processing the rcfw command. In worst case, there might be AH resource leak in firmware. Sending timeout return value can dump warning message from ib_core which can be avoided if we map timeout of destroy_ah as success. Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1686308514-11996-14-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 2 ++ drivers/infiniband/hw/bnxt_re/ib_verbs.c | 16 ++++++++++++---- drivers/infiniband/hw/bnxt_re/qplib_sp.c | 8 +++++--- drivers/infiniband/hw/bnxt_re/qplib_sp.h | 4 ++-- 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 5a2baf49ecaa..f34fb87e3b77 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -181,6 +181,8 @@ struct bnxt_re_dev { #define BNXT_RE_ROCEV2_IPV4_PACKET 2 #define BNXT_RE_ROCEV2_IPV6_PACKET 3 +#define BNXT_RE_CHECK_RC(x) ((x) && ((x) != -ETIMEDOUT)) + static inline struct device *rdev_to_dev(struct bnxt_re_dev *rdev) { if (rdev) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index e86afecfbe46..053afc9f1256 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -613,12 +613,20 @@ int bnxt_re_destroy_ah(struct ib_ah *ib_ah, u32 flags) { struct bnxt_re_ah *ah = container_of(ib_ah, struct bnxt_re_ah, ib_ah); struct bnxt_re_dev *rdev = ah->rdev; + bool block = true; + int rc = 0; - bnxt_qplib_destroy_ah(&rdev->qplib_res, &ah->qplib_ah, - !(flags & RDMA_DESTROY_AH_SLEEPABLE)); + block = !(flags & RDMA_DESTROY_AH_SLEEPABLE); + rc = bnxt_qplib_destroy_ah(&rdev->qplib_res, &ah->qplib_ah, block); + if (BNXT_RE_CHECK_RC(rc)) { + if (rc == -ETIMEDOUT) + rc = 0; + else + goto fail; + } atomic_dec(&rdev->ah_count); - - return 0; +fail: + return rc; } static u8 bnxt_re_stack_to_dev_nw_type(enum rdma_network_type ntype) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index dbb0e4e9254d..910d17ddcf13 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -460,13 +460,14 @@ int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah 
*ah, return 0; } -void bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah, - bool block) +int bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah, + bool block) { struct bnxt_qplib_rcfw *rcfw = res->rcfw; struct creq_destroy_ah_resp resp = {}; struct bnxt_qplib_cmdqmsg msg = {}; struct cmdq_destroy_ah req = {}; + int rc; /* Clean up the AH table in the device */ bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, @@ -477,7 +478,8 @@ void bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah, bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), block); - bnxt_qplib_rcfw_send_message(rcfw, &msg); + rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); + return rc; } /* MRW */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index 5de874659cdf..4061616048e8 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -327,8 +327,8 @@ int bnxt_qplib_set_func_resources(struct bnxt_qplib_res *res, struct bnxt_qplib_ctx *ctx); int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah, bool block); -void bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah, - bool block); +int bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah, + bool block); int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw); int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw, From a00278521c9107c1edec0088f512a85316795692 Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Fri, 9 Jun 2023 04:01:51 -0700 Subject: [PATCH 52/71] RDMA/bnxt_re: cancel all control path command waiters upon error When an error is detected in FW, wake up all the waiters as the all of them need to be completed with timeout. Add the device error state also as a wait condition. 
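For illustration only (condensed from the hunks below), the two pieces are:

	/* error/detach path */
	set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags);
	wake_up_all(&rdev->rcfw.cmdq.waitq);

	/* waiter side: the error state becomes part of the wait condition */
	wait_event_timeout(cmdq->waitq,
			   !test_bit(cbit, cmdq->cmdq_bitmap) ||
			   test_bit(ERR_DEVICE_DETACHED, &cmdq->flags),
			   msecs_to_jiffies(RCFW_FW_STALL_TIMEOUT_SEC * 1000));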
Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1686308514-11996-15-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 1 + drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 51372de19e22..824115419436 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1495,6 +1495,7 @@ static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state) */ set_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags); set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags); + wake_up_all(&rdev->rcfw.cmdq.waitq); mutex_unlock(&bnxt_re_mutex); return 0; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 8dd82163ab7a..8b1b4132afc0 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -116,10 +116,10 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) return -ETIMEDOUT; wait_event_timeout(cmdq->waitq, - !test_bit(cbit, cmdq->cmdq_bitmap), + !test_bit(cbit, cmdq->cmdq_bitmap) || + test_bit(ERR_DEVICE_DETACHED, &cmdq->flags), msecs_to_jiffies(RCFW_FW_STALL_TIMEOUT_SEC * 1000)); - if (!test_bit(cbit, cmdq->cmdq_bitmap)) return 0; From f0c875ff6293965a30aaeb02a0d19b293e11bc2b Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Fri, 9 Jun 2023 04:01:52 -0700 Subject: [PATCH 53/71] RDMA/bnxt_re: use firmware provided max request timeout Firmware provides max request timeout value as part of hwrm_ver_get API. Driver gets the timeout from firmware and if that interface is not available then fall back to hardcoded timeout value. Also, Add a helper function to check the FW status. 
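For illustration only (condensed from the hunks below), the fallback reduces to:

	cctx->hwrm_cmd_max_timeout = le16_to_cpu(resp.max_req_timeout);
	if (!cctx->hwrm_cmd_max_timeout)
		cctx->hwrm_cmd_max_timeout = RCFW_FW_STALL_MAX_TIMEOUT;

	/* cached in the rcfw channel for quick reference in the slow path */
	rcfw->max_timeout = res->cctx->hwrm_cmd_max_timeout;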
Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1686308514-11996-16-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 8 +++ drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 59 ++++++++++++++++++---- drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 4 +- drivers/infiniband/hw/bnxt_re/qplib_res.h | 1 + 4 files changed, 60 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 824115419436..a2c7d3f21279 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1041,6 +1041,7 @@ static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev) struct bnxt_en_dev *en_dev = rdev->en_dev; struct hwrm_ver_get_output resp = {0}; struct hwrm_ver_get_input req = {0}; + struct bnxt_qplib_chip_ctx *cctx; struct bnxt_fw_msg fw_msg; int rc = 0; @@ -1058,11 +1059,18 @@ static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev) rc); return; } + + cctx = rdev->chip_ctx; rdev->qplib_ctx.hwrm_intf_ver = (u64)le16_to_cpu(resp.hwrm_intf_major) << 48 | (u64)le16_to_cpu(resp.hwrm_intf_minor) << 32 | (u64)le16_to_cpu(resp.hwrm_intf_build) << 16 | le16_to_cpu(resp.hwrm_intf_patch); + + cctx->hwrm_cmd_max_timeout = le16_to_cpu(resp.max_req_timeout); + + if (!cctx->hwrm_cmd_max_timeout) + cctx->hwrm_cmd_max_timeout = RCFW_FW_STALL_MAX_TIMEOUT; } static int bnxt_re_ib_init(struct bnxt_re_dev *rdev) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 8b1b4132afc0..99aa1ae558d3 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -89,6 +89,41 @@ static int bnxt_qplib_map_rc(u8 opcode) } } +/** + * bnxt_re_is_fw_stalled - Check firmware health + * @rcfw - rcfw channel instance of rdev + * @cookie - cookie to track the command + * @opcode - rcfw submitted for given opcode + * @cbit - bitmap entry of cookie + * + * If firmware has not responded any rcfw command within + * rcfw->max_timeout, consider firmware as stalled. + * + * Returns: + * 0 if firmware is responding + * -ENODEV if firmware is not responding + */ +static int bnxt_re_is_fw_stalled(struct bnxt_qplib_rcfw *rcfw, + u16 cookie, u8 opcode, u16 cbit) +{ + struct bnxt_qplib_cmdq_ctx *cmdq; + + cmdq = &rcfw->cmdq; + + if (time_after(jiffies, cmdq->last_seen + + (rcfw->max_timeout * HZ))) { + dev_warn_ratelimited(&rcfw->pdev->dev, + "%s: FW STALL Detected. 
cmdq[%#x]=%#x waited (%d > %d) msec active %d ", + __func__, cookie, opcode, + jiffies_to_msecs(jiffies - cmdq->last_seen), + rcfw->max_timeout * 1000, + test_bit(cbit, cmdq->cmdq_bitmap)); + return -ENODEV; + } + + return 0; +} + /** * __wait_for_resp - Don't hold the cpu context and wait for response * @rcfw - rcfw channel instance of rdev @@ -105,6 +140,7 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) { struct bnxt_qplib_cmdq_ctx *cmdq; u16 cbit; + int ret; cmdq = &rcfw->cmdq; cbit = cookie % rcfw->cmdq_depth; @@ -118,8 +154,8 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) wait_event_timeout(cmdq->waitq, !test_bit(cbit, cmdq->cmdq_bitmap) || test_bit(ERR_DEVICE_DETACHED, &cmdq->flags), - msecs_to_jiffies(RCFW_FW_STALL_TIMEOUT_SEC - * 1000)); + msecs_to_jiffies(rcfw->max_timeout * 1000)); + if (!test_bit(cbit, cmdq->cmdq_bitmap)) return 0; @@ -128,10 +164,9 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) if (!test_bit(cbit, cmdq->cmdq_bitmap)) return 0; - /* Firmware stall is detected */ - if (time_after(jiffies, cmdq->last_seen + - (RCFW_FW_STALL_TIMEOUT_SEC * HZ))) - return -ENODEV; + ret = bnxt_re_is_fw_stalled(rcfw, cookie, opcode, cbit); + if (ret) + return ret; } while (true); }; @@ -352,6 +387,7 @@ static int __poll_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, struct bnxt_qplib_cmdq_ctx *cmdq = &rcfw->cmdq; unsigned long issue_time; u16 cbit; + int ret; cbit = cookie % rcfw->cmdq_depth; issue_time = jiffies; @@ -368,11 +404,10 @@ static int __poll_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, if (!test_bit(cbit, cmdq->cmdq_bitmap)) return 0; if (jiffies_to_msecs(jiffies - issue_time) > - (RCFW_FW_STALL_TIMEOUT_SEC * 1000)) { - /* Firmware stall is detected */ - if (time_after(jiffies, cmdq->last_seen + - (RCFW_FW_STALL_TIMEOUT_SEC * HZ))) - return -ENODEV; + (rcfw->max_timeout * 1000)) { + ret = bnxt_re_is_fw_stalled(rcfw, cookie, opcode, cbit); + if (ret) + return ret; } } while (true); }; @@ -951,6 +986,8 @@ int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res, if (!rcfw->qp_tbl) goto fail; + rcfw->max_timeout = res->cctx->hwrm_cmd_max_timeout; + return 0; fail: diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 338bf6a9699f..b644dcc8fc22 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -51,7 +51,7 @@ #define RCFW_DBR_PCI_BAR_REGION 2 #define RCFW_DBR_BASE_PAGE_SHIFT 12 -#define RCFW_FW_STALL_TIMEOUT_SEC 40 +#define RCFW_FW_STALL_MAX_TIMEOUT 40 /* Cmdq contains a fix number of a 16-Byte slots */ struct bnxt_qplib_cmdqe { @@ -227,6 +227,8 @@ struct bnxt_qplib_rcfw { atomic_t rcfw_intr_enabled; struct semaphore rcfw_inflight; atomic_t timeout_send; + /* cached from chip cctx for quick reference in slow path */ + u16 max_timeout; }; struct bnxt_qplib_cmdqmsg { diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index 982e2c96dac2..77f0b84aa1b2 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -55,6 +55,7 @@ struct bnxt_qplib_chip_ctx { u8 chip_rev; u8 chip_metal; u16 hw_stats_size; + u16 hwrm_cmd_max_timeout; struct bnxt_qplib_drv_modes modes; }; From bcfee4ce3e0139ffa9c564e4ed3682e8b87f0a1d Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Fri, 9 Jun 2023 04:01:53 -0700 Subject: [PATCH 54/71] RDMA/bnxt_re: remove redundant cmdq_bitmap 
cmdq_bitmap is used to derive the next available index in the CMDQ. This is not required as the we can get the next index using the existing bnxt_qplib_crsqe array. Driver will use bnxt_qplib_crsqe array and flag is_in_used to derive valid entries. is_in_used is replacement of cmdq_bitmap. There is no change in the existing mechanism of the circular buffer used to get index. Added opcode field in bnxt_qplib_crsqe array so that it is easy to map opcode associated with pending rcfw command. Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1686308514-11996-17-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 79 +++++++++------------- drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 3 +- 2 files changed, 34 insertions(+), 48 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 99aa1ae558d3..3ae2f82ff4b4 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -95,6 +95,7 @@ static int bnxt_qplib_map_rc(u8 opcode) * @cookie - cookie to track the command * @opcode - rcfw submitted for given opcode * @cbit - bitmap entry of cookie + * @in_used - command is in used or freed * * If firmware has not responded any rcfw command within * rcfw->max_timeout, consider firmware as stalled. @@ -104,7 +105,7 @@ static int bnxt_qplib_map_rc(u8 opcode) * -ENODEV if firmware is not responding */ static int bnxt_re_is_fw_stalled(struct bnxt_qplib_rcfw *rcfw, - u16 cookie, u8 opcode, u16 cbit) + u16 cookie, u8 opcode, bool in_used) { struct bnxt_qplib_cmdq_ctx *cmdq; @@ -117,7 +118,7 @@ static int bnxt_re_is_fw_stalled(struct bnxt_qplib_rcfw *rcfw, __func__, cookie, opcode, jiffies_to_msecs(jiffies - cmdq->last_seen), rcfw->max_timeout * 1000, - test_bit(cbit, cmdq->cmdq_bitmap)); + in_used); return -ENODEV; } @@ -139,11 +140,11 @@ static int bnxt_re_is_fw_stalled(struct bnxt_qplib_rcfw *rcfw, static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) { struct bnxt_qplib_cmdq_ctx *cmdq; - u16 cbit; + struct bnxt_qplib_crsqe *crsqe; int ret; cmdq = &rcfw->cmdq; - cbit = cookie % rcfw->cmdq_depth; + crsqe = &rcfw->crsqe_tbl[cookie]; do { if (test_bit(ERR_DEVICE_DETACHED, &cmdq->flags)) @@ -152,19 +153,19 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) return -ETIMEDOUT; wait_event_timeout(cmdq->waitq, - !test_bit(cbit, cmdq->cmdq_bitmap) || + !crsqe->is_in_used || test_bit(ERR_DEVICE_DETACHED, &cmdq->flags), msecs_to_jiffies(rcfw->max_timeout * 1000)); - if (!test_bit(cbit, cmdq->cmdq_bitmap)) + if (!crsqe->is_in_used) return 0; bnxt_qplib_service_creq(&rcfw->creq.creq_tasklet); - if (!test_bit(cbit, cmdq->cmdq_bitmap)) + if (!crsqe->is_in_used) return 0; - ret = bnxt_re_is_fw_stalled(rcfw, cookie, opcode, cbit); + ret = bnxt_re_is_fw_stalled(rcfw, cookie, opcode, crsqe->is_in_used); if (ret) return ret; @@ -187,11 +188,11 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) { struct bnxt_qplib_cmdq_ctx *cmdq = &rcfw->cmdq; + struct bnxt_qplib_crsqe *crsqe; unsigned long issue_time = 0; - u16 cbit; - cbit = cookie % rcfw->cmdq_depth; issue_time = jiffies; + crsqe = &rcfw->crsqe_tbl[cookie]; do { if (test_bit(ERR_DEVICE_DETACHED, &cmdq->flags)) @@ -202,7 +203,7 @@ static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 
cookie, u8 opcode) udelay(1); bnxt_qplib_service_creq(&rcfw->creq.creq_tasklet); - if (!test_bit(cbit, cmdq->cmdq_bitmap)) + if (!crsqe->is_in_used) return 0; } while (time_before(jiffies, issue_time + (8 * HZ))); @@ -235,16 +236,13 @@ static void __send_message_no_waiter(struct bnxt_qplib_rcfw *rcfw, struct bnxt_qplib_crsqe *crsqe; struct bnxt_qplib_cmdqe *cmdqe; u32 sw_prod, cmdq_prod; - u16 cookie, cbit; + u16 cookie; u32 bsize; u8 *preq; cookie = cmdq->seq_num & RCFW_MAX_COOKIE_VALUE; - cbit = cookie % rcfw->cmdq_depth; - - set_bit(cbit, cmdq->cmdq_bitmap); __set_cmdq_base_cookie(msg->req, msg->req_sz, cpu_to_le16(cookie)); - crsqe = &rcfw->crsqe_tbl[cbit]; + crsqe = &rcfw->crsqe_tbl[cookie]; /* Set cmd_size in terms of 16B slots in req. */ bsize = bnxt_qplib_set_cmd_slots(msg->req); @@ -253,6 +251,7 @@ static void __send_message_no_waiter(struct bnxt_qplib_rcfw *rcfw, */ crsqe->is_internal_cmd = true; crsqe->is_waiter_alive = false; + crsqe->is_in_used = true; crsqe->req_size = __get_cmdq_base_cmd_size(msg->req, msg->req_sz); preq = (u8 *)msg->req; @@ -288,7 +287,8 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, u32 sw_prod, cmdq_prod; struct pci_dev *pdev; unsigned long flags; - u16 cookie, cbit; + u16 cookie; + u8 opcode; u8 *preq; cmdq = &rcfw->cmdq; @@ -302,10 +302,9 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, required_slots = bnxt_qplib_get_cmd_slots(msg->req); free_slots = HWQ_FREE_SLOTS(hwq); cookie = cmdq->seq_num & RCFW_MAX_COOKIE_VALUE; - cbit = cookie % rcfw->cmdq_depth; + crsqe = &rcfw->crsqe_tbl[cookie]; - if (required_slots >= free_slots || - test_bit(cbit, cmdq->cmdq_bitmap)) { + if (required_slots >= free_slots) { dev_info_ratelimited(&pdev->dev, "CMDQ is full req/free %d/%d!", required_slots, free_slots); @@ -314,15 +313,17 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, } if (msg->block) cookie |= RCFW_CMD_IS_BLOCKING; - set_bit(cbit, cmdq->cmdq_bitmap); __set_cmdq_base_cookie(msg->req, msg->req_sz, cpu_to_le16(cookie)); - crsqe = &rcfw->crsqe_tbl[cbit]; + bsize = bnxt_qplib_set_cmd_slots(msg->req); crsqe->free_slots = free_slots; crsqe->resp = (struct creq_qp_event *)msg->resp; crsqe->resp->cookie = cpu_to_le16(cookie); crsqe->is_internal_cmd = false; crsqe->is_waiter_alive = true; + crsqe->is_in_used = true; + crsqe->opcode = opcode; + crsqe->req_size = __get_cmdq_base_cmd_size(msg->req, msg->req_sz); if (__get_cmdq_base_resp_size(msg->req, msg->req_sz) && msg->sb) { struct bnxt_qplib_rcfw_sbuf *sbuf = msg->sb; @@ -385,12 +386,12 @@ static int __poll_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) { struct bnxt_qplib_cmdq_ctx *cmdq = &rcfw->cmdq; + struct bnxt_qplib_crsqe *crsqe; unsigned long issue_time; - u16 cbit; int ret; - cbit = cookie % rcfw->cmdq_depth; issue_time = jiffies; + crsqe = &rcfw->crsqe_tbl[cookie]; do { if (test_bit(ERR_DEVICE_DETACHED, &cmdq->flags)) @@ -401,11 +402,11 @@ static int __poll_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, usleep_range(1000, 1001); bnxt_qplib_service_creq(&rcfw->creq.creq_tasklet); - if (!test_bit(cbit, cmdq->cmdq_bitmap)) + if (!crsqe->is_in_used) return 0; if (jiffies_to_msecs(jiffies - issue_time) > (rcfw->max_timeout * 1000)) { - ret = bnxt_re_is_fw_stalled(rcfw, cookie, opcode, cbit); + ret = bnxt_re_is_fw_stalled(rcfw, cookie, opcode, crsqe->is_in_used); if (ret) return ret; } @@ -485,7 +486,7 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, struct creq_qp_event *evnt = (struct creq_qp_event *)msg->resp; struct bnxt_qplib_crsqe 
*crsqe; unsigned long flags; - u16 cookie, cbit; + u16 cookie; int rc = 0; u8 opcode; @@ -501,7 +502,6 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, cookie = le16_to_cpu(__get_cmdq_base_cookie(msg->req, msg->req_sz)) & RCFW_MAX_COOKIE_VALUE; - cbit = cookie % rcfw->cmdq_depth; if (msg->block) rc = __block_for_resp(rcfw, cookie, opcode); @@ -518,7 +518,7 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, if (rc) { spin_lock_irqsave(&rcfw->cmdq.hwq.lock, flags); - crsqe = &rcfw->crsqe_tbl[cbit]; + crsqe = &rcfw->crsqe_tbl[cookie]; crsqe->is_waiter_alive = false; if (rc == -ENODEV) set_bit(FIRMWARE_STALL_DETECTED, &rcfw->cmdq.flags); @@ -630,12 +630,11 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, struct bnxt_qplib_crsqe *crsqe; u32 qp_id, tbl_indx, req_size; struct bnxt_qplib_qp *qp; - u16 cbit, blocked = 0; + u16 cookie, blocked = 0; bool is_waiter_alive; struct pci_dev *pdev; unsigned long flags; u32 wait_cmds = 0; - u16 cookie; int rc = 0; pdev = rcfw->pdev; @@ -670,8 +669,8 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, cookie = le16_to_cpu(qp_event->cookie); blocked = cookie & RCFW_CMD_IS_BLOCKING; cookie &= RCFW_MAX_COOKIE_VALUE; - cbit = cookie % rcfw->cmdq_depth; - crsqe = &rcfw->crsqe_tbl[cbit]; + crsqe = &rcfw->crsqe_tbl[cookie]; + crsqe->is_in_used = false; if (WARN_ONCE(test_bit(FIRMWARE_STALL_DETECTED, &rcfw->cmdq.flags), @@ -683,9 +682,6 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, return rc; } - if (!test_and_clear_bit(cbit, rcfw->cmdq.cmdq_bitmap)) - dev_warn(&pdev->dev, - "CMD bit %d was not requested\n", cbit); if (crsqe->is_internal_cmd && !qp_event->status) atomic_dec(&rcfw->timeout_send); @@ -920,7 +916,6 @@ skip_ctx_setup: void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) { - bitmap_free(rcfw->cmdq.cmdq_bitmap); kfree(rcfw->qp_tbl); kfree(rcfw->crsqe_tbl); bnxt_qplib_free_hwq(rcfw->res, &rcfw->cmdq.hwq); @@ -975,10 +970,6 @@ int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res, if (!rcfw->crsqe_tbl) goto fail; - cmdq->cmdq_bitmap = bitmap_zalloc(rcfw->cmdq_depth, GFP_KERNEL); - if (!cmdq->cmdq_bitmap) - goto fail; - /* Allocate one extra to hold the QP1 entries */ rcfw->qp_tbl_size = qp_tbl_sz + 1; rcfw->qp_tbl = kcalloc(rcfw->qp_tbl_size, sizeof(struct bnxt_qplib_qp_node), @@ -1023,7 +1014,6 @@ void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) { struct bnxt_qplib_creq_ctx *creq; struct bnxt_qplib_cmdq_ctx *cmdq; - unsigned long indx; creq = &rcfw->creq; cmdq = &rcfw->cmdq; @@ -1033,11 +1023,6 @@ void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) iounmap(cmdq->cmdq_mbox.reg.bar_reg); iounmap(creq->creq_db.reg.bar_reg); - indx = find_first_bit(cmdq->cmdq_bitmap, rcfw->cmdq_depth); - if (indx != rcfw->cmdq_depth) - dev_err(&rcfw->pdev->dev, - "disabling RCFW with pending cmd-bit %lx\n", indx); - cmdq->cmdq_mbox.reg.bar_reg = NULL; creq->creq_db.reg.bar_reg = NULL; creq->aeq_handler = NULL; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index b644dcc8fc22..f46de07c20bc 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -152,8 +152,10 @@ struct bnxt_qplib_crsqe { u32 req_size; /* Free slots at the time of submission */ u32 free_slots; + u8 opcode; bool is_waiter_alive; bool is_internal_cmd; + bool is_in_used; }; struct bnxt_qplib_rcfw_sbuf { @@ -185,7 +187,6 @@ struct 
bnxt_qplib_cmdq_ctx { struct bnxt_qplib_cmdq_mbox cmdq_mbox; wait_queue_head_t waitq; unsigned long flags; - unsigned long *cmdq_bitmap; unsigned long last_seen; u32 seq_num; }; From 830f93f47068b1632cc127871fbf27e918efdf46 Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Fri, 9 Jun 2023 04:01:54 -0700 Subject: [PATCH 55/71] RDMA/bnxt_re: optimize the parameters passed to helper functions Avoid passing arguments like Opcode which can be retrieved from bnxt_qplib_crsqe structure. Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1686308514-11996-18-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 44 ++++++++++------------ 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 3ae2f82ff4b4..bb5aebafe162 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -93,9 +93,6 @@ static int bnxt_qplib_map_rc(u8 opcode) * bnxt_re_is_fw_stalled - Check firmware health * @rcfw - rcfw channel instance of rdev * @cookie - cookie to track the command - * @opcode - rcfw submitted for given opcode - * @cbit - bitmap entry of cookie - * @in_used - command is in used or freed * * If firmware has not responded any rcfw command within * rcfw->max_timeout, consider firmware as stalled. @@ -105,20 +102,22 @@ static int bnxt_qplib_map_rc(u8 opcode) * -ENODEV if firmware is not responding */ static int bnxt_re_is_fw_stalled(struct bnxt_qplib_rcfw *rcfw, - u16 cookie, u8 opcode, bool in_used) + u16 cookie) { struct bnxt_qplib_cmdq_ctx *cmdq; + struct bnxt_qplib_crsqe *crsqe; + crsqe = &rcfw->crsqe_tbl[cookie]; cmdq = &rcfw->cmdq; if (time_after(jiffies, cmdq->last_seen + (rcfw->max_timeout * HZ))) { dev_warn_ratelimited(&rcfw->pdev->dev, "%s: FW STALL Detected. cmdq[%#x]=%#x waited (%d > %d) msec active %d ", - __func__, cookie, opcode, + __func__, cookie, crsqe->opcode, jiffies_to_msecs(jiffies - cmdq->last_seen), rcfw->max_timeout * 1000, - in_used); + crsqe->is_in_used); return -ENODEV; } @@ -129,7 +128,6 @@ static int bnxt_re_is_fw_stalled(struct bnxt_qplib_rcfw *rcfw, * __wait_for_resp - Don't hold the cpu context and wait for response * @rcfw - rcfw channel instance of rdev * @cookie - cookie to track the command - * @opcode - rcfw submitted for given opcode * * Wait for command completion in sleepable context. * @@ -137,7 +135,7 @@ static int bnxt_re_is_fw_stalled(struct bnxt_qplib_rcfw *rcfw, * 0 if command is completed by firmware. * Non zero error code for rest of the case. 
*/ -static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) +static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) { struct bnxt_qplib_cmdq_ctx *cmdq; struct bnxt_qplib_crsqe *crsqe; @@ -148,7 +146,7 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) do { if (test_bit(ERR_DEVICE_DETACHED, &cmdq->flags)) - return bnxt_qplib_map_rc(opcode); + return bnxt_qplib_map_rc(crsqe->opcode); if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) return -ETIMEDOUT; @@ -165,7 +163,7 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) if (!crsqe->is_in_used) return 0; - ret = bnxt_re_is_fw_stalled(rcfw, cookie, opcode, crsqe->is_in_used); + ret = bnxt_re_is_fw_stalled(rcfw, cookie); if (ret) return ret; @@ -176,7 +174,6 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) * __block_for_resp - hold the cpu context and wait for response * @rcfw - rcfw channel instance of rdev * @cookie - cookie to track the command - * @opcode - rcfw submitted for given opcode * * This function will hold the cpu (non-sleepable context) and * wait for command completion. Maximum holding interval is 8 second. @@ -185,7 +182,7 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) * -ETIMEOUT if command is not completed in specific time interval. * 0 if command is completed by firmware. */ -static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) +static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) { struct bnxt_qplib_cmdq_ctx *cmdq = &rcfw->cmdq; struct bnxt_qplib_crsqe *crsqe; @@ -196,7 +193,7 @@ static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) do { if (test_bit(ERR_DEVICE_DETACHED, &cmdq->flags)) - return bnxt_qplib_map_rc(opcode); + return bnxt_qplib_map_rc(crsqe->opcode); if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) return -ETIMEDOUT; @@ -372,7 +369,6 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, * __poll_for_resp - self poll completion for rcfw command * @rcfw - rcfw channel instance of rdev * @cookie - cookie to track the command - * @opcode - rcfw submitted for given opcode * * It works same as __wait_for_resp except this function will * do self polling in sort interval since interrupt is disabled. @@ -382,8 +378,7 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, * -ETIMEOUT if command is not completed in specific time interval. * 0 if command is completed by firmware. 
*/ -static int __poll_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, - u8 opcode) +static int __poll_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) { struct bnxt_qplib_cmdq_ctx *cmdq = &rcfw->cmdq; struct bnxt_qplib_crsqe *crsqe; @@ -395,7 +390,7 @@ static int __poll_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, do { if (test_bit(ERR_DEVICE_DETACHED, &cmdq->flags)) - return bnxt_qplib_map_rc(opcode); + return bnxt_qplib_map_rc(crsqe->opcode); if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) return -ETIMEDOUT; @@ -406,7 +401,7 @@ static int __poll_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, return 0; if (jiffies_to_msecs(jiffies - issue_time) > (rcfw->max_timeout * 1000)) { - ret = bnxt_re_is_fw_stalled(rcfw, cookie, opcode, crsqe->is_in_used); + ret = bnxt_re_is_fw_stalled(rcfw, cookie); if (ret) return ret; } @@ -414,13 +409,12 @@ static int __poll_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, }; static int __send_message_basic_sanity(struct bnxt_qplib_rcfw *rcfw, - struct bnxt_qplib_cmdqmsg *msg) + struct bnxt_qplib_cmdqmsg *msg, + u8 opcode) { struct bnxt_qplib_cmdq_ctx *cmdq; - u32 opcode; cmdq = &rcfw->cmdq; - opcode = __get_cmdq_base_opcode(msg->req, msg->req_sz); /* Prevent posting if f/w is not in a state to process */ if (test_bit(ERR_DEVICE_DETACHED, &rcfw->cmdq.flags)) @@ -492,7 +486,7 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, opcode = __get_cmdq_base_opcode(msg->req, msg->req_sz); - rc = __send_message_basic_sanity(rcfw, msg); + rc = __send_message_basic_sanity(rcfw, msg, opcode); if (rc) return rc == -ENXIO ? bnxt_qplib_map_rc(opcode) : rc; @@ -504,11 +498,11 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, & RCFW_MAX_COOKIE_VALUE; if (msg->block) - rc = __block_for_resp(rcfw, cookie, opcode); + rc = __block_for_resp(rcfw, cookie); else if (atomic_read(&rcfw->rcfw_intr_enabled)) - rc = __wait_for_resp(rcfw, cookie, opcode); + rc = __wait_for_resp(rcfw, cookie); else - rc = __poll_for_resp(rcfw, cookie, opcode); + rc = __poll_for_resp(rcfw, cookie); if (rc) { /* timed out */ dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x timedout (%d)msec\n", From ee678e5dffc0b0d63b4a2ec6129063339435bfc5 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 13 Jun 2023 12:16:55 -0500 Subject: [PATCH 56/71] RDMA/rxe: Fixes mr access supported list A recent patch incorrectly did not include IB_ACCESS_RELAXED_ORDERING in the list of supported access flags for the rxe driver. The driver actually does nothing related to relaxed ordering but it causes no problems to include it as supported but with no effect. This change caused ib_send_bw and friends to not run correctly. The correct approach is for the driver to allow any of the optional access flags and otherwise ignore them. This patch adds IB_ACCESS_OPTIONAL to the list of rxe supported flags. 
Fixes: 02ed253770fb ("RDMA/rxe: Introduce rxe access supported flags") Link: https://lore.kernel.org/r/20230613171654.19334-1-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_verbs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index cb18b83b73c1..ccb9d19ffe8a 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -262,7 +262,8 @@ enum { | IB_ACCESS_MW_BIND | IB_ACCESS_ON_DEMAND | IB_ACCESS_FLUSH_GLOBAL - | IB_ACCESS_FLUSH_PERSISTENT, + | IB_ACCESS_FLUSH_PERSISTENT + | IB_ACCESS_OPTIONAL, RXE_ACCESS_SUPPORTED_QP = RXE_ACCESS_SUPPORTED_MR, RXE_ACCESS_SUPPORTED_MW = RXE_ACCESS_SUPPORTED_MR | IB_ZERO_BASED, From 350b6dd4b2f876f1aa0d45a422b17b8377517762 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Mon, 12 Jun 2023 11:22:45 -0500 Subject: [PATCH 57/71] RDMA/rxe: Simplify cq->notify code The flags parameter to the request notify verb is a bitmask. But, rxe driver treats cq->notify as an int. If someone ever set both the IB_CQ_SOLICITED and the IB_CQ_NEXT_COMP bits rxe_cq_post could fail to generate a completion event. This patch treats the notify flags as a bit mask consistently and can handle the above case correctly. Link: https://lore.kernel.org/r/20230612162244.20038-1-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_cq.c | 5 ++--- drivers/infiniband/sw/rxe/rxe_verbs.c | 4 +--- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index 20ff0c0c4605..31a25aaa44a0 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -115,10 +115,9 @@ int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited) spin_unlock_irqrestore(&cq->cq_lock, flags); - if ((cq->notify == IB_CQ_NEXT_COMP) || - (cq->notify == IB_CQ_SOLICITED && solicited)) { + if ((cq->notify & IB_CQ_NEXT_COMP) || + (cq->notify & IB_CQ_SOLICITED && solicited)) { cq->notify = 0; - cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); } diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index f6396333bcef..515f9ff72d18 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1181,9 +1181,7 @@ static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) unsigned long irq_flags; spin_lock_irqsave(&cq->cq_lock, irq_flags); - if (cq->notify != IB_CQ_NEXT_COMP) - cq->notify = flags & IB_CQ_SOLICITED_MASK; - + cq->notify |= flags & IB_CQ_SOLICITED_MASK; empty = queue_empty(cq->queue, QUEUE_TYPE_TO_ULP); if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && !empty) From 6735041fd8460a94ae367830ece8ef65f191227a Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 14 Jun 2023 09:43:28 +0800 Subject: [PATCH 58/71] RDMA/cma: Remove NULL check before dev_{put, hold} The call netdev_{put, hold} of dev_{put, hold} will check NULL, so there is no need to check before using dev_{put, hold}, remove it to silence the warning: ./drivers/infiniband/core/cma.c:4812:2-9: WARNING: NULL check before dev_{put, hold} functions is not needed. 
Link: https://lore.kernel.org/r/20230614014328.14007-1-yang.lee@linux.alibaba.com Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=5521 Signed-off-by: Yang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 93a1c48d0c32..5146ef2dbfd9 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4805,8 +4805,7 @@ static void cma_make_mc_event(int status, struct rdma_id_private *id_priv, event->param.ud.qkey = id_priv->qkey; out: - if (ndev) - dev_put(ndev); + dev_put(ndev); } static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) From 147394dbe1237ef1f2da528c238633215f7193e1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 13 Jun 2023 11:15:57 +0300 Subject: [PATCH 59/71] RDMA/bnxt_re: Initialize opcode while sending message Fix compilation warning: drivers/infiniband/hw/bnxt_re/qplib_rcfw.c:325:18: error: variable 'opcode' is uninitialized when used here [-Werror,-Wuninitialized] crsqe->opcode = opcode; ^~~~~~ drivers/infiniband/hw/bnxt_re/qplib_rcfw.c:291:11: note: initialize the variable 'opcode' to silence this warning u8 opcode; ^ = '\0' Fixes: bcfee4ce3e01 ("RDMA/bnxt_re: remove redundant cmdq_bitmap") Link: https://lore.kernel.org/r/6ad1e44be2b560986da6fdc6b68da606413e9026.1686644105.git.leonro@nvidia.com Acked-by: Selvin Xavier Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index bb5aebafe162..92b3a4fbd0b2 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -274,7 +274,7 @@ static void __send_message_no_waiter(struct bnxt_qplib_rcfw *rcfw, } static int __send_message(struct bnxt_qplib_rcfw *rcfw, - struct bnxt_qplib_cmdqmsg *msg) + struct bnxt_qplib_cmdqmsg *msg, u8 opcode) { u32 bsize, free_slots, required_slots; struct bnxt_qplib_cmdq_ctx *cmdq; @@ -285,7 +285,6 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct pci_dev *pdev; unsigned long flags; u16 cookie; - u8 opcode; u8 *preq; cmdq = &rcfw->cmdq; @@ -490,7 +489,7 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, if (rc) return rc == -ENXIO ? bnxt_qplib_map_rc(opcode) : rc; - rc = __send_message(rcfw, msg); + rc = __send_message(rcfw, msg, opcode); if (rc) return rc; From 24ce94782c4878fe25ff15d8c08088a6ca5810e1 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 13 Jun 2023 11:12:17 -0700 Subject: [PATCH 60/71] RDMA/bnxt_re: Use the common mmap helper functions Replace the mmap handling function with common code in IB core. Create rdma_user_mmap_entry for each mmap resource and add to the ib_core mmap list. Add mmap_free verb support. Also, use rdma_user_mmap_io while mapping Doorbell pages. 
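For context, the offset returned through the ABI response is consumed by the
user-space provider with a plain mmap() on the uverbs command fd. The sketch
below shows only that consumer side; it is not code from this patch and the
helper name is made up:

#include <stdint.h>
#include <sys/mman.h>

/* Map one doorbell page using the offset produced by
 * rdma_user_mmap_get_offset() on the kernel side. cmd_fd is the open
 * uverbs device file descriptor; page_size is the system page size.
 */
static void *map_db_page(int cmd_fd, uint64_t offset, size_t page_size)
{
	return mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED,
		    cmd_fd, (off_t)offset);
}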
Link: https://lore.kernel.org/r/1686679943-17117-2-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 125 ++++++++++++++++++---- drivers/infiniband/hw/bnxt_re/ib_verbs.h | 15 +++ drivers/infiniband/hw/bnxt_re/main.c | 1 + drivers/infiniband/hw/bnxt_re/qplib_res.c | 2 +- 4 files changed, 119 insertions(+), 24 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 053afc9f1256..136133099a3f 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -533,12 +533,55 @@ fail: return rc; } +static struct bnxt_re_user_mmap_entry* +bnxt_re_mmap_entry_insert(struct bnxt_re_ucontext *uctx, u64 mem_offset, + enum bnxt_re_mmap_flag mmap_flag, u64 *offset) +{ + struct bnxt_re_user_mmap_entry *entry; + int ret; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return NULL; + + entry->mem_offset = mem_offset; + entry->mmap_flag = mmap_flag; + + switch (mmap_flag) { + case BNXT_RE_MMAP_SH_PAGE: + ret = rdma_user_mmap_entry_insert_exact(&uctx->ib_uctx, + &entry->rdma_entry, PAGE_SIZE, 0); + break; + case BNXT_RE_MMAP_UC_DB: + ret = rdma_user_mmap_entry_insert(&uctx->ib_uctx, + &entry->rdma_entry, PAGE_SIZE); + break; + default: + ret = -EINVAL; + break; + } + + if (ret) { + kfree(entry); + return NULL; + } + if (offset) + *offset = rdma_user_mmap_get_offset(&entry->rdma_entry); + + return entry; +} + /* Protection Domains */ int bnxt_re_dealloc_pd(struct ib_pd *ib_pd, struct ib_udata *udata) { struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); struct bnxt_re_dev *rdev = pd->rdev; + if (udata) { + rdma_user_mmap_entry_remove(pd->pd_db_mmap); + pd->pd_db_mmap = NULL; + } + bnxt_re_destroy_fence_mr(pd); if (pd->qplib_pd.id) { @@ -557,7 +600,8 @@ int bnxt_re_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) struct bnxt_re_ucontext *ucntx = rdma_udata_to_drv_context( udata, struct bnxt_re_ucontext, ib_uctx); struct bnxt_re_pd *pd = container_of(ibpd, struct bnxt_re_pd, ib_pd); - int rc; + struct bnxt_re_user_mmap_entry *entry = NULL; + int rc = 0; pd->rdev = rdev; if (bnxt_qplib_alloc_pd(&rdev->qplib_res.pd_tbl, &pd->qplib_pd)) { @@ -567,7 +611,7 @@ int bnxt_re_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) } if (udata) { - struct bnxt_re_pd_resp resp; + struct bnxt_re_pd_resp resp = {}; if (!ucntx->dpi.dbr) { /* Allocate DPI in alloc_pd to avoid failing of @@ -584,12 +628,21 @@ int bnxt_re_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) resp.pdid = pd->qplib_pd.id; /* Still allow mapping this DBR to the new user PD. 
*/ resp.dpi = ucntx->dpi.dpi; - resp.dbr = (u64)ucntx->dpi.umdbr; - rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + entry = bnxt_re_mmap_entry_insert(ucntx, (u64)ucntx->dpi.umdbr, + BNXT_RE_MMAP_UC_DB, &resp.dbr); + + if (!entry) { + rc = -ENOMEM; + goto dbfail; + } + + pd->pd_db_mmap = &entry->rdma_entry; + + rc = ib_copy_to_udata(udata, &resp, min(sizeof(resp), udata->outlen)); if (rc) { - ibdev_err(&rdev->ibdev, - "Failed to copy user response\n"); + rdma_user_mmap_entry_remove(pd->pd_db_mmap); + rc = -EFAULT; goto dbfail; } } @@ -3964,6 +4017,7 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) container_of(ctx, struct bnxt_re_ucontext, ib_uctx); struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_re_user_mmap_entry *entry; struct bnxt_re_uctx_resp resp = {}; u32 chip_met_rev_num = 0; int rc; @@ -4002,6 +4056,13 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) resp.comp_mask |= BNXT_RE_UCNTX_CMASK_HAVE_MODE; resp.mode = rdev->chip_ctx->modes.wqe_mode; + entry = bnxt_re_mmap_entry_insert(uctx, 0, BNXT_RE_MMAP_SH_PAGE, NULL); + if (!entry) { + rc = -ENOMEM; + goto cfail; + } + uctx->shpage_mmap = &entry->rdma_entry; + rc = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp))); if (rc) { ibdev_err(ibdev, "Failed to copy user context"); @@ -4025,6 +4086,8 @@ void bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx) struct bnxt_re_dev *rdev = uctx->rdev; + rdma_user_mmap_entry_remove(uctx->shpage_mmap); + uctx->shpage_mmap = NULL; if (uctx->shpg) free_page((unsigned long)uctx->shpg); @@ -4044,27 +4107,43 @@ int bnxt_re_mmap(struct ib_ucontext *ib_uctx, struct vm_area_struct *vma) struct bnxt_re_ucontext *uctx = container_of(ib_uctx, struct bnxt_re_ucontext, ib_uctx); - struct bnxt_re_dev *rdev = uctx->rdev; + struct bnxt_re_user_mmap_entry *bnxt_entry; + struct rdma_user_mmap_entry *rdma_entry; + int ret = 0; u64 pfn; - if (vma->vm_end - vma->vm_start != PAGE_SIZE) + rdma_entry = rdma_user_mmap_entry_get(&uctx->ib_uctx, vma); + if (!rdma_entry) return -EINVAL; - if (vma->vm_pgoff) { - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, - PAGE_SIZE, vma->vm_page_prot)) { - ibdev_err(&rdev->ibdev, "Failed to map DPI"); - return -EAGAIN; - } - } else { - pfn = virt_to_phys(uctx->shpg) >> PAGE_SHIFT; - if (remap_pfn_range(vma, vma->vm_start, - pfn, PAGE_SIZE, vma->vm_page_prot)) { - ibdev_err(&rdev->ibdev, "Failed to map shared page"); - return -EAGAIN; - } + bnxt_entry = container_of(rdma_entry, struct bnxt_re_user_mmap_entry, + rdma_entry); + + switch (bnxt_entry->mmap_flag) { + case BNXT_RE_MMAP_UC_DB: + pfn = bnxt_entry->mem_offset >> PAGE_SHIFT; + ret = rdma_user_mmap_io(ib_uctx, vma, pfn, PAGE_SIZE, + pgprot_noncached(vma->vm_page_prot), + rdma_entry); + break; + case BNXT_RE_MMAP_SH_PAGE: + ret = vm_insert_page(vma, vma->vm_start, virt_to_page(uctx->shpg)); + break; + default: + ret = -EINVAL; + break; } - return 0; + rdma_user_mmap_entry_put(rdma_entry); + return ret; +} + +void bnxt_re_mmap_free(struct rdma_user_mmap_entry *rdma_entry) +{ + struct bnxt_re_user_mmap_entry *bnxt_entry; + + bnxt_entry = container_of(rdma_entry, struct bnxt_re_user_mmap_entry, + rdma_entry); + + kfree(bnxt_entry); } diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index 31f7e34040f7..dcd31aefad4c 100644 --- 
a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -60,6 +60,7 @@ struct bnxt_re_pd { struct bnxt_re_dev *rdev; struct bnxt_qplib_pd qplib_pd; struct bnxt_re_fence_data fence; + struct rdma_user_mmap_entry *pd_db_mmap; }; struct bnxt_re_ah { @@ -136,6 +137,18 @@ struct bnxt_re_ucontext { struct bnxt_qplib_dpi dpi; void *shpg; spinlock_t sh_lock; /* protect shpg */ + struct rdma_user_mmap_entry *shpage_mmap; +}; + +enum bnxt_re_mmap_flag { + BNXT_RE_MMAP_SH_PAGE, + BNXT_RE_MMAP_UC_DB, +}; + +struct bnxt_re_user_mmap_entry { + struct rdma_user_mmap_entry rdma_entry; + u64 mem_offset; + u8 mmap_flag; }; static inline u16 bnxt_re_get_swqe_size(int nsge) @@ -213,6 +226,8 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata); void bnxt_re_dealloc_ucontext(struct ib_ucontext *context); int bnxt_re_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); +void bnxt_re_mmap_free(struct rdma_user_mmap_entry *rdma_entry); + unsigned long bnxt_re_lock_cqs(struct bnxt_re_qp *qp); void bnxt_re_unlock_cqs(struct bnxt_re_qp *qp, unsigned long flags); diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index a2c7d3f21279..acef429ef78a 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -545,6 +545,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = { .get_port_immutable = bnxt_re_get_port_immutable, .map_mr_sg = bnxt_re_map_mr_sg, .mmap = bnxt_re_mmap, + .mmap_free = bnxt_re_mmap_free, .modify_qp = bnxt_re_modify_qp, .modify_srq = bnxt_re_modify_srq, .poll_cq = bnxt_re_poll_cq, diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index 126d4f26f75a..920ab8704c8b 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -813,7 +813,7 @@ static int bnxt_qplib_alloc_dpi_tbl(struct bnxt_qplib_res *res, return 0; unmap_io: - pci_iounmap(res->pdev, dpit->dbr_bar_reg_iomem); + iounmap(dpit->dbr_bar_reg_iomem); dpit->dbr_bar_reg_iomem = NULL; return -ENOMEM; } From 390bf429cc6ce6844f834b1d9ddfbc1125aff1fc Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 13 Jun 2023 11:12:18 -0700 Subject: [PATCH 61/71] RDMA/bnxt_re: Add disassociate ucontext support Add driver disassociation support. Driver uses the APIs rdma_user_mmap_io api while mapping the IO pages to user space. Add empty stub for disassociate ucontext. 
Link: https://lore.kernel.org/r/1686679943-17117-3-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index acef429ef78a..406b1001cfb3 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -472,6 +472,10 @@ static int bnxt_re_net_stats_ctx_alloc(struct bnxt_re_dev *rdev, return rc; } +static void bnxt_re_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ +} + /* Device */ static struct bnxt_re_dev *bnxt_re_from_netdev(struct net_device *netdev) @@ -538,6 +542,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = { .destroy_qp = bnxt_re_destroy_qp, .destroy_srq = bnxt_re_destroy_srq, .device_group = &bnxt_re_dev_attr_group, + .disassociate_ucontext = bnxt_re_disassociate_ucontext, .get_dev_fw_str = bnxt_re_query_fw_str, .get_dma_mr = bnxt_re_get_dma_mr, .get_hw_stats = bnxt_re_ib_get_hw_stats, From 7d3115eba3e3eef6e0cbe65f05285c5ab28360d7 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 13 Jun 2023 11:12:19 -0700 Subject: [PATCH 62/71] RDMA/bnxt_re: Optimize the bnxt_re_init_hwrm_hdr usage As of now bnxt_re_init_hwrm_hdr is taking only the opcode from the caller. compl_ring and target_id field is always -1. These fields might be changed when newer features are added. For now, removing these parameters as they are hard coded. Also, remove the rdev field which is not used. Also, initialize the structure bnxt_fw_msg during declaration itself. Link: https://lore.kernel.org/r/1686679943-17117-4-git-send-email-selvin.xavier@broadcom.com Suggested-by: Leon Romanovsky Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/main.c | 48 +++++++++++----------------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 406b1001cfb3..1b16c4289147 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -321,12 +321,11 @@ static int bnxt_re_register_netdev(struct bnxt_re_dev *rdev) return rc; } -static void bnxt_re_init_hwrm_hdr(struct bnxt_re_dev *rdev, struct input *hdr, - u16 opcd, u16 crid, u16 trid) +static void bnxt_re_init_hwrm_hdr(struct input *hdr, u16 opcd) { hdr->req_type = cpu_to_le16(opcd); - hdr->cmpl_ring = cpu_to_le16(crid); - hdr->target_id = cpu_to_le16(trid); + hdr->cmpl_ring = cpu_to_le16(-1); + hdr->target_id = cpu_to_le16(-1); } static void bnxt_re_fill_fw_msg(struct bnxt_fw_msg *fw_msg, void *msg, @@ -344,9 +343,9 @@ static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id, int type) { struct bnxt_en_dev *en_dev; - struct hwrm_ring_free_input req = {0}; + struct hwrm_ring_free_input req = {}; struct hwrm_ring_free_output resp; - struct bnxt_fw_msg fw_msg; + struct bnxt_fw_msg fw_msg = {}; int rc = -EINVAL; if (!rdev) @@ -360,9 +359,7 @@ static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, if (test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags)) return 0; - memset(&fw_msg, 0, sizeof(fw_msg)); - - bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_RING_FREE, -1, -1); + bnxt_re_init_hwrm_hdr((void *)&req, HWRM_RING_FREE); req.ring_type = type; req.ring_id = cpu_to_le16(fw_ring_id); bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, @@ -379,16 +376,15 @@ static int bnxt_re_net_ring_alloc(struct 
bnxt_re_dev *rdev, u16 *fw_ring_id) { struct bnxt_en_dev *en_dev = rdev->en_dev; - struct hwrm_ring_alloc_input req = {0}; + struct hwrm_ring_alloc_input req = {}; struct hwrm_ring_alloc_output resp; - struct bnxt_fw_msg fw_msg; + struct bnxt_fw_msg fw_msg = {}; int rc = -EINVAL; if (!en_dev) return rc; - memset(&fw_msg, 0, sizeof(fw_msg)); - bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_RING_ALLOC, -1, -1); + bnxt_re_init_hwrm_hdr((void *)&req, HWRM_RING_ALLOC); req.enables = 0; req.page_tbl_addr = cpu_to_le64(ring_attr->dma_arr[0]); if (ring_attr->pages > 1) { @@ -417,7 +413,7 @@ static int bnxt_re_net_stats_ctx_free(struct bnxt_re_dev *rdev, struct bnxt_en_dev *en_dev = rdev->en_dev; struct hwrm_stat_ctx_free_input req = {}; struct hwrm_stat_ctx_free_output resp = {}; - struct bnxt_fw_msg fw_msg; + struct bnxt_fw_msg fw_msg = {}; int rc = -EINVAL; if (!en_dev) @@ -426,9 +422,7 @@ static int bnxt_re_net_stats_ctx_free(struct bnxt_re_dev *rdev, if (test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags)) return 0; - memset(&fw_msg, 0, sizeof(fw_msg)); - - bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_STAT_CTX_FREE, -1, -1); + bnxt_re_init_hwrm_hdr((void *)&req, HWRM_STAT_CTX_FREE); req.stat_ctx_id = cpu_to_le32(fw_stats_ctx_id); bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); @@ -445,10 +439,10 @@ static int bnxt_re_net_stats_ctx_alloc(struct bnxt_re_dev *rdev, u32 *fw_stats_ctx_id) { struct bnxt_qplib_chip_ctx *chip_ctx = rdev->chip_ctx; - struct hwrm_stat_ctx_alloc_output resp = {0}; - struct hwrm_stat_ctx_alloc_input req = {0}; + struct hwrm_stat_ctx_alloc_output resp = {}; + struct hwrm_stat_ctx_alloc_input req = {}; struct bnxt_en_dev *en_dev = rdev->en_dev; - struct bnxt_fw_msg fw_msg; + struct bnxt_fw_msg fw_msg = {}; int rc = -EINVAL; *fw_stats_ctx_id = INVALID_STATS_CTX_ID; @@ -456,9 +450,7 @@ static int bnxt_re_net_stats_ctx_alloc(struct bnxt_re_dev *rdev, if (!en_dev) return rc; - memset(&fw_msg, 0, sizeof(fw_msg)); - - bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_STAT_CTX_ALLOC, -1, -1); + bnxt_re_init_hwrm_hdr((void *)&req, HWRM_STAT_CTX_ALLOC); req.update_period_ms = cpu_to_le32(1000); req.stats_dma_addr = cpu_to_le64(dma_map); req.stats_dma_length = cpu_to_le16(chip_ctx->hw_stats_size); @@ -1045,15 +1037,13 @@ static int bnxt_re_setup_qos(struct bnxt_re_dev *rdev) static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev) { struct bnxt_en_dev *en_dev = rdev->en_dev; - struct hwrm_ver_get_output resp = {0}; - struct hwrm_ver_get_input req = {0}; + struct hwrm_ver_get_output resp = {}; + struct hwrm_ver_get_input req = {}; struct bnxt_qplib_chip_ctx *cctx; - struct bnxt_fw_msg fw_msg; + struct bnxt_fw_msg fw_msg = {}; int rc = 0; - memset(&fw_msg, 0, sizeof(fw_msg)); - bnxt_re_init_hwrm_hdr(rdev, (void *)&req, - HWRM_VER_GET, -1, -1); + bnxt_re_init_hwrm_hdr((void *)&req, HWRM_VER_GET); req.hwrm_intf_maj = HWRM_VERSION_MAJOR; req.hwrm_intf_min = HWRM_VERSION_MINOR; req.hwrm_intf_upd = HWRM_VERSION_UPDATE; From ba75fe7b500e71aff8bc3b7096c4ce1dcc649eb3 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 13 Jun 2023 11:12:20 -0700 Subject: [PATCH 63/71] RDMA/bnxt_re: Query function capabilities from firmware Query Function capabilities to enable advanced features. 
Link: https://lore.kernel.org/r/1686679943-17117-5-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/main.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 1b16c4289147..247f724cd347 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -83,6 +83,7 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier, unsigned long event, void *ptr); static struct bnxt_re_dev *bnxt_re_from_netdev(struct net_device *netdev); static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev); +static int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev); static void bnxt_re_set_drv_mode(struct bnxt_re_dev *rdev, u8 mode) { @@ -91,6 +92,9 @@ static void bnxt_re_set_drv_mode(struct bnxt_re_dev *rdev, u8 mode) cctx = rdev->chip_ctx; cctx->modes.wqe_mode = bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx) ? mode : BNXT_QPLIB_WQE_MODE_STATIC; + if (bnxt_re_hwrm_qcaps(rdev)) + dev_err(rdev_to_dev(rdev), + "Failed to query hwrm qcaps\n"); } static void bnxt_re_destroy_chip_ctx(struct bnxt_re_dev *rdev) @@ -339,6 +343,23 @@ static void bnxt_re_fill_fw_msg(struct bnxt_fw_msg *fw_msg, void *msg, fw_msg->timeout = timeout; } +/* Query function capabilities using common hwrm */ +int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev) +{ + struct bnxt_en_dev *en_dev = rdev->en_dev; + struct hwrm_func_qcaps_output resp = {}; + struct hwrm_func_qcaps_input req = {}; + struct bnxt_qplib_chip_ctx *cctx; + struct bnxt_fw_msg fw_msg = {}; + + cctx = rdev->chip_ctx; + bnxt_re_init_hwrm_hdr((void *)&req, HWRM_FUNC_QCAPS); + req.fid = cpu_to_le16(0xffff); + bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + return bnxt_send_msg(en_dev, &fw_msg); +} + static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id, int type) { From 3fe9882fbb50eeb724504df5979e9140f8842f76 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 13 Jun 2023 11:12:21 -0700 Subject: [PATCH 64/71] RDMA/bnxt_re: Move the interface version to chip context structure FW interface version check is required for multiple features. Moving the interface version to chip context structure. 
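As a hedged illustration of why the field belongs in the chip context (the
threshold constant below is hypothetical; a real one used later in this
series is HWRM_VERSION_DEV_ATTR_MAX_DPI), a feature gate then reduces to a
single compare wherever the chip context is reachable:

/* Sketch only: the version is packed as major<<48 | minor<<32 |
 * build<<16 | patch by bnxt_re_query_hwrm_intf_version().
 */
static bool bnxt_re_fw_has_feature_x(struct bnxt_qplib_chip_ctx *cctx)
{
	return cctx->hwrm_intf_ver >= HWRM_VERSION_FEATURE_X_MIN; /* hypothetical */
}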
Link: https://lore.kernel.org/r/1686679943-17117-6-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/main.c | 2 +- drivers/infiniband/hw/bnxt_re/qplib_res.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 247f724cd347..f8b4265c99ac 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1078,7 +1078,7 @@ static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev) } cctx = rdev->chip_ctx; - rdev->qplib_ctx.hwrm_intf_ver = + cctx->hwrm_intf_ver = (u64)le16_to_cpu(resp.hwrm_intf_major) << 48 | (u64)le16_to_cpu(resp.hwrm_intf_minor) << 32 | (u64)le16_to_cpu(resp.hwrm_intf_build) << 16 | diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index 77f0b84aa1b2..070451ac3dab 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -57,6 +57,7 @@ struct bnxt_qplib_chip_ctx { u16 hw_stats_size; u16 hwrm_cmd_max_timeout; struct bnxt_qplib_drv_modes modes; + u64 hwrm_intf_ver; }; #define PTR_CNT_PER_PG (PAGE_SIZE / sizeof(void *)) @@ -242,7 +243,6 @@ struct bnxt_qplib_ctx { struct bnxt_qplib_tqm_ctx tqm_ctx; struct bnxt_qplib_stats stats; struct bnxt_qplib_vf_res vf_res; - u64 hwrm_intf_ver; }; struct bnxt_qplib_res { From 0ac20faf5d837b59fb4c041ea320932ed47fd67f Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 13 Jun 2023 11:12:22 -0700 Subject: [PATCH 65/71] RDMA/bnxt_re: Reorg the bar mapping Reorganize the code for allocation and mapping of Doorbell pages. Implements new HW command to get the BAR length used by L2 driver. These changes are used by the future patch which maps the WC Doorbell pages. Also, introduced a new lock dpi_tbl_lock for synchronize the DB page allocation from users. Link: https://lore.kernel.org/r/1686679943-17117-7-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 7 +- drivers/infiniband/hw/bnxt_re/main.c | 71 ++++++++- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 4 +- drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 2 + drivers/infiniband/hw/bnxt_re/qplib_res.c | 176 +++++++++++++-------- drivers/infiniband/hw/bnxt_re/qplib_res.h | 28 +++- drivers/infiniband/hw/bnxt_re/qplib_sp.c | 3 + drivers/infiniband/hw/bnxt_re/qplib_sp.h | 1 + 8 files changed, 214 insertions(+), 78 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 136133099a3f..8e3b45d9e656 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -618,8 +618,8 @@ int bnxt_re_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) * ibv_devinfo and family of application when DPIs * are depleted. 
*/ - if (bnxt_qplib_alloc_dpi(&rdev->qplib_res.dpi_tbl, - &ucntx->dpi, ucntx)) { + if (bnxt_qplib_alloc_dpi(&rdev->qplib_res, + &ucntx->dpi, ucntx, BNXT_QPLIB_DPI_TYPE_UC)) { rc = -ENOMEM; goto dbfail; } @@ -4095,8 +4095,7 @@ void bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx) /* Free DPI only if this is the first PD allocated by the * application and mark the context dpi as NULL */ - bnxt_qplib_dealloc_dpi(&rdev->qplib_res, - &rdev->qplib_res.dpi_tbl, &uctx->dpi); + bnxt_qplib_dealloc_dpi(&rdev->qplib_res, &uctx->dpi); uctx->dpi.dbr = NULL; } } diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index f8b4265c99ac..0c681134dd57 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -85,6 +85,40 @@ static struct bnxt_re_dev *bnxt_re_from_netdev(struct net_device *netdev); static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev); static int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev); +static int bnxt_re_hwrm_qcfg(struct bnxt_re_dev *rdev, u32 *db_len, + u32 *offset); +static void bnxt_re_set_db_offset(struct bnxt_re_dev *rdev) +{ + struct bnxt_qplib_chip_ctx *cctx; + struct bnxt_en_dev *en_dev; + struct bnxt_qplib_res *res; + u32 l2db_len = 0; + u32 offset = 0; + u32 barlen; + int rc; + + res = &rdev->qplib_res; + en_dev = rdev->en_dev; + cctx = rdev->chip_ctx; + + /* Issue qcfg */ + rc = bnxt_re_hwrm_qcfg(rdev, &l2db_len, &offset); + if (rc) + dev_info(rdev_to_dev(rdev), + "Couldn't get DB bar size, Low latency framework is disabled\n"); + /* set register offsets for both UC and WC */ + res->dpi_tbl.ucreg.offset = res->is_vf ? BNXT_QPLIB_DBR_VF_DB_OFFSET : + BNXT_QPLIB_DBR_PF_DB_OFFSET; + res->dpi_tbl.wcreg.offset = res->dpi_tbl.ucreg.offset; + + /* If WC mapping is disabled by L2 driver then en_dev->l2_db_size + * is equal to the DB-Bar actual size. This indicates that L2 + * is mapping entire bar as UC-. RoCE driver can't enable WC mapping + * in such cases and DB-push will be disabled. 
+ */ + barlen = pci_resource_len(res->pdev, RCFW_DBR_PCI_BAR_REGION); +} + static void bnxt_re_set_drv_mode(struct bnxt_re_dev *rdev, u8 mode) { struct bnxt_qplib_chip_ctx *cctx; @@ -116,6 +150,7 @@ static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev, u8 wqe_mode) { struct bnxt_qplib_chip_ctx *chip_ctx; struct bnxt_en_dev *en_dev; + int rc; en_dev = rdev->en_dev; @@ -134,6 +169,12 @@ static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev, u8 wqe_mode) rdev->qplib_res.is_vf = BNXT_EN_VF(en_dev); bnxt_re_set_drv_mode(rdev, wqe_mode); + + bnxt_re_set_db_offset(rdev); + rc = bnxt_qplib_map_db_bar(&rdev->qplib_res); + if (rc) + return rc; + if (bnxt_qplib_determine_atomics(en_dev->pdev)) ibdev_info(&rdev->ibdev, "platform doesn't support global atomics."); @@ -343,6 +384,30 @@ static void bnxt_re_fill_fw_msg(struct bnxt_fw_msg *fw_msg, void *msg, fw_msg->timeout = timeout; } +/* Query device config using common hwrm */ +static int bnxt_re_hwrm_qcfg(struct bnxt_re_dev *rdev, u32 *db_len, + u32 *offset) +{ + struct bnxt_en_dev *en_dev = rdev->en_dev; + struct hwrm_func_qcfg_output resp = {0}; + struct hwrm_func_qcfg_input req = {0}; + struct bnxt_fw_msg fw_msg; + int rc; + + memset(&fw_msg, 0, sizeof(fw_msg)); + bnxt_re_init_hwrm_hdr(rdev, (void *)&req, + HWRM_FUNC_QCFG, -1, -1); + req.fid = cpu_to_le16(0xffff); + bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + rc = bnxt_send_msg(en_dev, &fw_msg); + if (!rc) { + *db_len = PAGE_ALIGN(le16_to_cpu(resp.l2_doorbell_bar_size_kb) * 1024); + *offset = PAGE_ALIGN(le16_to_cpu(resp.legacy_l2_db_size_kb) * 1024); + } + return rc; +} + /* Query function capabilities using common hwrm */ int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev) { @@ -847,7 +912,6 @@ static void bnxt_re_free_res(struct bnxt_re_dev *rdev) if (rdev->qplib_res.dpi_tbl.max) { bnxt_qplib_dealloc_dpi(&rdev->qplib_res, - &rdev->qplib_res.dpi_tbl, &rdev->dpi_privileged); } if (rdev->qplib_res.rcfw) { @@ -875,9 +939,9 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) if (rc) goto fail; - rc = bnxt_qplib_alloc_dpi(&rdev->qplib_res.dpi_tbl, + rc = bnxt_qplib_alloc_dpi(&rdev->qplib_res, &rdev->dpi_privileged, - rdev); + rdev, BNXT_QPLIB_DPI_TYPE_KERNEL); if (rc) goto dealloc_res; @@ -917,7 +981,6 @@ free_nq: bnxt_qplib_free_nq(&rdev->nq[i]); } bnxt_qplib_dealloc_dpi(&rdev->qplib_res, - &rdev->qplib_res.dpi_tbl, &rdev->dpi_privileged); dealloc_res: bnxt_qplib_free_res(&rdev->qplib_res); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index d48a26e89b10..d5d418a8b003 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -668,7 +668,7 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, srq->dbinfo.xid = srq->id; srq->dbinfo.db = srq->dpi->dbr; srq->dbinfo.max_slot = 1; - srq->dbinfo.priv_db = res->dpi_tbl.dbr_bar_reg_iomem; + srq->dbinfo.priv_db = res->dpi_tbl.priv_db; if (srq->threshold) bnxt_qplib_armen_db(&srq->dbinfo, DBC_DBC_TYPE_SRQ_ARMENA); srq->arm_req = false; @@ -2104,7 +2104,7 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) cq->dbinfo.hwq = &cq->hwq; cq->dbinfo.xid = cq->id; cq->dbinfo.db = cq->dpi->dbr; - cq->dbinfo.priv_db = res->dpi_tbl.dbr_bar_reg_iomem; + cq->dbinfo.priv_db = res->dpi_tbl.priv_db; bnxt_qplib_armen_db(&cq->dbinfo, DBC_DBC_TYPE_CQ_ARMENA); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 
f46de07c20bc..7b31bee3e000 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -130,6 +130,8 @@ static inline u32 bnxt_qplib_set_cmd_slots(struct cmdq_base *req) #define RCFW_MAX_COOKIE_VALUE (BNXT_QPLIB_CMDQE_MAX_CNT - 1) #define RCFW_CMD_IS_BLOCKING 0x8000 +#define HWRM_VERSION_DEV_ATTR_MAX_DPI 0x1000A0000000DULL + /* Crsq buf is 1024-Byte */ struct bnxt_qplib_crsbe { u8 data[1024]; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index 920ab8704c8b..e1cbe594e56b 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -704,44 +704,73 @@ static int bnxt_qplib_alloc_pd_tbl(struct bnxt_qplib_res *res, } /* DPIs */ -int bnxt_qplib_alloc_dpi(struct bnxt_qplib_dpi_tbl *dpit, - struct bnxt_qplib_dpi *dpi, - void *app) +int bnxt_qplib_alloc_dpi(struct bnxt_qplib_res *res, + struct bnxt_qplib_dpi *dpi, + void *app, u8 type) { + struct bnxt_qplib_dpi_tbl *dpit = &res->dpi_tbl; + struct bnxt_qplib_reg_desc *reg; u32 bit_num; + u64 umaddr; + + reg = &dpit->wcreg; + mutex_lock(&res->dpi_tbl_lock); bit_num = find_first_bit(dpit->tbl, dpit->max); - if (bit_num == dpit->max) + if (bit_num == dpit->max) { + mutex_unlock(&res->dpi_tbl_lock); return -ENOMEM; + } /* Found unused DPI */ clear_bit(bit_num, dpit->tbl); dpit->app_tbl[bit_num] = app; - dpi->dpi = bit_num; - dpi->dbr = dpit->dbr_bar_reg_iomem + (bit_num * PAGE_SIZE); - dpi->umdbr = dpit->unmapped_dbr + (bit_num * PAGE_SIZE); + dpi->bit = bit_num; + dpi->dpi = bit_num + (reg->offset - dpit->ucreg.offset) / PAGE_SIZE; + umaddr = reg->bar_base + reg->offset + bit_num * PAGE_SIZE; + dpi->umdbr = umaddr; + + switch (type) { + case BNXT_QPLIB_DPI_TYPE_KERNEL: + /* priviledged dbr was already mapped just initialize it. */ + dpi->umdbr = dpit->ucreg.bar_base + + dpit->ucreg.offset + bit_num * PAGE_SIZE; + dpi->dbr = dpit->priv_db; + dpi->dpi = dpi->bit; + break; + default: + dpi->dbr = ioremap(umaddr, PAGE_SIZE); + break; + } + + dpi->type = type; + mutex_unlock(&res->dpi_tbl_lock); return 0; + } int bnxt_qplib_dealloc_dpi(struct bnxt_qplib_res *res, - struct bnxt_qplib_dpi_tbl *dpit, - struct bnxt_qplib_dpi *dpi) + struct bnxt_qplib_dpi *dpi) { - if (dpi->dpi >= dpit->max) { - dev_warn(&res->pdev->dev, "Invalid DPI? dpi = %d\n", dpi->dpi); - return -EINVAL; - } - if (test_and_set_bit(dpi->dpi, dpit->tbl)) { - dev_warn(&res->pdev->dev, "Freeing an unused DPI? dpi = %d\n", - dpi->dpi); + struct bnxt_qplib_dpi_tbl *dpit = &res->dpi_tbl; + + mutex_lock(&res->dpi_tbl_lock); + if (dpi->dpi && dpi->type != BNXT_QPLIB_DPI_TYPE_KERNEL) + pci_iounmap(res->pdev, dpi->dbr); + + if (test_and_set_bit(dpi->bit, dpit->tbl)) { + dev_warn(&res->pdev->dev, + "Freeing an unused DPI? 
dpi = %d, bit = %d\n", + dpi->dpi, dpi->bit); + mutex_unlock(&res->dpi_tbl_lock); return -EINVAL; } if (dpit->app_tbl) - dpit->app_tbl[dpi->dpi] = NULL; + dpit->app_tbl[dpi->bit] = NULL; memset(dpi, 0, sizeof(*dpi)); - + mutex_unlock(&res->dpi_tbl_lock); return 0; } @@ -750,52 +779,38 @@ static void bnxt_qplib_free_dpi_tbl(struct bnxt_qplib_res *res, { kfree(dpit->tbl); kfree(dpit->app_tbl); - if (dpit->dbr_bar_reg_iomem) - pci_iounmap(res->pdev, dpit->dbr_bar_reg_iomem); - memset(dpit, 0, sizeof(*dpit)); + dpit->tbl = NULL; + dpit->app_tbl = NULL; + dpit->max = 0; } -static int bnxt_qplib_alloc_dpi_tbl(struct bnxt_qplib_res *res, - struct bnxt_qplib_dpi_tbl *dpit, - u32 dbr_offset) +static int bnxt_qplib_alloc_dpi_tbl(struct bnxt_qplib_res *res, + struct bnxt_qplib_dev_attr *dev_attr) { - u32 dbr_bar_reg = RCFW_DBR_PCI_BAR_REGION; - resource_size_t bar_reg_base; - u32 dbr_len, bytes; + struct bnxt_qplib_dpi_tbl *dpit; + struct bnxt_qplib_reg_desc *reg; + unsigned long bar_len; + u32 dbr_offset; + u32 bytes; - if (dpit->dbr_bar_reg_iomem) { - dev_err(&res->pdev->dev, "DBR BAR region %d already mapped\n", - dbr_bar_reg); - return -EALREADY; + dpit = &res->dpi_tbl; + reg = &dpit->wcreg; + + if (!bnxt_qplib_is_chip_gen_p5(res->cctx)) { + /* Offest should come from L2 driver */ + dbr_offset = dev_attr->l2_db_size; + dpit->ucreg.offset = dbr_offset; + dpit->wcreg.offset = dbr_offset; } - bar_reg_base = pci_resource_start(res->pdev, dbr_bar_reg); - if (!bar_reg_base) { - dev_err(&res->pdev->dev, "BAR region %d resc start failed\n", - dbr_bar_reg); - return -ENOMEM; - } + bar_len = pci_resource_len(res->pdev, reg->bar_id); + dpit->max = (bar_len - reg->offset) / PAGE_SIZE; + if (dev_attr->max_dpi) + dpit->max = min_t(u32, dpit->max, dev_attr->max_dpi); - dbr_len = pci_resource_len(res->pdev, dbr_bar_reg) - dbr_offset; - if (!dbr_len || ((dbr_len & (PAGE_SIZE - 1)) != 0)) { - dev_err(&res->pdev->dev, "Invalid DBR length %d\n", dbr_len); - return -ENOMEM; - } - - dpit->dbr_bar_reg_iomem = ioremap(bar_reg_base + dbr_offset, - dbr_len); - if (!dpit->dbr_bar_reg_iomem) { - dev_err(&res->pdev->dev, - "FP: DBR BAR region %d mapping failed\n", dbr_bar_reg); - return -ENOMEM; - } - - dpit->unmapped_dbr = bar_reg_base + dbr_offset; - dpit->max = dbr_len / PAGE_SIZE; - - dpit->app_tbl = kcalloc(dpit->max, sizeof(void *), GFP_KERNEL); + dpit->app_tbl = kcalloc(dpit->max, sizeof(void *), GFP_KERNEL); if (!dpit->app_tbl) - goto unmap_io; + return -ENOMEM; bytes = dpit->max >> 3; if (!bytes) @@ -805,17 +820,14 @@ static int bnxt_qplib_alloc_dpi_tbl(struct bnxt_qplib_res *res, if (!dpit->tbl) { kfree(dpit->app_tbl); dpit->app_tbl = NULL; - goto unmap_io; + return -ENOMEM; } memset((u8 *)dpit->tbl, 0xFF, bytes); + dpit->priv_db = dpit->ucreg.bar_reg + dpit->ucreg.offset; return 0; -unmap_io: - iounmap(dpit->dbr_bar_reg_iomem); - dpit->dbr_bar_reg_iomem = NULL; - return -ENOMEM; } /* Stats */ @@ -882,7 +894,7 @@ int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct pci_dev *pdev, if (rc) goto fail; - rc = bnxt_qplib_alloc_dpi_tbl(res, &res->dpi_tbl, dev_attr->l2_db_size); + rc = bnxt_qplib_alloc_dpi_tbl(res, dev_attr); if (rc) goto fail; @@ -892,6 +904,46 @@ fail: return rc; } +void bnxt_qplib_unmap_db_bar(struct bnxt_qplib_res *res) +{ + struct bnxt_qplib_reg_desc *reg; + + reg = &res->dpi_tbl.ucreg; + if (reg->bar_reg) + pci_iounmap(res->pdev, reg->bar_reg); + reg->bar_reg = NULL; + reg->bar_base = 0; + reg->len = 0; + reg->bar_id = 0; +} + +int bnxt_qplib_map_db_bar(struct bnxt_qplib_res *res) +{ + struct 
bnxt_qplib_reg_desc *ucreg; + struct bnxt_qplib_reg_desc *wcreg; + + wcreg = &res->dpi_tbl.wcreg; + wcreg->bar_id = RCFW_DBR_PCI_BAR_REGION; + wcreg->bar_base = pci_resource_start(res->pdev, wcreg->bar_id); + + ucreg = &res->dpi_tbl.ucreg; + ucreg->bar_id = RCFW_DBR_PCI_BAR_REGION; + ucreg->bar_base = pci_resource_start(res->pdev, ucreg->bar_id); + ucreg->len = ucreg->offset + PAGE_SIZE; + if (!ucreg->len || ((ucreg->len & (PAGE_SIZE - 1)) != 0)) { + dev_err(&res->pdev->dev, "QPLIB: invalid dbr length %d", + (int)ucreg->len); + return -EINVAL; + } + ucreg->bar_reg = ioremap(ucreg->bar_base, ucreg->len); + if (!ucreg->bar_reg) { + dev_err(&res->pdev->dev, "priviledged dpi map failed!"); + return -ENOMEM; + } + + return 0; +} + int bnxt_qplib_determine_atomics(struct pci_dev *dev) { int comp; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index 070451ac3dab..398a469d8377 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -60,6 +60,9 @@ struct bnxt_qplib_chip_ctx { u64 hwrm_intf_ver; }; +#define BNXT_QPLIB_DBR_PF_DB_OFFSET 0x10000 +#define BNXT_QPLIB_DBR_VF_DB_OFFSET 0x4000 + #define PTR_CNT_PER_PG (PAGE_SIZE / sizeof(void *)) #define PTR_MAX_IDX_PER_PG (PTR_CNT_PER_PG - 1) #define PTR_PG(x) (((x) & ~PTR_MAX_IDX_PER_PG) / PTR_CNT_PER_PG) @@ -111,6 +114,7 @@ enum bnxt_qplib_hwrm_pg_size { struct bnxt_qplib_reg_desc { u8 bar_id; resource_size_t bar_base; + unsigned long offset; void __iomem *bar_reg; size_t len; }; @@ -187,18 +191,26 @@ struct bnxt_qplib_sgid_tbl { u8 *vlan; }; +enum { + BNXT_QPLIB_DPI_TYPE_KERNEL = 0, + BNXT_QPLIB_DPI_TYPE_UC = 1, +}; + struct bnxt_qplib_dpi { u32 dpi; + u32 bit; void __iomem *dbr; u64 umdbr; + u8 type; }; struct bnxt_qplib_dpi_tbl { void **app_tbl; unsigned long *tbl; u16 max; - void __iomem *dbr_bar_reg_iomem; - u64 unmapped_dbr; + struct bnxt_qplib_reg_desc ucreg; /* Hold entire DB bar. 
*/ + struct bnxt_qplib_reg_desc wcreg; + void __iomem *priv_db; }; struct bnxt_qplib_stats { @@ -254,6 +266,8 @@ struct bnxt_qplib_res { struct bnxt_qplib_pd_tbl pd_tbl; struct bnxt_qplib_sgid_tbl sgid_tbl; struct bnxt_qplib_dpi_tbl dpi_tbl; + /* To protect the dpi table bit map */ + struct mutex dpi_tbl_lock; bool prio; bool is_vf; }; @@ -345,11 +359,10 @@ int bnxt_qplib_alloc_pd(struct bnxt_qplib_pd_tbl *pd_tbl, int bnxt_qplib_dealloc_pd(struct bnxt_qplib_res *res, struct bnxt_qplib_pd_tbl *pd_tbl, struct bnxt_qplib_pd *pd); -int bnxt_qplib_alloc_dpi(struct bnxt_qplib_dpi_tbl *dpit, - struct bnxt_qplib_dpi *dpi, - void *app); +int bnxt_qplib_alloc_dpi(struct bnxt_qplib_res *res, + struct bnxt_qplib_dpi *dpi, + void *app, u8 type); int bnxt_qplib_dealloc_dpi(struct bnxt_qplib_res *res, - struct bnxt_qplib_dpi_tbl *dpi_tbl, struct bnxt_qplib_dpi *dpi); void bnxt_qplib_cleanup_res(struct bnxt_qplib_res *res); int bnxt_qplib_init_res(struct bnxt_qplib_res *res); @@ -362,6 +375,9 @@ void bnxt_qplib_free_ctx(struct bnxt_qplib_res *res, int bnxt_qplib_alloc_ctx(struct bnxt_qplib_res *res, struct bnxt_qplib_ctx *ctx, bool virt_fn, bool is_p5); +int bnxt_qplib_map_db_bar(struct bnxt_qplib_res *res); +void bnxt_qplib_unmap_db_bar(struct bnxt_qplib_res *res); + int bnxt_qplib_determine_atomics(struct pci_dev *dev); static inline void bnxt_qplib_hwq_incr_prod(struct bnxt_qplib_hwq *hwq, u32 cnt) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 910d17ddcf13..d5ad0861c537 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -170,6 +170,9 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, attr->tqm_alloc_reqs[i * 4 + 3] = *(++tqm_alloc); } + if (rcfw->res->cctx->hwrm_intf_ver >= HWRM_VERSION_DEV_ATTR_MAX_DPI) + attr->max_dpi = le32_to_cpu(sb->max_dpi); + attr->is_atomic = bnxt_qplib_is_atomic_cap(rcfw); bail: bnxt_qplib_rcfw_free_sbuf(rcfw, sbuf); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index 4061616048e8..264ef3cedc45 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -72,6 +72,7 @@ struct bnxt_qplib_dev_attr { u8 tqm_alloc_reqs[MAX_TQM_ALLOC_REQ]; bool is_atomic; u16 dev_cap_flags; + u32 max_dpi; }; struct bnxt_qplib_pd { From 360da60d6c6edb9740de7a8e6d8969d62ceff956 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 13 Jun 2023 11:12:23 -0700 Subject: [PATCH 66/71] RDMA/bnxt_re: Enable low latency push Introduce driver specific uapi functionalites. Added a alloc_page functionality for user library to allocate specific pages. Currently added support for allocating write combine pages for push functinality. This interface shall be extended for other page allocations. Allocate a WC page using the uapi hook for enabling the low latency push in Gen P5 adapters for small packets. This is supported only for the user space QPs. 
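From the provider-library point of view the flow is roughly the following.
This is a hedged sketch, not rdma-core code: the ioctl plumbing for
BNXT_RE_METHOD_ALLOC_PAGE is elided and only the mmap step uses real APIs.

#include <stdint.h>
#include <sys/mman.h>

/* After BNXT_RE_METHOD_ALLOC_PAGE returns BNXT_RE_ALLOC_PAGE_MMAP_OFFSET,
 * BNXT_RE_ALLOC_PAGE_MMAP_LENGTH and BNXT_RE_ALLOC_PAGE_DPI, the library
 * maps the page. Write-combining is applied by the kernel side of the
 * mmap (BNXT_RE_MMAP_WC_DB), so the user mapping needs no special flags;
 * stores to this mapping are then used for the doorbell push.
 */
static void *map_wc_push_page(int cmd_fd, uint64_t mmap_offset, uint32_t length)
{
	return mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_SHARED,
		    cmd_fd, (off_t)mmap_offset);
}

The library should request the page only when the context response advertised
BNXT_RE_UCNTX_CMASK_WC_DPI_ENABLED in comp_mask; otherwise the handler added
below returns -EINVAL for BNXT_RE_ALLOC_WC_PAGE.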
Link: https://lore.kernel.org/r/1686679943-17117-8-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 3 + drivers/infiniband/hw/bnxt_re/ib_verbs.c | 148 ++++++++++++++++++++++ drivers/infiniband/hw/bnxt_re/ib_verbs.h | 4 + drivers/infiniband/hw/bnxt_re/main.c | 20 ++- drivers/infiniband/hw/bnxt_re/qplib_res.c | 3 + drivers/infiniband/hw/bnxt_re/qplib_res.h | 3 +- include/uapi/rdma/bnxt_re-abi.h | 27 ++++ 7 files changed, 204 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index f34fb87e3b77..9e278d23eb82 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -39,6 +39,7 @@ #ifndef __BNXT_RE_H__ #define __BNXT_RE_H__ +#include #include "hw_counters.h" #define ROCE_DRV_MODULE_NAME "bnxt_re" @@ -189,4 +190,6 @@ static inline struct device *rdev_to_dev(struct bnxt_re_dev *rdev) return &rdev->ibdev.dev; return NULL; } + +extern const struct uapi_definition bnxt_re_uapi_defs[]; #endif diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 8e3b45d9e656..a936e0d474f9 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -61,6 +61,15 @@ #include "bnxt_re.h" #include "ib_verbs.h" + +#include +#include + +#include + +#define UVERBS_MODULE_NAME bnxt_re +#include + #include static int __from_ib_access_flags(int iflags) @@ -546,6 +555,7 @@ bnxt_re_mmap_entry_insert(struct bnxt_re_ucontext *uctx, u64 mem_offset, entry->mem_offset = mem_offset; entry->mmap_flag = mmap_flag; + entry->uctx = uctx; switch (mmap_flag) { case BNXT_RE_MMAP_SH_PAGE: @@ -553,6 +563,7 @@ bnxt_re_mmap_entry_insert(struct bnxt_re_ucontext *uctx, u64 mem_offset, &entry->rdma_entry, PAGE_SIZE, 0); break; case BNXT_RE_MMAP_UC_DB: + case BNXT_RE_MMAP_WC_DB: ret = rdma_user_mmap_entry_insert(&uctx->ib_uctx, &entry->rdma_entry, PAGE_SIZE); break; @@ -4056,6 +4067,9 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) resp.comp_mask |= BNXT_RE_UCNTX_CMASK_HAVE_MODE; resp.mode = rdev->chip_ctx->modes.wqe_mode; + if (rdev->chip_ctx->modes.db_push) + resp.comp_mask |= BNXT_RE_UCNTX_CMASK_WC_DPI_ENABLED; + entry = bnxt_re_mmap_entry_insert(uctx, 0, BNXT_RE_MMAP_SH_PAGE, NULL); if (!entry) { rc = -ENOMEM; @@ -4119,6 +4133,12 @@ int bnxt_re_mmap(struct ib_ucontext *ib_uctx, struct vm_area_struct *vma) rdma_entry); switch (bnxt_entry->mmap_flag) { + case BNXT_RE_MMAP_WC_DB: + pfn = bnxt_entry->mem_offset >> PAGE_SHIFT; + ret = rdma_user_mmap_io(ib_uctx, vma, pfn, PAGE_SIZE, + pgprot_writecombine(vma->vm_page_prot), + rdma_entry); + break; case BNXT_RE_MMAP_UC_DB: pfn = bnxt_entry->mem_offset >> PAGE_SHIFT; ret = rdma_user_mmap_io(ib_uctx, vma, pfn, PAGE_SIZE, @@ -4146,3 +4166,131 @@ void bnxt_re_mmap_free(struct rdma_user_mmap_entry *rdma_entry) kfree(bnxt_entry); } + +static int UVERBS_HANDLER(BNXT_RE_METHOD_ALLOC_PAGE)(struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs, BNXT_RE_ALLOC_PAGE_HANDLE); + enum bnxt_re_alloc_page_type alloc_type; + struct bnxt_re_user_mmap_entry *entry; + enum bnxt_re_mmap_flag mmap_flag; + struct bnxt_qplib_chip_ctx *cctx; + struct bnxt_re_ucontext *uctx; + struct bnxt_re_dev *rdev; + u64 mmap_offset; + u32 length; + u32 dpi; + u64 dbr; + int err; + + uctx = container_of(ib_uverbs_get_ucontext(attrs), struct bnxt_re_ucontext, ib_uctx); 
+ if (IS_ERR(uctx)) + return PTR_ERR(uctx); + + err = uverbs_get_const(&alloc_type, attrs, BNXT_RE_ALLOC_PAGE_TYPE); + if (err) + return err; + + rdev = uctx->rdev; + cctx = rdev->chip_ctx; + + switch (alloc_type) { + case BNXT_RE_ALLOC_WC_PAGE: + if (cctx->modes.db_push) { + if (bnxt_qplib_alloc_dpi(&rdev->qplib_res, &uctx->wcdpi, + uctx, BNXT_QPLIB_DPI_TYPE_WC)) + return -ENOMEM; + length = PAGE_SIZE; + dpi = uctx->wcdpi.dpi; + dbr = (u64)uctx->wcdpi.umdbr; + mmap_flag = BNXT_RE_MMAP_WC_DB; + } else { + return -EINVAL; + } + + break; + + default: + return -EOPNOTSUPP; + } + + entry = bnxt_re_mmap_entry_insert(uctx, dbr, mmap_flag, &mmap_offset); + if (IS_ERR(entry)) + return PTR_ERR(entry); + + uobj->object = entry; + uverbs_finalize_uobj_create(attrs, BNXT_RE_ALLOC_PAGE_HANDLE); + err = uverbs_copy_to(attrs, BNXT_RE_ALLOC_PAGE_MMAP_OFFSET, + &mmap_offset, sizeof(mmap_offset)); + if (err) + return err; + + err = uverbs_copy_to(attrs, BNXT_RE_ALLOC_PAGE_MMAP_LENGTH, + &length, sizeof(length)); + if (err) + return err; + + err = uverbs_copy_to(attrs, BNXT_RE_ALLOC_PAGE_DPI, + &dpi, sizeof(length)); + if (err) + return err; + + return 0; +} + +static int alloc_page_obj_cleanup(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct bnxt_re_user_mmap_entry *entry = uobject->object; + struct bnxt_re_ucontext *uctx = entry->uctx; + + switch (entry->mmap_flag) { + case BNXT_RE_MMAP_WC_DB: + if (uctx && uctx->wcdpi.dbr) { + struct bnxt_re_dev *rdev = uctx->rdev; + + bnxt_qplib_dealloc_dpi(&rdev->qplib_res, &uctx->wcdpi); + uctx->wcdpi.dbr = NULL; + } + break; + default: + goto exit; + } + rdma_user_mmap_entry_remove(&entry->rdma_entry); +exit: + return 0; +} + +DECLARE_UVERBS_NAMED_METHOD(BNXT_RE_METHOD_ALLOC_PAGE, + UVERBS_ATTR_IDR(BNXT_RE_ALLOC_PAGE_HANDLE, + BNXT_RE_OBJECT_ALLOC_PAGE, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(BNXT_RE_ALLOC_PAGE_TYPE, + enum bnxt_re_alloc_page_type, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(BNXT_RE_ALLOC_PAGE_MMAP_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(BNXT_RE_ALLOC_PAGE_MMAP_LENGTH, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(BNXT_RE_ALLOC_PAGE_DPI, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY(BNXT_RE_METHOD_DESTROY_PAGE, + UVERBS_ATTR_IDR(BNXT_RE_DESTROY_PAGE_HANDLE, + BNXT_RE_OBJECT_ALLOC_PAGE, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT(BNXT_RE_OBJECT_ALLOC_PAGE, + UVERBS_TYPE_ALLOC_IDR(alloc_page_obj_cleanup), + &UVERBS_METHOD(BNXT_RE_METHOD_ALLOC_PAGE), + &UVERBS_METHOD(BNXT_RE_METHOD_DESTROY_PAGE)); + +const struct uapi_definition bnxt_re_uapi_defs[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(BNXT_RE_OBJECT_ALLOC_PAGE), + {} +}; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index dcd31aefad4c..32d9e9d09791 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -61,6 +61,7 @@ struct bnxt_re_pd { struct bnxt_qplib_pd qplib_pd; struct bnxt_re_fence_data fence; struct rdma_user_mmap_entry *pd_db_mmap; + struct rdma_user_mmap_entry *pd_wcdb_mmap; }; struct bnxt_re_ah { @@ -135,6 +136,7 @@ struct bnxt_re_ucontext { struct ib_ucontext ib_uctx; struct bnxt_re_dev *rdev; struct bnxt_qplib_dpi dpi; + struct bnxt_qplib_dpi wcdpi; void *shpg; spinlock_t sh_lock; /* protect shpg */ struct rdma_user_mmap_entry *shpage_mmap; @@ -143,10 +145,12 @@ struct bnxt_re_ucontext { enum 
bnxt_re_mmap_flag { BNXT_RE_MMAP_SH_PAGE, BNXT_RE_MMAP_UC_DB, + BNXT_RE_MMAP_WC_DB, }; struct bnxt_re_user_mmap_entry { struct rdma_user_mmap_entry rdma_entry; + struct bnxt_re_ucontext *uctx; u64 mem_offset; u8 mmap_flag; }; diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 0c681134dd57..0816cf2a0b38 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -66,6 +66,7 @@ #include #include "bnxt.h" #include "hw_counters.h" +#include "ib_verbs.h" static char version[] = BNXT_RE_DESC "\n"; @@ -117,6 +118,10 @@ static void bnxt_re_set_db_offset(struct bnxt_re_dev *rdev) * in such cases and DB-push will be disabled. */ barlen = pci_resource_len(res->pdev, RCFW_DBR_PCI_BAR_REGION); + if (cctx->modes.db_push && l2db_len && en_dev->l2_db_size != barlen) { + res->dpi_tbl.wcreg.offset = en_dev->l2_db_size; + dev_info(rdev_to_dev(rdev), "Low latency framework is enabled\n"); + } } static void bnxt_re_set_drv_mode(struct bnxt_re_dev *rdev, u8 mode) @@ -395,8 +400,7 @@ static int bnxt_re_hwrm_qcfg(struct bnxt_re_dev *rdev, u32 *db_len, int rc; memset(&fw_msg, 0, sizeof(fw_msg)); - bnxt_re_init_hwrm_hdr(rdev, (void *)&req, - HWRM_FUNC_QCFG, -1, -1); + bnxt_re_init_hwrm_hdr((void *)&req, HWRM_FUNC_QCFG); req.fid = cpu_to_le16(0xffff); bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); @@ -416,13 +420,20 @@ int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev) struct hwrm_func_qcaps_input req = {}; struct bnxt_qplib_chip_ctx *cctx; struct bnxt_fw_msg fw_msg = {}; + int rc; cctx = rdev->chip_ctx; bnxt_re_init_hwrm_hdr((void *)&req, HWRM_FUNC_QCAPS); req.fid = cpu_to_le16(0xffff); bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); - return bnxt_send_msg(en_dev, &fw_msg); + + rc = bnxt_send_msg(en_dev, &fw_msg); + if (rc) + return rc; + cctx->modes.db_push = le32_to_cpu(resp.flags) & FUNC_QCAPS_RESP_FLAGS_WCB_PUSH_MODE; + + return 0; } static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, @@ -669,6 +680,9 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) ibdev->dev.parent = &rdev->en_dev->pdev->dev; ibdev->local_dma_lkey = BNXT_QPLIB_RSVD_LKEY; + if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)) + ibdev->driver_def = bnxt_re_uapi_defs; + ib_set_device_ops(ibdev, &bnxt_re_dev_ops); ret = ib_device_set_netdev(&rdev->ibdev, rdev->netdev, 1); if (ret) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index e1cbe594e56b..174db831b888 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -740,6 +740,9 @@ int bnxt_qplib_alloc_dpi(struct bnxt_qplib_res *res, dpi->dbr = dpit->priv_db; dpi->dpi = dpi->bit; break; + case BNXT_QPLIB_DPI_TYPE_WC: + dpi->dbr = ioremap_wc(umaddr, PAGE_SIZE); + break; default: dpi->dbr = ioremap(umaddr, PAGE_SIZE); break; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index 398a469d8377..d850a553821e 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -47,7 +47,7 @@ extern const struct bnxt_qplib_gid bnxt_qplib_gid_zero; struct bnxt_qplib_drv_modes { u8 wqe_mode; - /* Other modes to follow here */ + bool db_push; }; struct bnxt_qplib_chip_ctx { @@ -194,6 +194,7 @@ struct bnxt_qplib_sgid_tbl { enum { BNXT_QPLIB_DPI_TYPE_KERNEL = 0, BNXT_QPLIB_DPI_TYPE_UC = 1, + BNXT_QPLIB_DPI_TYPE_WC 
= 2 }; struct bnxt_qplib_dpi { diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h index c4e90775da0c..8a2a1d4f6b29 100644 --- a/include/uapi/rdma/bnxt_re-abi.h +++ b/include/uapi/rdma/bnxt_re-abi.h @@ -41,6 +41,7 @@ #define __BNXT_RE_UVERBS_ABI_H__ #include +#include #define BNXT_RE_ABI_VERSION 1 @@ -51,6 +52,7 @@ enum { BNXT_RE_UCNTX_CMASK_HAVE_CCTX = 0x1ULL, BNXT_RE_UCNTX_CMASK_HAVE_MODE = 0x02ULL, + BNXT_RE_UCNTX_CMASK_WC_DPI_ENABLED = 0x04ULL, }; enum bnxt_re_wqe_mode { @@ -127,4 +129,29 @@ enum bnxt_re_shpg_offt { BNXT_RE_END_RESV_OFFT = 0xFF0 }; +enum bnxt_re_objects { + BNXT_RE_OBJECT_ALLOC_PAGE = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum bnxt_re_alloc_page_type { + BNXT_RE_ALLOC_WC_PAGE = 0, +}; + +enum bnxt_re_var_alloc_page_attrs { + BNXT_RE_ALLOC_PAGE_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + BNXT_RE_ALLOC_PAGE_TYPE, + BNXT_RE_ALLOC_PAGE_DPI, + BNXT_RE_ALLOC_PAGE_MMAP_OFFSET, + BNXT_RE_ALLOC_PAGE_MMAP_LENGTH, +}; + +enum bnxt_re_alloc_page_attrs { + BNXT_RE_DESTROY_PAGE_HANDLE = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum bnxt_re_alloc_page_methods { + BNXT_RE_METHOD_ALLOC_PAGE = (1U << UVERBS_ID_NS_SHIFT), + BNXT_RE_METHOD_DESTROY_PAGE, +}; + #endif /* __BNXT_RE_UVERBS_ABI_H__*/ From c8dce4e7438be24be7a5b8477555ba03c0fb16ae Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Fri, 16 Jun 2023 11:46:59 +0530 Subject: [PATCH 67/71] RDMA/bnxt_re: Remove incorrect return check from slow path The commit 691eb7c6110f ("RDMA/bnxt_re: handle command completions after driver detect a timedout") introduced code resulting in below warning issued by the smatch static checker. drivers/infiniband/hw/bnxt_re/qplib_rcfw.c:513 __bnxt_qplib_rcfw_send_message() warn: duplicate check 'rc' (previous on line 506) Fix the warning by removing incorrect code block. Fixes: 691eb7c6110f ("RDMA/bnxt_re: handle command completions after driver detect a timedout") Link: https://lore.kernel.org/r/20230616061700.741769-1-kashyap.desai@broadcom.com Reported-by: Dan Carpenter Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 92b3a4fbd0b2..1aa7c7b9ddb1 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -502,12 +502,6 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, rc = __wait_for_resp(rcfw, cookie); else rc = __poll_for_resp(rcfw, cookie); - if (rc) { - /* timed out */ - dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x timedout (%d)msec\n", - cookie, opcode, RCFW_CMD_WAIT_TIME_MS); - return rc; - } if (rc) { spin_lock_irqsave(&rcfw->cmdq.hwq.lock, flags); From 25ed2d409f5ff73f1bde8e9b2863f686364cbc7f Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Fri, 16 Jun 2023 11:47:00 +0530 Subject: [PATCH 68/71] RDMA/bnxt_re: Refactor code around bnxt_qplib_map_rc() Update function comment of bnxt_qplib_map_rc() Remove intermediate return value ENXIO and directly called bnxt_qplib_map_rc() from __send_message_basic_sanity(). 
From 25ed2d409f5ff73f1bde8e9b2863f686364cbc7f Mon Sep 17 00:00:00 2001
From: Kashyap Desai
Date: Fri, 16 Jun 2023 11:47:00 +0530
Subject: [PATCH 68/71] RDMA/bnxt_re: Refactor code around bnxt_qplib_map_rc()

Update the function comment of bnxt_qplib_map_rc().

Remove the intermediate -ENXIO return value and call bnxt_qplib_map_rc()
directly from __send_message_basic_sanity().

Link: https://lore.kernel.org/r/20230616061700.741769-2-kashyap.desai@broadcom.com
Reported-by: Dan Carpenter
Signed-off-by: Kashyap Desai
Signed-off-by: Selvin Xavier
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
index 1aa7c7b9ddb1..b30e66b64827 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
@@ -57,13 +57,20 @@ static void bnxt_qplib_service_creq(struct tasklet_struct *t);
  * bnxt_qplib_map_rc - map return type based on opcode
  * @opcode - roce slow path opcode
  *
- * In some cases like firmware halt is detected, the driver is supposed to
- * remap the error code of the timed out command.
+ * case #1
+ * Firmware initiated error recovery is a safe state machine and
+ * driver can consider all the underlying rdma resources are free.
+ * In this state, it is safe to return success for opcodes related to
+ * destroying rdma resources (like destroy qp, destroy cq etc.).
  *
- * It is not safe to assume hardware is really inactive so certain opcodes
- * like destroy qp etc are not safe to be returned success, but this function
- * will be called when FW already reports a timeout. This would be possible
- * only when FW crashes and resets. This will clear all the HW resources.
+ * case #2
+ * If driver detect potential firmware stall, it is not safe state machine
+ * and the driver can not consider all the underlying rdma resources are
+ * freed.
+ * In this state, it is not safe to return success for opcodes related to
+ * destroying rdma resources (like destroy qp, destroy cq etc.).
+ *
+ * Scope of this helper function is only for case #1.
  *
  * Returns:
  * 0 to communicate success to caller.
@@ -417,7 +424,7 @@ static int __send_message_basic_sanity(struct bnxt_qplib_rcfw *rcfw,
 
 	/* Prevent posting if f/w is not in a state to process */
 	if (test_bit(ERR_DEVICE_DETACHED, &rcfw->cmdq.flags))
-		return -ENXIO;
+		return bnxt_qplib_map_rc(opcode);
 
 	if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags))
 		return -ETIMEDOUT;
@@ -487,7 +494,7 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw,
 
 	rc = __send_message_basic_sanity(rcfw, msg, opcode);
 	if (rc)
-		return rc == -ENXIO ? bnxt_qplib_map_rc(opcode) : rc;
+		return rc;
 
 	rc = __send_message(rcfw, msg, opcode);
 	if (rc)
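To make "case #1" concrete, here is a simplified, illustrative mapping in the spirit of bnxt_qplib_map_rc(); the opcode list is abridged and the default error value is an assumption for the sketch, not a quote of the driver's exact table.

/* Illustrative sketch (abridged): once the device is detached and
 * firmware-driven recovery has already torn the resources down,
 * destroy-type commands can be reported as successful; other commands
 * are reported as timed out.
 */
static int example_map_rc(u8 opcode)
{
	switch (opcode) {
	case CMDQ_BASE_OPCODE_DESTROY_QP:
	case CMDQ_BASE_OPCODE_DESTROY_CQ:
	case CMDQ_BASE_OPCODE_DESTROY_SRQ:
	case CMDQ_BASE_OPCODE_DESTROY_AH:
		return 0;
	default:
		return -ETIMEDOUT;
	}
}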
From 0ab83a6459604c566a745875c4df1aec8e8866c0 Mon Sep 17 00:00:00 2001
From: Yang Li
Date: Mon, 26 Jun 2023 08:36:32 +0800
Subject: [PATCH 69/71] RDMA/bnxt_re: Remove duplicated include in bnxt_re/main.c

./drivers/infiniband/hw/bnxt_re/main.c: ib_verbs.h is included more than once.

Link: https://lore.kernel.org/r/20230626003632.60435-1-yang.lee@linux.alibaba.com
Reported-by: Abaci Robot
Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=5588
Signed-off-by: Yang Li
Acked-by: Selvin Xavier
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/bnxt_re/main.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 0816cf2a0b38..729a2f5318cc 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -66,7 +66,6 @@
 #include
 #include "bnxt.h"
 #include "hw_counters.h"
-#include "ib_verbs.h"
 
 static char version[] = BNXT_RE_DESC "\n";

From d1d7fc3bf6d2e6fb5d81a24dbb4120779a447b33 Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Mon, 26 Jun 2023 09:35:35 +0100
Subject: [PATCH 70/71] RDMA/bnxt_re: Fix spelling mistake "priviledged" -> "privileged"

There is a spelling mistake in a comment and in a dev_err error message.
Fix them.

Link: https://lore.kernel.org/r/20230626083535.53303-1-colin.i.king@gmail.com
Signed-off-by: Colin Ian King
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/bnxt_re/qplib_res.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c
index 174db831b888..7674136c08b2 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_res.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c
@@ -734,7 +734,7 @@ int bnxt_qplib_alloc_dpi(struct bnxt_qplib_res *res,
 
 	switch (type) {
 	case BNXT_QPLIB_DPI_TYPE_KERNEL:
-		/* priviledged dbr was already mapped just initialize it. */
+		/* privileged dbr was already mapped just initialize it. */
 		dpi->umdbr = dpit->ucreg.bar_base +
 			     dpit->ucreg.offset + bit_num * PAGE_SIZE;
 		dpi->dbr = dpit->priv_db;
@@ -940,7 +940,7 @@ int bnxt_qplib_map_db_bar(struct bnxt_qplib_res *res)
 	}
 	ucreg->bar_reg = ioremap(ucreg->bar_base, ucreg->len);
 	if (!ucreg->bar_reg) {
-		dev_err(&res->pdev->dev, "priviledged dpi map failed!");
+		dev_err(&res->pdev->dev, "privileged dpi map failed!");
 		return -ENOMEM;
 	}

From 4251f631fdfba0b38e4634510c5950ee157cc069 Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Tue, 27 Jun 2023 10:20:13 +0300
Subject: [PATCH 71/71] RDMA/bnxt_re: Fix an IS_ERR() vs NULL check

The bnxt_re_mmap_entry_insert() function returns NULL, not error pointers.
Update the check for errors accordingly.

Fixes: 360da60d6c6e ("RDMA/bnxt_re: Enable low latency push")
Link: https://lore.kernel.org/r/8d92e85f-626b-4eca-8501-ca7024cfc0ee@moroto.mountain
Signed-off-by: Dan Carpenter
Acked-by: Selvin Xavier
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/bnxt_re/ib_verbs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index a936e0d474f9..ef47c32a53cb 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -4214,8 +4214,8 @@ static int UVERBS_HANDLER(BNXT_RE_METHOD_ALLOC_PAGE)(struct uverbs_attr_bundle *
 	}
 
 	entry = bnxt_re_mmap_entry_insert(uctx, dbr, mmap_flag, &mmap_offset);
-	if (IS_ERR(entry))
-		return PTR_ERR(entry);
+	if (!entry)
+		return -ENOMEM;
 
 	uobj->object = entry;
 	uverbs_finalize_uobj_create(attrs, BNXT_RE_ALLOC_PAGE_HANDLE);
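As a closing illustration of why the IS_ERR() test was wrong here: allocation-style helpers that return a pointer or NULL must be checked with !ptr, because IS_ERR() only recognizes ERR_PTR()-encoded values. The helper below is a hedged sketch of such an insert function (field names taken from the mmap entry shown earlier, everything else assumed), not the driver's verbatim bnxt_re_mmap_entry_insert().

/* Illustrative sketch: failure is signalled with NULL, never ERR_PTR(),
 * so the caller's "if (!entry) return -ENOMEM;" is the correct check.
 */
static struct bnxt_re_user_mmap_entry *
example_mmap_entry_insert(struct bnxt_re_ucontext *uctx, u64 mem_offset,
			  u8 mmap_flag, u64 *offset)
{
	struct bnxt_re_user_mmap_entry *entry;

	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return NULL;

	entry->mem_offset = mem_offset;
	entry->mmap_flag = mmap_flag;
	entry->uctx = uctx;

	if (rdma_user_mmap_entry_insert(&uctx->ib_uctx, &entry->rdma_entry,
					PAGE_SIZE)) {
		kfree(entry);
		return NULL;
	}

	if (offset)
		*offset = rdma_user_mmap_get_offset(&entry->rdma_entry);

	return entry;
}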