Linus Torvalds af3877265d v6.4 merge window RDMA pull request
Usual wide collection of unrelated items in drivers:
 
 - Driver bug fixes and treewide cleanups in hfi1, siw, qib, mlx5, rxe,
   usnic, usnic, bnxt_re, ocrdma, iser
    * Unnecessary NULL checks
    * kmap obsolescence
    * pci_enable_pcie_error_reporting() obsolescence
    * Unused variables and macros
    * trace event related warnings
    * casting warnings
 
 - Code cleanups for irdm and erdma
 
 - EFA reporting of 128 byte PCIe TLP support
 
 - mlx5 more agressively uses the out of order HW feature
 
 - Big rework of how state machines and tasks work in rxe
 
 - Fix a syzkaller found crash netdev refcount leak in siw
 
 - bnxt_re revises their HW description header
 
 - Congestion control for bnxt_re
 
 - Use mmu_notifiers more safely in hfi1
 
 - mlx5 gets better support for PCIe relaxed ordering inside VMs
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYIAB0WIQRRRCHOFoQz/8F5bUaFwuHvBreFYQUCZEva5wAKCRCFwuHvBreF
 YZFmAQC9T3b/XQ3bRknYciuzbatC98o9xB0FTqmEFYGj+Y2lVAD9EEVe3HKfHfi3
 t/GxXYB5r22oxg5bgsblZfEdEdTVCg8=
 =akMm
 -----END PGP SIGNATURE-----

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull rdma updates from Jason Gunthorpe:
 "Usual wide collection of unrelated items in drivers:

   - Driver bug fixes and treewide cleanups in hfi1, siw, qib, mlx5,
     rxe, usnic, usnic, bnxt_re, ocrdma, iser:
       - remove unnecessary NULL checks
       - kmap obsolescence
       - pci_enable_pcie_error_reporting() obsolescence
       - unused variables and macros
       - trace event related warnings
       - casting warnings

   - Code cleanups for irdm and erdma

   - EFA reporting of 128 byte PCIe TLP support

   - mlx5 more agressively uses the out of order HW feature

   - Big rework of how state machines and tasks work in rxe

   - Fix a syzkaller found crash netdev refcount leak in siw

   - bnxt_re revises their HW description header

   - Congestion control for bnxt_re

   - Use mmu_notifiers more safely in hfi1

   - mlx5 gets better support for PCIe relaxed ordering inside VMs"

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (81 commits)
  RDMA/efa: Add rdma write capability to device caps
  RDMA/mlx5: Use correct device num_ports when modify DC
  RDMA/irdma: Drop spurious WQ_UNBOUND from alloc_ordered_workqueue() call
  RDMA/rxe: Fix spinlock recursion deadlock on requester
  RDMA/mlx5: Fix flow counter query via DEVX
  RDMA/rxe: Protect QP state with qp->state_lock
  RDMA/rxe: Move code to check if drained to subroutine
  RDMA/rxe: Remove qp->req.state
  RDMA/rxe: Remove qp->comp.state
  RDMA/rxe: Remove qp->resp.state
  RDMA/mlx5: Allow relaxed ordering read in VFs and VMs
  net/mlx5: Update relaxed ordering read HCA capabilities
  RDMA/mlx5: Check pcie_relaxed_ordering_enabled() in UMR
  RDMA/mlx5: Remove pcie_relaxed_ordering_enabled() check for RO write
  RDMA: Add ib_virt_dma_to_page()
  RDMA/rxe: Fix the error "trying to register non-static key in rxe_cleanup_task"
  RDMA/irdma: Slightly optimize irdma_form_ah_cm_frame()
  RDMA/rxe: Fix incorrect TASKLET_STATE_SCHED check in rxe_task.c
  IB/hfi1: Place struct mmu_rb_handler on cache line start
  IB/hfi1: Fix bugs with non-PAGE_SIZE-end multi-iovec user SDMA requests
  ...
2023-04-29 17:21:24 -07:00

204 lines
5.9 KiB
C

// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
/* Kai Shen <kaishen@linux.alibaba.com> */
/* Copyright (c) 2020-2022, Alibaba Group. */
#include "erdma_verbs.h"
static void *get_next_valid_cqe(struct erdma_cq *cq)
{
__be32 *cqe = get_queue_entry(cq->kern_cq.qbuf, cq->kern_cq.ci,
cq->depth, CQE_SHIFT);
u32 owner = FIELD_GET(ERDMA_CQE_HDR_OWNER_MASK,
be32_to_cpu(READ_ONCE(*cqe)));
return owner ^ !!(cq->kern_cq.ci & cq->depth) ? cqe : NULL;
}
static void notify_cq(struct erdma_cq *cq, u8 solcitied)
{
u64 db_data =
FIELD_PREP(ERDMA_CQDB_IDX_MASK, (cq->kern_cq.notify_cnt)) |
FIELD_PREP(ERDMA_CQDB_CQN_MASK, cq->cqn) |
FIELD_PREP(ERDMA_CQDB_ARM_MASK, 1) |
FIELD_PREP(ERDMA_CQDB_SOL_MASK, solcitied) |
FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cq->kern_cq.cmdsn) |
FIELD_PREP(ERDMA_CQDB_CI_MASK, cq->kern_cq.ci);
*cq->kern_cq.db_record = db_data;
writeq(db_data, cq->kern_cq.db);
}
int erdma_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
{
struct erdma_cq *cq = to_ecq(ibcq);
unsigned long irq_flags;
int ret = 0;
spin_lock_irqsave(&cq->kern_cq.lock, irq_flags);
notify_cq(cq, (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED);
if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && get_next_valid_cqe(cq))
ret = 1;
cq->kern_cq.notify_cnt++;
spin_unlock_irqrestore(&cq->kern_cq.lock, irq_flags);
return ret;
}
static const enum ib_wc_opcode wc_mapping_table[ERDMA_NUM_OPCODES] = {
[ERDMA_OP_WRITE] = IB_WC_RDMA_WRITE,
[ERDMA_OP_READ] = IB_WC_RDMA_READ,
[ERDMA_OP_SEND] = IB_WC_SEND,
[ERDMA_OP_SEND_WITH_IMM] = IB_WC_SEND,
[ERDMA_OP_RECEIVE] = IB_WC_RECV,
[ERDMA_OP_RECV_IMM] = IB_WC_RECV_RDMA_WITH_IMM,
[ERDMA_OP_RECV_INV] = IB_WC_RECV,
[ERDMA_OP_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
[ERDMA_OP_RSP_SEND_IMM] = IB_WC_RECV,
[ERDMA_OP_SEND_WITH_INV] = IB_WC_SEND,
[ERDMA_OP_REG_MR] = IB_WC_REG_MR,
[ERDMA_OP_LOCAL_INV] = IB_WC_LOCAL_INV,
[ERDMA_OP_READ_WITH_INV] = IB_WC_RDMA_READ,
[ERDMA_OP_ATOMIC_CAS] = IB_WC_COMP_SWAP,
[ERDMA_OP_ATOMIC_FAA] = IB_WC_FETCH_ADD,
};
static const struct {
enum erdma_wc_status erdma;
enum ib_wc_status base;
enum erdma_vendor_err vendor;
} map_cqe_status[ERDMA_NUM_WC_STATUS] = {
{ ERDMA_WC_SUCCESS, IB_WC_SUCCESS, ERDMA_WC_VENDOR_NO_ERR },
{ ERDMA_WC_GENERAL_ERR, IB_WC_GENERAL_ERR, ERDMA_WC_VENDOR_NO_ERR },
{ ERDMA_WC_RECV_WQE_FORMAT_ERR, IB_WC_GENERAL_ERR,
ERDMA_WC_VENDOR_INVALID_RQE },
{ ERDMA_WC_RECV_STAG_INVALID_ERR, IB_WC_REM_ACCESS_ERR,
ERDMA_WC_VENDOR_RQE_INVALID_STAG },
{ ERDMA_WC_RECV_ADDR_VIOLATION_ERR, IB_WC_REM_ACCESS_ERR,
ERDMA_WC_VENDOR_RQE_ADDR_VIOLATION },
{ ERDMA_WC_RECV_RIGHT_VIOLATION_ERR, IB_WC_REM_ACCESS_ERR,
ERDMA_WC_VENDOR_RQE_ACCESS_RIGHT_ERR },
{ ERDMA_WC_RECV_PDID_ERR, IB_WC_REM_ACCESS_ERR,
ERDMA_WC_VENDOR_RQE_INVALID_PD },
{ ERDMA_WC_RECV_WARRPING_ERR, IB_WC_REM_ACCESS_ERR,
ERDMA_WC_VENDOR_RQE_WRAP_ERR },
{ ERDMA_WC_SEND_WQE_FORMAT_ERR, IB_WC_LOC_QP_OP_ERR,
ERDMA_WC_VENDOR_INVALID_SQE },
{ ERDMA_WC_SEND_WQE_ORD_EXCEED, IB_WC_GENERAL_ERR,
ERDMA_WC_VENDOR_ZERO_ORD },
{ ERDMA_WC_SEND_STAG_INVALID_ERR, IB_WC_LOC_ACCESS_ERR,
ERDMA_WC_VENDOR_SQE_INVALID_STAG },
{ ERDMA_WC_SEND_ADDR_VIOLATION_ERR, IB_WC_LOC_ACCESS_ERR,
ERDMA_WC_VENDOR_SQE_ADDR_VIOLATION },
{ ERDMA_WC_SEND_RIGHT_VIOLATION_ERR, IB_WC_LOC_ACCESS_ERR,
ERDMA_WC_VENDOR_SQE_ACCESS_ERR },
{ ERDMA_WC_SEND_PDID_ERR, IB_WC_LOC_ACCESS_ERR,
ERDMA_WC_VENDOR_SQE_INVALID_PD },
{ ERDMA_WC_SEND_WARRPING_ERR, IB_WC_LOC_ACCESS_ERR,
ERDMA_WC_VENDOR_SQE_WARP_ERR },
{ ERDMA_WC_FLUSH_ERR, IB_WC_WR_FLUSH_ERR, ERDMA_WC_VENDOR_NO_ERR },
{ ERDMA_WC_RETRY_EXC_ERR, IB_WC_RETRY_EXC_ERR, ERDMA_WC_VENDOR_NO_ERR },
};
#define ERDMA_POLLCQ_NO_QP 1
static int erdma_poll_one_cqe(struct erdma_cq *cq, struct ib_wc *wc)
{
struct erdma_dev *dev = to_edev(cq->ibcq.device);
u8 opcode, syndrome, qtype;
struct erdma_kqp *kern_qp;
struct erdma_cqe *cqe;
struct erdma_qp *qp;
u16 wqe_idx, depth;
u32 qpn, cqe_hdr;
u64 *id_table;
u64 *wqe_hdr;
cqe = get_next_valid_cqe(cq);
if (!cqe)
return -EAGAIN;
cq->kern_cq.ci++;
/* cqbuf should be ready when we poll */
dma_rmb();
qpn = be32_to_cpu(cqe->qpn);
wqe_idx = be32_to_cpu(cqe->qe_idx);
cqe_hdr = be32_to_cpu(cqe->hdr);
qp = find_qp_by_qpn(dev, qpn);
if (!qp)
return ERDMA_POLLCQ_NO_QP;
kern_qp = &qp->kern_qp;
qtype = FIELD_GET(ERDMA_CQE_HDR_QTYPE_MASK, cqe_hdr);
syndrome = FIELD_GET(ERDMA_CQE_HDR_SYNDROME_MASK, cqe_hdr);
opcode = FIELD_GET(ERDMA_CQE_HDR_OPCODE_MASK, cqe_hdr);
if (qtype == ERDMA_CQE_QTYPE_SQ) {
id_table = kern_qp->swr_tbl;
depth = qp->attrs.sq_size;
wqe_hdr = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx,
qp->attrs.sq_size, SQEBB_SHIFT);
kern_qp->sq_ci =
FIELD_GET(ERDMA_SQE_HDR_WQEBB_CNT_MASK, *wqe_hdr) +
wqe_idx + 1;
} else {
id_table = kern_qp->rwr_tbl;
depth = qp->attrs.rq_size;
}
wc->wr_id = id_table[wqe_idx & (depth - 1)];
wc->byte_len = be32_to_cpu(cqe->size);
wc->wc_flags = 0;
wc->opcode = wc_mapping_table[opcode];
if (opcode == ERDMA_OP_RECV_IMM || opcode == ERDMA_OP_RSP_SEND_IMM) {
wc->ex.imm_data = cpu_to_be32(le32_to_cpu(cqe->imm_data));
wc->wc_flags |= IB_WC_WITH_IMM;
} else if (opcode == ERDMA_OP_RECV_INV) {
wc->ex.invalidate_rkey = be32_to_cpu(cqe->inv_rkey);
wc->wc_flags |= IB_WC_WITH_INVALIDATE;
}
if (syndrome >= ERDMA_NUM_WC_STATUS)
syndrome = ERDMA_WC_GENERAL_ERR;
wc->status = map_cqe_status[syndrome].base;
wc->vendor_err = map_cqe_status[syndrome].vendor;
wc->qp = &qp->ibqp;
return 0;
}
int erdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
{
struct erdma_cq *cq = to_ecq(ibcq);
unsigned long flags;
int npolled, ret;
spin_lock_irqsave(&cq->kern_cq.lock, flags);
for (npolled = 0; npolled < num_entries;) {
ret = erdma_poll_one_cqe(cq, wc + npolled);
if (ret == -EAGAIN) /* no received new CQEs. */
break;
else if (ret) /* ignore invalid CQEs. */
continue;
npolled++;
}
spin_unlock_irqrestore(&cq->kern_cq.lock, flags);
return npolled;
}