RDMA/erdma: Refactor the storage structure of MTT entries

Currently our MTT supports only inline MTT entries (0-level MTT) and
indirect MTT entries (1-level MTT), which limits the maximum length
of MRs. In order to implement a multi-level MTT, we first refactor
the storage structure of the MTT.

Signed-off-by: Cheng Xu <chengyou@linux.alibaba.com>
Link: https://lore.kernel.org/r/20230817102151.75964-3-chengyou@linux.alibaba.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Cheng Xu authored on 2023-08-17 18:21:50 +08:00; committed by Leon Romanovsky
parent d7cfbba90b
commit 7244b4aa42
4 changed files with 152 additions and 94 deletions
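To make the size limit mentioned in the message concrete: an inline (0-level) MTT embeds at most ERDMA_MAX_INLINE_MTT_ENTRIES page addresses in the command itself, while an indirect (1-level) MTT uses a single physically continuous, DMA-mapped buffer of at most ERDMA_MR_MAX_MTT_CNT entries, so the MR length is bounded by entry count times page size. The snippet below is only an editorial back-of-the-envelope calculation using the constants from erdma_verbs.h in this diff; the 4 KiB page size is an assumption, not something the driver fixes.

#include <stdio.h>

/* Constants copied from erdma_verbs.h in the diff below. */
#define ERDMA_MAX_INLINE_MTT_ENTRIES	4
#define ERDMA_MR_MAX_MTT_CNT		524288
#define ERDMA_MTT_ENTRY_SIZE		8

int main(void)
{
	unsigned long long page_size = 4096;	/* assumed page size: 4 KiB */

	/* 0-level (inline) MTT: entries live directly in the command. */
	printf("inline MTT, max MR length:   %llu KiB\n",
	       ERDMA_MAX_INLINE_MTT_ENTRIES * page_size / 1024);

	/* 1-level (indirect) MTT: one continuous buffer of entries. */
	printf("indirect MTT, max MR length: %llu GiB\n",
	       ERDMA_MR_MAX_MTT_CNT * page_size / (1024 * 1024 * 1024));

	/* Size of that entry buffer itself, cf. MTT_SIZE(): 8 bytes/entry. */
	printf("indirect MTT buffer size:    %llu MiB\n",
	       ERDMA_MR_MAX_MTT_CNT * (unsigned long long)ERDMA_MTT_ENTRY_SIZE /
		       (1024 * 1024));
	return 0;
}

With 4 KiB pages that works out to 2 GiB per MR through a single indirect level; the new struct erdma_mtt (with its low_level pointer and the scatterlist branch of the union) is what later patches build on to stack further levels and lift this limit.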

drivers/infiniband/hw/erdma/erdma_hw.h

@@ -228,7 +228,7 @@ struct erdma_cmdq_ext_db_req {
 
 /* create_cq cfg1 */
 #define ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK GENMASK(31, 16)
-#define ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK BIT(15)
+#define ERDMA_CMD_CREATE_CQ_MTT_LEVEL_MASK BIT(15)
 #define ERDMA_CMD_CREATE_CQ_MTT_DB_CFG_MASK BIT(11)
 #define ERDMA_CMD_CREATE_CQ_EQN_MASK GENMASK(9, 0)
@@ -258,7 +258,7 @@ struct erdma_cmdq_create_cq_req {
 
 /* regmr cfg2 */
 #define ERDMA_CMD_REGMR_PAGESIZE_MASK GENMASK(31, 27)
-#define ERDMA_CMD_REGMR_MTT_TYPE_MASK GENMASK(21, 20)
+#define ERDMA_CMD_REGMR_MTT_LEVEL_MASK GENMASK(21, 20)
 #define ERDMA_CMD_REGMR_MTT_CNT_MASK GENMASK(19, 0)
 
 struct erdma_cmdq_reg_mr_req {

drivers/infiniband/hw/erdma/erdma_qp.c

@@ -410,7 +410,7 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi,
 		/* Copy SGLs to SQE content to accelerate */
 		memcpy(get_queue_entry(qp->kern_qp.sq_buf, idx + 1,
 				       qp->attrs.sq_size, SQEBB_SHIFT),
-		       mr->mem.mtt_buf, MTT_SIZE(mr->mem.mtt_nents));
+		       mr->mem.mtt->buf, MTT_SIZE(mr->mem.mtt_nents));
 		wqe_size = sizeof(struct erdma_reg_mr_sqe) +
 			   MTT_SIZE(mr->mem.mtt_nents);
 	} else {

drivers/infiniband/hw/erdma/erdma_verbs.c

@@ -19,6 +19,23 @@
 #include "erdma_cm.h"
 #include "erdma_verbs.h"
 
+static void assemble_qbuf_mtt_for_cmd(struct erdma_mem *mem, u32 *cfg,
+				      u64 *addr0, u64 *addr1)
+{
+	struct erdma_mtt *mtt = mem->mtt;
+
+	if (mem->mtt_nents > ERDMA_MAX_INLINE_MTT_ENTRIES) {
+		*addr0 = mtt->buf_dma;
+		*cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
+				   ERDMA_MR_INDIRECT_MTT);
+	} else {
+		*addr0 = mtt->buf[0];
+		memcpy(addr1, mtt->buf + 1, MTT_SIZE(mem->mtt_nents - 1));
+		*cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
+				   ERDMA_MR_INLINE_MTT);
+	}
+}
+
 static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp)
 {
 	struct erdma_dev *dev = to_edev(qp->ibqp.device);
@@ -79,18 +96,16 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp)
 
 		req.sq_mtt_cfg = user_qp->sq_mem.page_offset;
 		req.sq_mtt_cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK,
-					     user_qp->sq_mem.mtt_nents) |
-				  FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
-					     user_qp->sq_mem.mtt_type);
+					     user_qp->sq_mem.mtt_nents);
 
 		req.rq_mtt_cfg = user_qp->rq_mem.page_offset;
 		req.rq_mtt_cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK,
-					     user_qp->rq_mem.mtt_nents) |
-				  FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
-					     user_qp->rq_mem.mtt_type);
+					     user_qp->rq_mem.mtt_nents);
 
-		req.sq_buf_addr = user_qp->sq_mem.mtt_entry[0];
-		req.rq_buf_addr = user_qp->rq_mem.mtt_entry[0];
+		assemble_qbuf_mtt_for_cmd(&user_qp->sq_mem, &req.sq_mtt_cfg,
+					  &req.sq_buf_addr, req.sq_mtt_entry);
+		assemble_qbuf_mtt_for_cmd(&user_qp->rq_mem, &req.rq_mtt_cfg,
+					  &req.rq_buf_addr, req.rq_mtt_entry);
 
 		req.sq_db_info_dma_addr = user_qp->sq_db_info_dma_addr;
 		req.rq_db_info_dma_addr = user_qp->rq_db_info_dma_addr;
@@ -117,13 +132,22 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp)
 
 static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr)
 {
-	struct erdma_cmdq_reg_mr_req req;
 	struct erdma_pd *pd = to_epd(mr->ibmr.pd);
-	u64 *phy_addr;
-	int i;
+	struct erdma_cmdq_reg_mr_req req;
+	u32 mtt_level;
 
 	erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, CMDQ_OPCODE_REG_MR);
 
+	if (mr->type == ERDMA_MR_TYPE_FRMR ||
+	    mr->mem.page_cnt > ERDMA_MAX_INLINE_MTT_ENTRIES) {
+		req.phy_addr[0] = mr->mem.mtt->buf_dma;
+		mtt_level = ERDMA_MR_INDIRECT_MTT;
+	} else {
+		memcpy(req.phy_addr, mr->mem.mtt->buf,
+		       MTT_SIZE(mr->mem.page_cnt));
+		mtt_level = ERDMA_MR_INLINE_MTT;
+	}
+
 	req.cfg0 = FIELD_PREP(ERDMA_CMD_MR_VALID_MASK, mr->valid) |
 		   FIELD_PREP(ERDMA_CMD_MR_KEY_MASK, mr->ibmr.lkey & 0xFF) |
 		   FIELD_PREP(ERDMA_CMD_MR_MPT_IDX_MASK, mr->ibmr.lkey >> 8);
@@ -132,7 +156,7 @@ static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr)
 		   FIELD_PREP(ERDMA_CMD_REGMR_RIGHT_MASK, mr->access);
 	req.cfg2 = FIELD_PREP(ERDMA_CMD_REGMR_PAGESIZE_MASK,
 			      ilog2(mr->mem.page_size)) |
-		   FIELD_PREP(ERDMA_CMD_REGMR_MTT_TYPE_MASK, mr->mem.mtt_type) |
+		   FIELD_PREP(ERDMA_CMD_REGMR_MTT_LEVEL_MASK, mtt_level) |
		   FIELD_PREP(ERDMA_CMD_REGMR_MTT_CNT_MASK, mr->mem.page_cnt);
 
 	if (mr->type == ERDMA_MR_TYPE_DMA)
@@ -143,16 +167,6 @@ static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr)
 		req.size = mr->mem.len;
 	}
 
-	if (mr->type == ERDMA_MR_TYPE_FRMR ||
-	    mr->mem.mtt_type == ERDMA_MR_INDIRECT_MTT) {
-		phy_addr = req.phy_addr;
-		*phy_addr = mr->mem.mtt_entry[0];
-	} else {
-		phy_addr = req.phy_addr;
-		for (i = 0; i < mr->mem.mtt_nents; i++)
-			*phy_addr++ = mr->mem.mtt_entry[i];
-	}
-
 post_cmd:
 	return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL);
 }
@@ -179,7 +193,7 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq)
 		req.qbuf_addr_h = upper_32_bits(cq->kern_cq.qbuf_dma_addr);
 
 		req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK, 1) |
-			    FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK,
+			    FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_LEVEL_MASK,
				       ERDMA_MR_INLINE_MTT);
 
 		req.first_page_offset = 0;
@@ -191,16 +205,20 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq)
 			    FIELD_PREP(ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK,
				       ilog2(mem->page_size) - ERDMA_HW_PAGE_SHIFT);
 		if (mem->mtt_nents == 1) {
-			req.qbuf_addr_l = lower_32_bits(*(u64 *)mem->mtt_buf);
-			req.qbuf_addr_h = upper_32_bits(*(u64 *)mem->mtt_buf);
+			req.qbuf_addr_l = lower_32_bits(mem->mtt->buf[0]);
+			req.qbuf_addr_h = upper_32_bits(mem->mtt->buf[0]);
+			req.cfg1 |=
+				FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_LEVEL_MASK,
+					   ERDMA_MR_INLINE_MTT);
 		} else {
-			req.qbuf_addr_l = lower_32_bits(mem->mtt_entry[0]);
-			req.qbuf_addr_h = upper_32_bits(mem->mtt_entry[0]);
+			req.qbuf_addr_l = lower_32_bits(mem->mtt->buf_dma);
+			req.qbuf_addr_h = upper_32_bits(mem->mtt->buf_dma);
+			req.cfg1 |=
+				FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_LEVEL_MASK,
+					   ERDMA_MR_INDIRECT_MTT);
 		}
 		req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK,
				       mem->mtt_nents);
-		req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK,
-				       mem->mtt_type);
 
 		req.first_page_offset = mem->page_offset;
 		req.cq_db_info_addr = cq->user_cq.db_info_dma_addr;
@@ -508,12 +526,77 @@ err_out:
 	return -ENOMEM;
 }
 
+static void erdma_fill_bottom_mtt(struct erdma_dev *dev, struct erdma_mem *mem)
+{
+	struct erdma_mtt *mtt = mem->mtt;
+	struct ib_block_iter biter;
+	u32 idx = 0;
+
+	while (mtt->low_level)
+		mtt = mtt->low_level;
+
+	rdma_umem_for_each_dma_block(mem->umem, &biter, mem->page_size)
+		mtt->buf[idx++] = rdma_block_iter_dma_address(&biter);
+}
+
+static struct erdma_mtt *erdma_create_cont_mtt(struct erdma_dev *dev,
+					       size_t size)
+{
+	struct erdma_mtt *mtt;
+	int ret = -ENOMEM;
+
+	mtt = kzalloc(sizeof(*mtt), GFP_KERNEL);
+	if (!mtt)
+		return ERR_PTR(-ENOMEM);
+
+	mtt->size = size;
+	mtt->buf = kzalloc(mtt->size, GFP_KERNEL);
+	if (!mtt->buf)
+		goto err_free_mtt;
+
+	mtt->continuous = true;
+	mtt->buf_dma = dma_map_single(&dev->pdev->dev, mtt->buf, mtt->size,
+				      DMA_TO_DEVICE);
+	if (dma_mapping_error(&dev->pdev->dev, mtt->buf_dma))
+		goto err_free_mtt_buf;
+
+	return mtt;
+
+err_free_mtt_buf:
+	kfree(mtt->buf);
+
+err_free_mtt:
+	kfree(mtt);
+
+	return ERR_PTR(ret);
+}
+
+static struct erdma_mtt *erdma_create_mtt(struct erdma_dev *dev, size_t size,
+					  bool force_continuous)
+{
+	ibdev_dbg(&dev->ibdev, "create_mtt, size:%lu, force cont:%d\n", size,
+		  force_continuous);
+
+	if (force_continuous)
+		return erdma_create_cont_mtt(dev, size);
+
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static void erdma_destroy_mtt(struct erdma_dev *dev, struct erdma_mtt *mtt)
+{
+	if (mtt->continuous) {
+		dma_unmap_single(&dev->pdev->dev, mtt->buf_dma, mtt->size,
+				 DMA_TO_DEVICE);
+		kfree(mtt->buf);
+		kfree(mtt);
+	}
+}
+
 static int get_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem,
			    u64 start, u64 len, int access, u64 virt,
			    unsigned long req_page_size, u8 force_indirect_mtt)
 {
 	struct ib_block_iter biter;
-	uint64_t *phy_addr = NULL;
 	int ret = 0;
 
 	mem->umem = ib_umem_get(&dev->ibdev, start, len, access);
@@ -529,38 +612,13 @@ static int get_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem,
 	mem->page_offset = start & (mem->page_size - 1);
 	mem->mtt_nents = ib_umem_num_dma_blocks(mem->umem, mem->page_size);
 	mem->page_cnt = mem->mtt_nents;
-
-	if (mem->page_cnt > ERDMA_MAX_INLINE_MTT_ENTRIES ||
-	    force_indirect_mtt) {
-		mem->mtt_type = ERDMA_MR_INDIRECT_MTT;
-		mem->mtt_buf =
-			alloc_pages_exact(MTT_SIZE(mem->page_cnt), GFP_KERNEL);
-		if (!mem->mtt_buf) {
-			ret = -ENOMEM;
-			goto error_ret;
-		}
-		phy_addr = mem->mtt_buf;
-	} else {
-		mem->mtt_type = ERDMA_MR_INLINE_MTT;
-		phy_addr = mem->mtt_entry;
+	mem->mtt = erdma_create_mtt(dev, MTT_SIZE(mem->page_cnt), true);
+	if (IS_ERR(mem->mtt)) {
+		ret = PTR_ERR(mem->mtt);
+		goto error_ret;
 	}
 
-	rdma_umem_for_each_dma_block(mem->umem, &biter, mem->page_size) {
-		*phy_addr = rdma_block_iter_dma_address(&biter);
-		phy_addr++;
-	}
-
-	if (mem->mtt_type == ERDMA_MR_INDIRECT_MTT) {
-		mem->mtt_entry[0] =
-			dma_map_single(&dev->pdev->dev, mem->mtt_buf,
-				       MTT_SIZE(mem->page_cnt), DMA_TO_DEVICE);
-		if (dma_mapping_error(&dev->pdev->dev, mem->mtt_entry[0])) {
-			free_pages_exact(mem->mtt_buf, MTT_SIZE(mem->page_cnt));
-			mem->mtt_buf = NULL;
-			ret = -ENOMEM;
-			goto error_ret;
-		}
-	}
+	erdma_fill_bottom_mtt(dev, mem);
 
 	return 0;
@@ -575,11 +633,8 @@ error_ret:
 
 static void put_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem)
 {
-	if (mem->mtt_buf) {
-		dma_unmap_single(&dev->pdev->dev, mem->mtt_entry[0],
-				 MTT_SIZE(mem->page_cnt), DMA_TO_DEVICE);
-		free_pages_exact(mem->mtt_buf, MTT_SIZE(mem->page_cnt));
-	}
+	if (mem->mtt)
+		erdma_destroy_mtt(dev, mem->mtt);
 
 	if (mem->umem) {
 		ib_umem_release(mem->umem);
@@ -875,33 +930,20 @@ struct ib_mr *erdma_ib_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
 
 	mr->mem.page_size = PAGE_SIZE; /* update it later. */
 	mr->mem.page_cnt = max_num_sg;
-	mr->mem.mtt_type = ERDMA_MR_INDIRECT_MTT;
-	mr->mem.mtt_buf =
-		alloc_pages_exact(MTT_SIZE(mr->mem.page_cnt), GFP_KERNEL);
-	if (!mr->mem.mtt_buf) {
-		ret = -ENOMEM;
+	mr->mem.mtt = erdma_create_mtt(dev, MTT_SIZE(max_num_sg), true);
+	if (IS_ERR(mr->mem.mtt)) {
+		ret = PTR_ERR(mr->mem.mtt);
 		goto out_remove_stag;
 	}
 
-	mr->mem.mtt_entry[0] =
-		dma_map_single(&dev->pdev->dev, mr->mem.mtt_buf,
-			       MTT_SIZE(mr->mem.page_cnt), DMA_TO_DEVICE);
-	if (dma_mapping_error(&dev->pdev->dev, mr->mem.mtt_entry[0])) {
-		ret = -ENOMEM;
-		goto out_free_mtt;
-	}
-
 	ret = regmr_cmd(dev, mr);
 	if (ret)
-		goto out_dma_unmap;
+		goto out_destroy_mtt;
 
 	return &mr->ibmr;
 
-out_dma_unmap:
-	dma_unmap_single(&dev->pdev->dev, mr->mem.mtt_entry[0],
-			 MTT_SIZE(mr->mem.page_cnt), DMA_TO_DEVICE);
-out_free_mtt:
-	free_pages_exact(mr->mem.mtt_buf, MTT_SIZE(mr->mem.page_cnt));
+out_destroy_mtt:
+	erdma_destroy_mtt(dev, mr->mem.mtt);
 
 out_remove_stag:
 	erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX],
@@ -920,7 +962,7 @@ static int erdma_set_page(struct ib_mr *ibmr, u64 addr)
 	if (mr->mem.mtt_nents >= mr->mem.page_cnt)
 		return -1;
 
-	*((u64 *)mr->mem.mtt_buf + mr->mem.mtt_nents) = addr;
+	mr->mem.mtt->buf[mr->mem.mtt_nents] = addr;
 	mr->mem.mtt_nents++;
 
 	return 0;

drivers/infiniband/hw/erdma/erdma_verbs.h

@@ -65,7 +65,7 @@ struct erdma_pd {
  * MemoryRegion definition.
  */
 #define ERDMA_MAX_INLINE_MTT_ENTRIES 4
-#define MTT_SIZE(mtt_cnt) (mtt_cnt << 3) /* per mtt entry takes 8 Bytes. */
+#define MTT_SIZE(mtt_cnt) ((mtt_cnt) << 3) /* per mtt entry takes 8 Bytes. */
 #define ERDMA_MR_MAX_MTT_CNT 524288
 #define ERDMA_MTT_ENTRY_SIZE 8
@@ -90,10 +90,28 @@ static inline u8 to_erdma_access_flags(int access)
	       (access & IB_ACCESS_REMOTE_ATOMIC ? ERDMA_MR_ACC_RA : 0);
 }
 
+/* Hierarchical storage structure for MTT entries */
+struct erdma_mtt {
+	u64 *buf;
+	size_t size;
+
+	bool continuous;
+	union {
+		dma_addr_t buf_dma;
+		struct {
+			struct scatterlist *sglist;
+			u32 nsg;
+			u32 level;
+		};
+	};
+
+	struct erdma_mtt *low_level;
+};
+
 struct erdma_mem {
 	struct ib_umem *umem;
-	void *mtt_buf;
-	u32 mtt_type;
+	struct erdma_mtt *mtt;
+
 	u32 page_size;
 	u32 page_offset;
 	u32 page_cnt;
@@ -101,8 +119,6 @@ struct erdma_mem {
 
 	u64 va;
 	u64 len;
-
-	u64 mtt_entry[ERDMA_MAX_INLINE_MTT_ENTRIES];
 };
 
 struct erdma_mr {