RDMA/mlx5: Allow larger pages in DevX umem

The umem DMA list calculation was locked at 4k pages due to confusion
about how this API works, and how it is used, when larger pages are
present.

The conclusion is:

 - umems cannot extend past what is mapped into the process, so creating
   a large page size and referring to a sub-range of it is not allowed

 - umems must always have a page offset of zero, except for sub-PAGE_SIZE
   umems

 - The umem_offset feature for creating multiple objects inside a umem
   is buggy and isn't used anywhere, so we can assume all users of the
   current API have umem_offset == 0 as well

Provide a new page size calculator that limits the DMA list to the VA
range and enforces umem_offset == 0.

Allow user space to specify the page sizes it can accept; this bitmap
must be derived from the intended use of the umem, based on per-usage
HW limitations.
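
As a rough illustration of the userspace side (not part of this commit):
rdma-core exposes a matching extended registration call where the caller
supplies the acceptable page sizes. The sketch below assumes the
mlx5dv_devx_umem_reg_ex() / struct mlx5dv_devx_umem_in interface of a
sufficiently new rdma-core; bit k set in pgsz_bitmap means a page size
of 1 << k bytes is acceptable, so the bitmap is simply an OR of the
acceptable page-size values.

#include <infiniband/mlx5dv.h>

/* Minimal sketch: register a DevX umem accepting only 4k and 2M page
 * sizes, e.g. because the object to be created on top of it needs 2M
 * alignment when large pages are used. Error handling omitted.
 */
static struct mlx5dv_devx_umem *reg_umem_4k_2m(struct ibv_context *ctx,
					       void *buf, size_t len)
{
	struct mlx5dv_devx_umem_in in = {
		.addr = buf,
		.size = len,
		.access = IBV_ACCESS_LOCAL_WRITE,
		/* bit k set => page size 1ULL << k is acceptable */
		.pgsz_bitmap = (1ULL << 12) | (1ULL << 21),
	};

	return mlx5dv_devx_umem_reg_ex(ctx, &in);
}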

Link: https://lore.kernel.org/r/20210304130501.1102577-4-leon@kernel.org
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>

--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -2185,27 +2185,69 @@ static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
 	return 0;
 }
 
+static unsigned int devx_umem_find_best_pgsize(struct ib_umem *umem,
+					       unsigned long pgsz_bitmap)
+{
+	unsigned long page_size;
+
+	/* Don't bother checking larger page sizes as offset must be zero and
+	 * total DEVX umem length must be equal to total umem length.
+	 */
+	pgsz_bitmap &= GENMASK_ULL(max_t(u64, order_base_2(umem->length),
+					 PAGE_SHIFT),
+				   MLX5_ADAPTER_PAGE_SHIFT);
+	if (!pgsz_bitmap)
+		return 0;
+
+	page_size = ib_umem_find_best_pgoff(umem, pgsz_bitmap, U64_MAX);
+	if (!page_size)
+		return 0;
+
+	/* If the page_size is less than the CPU page size then we can use the
+	 * offset and create a umem which is a subset of the page list.
+	 * For larger page sizes we can't be sure the DMA list reflects the
+	 * VA so we must ensure that the umem extent is exactly equal to the
+	 * page list. Reduce the page size until one of these cases is true.
+	 */
+	while ((ib_umem_dma_offset(umem, page_size) != 0 ||
+		(umem->length % page_size) != 0) &&
+		page_size > PAGE_SIZE)
+		page_size /= 2;
+
+	return page_size;
+}
+
 static int devx_umem_reg_cmd_alloc(struct mlx5_ib_dev *dev,
 				   struct uverbs_attr_bundle *attrs,
 				   struct devx_umem *obj,
 				   struct devx_umem_reg_cmd *cmd)
 {
+	unsigned long pgsz_bitmap;
 	unsigned int page_size;
 	__be64 *mtt;
 	void *umem;
+	int ret;
 
 	/*
-	 * We don't know what the user intends to use this umem for, but the HW
-	 * restrictions must be met. MR, doorbell records, QP, WQ and CQ all
-	 * have different requirements. Since we have no idea how to sort this
-	 * out, only support PAGE_SIZE with the expectation that userspace will
-	 * provide the necessary alignments inside the known PAGE_SIZE and that
-	 * FW will check everything.
+	 * If the user does not pass in pgsz_bitmap then the user promises not
+	 * to use umem_offset!=0 in any commands that allocate on top of the
+	 * umem.
+	 *
+	 * If the user wants to use a umem_offset then it must pass in
+	 * pgsz_bitmap which guides the maximum page size and thus maximum
+	 * object alignment inside the umem. See the PRM.
+	 *
+	 * Users are not allowed to use IOVA here, mkeys are not supported on
+	 * umem.
 	 */
-	page_size = ib_umem_find_best_pgoff(
-		obj->umem, PAGE_SIZE,
-		__mlx5_page_offset_to_bitmask(__mlx5_bit_sz(umem, page_offset),
-					      0));
+	ret = uverbs_get_const_default(&pgsz_bitmap, attrs,
+		MLX5_IB_ATTR_DEVX_UMEM_REG_PGSZ_BITMAP,
+		GENMASK_ULL(63,
+			    min(PAGE_SHIFT, MLX5_ADAPTER_PAGE_SHIFT)));
+	if (ret)
+		return ret;
+
+	page_size = devx_umem_find_best_pgsize(obj->umem, pgsz_bitmap);
 	if (!page_size)
 		return -EINVAL;
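
To see what the reduction loop in devx_umem_find_best_pgsize() does,
here is a standalone userspace model of it; dma_off() stands in for
ib_umem_dma_offset() and a 4k CPU page size is assumed, so this is a
sketch of the rule rather than kernel code:

#include <stdio.h>

#define CPU_PAGE_SIZE 4096UL

/* Offset of the umem start within a page of the given size */
static unsigned long dma_off(unsigned long va, unsigned long page_size)
{
	return va & (page_size - 1);
}

/* Shrink page_size until the umem starts page-aligned and its length
 * is a whole number of pages, or until the CPU page size is reached.
 */
static unsigned long reduce(unsigned long va, unsigned long len,
			    unsigned long page_size)
{
	while ((dma_off(va, page_size) != 0 || len % page_size != 0) &&
	       page_size > CPU_PAGE_SIZE)
		page_size /= 2;
	return page_size;
}

int main(void)
{
	/* 6M umem on a 2M boundary: 2M pages survive -> 2097152 */
	printf("%lu\n", reduce(0x200000UL, 6UL << 20, 1UL << 21));
	/* 6M + 4k umem: length is not a 2M multiple -> falls to 4096 */
	printf("%lu\n", reduce(0x200000UL, (6UL << 20) + 4096, 1UL << 21));
	return 0;
}
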
@@ -2791,6 +2833,8 @@ DECLARE_UVERBS_NAMED_METHOD(
 			   UA_MANDATORY),
 	UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS,
 			     enum ib_access_flags),
+	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_DEVX_UMEM_REG_PGSZ_BITMAP,
+			     u64),
 	UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_DEVX_UMEM_REG_OUT_ID,
 			    UVERBS_ATTR_TYPE(u32),
 			    UA_MANDATORY));
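
Note that the new attribute is deliberately not UA_MANDATORY: when it
is absent, the uverbs_get_const_default() call above substitutes
GENMASK_ULL(63, min(PAGE_SHIFT, MLX5_ADAPTER_PAGE_SHIFT)). On a 4k-page
kernel both shifts are 12, so the default bitmap is 0xfffffffffffff000
(bits 12..63), i.e. every power-of-two page size, which
devx_umem_find_best_pgsize() then clamps against the umem length and
PAGE_SHIFT.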

--- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h

@@ -154,6 +154,7 @@ enum mlx5_ib_devx_umem_reg_attrs {
 	MLX5_IB_ATTR_DEVX_UMEM_REG_LEN,
 	MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS,
 	MLX5_IB_ATTR_DEVX_UMEM_REG_OUT_ID,
+	MLX5_IB_ATTR_DEVX_UMEM_REG_PGSZ_BITMAP,
 };
 
 enum mlx5_ib_devx_umem_dereg_attrs {
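
On the "derived from the intended use" point: many PRM objects express
their buffer page size as a log2 relative to MLX5_ADAPTER_PAGE_SHIFT
(12), so a per-usage HW cap on that field translates directly into a
bitmap. A hypothetical userspace helper (name and shape invented for
illustration):

#include <stdint.h>

/* Build a pgsz_bitmap accepting every power-of-two page size from
 * 4k (MLX5_ADAPTER_PAGE_SHIFT) up to 1 << max_log_pgsz inclusive.
 */
static uint64_t pgsz_bitmap_up_to(unsigned int max_log_pgsz)
{
	uint64_t high = max_log_pgsz >= 63 ?
		UINT64_MAX : (1ULL << (max_log_pgsz + 1)) - 1;

	return high & ~((1ULL << 12) - 1);	/* drop sub-4k bits */
}

/* e.g. pgsz_bitmap_up_to(21) sets bits 12..21: 4k through 2M */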