IB/mlx5: Implement on demand paging by adding support for MMU notifiers
* Implement the relevant invalidation functions (zap MTTs as needed).
* Implement interlocking (and rollback in the page fault handlers) for
  cases of a racing notifier and fault.
* With this patch we can now enable the capability bits for supporting RC
  send/receive/RDMA read/RDMA write, and UD send.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
commit b4cfe447d4 (parent eab668a6d0)
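The interlocking mentioned in the second bullet is a sequence-count scheme: the fault handler samples the umem's notifier sequence number, pins pages outside the lock, and only writes the new MTTs if no invalidation started in the meantime; otherwise it rolls back and retries with -EAGAIN. Below is a minimal user-space sketch of that pattern only; the names (struct odp_state, fault_install, invalidate) are hypothetical and are not the driver's structures, and the notifier start/end steps are collapsed into one call.

/*
 * Illustrative sketch of the notifier/fault interlock; hypothetical names,
 * not the mlx5 driver's actual structures.  Build with -pthread.
 */
#include <pthread.h>
#include <stdio.h>

struct odp_state {
        pthread_mutex_t lock;
        unsigned long notifiers_seq;    /* bumped when an invalidation runs */
        unsigned long notifiers_count;  /* invalidations currently running  */
        int mapped;                     /* stands in for the device MTTs    */
};

/* Invalidation side: bump the sequence, then zap the mappings. */
static void invalidate(struct odp_state *s)
{
        pthread_mutex_lock(&s->lock);
        s->notifiers_count++;
        s->notifiers_seq++;
        s->mapped = 0;                  /* zap the hardware mappings */
        s->notifiers_count--;
        pthread_mutex_unlock(&s->lock);
}

/* Fault side: sample the sequence, do the expensive work, then recheck. */
static int fault_install(struct odp_state *s)
{
        unsigned long seq;

        pthread_mutex_lock(&s->lock);
        seq = s->notifiers_seq;
        pthread_mutex_unlock(&s->lock);

        /* ...pin pages / prepare MTTs outside the lock (elided)... */

        pthread_mutex_lock(&s->lock);
        if (s->notifiers_count || s->notifiers_seq != seq) {
                /* A racing invalidation ran: roll back and retry later. */
                pthread_mutex_unlock(&s->lock);
                return -1;              /* caller treats this like -EAGAIN */
        }
        s->mapped = 1;                  /* commit the new mappings */
        pthread_mutex_unlock(&s->lock);
        return 0;
}

int main(void)
{
        struct odp_state s = { .lock = PTHREAD_MUTEX_INITIALIZER };

        printf("fault: %d\n", fault_install(&s));
        invalidate(&s);
        printf("mapped after invalidate: %d\n", s.mapped);
        return 0;
}

In the driver itself the recheck is done by ib_umem_mmu_notifier_retry() under umem_mutex, as the odp.c hunks below show.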
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -574,6 +574,10 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
                         goto out_count;
         }
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+        context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
+#endif
+
         INIT_LIST_HEAD(&context->db_page_list);
         mutex_init(&context->db_page_mutex);
 
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -325,6 +325,7 @@ struct mlx5_ib_mr {
         struct mlx5_ib_dev *dev;
         struct mlx5_create_mkey_mbox_out out;
         struct mlx5_core_sig_ctx *sig;
+        int live;
 };
 
 struct mlx5_ib_fast_reg_page_list {
@@ -629,6 +630,8 @@ int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
 void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp);
 void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp);
+void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
+                              unsigned long end);
 
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 static inline int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -37,6 +37,7 @@
 #include <linux/export.h>
 #include <linux/delay.h>
 #include <rdma/ib_umem.h>
+#include <rdma/ib_umem_odp.h>
 #include <rdma/ib_verbs.h>
 #include "mlx5_ib.h"
 
@@ -54,6 +55,18 @@ static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex);
 
 static int clean_mr(struct mlx5_ib_mr *mr);
 
+static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
+{
+        int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+        /* Wait until all page fault handlers using the mr complete. */
+        synchronize_srcu(&dev->mr_srcu);
+#endif
+
+        return err;
+}
+
 static int order2idx(struct mlx5_ib_dev *dev, int order)
 {
         struct mlx5_mr_cache *cache = &dev->cache;
@@ -191,7 +204,7 @@ static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
                 ent->cur--;
                 ent->size--;
                 spin_unlock_irq(&ent->lock);
-                err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+                err = destroy_mkey(dev, mr);
                 if (err)
                         mlx5_ib_warn(dev, "failed destroy mkey\n");
                 else
@@ -482,7 +495,7 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c)
                 ent->cur--;
                 ent->size--;
                 spin_unlock_irq(&ent->lock);
-                err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+                err = destroy_mkey(dev, mr);
                 if (err)
                         mlx5_ib_warn(dev, "failed destroy mkey\n");
                 else
@@ -812,6 +825,8 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
         mr->mmr.size = len;
         mr->mmr.pd = to_mpd(pd)->pdn;
 
+        mr->live = 1;
+
 unmap_dma:
         up(&umrc->sem);
         dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
@@ -997,6 +1012,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
                 goto err_2;
         }
         mr->umem = umem;
+        mr->live = 1;
         kvfree(in);
 
         mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key);
@@ -1074,10 +1090,47 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
         mr->ibmr.lkey = mr->mmr.key;
         mr->ibmr.rkey = mr->mmr.key;
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+        if (umem->odp_data) {
+                /*
+                 * This barrier prevents the compiler from moving the
+                 * setting of umem->odp_data->private to point to our
+                 * MR, before reg_umr finished, to ensure that the MR
+                 * initialization have finished before starting to
+                 * handle invalidations.
+                 */
+                smp_wmb();
+                mr->umem->odp_data->private = mr;
+                /*
+                 * Make sure we will see the new
+                 * umem->odp_data->private value in the invalidation
+                 * routines, before we can get page faults on the
+                 * MR. Page faults can happen once we put the MR in
+                 * the tree, below this line. Without the barrier,
+                 * there can be a fault handling and an invalidation
+                 * before umem->odp_data->private == mr is visible to
+                 * the invalidation handler.
+                 */
+                smp_wmb();
+        }
+#endif
+
         return &mr->ibmr;
 
 error:
+        /*
+         * Destroy the umem *before* destroying the MR, to ensure we
+         * will not have any in-flight notifiers when destroying the
+         * MR.
+         *
+         * As the MR is completely invalid to begin with, and this
+         * error path is only taken if we can't push the mr entry into
+         * the pagefault tree, this is safe.
+         */
+
         ib_umem_release(umem);
+        /* Kill the MR, and return an error code. */
+        clean_mr(mr);
         return ERR_PTR(err);
 }
 
@@ -1121,7 +1174,7 @@ static int clean_mr(struct mlx5_ib_mr *mr)
         int err;
 
         if (!umred) {
-                err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+                err = destroy_mkey(dev, mr);
                 if (err) {
                         mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
                                      mr->mmr.key, err);
@@ -1150,9 +1203,25 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
         struct ib_umem *umem = mr->umem;
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-        if (umem)
+        if (umem && umem->odp_data) {
+                /* Prevent new page faults from succeeding */
+                mr->live = 0;
                 /* Wait for all running page-fault handlers to finish. */
                 synchronize_srcu(&dev->mr_srcu);
+                /* Destroy all page mappings */
+                mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
+                                         ib_umem_end(umem));
+                /*
+                 * We kill the umem before the MR for ODP,
+                 * so that there will not be any invalidations in
+                 * flight, looking at the *mr struct.
+                 */
+                ib_umem_release(umem);
+                atomic_sub(npages, &dev->mdev->priv.reg_pages);
+
+                /* Avoid double-freeing the umem. */
+                umem = NULL;
+        }
 #endif
 
         clean_mr(mr);
@@ -1269,7 +1338,7 @@ int mlx5_ib_destroy_mr(struct ib_mr *ibmr)
                 kfree(mr->sig);
         }
 
-        err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+        err = destroy_mkey(dev, mr);
         if (err) {
                 mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
                              mr->mmr.key, err);
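The teardown ordering added to mlx5_ib_dereg_mr() and destroy_mkey() above is the usual mark-dead/wait/free sequence: clear mr->live so new faults fail, synchronize_srcu(&dev->mr_srcu) so fault handlers already running can drain, and only then zap the page mappings and release the umem. Here is a rough user-space sketch of that ordering, emulating the SRCU grace period with a pthread read-write lock; fake_mr, fault_handler and dereg_sketch are hypothetical names, not driver code.

/*
 * Sketch: readers look up the MR under a read lock and skip dead entries;
 * teardown marks the MR dead, waits out the readers, then frees.
 * The grace period is emulated with pthread_rwlock.  Build with -pthread.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_mr {
        int live;       /* mirrors mr->live in the patch        */
        int *mappings;  /* stands in for the device page table  */
};

static pthread_rwlock_t mr_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Page-fault side: only touch MRs that are still live. */
static int fault_handler(struct fake_mr *mr)
{
        int ret = -1;

        pthread_rwlock_rdlock(&mr_lock);     /* ~ srcu_read_lock()   */
        if (mr->live)
                ret = 0;                     /* safe to use mr->mappings */
        pthread_rwlock_unlock(&mr_lock);     /* ~ srcu_read_unlock() */
        return ret;
}

/* Deregistration side: mark dead, wait for readers, then tear down. */
static void dereg_sketch(struct fake_mr *mr)
{
        mr->live = 0;                        /* new faults now fail          */
        pthread_rwlock_wrlock(&mr_lock);     /* ~ synchronize_srcu(): waits  */
        pthread_rwlock_unlock(&mr_lock);     /*   for current readers        */
        free(mr->mappings);                  /* safe: no handler sees the MR */
        mr->mappings = NULL;
}

int main(void)
{
        struct fake_mr mr = { .live = 1, .mappings = calloc(4, sizeof(int)) };

        printf("fault while live: %d\n", fault_handler(&mr));
        dereg_sketch(&mr);
        printf("fault after dereg: %d\n", fault_handler(&mr));
        return 0;
}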
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -37,8 +37,78 @@
 
 #define MAX_PREFETCH_LEN (4*1024*1024U)
 
+/* Timeout in ms to wait for an active mmu notifier to complete when handling
+ * a pagefault. */
+#define MMU_NOTIFIER_TIMEOUT 1000
+
 struct workqueue_struct *mlx5_ib_page_fault_wq;
 
+void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
+                              unsigned long end)
+{
+        struct mlx5_ib_mr *mr;
+        const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1;
+        u64 idx = 0, blk_start_idx = 0;
+        int in_block = 0;
+        u64 addr;
+
+        if (!umem || !umem->odp_data) {
+                pr_err("invalidation called on NULL umem or non-ODP umem\n");
+                return;
+        }
+
+        mr = umem->odp_data->private;
+
+        if (!mr || !mr->ibmr.pd)
+                return;
+
+        start = max_t(u64, ib_umem_start(umem), start);
+        end = min_t(u64, ib_umem_end(umem), end);
+
+        /*
+         * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
+         * while we are doing the invalidation, no page fault will attempt to
+         * overwrite the same MTTs. Concurent invalidations might race us,
+         * but they will write 0s as well, so no difference in the end result.
+         */
+
+        for (addr = start; addr < end; addr += (u64)umem->page_size) {
+                idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
+                /*
+                 * Strive to write the MTTs in chunks, but avoid overwriting
+                 * non-existing MTTs. The huristic here can be improved to
+                 * estimate the cost of another UMR vs. the cost of bigger
+                 * UMR.
+                 */
+                if (umem->odp_data->dma_list[idx] &
+                    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
+                        if (!in_block) {
+                                blk_start_idx = idx;
+                                in_block = 1;
+                        }
+                } else {
+                        u64 umr_offset = idx & umr_block_mask;
+
+                        if (in_block && umr_offset == 0) {
+                                mlx5_ib_update_mtt(mr, blk_start_idx,
+                                                   idx - blk_start_idx, 1);
+                                in_block = 0;
+                        }
+                }
+        }
+        if (in_block)
+                mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1,
+                                   1);
+
+        /*
+         * We are now sure that the device will not access the
+         * memory. We can safely unmap it, and mark it as dirty if
+         * needed.
+         */
+
+        ib_umem_odp_unmap_dma_pages(umem, start, end);
+}
+
 #define COPY_ODP_BIT_MLX_TO_IB(reg, ib_caps, field_name, bit_name) do {        \
         if (be32_to_cpu(reg.field_name) & MLX5_ODP_SUPPORT_##bit_name)        \
                 ib_caps->field_name |= IB_ODP_SUPPORT_##bit_name;        \
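mlx5_ib_invalidate_range() above zaps MTTs in runs rather than page by page: it accumulates contiguous mapped entries into a block and flushes it only when it reaches an unmapped entry that falls on a UMR block boundary, so small holes get zapped along with the surrounding run instead of costing an extra UMR. The following stand-alone sketch shows that run-length pass over a presence array; update_mtt_stub, zap_range and BLOCK_SIZE are hypothetical stand-ins for mlx5_ib_update_mtt() and the MLX5_UMR_MTT_ALIGNMENT-derived block mask.

/* Sketch of the chunked zap: walk a presence array, batching contiguous
 * mapped entries into (start, count) updates, flushing at block-aligned
 * gaps.  update_mtt_stub() stands in for the hardware MTT update. */
#include <stdio.h>

#define BLOCK_SIZE 4U   /* stand-in for MLX5_UMR_MTT_ALIGNMENT / sizeof(u64) */

static void update_mtt_stub(unsigned long start, unsigned long count)
{
        printf("zap MTTs [%lu, %lu)\n", start, start + count);
}

static void zap_range(const int *present, unsigned long npages)
{
        const unsigned long block_mask = BLOCK_SIZE - 1;
        unsigned long idx, blk_start = 0;
        int in_block = 0;

        for (idx = 0; idx < npages; idx++) {
                if (present[idx]) {
                        if (!in_block) {
                                blk_start = idx;   /* start a new run */
                                in_block = 1;
                        }
                } else if (in_block && (idx & block_mask) == 0) {
                        /* unmapped entry on a block boundary: flush the run */
                        update_mtt_stub(blk_start, idx - blk_start);
                        in_block = 0;
                }
        }
        if (in_block)   /* flush the trailing run */
                update_mtt_stub(blk_start, idx - blk_start);
}

int main(void)
{
        int present[12] = { 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0 };

        zap_range(present, 12);   /* prints [0, 4) and [5, 12) */
        return 0;
}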
@@ -59,9 +129,18 @@ int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
         if (err)
                 goto out;
 
-        /* At this point we would copy the capability bits that the driver
-         * supports from the hw_caps struct to the caps struct. However, no
-         * such capabilities are supported so far. */
+        caps->general_caps = IB_ODP_SUPPORT;
+        COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.ud_odp_caps,
+                               SEND);
+        COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+                               SEND);
+        COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+                               RECV);
+        COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+                               WRITE);
+        COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+                               READ);
+
 out:
         return err;
 }
@@ -71,8 +150,9 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
 {
         u32 base_key = mlx5_base_mkey(key);
         struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key);
+        struct mlx5_ib_mr *mr = container_of(mmr, struct mlx5_ib_mr, mmr);
 
-        if (!mmr || mmr->key != key)
+        if (!mmr || mmr->key != key || !mr->live)
                 return NULL;
 
         return container_of(mmr, struct mlx5_ib_mr, mmr);
@@ -143,6 +223,11 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
         }
 
         current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq);
+        /*
+         * Ensure the sequence number is valid for some time before we call
+         * gup.
+         */
+        smp_rmb();
 
         /*
          * Avoid branches - this code will perform correctly
@@ -165,15 +250,20 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
 
         if (npages > 0) {
                 mutex_lock(&mr->umem->odp_data->umem_mutex);
-                /*
-                 * No need to check whether the MTTs really belong to
-                 * this MR, since ib_umem_odp_map_dma_pages already
-                 * checks this.
-                 */
-                ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0);
+                if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
+                        /*
+                         * No need to check whether the MTTs really belong to
+                         * this MR, since ib_umem_odp_map_dma_pages already
+                         * checks this.
+                         */
+                        ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0);
+                } else {
+                        ret = -EAGAIN;
+                }
                 mutex_unlock(&mr->umem->odp_data->umem_mutex);
                 if (ret < 0) {
-                        pr_err("Failed to update mkey page tables\n");
+                        if (ret != -EAGAIN)
+                                pr_err("Failed to update mkey page tables\n");
                         goto srcu_unlock;
                 }
 
@@ -185,6 +275,22 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
         }
 
 srcu_unlock:
+        if (ret == -EAGAIN) {
+                if (!mr->umem->odp_data->dying) {
+                        struct ib_umem_odp *odp_data = mr->umem->odp_data;
+                        unsigned long timeout =
+                                msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
+
+                        if (!wait_for_completion_timeout(
+                                        &odp_data->notifier_completion,
+                                        timeout)) {
+                                pr_warn("timeout waiting for mmu notifier completion\n");
+                        }
+                } else {
+                        /* The MR is being killed, kill the QP as well. */
+                        ret = -EFAULT;
+                }
+        }
         srcu_read_unlock(&mib_dev->mr_srcu, srcu_key);
         pfault->mpfault.bytes_committed = 0;
         return ret ? ret : npages;