/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
#include <linux/kernel.h>

#include "mlx5_ib.h"
#include "cmd.h"

#include <linux/mlx5/eq.h>

/* Contains the details of a pagefault. */
struct mlx5_pagefault {
	u32			bytes_committed;
	u32			token;
	u8			event_subtype;
	u8			type;
	union {
		/* Initiator or send message responder pagefault details. */
		struct {
			/* Received packet size, only valid for responders. */
			u32	packet_size;
			/*
			 * Number of resource holding WQE, depends on type.
			 */
			u32	wq_num;
			/*
			 * WQE index. Refers to either the send queue or
			 * receive queue, according to event_subtype.
			 */
			u16	wqe_index;
		} wqe;
		/* RDMA responder pagefault details */
		struct {
			u32	r_key;
			/*
			 * Received packet size, minimal size page fault
			 * resolution required for forward progress.
			 */
			u32	packet_size;
			u32	rdma_op_len;
			u64	rdma_va;
		} rdma;
	};

	struct mlx5_ib_pf_eq	*eq;
	struct work_struct	work;
};

#define MAX_PREFETCH_LEN (4*1024*1024U)

/* Timeout in ms to wait for an active mmu notifier to complete when handling
 * a pagefault. */
#define MMU_NOTIFIER_TIMEOUT 1000

#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT)
#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT)
#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS)
#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT)
#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1))

#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT

static u64 mlx5_imr_ksm_entries;

void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
			   struct mlx5_ib_mr *imr, int flags)
{
	struct mlx5_klm *end = pklm + nentries;

	if (flags & MLX5_IB_UPD_XLT_ZAP) {
		for (; pklm != end; pklm++, idx++) {
			pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
			pklm->key = cpu_to_be32(imr->dev->null_mkey);
			pklm->va = 0;
		}
		return;
	}

	/*
	 * The locking here is pretty subtle. Ideally the implicit_children
	 * xarray would be protected by the umem_mutex, however that is not
	 * possible. Instead this uses a weaker update-then-lock pattern:
	 *
	 *  srcu_read_lock()
	 *    xa_store()
	 *    mutex_lock(umem_mutex)
	 *     mlx5_ib_update_xlt()
	 *    mutex_unlock(umem_mutex)
	 *    destroy lkey
	 *
	 * i.e. any change to the xarray must be followed by the locked
	 * update_xlt before destroying.
	 *
	 * The umem_mutex provides the acquire/release semantic needed to make
	 * the xa_store() visible to a racing thread. While SRCU is not
	 * technically required, using it gives consistent use of the SRCU
	 * locking around the xarray.
	 */
	lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);
	lockdep_assert_held(&imr->dev->odp_srcu);

	for (; pklm != end; pklm++, idx++) {
		struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);

		pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
		if (mtt) {
			pklm->key = cpu_to_be32(mtt->ibmr.lkey);
			pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE);
		} else {
			pklm->key = cpu_to_be32(imr->dev->null_mkey);
			pklm->va = 0;
		}
	}
}
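
/*
 * Fence all DMA for an ODP MR: invalidate the mkey so the HW stops using it,
 * unmap and release every page currently mapped for this umem, and, for MRs
 * that did not come from the MR cache, destroy the mkey outright.
 */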
static void dma_fence_odp_mr(struct mlx5_ib_mr *mr)
{
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);

	/* Ensure mlx5_ib_invalidate_range() will not touch the MR any more */
	mutex_lock(&odp->umem_mutex);
	if (odp->npages) {
		mlx5_mr_cache_invalidate(mr);
		ib_umem_odp_unmap_dma_pages(odp, ib_umem_start(odp),
					    ib_umem_end(odp));
		WARN_ON(odp->npages);
	}
	odp->private = NULL;
	mutex_unlock(&odp->umem_mutex);

	if (!mr->allocated_from_cache) {
		mlx5_core_destroy_mkey(mr->dev->mdev, &mr->mmkey);
		WARN_ON(mr->descs);
	}
}

/*
 * This must be called after the mr has been removed from implicit_children
 * and the SRCU synchronized. NOTE: The MR does not necessarily have to be
 * empty here, parallel page faults could have raced with the free process and
 * added pages to it.
 */
static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt)
{
	struct mlx5_ib_mr *imr = mr->parent;
	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
	int srcu_key;

	/* implicit_child_mr's are not allowed to have deferred work */
	WARN_ON(atomic_read(&mr->num_deferred_work));

	if (need_imr_xlt) {
		srcu_key = srcu_read_lock(&mr->dev->odp_srcu);
		mutex_lock(&odp_imr->umem_mutex);
		mlx5_ib_update_xlt(mr->parent, idx, 1, 0,
				   MLX5_IB_UPD_XLT_INDIRECT |
				   MLX5_IB_UPD_XLT_ATOMIC);
		mutex_unlock(&odp_imr->umem_mutex);
		srcu_read_unlock(&mr->dev->odp_srcu, srcu_key);
	}

	dma_fence_odp_mr(mr);

	mr->parent = NULL;
	mlx5_mr_cache_free(mr->dev, mr);
	ib_umem_odp_release(odp);
	atomic_dec(&imr->num_deferred_work);
}

static void free_implicit_child_mr_work(struct work_struct *work)
{
	struct mlx5_ib_mr *mr =
		container_of(work, struct mlx5_ib_mr, odp_destroy.work);

	free_implicit_child_mr(mr, true);
}

static void free_implicit_child_mr_rcu(struct rcu_head *head)
{
	struct mlx5_ib_mr *mr =
		container_of(head, struct mlx5_ib_mr, odp_destroy.rcu);

	/* Freeing a MR is a sleeping operation, so bounce to a work queue */
	INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
	queue_work(system_unbound_wq, &mr->odp_destroy.work);
}
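
/*
 * Called when an implicit child MR has had all of its pages invalidated.
 * Detach it from the parent's implicit_children xarray (unless a concurrent
 * mlx5_ib_free_implicit_mr() got there first) and defer the actual free to
 * after an SRCU grace period, since page-fault readers may still be using it
 * under odp_srcu.
 */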
static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
{
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
	struct mlx5_ib_mr *imr = mr->parent;

	xa_lock(&imr->implicit_children);
	/*
	 * This can race with mlx5_ib_free_implicit_mr(), the first one to
	 * reach the xa lock wins the race and destroys the MR.
	 */
	if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_ATOMIC) !=
	    mr)
		goto out_unlock;

	atomic_inc(&imr->num_deferred_work);
	call_srcu(&mr->dev->odp_srcu, &mr->odp_destroy.rcu,
		  free_implicit_child_mr_rcu);

out_unlock:
	xa_unlock(&imr->implicit_children);
}
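
/*
 * mmu_interval_notifier invalidate callback: zap the HW MTTs covering the
 * invalidated range in chunks, unmap the DMA pages, and tear down an implicit
 * child MR that has become completely empty.
 */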
static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
				     const struct mmu_notifier_range *range,
				     unsigned long cur_seq)
{
	struct ib_umem_odp *umem_odp =
		container_of(mni, struct ib_umem_odp, notifier);
	struct mlx5_ib_mr *mr;
	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
				    sizeof(struct mlx5_mtt)) - 1;
	u64 idx = 0, blk_start_idx = 0;
	u64 invalidations = 0;
	unsigned long start;
	unsigned long end;
	int in_block = 0;
	u64 addr;

	if (!mmu_notifier_range_blockable(range))
		return false;

	mutex_lock(&umem_odp->umem_mutex);
	mmu_interval_set_seq(mni, cur_seq);
	/*
	 * If npages is zero then umem_odp->private may not be setup yet. This
	 * does not complete until after the first page is mapped for DMA.
	 */
	if (!umem_odp->npages)
		goto out;
	mr = umem_odp->private;

	start = max_t(u64, ib_umem_start(umem_odp), range->start);
	end = min_t(u64, ib_umem_end(umem_odp), range->end);

	/*
	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
	 * while we are doing the invalidation, no page fault will attempt to
	 * overwrite the same MTTs. Concurrent invalidations might race us,
	 * but they will write 0s as well, so no difference in the end result.
	 */
	for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) {
		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
		/*
		 * Strive to write the MTTs in chunks, but avoid overwriting
		 * non-existing MTTs. The heuristic here can be improved to
		 * estimate the cost of another UMR vs. the cost of bigger
		 * UMR.
		 */
		if (umem_odp->dma_list[idx] &
		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
			if (!in_block) {
				blk_start_idx = idx;
				in_block = 1;
			}

			/* Count page invalidations */
			invalidations += idx - blk_start_idx + 1;
		} else {
			u64 umr_offset = idx & umr_block_mask;

			if (in_block && umr_offset == 0) {
				mlx5_ib_update_xlt(mr, blk_start_idx,
						   idx - blk_start_idx, 0,
						   MLX5_IB_UPD_XLT_ZAP |
						   MLX5_IB_UPD_XLT_ATOMIC);
				in_block = 0;
			}
		}
	}
	if (in_block)
		mlx5_ib_update_xlt(mr, blk_start_idx,
				   idx - blk_start_idx + 1, 0,
				   MLX5_IB_UPD_XLT_ZAP |
				   MLX5_IB_UPD_XLT_ATOMIC);

	mlx5_update_odp_stats(mr, invalidations, invalidations);

	/*
	 * We are now sure that the device will not access the
	 * memory. We can safely unmap it, and mark it as dirty if
	 * needed.
	 */
	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);

	if (unlikely(!umem_odp->npages && mr->parent))
		destroy_unused_implicit_child_mr(mr);
out:
	mutex_unlock(&umem_odp->umem_mutex);
	return true;
}

const struct mmu_interval_notifier_ops mlx5_mn_ops = {
	.invalidate = mlx5_ib_invalidate_range,
};

void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
	struct ib_odp_caps *caps = &dev->odp_caps;

	memset(caps, 0, sizeof(*caps));

	if (!MLX5_CAP_GEN(dev->mdev, pg) ||
	    !mlx5_ib_can_use_umr(dev, true))
		return;

	caps->general_caps = IB_ODP_SUPPORT;

	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		dev->odp_max_size = U64_MAX;
	else
		dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT);

	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.srq_receive))
		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.srq_receive))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.send))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.receive))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.write))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.read))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.atomic))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.srq_receive))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;

	if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
	    MLX5_CAP_GEN(dev->mdev, null_mkey) &&
	    MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
	    !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled))
		caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
}

static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
				      struct mlx5_pagefault *pfault,
				      int error)
{
	int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
		     pfault->wqe.wq_num : pfault->token;
	u32 out[MLX5_ST_SZ_DW(page_fault_resume_out)] = {};
	u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {};
	int err;

	MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME);
	MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type);
	MLX5_SET(page_fault_resume_in, in, token, pfault->token);
	MLX5_SET(page_fault_resume_in, in, wq_number, wq_num);
	MLX5_SET(page_fault_resume_in, in, error, !!error);

	err = mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
	if (err)
		mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n",
			    wq_num, err);
}
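
/*
 * Create (or pick up a racing creation of) the child MR backing index @idx of
 * an implicit ODP MR. The child covers MLX5_IMR_MTT_SIZE bytes of the
 * parent's address space and is published in imr->implicit_children.
 */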
static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
						unsigned long idx)
{
	struct ib_umem_odp *odp;
	struct mlx5_ib_mr *mr;
	struct mlx5_ib_mr *ret;
	int err;

	odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
				      idx * MLX5_IMR_MTT_SIZE,
				      MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
	if (IS_ERR(odp))
		return ERR_CAST(odp);

	ret = mr = mlx5_mr_cache_alloc(imr->dev, MLX5_IMR_MTT_CACHE_ENTRY);
	if (IS_ERR(mr))
		goto out_umem;

	mr->ibmr.pd = imr->ibmr.pd;
	mr->access_flags = imr->access_flags;
	mr->umem = &odp->umem;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->mmkey.iova = idx * MLX5_IMR_MTT_SIZE;
	mr->parent = imr;
	odp->private = mr;

	err = mlx5_ib_update_xlt(mr, 0,
				 MLX5_IMR_MTT_ENTRIES,
				 PAGE_SHIFT,
				 MLX5_IB_UPD_XLT_ZAP |
				 MLX5_IB_UPD_XLT_ENABLE);
	if (err) {
		ret = ERR_PTR(err);
		goto out_mr;
	}

	/*
	 * Once the store to either xarray completes any error unwind has to
	 * use synchronize_srcu(). Avoid this with xa_reserve()
	 */
	ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
			 GFP_KERNEL);
	if (unlikely(ret)) {
		if (xa_is_err(ret)) {
			ret = ERR_PTR(xa_err(ret));
			goto out_mr;
		}
		/*
		 * Another thread beat us to creating the child mr, use
		 * theirs.
		 */
		goto out_mr;
	}

	mlx5_ib_dbg(imr->dev, "key %x mr %p\n", mr->mmkey.key, mr);
	return mr;

out_mr:
	mlx5_mr_cache_free(imr->dev, mr);
out_umem:
	ib_umem_odp_release(odp);
	return ret;
}
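
/*
 * Create the parent KSM mkey for an implicit ODP MR covering the whole
 * process address space. Child MTT MRs are created lazily on page fault.
 */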
struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
					     struct ib_udata *udata,
					     int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device);
	struct ib_umem_odp *umem_odp;
	struct mlx5_ib_mr *imr;
	int err;

	umem_odp = ib_umem_odp_alloc_implicit(udata, access_flags);
	if (IS_ERR(umem_odp))
		return ERR_CAST(umem_odp);

	imr = mlx5_mr_cache_alloc(dev, MLX5_IMR_KSM_CACHE_ENTRY);
	if (IS_ERR(imr)) {
		err = PTR_ERR(imr);
		goto out_umem;
	}

	imr->ibmr.pd = &pd->ibpd;
	imr->access_flags = access_flags;
	imr->mmkey.iova = 0;
	imr->umem = &umem_odp->umem;
	imr->ibmr.lkey = imr->mmkey.key;
	imr->ibmr.rkey = imr->mmkey.key;
	imr->is_odp_implicit = true;
	atomic_set(&imr->num_deferred_work, 0);
	xa_init(&imr->implicit_children);

	err = mlx5_ib_update_xlt(imr, 0,
				 mlx5_imr_ksm_entries,
				 MLX5_KSM_PAGE_SHIFT,
				 MLX5_IB_UPD_XLT_INDIRECT |
				 MLX5_IB_UPD_XLT_ZAP |
				 MLX5_IB_UPD_XLT_ENABLE);
	if (err)
		goto out_mr;

	err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key),
			      &imr->mmkey, GFP_KERNEL));
	if (err)
		goto out_mr;

	mlx5_ib_dbg(dev, "key %x mr %p\n", imr->mmkey.key, imr);
	return imr;
out_mr:
	mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
	mlx5_mr_cache_free(dev, imr);
out_umem:
	ib_umem_odp_release(umem_odp);
	return ERR_PTR(err);
}
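
/*
 * Tear down an implicit ODP MR: unpublish it from odp_mkeys, wait out the
 * SRCU readers and any deferred work, then destroy every child MR and
 * finally the parent KSM mkey itself.
 */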
void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
{
	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
	struct mlx5_ib_dev *dev = imr->dev;
	struct list_head destroy_list;
	struct mlx5_ib_mr *mtt;
	struct mlx5_ib_mr *tmp;
	unsigned long idx;

	INIT_LIST_HEAD(&destroy_list);

	xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key));
	/*
	 * This stops the SRCU protected page fault path from touching either
	 * the imr or any children. The page fault path can only reach the
	 * children xarray via the imr.
	 */
	synchronize_srcu(&dev->odp_srcu);

	xa_lock(&imr->implicit_children);
	xa_for_each(&imr->implicit_children, idx, mtt) {
		__xa_erase(&imr->implicit_children, idx);
		list_add(&mtt->odp_destroy.elm, &destroy_list);
	}
	xa_unlock(&imr->implicit_children);

	/*
	 * num_deferred_work can only be incremented inside the odp_srcu, or
	 * under xa_lock while the child is in the xarray. Thus at this point
	 * it is only decreasing, and all work holding it is now on the wq.
	 */
	if (atomic_read(&imr->num_deferred_work)) {
		flush_workqueue(system_unbound_wq);
		WARN_ON(atomic_read(&imr->num_deferred_work));
	}

	/*
	 * Fence the imr before we destroy the children. This allows us to
	 * skip updating the XLT of the imr during destroy of the child mkey
	 * the imr points to.
	 */
	mlx5_mr_cache_invalidate(imr);

	list_for_each_entry_safe(mtt, tmp, &destroy_list, odp_destroy.elm)
		free_implicit_child_mr(mtt, false);

	mlx5_mr_cache_free(dev, imr);
	ib_umem_odp_release(odp_imr);
}

/**
 * mlx5_ib_fence_odp_mr - Stop all access to the ODP MR
 * @mr: to fence
 *
 * On return no parallel threads will be touching this MR and no DMA will be
 * active.
 */
void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr)
{
	/* Prevent new page faults and prefetch requests from succeeding */
	xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));

	/* Wait for all running page-fault handlers to finish. */
	synchronize_srcu(&mr->dev->odp_srcu);

	if (atomic_read(&mr->num_deferred_work)) {
		flush_workqueue(system_unbound_wq);
		WARN_ON(atomic_read(&mr->num_deferred_work));
	}

	dma_fence_odp_mr(mr);
}

#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
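
/*
 * Fault in up to @bcnt bytes starting at @user_va of a non-implicit ODP MR:
 * pin the pages and push the new mapping to the HW via UMR, unless the mmu
 * interval notifier raced with us. Returns the number of PAGE_SIZE pages
 * mapped, or a negative errno (-EAGAIN if the caller should retry).
 */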
static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
			     u64 user_va, size_t bcnt, u32 *bytes_mapped,
			     u32 flags)
{
	int page_shift, ret, np;
	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
	unsigned long current_seq;
	u64 access_mask;
	u64 start_idx, page_mask;

	page_shift = odp->page_shift;
	page_mask = ~(BIT(page_shift) - 1);
	start_idx = (user_va - (mr->mmkey.iova & page_mask)) >> page_shift;
	access_mask = ODP_READ_ALLOWED_BIT;

	if (odp->umem.writable && !downgrade)
		access_mask |= ODP_WRITE_ALLOWED_BIT;

	current_seq = mmu_interval_read_begin(&odp->notifier);

	np = ib_umem_odp_map_dma_pages(odp, user_va, bcnt, access_mask,
				       current_seq);
	if (np < 0)
		return np;

	mutex_lock(&odp->umem_mutex);
	if (!mmu_interval_read_retry(&odp->notifier, current_seq)) {
		/*
		 * No need to check whether the MTTs really belong to
		 * this MR, since ib_umem_odp_map_dma_pages already
		 * checks this.
		 */
		ret = mlx5_ib_update_xlt(mr, start_idx, np,
					 page_shift, MLX5_IB_UPD_XLT_ATOMIC);
	} else {
		ret = -EAGAIN;
	}
	mutex_unlock(&odp->umem_mutex);

	if (ret < 0) {
		if (ret != -EAGAIN)
			mlx5_ib_err(mr->dev,
				    "Failed to update mkey page tables\n");
		goto out;
	}

	if (bytes_mapped) {
		u32 new_mappings = (np << page_shift) -
			(user_va - round_down(user_va, 1 << page_shift));

		*bytes_mapped += min_t(u32, new_mappings, bcnt);
	}

	return np << (page_shift - PAGE_SHIFT);

out:
	return ret;
}
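
/*
 * Fault in a range of an implicit ODP MR: walk each MLX5_IMR_MTT_SIZE chunk
 * the range intersects, creating missing child MRs on the fly, fault their
 * pages, and re-sync the parent's KSM table if any children were added.
 */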
static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
				 struct ib_umem_odp *odp_imr, u64 user_va,
				 size_t bcnt, u32 *bytes_mapped, u32 flags)
{
	unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT;
	unsigned long upd_start_idx = end_idx + 1;
	unsigned long upd_len = 0;
	unsigned long npages = 0;
	int err;
	int ret;

	if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE ||
		     mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt))
		return -EFAULT;

	/* Fault each child mr that intersects with our interval. */
	while (bcnt) {
		unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT;
		struct ib_umem_odp *umem_odp;
		struct mlx5_ib_mr *mtt;
		u64 len;

		mtt = xa_load(&imr->implicit_children, idx);
		if (unlikely(!mtt)) {
			mtt = implicit_get_child_mr(imr, idx);
			if (IS_ERR(mtt)) {
				ret = PTR_ERR(mtt);
				goto out;
			}
			upd_start_idx = min(upd_start_idx, idx);
			upd_len = idx - upd_start_idx + 1;
		}

		umem_odp = to_ib_umem_odp(mtt->umem);
		len = min_t(u64, user_va + bcnt, ib_umem_end(umem_odp)) -
		      user_va;

		ret = pagefault_real_mr(mtt, umem_odp, user_va, len,
					bytes_mapped, flags);
		if (ret < 0)
			goto out;
		user_va += len;
		bcnt -= len;
		npages += ret;
	}

	ret = npages;

	/*
	 * Any time the implicit_children are changed we must perform an
	 * update of the xlt before exiting to ensure the HW and the
	 * implicit_children remains synchronized.
	 */
out:
	if (likely(!upd_len))
		return ret;

	/*
	 * Notice this is not strictly ordered right, the KSM is updated after
	 * the implicit_children is updated, so a parallel page fault could
	 * see a MR that is not yet visible in the KSM. This is similar to a
	 * parallel page fault seeing a MR that is being concurrently removed
	 * from the KSM. Both of these improbable situations are resolved
	 * safely by resuming the HW and then taking another page fault. The
	 * next pagefault handler will see the new information.
	 */
	mutex_lock(&odp_imr->umem_mutex);
	err = mlx5_ib_update_xlt(imr, upd_start_idx, upd_len, 0,
				 MLX5_IB_UPD_XLT_INDIRECT |
				 MLX5_IB_UPD_XLT_ATOMIC);
	mutex_unlock(&odp_imr->umem_mutex);
	if (err) {
		mlx5_ib_err(imr->dev, "Failed to update PAS\n");
		return err;
	}
	return ret;
}

/*
 * Returns:
 *  -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
 *           not accessible, or the MR is no longer valid.
 *  -EAGAIN/-ENOMEM: The operation should be retried
 *
 *  -EINVAL/others: General internal malfunction
 *  >0: Number of pages mapped
 */
static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
			u32 *bytes_mapped, u32 flags)
{
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);

	if (!odp->is_implicit_odp) {
		if (unlikely(io_virt < ib_umem_start(odp) ||
			     ib_umem_end(odp) - io_virt < bcnt))
			return -EFAULT;
		return pagefault_real_mr(mr, odp, io_virt, bcnt, bytes_mapped,
					 flags);
	}
	return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped,
				     flags);
}

struct pf_frame {
	struct pf_frame *next;
	u32 key;
	u64 io_virt;
	size_t bcnt;
	int depth;
};

static bool mkey_is_eq(struct mlx5_core_mkey *mmkey, u32 key)
{
	if (!mmkey)
		return false;
	if (mmkey->type == MLX5_MKEY_MW)
		return mlx5_base_mkey(mmkey->key) == mlx5_base_mkey(key);
	return mmkey->key == key;
}

static int get_indirect_num_descs(struct mlx5_core_mkey *mmkey)
{
	struct mlx5_ib_mw *mw;
	struct mlx5_ib_devx_mr *devx_mr;

	if (mmkey->type == MLX5_MKEY_MW) {
		mw = container_of(mmkey, struct mlx5_ib_mw, mmkey);
		return mw->ndescs;
	}

	devx_mr = container_of(mmkey, struct mlx5_ib_devx_mr,
			       mmkey);
	return devx_mr->ndescs;
}

/*
 * Handle a single data segment in a page-fault WQE or RDMA region.
 *
 * Returns number of OS pages retrieved on success. The caller may continue to
 * the next data segment.
 * Can return the following error codes:
 * -EAGAIN to designate a temporary error. The caller will abort handling the
 *  page fault and resolve it.
 * -EFAULT when there's an error mapping the requested pages. The caller will
 *  abort the page fault handling.
 */
static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
					 struct ib_pd *pd, u32 key,
					 u64 io_virt, size_t bcnt,
					 u32 *bytes_committed,
					 u32 *bytes_mapped)
{
	int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
	struct pf_frame *head = NULL, *frame;
	struct mlx5_core_mkey *mmkey;
	struct mlx5_ib_mr *mr;
	struct mlx5_klm *pklm;
	u32 *out = NULL;
	size_t offset;
	int ndescs;

	srcu_key = srcu_read_lock(&dev->odp_srcu);

	io_virt += *bytes_committed;
	bcnt -= *bytes_committed;

next_mr:
	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
	if (!mmkey) {
		mlx5_ib_dbg(
			dev,
			"skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
			key);
		if (bytes_mapped)
			*bytes_mapped += bcnt;
		/*
		 * The user could specify a SGL with multiple lkeys and only
		 * some of them are ODP. Treat the non-ODP ones as fully
		 * faulted.
		 */
		ret = 0;
		goto srcu_unlock;
	}
	if (!mkey_is_eq(mmkey, key)) {
		mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
		ret = -EFAULT;
		goto srcu_unlock;
	}

	switch (mmkey->type) {
	case MLX5_MKEY_MR:
		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);

		ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0);
		if (ret < 0)
			goto srcu_unlock;

		/*
		 * When prefetching a page, page fault is generated
		 * in order to bring the page to the main memory.
		 * In the current flow, page faults are being counted.
		 */
		mlx5_update_odp_stats(mr, faults, ret);

		npages += ret;
		ret = 0;
		break;

	case MLX5_MKEY_MW:
	case MLX5_MKEY_INDIRECT_DEVX:
		ndescs = get_indirect_num_descs(mmkey);

		if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) {
			mlx5_ib_dbg(dev, "indirection level exceeded\n");
			ret = -EFAULT;
			goto srcu_unlock;
		}

		outlen = MLX5_ST_SZ_BYTES(query_mkey_out) +
			sizeof(*pklm) * (ndescs - 2);

		if (outlen > cur_outlen) {
			kfree(out);
			out = kzalloc(outlen, GFP_KERNEL);
			if (!out) {
				ret = -ENOMEM;
				goto srcu_unlock;
			}
			cur_outlen = outlen;
		}

		pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out,
						       bsf0_klm0_pas_mtt0_1);

		ret = mlx5_core_query_mkey(dev->mdev, mmkey, out, outlen);
		if (ret)
			goto srcu_unlock;

		offset = io_virt - MLX5_GET64(query_mkey_out, out,
					      memory_key_mkey_entry.start_addr);

		for (i = 0; bcnt && i < ndescs; i++, pklm++) {
			if (offset >= be32_to_cpu(pklm->bcount)) {
				offset -= be32_to_cpu(pklm->bcount);
				continue;
			}

			frame = kzalloc(sizeof(*frame), GFP_KERNEL);
			if (!frame) {
				ret = -ENOMEM;
				goto srcu_unlock;
			}

			frame->key = be32_to_cpu(pklm->key);
			frame->io_virt = be64_to_cpu(pklm->va) + offset;
			frame->bcnt = min_t(size_t, bcnt,
					    be32_to_cpu(pklm->bcount) - offset);
			frame->depth = depth + 1;
			frame->next = head;
			head = frame;

			bcnt -= frame->bcnt;
			offset = 0;
		}
		break;

	default:
		mlx5_ib_dbg(dev, "wrong mkey type %d\n", mmkey->type);
		ret = -EFAULT;
		goto srcu_unlock;
	}

	if (head) {
		frame = head;
		head = frame->next;

		key = frame->key;
		io_virt = frame->io_virt;
		bcnt = frame->bcnt;
		depth = frame->depth;
		kfree(frame);

		goto next_mr;
	}

srcu_unlock:
	while (head) {
		frame = head;
		head = frame->next;
		kfree(frame);
	}
	kfree(out);

	srcu_read_unlock(&dev->odp_srcu, srcu_key);
	*bytes_committed = 0;
	return ret ? ret : npages;
}

/**
 * Parse a series of data segments for page fault handling.
 *
 * @pfault contains page fault information.
 * @wqe points at the first data segment in the WQE.
 * @wqe_end points after the end of the WQE.
 * @bytes_mapped receives the number of bytes that the function was able to
 *               map. This allows the caller to decide intelligently whether
 *               enough memory was mapped to resolve the page fault
 *               successfully (e.g. enough for the next MTU, or the entire
 *               WQE).
 * @total_wqe_bytes receives the total data size of this WQE in bytes (minus
 *                  the committed bytes).
 *
 * Returns the number of pages loaded if positive, zero for an empty WQE, or a
 * negative error code.
 */
static int pagefault_data_segments(struct mlx5_ib_dev *dev,
				   struct mlx5_pagefault *pfault,
				   void *wqe,
				   void *wqe_end, u32 *bytes_mapped,
				   u32 *total_wqe_bytes, bool receive_queue)
{
	int ret = 0, npages = 0;
	u64 io_virt;
	u32 key;
	u32 byte_count;
	size_t bcnt;
	int inline_segment;

	if (bytes_mapped)
		*bytes_mapped = 0;
	if (total_wqe_bytes)
		*total_wqe_bytes = 0;

	while (wqe < wqe_end) {
		struct mlx5_wqe_data_seg *dseg = wqe;

		io_virt = be64_to_cpu(dseg->addr);
		key = be32_to_cpu(dseg->lkey);
		byte_count = be32_to_cpu(dseg->byte_count);
		inline_segment = !!(byte_count & MLX5_INLINE_SEG);
		bcnt = byte_count & ~MLX5_INLINE_SEG;

		if (inline_segment) {
			bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
			wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
				     16);
		} else {
			wqe += sizeof(*dseg);
		}

		/* receive WQE end of sg list. */
		if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY &&
		    io_virt == 0)
			break;

		if (!inline_segment && total_wqe_bytes) {
			*total_wqe_bytes += bcnt - min_t(size_t, bcnt,
					pfault->bytes_committed);
		}

		/* A zero length data segment designates a length of 2GB. */
		if (bcnt == 0)
			bcnt = 1U << 31;

		if (inline_segment || bcnt <= pfault->bytes_committed) {
			pfault->bytes_committed -=
				min_t(size_t, bcnt,
				      pfault->bytes_committed);
			continue;
		}

		ret = pagefault_single_data_segment(dev, NULL, key,
						    io_virt, bcnt,
						    &pfault->bytes_committed,
						    bytes_mapped);
		if (ret < 0)
			break;
		npages += ret;
	}

	return ret < 0 ? ret : npages;
}

/*
 * Parse initiator WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and sets wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_initiator_pfault_handler(
	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
{
	struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
	u16 wqe_index = pfault->wqe.wqe_index;
	struct mlx5_base_av *av;
	unsigned ds, opcode;
	u32 qpn = qp->trans_qp.base.mqp.qpn;

	ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
	if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
		mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
			    ds, wqe_length);
		return -EFAULT;
	}

	if (ds == 0) {
		mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
			    wqe_index, qpn);
		return -EFAULT;
	}

	*wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
	*wqe += sizeof(*ctrl);

	opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
		 MLX5_WQE_CTRL_OPCODE_MASK;

	if (qp->ibqp.qp_type == IB_QPT_XRC_INI)
		*wqe += sizeof(struct mlx5_wqe_xrc_seg);

	if (qp->ibqp.qp_type == IB_QPT_UD ||
	    qp->qp_sub_type == MLX5_IB_QPT_DCI) {
		av = *wqe;
		if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV))
			*wqe += sizeof(struct mlx5_av);
		else
			*wqe += sizeof(struct mlx5_base_av);
	}

	switch (opcode) {
	case MLX5_OPCODE_RDMA_WRITE:
	case MLX5_OPCODE_RDMA_WRITE_IMM:
	case MLX5_OPCODE_RDMA_READ:
		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
		break;
	case MLX5_OPCODE_ATOMIC_CS:
	case MLX5_OPCODE_ATOMIC_FA:
		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
		*wqe += sizeof(struct mlx5_wqe_atomic_seg);
		break;
	}

	return 0;
}

/*
 * Parse responder WQE and set wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_responder_pfault_handler_srq(struct mlx5_ib_dev *dev,
						   struct mlx5_ib_srq *srq,
						   void **wqe, void **wqe_end,
						   int wqe_length)
{
	int wqe_size = 1 << srq->msrq.wqe_shift;

	if (wqe_size > wqe_length) {
		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
		return -EFAULT;
	}

	*wqe_end = *wqe + wqe_size;
	*wqe += sizeof(struct mlx5_wqe_srq_next_seg);

	return 0;
}

static int mlx5_ib_mr_responder_pfault_handler_rq(struct mlx5_ib_dev *dev,
						  struct mlx5_ib_qp *qp,
						  void *wqe, void **wqe_end,
						  int wqe_length)
{
	struct mlx5_ib_wq *wq = &qp->rq;
	int wqe_size = 1 << wq->wqe_shift;

	if (qp->wq_sig) {
		mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
		return -EFAULT;
	}

	if (wqe_size > wqe_length) {
		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
		return -EFAULT;
	}

	*wqe_end = wqe + wqe_size;

	return 0;
}

static inline struct mlx5_core_rsc_common *odp_get_rsc(struct mlx5_ib_dev *dev,
							u32 wq_num, int pf_type)
{
	struct mlx5_core_rsc_common *common = NULL;
	struct mlx5_core_srq *srq;

	switch (pf_type) {
	case MLX5_WQE_PF_TYPE_RMP:
		srq = mlx5_cmd_get_srq(dev, wq_num);
		if (srq)
			common = &srq->common;
		break;
	case MLX5_WQE_PF_TYPE_REQ_SEND_OR_WRITE:
	case MLX5_WQE_PF_TYPE_RESP:
	case MLX5_WQE_PF_TYPE_REQ_READ_OR_ATOMIC:
		common = mlx5_core_res_hold(dev->mdev, wq_num, MLX5_RES_QP);
		break;
	default:
		break;
	}

	return common;
}

static inline struct mlx5_ib_qp *res_to_qp(struct mlx5_core_rsc_common *res)
{
	struct mlx5_core_qp *mqp = (struct mlx5_core_qp *)res;

	return to_mibqp(mqp);
}

static inline struct mlx5_ib_srq *res_to_srq(struct mlx5_core_rsc_common *res)
{
	struct mlx5_core_srq *msrq =
		container_of(res, struct mlx5_core_srq, common);

	return to_mibsrq(msrq);
}
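
/*
 * Handle a WQE-type page fault: look up the faulting QP/SRQ, copy the
 * offending WQE from user memory, parse its data segments, fault in the
 * missing pages and then resume the hardware, with or without an error.
 */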
static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
					  struct mlx5_pagefault *pfault)
{
	bool sq = pfault->type & MLX5_PFAULT_REQUESTOR;
	u16 wqe_index = pfault->wqe.wqe_index;
	void *wqe, *wqe_start = NULL, *wqe_end = NULL;
	u32 bytes_mapped, total_wqe_bytes;
	struct mlx5_core_rsc_common *res;
	int resume_with_error = 1;
	struct mlx5_ib_qp *qp;
	size_t bytes_copied;
	int ret = 0;

	res = odp_get_rsc(dev, pfault->wqe.wq_num, pfault->type);
	if (!res) {
		mlx5_ib_dbg(dev, "wqe page fault for missing resource %d\n", pfault->wqe.wq_num);
		return;
	}

	if (res->res != MLX5_RES_QP && res->res != MLX5_RES_SRQ &&
	    res->res != MLX5_RES_XSRQ) {
		mlx5_ib_err(dev, "wqe page fault for unsupported type %d\n",
			    pfault->type);
		goto resolve_page_fault;
	}

	wqe_start = (void *)__get_free_page(GFP_KERNEL);
	if (!wqe_start) {
		mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
		goto resolve_page_fault;
	}

	wqe = wqe_start;
	qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL;
	if (qp && sq) {
		ret = mlx5_ib_read_user_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
					       &bytes_copied);
		if (ret)
			goto read_user;
		ret = mlx5_ib_mr_initiator_pfault_handler(
			dev, pfault, qp, &wqe, &wqe_end, bytes_copied);
	} else if (qp && !sq) {
		ret = mlx5_ib_read_user_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
					       &bytes_copied);
		if (ret)
			goto read_user;
		ret = mlx5_ib_mr_responder_pfault_handler_rq(
			dev, qp, wqe, &wqe_end, bytes_copied);
	} else if (!qp) {
		struct mlx5_ib_srq *srq = res_to_srq(res);

		ret = mlx5_ib_read_user_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
						&bytes_copied);
		if (ret)
			goto read_user;
		ret = mlx5_ib_mr_responder_pfault_handler_srq(
			dev, srq, &wqe, &wqe_end, bytes_copied);
	}

	if (ret < 0 || wqe >= wqe_end)
		goto resolve_page_fault;

	ret = pagefault_data_segments(dev, pfault, wqe, wqe_end, &bytes_mapped,
				      &total_wqe_bytes, !sq);
	if (ret == -EAGAIN)
		goto out;

	if (ret < 0 || total_wqe_bytes > bytes_mapped)
		goto resolve_page_fault;

out:
	ret = 0;
	resume_with_error = 0;

read_user:
	if (ret)
		mlx5_ib_err(
			dev,
			"Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %x\n",
			ret, wqe_index, pfault->token);

resolve_page_fault:
	mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
		    pfault->wqe.wq_num, resume_with_error,
		    pfault->type);
	mlx5_core_res_put(res);
	free_page((unsigned long)wqe_start);
}

static int pages_in_range(u64 address, u32 length)
{
	return (ALIGN(address + length, PAGE_SIZE) -
		(address & PAGE_MASK)) >> PAGE_SHIFT;
}
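
/*
 * Handle an RDMA-responder page fault: fault in the pages needed for the
 * current packet first (and resume the QP), then opportunistically prefetch
 * the remainder of the RDMA operation.
 */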
static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
					   struct mlx5_pagefault *pfault)
{
	u64 address;
	u32 length;
	u32 prefetch_len = pfault->bytes_committed;
	int prefetch_activated = 0;
	u32 rkey = pfault->rdma.r_key;
	int ret;

	/* The RDMA responder handler handles the page fault in two parts.
	 * First it brings the necessary pages for the current packet
	 * (and uses the pfault context), and then (after resuming the QP)
	 * prefetches more pages. The second operation cannot use the pfault
	 * context and therefore uses the dummy_pfault context allocated on
	 * the stack */
	pfault->rdma.rdma_va += pfault->bytes_committed;
	pfault->rdma.rdma_op_len -= min(pfault->bytes_committed,
					pfault->rdma.rdma_op_len);
	pfault->bytes_committed = 0;

	address = pfault->rdma.rdma_va;
	length  = pfault->rdma.rdma_op_len;

	/* For some operations, the hardware cannot tell the exact message
	 * length, and in those cases it reports zero. Use prefetch
	 * logic. */
	if (length == 0) {
		prefetch_activated = 1;
		length = pfault->rdma.packet_size;
		prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
	}

	ret = pagefault_single_data_segment(dev, NULL, rkey, address, length,
					    &pfault->bytes_committed, NULL);
	if (ret == -EAGAIN) {
		/* We're racing with an invalidation, don't prefetch */
		prefetch_activated = 0;
	} else if (ret < 0 || pages_in_range(address, length) > ret) {
		mlx5_ib_page_fault_resume(dev, pfault, 1);
		if (ret != -ENOENT)
			mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n",
				    ret, pfault->token, pfault->type);
		return;
	}

	mlx5_ib_page_fault_resume(dev, pfault, 0);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n",
		    pfault->token, pfault->type,
		    prefetch_activated);

	/* At this point, there might be a new pagefault already arriving in
	 * the eq, switch to the dummy pagefault for the rest of the
	 * processing. We're still OK with the objects being alive as the
	 * work-queue is being fenced. */
	if (prefetch_activated) {
		u32 bytes_committed = 0;

		ret = pagefault_single_data_segment(dev, NULL, rkey, address,
						    prefetch_len,
						    &bytes_committed, NULL);
		if (ret < 0 && ret != -EAGAIN) {
			mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
				    ret, pfault->token, address, prefetch_len);
		}
	}
}

static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault)
{
	u8 event_subtype = pfault->event_subtype;

	switch (event_subtype) {
	case MLX5_PFAULT_SUBTYPE_WQE:
		mlx5_ib_mr_wqe_pfault_handler(dev, pfault);
		break;
	case MLX5_PFAULT_SUBTYPE_RDMA:
		mlx5_ib_mr_rdma_pfault_handler(dev, pfault);
		break;
	default:
		mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n",
			    event_subtype);
		mlx5_ib_page_fault_resume(dev, pfault, 1);
	}
}
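
/*
 * Page faults arrive on a dedicated EQ; each EQE is decoded into an
 * mlx5_pagefault taken from the EQ's mempool and dispatched to the page-fault
 * workqueue, where mlx5_ib_pfault() performs the actual resolution.
 */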
2018-11-19 21:52:41 +03:00
static void mlx5_ib_eqe_pf_action ( struct work_struct * work )
{
struct mlx5_pagefault * pfault = container_of ( work ,
struct mlx5_pagefault ,
work ) ;
struct mlx5_ib_pf_eq * eq = pfault - > eq ;
mlx5_ib_pfault ( eq - > dev , pfault ) ;
mempool_free ( pfault , eq - > pool ) ;
}
static void mlx5_ib_eq_pf_process ( struct mlx5_ib_pf_eq * eq )
{
struct mlx5_eqe_page_fault * pf_eqe ;
struct mlx5_pagefault * pfault ;
struct mlx5_eqe * eqe ;
int cc = 0 ;
while ( ( eqe = mlx5_eq_get_eqe ( eq - > core , cc ) ) ) {
pfault = mempool_alloc ( eq - > pool , GFP_ATOMIC ) ;
if ( ! pfault ) {
schedule_work ( & eq - > work ) ;
break ;
}
pf_eqe = & eqe - > data . page_fault ;
pfault - > event_subtype = eqe - > sub_type ;
pfault - > bytes_committed = be32_to_cpu ( pf_eqe - > bytes_committed ) ;
mlx5_ib_dbg ( eq - > dev ,
" PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x \n " ,
eqe - > sub_type , pfault - > bytes_committed ) ;
switch ( eqe - > sub_type ) {
case MLX5_PFAULT_SUBTYPE_RDMA :
/* RDMA based event */
pfault - > type =
be32_to_cpu ( pf_eqe - > rdma . pftype_token ) > > 24 ;
pfault - > token =
be32_to_cpu ( pf_eqe - > rdma . pftype_token ) &
MLX5_24BIT_MASK ;
pfault - > rdma . r_key =
be32_to_cpu ( pf_eqe - > rdma . r_key ) ;
pfault - > rdma . packet_size =
be16_to_cpu ( pf_eqe - > rdma . packet_length ) ;
pfault - > rdma . rdma_op_len =
be32_to_cpu ( pf_eqe - > rdma . rdma_op_len ) ;
pfault - > rdma . rdma_va =
be64_to_cpu ( pf_eqe - > rdma . rdma_va ) ;
mlx5_ib_dbg ( eq - > dev ,
" PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x \n " ,
pfault - > type , pfault - > token ,
pfault - > rdma . r_key ) ;
mlx5_ib_dbg ( eq - > dev ,
" PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx \n " ,
pfault - > rdma . rdma_op_len ,
pfault - > rdma . rdma_va ) ;
break ;
case MLX5_PFAULT_SUBTYPE_WQE :
/* WQE based event */
pfault - > type =
( be32_to_cpu ( pf_eqe - > wqe . pftype_wq ) > > 24 ) & 0x7 ;
pfault - > token =
be32_to_cpu ( pf_eqe - > wqe . token ) ;
pfault - > wqe . wq_num =
be32_to_cpu ( pf_eqe - > wqe . pftype_wq ) &
MLX5_24BIT_MASK ;
pfault - > wqe . wqe_index =
be16_to_cpu ( pf_eqe - > wqe . wqe_index ) ;
pfault - > wqe . packet_size =
be16_to_cpu ( pf_eqe - > wqe . packet_length ) ;
mlx5_ib_dbg ( eq - > dev ,
" PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x \n " ,
pfault - > type , pfault - > token ,
pfault - > wqe . wq_num ,
pfault - > wqe . wqe_index ) ;
break ;
default :
mlx5_ib_warn ( eq - > dev ,
" Unsupported page fault event sub-type: 0x%02hhx \n " ,
eqe - > sub_type ) ;
/* Unsupported page faults should still be
* resolved by the page fault handler
*/
}
pfault - > eq = eq ;
INIT_WORK ( & pfault - > work , mlx5_ib_eqe_pf_action ) ;
queue_work ( eq - > wq , & pfault - > work ) ;
cc = mlx5_eq_update_cc ( eq - > core , + + cc ) ;
}
mlx5_eq_update_ci ( eq - > core , cc , 1 ) ;
}

static int mlx5_ib_eq_pf_int(struct notifier_block *nb, unsigned long type,
			     void *data)
{
	struct mlx5_ib_pf_eq *eq =
		container_of(nb, struct mlx5_ib_pf_eq, irq_nb);
	unsigned long flags;

	if (spin_trylock_irqsave(&eq->lock, flags)) {
		mlx5_ib_eq_pf_process(eq);
		spin_unlock_irqrestore(&eq->lock, flags);
	} else {
		schedule_work(&eq->work);
	}

	return IRQ_HANDLED;
}

/* mempool_refill() was proposed but unfortunately wasn't accepted
 * http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html
 * Cheap workaround.
 */
static void mempool_refill(mempool_t *pool)
{
	while (pool->curr_nr < pool->min_nr)
		mempool_free(mempool_alloc(pool, GFP_KERNEL), pool);
}

static void mlx5_ib_eq_pf_action(struct work_struct *work)
{
	struct mlx5_ib_pf_eq *eq =
		container_of(work, struct mlx5_ib_pf_eq, work);

	mempool_refill(eq->pool);
	spin_lock_irq(&eq->lock);
	mlx5_ib_eq_pf_process(eq);
	spin_unlock_irq(&eq->lock);
}

enum {
	MLX5_IB_NUM_PF_EQE	= 0x1000,
	MLX5_IB_NUM_PF_DRAIN	= 64,
};
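
/*
 * Set up the page-fault EQ: a drain mempool of pagefault descriptors, a
 * dedicated high-priority workqueue, and a generic mlx5 EQ masked for
 * MLX5_EVENT_TYPE_PAGE_FAULT events.
 */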
static int
mlx5_ib_create_pf_eq ( struct mlx5_ib_dev * dev , struct mlx5_ib_pf_eq * eq )
{
struct mlx5_eq_param param = { } ;
int err ;
INIT_WORK ( & eq - > work , mlx5_ib_eq_pf_action ) ;
spin_lock_init ( & eq - > lock ) ;
eq - > dev = dev ;
eq - > pool = mempool_create_kmalloc_pool ( MLX5_IB_NUM_PF_DRAIN ,
sizeof ( struct mlx5_pagefault ) ) ;
if ( ! eq - > pool )
return - ENOMEM ;
eq - > wq = alloc_workqueue ( " mlx5_ib_page_fault " ,
WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM ,
MLX5_NUM_CMD_EQE ) ;
if ( ! eq - > wq ) {
err = - ENOMEM ;
goto err_mempool ;
}
2019-06-11 02:38:23 +03:00
eq - > irq_nb . notifier_call = mlx5_ib_eq_pf_int ;
2018-11-19 21:52:41 +03:00
param = ( struct mlx5_eq_param ) {
2019-06-11 02:38:41 +03:00
. irq_index = 0 ,
2018-11-19 21:52:41 +03:00
. nent = MLX5_IB_NUM_PF_EQE ,
} ;
2019-06-30 19:23:23 +03:00
param . mask [ 0 ] = 1ull < < MLX5_EVENT_TYPE_PAGE_FAULT ;
2019-06-11 02:38:25 +03:00
eq - > core = mlx5_eq_create_generic ( dev - > mdev , & param ) ;
2018-11-19 21:52:41 +03:00
if ( IS_ERR ( eq - > core ) ) {
err = PTR_ERR ( eq - > core ) ;
goto err_wq ;
}
2019-06-11 02:38:42 +03:00
err = mlx5_eq_enable ( dev - > mdev , eq - > core , & eq - > irq_nb ) ;
if ( err ) {
mlx5_ib_err ( dev , " failed to enable odp EQ %d \n " , err ) ;
goto err_eq ;
}
2018-11-19 21:52:41 +03:00
return 0 ;
2019-06-11 02:38:42 +03:00
err_eq :
mlx5_eq_destroy_generic ( dev - > mdev , eq - > core ) ;
2018-11-19 21:52:41 +03:00
err_wq :
destroy_workqueue ( eq - > wq ) ;
err_mempool :
mempool_destroy ( eq - > pool ) ;
return err ;
}
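
/*
 * Teardown mirrors mlx5_ib_create_pf_eq() in reverse: disable the EQ so no
 * new events arrive, destroy it, flush any deferred processing, then release
 * the workqueue and the descriptor pool.
 */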
static int
mlx5_ib_destroy_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
{
        int err;

        mlx5_eq_disable(dev->mdev, eq->core, &eq->irq_nb);
        err = mlx5_eq_destroy_generic(dev->mdev, eq->core);
        cancel_work_sync(&eq->work);
        destroy_workqueue(eq->wq);
        mempool_destroy(eq->pool);

        return err;
}
void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
{
        if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
                return;

        switch (ent->order - 2) {
        case MLX5_IMR_MTT_CACHE_ENTRY:
                ent->page = PAGE_SHIFT;
                ent->xlt = MLX5_IMR_MTT_ENTRIES *
                           sizeof(struct mlx5_mtt) /
                           MLX5_IB_UMR_OCTOWORD;
                ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
                ent->limit = 0;
                break;

        case MLX5_IMR_KSM_CACHE_ENTRY:
                ent->page = MLX5_KSM_PAGE_SHIFT;
                ent->xlt = mlx5_imr_ksm_entries *
                           sizeof(struct mlx5_klm) /
                           MLX5_IB_UMR_OCTOWORD;
                ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
                ent->limit = 0;
                break;
        }
}
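
/*
 * Sizing sketch (assuming 4K pages and the usual 8-byte MTT / 16-byte UMR
 * octoword sizes; the real values follow PAGE_SHIFT and the struct layouts):
 *
 *   MLX5_IMR_MTT_BITS    = 30 - 12          = 18
 *   MLX5_IMR_MTT_ENTRIES = 1 << 18          = 262144 pages per 1G leaf
 *   ent->xlt             = 262144 * 8 / 16  = 131072 octowords
 */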
static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
        .advise_mr = mlx5_ib_advise_mr,
};

int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
{
        int ret = 0;

        if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
                return ret;

        ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);

        if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
                ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
                if (ret) {
                        mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret);
                        return ret;
                }
        }

        ret = mlx5_ib_create_pf_eq(dev, &dev->odp_pf_eq);

        return ret;
}
void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
{
        if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
                return;

        mlx5_ib_destroy_pf_eq(dev, &dev->odp_pf_eq);
}
int mlx5_ib_odp_init(void)
{
        mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
                                       MLX5_IMR_MTT_BITS);

        return 0;
}
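
/*
 * Illustration only (assuming a 64-bit arch with TASK_SIZE on the order of
 * 2^47 and 4K pages): get_order(TASK_SIZE) is then about 35, so with
 * MLX5_IMR_MTT_BITS == 18 this yields roughly 2^17 KSM entries, each covering
 * one MLX5_IMR_MTT_SIZE (1G) slice of the process address space.
 */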
struct prefetch_mr_work {
        struct work_struct work;
        u32 pf_flags;
        u32 num_sge;
        struct {
                u64 io_virt;
                struct mlx5_ib_mr *mr;
                size_t length;
        } frags[];
};
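
/*
 * frags[] is a flexible array member, so a request for num_sge entries is
 * allocated in one shot:
 *
 *      work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL);
 *
 * as done in mlx5_ib_advise_mr_prefetch() below.
 */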
static void destroy_prefetch_work(struct prefetch_mr_work *work)
{
        u32 i;

        for (i = 0; i < work->num_sge; ++i)
                atomic_dec(&work->frags[i].mr->num_deferred_work);
        kvfree(work);
}
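
/*
 * Resolve an lkey to an ODP MR that is allowed to be prefetched: the mkey
 * must be a plain MR owned by the same PD, and write prefetch additionally
 * requires a writable umem. Called under the odp_srcu read lock; returns
 * NULL if any check fails.
 */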
static struct mlx5_ib_mr *
get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
                    u32 lkey)
{
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
        struct mlx5_core_mkey *mmkey;
        struct ib_umem_odp *odp;
        struct mlx5_ib_mr *mr;

        lockdep_assert_held(&dev->odp_srcu);

        mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey));
        if (!mmkey || mmkey->key != lkey || mmkey->type != MLX5_MKEY_MR)
                return NULL;

        mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);

        if (mr->ibmr.pd != pd)
                return NULL;

        odp = to_ib_umem_odp(mr->umem);

        /* prefetch with write-access must be supported by the MR */
        if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
            !odp->umem.writable)
                return NULL;

        return mr;
}
static void mlx5_ib_prefetch_mr_work(struct work_struct *w)
{
        struct prefetch_mr_work *work =
                container_of(w, struct prefetch_mr_work, work);
        u32 bytes_mapped = 0;
        u32 i;

        for (i = 0; i < work->num_sge; ++i)
                pagefault_mr(work->frags[i].mr, work->frags[i].io_virt,
                             work->frags[i].length, &bytes_mapped,
                             work->pf_flags);

        destroy_prefetch_work(work);
}
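
/*
 * The deferred path pins each MR by bumping num_deferred_work while still
 * inside the odp_srcu read section; destroy_prefetch_work() drops those
 * references again once the work has run (or setup fails part-way through).
 */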
static bool init_prefetch_work(struct ib_pd *pd,
                               enum ib_uverbs_advise_mr_advice advice,
                               u32 pf_flags, struct prefetch_mr_work *work,
                               struct ib_sge *sg_list, u32 num_sge)
{
        u32 i;

        INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
        work->pf_flags = pf_flags;

        for (i = 0; i < num_sge; ++i) {
                work->frags[i].io_virt = sg_list[i].addr;
                work->frags[i].length = sg_list[i].length;
                work->frags[i].mr =
                        get_prefetchable_mr(pd, advice, sg_list[i].lkey);
                if (!work->frags[i].mr) {
                        /* frags 0..i-1 already hold a num_deferred_work
                         * reference, so record i (not i - 1) entries and let
                         * the caller undo them via destroy_prefetch_work().
                         */
                        work->num_sge = i;
                        return false;
                }

                /* Keep the MR pointer valid outside the SRCU */
                atomic_inc(&work->frags[i].mr->num_deferred_work);
        }

        work->num_sge = num_sge;
        return true;
}
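
/*
 * Synchronous prefetch: fault in every SGE under the odp_srcu read lock and
 * return the first error. Used for IB_UVERBS_ADVISE_MR_FLAG_FLUSH requests,
 * which are expected to have completed the prefetch by the time the call
 * returns.
 */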
static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd,
                                    enum ib_uverbs_advise_mr_advice advice,
                                    u32 pf_flags, struct ib_sge *sg_list,
                                    u32 num_sge)
{
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
        u32 bytes_mapped = 0;
        int srcu_key;
        int ret = 0;
        u32 i;

        srcu_key = srcu_read_lock(&dev->odp_srcu);
        for (i = 0; i < num_sge; ++i) {
                struct mlx5_ib_mr *mr;

                mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey);
                if (!mr) {
                        ret = -ENOENT;
                        goto out;
                }
                ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length,
                                   &bytes_mapped, pf_flags);
                if (ret < 0)
                        goto out;
        }
        ret = 0;

out:
        srcu_read_unlock(&dev->odp_srcu, srcu_key);
        return ret;
}
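
/*
 * advise_mr() prefetch entry point: FLUSH requests are served synchronously
 * through mlx5_ib_prefetch_sg_list(), everything else is packaged into a
 * prefetch_mr_work and pushed to system_unbound_wq.
 */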
int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
                               enum ib_uverbs_advise_mr_advice advice,
                               u32 flags, struct ib_sge *sg_list, u32 num_sge)
{
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
        u32 pf_flags = 0;
        struct prefetch_mr_work *work;
        int srcu_key;

        if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
                pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;

        if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
                return mlx5_ib_prefetch_sg_list(pd, advice, pf_flags, sg_list,
                                                num_sge);

        work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL);
        if (!work)
                return -ENOMEM;

        srcu_key = srcu_read_lock(&dev->odp_srcu);
        if (!init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge)) {
                srcu_read_unlock(&dev->odp_srcu, srcu_key);
                /* drop the references taken so far and free the work itself */
                destroy_prefetch_work(work);
                return -EINVAL;
        }
        queue_work(system_unbound_wq, &work->work);
        srcu_read_unlock(&dev->odp_srcu, srcu_key);
        return 0;
}