IB/hfi1: Fix bugs with non-PAGE_SIZE-end multi-iovec user SDMA requests

hfi1 user SDMA request processing has two bugs that can cause data
corruption for user SDMA requests that have multiple payload iovecs
where an iovec other than the tail iovec does not run up to the page
boundary for the buffer pointed to by that iovec.a

Here are the specific bugs:
1. user_sdma_txadd() does not use struct user_sdma_iovec->iov.iov_len.
   Rather, user_sdma_txadd() will add up to PAGE_SIZE bytes from iovec
   to the packet, even if some of those bytes are past
   iovec->iov.iov_len and are thus not intended to be in the packet.
2. user_sdma_txadd() and user_sdma_send_pkts() fail to advance to the
   next iovec in user_sdma_request->iovs when the current iovec
   is not PAGE_SIZE and does not contain enough data to complete the
   packet. The transmitted packet will contain the wrong data from the
   iovec pages.

This has not been an issue with SDMA packets from hfi1 Verbs or PSM2
because they only produce iovecs that end short of PAGE_SIZE as the tail
iovec of an SDMA request.

Fixing these bugs exposes other bugs with the SDMA pin cache
(struct mmu_rb_handler) that get in way of supporting user SDMA requests
with multiple payload iovecs whose buffers do not end at PAGE_SIZE. So
this commit fixes those issues as well.

Here are the mmu_rb_handler bugs that non-PAGE_SIZE-end multi-iovec
payload user SDMA requests can hit:
1. Overlapping memory ranges in mmu_rb_handler will result in duplicate
   pinnings.
2. When extending an existing mmu_rb_handler entry (struct mmu_rb_node),
   the mmu_rb code (1) removes the existing entry under a lock, (2)
   releases that lock, pins the new pages, (3) then reacquires the lock
   to insert the extended mmu_rb_node.

   If someone else comes in and inserts an overlapping entry between (2)
   and (3), insert in (3) will fail.

   The failure path code in this case unpins _all_ pages in either the
   original mmu_rb_node or the new mmu_rb_node that was inserted between
   (2) and (3).
3. In hfi1_mmu_rb_remove_unless_exact(), mmu_rb_node->refcount is
   incremented outside of mmu_rb_handler->lock. As a result, mmu_rb_node
   could be evicted by another thread that gets mmu_rb_handler->lock and
   checks mmu_rb_node->refcount before mmu_rb_node->refcount is
   incremented.
4. Related to #2 above, SDMA request submission failure path does not
   check mmu_rb_node->refcount before freeing mmu_rb_node object.

   If there are other SDMA requests in progress whose iovecs have
   pointers to the now-freed mmu_rb_node(s), those pointers to the
   now-freed mmu_rb nodes will be dereferenced when those SDMA requests
   complete.

Fixes: 7be85676f1 ("IB/hfi1: Don't remove RB entry when not needed.")
Fixes: 7724105686 ("IB/hfi1: add driver files")
Signed-off-by: Brendan Cunningham <bcunningham@cornelisnetworks.com>
Signed-off-by: Patrick Kelsey <pat.kelsey@cornelisnetworks.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com>
Link: https://lore.kernel.org/r/168088636445.3027109.10054635277810177889.stgit@252.162.96.66.static.eigbox.net
Signed-off-by: Leon Romanovsky <leon@kernel.org>
This commit is contained in:
Patrick Kelsey 2023-04-07 12:52:44 -04:00 committed by Leon Romanovsky
parent 9fe8fec5e4
commit 00cbce5cbf
11 changed files with 423 additions and 304 deletions

View File

@ -215,6 +215,7 @@ static int hfi1_ipoib_build_ulp_payload(struct ipoib_txreq *tx,
const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
ret = sdma_txadd_page(dd, ret = sdma_txadd_page(dd,
NULL,
txreq, txreq,
skb_frag_page(frag), skb_frag_page(frag),
frag->bv_offset, frag->bv_offset,

View File

@ -126,7 +126,7 @@ int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler,
spin_lock_irqsave(&handler->lock, flags); spin_lock_irqsave(&handler->lock, flags);
node = __mmu_rb_search(handler, mnode->addr, mnode->len); node = __mmu_rb_search(handler, mnode->addr, mnode->len);
if (node) { if (node) {
ret = -EINVAL; ret = -EEXIST;
goto unlock; goto unlock;
} }
__mmu_int_rb_insert(mnode, &handler->root); __mmu_int_rb_insert(mnode, &handler->root);
@ -143,6 +143,19 @@ unlock:
return ret; return ret;
} }
/* Caller must hold handler lock */
struct mmu_rb_node *hfi1_mmu_rb_get_first(struct mmu_rb_handler *handler,
unsigned long addr, unsigned long len)
{
struct mmu_rb_node *node;
trace_hfi1_mmu_rb_search(addr, len);
node = __mmu_int_rb_iter_first(&handler->root, addr, (addr + len) - 1);
if (node)
list_move_tail(&node->list, &handler->lru_list);
return node;
}
/* Caller must hold handler lock */ /* Caller must hold handler lock */
static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler, static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler,
unsigned long addr, unsigned long addr,
@ -167,34 +180,6 @@ static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler,
return node; return node;
} }
bool hfi1_mmu_rb_remove_unless_exact(struct mmu_rb_handler *handler,
unsigned long addr, unsigned long len,
struct mmu_rb_node **rb_node)
{
struct mmu_rb_node *node;
unsigned long flags;
bool ret = false;
if (current->mm != handler->mn.mm)
return ret;
spin_lock_irqsave(&handler->lock, flags);
node = __mmu_rb_search(handler, addr, len);
if (node) {
if (node->addr == addr && node->len == len) {
list_move_tail(&node->list, &handler->lru_list);
goto unlock;
}
__mmu_int_rb_remove(node, &handler->root);
list_del(&node->list); /* remove from LRU list */
ret = true;
}
unlock:
spin_unlock_irqrestore(&handler->lock, flags);
*rb_node = node;
return ret;
}
void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg) void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg)
{ {
struct mmu_rb_node *rbnode, *ptr; struct mmu_rb_node *rbnode, *ptr;
@ -225,29 +210,6 @@ void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg)
} }
} }
/*
* It is up to the caller to ensure that this function does not race with the
* mmu invalidate notifier which may be calling the users remove callback on
* 'node'.
*/
void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler,
struct mmu_rb_node *node)
{
unsigned long flags;
if (current->mm != handler->mn.mm)
return;
/* Validity of handler and node pointers has been checked by caller. */
trace_hfi1_mmu_rb_remove(node->addr, node->len);
spin_lock_irqsave(&handler->lock, flags);
__mmu_int_rb_remove(node, &handler->root);
list_del(&node->list); /* remove from LRU list */
spin_unlock_irqrestore(&handler->lock, flags);
handler->ops->remove(handler->ops_arg, node);
}
static int mmu_notifier_range_start(struct mmu_notifier *mn, static int mmu_notifier_range_start(struct mmu_notifier *mn,
const struct mmu_notifier_range *range) const struct mmu_notifier_range *range)
{ {

View File

@ -52,10 +52,8 @@ void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler);
int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler, int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler,
struct mmu_rb_node *mnode); struct mmu_rb_node *mnode);
void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg); void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg);
void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler, struct mmu_rb_node *hfi1_mmu_rb_get_first(struct mmu_rb_handler *handler,
struct mmu_rb_node *mnode); unsigned long addr,
bool hfi1_mmu_rb_remove_unless_exact(struct mmu_rb_handler *handler, unsigned long len);
unsigned long addr, unsigned long len,
struct mmu_rb_node **rb_node);
#endif /* _HFI1_MMU_RB_H */ #endif /* _HFI1_MMU_RB_H */

View File

@ -1593,22 +1593,7 @@ static inline void sdma_unmap_desc(
struct hfi1_devdata *dd, struct hfi1_devdata *dd,
struct sdma_desc *descp) struct sdma_desc *descp)
{ {
switch (sdma_mapping_type(descp)) { system_descriptor_complete(dd, descp);
case SDMA_MAP_SINGLE:
dma_unmap_single(
&dd->pcidev->dev,
sdma_mapping_addr(descp),
sdma_mapping_len(descp),
DMA_TO_DEVICE);
break;
case SDMA_MAP_PAGE:
dma_unmap_page(
&dd->pcidev->dev,
sdma_mapping_addr(descp),
sdma_mapping_len(descp),
DMA_TO_DEVICE);
break;
}
} }
/* /*
@ -3128,7 +3113,7 @@ int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx,
/* Add descriptor for coalesce buffer */ /* Add descriptor for coalesce buffer */
tx->desc_limit = MAX_DESC; tx->desc_limit = MAX_DESC;
return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, tx, return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, NULL, tx,
addr, tx->tlen); addr, tx->tlen);
} }
@ -3167,10 +3152,12 @@ int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
return rval; return rval;
} }
} }
/* finish the one just added */ /* finish the one just added */
make_tx_sdma_desc( make_tx_sdma_desc(
tx, tx,
SDMA_MAP_NONE, SDMA_MAP_NONE,
NULL,
dd->sdma_pad_phys, dd->sdma_pad_phys,
sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1))); sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1)));
tx->num_desc++; tx->num_desc++;

View File

@ -594,6 +594,7 @@ static inline dma_addr_t sdma_mapping_addr(struct sdma_desc *d)
static inline void make_tx_sdma_desc( static inline void make_tx_sdma_desc(
struct sdma_txreq *tx, struct sdma_txreq *tx,
int type, int type,
void *pinning_ctx,
dma_addr_t addr, dma_addr_t addr,
size_t len) size_t len)
{ {
@ -612,6 +613,7 @@ static inline void make_tx_sdma_desc(
<< SDMA_DESC0_PHY_ADDR_SHIFT) | << SDMA_DESC0_PHY_ADDR_SHIFT) |
(((u64)len & SDMA_DESC0_BYTE_COUNT_MASK) (((u64)len & SDMA_DESC0_BYTE_COUNT_MASK)
<< SDMA_DESC0_BYTE_COUNT_SHIFT); << SDMA_DESC0_BYTE_COUNT_SHIFT);
desc->pinning_ctx = pinning_ctx;
} }
/* helper to extend txreq */ /* helper to extend txreq */
@ -643,6 +645,7 @@ static inline void _sdma_close_tx(struct hfi1_devdata *dd,
static inline int _sdma_txadd_daddr( static inline int _sdma_txadd_daddr(
struct hfi1_devdata *dd, struct hfi1_devdata *dd,
int type, int type,
void *pinning_ctx,
struct sdma_txreq *tx, struct sdma_txreq *tx,
dma_addr_t addr, dma_addr_t addr,
u16 len) u16 len)
@ -652,6 +655,7 @@ static inline int _sdma_txadd_daddr(
make_tx_sdma_desc( make_tx_sdma_desc(
tx, tx,
type, type,
pinning_ctx,
addr, len); addr, len);
WARN_ON(len > tx->tlen); WARN_ON(len > tx->tlen);
tx->num_desc++; tx->num_desc++;
@ -672,6 +676,7 @@ static inline int _sdma_txadd_daddr(
/** /**
* sdma_txadd_page() - add a page to the sdma_txreq * sdma_txadd_page() - add a page to the sdma_txreq
* @dd: the device to use for mapping * @dd: the device to use for mapping
* @pinning_ctx: context to be released at descriptor retirement
* @tx: tx request to which the page is added * @tx: tx request to which the page is added
* @page: page to map * @page: page to map
* @offset: offset within the page * @offset: offset within the page
@ -687,6 +692,7 @@ static inline int _sdma_txadd_daddr(
*/ */
static inline int sdma_txadd_page( static inline int sdma_txadd_page(
struct hfi1_devdata *dd, struct hfi1_devdata *dd,
void *pinning_ctx,
struct sdma_txreq *tx, struct sdma_txreq *tx,
struct page *page, struct page *page,
unsigned long offset, unsigned long offset,
@ -714,8 +720,7 @@ static inline int sdma_txadd_page(
return -ENOSPC; return -ENOSPC;
} }
return _sdma_txadd_daddr( return _sdma_txadd_daddr(dd, SDMA_MAP_PAGE, pinning_ctx, tx, addr, len);
dd, SDMA_MAP_PAGE, tx, addr, len);
} }
/** /**
@ -749,7 +754,8 @@ static inline int sdma_txadd_daddr(
return rval; return rval;
} }
return _sdma_txadd_daddr(dd, SDMA_MAP_NONE, tx, addr, len); return _sdma_txadd_daddr(dd, SDMA_MAP_NONE, NULL, tx,
addr, len);
} }
/** /**
@ -795,8 +801,7 @@ static inline int sdma_txadd_kvaddr(
return -ENOSPC; return -ENOSPC;
} }
return _sdma_txadd_daddr( return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, NULL, tx, addr, len);
dd, SDMA_MAP_SINGLE, tx, addr, len);
} }
struct iowait_work; struct iowait_work;
@ -1030,4 +1035,5 @@ extern uint mod_num_sdma;
void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid); void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid);
void system_descriptor_complete(struct hfi1_devdata *dd, struct sdma_desc *descp);
#endif #endif

View File

@ -19,6 +19,7 @@
struct sdma_desc { struct sdma_desc {
/* private: don't use directly */ /* private: don't use directly */
u64 qw[2]; u64 qw[2];
void *pinning_ctx;
}; };
/** /**

View File

@ -37,10 +37,6 @@ DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_rb_search,
TP_PROTO(unsigned long addr, unsigned long len), TP_PROTO(unsigned long addr, unsigned long len),
TP_ARGS(addr, len)); TP_ARGS(addr, len));
DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_rb_remove,
TP_PROTO(unsigned long addr, unsigned long len),
TP_ARGS(addr, len));
DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_mem_invalidate, DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_mem_invalidate,
TP_PROTO(unsigned long addr, unsigned long len), TP_PROTO(unsigned long addr, unsigned long len),
TP_ARGS(addr, len)); TP_ARGS(addr, len));

View File

@ -24,7 +24,6 @@
#include "hfi.h" #include "hfi.h"
#include "sdma.h" #include "sdma.h"
#include "mmu_rb.h"
#include "user_sdma.h" #include "user_sdma.h"
#include "verbs.h" /* for the headers */ #include "verbs.h" /* for the headers */
#include "common.h" /* for struct hfi1_tid_info */ #include "common.h" /* for struct hfi1_tid_info */
@ -39,11 +38,7 @@ static unsigned initial_pkt_count = 8;
static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts); static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status); static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq); static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
static void user_sdma_free_request(struct user_sdma_request *req, bool unpin); static void user_sdma_free_request(struct user_sdma_request *req);
static int pin_vector_pages(struct user_sdma_request *req,
struct user_sdma_iovec *iovec);
static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
unsigned start, unsigned npages);
static int check_header_template(struct user_sdma_request *req, static int check_header_template(struct user_sdma_request *req,
struct hfi1_pkt_header *hdr, u32 lrhlen, struct hfi1_pkt_header *hdr, u32 lrhlen,
u32 datalen); u32 datalen);
@ -81,6 +76,11 @@ static struct mmu_rb_ops sdma_rb_ops = {
.invalidate = sdma_rb_invalidate .invalidate = sdma_rb_invalidate
}; };
static int add_system_pages_to_sdma_packet(struct user_sdma_request *req,
struct user_sdma_txreq *tx,
struct user_sdma_iovec *iovec,
u32 *pkt_remaining);
static int defer_packet_queue( static int defer_packet_queue(
struct sdma_engine *sde, struct sdma_engine *sde,
struct iowait_work *wait, struct iowait_work *wait,
@ -410,6 +410,7 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
ret = -EINVAL; ret = -EINVAL;
goto free_req; goto free_req;
} }
/* Copy the header from the user buffer */ /* Copy the header from the user buffer */
ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info), ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
sizeof(req->hdr)); sizeof(req->hdr));
@ -484,9 +485,8 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
memcpy(&req->iovs[i].iov, memcpy(&req->iovs[i].iov,
iovec + idx++, iovec + idx++,
sizeof(req->iovs[i].iov)); sizeof(req->iovs[i].iov));
ret = pin_vector_pages(req, &req->iovs[i]); if (req->iovs[i].iov.iov_len == 0) {
if (ret) { ret = -EINVAL;
req->data_iovs = i;
goto free_req; goto free_req;
} }
req->data_len += req->iovs[i].iov.iov_len; req->data_len += req->iovs[i].iov.iov_len;
@ -584,7 +584,7 @@ free_req:
if (req->seqsubmitted) if (req->seqsubmitted)
wait_event(pq->busy.wait_dma, wait_event(pq->busy.wait_dma,
(req->seqcomp == req->seqsubmitted - 1)); (req->seqcomp == req->seqsubmitted - 1));
user_sdma_free_request(req, true); user_sdma_free_request(req);
pq_update(pq); pq_update(pq);
set_comp_state(pq, cq, info.comp_idx, ERROR, ret); set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
} }
@ -696,48 +696,6 @@ static int user_sdma_txadd_ahg(struct user_sdma_request *req,
return ret; return ret;
} }
static int user_sdma_txadd(struct user_sdma_request *req,
struct user_sdma_txreq *tx,
struct user_sdma_iovec *iovec, u32 datalen,
u32 *queued_ptr, u32 *data_sent_ptr,
u64 *iov_offset_ptr)
{
int ret;
unsigned int pageidx, len;
unsigned long base, offset;
u64 iov_offset = *iov_offset_ptr;
u32 queued = *queued_ptr, data_sent = *data_sent_ptr;
struct hfi1_user_sdma_pkt_q *pq = req->pq;
base = (unsigned long)iovec->iov.iov_base;
offset = offset_in_page(base + iovec->offset + iov_offset);
pageidx = (((iovec->offset + iov_offset + base) - (base & PAGE_MASK)) >>
PAGE_SHIFT);
len = offset + req->info.fragsize > PAGE_SIZE ?
PAGE_SIZE - offset : req->info.fragsize;
len = min((datalen - queued), len);
ret = sdma_txadd_page(pq->dd, &tx->txreq, iovec->pages[pageidx],
offset, len);
if (ret) {
SDMA_DBG(req, "SDMA txreq add page failed %d\n", ret);
return ret;
}
iov_offset += len;
queued += len;
data_sent += len;
if (unlikely(queued < datalen && pageidx == iovec->npages &&
req->iov_idx < req->data_iovs - 1)) {
iovec->offset += iov_offset;
iovec = &req->iovs[++req->iov_idx];
iov_offset = 0;
}
*queued_ptr = queued;
*data_sent_ptr = data_sent;
*iov_offset_ptr = iov_offset;
return ret;
}
static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts) static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
{ {
int ret = 0; int ret = 0;
@ -769,8 +727,7 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
maxpkts = req->info.npkts - req->seqnum; maxpkts = req->info.npkts - req->seqnum;
while (npkts < maxpkts) { while (npkts < maxpkts) {
u32 datalen = 0, queued = 0, data_sent = 0; u32 datalen = 0;
u64 iov_offset = 0;
/* /*
* Check whether any of the completions have come back * Check whether any of the completions have come back
@ -863,27 +820,17 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
goto free_txreq; goto free_txreq;
} }
/*
* If the request contains any data vectors, add up to
* fragsize bytes to the descriptor.
*/
while (queued < datalen &&
(req->sent + data_sent) < req->data_len) {
ret = user_sdma_txadd(req, tx, iovec, datalen,
&queued, &data_sent, &iov_offset);
if (ret)
goto free_txreq;
}
/*
* The txreq was submitted successfully so we can update
* the counters.
*/
req->koffset += datalen; req->koffset += datalen;
if (req_opcode(req->info.ctrl) == EXPECTED) if (req_opcode(req->info.ctrl) == EXPECTED)
req->tidoffset += datalen; req->tidoffset += datalen;
req->sent += data_sent; req->sent += datalen;
if (req->data_len) while (datalen) {
iovec->offset += iov_offset; ret = add_system_pages_to_sdma_packet(req, tx, iovec,
&datalen);
if (ret)
goto free_txreq;
iovec = &req->iovs[req->iov_idx];
}
list_add_tail(&tx->txreq.list, &req->txps); list_add_tail(&tx->txreq.list, &req->txps);
/* /*
* It is important to increment this here as it is used to * It is important to increment this here as it is used to
@ -920,133 +867,14 @@ free_tx:
static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages) static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
{ {
struct evict_data evict_data; struct evict_data evict_data;
struct mmu_rb_handler *handler = pq->handler;
evict_data.cleared = 0; evict_data.cleared = 0;
evict_data.target = npages; evict_data.target = npages;
hfi1_mmu_rb_evict(pq->handler, &evict_data); hfi1_mmu_rb_evict(handler, &evict_data);
return evict_data.cleared; return evict_data.cleared;
} }
static int pin_sdma_pages(struct user_sdma_request *req,
struct user_sdma_iovec *iovec,
struct sdma_mmu_node *node,
int npages)
{
int pinned, cleared;
struct page **pages;
struct hfi1_user_sdma_pkt_q *pq = req->pq;
pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
if (!pages)
return -ENOMEM;
memcpy(pages, node->pages, node->npages * sizeof(*pages));
npages -= node->npages;
retry:
if (!hfi1_can_pin_pages(pq->dd, current->mm,
atomic_read(&pq->n_locked), npages)) {
cleared = sdma_cache_evict(pq, npages);
if (cleared >= npages)
goto retry;
}
pinned = hfi1_acquire_user_pages(current->mm,
((unsigned long)iovec->iov.iov_base +
(node->npages * PAGE_SIZE)), npages, 0,
pages + node->npages);
if (pinned < 0) {
kfree(pages);
return pinned;
}
if (pinned != npages) {
unpin_vector_pages(current->mm, pages, node->npages, pinned);
return -EFAULT;
}
kfree(node->pages);
node->rb.len = iovec->iov.iov_len;
node->pages = pages;
atomic_add(pinned, &pq->n_locked);
return pinned;
}
static void unpin_sdma_pages(struct sdma_mmu_node *node)
{
if (node->npages) {
unpin_vector_pages(mm_from_sdma_node(node), node->pages, 0,
node->npages);
atomic_sub(node->npages, &node->pq->n_locked);
}
}
static int pin_vector_pages(struct user_sdma_request *req,
struct user_sdma_iovec *iovec)
{
int ret = 0, pinned, npages;
struct hfi1_user_sdma_pkt_q *pq = req->pq;
struct sdma_mmu_node *node = NULL;
struct mmu_rb_node *rb_node;
struct iovec *iov;
bool extracted;
extracted =
hfi1_mmu_rb_remove_unless_exact(pq->handler,
(unsigned long)
iovec->iov.iov_base,
iovec->iov.iov_len, &rb_node);
if (rb_node) {
node = container_of(rb_node, struct sdma_mmu_node, rb);
if (!extracted) {
atomic_inc(&node->refcount);
iovec->pages = node->pages;
iovec->npages = node->npages;
iovec->node = node;
return 0;
}
}
if (!node) {
node = kzalloc(sizeof(*node), GFP_KERNEL);
if (!node)
return -ENOMEM;
node->rb.addr = (unsigned long)iovec->iov.iov_base;
node->pq = pq;
atomic_set(&node->refcount, 0);
}
iov = &iovec->iov;
npages = num_user_pages((unsigned long)iov->iov_base, iov->iov_len);
if (node->npages < npages) {
pinned = pin_sdma_pages(req, iovec, node, npages);
if (pinned < 0) {
ret = pinned;
goto bail;
}
node->npages += pinned;
npages = node->npages;
}
iovec->pages = node->pages;
iovec->npages = npages;
iovec->node = node;
ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb);
if (ret) {
iovec->node = NULL;
goto bail;
}
return 0;
bail:
unpin_sdma_pages(node);
kfree(node);
return ret;
}
static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
unsigned start, unsigned npages)
{
hfi1_release_user_pages(mm, pages + start, npages, false);
kfree(pages);
}
static int check_header_template(struct user_sdma_request *req, static int check_header_template(struct user_sdma_request *req,
struct hfi1_pkt_header *hdr, u32 lrhlen, struct hfi1_pkt_header *hdr, u32 lrhlen,
u32 datalen) u32 datalen)
@ -1388,7 +1216,7 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
if (req->seqcomp != req->info.npkts - 1) if (req->seqcomp != req->info.npkts - 1)
return; return;
user_sdma_free_request(req, false); user_sdma_free_request(req);
set_comp_state(pq, cq, req->info.comp_idx, state, status); set_comp_state(pq, cq, req->info.comp_idx, state, status);
pq_update(pq); pq_update(pq);
} }
@ -1399,10 +1227,8 @@ static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
wake_up(&pq->wait); wake_up(&pq->wait);
} }
static void user_sdma_free_request(struct user_sdma_request *req, bool unpin) static void user_sdma_free_request(struct user_sdma_request *req)
{ {
int i;
if (!list_empty(&req->txps)) { if (!list_empty(&req->txps)) {
struct sdma_txreq *t, *p; struct sdma_txreq *t, *p;
@ -1415,21 +1241,6 @@ static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
} }
} }
for (i = 0; i < req->data_iovs; i++) {
struct sdma_mmu_node *node = req->iovs[i].node;
if (!node)
continue;
req->iovs[i].node = NULL;
if (unpin)
hfi1_mmu_rb_remove(req->pq->handler,
&node->rb);
else
atomic_dec(&node->refcount);
}
kfree(req->tids); kfree(req->tids);
clear_bit(req->info.comp_idx, req->pq->req_in_use); clear_bit(req->info.comp_idx, req->pq->req_in_use);
} }
@ -1447,6 +1258,368 @@ static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
idx, state, ret); idx, state, ret);
} }
static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
unsigned int start, unsigned int npages)
{
hfi1_release_user_pages(mm, pages + start, npages, false);
kfree(pages);
}
static void free_system_node(struct sdma_mmu_node *node)
{
if (node->npages) {
unpin_vector_pages(mm_from_sdma_node(node), node->pages, 0,
node->npages);
atomic_sub(node->npages, &node->pq->n_locked);
}
kfree(node);
}
static inline void acquire_node(struct sdma_mmu_node *node)
{
atomic_inc(&node->refcount);
WARN_ON(atomic_read(&node->refcount) < 0);
}
static inline void release_node(struct mmu_rb_handler *handler,
struct sdma_mmu_node *node)
{
atomic_dec(&node->refcount);
WARN_ON(atomic_read(&node->refcount) < 0);
}
static struct sdma_mmu_node *find_system_node(struct mmu_rb_handler *handler,
unsigned long start,
unsigned long end)
{
struct mmu_rb_node *rb_node;
struct sdma_mmu_node *node;
unsigned long flags;
spin_lock_irqsave(&handler->lock, flags);
rb_node = hfi1_mmu_rb_get_first(handler, start, (end - start));
if (!rb_node) {
spin_unlock_irqrestore(&handler->lock, flags);
return NULL;
}
node = container_of(rb_node, struct sdma_mmu_node, rb);
acquire_node(node);
spin_unlock_irqrestore(&handler->lock, flags);
return node;
}
static int pin_system_pages(struct user_sdma_request *req,
uintptr_t start_address, size_t length,
struct sdma_mmu_node *node, int npages)
{
struct hfi1_user_sdma_pkt_q *pq = req->pq;
int pinned, cleared;
struct page **pages;
pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
if (!pages)
return -ENOMEM;
retry:
if (!hfi1_can_pin_pages(pq->dd, current->mm, atomic_read(&pq->n_locked),
npages)) {
SDMA_DBG(req, "Evicting: nlocked %u npages %u",
atomic_read(&pq->n_locked), npages);
cleared = sdma_cache_evict(pq, npages);
if (cleared >= npages)
goto retry;
}
SDMA_DBG(req, "Acquire user pages start_address %lx node->npages %u npages %u",
start_address, node->npages, npages);
pinned = hfi1_acquire_user_pages(current->mm, start_address, npages, 0,
pages);
if (pinned < 0) {
kfree(pages);
SDMA_DBG(req, "pinned %d", pinned);
return pinned;
}
if (pinned != npages) {
unpin_vector_pages(current->mm, pages, node->npages, pinned);
SDMA_DBG(req, "npages %u pinned %d", npages, pinned);
return -EFAULT;
}
node->rb.addr = start_address;
node->rb.len = length;
node->pages = pages;
node->npages = npages;
atomic_add(pinned, &pq->n_locked);
SDMA_DBG(req, "done. pinned %d", pinned);
return 0;
}
static int add_system_pinning(struct user_sdma_request *req,
struct sdma_mmu_node **node_p,
unsigned long start, unsigned long len)
{
struct hfi1_user_sdma_pkt_q *pq = req->pq;
struct sdma_mmu_node *node;
int ret;
node = kzalloc(sizeof(*node), GFP_KERNEL);
if (!node)
return -ENOMEM;
node->pq = pq;
ret = pin_system_pages(req, start, len, node, PFN_DOWN(len));
if (ret == 0) {
ret = hfi1_mmu_rb_insert(pq->handler, &node->rb);
if (ret)
free_system_node(node);
else
*node_p = node;
return ret;
}
kfree(node);
return ret;
}
static int get_system_cache_entry(struct user_sdma_request *req,
struct sdma_mmu_node **node_p,
size_t req_start, size_t req_len)
{
struct hfi1_user_sdma_pkt_q *pq = req->pq;
u64 start = ALIGN_DOWN(req_start, PAGE_SIZE);
u64 end = PFN_ALIGN(req_start + req_len);
struct mmu_rb_handler *handler = pq->handler;
int ret;
if ((end - start) == 0) {
SDMA_DBG(req,
"Request for empty cache entry req_start %lx req_len %lx start %llx end %llx",
req_start, req_len, start, end);
return -EINVAL;
}
SDMA_DBG(req, "req_start %lx req_len %lu", req_start, req_len);
while (1) {
struct sdma_mmu_node *node =
find_system_node(handler, start, end);
u64 prepend_len = 0;
SDMA_DBG(req, "node %p start %llx end %llu", node, start, end);
if (!node) {
ret = add_system_pinning(req, node_p, start,
end - start);
if (ret == -EEXIST) {
/*
* Another execution context has inserted a
* conficting entry first.
*/
continue;
}
return ret;
}
if (node->rb.addr <= start) {
/*
* This entry covers at least part of the region. If it doesn't extend
* to the end, then this will be called again for the next segment.
*/
*node_p = node;
return 0;
}
SDMA_DBG(req, "prepend: node->rb.addr %lx, node->refcount %d",
node->rb.addr, atomic_read(&node->refcount));
prepend_len = node->rb.addr - start;
/*
* This node will not be returned, instead a new node
* will be. So release the reference.
*/
release_node(handler, node);
/* Prepend a node to cover the beginning of the allocation */
ret = add_system_pinning(req, node_p, start, prepend_len);
if (ret == -EEXIST) {
/* Another execution context has inserted a conficting entry first. */
continue;
}
return ret;
}
}
static int add_mapping_to_sdma_packet(struct user_sdma_request *req,
struct user_sdma_txreq *tx,
struct sdma_mmu_node *cache_entry,
size_t start,
size_t from_this_cache_entry)
{
struct hfi1_user_sdma_pkt_q *pq = req->pq;
unsigned int page_offset;
unsigned int from_this_page;
size_t page_index;
void *ctx;
int ret;
/*
* Because the cache may be more fragmented than the memory that is being accessed,
* it's not strictly necessary to have a descriptor per cache entry.
*/
while (from_this_cache_entry) {
page_index = PFN_DOWN(start - cache_entry->rb.addr);
if (page_index >= cache_entry->npages) {
SDMA_DBG(req,
"Request for page_index %zu >= cache_entry->npages %u",
page_index, cache_entry->npages);
return -EINVAL;
}
page_offset = start - ALIGN_DOWN(start, PAGE_SIZE);
from_this_page = PAGE_SIZE - page_offset;
if (from_this_page < from_this_cache_entry) {
ctx = NULL;
} else {
/*
* In the case they are equal the next line has no practical effect,
* but it's better to do a register to register copy than a conditional
* branch.
*/
from_this_page = from_this_cache_entry;
ctx = cache_entry;
}
ret = sdma_txadd_page(pq->dd, ctx, &tx->txreq,
cache_entry->pages[page_index],
page_offset, from_this_page);
if (ret) {
/*
* When there's a failure, the entire request is freed by
* user_sdma_send_pkts().
*/
SDMA_DBG(req,
"sdma_txadd_page failed %d page_index %lu page_offset %u from_this_page %u",
ret, page_index, page_offset, from_this_page);
return ret;
}
start += from_this_page;
from_this_cache_entry -= from_this_page;
}
return 0;
}
static int add_system_iovec_to_sdma_packet(struct user_sdma_request *req,
struct user_sdma_txreq *tx,
struct user_sdma_iovec *iovec,
size_t from_this_iovec)
{
struct mmu_rb_handler *handler = req->pq->handler;
while (from_this_iovec > 0) {
struct sdma_mmu_node *cache_entry;
size_t from_this_cache_entry;
size_t start;
int ret;
start = (uintptr_t)iovec->iov.iov_base + iovec->offset;
ret = get_system_cache_entry(req, &cache_entry, start,
from_this_iovec);
if (ret) {
SDMA_DBG(req, "pin system segment failed %d", ret);
return ret;
}
from_this_cache_entry = cache_entry->rb.len - (start - cache_entry->rb.addr);
if (from_this_cache_entry > from_this_iovec)
from_this_cache_entry = from_this_iovec;
ret = add_mapping_to_sdma_packet(req, tx, cache_entry, start,
from_this_cache_entry);
if (ret) {
/*
* We're guaranteed that there will be no descriptor
* completion callback that releases this node
* because only the last descriptor referencing it
* has a context attached, and a failure means the
* last descriptor was never added.
*/
release_node(handler, cache_entry);
SDMA_DBG(req, "add system segment failed %d", ret);
return ret;
}
iovec->offset += from_this_cache_entry;
from_this_iovec -= from_this_cache_entry;
}
return 0;
}
static int add_system_pages_to_sdma_packet(struct user_sdma_request *req,
struct user_sdma_txreq *tx,
struct user_sdma_iovec *iovec,
u32 *pkt_data_remaining)
{
size_t remaining_to_add = *pkt_data_remaining;
/*
* Walk through iovec entries, ensure the associated pages
* are pinned and mapped, add data to the packet until no more
* data remains to be added.
*/
while (remaining_to_add > 0) {
struct user_sdma_iovec *cur_iovec;
size_t from_this_iovec;
int ret;
cur_iovec = iovec;
from_this_iovec = iovec->iov.iov_len - iovec->offset;
if (from_this_iovec > remaining_to_add) {
from_this_iovec = remaining_to_add;
} else {
/* The current iovec entry will be consumed by this pass. */
req->iov_idx++;
iovec++;
}
ret = add_system_iovec_to_sdma_packet(req, tx, cur_iovec,
from_this_iovec);
if (ret)
return ret;
remaining_to_add -= from_this_iovec;
}
*pkt_data_remaining = remaining_to_add;
return 0;
}
void system_descriptor_complete(struct hfi1_devdata *dd,
struct sdma_desc *descp)
{
switch (sdma_mapping_type(descp)) {
case SDMA_MAP_SINGLE:
dma_unmap_single(&dd->pcidev->dev, sdma_mapping_addr(descp),
sdma_mapping_len(descp), DMA_TO_DEVICE);
break;
case SDMA_MAP_PAGE:
dma_unmap_page(&dd->pcidev->dev, sdma_mapping_addr(descp),
sdma_mapping_len(descp), DMA_TO_DEVICE);
break;
}
if (descp->pinning_ctx) {
struct sdma_mmu_node *node = descp->pinning_ctx;
release_node(node->rb.handler, node);
}
}
static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
unsigned long len) unsigned long len)
{ {
@ -1493,8 +1666,7 @@ static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
struct sdma_mmu_node *node = struct sdma_mmu_node *node =
container_of(mnode, struct sdma_mmu_node, rb); container_of(mnode, struct sdma_mmu_node, rb);
unpin_sdma_pages(node); free_system_node(node);
kfree(node);
} }
static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode) static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode)

View File

@ -112,16 +112,11 @@ struct sdma_mmu_node {
struct user_sdma_iovec { struct user_sdma_iovec {
struct list_head list; struct list_head list;
struct iovec iov; struct iovec iov;
/* number of pages in this vector */
unsigned int npages;
/* array of pinned pages for this vector */
struct page **pages;
/* /*
* offset into the virtual address space of the vector at * offset into the virtual address space of the vector at
* which we last left off. * which we last left off.
*/ */
u64 offset; u64 offset;
struct sdma_mmu_node *node;
}; };
/* evict operation argument */ /* evict operation argument */

View File

@ -778,8 +778,8 @@ static int build_verbs_tx_desc(
/* add icrc, lt byte, and padding to flit */ /* add icrc, lt byte, and padding to flit */
if (extra_bytes) if (extra_bytes)
ret = sdma_txadd_daddr(sde->dd, &tx->txreq, ret = sdma_txadd_daddr(sde->dd, &tx->txreq, sde->dd->sdma_pad_phys,
sde->dd->sdma_pad_phys, extra_bytes); extra_bytes);
bail_txadd: bail_txadd:
return ret; return ret;

View File

@ -64,6 +64,7 @@ static noinline int build_vnic_ulp_payload(struct sdma_engine *sde,
/* combine physically continuous fragments later? */ /* combine physically continuous fragments later? */
ret = sdma_txadd_page(sde->dd, ret = sdma_txadd_page(sde->dd,
NULL,
&tx->txreq, &tx->txreq,
skb_frag_page(frag), skb_frag_page(frag),
skb_frag_off(frag), skb_frag_off(frag),