/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
/*
 * Copyright(c) 2016 Intel Corporation.
*/
#ifndef HFI1_SDMA_TXREQ_H
#define HFI1_SDMA_TXREQ_H

/* increased for AHG */
#define NUM_DESC 6
/*
* struct sdma_desc - canonical fragment descriptor
*
* This is the descriptor carried in the tx request
 * corresponding to each fragment.
*
*/
struct sdma_desc {
	/* private: don't use directly */
	u64 qw[2];
	void *pinning_ctx;
	/* Release reference to @pinning_ctx. May be called in interrupt
	 * context. Must not sleep.
	 */
	void (*ctx_put)(void *ctx);
};
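
/*
 * Illustrative sketch only, not part of this header's API: one way a caller
 * could back @pinning_ctx with a refcounted object whose release path honors
 * the @ctx_put contract above (callable from interrupt context, must not
 * sleep).  All example_* names are hypothetical; <linux/kref.h>,
 * <linux/slab.h> and container_of() are assumed to be available, and the
 * kref is assumed to be kref_init()'d when the buffer is pinned.
 */
struct example_pin_ctx {
	struct kref ref;	/* lifetime of the pinned-buffer bookkeeping */
};

static void example_pin_ctx_release(struct kref *ref)
{
	/* kfree() may be called from interrupt context and does not sleep */
	kfree(container_of(ref, struct example_pin_ctx, ref));
}

static inline void example_ctx_put(void *ctx)
{
	struct example_pin_ctx *pctx = ctx;

	/* drop a reference; the context is freed when the last one goes away */
	kref_put(&pctx->ref, example_pin_ctx_release);
}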
/**
 * struct sdma_txreq - the sdma_txreq structure (one per packet)
 * @list: for use by user and by queuing for wait
 *
 * This is the representation of a packet which consists of some
 * number of fragments.  Storage is provided within the structure
 * for all fragments.
 *
 * The storage for the descriptors is automatically extended as needed
 * when the current allocation is exceeded.
 *
 * The user (Verbs or PSM) may overload this structure with fields
 * specific to their use by putting this struct first in their struct
 * (an illustrative embedding sketch follows the structure definition below).
 * The method of allocation of the overloaded structure is user dependent.
 *
 * The list is the only public field in the structure.
 *
 */
#define SDMA_TXREQ_S_OK        0
#define SDMA_TXREQ_S_SENDERROR 1
#define SDMA_TXREQ_S_ABORTED   2
#define SDMA_TXREQ_S_SHUTDOWN  3

/* flags bits */
#define SDMA_TXREQ_F_URGENT   0x0001
#define SDMA_TXREQ_F_AHG_COPY 0x0002
#define SDMA_TXREQ_F_USE_AHG  0x0004
#define SDMA_TXREQ_F_VIP      0x0010
struct sdma_txreq;
typedef void (*callback_t)(struct sdma_txreq *, int);
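
/*
 * Illustrative sketch only: a completion handler with the callback_t
 * signature.  The name is hypothetical; @status is one of the
 * SDMA_TXREQ_S_* codes defined above.
 */
static inline void example_sdma_complete(struct sdma_txreq *tx, int status)
{
	if (status != SDMA_TXREQ_S_OK) {
		/* the request was aborted, hit a send error, or the engine
		 * shut down; clean up without expecting the payload to have
		 * been transmitted
		 */
	}
	/* release any resources the caller associated with @tx here */
}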
struct iowait;
struct sdma_txreq {
	struct list_head list;
	/* private: */
	struct sdma_desc *descp;
	/* private: */
	void *coalesce_buf;
	/* private: */
	struct iowait *wait;
	/* private: */
	callback_t complete;
#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
	u64 sn;
#endif
	/* private: - used in coalesce/pad processing */
	u16 packet_len;
	/* private: - down-counted to trigger last */
	u16 tlen;
	/* private: */
	u16 num_desc;
	/* private: */
	u16 desc_limit;
	/* private: */
	u16 next_descq_idx;
	/* private: */
	u16 coalesce_idx;
	/* private: flags */
	u16 flags;
	/* private: */
	struct sdma_desc descs[NUM_DESC];
};
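
/*
 * Illustrative sketch only, as described in the kernel-doc above: a user
 * (e.g. Verbs or PSM) may overload sdma_txreq by placing it first in a
 * larger, user-allocated request structure.  The example_* names are
 * hypothetical and container_of() is assumed to be available.
 */
struct example_user_txreq {
	struct sdma_txreq txreq;	/* must be the first member */
	void *user_cookie;		/* user-specific per-request state */
};

static inline struct example_user_txreq *
example_user_txreq_from_sdma(struct sdma_txreq *tx)
{
	/* recover the enclosing request from the embedded sdma_txreq */
	return container_of(tx, struct example_user_txreq, txreq);
}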
static inline int sdma_txreq_built(struct sdma_txreq *tx)
{
	return tx->num_desc;
}
#endif /* HFI1_SDMA_TXREQ_H */