libeth: add Rx buffer management
Add a couple intuitive helpers to hide Rx buffer implementation details in the library and not multiplicate it between drivers. The settings are sorta optimized for 100G+ NICs, but nothing really HW-specific here. Use the new page_pool_dev_alloc() to dynamically switch between split-page and full-page modes depending on MTU, page size, required headroom etc. For example, on x86_64 with the default driver settings each page is shared between 2 buffers. Turning on XDP (not in this series) -> increasing headroom requirement pushes truesize out of 2048 boundary, leading to that each buffer starts getting a full page. The "ceiling" limit is %PAGE_SIZE, as only order-0 pages are used to avoid compound overhead. For the above architecture, this means maximum linear frame size of 3712 w/o XDP. Not that &libeth_buf_queue is not a complete queue/ring structure for now, rather a shim, but eventually the libeth-enabled drivers will move to it, with iavf being the first one. Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
This commit is contained in:
parent
ce230f4f89
commit
e6c91556b9
@ -3,6 +3,7 @@
|
||||
|
||||
config LIBETH
|
||||
tristate
|
||||
select PAGE_POOL
|
||||
help
|
||||
libeth is a common library containing routines shared between several
|
||||
drivers, but not yet promoted to the generic kernel API.
|
||||
|
@ -3,6 +3,104 @@
|
||||
|
||||
#include <net/libeth/rx.h>
|
||||
|
||||
/* Rx buffer management */
|
||||
|
||||
/**
|
||||
* libeth_rx_hw_len - get the actual buffer size to be passed to HW
|
||||
* @pp: &page_pool_params of the netdev to calculate the size for
|
||||
* @max_len: maximum buffer size for a single descriptor
|
||||
*
|
||||
* Return: HW-writeable length per one buffer to pass it to the HW accounting:
|
||||
* MTU the @dev has, HW required alignment, minimum and maximum allowed values,
|
||||
* and system's page size.
|
||||
*/
|
||||
static u32 libeth_rx_hw_len(const struct page_pool_params *pp, u32 max_len)
|
||||
{
|
||||
u32 len;
|
||||
|
||||
len = READ_ONCE(pp->netdev->mtu) + LIBETH_RX_LL_LEN;
|
||||
len = ALIGN(len, LIBETH_RX_BUF_STRIDE);
|
||||
len = min3(len, ALIGN_DOWN(max_len ? : U32_MAX, LIBETH_RX_BUF_STRIDE),
|
||||
pp->max_len);
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
|
||||
* libeth_rx_fq_create - create a PP with the default libeth settings
|
||||
* @fq: buffer queue struct to fill
|
||||
* @napi: &napi_struct covering this PP (no usage outside its poll loops)
|
||||
*
|
||||
* Return: %0 on success, -%errno on failure.
|
||||
*/
|
||||
int libeth_rx_fq_create(struct libeth_fq *fq, struct napi_struct *napi)
|
||||
{
|
||||
struct page_pool_params pp = {
|
||||
.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
|
||||
.order = LIBETH_RX_PAGE_ORDER,
|
||||
.pool_size = fq->count,
|
||||
.nid = fq->nid,
|
||||
.dev = napi->dev->dev.parent,
|
||||
.netdev = napi->dev,
|
||||
.napi = napi,
|
||||
.dma_dir = DMA_FROM_DEVICE,
|
||||
.offset = LIBETH_SKB_HEADROOM,
|
||||
};
|
||||
struct libeth_fqe *fqes;
|
||||
struct page_pool *pool;
|
||||
|
||||
/* HW-writeable / syncable length per one page */
|
||||
pp.max_len = LIBETH_RX_PAGE_LEN(pp.offset);
|
||||
|
||||
/* HW-writeable length per buffer */
|
||||
fq->buf_len = libeth_rx_hw_len(&pp, fq->buf_len);
|
||||
/* Buffer size to allocate */
|
||||
fq->truesize = roundup_pow_of_two(SKB_HEAD_ALIGN(pp.offset +
|
||||
fq->buf_len));
|
||||
|
||||
pool = page_pool_create(&pp);
|
||||
if (IS_ERR(pool))
|
||||
return PTR_ERR(pool);
|
||||
|
||||
fqes = kvcalloc_node(fq->count, sizeof(*fqes), GFP_KERNEL, fq->nid);
|
||||
if (!fqes)
|
||||
goto err_buf;
|
||||
|
||||
fq->fqes = fqes;
|
||||
fq->pp = pool;
|
||||
|
||||
return 0;
|
||||
|
||||
err_buf:
|
||||
page_pool_destroy(pool);
|
||||
|
||||
return -ENOMEM;
|
||||
}
|
||||
EXPORT_SYMBOL_NS_GPL(libeth_rx_fq_create, LIBETH);
|
||||
|
||||
/**
|
||||
* libeth_rx_fq_destroy - destroy a &page_pool created by libeth
|
||||
* @fq: buffer queue to process
|
||||
*/
|
||||
void libeth_rx_fq_destroy(struct libeth_fq *fq)
|
||||
{
|
||||
kvfree(fq->fqes);
|
||||
page_pool_destroy(fq->pp);
|
||||
}
|
||||
EXPORT_SYMBOL_NS_GPL(libeth_rx_fq_destroy, LIBETH);
|
||||
|
||||
/**
|
||||
* libeth_rx_recycle_slow - recycle a libeth page from the NAPI context
|
||||
* @page: page to recycle
|
||||
*
|
||||
* To be used on exceptions or rare cases not requiring fast inline recycling.
|
||||
*/
|
||||
void libeth_rx_recycle_slow(struct page *page)
|
||||
{
|
||||
page_pool_recycle_direct(page->pp, page);
|
||||
}
|
||||
EXPORT_SYMBOL_NS_GPL(libeth_rx_recycle_slow, LIBETH);
|
||||
|
||||
/* Converting abstract packet type numbers into a software structure with
|
||||
* the packet parameters to do O(1) lookup on Rx.
|
||||
*/
|
||||
|
@ -4,8 +4,125 @@
|
||||
#ifndef __LIBETH_RX_H
|
||||
#define __LIBETH_RX_H
|
||||
|
||||
#include <linux/if_vlan.h>
|
||||
|
||||
#include <net/page_pool/helpers.h>
|
||||
#include <net/xdp.h>
|
||||
|
||||
/* Rx buffer management */
|
||||
|
||||
/* Space reserved in front of each frame */
|
||||
#define LIBETH_SKB_HEADROOM (NET_SKB_PAD + NET_IP_ALIGN)
|
||||
/* Maximum headroom for worst-case calculations */
|
||||
#define LIBETH_MAX_HEADROOM LIBETH_SKB_HEADROOM
|
||||
/* Link layer / L2 overhead: Ethernet, 2 VLAN tags (C + S), FCS */
|
||||
#define LIBETH_RX_LL_LEN (ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN)
|
||||
|
||||
/* Always use order-0 pages */
|
||||
#define LIBETH_RX_PAGE_ORDER 0
|
||||
/* Pick a sane buffer stride and align to a cacheline boundary */
|
||||
#define LIBETH_RX_BUF_STRIDE SKB_DATA_ALIGN(128)
|
||||
/* HW-writeable space in one buffer: truesize - headroom/tailroom, aligned */
|
||||
#define LIBETH_RX_PAGE_LEN(hr) \
|
||||
ALIGN_DOWN(SKB_MAX_ORDER(hr, LIBETH_RX_PAGE_ORDER), \
|
||||
LIBETH_RX_BUF_STRIDE)
|
||||
|
||||
/**
|
||||
* struct libeth_fqe - structure representing an Rx buffer (fill queue element)
|
||||
* @page: page holding the buffer
|
||||
* @offset: offset from the page start (to the headroom)
|
||||
* @truesize: total space occupied by the buffer (w/ headroom and tailroom)
|
||||
*
|
||||
* Depending on the MTU, API switches between one-page-per-frame and shared
|
||||
* page model (to conserve memory on bigger-page platforms). In case of the
|
||||
* former, @offset is always 0 and @truesize is always ```PAGE_SIZE```.
|
||||
*/
|
||||
struct libeth_fqe {
|
||||
struct page *page;
|
||||
u32 offset;
|
||||
u32 truesize;
|
||||
} __aligned_largest;
|
||||
|
||||
/**
|
||||
* struct libeth_fq - structure representing a buffer (fill) queue
|
||||
* @fp: hotpath part of the structure
|
||||
* @pp: &page_pool for buffer management
|
||||
* @fqes: array of Rx buffers
|
||||
* @truesize: size to allocate per buffer, w/overhead
|
||||
* @count: number of descriptors/buffers the queue has
|
||||
* @buf_len: HW-writeable length per each buffer
|
||||
* @nid: ID of the closest NUMA node with memory
|
||||
*/
|
||||
struct libeth_fq {
|
||||
struct_group_tagged(libeth_fq_fp, fp,
|
||||
struct page_pool *pp;
|
||||
struct libeth_fqe *fqes;
|
||||
|
||||
u32 truesize;
|
||||
u32 count;
|
||||
);
|
||||
|
||||
/* Cold fields */
|
||||
u32 buf_len;
|
||||
int nid;
|
||||
};
|
||||
|
||||
int libeth_rx_fq_create(struct libeth_fq *fq, struct napi_struct *napi);
|
||||
void libeth_rx_fq_destroy(struct libeth_fq *fq);
|
||||
|
||||
/**
|
||||
* libeth_rx_alloc - allocate a new Rx buffer
|
||||
* @fq: fill queue to allocate for
|
||||
* @i: index of the buffer within the queue
|
||||
*
|
||||
* Return: DMA address to be passed to HW for Rx on successful allocation,
|
||||
* ```DMA_MAPPING_ERROR``` otherwise.
|
||||
*/
|
||||
static inline dma_addr_t libeth_rx_alloc(const struct libeth_fq_fp *fq, u32 i)
|
||||
{
|
||||
struct libeth_fqe *buf = &fq->fqes[i];
|
||||
|
||||
buf->truesize = fq->truesize;
|
||||
buf->page = page_pool_dev_alloc(fq->pp, &buf->offset, &buf->truesize);
|
||||
if (unlikely(!buf->page))
|
||||
return DMA_MAPPING_ERROR;
|
||||
|
||||
return page_pool_get_dma_addr(buf->page) + buf->offset +
|
||||
fq->pp->p.offset;
|
||||
}
|
||||
|
||||
void libeth_rx_recycle_slow(struct page *page);
|
||||
|
||||
/**
|
||||
* libeth_rx_sync_for_cpu - synchronize or recycle buffer post DMA
|
||||
* @fqe: buffer to process
|
||||
* @len: frame length from the descriptor
|
||||
*
|
||||
* Process the buffer after it's written by HW. The regular path is to
|
||||
* synchronize DMA for CPU, but in case of no data it will be immediately
|
||||
* recycled back to its PP.
|
||||
*
|
||||
* Return: true when there's data to process, false otherwise.
|
||||
*/
|
||||
static inline bool libeth_rx_sync_for_cpu(const struct libeth_fqe *fqe,
|
||||
u32 len)
|
||||
{
|
||||
struct page *page = fqe->page;
|
||||
|
||||
/* Very rare, but possible case. The most common reason:
|
||||
* the last fragment contained FCS only, which was then
|
||||
* stripped by the HW.
|
||||
*/
|
||||
if (unlikely(!len)) {
|
||||
libeth_rx_recycle_slow(page);
|
||||
return false;
|
||||
}
|
||||
|
||||
page_pool_dma_sync_for_cpu(page->pp, page, fqe->offset, len);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Converting abstract packet type numbers into a software structure with
|
||||
* the packet parameters to do O(1) lookup on Rx.
|
||||
*/
|
||||
|
Loading…
x
Reference in New Issue
Block a user