Merge branch 'xdp-page_pool-fixes-and-in-flight-accounting'
Jesper Dangaard Brouer says:

====================
xdp: page_pool fixes and in-flight accounting

This patchset fixes the page_pool API and its users, such that drivers
can use it for DMA mapping. A number of places existed where the DMA
mapping would not get released/unmapped; all of these are fixed. This
occurs e.g. when an xdp_frame gets converted to an SKB, as the network
stack doesn't have any callback for XDP memory models.

The patchset also addresses a shutdown race condition. Today, removing
an XDP memory model based on page_pool is only delayed one RCU grace
period. This isn't enough, as redirected xdp_frames can still be
in-flight on different queues (remote driver TX, cpumap or veth).

We stress that when drivers use page_pool for DMA mapping, they MUST
use one packet per page. This might change in the future, but more
work lies ahead before we can lift this restriction.

This patchset changes the page_pool API to be more strict, as in-flight
page accounting is added.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
commit 2a54003e7a
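The contract the cover letter describes (one packet per page, the driver frees the pool itself only if memory-model registration fails, and normal teardown goes through xdp_rxq_info_unreg(), which now waits for in-flight pages) is easiest to see as driver-side code. The following is a minimal, hypothetical sketch, not part of the patchset; the mydrv_* names and queue layout are invented, only the page_pool/xdp calls are from the API touched below:

#include <linux/device.h>
#include <linux/dma-direction.h>
#include <linux/err.h>
#include <linux/numa.h>
#include <net/page_pool.h>
#include <net/xdp.h>

/* Hypothetical per-RX-queue state; a real driver has its own layout. */
struct mydrv_rxq {
	struct xdp_rxq_info xdp_rxq;	/* assumed already xdp_rxq_info_reg()'ed */
	struct page_pool *page_pool;
};

static int mydrv_rxq_setup_pool(struct mydrv_rxq *rxq, struct device *dev)
{
	struct page_pool_params pp_params = {
		.order		= 0,			/* one packet per page */
		.flags		= PP_FLAG_DMA_MAP,	/* pool owns the DMA mapping */
		.pool_size	= 256,
		.nid		= NUMA_NO_NODE,
		.dev		= dev,
		.dma_dir	= DMA_FROM_DEVICE,
	};
	int err;

	rxq->page_pool = page_pool_create(&pp_params);
	if (IS_ERR(rxq->page_pool))
		return PTR_ERR(rxq->page_pool);

	err = xdp_rxq_info_reg_mem_model(&rxq->xdp_rxq,
					 MEM_TYPE_PAGE_POOL, rxq->page_pool);
	if (err) {
		/* Never registered: nothing else will free the pool for us */
		page_pool_free(rxq->page_pool);
		return err;
	}
	return 0;
}

static void mydrv_rxq_teardown(struct mydrv_rxq *rxq)
{
	/* After this series the xdp core drives the shutdown: it checks
	 * for in-flight pages and calls page_pool_free() once it is safe.
	 */
	xdp_rxq_info_unreg(&rxq->xdp_rxq);
}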
drivers/net/ethernet/mellanox/mlx5/core/en_main.c

@@ -545,8 +545,10 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 	}
 	err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
 					 MEM_TYPE_PAGE_POOL, rq->page_pool);
-	if (err)
+	if (err) {
+		page_pool_free(rq->page_pool);
 		goto err_free;
+	}
 
 	for (i = 0; i < wq_sz; i++) {
 		if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
@@ -611,8 +613,6 @@ err_rq_wq_destroy:
 	if (rq->xdp_prog)
 		bpf_prog_put(rq->xdp_prog);
 	xdp_rxq_info_unreg(&rq->xdp_rxq);
-	if (rq->page_pool)
-		page_pool_destroy(rq->page_pool);
 	mlx5_wq_destroy(&rq->wq_ctrl);
 
 	return err;
@@ -625,10 +625,6 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
 	if (rq->xdp_prog)
 		bpf_prog_put(rq->xdp_prog);
 
-	xdp_rxq_info_unreg(&rq->xdp_rxq);
-	if (rq->page_pool)
-		page_pool_destroy(rq->page_pool);
-
 	switch (rq->wq_type) {
 	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
 		kvfree(rq->mpwqe.info);
@@ -645,6 +641,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
 
 		mlx5e_page_release(rq, dma_info, false);
 	}
+
+	xdp_rxq_info_unreg(&rq->xdp_rxq);
 	mlx5_wq_destroy(&rq->wq_ctrl);
 }
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c

@@ -248,7 +248,7 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
 	dma_info->addr = dma_map_page(rq->pdev, dma_info->page, 0,
 				      PAGE_SIZE, rq->buff.map_dir);
 	if (unlikely(dma_mapping_error(rq->pdev, dma_info->addr))) {
-		put_page(dma_info->page);
+		page_pool_recycle_direct(rq->page_pool, dma_info->page);
 		dma_info->page = NULL;
 		return -ENOMEM;
 	}
@@ -272,6 +272,7 @@ void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
 		page_pool_recycle_direct(rq->page_pool, dma_info->page);
 	} else {
 		mlx5e_page_dma_unmap(rq, dma_info);
+		page_pool_release_page(rq->page_pool, dma_info->page);
 		put_page(dma_info->page);
 	}
 }
drivers/net/veth.c

@@ -547,6 +547,7 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq,
 		goto err;
 	}
 
+	xdp_release_frame(frame);
 	xdp_scrub_frame(frame);
 	skb->protocol = eth_type_trans(skb, rq->dev);
 err:
include/net/page_pool.h

@@ -16,14 +16,16 @@
  * page_pool_alloc_pages() call. Drivers should likely use
  * page_pool_dev_alloc_pages() replacing dev_alloc_pages().
  *
- * If page_pool handles DMA mapping (use page->private), then API user
- * is responsible for invoking page_pool_put_page() once. In-case of
- * elevated refcnt, the DMA state is released, assuming other users of
- * the page will eventually call put_page().
+ * API keeps track of in-flight pages, in order to let API user know
+ * when it is safe to deallocate a page_pool object. Thus, API users
+ * must make sure to call page_pool_release_page() when a page is
+ * "leaving" the page_pool. Or call page_pool_put_page() where
+ * appropriate. For maintaining correct accounting.
  *
- * If no DMA mapping is done, then it can act as shim-layer that
- * fall-through to alloc_page. As no state is kept on the page, the
- * regular put_page() call is sufficient.
+ * API user must only call page_pool_put_page() once on a page, as it
+ * will either recycle the page, or in case of elevated refcnt, it
+ * will release the DMA mapping and in-flight state accounting. We
+ * hope to lift this requirement in the future.
  */
 #ifndef _NET_PAGE_POOL_H
 #define _NET_PAGE_POOL_H
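The rewritten comment boils down to one accounting rule: every page handed out by the pool must be reported back exactly once, either by putting it back through page_pool_put_page() (or the recycle helpers) or by disconnecting it with page_pool_release_page() before the final put_page(). A hedged sketch of the two legal exits; the helper and its policy flags are hypothetical, the page_pool calls are the ones declared in this header:

#include <linux/mm.h>
#include <net/page_pool.h>

/* Hypothetical consumer: either keep the page inside the pool's
 * accounting, or disconnect it so it becomes an ordinary page.
 */
static void example_done_with_page(struct page_pool *pool, struct page *page,
				   bool recycle, bool allow_direct)
{
	if (recycle) {
		/* Page stays accounted to the pool (may be recycled) */
		page_pool_put_page(pool, page, allow_direct);
	} else {
		/* Page leaves the pool: DMA state and in-flight count are
		 * dropped here; afterwards a plain put_page() frees it.
		 */
		page_pool_release_page(pool, page);
		put_page(page);
	}
}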
@@ -66,9 +68,10 @@ struct page_pool_params {
 };
 
 struct page_pool {
-	struct rcu_head rcu;
 	struct page_pool_params p;
 
+	u32 pages_state_hold_cnt;
+
 	/*
 	 * Data structure for allocation side
 	 *
@@ -96,6 +99,8 @@ struct page_pool {
 	 * TODO: Implement bulk return pages into this structure.
 	 */
 	struct ptr_ring ring;
+
+	atomic_t pages_state_release_cnt;
 };
 
 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
@@ -109,7 +114,16 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
 
 struct page_pool *page_pool_create(const struct page_pool_params *params);
 
-void page_pool_destroy(struct page_pool *pool);
+void __page_pool_free(struct page_pool *pool);
+static inline void page_pool_free(struct page_pool *pool)
+{
+	/* When page_pool isn't compiled-in, net/core/xdp.c doesn't
+	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
+	 */
+#ifdef CONFIG_PAGE_POOL
+	__page_pool_free(pool);
+#endif
+}
 
 /* Never call this directly, use helpers below */
 void __page_pool_put_page(struct page_pool *pool,
@@ -132,6 +146,43 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 	__page_pool_put_page(pool, page, true);
 }
 
+/* API user MUST have disconnected alloc-side (not allowed to call
+ * page_pool_alloc_pages()) before calling this. The free-side can
+ * still run concurrently, to handle in-flight packet-pages.
+ *
+ * A request to shutdown can fail (with false) if there are still
+ * in-flight packet-pages.
+ */
+bool __page_pool_request_shutdown(struct page_pool *pool);
+static inline bool page_pool_request_shutdown(struct page_pool *pool)
+{
+	/* When page_pool isn't compiled-in, net/core/xdp.c doesn't
+	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
+	 */
+#ifdef CONFIG_PAGE_POOL
+	return __page_pool_request_shutdown(pool);
+#endif
+}
+
+/* Disconnects a page (from a page_pool). API users can have a need
+ * to disconnect a page (from a page_pool), to allow it to be used as
+ * a regular page (that will eventually be returned to the normal
+ * page-allocator via put_page).
+ */
+void page_pool_unmap_page(struct page_pool *pool, struct page *page);
+static inline void page_pool_release_page(struct page_pool *pool,
+					  struct page *page)
+{
+#ifdef CONFIG_PAGE_POOL
+	page_pool_unmap_page(pool, page);
+#endif
+}
+
+static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
+{
+	return page->dma_addr;
+}
+
 static inline bool is_page_pool_compiled_in(void)
 {
 #ifdef CONFIG_PAGE_POOL
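page_pool_get_dma_addr() added above simply reads the DMA address the pool stored when it mapped the page (PP_FLAG_DMA_MAP). A hypothetical RX-descriptor refill illustrating the intended use; the descriptor layout and mydrv_* names are invented:

#include <asm/byteorder.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <net/page_pool.h>

struct mydrv_rx_desc {
	__le64 buf_addr;	/* illustrative hardware descriptor field */
};

static int mydrv_refill_one(struct page_pool *pool, struct mydrv_rx_desc *desc)
{
	struct page *page = page_pool_dev_alloc_pages(pool);

	if (!page)
		return -ENOMEM;

	/* Pool already DMA-mapped the page; just publish the address */
	desc->buf_addr = cpu_to_le64(page_pool_get_dma_addr(page));
	return 0;
}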
include/net/xdp.h

@@ -129,6 +129,21 @@ void xdp_return_frame(struct xdp_frame *xdpf);
 void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
 void xdp_return_buff(struct xdp_buff *xdp);
 
+/* When sending xdp_frame into the network stack, then there is no
+ * return point callback, which is needed to release e.g. DMA-mapping
+ * resources with page_pool. Thus, have explicit function to release
+ * frame resources.
+ */
+void __xdp_release_frame(void *data, struct xdp_mem_info *mem);
+static inline void xdp_release_frame(struct xdp_frame *xdpf)
+{
+	struct xdp_mem_info *mem = &xdpf->mem;
+
+	/* Curr only page_pool needs this */
+	if (mem->type == MEM_TYPE_PAGE_POOL)
+		__xdp_release_frame(xdpf->data, mem);
+}
+
 int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
 		     struct net_device *dev, u32 queue_index);
 void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
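The comment explains why an explicit release exists: once a frame is turned into an SKB there is no XDP return-path hook that could undo the page_pool DMA mapping. The cpumap and veth hunks elsewhere in this series call it exactly at that point. A hypothetical conversion helper, for illustration only:

#include <linux/skbuff.h>
#include <net/xdp.h>

/* Hypothetical xdp_frame -> SKB hand-off: release page_pool resources
 * (the DMA mapping and in-flight accounting) before the stack takes
 * ownership of the page, then let the skb reuse the xdp_frame area.
 */
static struct sk_buff *example_finish_frame_to_skb(struct xdp_frame *xdpf,
						   struct sk_buff *skb)
{
	/* No-op unless xdpf->mem.type == MEM_TYPE_PAGE_POOL */
	xdp_release_frame(xdpf);

	/* Allow the skb to reuse the area occupied by the xdp_frame */
	xdp_scrub_frame(xdpf);

	return skb;
}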
include/net/xdp_priv.h (new file, 23 lines)

@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_NET_XDP_PRIV_H__
+#define __LINUX_NET_XDP_PRIV_H__
+
+#include <linux/rhashtable.h>
+
+/* Private to net/core/xdp.c, but used by trace/events/xdp.h */
+struct xdp_mem_allocator {
+	struct xdp_mem_info mem;
+	union {
+		void *allocator;
+		struct page_pool *page_pool;
+		struct zero_copy_allocator *zc_alloc;
+	};
+	int disconnect_cnt;
+	unsigned long defer_start;
+	struct rhash_head node;
+	struct rcu_head rcu;
+	struct delayed_work defer_wq;
+	unsigned long defer_warn;
+};
+
+#endif /* __LINUX_NET_XDP_PRIV_H__ */
include/trace/events/page_pool.h (new file, 87 lines)

@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM page_pool
+
+#if !defined(_TRACE_PAGE_POOL_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PAGE_POOL_H
+
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+
+#include <net/page_pool.h>
+
+TRACE_EVENT(page_pool_inflight,
+
+	TP_PROTO(const struct page_pool *pool,
+		 s32 inflight, u32 hold, u32 release),
+
+	TP_ARGS(pool, inflight, hold, release),
+
+	TP_STRUCT__entry(
+		__field(const struct page_pool *, pool)
+		__field(s32, inflight)
+		__field(u32, hold)
+		__field(u32, release)
+	),
+
+	TP_fast_assign(
+		__entry->pool = pool;
+		__entry->inflight = inflight;
+		__entry->hold = hold;
+		__entry->release = release;
+	),
+
+	TP_printk("page_pool=%p inflight=%d hold=%u release=%u",
+		  __entry->pool, __entry->inflight, __entry->hold, __entry->release)
+);
+
+TRACE_EVENT(page_pool_state_release,
+
+	TP_PROTO(const struct page_pool *pool,
+		 const struct page *page, u32 release),
+
+	TP_ARGS(pool, page, release),
+
+	TP_STRUCT__entry(
+		__field(const struct page_pool *, pool)
+		__field(const struct page *, page)
+		__field(u32, release)
+	),
+
+	TP_fast_assign(
+		__entry->pool = pool;
+		__entry->page = page;
+		__entry->release = release;
+	),
+
+	TP_printk("page_pool=%p page=%p release=%u",
+		  __entry->pool, __entry->page, __entry->release)
+);
+
+TRACE_EVENT(page_pool_state_hold,
+
+	TP_PROTO(const struct page_pool *pool,
+		 const struct page *page, u32 hold),
+
+	TP_ARGS(pool, page, hold),
+
+	TP_STRUCT__entry(
+		__field(const struct page_pool *, pool)
+		__field(const struct page *, page)
+		__field(u32, hold)
+	),
+
+	TP_fast_assign(
+		__entry->pool = pool;
+		__entry->page = page;
+		__entry->hold = hold;
+	),
+
+	TP_printk("page_pool=%p page=%p hold=%u",
+		  __entry->pool, __entry->page, __entry->hold)
+);
+
+#endif /* _TRACE_PAGE_POOL_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
include/trace/events/xdp.h

@@ -269,6 +269,121 @@ TRACE_EVENT(xdp_devmap_xmit,
 		  __entry->from_ifindex, __entry->to_ifindex, __entry->err)
 );
 
+/* Expect users already include <net/xdp.h>, but not xdp_priv.h */
+#include <net/xdp_priv.h>
+
+#define __MEM_TYPE_MAP(FN)	\
+	FN(PAGE_SHARED)		\
+	FN(PAGE_ORDER0)		\
+	FN(PAGE_POOL)		\
+	FN(ZERO_COPY)
+
+#define __MEM_TYPE_TP_FN(x)	\
+	TRACE_DEFINE_ENUM(MEM_TYPE_##x);
+#define __MEM_TYPE_SYM_FN(x)	\
+	{ MEM_TYPE_##x, #x },
+#define __MEM_TYPE_SYM_TAB	\
+	__MEM_TYPE_MAP(__MEM_TYPE_SYM_FN) { -1, 0 }
+__MEM_TYPE_MAP(__MEM_TYPE_TP_FN)
+
+TRACE_EVENT(mem_disconnect,
+
+	TP_PROTO(const struct xdp_mem_allocator *xa,
+		 bool safe_to_remove, bool force),
+
+	TP_ARGS(xa, safe_to_remove, force),
+
+	TP_STRUCT__entry(
+		__field(const struct xdp_mem_allocator *, xa)
+		__field(u32, mem_id)
+		__field(u32, mem_type)
+		__field(const void *, allocator)
+		__field(bool, safe_to_remove)
+		__field(bool, force)
+		__field(int, disconnect_cnt)
+	),
+
+	TP_fast_assign(
+		__entry->xa = xa;
+		__entry->mem_id = xa->mem.id;
+		__entry->mem_type = xa->mem.type;
+		__entry->allocator = xa->allocator;
+		__entry->safe_to_remove = safe_to_remove;
+		__entry->force = force;
+		__entry->disconnect_cnt = xa->disconnect_cnt;
+	),
+
+	TP_printk("mem_id=%d mem_type=%s allocator=%p"
+		  " safe_to_remove=%s force=%s disconnect_cnt=%d",
+		  __entry->mem_id,
+		  __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB),
+		  __entry->allocator,
+		  __entry->safe_to_remove ? "true" : "false",
+		  __entry->force ? "true" : "false",
+		  __entry->disconnect_cnt
+	)
+);
+
+TRACE_EVENT(mem_connect,
+
+	TP_PROTO(const struct xdp_mem_allocator *xa,
+		 const struct xdp_rxq_info *rxq),
+
+	TP_ARGS(xa, rxq),
+
+	TP_STRUCT__entry(
+		__field(const struct xdp_mem_allocator *, xa)
+		__field(u32, mem_id)
+		__field(u32, mem_type)
+		__field(const void *, allocator)
+		__field(const struct xdp_rxq_info *, rxq)
+		__field(int, ifindex)
+	),
+
+	TP_fast_assign(
+		__entry->xa = xa;
+		__entry->mem_id = xa->mem.id;
+		__entry->mem_type = xa->mem.type;
+		__entry->allocator = xa->allocator;
+		__entry->rxq = rxq;
+		__entry->ifindex = rxq->dev->ifindex;
+	),
+
+	TP_printk("mem_id=%d mem_type=%s allocator=%p"
+		  " ifindex=%d",
+		  __entry->mem_id,
+		  __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB),
+		  __entry->allocator,
+		  __entry->ifindex
+	)
+);
+
+TRACE_EVENT(mem_return_failed,
+
+	TP_PROTO(const struct xdp_mem_info *mem,
+		 const struct page *page),
+
+	TP_ARGS(mem, page),
+
+	TP_STRUCT__entry(
+		__field(const struct page *, page)
+		__field(u32, mem_id)
+		__field(u32, mem_type)
+	),
+
+	TP_fast_assign(
+		__entry->page = page;
+		__entry->mem_id = mem->id;
+		__entry->mem_type = mem->type;
+	),
+
+	TP_printk("mem_id=%d mem_type=%s page=%p",
+		  __entry->mem_id,
+		  __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB),
+		  __entry->page
+	)
+);
+
 #endif /* _TRACE_XDP_H */
 
 #include <trace/define_trace.h>
kernel/bpf/cpumap.c

@@ -208,6 +208,9 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
 	 * - RX ring dev queue index	(skb_record_rx_queue)
 	 */
 
+	/* Until page_pool get SKB return path, release DMA here */
+	xdp_release_frame(xdpf);
+
 	/* Allow SKB to reuse area used by xdp_frame */
 	xdp_scrub_frame(xdpf);
 
net/core/net-traces.c

@@ -43,6 +43,10 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete);
 EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update);
 #endif
 
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+#include <trace/events/page_pool.h>
+#endif
+
 #include <trace/events/neigh.h>
 EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update);
 EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update_done);
net/core/page_pool.c

@@ -4,9 +4,11 @@
  * Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
  *	Copyright (C) 2016 Red Hat, Inc.
  */
+
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
+#include <linux/device.h>
 
 #include <net/page_pool.h>
 #include <linux/dma-direction.h>
@@ -14,6 +16,8 @@
 #include <linux/page-flags.h>
 #include <linux/mm.h> /* for __put_page() */
 
+#include <trace/events/page_pool.h>
+
 static int page_pool_init(struct page_pool *pool,
 			  const struct page_pool_params *params)
 {
@@ -43,6 +47,11 @@ static int page_pool_init(struct page_pool *pool,
 	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
 		return -ENOMEM;
 
+	atomic_set(&pool->pages_state_release_cnt, 0);
+
+	if (pool->p.flags & PP_FLAG_DMA_MAP)
+		get_device(pool->p.dev);
+
 	return 0;
 }
 
@@ -151,6 +160,11 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 	page->dma_addr = dma;
 
 skip_dma_map:
+	/* Track how many pages are held 'in-flight' */
+	pool->pages_state_hold_cnt++;
+
+	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
+
 	/* When page just alloc'ed is should/must have refcnt 1. */
 	return page;
 }
@@ -173,6 +187,33 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
 }
 EXPORT_SYMBOL(page_pool_alloc_pages);
 
+/* Calculate distance between two u32 values, valid if distance is below 2^(31)
+ * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
+ */
+#define _distance(a, b)	(s32)((a) - (b))
+
+static s32 page_pool_inflight(struct page_pool *pool)
+{
+	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
+	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
+	s32 distance;
+
+	distance = _distance(hold_cnt, release_cnt);
+
+	trace_page_pool_inflight(pool, distance, hold_cnt, release_cnt);
+	return distance;
+}
+
+static bool __page_pool_safe_to_destroy(struct page_pool *pool)
+{
+	s32 inflight = page_pool_inflight(pool);
+
+	/* The distance should not be able to become negative */
+	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
+
+	return (inflight == 0);
+}
+
 /* Cleanup page_pool state from page */
 static void __page_pool_clean_page(struct page_pool *pool,
 				   struct page *page)
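The _distance() macro relies on serial-number arithmetic: as long as fewer than 2^31 pages are outstanding, casting the u32 difference to s32 yields the true hold/release gap even after either counter wraps. A standalone user-space illustration (not kernel code), with values chosen to force a wrap:

#include <stdint.h>
#include <stdio.h>

/* Same arithmetic as _distance(a, b) above; valid while the real
 * difference stays below 2^31.
 */
static int32_t distance(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b);
}

int main(void)
{
	uint32_t release_cnt = 4294967290u;	/* close to UINT32_MAX */
	uint32_t hold_cnt = release_cnt + 16;	/* wraps around to 10 */

	/* Comparing the raw wrapped values would suggest hold < release,
	 * but the cast recovers the true in-flight count: prints 16.
	 */
	printf("inflight=%d\n", distance(hold_cnt, release_cnt));
	return 0;
}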
@@ -180,7 +221,7 @@ static void __page_pool_clean_page(struct page_pool *pool,
 	dma_addr_t dma;
 
 	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
-		return;
+		goto skip_dma_unmap;
 
 	dma = page->dma_addr;
 	/* DMA unmap */
@@ -188,12 +229,27 @@ static void __page_pool_clean_page(struct page_pool *pool,
 		     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
 		     DMA_ATTR_SKIP_CPU_SYNC);
 	page->dma_addr = 0;
+skip_dma_unmap:
+	atomic_inc(&pool->pages_state_release_cnt);
+	trace_page_pool_state_release(pool, page,
+			      atomic_read(&pool->pages_state_release_cnt));
+}
+
+/* unmap the page and clean our state */
+void page_pool_unmap_page(struct page_pool *pool, struct page *page)
+{
+	/* When page is unmapped, this implies page will not be
+	 * returned to page_pool.
+	 */
+	__page_pool_clean_page(pool, page);
 }
+EXPORT_SYMBOL(page_pool_unmap_page);
 
 /* Return a page to the page allocator, cleaning up our state */
 static void __page_pool_return_page(struct page_pool *pool, struct page *page)
 {
 	__page_pool_clean_page(pool, page);
+
 	put_page(page);
 	/* An optimization would be to call __free_pages(page, pool->p.order)
 	 * knowing page is not part of page-cache (thus avoiding a
@@ -285,21 +341,41 @@ static void __page_pool_empty_ring(struct page_pool *pool)
 	}
 }
 
-static void __page_pool_destroy_rcu(struct rcu_head *rcu)
+static void __warn_in_flight(struct page_pool *pool)
 {
-	struct page_pool *pool;
+	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
+	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
+	s32 distance;
 
-	pool = container_of(rcu, struct page_pool, rcu);
+	distance = _distance(hold_cnt, release_cnt);
 
-	WARN(pool->alloc.count, "API usage violation");
-
-	__page_pool_empty_ring(pool);
-	ptr_ring_cleanup(&pool->ring, NULL);
-	kfree(pool);
+	/* Drivers should fix this, but only problematic when DMA is used */
+	WARN(1, "Still in-flight pages:%d hold:%u released:%u",
+	     distance, hold_cnt, release_cnt);
 }
 
-/* Cleanup and release resources */
-void page_pool_destroy(struct page_pool *pool)
+void __page_pool_free(struct page_pool *pool)
 {
+	WARN(pool->alloc.count, "API usage violation");
+	WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
+
+	/* Can happen due to forced shutdown */
+	if (!__page_pool_safe_to_destroy(pool))
+		__warn_in_flight(pool);
+
+	ptr_ring_cleanup(&pool->ring, NULL);
+
+	if (pool->p.flags & PP_FLAG_DMA_MAP)
+		put_device(pool->p.dev);
+
+	kfree(pool);
+}
+EXPORT_SYMBOL(__page_pool_free);
+
+/* Request to shutdown: release pages cached by page_pool, and check
+ * for in-flight pages
+ */
+bool __page_pool_request_shutdown(struct page_pool *pool)
+{
 	struct page *page;
 
@@ -317,7 +393,6 @@ void page_pool_destroy(struct page_pool *pool)
 	 */
 	__page_pool_empty_ring(pool);
 
-	/* An xdp_mem_allocator can still ref page_pool pointer */
-	call_rcu(&pool->rcu, __page_pool_destroy_rcu);
+	return __page_pool_safe_to_destroy(pool);
 }
-EXPORT_SYMBOL(page_pool_destroy);
+EXPORT_SYMBOL(__page_pool_request_shutdown);
net/core/xdp.c (120 lines changed)

@@ -14,6 +14,8 @@
 #include <net/page_pool.h>
 
 #include <net/xdp.h>
+#include <net/xdp_priv.h> /* struct xdp_mem_allocator */
+#include <trace/events/xdp.h>
 
 #define REG_STATE_NEW		0x0
 #define REG_STATE_REGISTERED	0x1
@@ -29,17 +31,6 @@ static int mem_id_next = MEM_ID_MIN;
 static bool mem_id_init; /* false */
 static struct rhashtable *mem_id_ht;
 
-struct xdp_mem_allocator {
-	struct xdp_mem_info mem;
-	union {
-		void *allocator;
-		struct page_pool *page_pool;
-		struct zero_copy_allocator *zc_alloc;
-	};
-	struct rhash_head node;
-	struct rcu_head rcu;
-};
-
 static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
 {
 	const u32 *k = data;
@@ -79,13 +70,13 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
 
 	xa = container_of(rcu, struct xdp_mem_allocator, rcu);
 
+	/* Allocator have indicated safe to remove before this is called */
+	if (xa->mem.type == MEM_TYPE_PAGE_POOL)
+		page_pool_free(xa->page_pool);
+
 	/* Allow this ID to be reused */
 	ida_simple_remove(&mem_id_pool, xa->mem.id);
 
-	/* Notice, driver is expected to free the *allocator,
-	 * e.g. page_pool, and MUST also use RCU free.
-	 */
-
 	/* Poison memory */
 	xa->mem.id = 0xFFFF;
 	xa->mem.type = 0xF0F0;
@@ -94,6 +85,64 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
 	kfree(xa);
 }
 
+bool __mem_id_disconnect(int id, bool force)
+{
+	struct xdp_mem_allocator *xa;
+	bool safe_to_remove = true;
+
+	mutex_lock(&mem_id_lock);
+
+	xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
+	if (!xa) {
+		mutex_unlock(&mem_id_lock);
+		WARN(1, "Request remove non-existing id(%d), driver bug?", id);
+		return true;
+	}
+	xa->disconnect_cnt++;
+
+	/* Detects in-flight packet-pages for page_pool */
+	if (xa->mem.type == MEM_TYPE_PAGE_POOL)
+		safe_to_remove = page_pool_request_shutdown(xa->page_pool);
+
+	trace_mem_disconnect(xa, safe_to_remove, force);
+
+	if ((safe_to_remove || force) &&
+	    !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
+		call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+
+	mutex_unlock(&mem_id_lock);
+	return (safe_to_remove|force);
+}
+
+#define DEFER_TIME (msecs_to_jiffies(1000))
+#define DEFER_WARN_INTERVAL (30 * HZ)
+#define DEFER_MAX_RETRIES 120
+
+static void mem_id_disconnect_defer_retry(struct work_struct *wq)
+{
+	struct delayed_work *dwq = to_delayed_work(wq);
+	struct xdp_mem_allocator *xa = container_of(dwq, typeof(*xa), defer_wq);
+	bool force = false;
+
+	if (xa->disconnect_cnt > DEFER_MAX_RETRIES)
+		force = true;
+
+	if (__mem_id_disconnect(xa->mem.id, force))
+		return;
+
+	/* Periodic warning */
+	if (time_after_eq(jiffies, xa->defer_warn)) {
+		int sec = (s32)((u32)jiffies - (u32)xa->defer_start) / HZ;
+
+		pr_warn("%s() stalled mem.id=%u shutdown %d attempts %d sec\n",
+			__func__, xa->mem.id, xa->disconnect_cnt, sec);
+		xa->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+	}
+
+	/* Still not ready to be disconnected, retry later */
+	schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
+}
+
 void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
 {
 	struct xdp_mem_allocator *xa;
@@ -112,16 +161,30 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
 	if (id == 0)
 		return;
 
+	if (__mem_id_disconnect(id, false))
+		return;
+
+	/* Could not disconnect, defer new disconnect attempt to later */
 	mutex_lock(&mem_id_lock);
 
 	xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
-	if (xa && !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
-		call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+	if (!xa) {
+		mutex_unlock(&mem_id_lock);
+		return;
+	}
+	xa->defer_start = jiffies;
+	xa->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
 
+	INIT_DELAYED_WORK(&xa->defer_wq, mem_id_disconnect_defer_retry);
 	mutex_unlock(&mem_id_lock);
+	schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model);
 
+/* This unregister operation will also cleanup and destroy the
+ * allocator. The page_pool_free() operation is first called when it's
+ * safe to remove, possibly deferred to a workqueue.
+ */
 void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
 {
 	/* Simplify driver cleanup code paths, allow unreg "unused" */
@@ -301,12 +364,15 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
 	/* Insert allocator into ID lookup table */
 	ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node);
 	if (IS_ERR(ptr)) {
+		ida_simple_remove(&mem_id_pool, xdp_rxq->mem.id);
+		xdp_rxq->mem.id = 0;
 		errno = PTR_ERR(ptr);
 		goto err;
 	}
 
 	mutex_unlock(&mem_id_lock);
 
+	trace_mem_connect(xdp_alloc, xdp_rxq);
 	return 0;
 err:
 	mutex_unlock(&mem_id_lock);
@@ -333,10 +399,13 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
 		/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
 		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
 		page = virt_to_head_page(data);
-		if (xa) {
+		if (likely(xa)) {
 			napi_direct &= !xdp_return_frame_no_direct();
 			page_pool_put_page(xa->page_pool, page, napi_direct);
 		} else {
+			/* Hopefully stack show who to blame for late return */
+			WARN_ONCE(1, "page_pool gone mem.id=%d", mem->id);
+			trace_mem_return_failed(mem, page);
 			put_page(page);
 		}
 		rcu_read_unlock();
@@ -379,6 +448,21 @@ void xdp_return_buff(struct xdp_buff *xdp)
 }
 EXPORT_SYMBOL_GPL(xdp_return_buff);
 
+/* Only called for MEM_TYPE_PAGE_POOL see xdp.h */
+void __xdp_release_frame(void *data, struct xdp_mem_info *mem)
+{
+	struct xdp_mem_allocator *xa;
+	struct page *page;
+
+	rcu_read_lock();
+	xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+	page = virt_to_head_page(data);
+	if (xa)
+		page_pool_release_page(xa->page_pool, page);
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(__xdp_release_frame);
+
 int xdp_attachment_query(struct xdp_attachment_info *info,
 			 struct netdev_bpf *bpf)
 {