Merge branch 'page_pool-allow-caching-from-safely-localized-napi'
Jakub Kicinski says:

====================
page_pool: allow caching from safely localized NAPI

I went back to the explicit "are we in the NAPI method" check, mostly
because I don't like having both around :( (even though I maintain that
in_softirq() && !in_hardirq() is just as safe, as softirqs do not nest).

Still returning the skbs to a CPU, though, not to the NAPI instance.
I reckon we could create a small refcounted struct per NAPI instance
which would allow sockets and other users to hold a persistent and safe
reference. But that's a bigger change, and I get 90+% recycling through
the cache with just these patches (for RR and streaming tests with 100%
CPU use it's almost 100%).

Some numbers for the streaming test with 100% CPU use (from the previous
version, but they really perform the same):

                        HW-GRO                  page=page
                    before      after       before      after
recycle:
cached:                  0  138669686            0  150197505
cache_full:              0     223391            0      74582
ring:            138551933    9997191    149299454          0
ring_full:               0        488         3154     127590
released_refcnt:         0          0            0          0

alloc:
fast:            136491361  148615710    146969587  150322859
slow:                 1772       1799          144        105
slow_high_order:         0          0            0          0
empty:                1772       1799          144        105
refill:            2165245     156302      2332880       2128
waive:                   0          0            0          0

v1: https://lore.kernel.org/all/20230411201800.596103-1-kuba@kernel.org/
rfcv2: https://lore.kernel.org/all/20230405232100.103392-1-kuba@kernel.org/
====================

Link: https://lore.kernel.org/r/20230413042605.895677-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
commit e61caf04b9
Documentation/networking/page_pool.rst
@@ -165,6 +165,7 @@ Registration
     pp_params.pool_size = DESC_NUM;
     pp_params.nid = NUMA_NO_NODE;
     pp_params.dev = priv->dev;
+    pp_params.napi = napi; /* only if locking is tied to NAPI */
     pp_params.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
     page_pool = page_pool_create(&pp_params);
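The documented example covers only the params block. A rough end-to-end
sketch of the opt-in follows; struct my_ring, my_napi_poll() and DESC_NUM
are hypothetical names, not from this patch. The key constraint is that
the NAPI passed in pp_params.napi must be the pool's sole page consumer:

#include <linux/err.h>
#include <linux/netdevice.h>
#include <net/page_pool.h>

#define DESC_NUM 1024			/* illustrative ring size */

struct my_ring {			/* hypothetical driver state */
	struct napi_struct napi;
	struct page_pool *page_pool;
	struct device *dev;
};

static int my_napi_poll(struct napi_struct *napi, int budget);

static int my_ring_init(struct my_ring *ring, struct net_device *netdev)
{
	struct page_pool_params pp_params = { 0 };

	/* The NAPI registered here must be the only context that
	 * allocates from and returns pages to this pool.
	 */
	netif_napi_add(netdev, &ring->napi, my_napi_poll);

	pp_params.pool_size = DESC_NUM;
	pp_params.nid = NUMA_NO_NODE;
	pp_params.dev = ring->dev;
	pp_params.napi = &ring->napi; /* only if locking is tied to NAPI */
	pp_params.dma_dir = DMA_FROM_DEVICE;

	ring->page_pool = page_pool_create(&pp_params);
	return PTR_ERR_OR_ZERO(ring->page_pool);
}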
drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3211,6 +3211,7 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
 	pp.pool_size = bp->rx_ring_size;
 	pp.nid = dev_to_node(&bp->pdev->dev);
+	pp.napi = &rxr->bnapi->napi;
 	pp.dev = &bp->pdev->dev;
 	pp.dma_dir = DMA_BIDIRECTIONAL;
include/linux/netdevice.h
@@ -360,8 +360,11 @@ struct napi_struct {
 	unsigned long gro_bitmask;
 	int (*poll)(struct napi_struct *, int);
 #ifdef CONFIG_NETPOLL
+	/* CPU actively polling if netpoll is configured */
 	int poll_owner;
 #endif
+	/* CPU on which NAPI has been scheduled for processing */
+	int list_owner;
 	struct net_device *dev;
 	struct gro_list gro_hash[GRO_HASH_BUCKETS];
 	struct sk_buff *skb;
include/linux/skbuff.h
@@ -3386,6 +3386,18 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
 	__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
 }
 
+static inline void
+napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
+{
+	struct page *page = skb_frag_page(frag);
+
+#ifdef CONFIG_PAGE_POOL
+	if (recycle && page_pool_return_skb_page(page, napi_safe))
+		return;
+#endif
+	put_page(page);
+}
+
 /**
  * __skb_frag_unref - release a reference on a paged fragment.
  * @frag: the paged fragment
@@ -3396,13 +3408,7 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
  */
 static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
 {
-	struct page *page = skb_frag_page(frag);
-
-#ifdef CONFIG_PAGE_POOL
-	if (recycle && page_pool_return_skb_page(page))
-		return;
-#endif
-	put_page(page);
+	napi_frag_unref(frag, recycle, false);
 }
 
 /**
include/net/page_pool.h
@@ -77,6 +77,7 @@ struct page_pool_params {
 	unsigned int	pool_size;
 	int		nid;  /* Numa node id to allocate from pages from */
 	struct device	*dev; /* device, for DMA pre-mapping purposes */
+	struct napi_struct *napi; /* Sole consumer of pages, otherwise NULL */
 	enum dma_data_direction dma_dir; /* DMA mapping direction */
 	unsigned int	max_len; /* max DMA sync memory size */
 	unsigned int	offset;  /* DMA addr offset */
@@ -239,7 +240,7 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool)
 	return pool->p.dma_dir;
 }
 
-bool page_pool_return_skb_page(struct page *page);
+bool page_pool_return_skb_page(struct page *page, bool napi_safe);
 
 struct page_pool *page_pool_create(const struct page_pool_params *params);
net/core/dev.c
@@ -4359,6 +4359,7 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 	}
 
 	list_add_tail(&napi->poll_list, &sd->poll_list);
+	WRITE_ONCE(napi->list_owner, smp_processor_id());
 	/* If not called from net_rx_action()
 	 * we have to raise NET_RX_SOFTIRQ.
 	 */
@@ -6069,6 +6070,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 		list_del_init(&n->poll_list);
 		local_irq_restore(flags);
 	}
+	WRITE_ONCE(n->list_owner, -1);
 
 	val = READ_ONCE(n->state);
 	do {
@@ -6384,6 +6386,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 #ifdef CONFIG_NETPOLL
 	napi->poll_owner = -1;
 #endif
+	napi->list_owner = -1;
 	set_bit(NAPI_STATE_SCHED, &napi->state);
 	set_bit(NAPI_STATE_NPSVC, &napi->state);
 	list_add_rcu(&napi->dev_list, &dev->napi_list);
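Taken together, the three dev.c hunks give list_owner a complete
lifecycle: -1 at registration, the scheduling CPU's id while the NAPI
sits on that CPU's poll_list, and -1 again when polling completes. A
hypothetical helper (not part of the patch) makes the resulting
locality test explicit:

#include <linux/netdevice.h>
#include <linux/smp.h>

/* Hypothetical helper: true only while @napi is scheduled on the CPU
 * we are currently executing on, i.e. while a free running in this
 * softirq context cannot race with the NAPI poll loop that consumes
 * the pool.
 */
static bool napi_is_locally_scheduled(const struct napi_struct *napi)
{
	return READ_ONCE(napi->list_owner) == smp_processor_id();
}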
net/core/page_pool.c
@@ -19,6 +19,7 @@
 #include <linux/mm.h> /* for put_page() */
 #include <linux/poison.h>
 #include <linux/ethtool.h>
+#include <linux/netdevice.h>
 
 #include <trace/events/page_pool.h>
 
@@ -874,9 +875,11 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
 }
 EXPORT_SYMBOL(page_pool_update_nid);
 
-bool page_pool_return_skb_page(struct page *page)
+bool page_pool_return_skb_page(struct page *page, bool napi_safe)
 {
+	struct napi_struct *napi;
 	struct page_pool *pp;
+	bool allow_direct;
 
 	page = compound_head(page);
 
@@ -892,12 +895,20 @@ bool page_pool_return_skb_page(struct page *page)
 
 	pp = page->pp;
 
+	/* Allow direct recycle if we have reasons to believe that we are
+	 * in the same context as the consumer would run, so there's
+	 * no possible race.
+	 */
+	napi = pp->p.napi;
+	allow_direct = napi_safe && napi &&
+		READ_ONCE(napi->list_owner) == smp_processor_id();
+
 	/* Driver set this to memory recycling info. Reset it on recycle.
 	 * This will *not* work for NIC using a split-page memory model.
 	 * The page will be returned to the pool here regardless of the
 	 * 'flipped' fragment being in use or not.
	 */
-	page_pool_put_full_page(pp, page, false);
+	page_pool_put_full_page(pp, page, allow_direct);
 
 	return true;
 }
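The allow_direct computed here selects the page_pool recycling path:
true takes the lockless per-CPU alloc cache, previously reachable only
from the driver's own poll loop, while false takes the ptr_ring, whose
producer side is locked. A condensed, illustrative restatement of the
decision (the wrapper itself is hypothetical, not kernel API):

#include <linux/netdevice.h>
#include <linux/smp.h>
#include <net/page_pool.h>

/* Hypothetical wrapper: return a page_pool page from any context,
 * taking the lockless fast path only when NAPI locality is proven.
 */
static void my_return_pp_page(struct page_pool *pp, struct page *page,
			      bool napi_safe)
{
	struct napi_struct *napi = pp->p.napi;
	bool allow_direct = napi_safe && napi &&
		READ_ONCE(napi->list_owner) == smp_processor_id();

	/* true  -> per-CPU alloc cache, no atomics or locks
	 * false -> ptr_ring producer path, safe from any context
	 */
	page_pool_put_full_page(pp, page, allow_direct);
}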
net/core/skbuff.c
@@ -839,11 +839,11 @@ static void skb_clone_fraglist(struct sk_buff *skb)
 		skb_get(list);
 }
 
-static bool skb_pp_recycle(struct sk_buff *skb, void *data)
+static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
 {
 	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
 		return false;
-	return page_pool_return_skb_page(virt_to_page(data));
+	return page_pool_return_skb_page(virt_to_page(data), napi_safe);
 }
 
 static void skb_kfree_head(void *head, unsigned int end_offset)
@@ -856,12 +856,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset)
 		kfree(head);
 }
 
-static void skb_free_head(struct sk_buff *skb)
+static void skb_free_head(struct sk_buff *skb, bool napi_safe)
 {
 	unsigned char *head = skb->head;
 
 	if (skb->head_frag) {
-		if (skb_pp_recycle(skb, head))
+		if (skb_pp_recycle(skb, head, napi_safe))
 			return;
 		skb_free_frag(head);
 	} else {
@@ -869,7 +869,8 @@ static void skb_free_head(struct sk_buff *skb)
 	}
 }
 
-static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
+static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
+			     bool napi_safe)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	int i;
@@ -888,13 +889,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
 	}
 
 	for (i = 0; i < shinfo->nr_frags; i++)
-		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
+		napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);
 
 free_head:
 	if (shinfo->frag_list)
 		kfree_skb_list_reason(shinfo->frag_list, reason);
 
-	skb_free_head(skb);
+	skb_free_head(skb, napi_safe);
 exit:
 	/* When we clone an SKB we copy the reycling bit. The pp_recycle
 	 * bit is only set on the head though, so in order to avoid races
@@ -955,11 +956,12 @@ void skb_release_head_state(struct sk_buff *skb)
 }
 
 /* Free everything but the sk_buff shell. */
-static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
+static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
+			    bool napi_safe)
 {
 	skb_release_head_state(skb);
 	if (likely(skb->head))
-		skb_release_data(skb, reason);
+		skb_release_data(skb, reason, napi_safe);
 }
 
 /**
@@ -973,7 +975,7 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
 
 void __kfree_skb(struct sk_buff *skb)
 {
-	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
+	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false);
 	kfree_skbmem(skb);
 }
 EXPORT_SYMBOL(__kfree_skb);
@@ -1027,7 +1029,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb,
 		return;
 	}
 
-	skb_release_all(skb, reason);
+	skb_release_all(skb, reason, false);
 	sa->skb_array[sa->skb_count++] = skb;
 
 	if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
@@ -1201,7 +1203,7 @@ EXPORT_SYMBOL(consume_skb);
 void __consume_stateless_skb(struct sk_buff *skb)
 {
 	trace_consume_skb(skb, __builtin_return_address(0));
-	skb_release_data(skb, SKB_CONSUMED);
+	skb_release_data(skb, SKB_CONSUMED, false);
 	kfree_skbmem(skb);
 }
 
@@ -1226,7 +1228,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)
 
 void __kfree_skb_defer(struct sk_buff *skb)
 {
-	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
+	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, true);
 	napi_skb_cache_put(skb);
 }
 
@@ -1264,7 +1266,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
 		return;
 	}
 
-	skb_release_all(skb, SKB_CONSUMED);
+	skb_release_all(skb, SKB_CONSUMED, !!budget);
 	napi_skb_cache_put(skb);
 }
 EXPORT_SYMBOL(napi_consume_skb);
@@ -1395,7 +1397,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
  */
 struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
 {
-	skb_release_all(dst, SKB_CONSUMED);
+	skb_release_all(dst, SKB_CONSUMED, false);
 	return __skb_clone(dst, src);
 }
 EXPORT_SYMBOL_GPL(skb_morph);
@@ -2018,9 +2020,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		if (skb_has_frag_list(skb))
 			skb_clone_fraglist(skb);
 
-		skb_release_data(skb, SKB_CONSUMED);
+		skb_release_data(skb, SKB_CONSUMED, false);
 	} else {
-		skb_free_head(skb);
+		skb_free_head(skb, false);
 	}
 	off = (data + nhead) - skb->head;
@@ -6389,12 +6391,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
 			skb_frag_ref(skb, i);
 		if (skb_has_frag_list(skb))
 			skb_clone_fraglist(skb);
-		skb_release_data(skb, SKB_CONSUMED);
+		skb_release_data(skb, SKB_CONSUMED, false);
 	} else {
 		/* we can reuse existing recount- all we did was
 		 * relocate values
 		 */
-		skb_free_head(skb);
+		skb_free_head(skb, false);
 	}
 
 	skb->head = data;
@@ -6529,7 +6531,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
 		skb_kfree_head(data, size);
 		return -ENOMEM;
 	}
-	skb_release_data(skb, SKB_CONSUMED);
+	skb_release_data(skb, SKB_CONSUMED, false);
 
 	skb->head = data;
 	skb->head_frag = 0;
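Note how napi_safe is derived at the free sites above: __kfree_skb_defer()
passes true since it only runs from NAPI context, while napi_consume_skb()
passes !!budget, because a zero budget means the caller is netpoll rather
than a genuine NAPI poll. A hypothetical TX-completion loop (my_ring and
its helpers are illustrative only) shows where the flag originates:

#include <linux/netdevice.h>
#include <linux/skbuff.h>

struct my_ring {			/* hypothetical driver state */
	struct napi_struct napi;
};

/* Illustrative completion-queue helpers, assumed to exist elsewhere. */
static bool my_ring_has_completions(struct my_ring *ring);
static struct sk_buff *my_ring_next_completion(struct my_ring *ring);

static int my_napi_poll(struct napi_struct *napi, int budget)
{
	struct my_ring *ring = container_of(napi, struct my_ring, napi);
	int done = 0;

	/* Reap TX completions; budget only limits RX work. */
	while (my_ring_has_completions(ring)) {
		struct sk_buff *skb = my_ring_next_completion(ring);

		/* budget > 0: frees run with napi_safe == true, so
		 * page_pool pages can recycle straight into this
		 * NAPI's cache. budget == 0 (netpoll): napi_safe is
		 * false and the locked ptr_ring path is used instead.
		 */
		napi_consume_skb(skb, budget);
	}

	/* ... RX processing would go here, incrementing done ... */

	if (done < budget)
		napi_complete_done(napi, done);
	return done;
}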