From b07a2d97ba5ef154fe736aa510e43a3299eee5f8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 12 Apr 2023 21:26:03 -0700 Subject: [PATCH 1/3] net: skb: plumb napi state thru skb freeing paths We maintain a NAPI-local cache of skbs which is fed by napi_consume_skb(). Going forward we will also try to cache head and data pages. Plumb the "are we in a normal NAPI context" information thru deeper into the freeing path, up to skb_release_data() and skb_free_head()/skb_pp_recycle(). The "not normal NAPI context" comes from netpoll which passes budget of 0 to try to reap the Tx completions but not perform any Rx. Use "bool napi_safe" rather than bare "int budget", the further we get from NAPI the more confusing the budget argument may seem (particularly whether 0 or MAX is the correct value to pass in when not in NAPI). Reviewed-by: Tariq Toukan Tested-by: Dragos Tatulea Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 78238a13dbcf..2b5a98c5cb49 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -839,7 +839,7 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } -static bool skb_pp_recycle(struct sk_buff *skb, void *data) +static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) { if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) return false; @@ -856,12 +856,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset) kfree(head); } -static void skb_free_head(struct sk_buff *skb) +static void skb_free_head(struct sk_buff *skb, bool napi_safe) { unsigned char *head = skb->head; if (skb->head_frag) { - if (skb_pp_recycle(skb, head)) + if (skb_pp_recycle(skb, head, napi_safe)) return; skb_free_frag(head); } else { @@ -869,7 +869,8 @@ static void skb_free_head(struct sk_buff *skb) } } -static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason) +static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason, + bool napi_safe) { struct skb_shared_info *shinfo = skb_shinfo(skb); int i; @@ -894,7 +895,7 @@ free_head: if (shinfo->frag_list) kfree_skb_list_reason(shinfo->frag_list, reason); - skb_free_head(skb); + skb_free_head(skb, napi_safe); exit: /* When we clone an SKB we copy the reycling bit. The pp_recycle * bit is only set on the head though, so in order to avoid races @@ -955,11 +956,12 @@ void skb_release_head_state(struct sk_buff *skb) } /* Free everything but the sk_buff shell. */ -static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason) +static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason, + bool napi_safe) { skb_release_head_state(skb); if (likely(skb->head)) - skb_release_data(skb, reason); + skb_release_data(skb, reason, napi_safe); } /** @@ -973,7 +975,7 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason) void __kfree_skb(struct sk_buff *skb) { - skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED); + skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false); kfree_skbmem(skb); } EXPORT_SYMBOL(__kfree_skb); @@ -1027,7 +1029,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb, return; } - skb_release_all(skb, reason); + skb_release_all(skb, reason, false); sa->skb_array[sa->skb_count++] = skb; if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) { @@ -1201,7 +1203,7 @@ EXPORT_SYMBOL(consume_skb); void __consume_stateless_skb(struct sk_buff *skb) { trace_consume_skb(skb, __builtin_return_address(0)); - skb_release_data(skb, SKB_CONSUMED); + skb_release_data(skb, SKB_CONSUMED, false); kfree_skbmem(skb); } @@ -1226,7 +1228,7 @@ static void napi_skb_cache_put(struct sk_buff *skb) void __kfree_skb_defer(struct sk_buff *skb) { - skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED); + skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, true); napi_skb_cache_put(skb); } @@ -1264,7 +1266,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget) return; } - skb_release_all(skb, SKB_CONSUMED); + skb_release_all(skb, SKB_CONSUMED, !!budget); napi_skb_cache_put(skb); } EXPORT_SYMBOL(napi_consume_skb); @@ -1395,7 +1397,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg); */ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) { - skb_release_all(dst, SKB_CONSUMED); + skb_release_all(dst, SKB_CONSUMED, false); return __skb_clone(dst, src); } EXPORT_SYMBOL_GPL(skb_morph); @@ -2018,9 +2020,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); - skb_release_data(skb, SKB_CONSUMED); + skb_release_data(skb, SKB_CONSUMED, false); } else { - skb_free_head(skb); + skb_free_head(skb, false); } off = (data + nhead) - skb->head; @@ -6389,12 +6391,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, skb_frag_ref(skb, i); if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); - skb_release_data(skb, SKB_CONSUMED); + skb_release_data(skb, SKB_CONSUMED, false); } else { /* we can reuse existing recount- all we did was * relocate values */ - skb_free_head(skb); + skb_free_head(skb, false); } skb->head = data; @@ -6529,7 +6531,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, skb_kfree_head(data, size); return -ENOMEM; } - skb_release_data(skb, SKB_CONSUMED); + skb_release_data(skb, SKB_CONSUMED, false); skb->head = data; skb->head_frag = 0; From 8c48eea3adf3119e0a3fc57bd31f6966f26ee784 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 12 Apr 2023 21:26:04 -0700 Subject: [PATCH 2/3] page_pool: allow caching from safely localized NAPI Recent patches to mlx5 mentioned a regression when moving from driver local page pool to only using the generic page pool code. Page pool has two recycling paths (1) direct one, which runs in safe NAPI context (basically consumer context, so producing can be lockless); and (2) via a ptr_ring, which takes a spin lock because the freeing can happen from any CPU; producer and consumer may run concurrently. Since the page pool code was added, Eric introduced a revised version of deferred skb freeing. TCP skbs are now usually returned to the CPU which allocated them, and freed in softirq context. This places the freeing (producing of pages back to the pool) enticingly close to the allocation (consumer). If we can prove that we're freeing in the same softirq context in which the consumer NAPI will run - lockless use of the cache is perfectly fine, no need for the lock. Let drivers link the page pool to a NAPI instance. If the NAPI instance is scheduled on the same CPU on which we're freeing - place the pages in the direct cache. With that and patched bnxt (XDP enabled to engage the page pool, sigh, bnxt really needs page pool work :() I see a 2.6% perf boost with a TCP stream test (app on a different physical core than softirq). The CPU use of relevant functions decreases as expected: page_pool_refill_alloc_cache 1.17% -> 0% _raw_spin_lock 2.41% -> 0.98% Only consider lockless path to be safe when NAPI is scheduled - in practice this should cover majority if not all of steady state workloads. It's usually the NAPI kicking in that causes the skb flush. The main case we'll miss out on is when application runs on the same CPU as NAPI. In that case we don't use the deferred skb free path. Reviewed-by: Tariq Toukan Acked-by: Jesper Dangaard Brouer Tested-by: Dragos Tatulea Signed-off-by: Jakub Kicinski --- Documentation/networking/page_pool.rst | 1 + include/linux/netdevice.h | 3 +++ include/linux/skbuff.h | 20 +++++++++++++------- include/net/page_pool.h | 3 ++- net/core/dev.c | 3 +++ net/core/page_pool.c | 15 +++++++++++++-- net/core/skbuff.c | 4 ++-- 7 files changed, 37 insertions(+), 12 deletions(-) diff --git a/Documentation/networking/page_pool.rst b/Documentation/networking/page_pool.rst index 30f1344e7cca..873efd97f822 100644 --- a/Documentation/networking/page_pool.rst +++ b/Documentation/networking/page_pool.rst @@ -165,6 +165,7 @@ Registration pp_params.pool_size = DESC_NUM; pp_params.nid = NUMA_NO_NODE; pp_params.dev = priv->dev; + pp_params.napi = napi; /* only if locking is tied to NAPI */ pp_params.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE; page_pool = page_pool_create(&pp_params); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 96d27d558b0c..203c0df2046c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -360,8 +360,11 @@ struct napi_struct { unsigned long gro_bitmask; int (*poll)(struct napi_struct *, int); #ifdef CONFIG_NETPOLL + /* CPU actively polling if netpoll is configured */ int poll_owner; #endif + /* CPU on which NAPI has been scheduled for processing */ + int list_owner; struct net_device *dev; struct gro_list gro_hash[GRO_HASH_BUCKETS]; struct sk_buff *skb; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 494a23a976b0..a823ec3aa326 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3386,6 +3386,18 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f) __skb_frag_ref(&skb_shinfo(skb)->frags[f]); } +static inline void +napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe) +{ + struct page *page = skb_frag_page(frag); + +#ifdef CONFIG_PAGE_POOL + if (recycle && page_pool_return_skb_page(page, napi_safe)) + return; +#endif + put_page(page); +} + /** * __skb_frag_unref - release a reference on a paged fragment. * @frag: the paged fragment @@ -3396,13 +3408,7 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f) */ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) { - struct page *page = skb_frag_page(frag); - -#ifdef CONFIG_PAGE_POOL - if (recycle && page_pool_return_skb_page(page)) - return; -#endif - put_page(page); + napi_frag_unref(frag, recycle, false); } /** diff --git a/include/net/page_pool.h b/include/net/page_pool.h index ddfa0b328677..91b808dade82 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -77,6 +77,7 @@ struct page_pool_params { unsigned int pool_size; int nid; /* Numa node id to allocate from pages from */ struct device *dev; /* device, for DMA pre-mapping purposes */ + struct napi_struct *napi; /* Sole consumer of pages, otherwise NULL */ enum dma_data_direction dma_dir; /* DMA mapping direction */ unsigned int max_len; /* max DMA sync memory size */ unsigned int offset; /* DMA addr offset */ @@ -239,7 +240,7 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool) return pool->p.dma_dir; } -bool page_pool_return_skb_page(struct page *page); +bool page_pool_return_skb_page(struct page *page, bool napi_safe); struct page_pool *page_pool_create(const struct page_pool_params *params); diff --git a/net/core/dev.c b/net/core/dev.c index c7f13742b56c..8aea68275172 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4359,6 +4359,7 @@ static inline void ____napi_schedule(struct softnet_data *sd, } list_add_tail(&napi->poll_list, &sd->poll_list); + WRITE_ONCE(napi->list_owner, smp_processor_id()); /* If not called from net_rx_action() * we have to raise NET_RX_SOFTIRQ. */ @@ -6069,6 +6070,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) list_del_init(&n->poll_list); local_irq_restore(flags); } + WRITE_ONCE(n->list_owner, -1); val = READ_ONCE(n->state); do { @@ -6384,6 +6386,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, #ifdef CONFIG_NETPOLL napi->poll_owner = -1; #endif + napi->list_owner = -1; set_bit(NAPI_STATE_SCHED, &napi->state); set_bit(NAPI_STATE_NPSVC, &napi->state); list_add_rcu(&napi->dev_list, &dev->napi_list); diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 193c18799865..2f6bf422ed30 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -19,6 +19,7 @@ #include /* for put_page() */ #include #include +#include #include @@ -874,9 +875,11 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) } EXPORT_SYMBOL(page_pool_update_nid); -bool page_pool_return_skb_page(struct page *page) +bool page_pool_return_skb_page(struct page *page, bool napi_safe) { + struct napi_struct *napi; struct page_pool *pp; + bool allow_direct; page = compound_head(page); @@ -892,12 +895,20 @@ bool page_pool_return_skb_page(struct page *page) pp = page->pp; + /* Allow direct recycle if we have reasons to believe that we are + * in the same context as the consumer would run, so there's + * no possible race. + */ + napi = pp->p.napi; + allow_direct = napi_safe && napi && + READ_ONCE(napi->list_owner) == smp_processor_id(); + /* Driver set this to memory recycling info. Reset it on recycle. * This will *not* work for NIC using a split-page memory model. * The page will be returned to the pool here regardless of the * 'flipped' fragment being in use or not. */ - page_pool_put_full_page(pp, page, false); + page_pool_put_full_page(pp, page, allow_direct); return true; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2b5a98c5cb49..ef81452759be 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -843,7 +843,7 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) { if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) return false; - return page_pool_return_skb_page(virt_to_page(data)); + return page_pool_return_skb_page(virt_to_page(data), napi_safe); } static void skb_kfree_head(void *head, unsigned int end_offset) @@ -889,7 +889,7 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason, } for (i = 0; i < shinfo->nr_frags; i++) - __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); + napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe); free_head: if (shinfo->frag_list) From 294e39e0d03449f32b0723d3fd99ab5c23881c05 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 12 Apr 2023 21:26:05 -0700 Subject: [PATCH 3/3] bnxt: hook NAPIs to page pools bnxt has 1:1 mapping of page pools and NAPIs, so it's safe to hoook them up together. Reviewed-by: Tariq Toukan Tested-by: Dragos Tatulea Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 92289ab2f34a..8fb9d1bbe56f 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -3211,6 +3211,7 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, pp.pool_size = bp->rx_ring_size; pp.nid = dev_to_node(&bp->pdev->dev); + pp.napi = &rxr->bnapi->napi; pp.dev = &bp->pdev->dev; pp.dma_dir = DMA_BIDIRECTIONAL;