From 2896624be30b049601ec3ef9b08df184d0c70495 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Fri, 28 Jun 2024 12:18:54 +0200
Subject: [PATCH 1/3] net: Remove task_struct::bpf_net_context init on fork.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is no clone() invocation within a bpf_net_ctx_…() block, so
task_struct::bpf_net_context is always NULL at that point and an
explicit initialisation is not required. Remove the NULL assignment in
the clone() path.

Suggested-by: Jakub Kicinski
Signed-off-by: Sebastian Andrzej Siewior
Reviewed-by: Jakub Kicinski
Signed-off-by: Paolo Abeni
---
 kernel/fork.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index f314bdd7e610..99076dbe27d8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2355,7 +2355,6 @@ __latent_entropy struct task_struct *copy_process(
 	RCU_INIT_POINTER(p->bpf_storage, NULL);
 	p->bpf_ctx = NULL;
 #endif
-	p->bpf_net_context = NULL;
 
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	retval = sched_fork(clone_flags, p);

From d839a73179ae91c07f5f2f97ccb9c69b2b7c3306 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Fri, 28 Jun 2024 12:18:55 +0200
Subject: [PATCH 2/3] net: Optimize xdp_do_flush() with bpf_net_context infos.

Every NIC driver utilizing XDP should invoke xdp_do_flush() after
processing all packets. With the introduction of the bpf_net_context
logic the flush lists (for dev, CPU-map and xsk) are lazily initialized
only if used. However, xdp_do_flush() tries to flush all three of them,
so all three lists are always initialized and the likely empty lists
are "iterated". Even without any XDP usage the lists are initialized if
CONFIG_DEBUG_NET is enabled, due to xdp_do_check_flushed().

Jakub suggested using the hints in bpf_net_context to avoid invoking
the flush functions. This also avoids initializing the otherwise unused
lists.

Introduce bpf_net_ctx_get_all_used_flush_lists() to return each
individual list only if it is non-empty. Use this logic in
xdp_do_flush() and xdp_do_check_flushed(). Remove the no longer needed
.*_check_flush() helpers.
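
For illustration, a minimal sketch of the calling convention this relies
on: a NAPI poll loop in a hypothetical driver. All foo_* names and the
ring layout are invented for the example; only bpf_prog_run_xdp(),
xdp_do_redirect(), xdp_do_flush() and napi_complete_done() are real
kernel APIs, and error handling is omitted:

/* Hypothetical NAPI poll: XDP_REDIRECT verdicts may queue frames on the
 * per-task bpf_net_context flush lists; a single xdp_do_flush() at the
 * end of the budget flushes whatever was actually queued.
 */
static int foo_napi_poll(struct napi_struct *napi, int budget)
{
	struct foo_rx_ring *ring = container_of(napi, struct foo_rx_ring, napi);
	int done = 0;

	while (done < budget) {
		struct xdp_buff xdp;

		if (!foo_fill_xdp_buff(ring, &xdp))	/* ring empty */
			break;

		switch (bpf_prog_run_xdp(ring->xdp_prog, &xdp)) {
		case XDP_REDIRECT:
			/* May add to the dev, cpumap or xsk flush list. */
			xdp_do_redirect(ring->netdev, &xdp, ring->xdp_prog);
			break;
		default:
			foo_pass_to_stack(ring, &xdp);
			break;
		}
		done++;
	}

	/* Flush once per poll; with this patch, unused lists are skipped. */
	xdp_do_flush();

	if (done < budget)
		napi_complete_done(napi, done);

	return done;
}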
Suggested-by: Jakub Kicinski
Signed-off-by: Sebastian Andrzej Siewior
Reviewed-by: Jakub Kicinski
Signed-off-by: Paolo Abeni
---
 include/linux/bpf.h    | 10 ++++------
 include/linux/filter.h | 27 +++++++++++++++++++++++++++
 include/net/xdp_sock.h | 14 ++------------
 kernel/bpf/cpumap.c    | 13 +------------
 kernel/bpf/devmap.c    | 13 +------------
 net/core/filter.c      | 33 +++++++++++++++++++++++++--------
 net/xdp/xsk.c          | 13 +------------
 7 files changed, 61 insertions(+), 62 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a834f4b761bc..f5c6bc9093a6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2494,7 +2494,7 @@ struct sk_buff;
 struct bpf_dtab_netdev;
 struct bpf_cpu_map_entry;
 
-void __dev_flush(void);
+void __dev_flush(struct list_head *flush_list);
 int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
 		    struct net_device *dev_rx);
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
@@ -2507,7 +2507,7 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
 			   struct bpf_prog *xdp_prog, struct bpf_map *map,
 			   bool exclude_ingress);
 
-void __cpu_map_flush(void);
+void __cpu_map_flush(struct list_head *flush_list);
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
 		    struct net_device *dev_rx);
 int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
@@ -2644,8 +2644,6 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
 void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
 void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr);
 
-bool dev_check_flush(void);
-bool cpu_map_check_flush(void);
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
@@ -2738,7 +2736,7 @@ static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd)
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
-static inline void __dev_flush(void)
+static inline void __dev_flush(struct list_head *flush_list)
 {
 }
 
@@ -2784,7 +2782,7 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
 	return 0;
 }
 
-static inline void __cpu_map_flush(void)
+static inline void __cpu_map_flush(struct list_head *flush_list)
 {
 }
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index c0349522de8f..02ddcfdf94c4 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -829,6 +829,33 @@ static inline struct list_head *bpf_net_ctx_get_xskmap_flush_list(void)
 	return &bpf_net_ctx->xskmap_map_flush_list;
 }
 
+static inline void bpf_net_ctx_get_all_used_flush_lists(struct list_head **lh_map,
+							 struct list_head **lh_dev,
+							 struct list_head **lh_xsk)
+{
+	struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get();
+	u32 kern_flags = bpf_net_ctx->ri.kern_flags;
+	struct list_head *lh;
+
+	*lh_map = *lh_dev = *lh_xsk = NULL;
+
+	if (!IS_ENABLED(CONFIG_BPF_SYSCALL))
+		return;
+
+	lh = &bpf_net_ctx->dev_map_flush_list;
+	if (kern_flags & BPF_RI_F_DEV_MAP_INIT && !list_empty(lh))
+		*lh_dev = lh;
+
+	lh = &bpf_net_ctx->cpu_map_flush_list;
+	if (kern_flags & BPF_RI_F_CPU_MAP_INIT && !list_empty(lh))
+		*lh_map = lh;
+
+	lh = &bpf_net_ctx->xskmap_map_flush_list;
+	if (IS_ENABLED(CONFIG_XDP_SOCKETS) &&
+	    kern_flags & BPF_RI_F_XSK_MAP_INIT && !list_empty(lh))
+		*lh_xsk = lh;
+}
+
 /* Compute the linear packet data range [data, data_end) which
  * will be accessed by various program types (cls_bpf, act_bpf,
  * lwt, ...). Subsystems allowing direct data access must (!)
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 3d54de168a6d..bfe625b55d55 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -121,7 +121,7 @@ struct xsk_tx_metadata_ops {
 
 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp);
-void __xsk_map_flush(void);
+void __xsk_map_flush(struct list_head *flush_list);
 
 /**
  * xsk_tx_metadata_to_compl - Save enough relevant metadata information
@@ -206,7 +206,7 @@ static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
 	return -EOPNOTSUPP;
 }
 
-static inline void __xsk_map_flush(void)
+static inline void __xsk_map_flush(struct list_head *flush_list)
 {
 }
 
@@ -228,14 +228,4 @@ static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl,
 }
 
 #endif /* CONFIG_XDP_SOCKETS */
-
-#if defined(CONFIG_XDP_SOCKETS) && defined(CONFIG_DEBUG_NET)
-bool xsk_map_check_flush(void);
-#else
-static inline bool xsk_map_check_flush(void)
-{
-	return false;
-}
-#endif
-
 #endif /* _LINUX_XDP_SOCK_H */
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 068e994ed781..4acf90cd79eb 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -757,9 +757,8 @@ trace:
 	return ret;
 }
 
-void __cpu_map_flush(void)
+void __cpu_map_flush(struct list_head *flush_list)
 {
-	struct list_head *flush_list = bpf_net_ctx_get_cpu_map_flush_list();
 	struct xdp_bulk_queue *bq, *tmp;
 
 	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
@@ -769,13 +768,3 @@ void __cpu_map_flush(void)
 		wake_up_process(bq->obj->kthread);
 	}
 }
-
-#ifdef CONFIG_DEBUG_NET
-bool cpu_map_check_flush(void)
-{
-	if (list_empty(bpf_net_ctx_get_cpu_map_flush_list()))
-		return false;
-	__cpu_map_flush();
-	return true;
-}
-#endif
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 317ac2d66ebd..9ca47eaacdd5 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -412,9 +412,8 @@ out:
  * driver before returning from its napi->poll() routine. See the comment above
  * xdp_do_flush() in filter.c.
  */
-void __dev_flush(void)
+void __dev_flush(struct list_head *flush_list)
 {
-	struct list_head *flush_list = bpf_net_ctx_get_dev_flush_list();
 	struct xdp_dev_bulk_queue *bq, *tmp;
 
 	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
@@ -425,16 +424,6 @@ void __dev_flush(void)
 	}
 }
 
-#ifdef CONFIG_DEBUG_NET
-bool dev_check_flush(void)
-{
-	if (list_empty(bpf_net_ctx_get_dev_flush_list()))
-		return false;
-	__dev_flush();
-	return true;
-}
-#endif
-
 /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
  * by local_bh_disable() (from XDP calls inside NAPI). The
  * rcu_read_lock_bh_held() below makes lockdep accept both.
diff --git a/net/core/filter.c b/net/core/filter.c
index eb1c4425c06f..403d23faf22e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4277,22 +4277,39 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
  */
 void xdp_do_flush(void)
 {
-	__dev_flush();
-	__cpu_map_flush();
-	__xsk_map_flush();
+	struct list_head *lh_map, *lh_dev, *lh_xsk;
+
+	bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
+	if (lh_dev)
+		__dev_flush(lh_dev);
+	if (lh_map)
+		__cpu_map_flush(lh_map);
+	if (lh_xsk)
+		__xsk_map_flush(lh_xsk);
 }
 EXPORT_SYMBOL_GPL(xdp_do_flush);
 
 #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
 void xdp_do_check_flushed(struct napi_struct *napi)
 {
-	bool ret;
+	struct list_head *lh_map, *lh_dev, *lh_xsk;
+	bool missed = false;
 
-	ret = dev_check_flush();
-	ret |= cpu_map_check_flush();
-	ret |= xsk_map_check_flush();
+	bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
+	if (lh_dev) {
+		__dev_flush(lh_dev);
+		missed = true;
+	}
+	if (lh_map) {
+		__cpu_map_flush(lh_map);
+		missed = true;
+	}
+	if (lh_xsk) {
+		__xsk_map_flush(lh_xsk);
+		missed = true;
+	}
 
-	WARN_ONCE(ret, "Missing xdp_do_flush() invocation after NAPI by %ps\n",
+	WARN_ONCE(missed, "Missing xdp_do_flush() invocation after NAPI by %ps\n",
 		  napi->poll);
 }
 #endif
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index ed062e038389..de9c0322bc29 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -383,9 +383,8 @@ int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
 	return 0;
 }
 
-void __xsk_map_flush(void)
+void __xsk_map_flush(struct list_head *flush_list)
 {
-	struct list_head *flush_list = bpf_net_ctx_get_xskmap_flush_list();
 	struct xdp_sock *xs, *tmp;
 
 	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
@@ -394,16 +393,6 @@ void __xsk_map_flush(void)
 	}
 }
 
-#ifdef CONFIG_DEBUG_NET
-bool xsk_map_check_flush(void)
-{
-	if (list_empty(bpf_net_ctx_get_xskmap_flush_list()))
-		return false;
-	__xsk_map_flush();
-	return true;
-}
-#endif
-
 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
 {
 	xskq_prod_submit_n(pool->cq, nb_entries);

From e3d69f585d651aba877e18866de7e8cfa2476caa Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Fri, 28 Jun 2024 12:18:56 +0200
Subject: [PATCH 3/3] net: Move flush list retrieval to where it is used.

The bpf_net_ctx_get_.*_flush_list() helpers are invoked at the top of
each function, so the flush-list variable is always assigned even if it
ends up unused. By moving the call to the point of use, the
initialisation can be delayed until it is unavoidable. It is not clear
how much this gains in practice, but looking at bq_enqueue() (in
devmap.c), gcc pushes one register less to the stack. \o/

Move the flush list retrieval to where it is used.
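
For context, a rough sketch of what such a bpf_net_ctx_get_*_flush_list()
helper does, modelled on the cpumap variant (simplified, not part of this
series): the first use initialises the per-task list and records that in
ri.kern_flags, which is exactly the hint patch 2 keys off.

/* Simplified sketch: initialise the per-task flush list on first use and
 * remember that in ri.kern_flags so later code can tell whether the list
 * was ever touched.
 */
static inline struct list_head *bpf_net_ctx_get_cpu_map_flush_list(void)
{
	struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get();

	if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_CPU_MAP_INIT)) {
		INIT_LIST_HEAD(&bpf_net_ctx->cpu_map_flush_list);
		bpf_net_ctx->ri.kern_flags |= BPF_RI_F_CPU_MAP_INIT;
	}

	return &bpf_net_ctx->cpu_map_flush_list;
}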
Signed-off-by: Sebastian Andrzej Siewior
Acked-by: Jesper Dangaard Brouer
Reviewed-by: Jakub Kicinski
Signed-off-by: Paolo Abeni
---
 kernel/bpf/cpumap.c | 6 ++++--
 kernel/bpf/devmap.c | 3 ++-
 net/xdp/xsk.c       | 6 ++++--
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 4acf90cd79eb..fbdf5a1aabfe 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -707,7 +707,6 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
  */
 static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
 {
-	struct list_head *flush_list = bpf_net_ctx_get_cpu_map_flush_list();
 	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
 
 	if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
@@ -724,8 +723,11 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
 	 */
 	bq->q[bq->count++] = xdpf;
 
-	if (!bq->flush_node.prev)
+	if (!bq->flush_node.prev) {
+		struct list_head *flush_list = bpf_net_ctx_get_cpu_map_flush_list();
+
 		list_add(&bq->flush_node, flush_list);
+	}
 }
 
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 9ca47eaacdd5..b18d4a14a0a7 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -448,7 +448,6 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
 		       struct net_device *dev_rx, struct bpf_prog *xdp_prog)
 {
-	struct list_head *flush_list = bpf_net_ctx_get_dev_flush_list();
 	struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
 
 	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
@@ -462,6 +461,8 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
 	 * are only ever modified together.
 	 */
 	if (!bq->dev_rx) {
+		struct list_head *flush_list = bpf_net_ctx_get_dev_flush_list();
+
 		bq->dev_rx = dev_rx;
 		bq->xdp_prog = xdp_prog;
 		list_add(&bq->flush_node, flush_list);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index de9c0322bc29..7e16336044b2 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -370,15 +370,17 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 
 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
-	struct list_head *flush_list = bpf_net_ctx_get_xskmap_flush_list();
 	int err;
 
 	err = xsk_rcv(xs, xdp);
 	if (err)
 		return err;
 
-	if (!xs->flush_node.prev)
+	if (!xs->flush_node.prev) {
+		struct list_head *flush_list = bpf_net_ctx_get_xskmap_flush_list();
+
 		list_add(&xs->flush_node, flush_list);
+	}
 
 	return 0;
 }
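
For reference, the fields consulted by bpf_net_ctx_get_all_used_flush_lists()
live in the per-task struct bpf_net_context. A rough sketch as implied by the
accesses in patch 2 (the exact upstream layout may differ):

/* Rough sketch of the per-task context implied by the accesses above.
 * ri.kern_flags carries the BPF_RI_F_*_INIT bits that are set when a
 * flush list is initialised on first use.
 */
struct bpf_net_context {
	struct bpf_redirect_info ri;
	struct list_head cpu_map_flush_list;
	struct list_head dev_map_flush_list;
	struct list_head xskmap_map_flush_list;
};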