From 024cd2cbd1ca2d29e6df538855d52c4e5990cab7 Mon Sep 17 00:00:00 2001 From: Santucci Pierpaolo Date: Mon, 16 Nov 2020 11:30:37 +0100 Subject: [PATCH 001/118] selftest/bpf: Fix IPV6FR handling in flow dissector From second fragment on, IPV6FR program must stop the dissection of IPV6 fragmented packet. This is the same approach used for IPV4 fragmentation. This fixes the flow keys calculation for the upper-layer protocols. Note that according to RFC8200, the first fragment packet must include the upper-layer header. Signed-off-by: Santucci Pierpaolo Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/X7JUzUj34ceE2wBm@santucci.pierpaolo --- tools/testing/selftests/bpf/progs/bpf_flow.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c index 5a65f6b51377..95a5a0778ed7 100644 --- a/tools/testing/selftests/bpf/progs/bpf_flow.c +++ b/tools/testing/selftests/bpf/progs/bpf_flow.c @@ -368,6 +368,8 @@ PROG(IPV6FR)(struct __sk_buff *skb) */ if (!(keys->flags & BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) return export_flow_keys(keys, BPF_OK); + } else { + return export_flow_keys(keys, BPF_OK); } return parse_ipv6_proto(skb, fragh->nexthdr); From b93ef089d35c3386dd197e85afb6399bbd54cfb3 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 16 Nov 2020 12:01:13 -0800 Subject: [PATCH 002/118] bpf: Fix the irq and nmi check in bpf_sk_storage for tracing usage The intention of the current check is to avoid using bpf_sk_storage in irq and nmi. Jakub pointed out that the current check cannot do that. For example, in_serving_softirq() returns true if the softirq handling is interrupted by hard irq. Fixes: 8e4597c627fb ("bpf: Allow using bpf_sk_storage in FENTRY/FEXIT/RAW_TP") Suggested-by: Jakub Kicinski Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201116200113.2868539-1-kafai@fb.com --- net/core/bpf_sk_storage.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 359908a7d3c1..a32037daa933 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -415,7 +415,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, void *, value, u64, flags) { - if (!in_serving_softirq() && !in_task()) + if (in_irq() || in_nmi()) return (unsigned long)NULL; return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags); @@ -424,7 +424,7 @@ BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map, struct sock *, sk) { - if (!in_serving_softirq() && !in_task()) + if (in_irq() || in_nmi()) return -EPERM; return ____bpf_sk_storage_delete(map, sk); From de91e631bdc7e6411989e1a9ab65501a31527e0b Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Sun, 15 Nov 2020 10:46:35 +0000 Subject: [PATCH 003/118] libbpf: bpf__find_by_name[_kind] should use btf__get_nr_types() When operating on split BTF, btf__find_by_name[_kind] will not iterate over all types since they use btf->nr_types to show the number of types to iterate over. For split BTF this is the number of types _on top of base BTF_, so it will underestimate the number of types to iterate over, especially for vmlinux + module BTF, where the latter is much smaller. Use btf__get_nr_types() instead. 
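As an illustration only (not part of the patch), a caller that wants to walk
every type in a possibly-split BTF object follows the same pattern;
btf__get_nr_types() returns the total type count across base and split BTF:

  /* Sketch: iterate over all types, base BTF included.
   * Type IDs are 1-based; ID 0 is reserved for void.
   */
  __u32 i, nr_types = btf__get_nr_types(btf);

  for (i = 1; i <= nr_types; i++) {
          const struct btf_type *t = btf__type_by_id(btf, i);
          const char *name = btf__name_by_offset(btf, t->name_off);

          /* inspect t and name here */
  }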
Fixes: ba451366bf44 ("libbpf: Implement basic split BTF support")
Signed-off-by: Alan Maguire
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/1605437195-2175-1-git-send-email-alan.maguire@oracle.com
---
 tools/lib/bpf/btf.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 2d0d064c6d31..8ff46cd30ca1 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -674,12 +674,12 @@ int btf__resolve_type(const struct btf *btf, __u32 type_id)
 __s32 btf__find_by_name(const struct btf *btf, const char *type_name)
 {
-	__u32 i;
+	__u32 i, nr_types = btf__get_nr_types(btf);

 	if (!strcmp(type_name, "void"))
 		return 0;

-	for (i = 1; i <= btf->nr_types; i++) {
+	for (i = 1; i <= nr_types; i++) {
 		const struct btf_type *t = btf__type_by_id(btf, i);
 		const char *name = btf__name_by_offset(btf, t->name_off);

@@ -693,12 +693,12 @@ __s32 btf__find_by_name(const struct btf *btf, const char *type_name)
 __s32 btf__find_by_name_kind(const struct btf *btf, const char *type_name,
 			     __u32 kind)
 {
-	__u32 i;
+	__u32 i, nr_types = btf__get_nr_types(btf);

 	if (kind == BTF_KIND_UNKN || !strcmp(type_name, "void"))
 		return 0;

-	for (i = 1; i <= btf->nr_types; i++) {
+	for (i = 1; i <= nr_types; i++) {
 		const struct btf_type *t = btf__type_by_id(btf, i);
 		const char *name;

From 90da4b3208d32bdb5489ca08b91af16ed4a68d00 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson
Date: Mon, 16 Nov 2020 12:12:43 +0100
Subject: [PATCH 004/118] samples/bpf: Increment Tx stats at sending

Increment the statistics for how many Tx packets have been sent at the
time of sending instead of at the time of completion. This is because a
completion event means that the buffer has been sent AND returned to
user space. The packet always gets sent shortly after sendto() is
called. The kernel might, for performance reasons, decide not to return
every single buffer to user space immediately after sending, and
instead, for example, only do so after a batch of packets has been
transmitted. Incrementing the number of packets sent at completion
would in that case be confusing: if you send a single packet, the
counter might show zero for a while even though the packet has been
transmitted.
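To make the accounting concrete, here is a minimal user-space sketch
(reusing the xsk_socket_info layout from xdpsock_user.c; tx_batch(),
complete_tx() and the batch size of 64 are illustrative, not from the
sample): packets are counted when descriptors are submitted, while the
completion handler only reclaims buffers.

  static void tx_batch(struct xsk_socket_info *xsk, u32 n)
  {
          u32 idx;

          if (xsk_ring_prod__reserve(&xsk->tx, n, &idx) != n)
                  return;
          /* ... fill the n descriptors via xsk_ring_prod__tx_desc() ... */
          xsk_ring_prod__submit(&xsk->tx, n);
          xsk->ring_stats.tx_npkts += n; /* count here, at send time */
          xsk->outstanding_tx += n;
          sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0);
  }

  static void complete_tx(struct xsk_socket_info *xsk)
  {
          u32 idx, rcvd;

          /* completions only recycle buffers; no stats update here */
          rcvd = xsk_ring_cons__peek(&xsk->umem->cq, 64, &idx);
          if (rcvd) {
                  xsk_ring_cons__release(&xsk->umem->cq, rcvd);
                  xsk->outstanding_tx -= rcvd;
          }
  }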
Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/1605525167-14450-2-git-send-email-magnus.karlsson@gmail.com --- samples/bpf/xdpsock_user.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 1149e94ca32f..2567f0db5aca 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -1146,7 +1146,6 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, xsk_ring_prod__submit(&xsk->umem->fq, rcvd); xsk_ring_cons__release(&xsk->umem->cq, rcvd); xsk->outstanding_tx -= rcvd; - xsk->ring_stats.tx_npkts += rcvd; } } @@ -1168,7 +1167,6 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk, if (rcvd > 0) { xsk_ring_cons__release(&xsk->umem->cq, rcvd); xsk->outstanding_tx -= rcvd; - xsk->ring_stats.tx_npkts += rcvd; } } @@ -1260,6 +1258,7 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size) } xsk_ring_prod__submit(&xsk->tx, batch_size); + xsk->ring_stats.tx_npkts += batch_size; xsk->outstanding_tx += batch_size; *frame_nb += batch_size; *frame_nb %= NUM_FRAMES; @@ -1348,6 +1347,7 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) } return; } + xsk->ring_stats.rx_npkts += rcvd; ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); while (ret != rcvd) { @@ -1379,7 +1379,7 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) xsk_ring_prod__submit(&xsk->tx, rcvd); xsk_ring_cons__release(&xsk->rx, rcvd); - xsk->ring_stats.rx_npkts += rcvd; + xsk->ring_stats.tx_npkts += rcvd; xsk->outstanding_tx += rcvd; } From f320460b9489d80355821829069fdefa6c698cb4 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 16 Nov 2020 12:12:44 +0100 Subject: [PATCH 005/118] i40e: Remove unnecessary sw_ring access from xsk Tx Remove the unnecessary access to the software ring for the AF_XDP zero-copy driver. This was used to record the length of the packet so that the driver Tx completion code could sum this up to produce the total bytes sent. This is now performed during the transmission of the packet, so no need to record this in the software ring. 
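The accounting change, distilled (a sketch of the two approaches, not
the full transmit loop):

  /* before: stage the length in the software ring ... */
  tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use];
  tx_bi->bytecount = desc.len;
  /* ... and have the Tx completion path sum tx_bi->bytecount later */

  /* after: the length is already at hand while the descriptor is
   * built, so sum it on the spot and skip the software-ring write
   */
  total_bytes += desc.len;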
Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/1605525167-14450-3-git-send-email-magnus.karlsson@gmail.com --- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 567fd67e900e..20d26321b447 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -392,7 +392,6 @@ static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget) { unsigned int sent_frames = 0, total_bytes = 0; struct i40e_tx_desc *tx_desc = NULL; - struct i40e_tx_buffer *tx_bi; struct xdp_desc desc; dma_addr_t dma; @@ -404,9 +403,6 @@ static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget) xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc.len); - tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use]; - tx_bi->bytecount = desc.len; - tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use); tx_desc->buffer_addr = cpu_to_le64(dma); tx_desc->cmd_type_offset_bsz = @@ -415,7 +411,7 @@ static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget) 0, desc.len, 0); sent_frames++; - total_bytes += tx_bi->bytecount; + total_bytes += desc.len; xdp_ring->next_to_use++; if (xdp_ring->next_to_use == xdp_ring->count) From b8c7aece29bc06e1e63efeefb9e31ab259e84ea2 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 16 Nov 2020 12:12:45 +0100 Subject: [PATCH 006/118] xsk: Introduce padding between more ring pointers Introduce one cache line worth of padding between the consumer pointer and the flags field as well as between the flags field and the start of the descriptors in all the lockless rings. This so that the x86 HW adjacency prefetcher will not prefetch the adjacent pointer/field when only one pointer/field is going to be used. This improves throughput performance for the l2fwd sample app with 1% on my machine with HW prefetching turned on in the BIOS. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/1605525167-14450-4-git-send-email-magnus.karlsson@gmail.com --- net/xdp/xsk_queue.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index cdb9cf3cd136..74fac802cce1 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -18,9 +18,11 @@ struct xdp_ring { /* Hinder the adjacent cache prefetcher to prefetch the consumer * pointer if the producer pointer is touched and vice versa. */ - u32 pad ____cacheline_aligned_in_smp; + u32 pad1 ____cacheline_aligned_in_smp; u32 consumer ____cacheline_aligned_in_smp; + u32 pad2 ____cacheline_aligned_in_smp; u32 flags; + u32 pad3 ____cacheline_aligned_in_smp; }; /* Used for the RX and TX queues for packets */ From 9349eb3a9d2ae0151510dd98b6640dfaeebee9cc Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 16 Nov 2020 12:12:46 +0100 Subject: [PATCH 007/118] xsk: Introduce batched Tx descriptor interfaces Introduce batched descriptor interfaces in the xsk core code for the Tx path to be used in the driver to write a code path with higher performance. This interface will be used by the i40e driver in the next patch. Though other drivers would likely benefit from this new interface too. Note that batching is only implemented for the common case when there is only one socket bound to the same device and queue id. 
When this is not the case, we fall back to the old non-batched version of the function. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/1605525167-14450-5-git-send-email-magnus.karlsson@gmail.com --- include/net/xdp_sock_drv.h | 7 +++ net/xdp/xsk.c | 57 ++++++++++++++++++++++++ net/xdp/xsk_queue.h | 89 ++++++++++++++++++++++++++++++++------ 3 files changed, 140 insertions(+), 13 deletions(-) diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 5b1ee8a9976d..4e295541e396 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -13,6 +13,7 @@ void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); +u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc, u32 max); void xsk_tx_release(struct xsk_buff_pool *pool); struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id); @@ -128,6 +129,12 @@ static inline bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, return false; } +static inline u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc, + u32 max) +{ + return 0; +} + static inline void xsk_tx_release(struct xsk_buff_pool *pool) { } diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index cfbec3989a76..b0141973f23e 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -332,6 +332,63 @@ out: } EXPORT_SYMBOL(xsk_tx_peek_desc); +static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, struct xdp_desc *descs, + u32 max_entries) +{ + u32 nb_pkts = 0; + + while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts])) + nb_pkts++; + + xsk_tx_release(pool); + return nb_pkts; +} + +u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *descs, + u32 max_entries) +{ + struct xdp_sock *xs; + u32 nb_pkts; + + rcu_read_lock(); + if (!list_is_singular(&pool->xsk_tx_list)) { + /* Fallback to the non-batched version */ + rcu_read_unlock(); + return xsk_tx_peek_release_fallback(pool, descs, max_entries); + } + + xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list); + if (!xs) { + nb_pkts = 0; + goto out; + } + + nb_pkts = xskq_cons_peek_desc_batch(xs->tx, descs, pool, max_entries); + if (!nb_pkts) { + xs->tx->queue_empty_descs++; + goto out; + } + + /* This is the backpressure mechanism for the Tx path. Try to + * reserve space in the completion queue for all packets, but + * if there are fewer slots available, just process that many + * packets. This avoids having to implement any buffering in + * the Tx path. 
+ */ + nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, descs, nb_pkts); + if (!nb_pkts) + goto out; + + xskq_cons_release_n(xs->tx, nb_pkts); + __xskq_cons_release(xs->tx); + xs->sk.sk_write_space(&xs->sk); + +out: + rcu_read_unlock(); + return nb_pkts; +} +EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch); + static int xsk_wakeup(struct xdp_sock *xs, u8 flags) { struct net_device *dev = xs->dev; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 74fac802cce1..b936c46b1e16 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -199,6 +199,30 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q, return false; } +static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, + struct xdp_desc *descs, + struct xsk_buff_pool *pool, u32 max) +{ + u32 cached_cons = q->cached_cons, nb_entries = 0; + + while (cached_cons != q->cached_prod && nb_entries < max) { + struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; + u32 idx = cached_cons & q->ring_mask; + + descs[nb_entries] = ring->desc[idx]; + if (unlikely(!xskq_cons_is_valid_desc(q, &descs[nb_entries], pool))) { + /* Skip the entry */ + cached_cons++; + continue; + } + + nb_entries++; + cached_cons++; + } + + return nb_entries; +} + /* Functions for consumers */ static inline void __xskq_cons_release(struct xsk_queue *q) @@ -220,17 +244,22 @@ static inline void xskq_cons_get_entries(struct xsk_queue *q) __xskq_cons_peek(q); } -static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) +static inline u32 xskq_cons_nb_entries(struct xsk_queue *q, u32 max) { u32 entries = q->cached_prod - q->cached_cons; - if (entries >= cnt) - return true; + if (entries >= max) + return max; __xskq_cons_peek(q); entries = q->cached_prod - q->cached_cons; - return entries >= cnt; + return entries >= max ? max : entries; +} + +static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) +{ + return xskq_cons_nb_entries(q, cnt) >= cnt ? true : false; } static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) @@ -249,16 +278,28 @@ static inline bool xskq_cons_peek_desc(struct xsk_queue *q, return xskq_cons_read_desc(q, desc, pool); } +static inline u32 xskq_cons_peek_desc_batch(struct xsk_queue *q, struct xdp_desc *descs, + struct xsk_buff_pool *pool, u32 max) +{ + u32 entries = xskq_cons_nb_entries(q, max); + + return xskq_cons_read_desc_batch(q, descs, pool, entries); +} + +/* To improve performance in the xskq_cons_release functions, only update local state here. + * Reflect this to global state when we get new entries from the ring in + * xskq_cons_get_entries() and whenever Rx or Tx processing are completed in the NAPI loop. + */ static inline void xskq_cons_release(struct xsk_queue *q) { - /* To improve performance, only update local state here. - * Reflect this to global state when we get new entries - * from the ring in xskq_cons_get_entries() and whenever - * Rx or Tx processing are completed in the NAPI loop. 
- */ q->cached_cons++; } +static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt) +{ + q->cached_cons += cnt; +} + static inline bool xskq_cons_is_full(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ @@ -268,18 +309,23 @@ static inline bool xskq_cons_is_full(struct xsk_queue *q) /* Functions for producers */ -static inline bool xskq_prod_is_full(struct xsk_queue *q) +static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max) { u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); - if (free_entries) - return false; + if (free_entries >= max) + return max; /* Refresh the local tail pointer */ q->cached_cons = READ_ONCE(q->ring->consumer); free_entries = q->nentries - (q->cached_prod - q->cached_cons); - return !free_entries; + return free_entries >= max ? max : free_entries; +} + +static inline bool xskq_prod_is_full(struct xsk_queue *q) +{ + return xskq_prod_nb_free(q, 1) ? false : true; } static inline int xskq_prod_reserve(struct xsk_queue *q) @@ -304,6 +350,23 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) return 0; } +static inline u32 xskq_prod_reserve_addr_batch(struct xsk_queue *q, struct xdp_desc *descs, + u32 max) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + u32 nb_entries, i, cached_prod; + + nb_entries = xskq_prod_nb_free(q, max); + + /* A, matches D */ + cached_prod = q->cached_prod; + for (i = 0; i < nb_entries; i++) + ring->desc[cached_prod++ & q->ring_mask] = descs[i].addr; + q->cached_prod = cached_prod; + + return nb_entries; +} + static inline int xskq_prod_reserve_desc(struct xsk_queue *q, u64 addr, u32 len) { From 3106c580fb7cf26691c1ce3aba2223f3ae56d846 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 16 Nov 2020 12:12:47 +0100 Subject: [PATCH 008/118] i40e: Use batched xsk Tx interfaces to increase performance Use the new batched xsk interfaces for the Tx path in the i40e driver to improve performance. On my machine, this yields a throughput increase of 4% for the l2fwd sample app in xdpsock. If we instead just look at the Tx part, this patch set increases throughput with above 20% for Tx. Note that I had to explicitly loop unroll the inner loop to get to this performance level, by using a pragma. It is honored by both clang and gcc and should be ignored by versions that do not support it. Using the -funroll-loops compiler command line switch on the source file resulted in a loop unrolling on a higher level that lead to a performance decrease instead of an increase. 
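For reference, the compiler-portable unroll pattern the patch relies on
(the macro as added to i40e_xsk.h further down; fill_descriptor() here
is a made-up stand-in for the per-packet work):

  #ifdef __clang__
  #define loop_unrolled_for _Pragma("clang loop unroll_count(4)") for
  #elif __GNUC__ >= 8
  #define loop_unrolled_for _Pragma("GCC unroll 4") for
  #else
  #define loop_unrolled_for for /* older compilers: no unrolling */
  #endif

  /* usage: the body is unrolled 4x on supporting compilers */
  loop_unrolled_for (i = 0; i < PKTS_PER_BATCH; i++)
          fill_descriptor(&descs[i]);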
Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/1605525167-14450-6-git-send-email-magnus.karlsson@gmail.com --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 11 ++ drivers/net/ethernet/intel/i40e/i40e_txrx.h | 1 + drivers/net/ethernet/intel/i40e/i40e_xsk.c | 115 ++++++++++++++------ drivers/net/ethernet/intel/i40e/i40e_xsk.h | 16 +++ 4 files changed, 110 insertions(+), 33 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index d43ce13a93c9..c21548c71bb1 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -676,6 +676,8 @@ void i40e_free_tx_resources(struct i40e_ring *tx_ring) i40e_clean_tx_ring(tx_ring); kfree(tx_ring->tx_bi); tx_ring->tx_bi = NULL; + kfree(tx_ring->xsk_descs); + tx_ring->xsk_descs = NULL; if (tx_ring->desc) { dma_free_coherent(tx_ring->dev, tx_ring->size, @@ -1277,6 +1279,13 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring) if (!tx_ring->tx_bi) goto err; + if (ring_is_xdp(tx_ring)) { + tx_ring->xsk_descs = kcalloc(I40E_MAX_NUM_DESCRIPTORS, sizeof(*tx_ring->xsk_descs), + GFP_KERNEL); + if (!tx_ring->xsk_descs) + goto err; + } + u64_stats_init(&tx_ring->syncp); /* round up to nearest 4K */ @@ -1300,6 +1309,8 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring) return 0; err: + kfree(tx_ring->xsk_descs); + tx_ring->xsk_descs = NULL; kfree(tx_ring->tx_bi); tx_ring->tx_bi = NULL; return -ENOMEM; diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h index 2feed920ef8a..5f531b195959 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h @@ -389,6 +389,7 @@ struct i40e_ring { struct i40e_channel *ch; struct xdp_rxq_info xdp_rxq; struct xsk_buff_pool *xsk_pool; + struct xdp_desc *xsk_descs; /* For storing descriptors in the AF_XDP ZC path */ } ____cacheline_internodealigned_in_smp; static inline bool ring_uses_build_skb(struct i40e_ring *ring) diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 20d26321b447..4c44f499fd49 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -2,6 +2,7 @@ /* Copyright(c) 2018 Intel Corporation. */ #include +#include #include #include @@ -381,6 +382,69 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) return failure ? 
budget : (int)total_rx_packets; } +static void i40e_xmit_pkt(struct i40e_ring *xdp_ring, struct xdp_desc *desc, + unsigned int *total_bytes) +{ + struct i40e_tx_desc *tx_desc; + dma_addr_t dma; + + dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc->addr); + xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc->len); + + tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use++); + tx_desc->buffer_addr = cpu_to_le64(dma); + tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC | I40E_TX_DESC_CMD_EOP, + 0, desc->len, 0); + + *total_bytes += desc->len; +} + +static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *desc, + unsigned int *total_bytes) +{ + u16 ntu = xdp_ring->next_to_use; + struct i40e_tx_desc *tx_desc; + dma_addr_t dma; + u32 i; + + loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) { + dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr); + xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc[i].len); + + tx_desc = I40E_TX_DESC(xdp_ring, ntu++); + tx_desc->buffer_addr = cpu_to_le64(dma); + tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC | + I40E_TX_DESC_CMD_EOP, + 0, desc[i].len, 0); + + *total_bytes += desc[i].len; + } + + xdp_ring->next_to_use = ntu; +} + +static void i40e_fill_tx_hw_ring(struct i40e_ring *xdp_ring, struct xdp_desc *descs, u32 nb_pkts, + unsigned int *total_bytes) +{ + u32 batched, leftover, i; + + batched = nb_pkts & ~(PKTS_PER_BATCH - 1); + leftover = nb_pkts & (PKTS_PER_BATCH - 1); + for (i = 0; i < batched; i += PKTS_PER_BATCH) + i40e_xmit_pkt_batch(xdp_ring, &descs[i], total_bytes); + for (i = batched; i < batched + leftover; i++) + i40e_xmit_pkt(xdp_ring, &descs[i], total_bytes); +} + +static void i40e_set_rs_bit(struct i40e_ring *xdp_ring) +{ + u16 ntu = xdp_ring->next_to_use ? xdp_ring->next_to_use - 1 : xdp_ring->count - 1; + struct i40e_tx_desc *tx_desc; + + tx_desc = I40E_TX_DESC(xdp_ring, ntu); + tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS << I40E_TXD_QW1_CMD_SHIFT); +} + /** * i40e_xmit_zc - Performs zero-copy Tx AF_XDP * @xdp_ring: XDP Tx ring @@ -390,45 +454,30 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) **/ static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget) { - unsigned int sent_frames = 0, total_bytes = 0; - struct i40e_tx_desc *tx_desc = NULL; - struct xdp_desc desc; - dma_addr_t dma; + struct xdp_desc *descs = xdp_ring->xsk_descs; + u32 nb_pkts, nb_processed = 0; + unsigned int total_bytes = 0; - while (budget-- > 0) { - if (!xsk_tx_peek_desc(xdp_ring->xsk_pool, &desc)) - break; + nb_pkts = xsk_tx_peek_release_desc_batch(xdp_ring->xsk_pool, descs, budget); + if (!nb_pkts) + return false; - dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc.addr); - xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, - desc.len); - - tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use); - tx_desc->buffer_addr = cpu_to_le64(dma); - tx_desc->cmd_type_offset_bsz = - build_ctob(I40E_TX_DESC_CMD_ICRC - | I40E_TX_DESC_CMD_EOP, - 0, desc.len, 0); - - sent_frames++; - total_bytes += desc.len; - - xdp_ring->next_to_use++; - if (xdp_ring->next_to_use == xdp_ring->count) - xdp_ring->next_to_use = 0; + if (xdp_ring->next_to_use + nb_pkts >= xdp_ring->count) { + nb_processed = xdp_ring->count - xdp_ring->next_to_use; + i40e_fill_tx_hw_ring(xdp_ring, descs, nb_processed, &total_bytes); + xdp_ring->next_to_use = 0; } - if (tx_desc) { - /* Request an interrupt for the last frame and bump tail ptr. 
*/ - tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS << - I40E_TXD_QW1_CMD_SHIFT); - i40e_xdp_ring_update_tail(xdp_ring); + i40e_fill_tx_hw_ring(xdp_ring, &descs[nb_processed], nb_pkts - nb_processed, + &total_bytes); - xsk_tx_release(xdp_ring->xsk_pool); - i40e_update_tx_stats(xdp_ring, sent_frames, total_bytes); - } + /* Request an interrupt for the last frame and bump tail ptr. */ + i40e_set_rs_bit(xdp_ring); + i40e_xdp_ring_update_tail(xdp_ring); - return !!budget; + i40e_update_tx_stats(xdp_ring, nb_pkts, total_bytes); + + return true; } /** diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.h b/drivers/net/ethernet/intel/i40e/i40e_xsk.h index 7adfd8539247..ea88f4597a07 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.h +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.h @@ -4,6 +4,22 @@ #ifndef _I40E_XSK_H_ #define _I40E_XSK_H_ +/* This value should match the pragma in the loop_unrolled_for + * macro. Why 4? It is strictly empirical. It seems to be a good + * compromise between the advantage of having simultaneous outstanding + * reads to the DMA array that can hide each others latency and the + * disadvantage of having a larger code path. + */ +#define PKTS_PER_BATCH 4 + +#ifdef __clang__ +#define loop_unrolled_for _Pragma("clang loop unroll_count(4)") for +#elif __GNUC__ >= 8 +#define loop_unrolled_for _Pragma("GCC unroll 4") for +#else +#define loop_unrolled_for for +#endif + struct i40e_vsi; struct xsk_buff_pool; struct zero_copy_allocator; From 3f6719c7b62f0327c9091e26d0da10e65668229e Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 17 Nov 2020 23:29:28 +0000 Subject: [PATCH 009/118] bpf: Add bpf_bprm_opts_set helper The helper allows modification of certain bits on the linux_binprm struct starting with the secureexec bit which can be updated using the BPF_F_BPRM_SECUREEXEC flag. secureexec can be set by the LSM for privilege gaining executions to set the AT_SECURE auxv for glibc. When set, the dynamic linker disables the use of certain environment variables (like LD_PRELOAD). Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201117232929.2156341-1-kpsingh@chromium.org --- include/uapi/linux/bpf.h | 16 ++++++++++++++++ kernel/bpf/bpf_lsm.c | 26 ++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 16 ++++++++++++++++ 4 files changed, 60 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 162999b12790..a52299b80b9d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3787,6 +3787,16 @@ union bpf_attr { * *ARG_PTR_TO_BTF_ID* of type *task_struct*. * Return * Pointer to the current task. + * + * long bpf_bprm_opts_set(struct linux_binprm *bprm, u64 flags) + * Description + * Set or clear certain options on *bprm*: + * + * **BPF_F_BPRM_SECUREEXEC** Set the secureexec bit + * which sets the **AT_SECURE** auxv for glibc. The bit + * is cleared if the flag is not specified. + * Return + * **-EINVAL** if invalid *flags* are passed, zero otherwise. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3948,6 +3958,7 @@ union bpf_attr { FN(task_storage_get), \ FN(task_storage_delete), \ FN(get_current_task_btf), \ + FN(bprm_opts_set), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4119,6 +4130,11 @@ enum bpf_lwt_encap_mode { BPF_LWT_ENCAP_IP, }; +/* Flags for bpf_bprm_opts_set helper */ +enum { + BPF_F_BPRM_SECUREEXEC = (1ULL << 0), +}; + #define __bpf_md_ptr(type, name) \ union { \ type name; \ diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 553107f4706a..b4f27a874092 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,29 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, return 0; } +/* Mask for all the currently supported BPRM option flags */ +#define BPF_F_BRPM_OPTS_MASK BPF_F_BPRM_SECUREEXEC + +BPF_CALL_2(bpf_bprm_opts_set, struct linux_binprm *, bprm, u64, flags) +{ + if (flags & ~BPF_F_BRPM_OPTS_MASK) + return -EINVAL; + + bprm->secureexec = (flags & BPF_F_BPRM_SECUREEXEC); + return 0; +} + +BTF_ID_LIST_SINGLE(bpf_bprm_opts_set_btf_ids, struct, linux_binprm) + +const static struct bpf_func_proto bpf_bprm_opts_set_proto = { + .func = bpf_bprm_opts_set, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_bprm_opts_set_btf_ids[0], + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -71,6 +95,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_task_storage_get_proto; case BPF_FUNC_task_storage_delete: return &bpf_task_storage_delete_proto; + case BPF_FUNC_bprm_opts_set: + return &bpf_bprm_opts_set_proto; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 31484377b8b1..c5bc947a70ad 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -418,6 +418,7 @@ class PrinterHelpers(Printer): 'struct bpf_tcp_sock', 'struct bpf_tunnel_key', 'struct bpf_xfrm_state', + 'struct linux_binprm', 'struct pt_regs', 'struct sk_reuseport_md', 'struct sockaddr', @@ -465,6 +466,7 @@ class PrinterHelpers(Printer): 'struct bpf_tcp_sock', 'struct bpf_tunnel_key', 'struct bpf_xfrm_state', + 'struct linux_binprm', 'struct pt_regs', 'struct sk_reuseport_md', 'struct sockaddr', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 162999b12790..a52299b80b9d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3787,6 +3787,16 @@ union bpf_attr { * *ARG_PTR_TO_BTF_ID* of type *task_struct*. * Return * Pointer to the current task. + * + * long bpf_bprm_opts_set(struct linux_binprm *bprm, u64 flags) + * Description + * Set or clear certain options on *bprm*: + * + * **BPF_F_BPRM_SECUREEXEC** Set the secureexec bit + * which sets the **AT_SECURE** auxv for glibc. The bit + * is cleared if the flag is not specified. + * Return + * **-EINVAL** if invalid *flags* are passed, zero otherwise. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3948,6 +3958,7 @@ union bpf_attr { FN(task_storage_get), \ FN(task_storage_delete), \ FN(get_current_task_btf), \ + FN(bprm_opts_set), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4119,6 +4130,11 @@ enum bpf_lwt_encap_mode { BPF_LWT_ENCAP_IP, }; +/* Flags for bpf_bprm_opts_set helper */ +enum { + BPF_F_BPRM_SECUREEXEC = (1ULL << 0), +}; + #define __bpf_md_ptr(type, name) \ union { \ type name; \ From ea87ae85c9b31303a2e9d4c769d9f3ee8a3a60d1 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 17 Nov 2020 23:29:29 +0000 Subject: [PATCH 010/118] bpf: Add tests for bpf_bprm_opts_set helper The test forks a child process, updates the local storage to set/unset the securexec bit. The BPF program in the test attaches to bprm_creds_for_exec which checks the local storage of the current task to set the secureexec bit on the binary parameters (bprm). The child then execs a bash command with the environment variable TMPDIR set in the envp. The bash command returns a different exit code based on its observed value of the TMPDIR variable. Since TMPDIR is one of the variables that is ignored by the dynamic loader when the secureexec bit is set, one should expect the child execution to not see this value when the secureexec bit is set. Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201117232929.2156341-2-kpsingh@chromium.org --- .../selftests/bpf/prog_tests/test_bprm_opts.c | 116 ++++++++++++++++++ tools/testing/selftests/bpf/progs/bprm_opts.c | 34 +++++ 2 files changed, 150 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c create mode 100644 tools/testing/selftests/bpf/progs/bprm_opts.c diff --git a/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c b/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c new file mode 100644 index 000000000000..2559bb775762 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_bprm_opts.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2020 Google LLC. + */ + +#include +#include + +#include "bprm_opts.skel.h" +#include "network_helpers.h" + +#ifndef __NR_pidfd_open +#define __NR_pidfd_open 434 +#endif + +static const char * const bash_envp[] = { "TMPDIR=shouldnotbeset", NULL }; + +static inline int sys_pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int update_storage(int map_fd, int secureexec) +{ + int task_fd, ret = 0; + + task_fd = sys_pidfd_open(getpid(), 0); + if (task_fd < 0) + return errno; + + ret = bpf_map_update_elem(map_fd, &task_fd, &secureexec, BPF_NOEXIST); + if (ret) + ret = errno; + + close(task_fd); + return ret; +} + +static int run_set_secureexec(int map_fd, int secureexec) +{ + int child_pid, child_status, ret, null_fd; + + child_pid = fork(); + if (child_pid == 0) { + null_fd = open("/dev/null", O_WRONLY); + if (null_fd == -1) + exit(errno); + dup2(null_fd, STDOUT_FILENO); + dup2(null_fd, STDERR_FILENO); + close(null_fd); + + /* Ensure that all executions from hereon are + * secure by setting a local storage which is read by + * the bprm_creds_for_exec hook and sets bprm->secureexec. + */ + ret = update_storage(map_fd, secureexec); + if (ret) + exit(ret); + + /* If the binary is executed with securexec=1, the dynamic + * loader ingores and unsets certain variables like LD_PRELOAD, + * TMPDIR etc. 
TMPDIR is used here to simplify the example, as + * LD_PRELOAD requires a real .so file. + * + * If the value of TMPDIR is set, the bash command returns 10 + * and if the value is unset, it returns 20. + */ + execle("/bin/bash", "bash", "-c", + "[[ -z \"${TMPDIR}\" ]] || exit 10 && exit 20", NULL, + bash_envp); + exit(errno); + } else if (child_pid > 0) { + waitpid(child_pid, &child_status, 0); + ret = WEXITSTATUS(child_status); + + /* If a secureexec occurred, the exit status should be 20 */ + if (secureexec && ret == 20) + return 0; + + /* If normal execution happened, the exit code should be 10 */ + if (!secureexec && ret == 10) + return 0; + } + + return -EINVAL; +} + +void test_test_bprm_opts(void) +{ + int err, duration = 0; + struct bprm_opts *skel = NULL; + + skel = bprm_opts__open_and_load(); + if (CHECK(!skel, "skel_load", "skeleton failed\n")) + goto close_prog; + + err = bprm_opts__attach(skel); + if (CHECK(err, "attach", "attach failed: %d\n", err)) + goto close_prog; + + /* Run the test with the secureexec bit unset */ + err = run_set_secureexec(bpf_map__fd(skel->maps.secure_exec_task_map), + 0 /* secureexec */); + if (CHECK(err, "run_set_secureexec:0", "err = %d\n", err)) + goto close_prog; + + /* Run the test with the secureexec bit set */ + err = run_set_secureexec(bpf_map__fd(skel->maps.secure_exec_task_map), + 1 /* secureexec */); + if (CHECK(err, "run_set_secureexec:1", "err = %d\n", err)) + goto close_prog; + +close_prog: + bprm_opts__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/bprm_opts.c b/tools/testing/selftests/bpf/progs/bprm_opts.c new file mode 100644 index 000000000000..5bfef2887e70 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bprm_opts.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2020 Google LLC. + */ + +#include "vmlinux.h" +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, int); +} secure_exec_task_map SEC(".maps"); + +SEC("lsm/bprm_creds_for_exec") +int BPF_PROG(secure_exec, struct linux_binprm *bprm) +{ + int *secureexec; + + secureexec = bpf_task_storage_get(&secure_exec_task_map, + bpf_get_current_task_btf(), 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + + if (secureexec && *secureexec) + bpf_bprm_opts_set(bprm, BPF_F_BPRM_SECUREEXEC); + + return 0; +} From d055126180564a57fe533728a4e93d0cb53d49b3 Mon Sep 17 00:00:00 2001 From: Dmitrii Banshchikov Date: Tue, 17 Nov 2020 18:45:49 +0000 Subject: [PATCH 011/118] bpf: Add bpf_ktime_get_coarse_ns helper The helper uses CLOCK_MONOTONIC_COARSE source of time that is less accurate but more performant. We have a BPF CGROUP_SKB firewall that supports event logging through bpf_perf_event_output(). Each event has a timestamp and currently we use bpf_ktime_get_ns() for it. Use of bpf_ktime_get_coarse_ns() saves ~15-20 ns in time required for event logging. 
bpf_ktime_get_ns(): EgressLogByRemoteEndpoint 113.82ns 8.79M bpf_ktime_get_coarse_ns(): EgressLogByRemoteEndpoint 95.40ns 10.48M Signed-off-by: Dmitrii Banshchikov Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201117184549.257280-1-me@ubique.spb.ru --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 11 +++++++++++ kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 13 +++++++++++++ kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 11 +++++++++++ 6 files changed, 39 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 581b2a2e78eb..e1bcb6d7345c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1842,6 +1842,7 @@ extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; +extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a52299b80b9d..3ca6146f001a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3797,6 +3797,16 @@ union bpf_attr { * is cleared if the flag is not specified. * Return * **-EINVAL** if invalid *flags* are passed, zero otherwise. + * + * u64 bpf_ktime_get_coarse_ns(void) + * Description + * Return a coarse-grained version of the time elapsed since + * system boot, in nanoseconds. Does not include time the system + * was suspended. + * + * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**) + * Return + * Current *ktime*. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3959,6 +3969,7 @@ union bpf_attr { FN(task_storage_delete), \ FN(get_current_task_btf), \ FN(bprm_opts_set), \ + FN(ktime_get_coarse_ns), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 55454d2278b1..ff55cbcfbab4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2211,6 +2211,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; +const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 25520f5eeaf6..2c395deae279 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -167,6 +167,17 @@ const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_ktime_get_coarse_ns) +{ + return ktime_get_coarse_ns(); +} + +const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = { + .func = bpf_ktime_get_coarse_ns, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; @@ -685,6 +696,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: diff --git 
a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 02986c7b90eb..d255bc9b2bfa 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1280,6 +1280,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_get_current_pid_tgid: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a52299b80b9d..3ca6146f001a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3797,6 +3797,16 @@ union bpf_attr { * is cleared if the flag is not specified. * Return * **-EINVAL** if invalid *flags* are passed, zero otherwise. + * + * u64 bpf_ktime_get_coarse_ns(void) + * Description + * Return a coarse-grained version of the time elapsed since + * system boot, in nanoseconds. Does not include time the system + * was suspended. + * + * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**) + * Return + * Current *ktime*. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3959,6 +3969,7 @@ union bpf_attr { FN(task_storage_delete), \ FN(get_current_task_btf), \ FN(bprm_opts_set), \ + FN(ktime_get_coarse_ns), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper From 6016df8fe874e1cf36f6357d71438b384198ce06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 18 Nov 2020 08:16:38 +0100 Subject: [PATCH 012/118] selftests/bpf: Fix broken riscv build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The selftests/bpf Makefile includes system include directories from the host, when building BPF programs. On RISC-V glibc requires that __riscv_xlen is defined. This is not the case for "clang -target bpf", which messes up __WORDSIZE (errno.h -> ... -> wordsize.h) and breaks the build. By explicitly defining __risc_xlen correctly for riscv, we can workaround this. Fixes: 167381f3eac0 ("selftests/bpf: Makefile fix "missing" headers on build with -idirafter") Signed-off-by: Björn Töpel Signed-off-by: Andrii Nakryiko Acked-by: Luke Nelson Link: https://lore.kernel.org/bpf/20201118071640.83773-2-bjorn.topel@gmail.com --- tools/testing/selftests/bpf/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index c1708ffa6b1c..3d5940cd110d 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -219,7 +219,8 @@ $(RESOLVE_BTFIDS): $(BPFOBJ) | $(BUILD_DIR)/resolve_btfids \ # build would have failed anyways. define get_sys_includes $(shell $(1) -v -E - &1 \ - | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ +$(shell $(1) -dM -E - Date: Wed, 18 Nov 2020 08:16:39 +0100 Subject: [PATCH 013/118] selftests/bpf: Avoid running unprivileged tests with alignment requirements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some architectures have strict alignment requirements. In that case, the BPF verifier detects if a program has unaligned accesses and rejects them. A user can pass BPF_F_ANY_ALIGNMENT to a program to override this check. 
That, however, will only work when a privileged user loads a program.
An unprivileged user loading a program with this flag will be rejected
prior to entering the verifier. Hence, it does not make sense to load
unprivileged programs without strict alignment when testing the
verifier. This patch avoids exactly that.

Signed-off-by: Björn Töpel
Signed-off-by: Andrii Nakryiko
Acked-by: Luke Nelson
Link: https://lore.kernel.org/bpf/20201118071640.83773-3-bjorn.topel@gmail.com
---
 tools/testing/selftests/bpf/test_verifier.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 9be395d9dc64..4bfe3aa2cfc4 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -1152,6 +1152,19 @@ static void get_unpriv_disabled()
 static bool test_as_unpriv(struct bpf_test *test)
 {
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	/* Some architectures have strict alignment requirements. In
+	 * that case, the BPF verifier detects if a program has
+	 * unaligned accesses and rejects them. A user can pass
+	 * BPF_F_ANY_ALIGNMENT to a program to override this
+	 * check. That, however, will only work when a privileged user
+	 * loads a program. An unprivileged user loading a program
+	 * with this flag will be rejected prior to entering the
+	 * verifier.
+	 */
+	if (test->flags & F_NEEDS_EFFICIENT_UNALIGNED_ACCESS)
+		return false;
+#endif
 	return !test->prog_type ||
 	       test->prog_type == BPF_PROG_TYPE_SOCKET_FILTER ||
 	       test->prog_type == BPF_PROG_TYPE_CGROUP_SKB;

From 6007b23cc7555df882be870433dc589841d4eb06 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?=
Date: Wed, 18 Nov 2020 08:16:40 +0100
Subject: [PATCH 014/118] selftests/bpf: Mark tests that require unaligned
 memory access
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A lot of tests require unaligned memory access to work. Mark the tests
as such, so that they can be avoided on unsupported architectures such
as RISC-V.
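Combined with the check added in the previous patch, opting a test case
out on strict-alignment architectures is just a matter of tagging it.
A sketch of the convention (the test name and instructions are made
up):

  {
          "example test relying on unaligned access",
          .insns = { /* ... */ },
          .result = REJECT,
          /* not run as unprivileged on architectures without
           * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
           */
          .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
  },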
Signed-off-by: Björn Töpel Signed-off-by: Andrii Nakryiko Acked-by: Luke Nelson Link: https://lore.kernel.org/bpf/20201118071640.83773-4-bjorn.topel@gmail.com --- .../selftests/bpf/verifier/ctx_sk_lookup.c | 7 +++ .../bpf/verifier/direct_value_access.c | 3 ++ .../testing/selftests/bpf/verifier/map_ptr.c | 1 + .../selftests/bpf/verifier/raw_tp_writable.c | 1 + .../selftests/bpf/verifier/ref_tracking.c | 4 ++ .../testing/selftests/bpf/verifier/regalloc.c | 8 ++++ .../selftests/bpf/verifier/wide_access.c | 46 +++++++++++-------- 7 files changed, 52 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c b/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c index 2ad5f974451c..fb13ca2d5606 100644 --- a/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c +++ b/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c @@ -266,6 +266,7 @@ .result = REJECT, .prog_type = BPF_PROG_TYPE_SK_LOOKUP, .expected_attach_type = BPF_SK_LOOKUP, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "invalid 8-byte read from bpf_sk_lookup remote_ip4 field", @@ -292,6 +293,7 @@ .result = REJECT, .prog_type = BPF_PROG_TYPE_SK_LOOKUP, .expected_attach_type = BPF_SK_LOOKUP, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "invalid 8-byte read from bpf_sk_lookup remote_port field", @@ -305,6 +307,7 @@ .result = REJECT, .prog_type = BPF_PROG_TYPE_SK_LOOKUP, .expected_attach_type = BPF_SK_LOOKUP, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "invalid 8-byte read from bpf_sk_lookup local_ip4 field", @@ -331,6 +334,7 @@ .result = REJECT, .prog_type = BPF_PROG_TYPE_SK_LOOKUP, .expected_attach_type = BPF_SK_LOOKUP, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "invalid 8-byte read from bpf_sk_lookup local_port field", @@ -344,6 +348,7 @@ .result = REJECT, .prog_type = BPF_PROG_TYPE_SK_LOOKUP, .expected_attach_type = BPF_SK_LOOKUP, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, /* invalid 1,2,4-byte reads from 8-byte fields in bpf_sk_lookup */ { @@ -410,6 +415,7 @@ .result = REJECT, .prog_type = BPF_PROG_TYPE_SK_LOOKUP, .expected_attach_type = BPF_SK_LOOKUP, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "invalid 4-byte unaligned read from bpf_sk_lookup at even offset", @@ -422,6 +428,7 @@ .result = REJECT, .prog_type = BPF_PROG_TYPE_SK_LOOKUP, .expected_attach_type = BPF_SK_LOOKUP, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, /* in-bound and out-of-bound writes to bpf_sk_lookup */ { diff --git a/tools/testing/selftests/bpf/verifier/direct_value_access.c b/tools/testing/selftests/bpf/verifier/direct_value_access.c index 988f46a1a4c7..c0648dc009b5 100644 --- a/tools/testing/selftests/bpf/verifier/direct_value_access.c +++ b/tools/testing/selftests/bpf/verifier/direct_value_access.c @@ -69,6 +69,7 @@ .fixup_map_array_48b = { 1 }, .result = REJECT, .errstr = "R1 min value is outside of the allowed memory range", + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "direct map access, write test 7", @@ -195,6 +196,7 @@ .fixup_map_array_48b = { 1, 3 }, .result = REJECT, .errstr = "invalid access to map value, value_size=48 off=47 size=2", + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "direct map access, write test 17", @@ -209,6 +211,7 @@ .fixup_map_array_48b = { 1, 3 }, .result = REJECT, .errstr = "invalid access to map value, value_size=48 off=47 size=2", + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "direct map access, write test 18", diff --git a/tools/testing/selftests/bpf/verifier/map_ptr.c b/tools/testing/selftests/bpf/verifier/map_ptr.c index 
637f9293bda8..b117bdd3806d 100644 --- a/tools/testing/selftests/bpf/verifier/map_ptr.c +++ b/tools/testing/selftests/bpf/verifier/map_ptr.c @@ -44,6 +44,7 @@ .errstr_unpriv = "bpf_array access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN", .result = REJECT, .errstr = "cannot access ptr member ops with moff 0 in struct bpf_map with off 1 size 4", + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "bpf_map_ptr: read ops field accepted", diff --git a/tools/testing/selftests/bpf/verifier/raw_tp_writable.c b/tools/testing/selftests/bpf/verifier/raw_tp_writable.c index 95b5d70a1dc1..2978fb5a769d 100644 --- a/tools/testing/selftests/bpf/verifier/raw_tp_writable.c +++ b/tools/testing/selftests/bpf/verifier/raw_tp_writable.c @@ -31,4 +31,5 @@ .fixup_map_hash_8b = { 1, }, .prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, .errstr = "R6 invalid variable buffer offset: off=0, var_off=(0x0; 0xffffffff)", + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, diff --git a/tools/testing/selftests/bpf/verifier/ref_tracking.c b/tools/testing/selftests/bpf/verifier/ref_tracking.c index 006b5bd99c08..3b6ee009c00b 100644 --- a/tools/testing/selftests/bpf/verifier/ref_tracking.c +++ b/tools/testing/selftests/bpf/verifier/ref_tracking.c @@ -675,6 +675,7 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, .errstr = "invalid mem access", + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "reference tracking: use ptr from bpf_sk_fullsock() after release", @@ -698,6 +699,7 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, .errstr = "invalid mem access", + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "reference tracking: use ptr from bpf_sk_fullsock(tp) after release", @@ -725,6 +727,7 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, .errstr = "invalid mem access", + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "reference tracking: use sk after bpf_sk_release(tp)", @@ -747,6 +750,7 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, .errstr = "invalid mem access", + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "reference tracking: use ptr from bpf_get_listener_sock() after bpf_sk_release(sk)", diff --git a/tools/testing/selftests/bpf/verifier/regalloc.c b/tools/testing/selftests/bpf/verifier/regalloc.c index 4ad7e05de706..bb0dd89dd212 100644 --- a/tools/testing/selftests/bpf/verifier/regalloc.c +++ b/tools/testing/selftests/bpf/verifier/regalloc.c @@ -21,6 +21,7 @@ .fixup_map_hash_48b = { 4 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "regalloc negative", @@ -71,6 +72,7 @@ .fixup_map_hash_48b = { 4 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "regalloc src_reg negative", @@ -97,6 +99,7 @@ .result = REJECT, .errstr = "invalid access to map value, value_size=48 off=44 size=8", .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "regalloc and spill", @@ -126,6 +129,7 @@ .fixup_map_hash_48b = { 4 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "regalloc and spill negative", @@ -156,6 +160,7 @@ .result = REJECT, .errstr = "invalid access to map value, value_size=48 off=48 size=8", .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "regalloc three regs", @@ -182,6 +187,7 @@ .fixup_map_hash_48b = { 4 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .flags = 
F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "regalloc after call", @@ -210,6 +216,7 @@ .fixup_map_hash_48b = { 4 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "regalloc in callee", @@ -240,6 +247,7 @@ .fixup_map_hash_48b = { 4 }, .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { "regalloc, spill, JEQ", diff --git a/tools/testing/selftests/bpf/verifier/wide_access.c b/tools/testing/selftests/bpf/verifier/wide_access.c index ccade9312d21..55af248efa93 100644 --- a/tools/testing/selftests/bpf/verifier/wide_access.c +++ b/tools/testing/selftests/bpf/verifier/wide_access.c @@ -1,4 +1,4 @@ -#define BPF_SOCK_ADDR_STORE(field, off, res, err) \ +#define BPF_SOCK_ADDR_STORE(field, off, res, err, flgs) \ { \ "wide store to bpf_sock_addr." #field "[" #off "]", \ .insns = { \ @@ -11,31 +11,36 @@ .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, \ .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, \ .errstr = err, \ + .flags = flgs, \ } /* user_ip6[0] is u64 aligned */ BPF_SOCK_ADDR_STORE(user_ip6, 0, ACCEPT, - NULL), + NULL, 0), BPF_SOCK_ADDR_STORE(user_ip6, 1, REJECT, - "invalid bpf_context access off=12 size=8"), + "invalid bpf_context access off=12 size=8", + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), BPF_SOCK_ADDR_STORE(user_ip6, 2, ACCEPT, - NULL), + NULL, 0), BPF_SOCK_ADDR_STORE(user_ip6, 3, REJECT, - "invalid bpf_context access off=20 size=8"), + "invalid bpf_context access off=20 size=8", + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), /* msg_src_ip6[0] is _not_ u64 aligned */ BPF_SOCK_ADDR_STORE(msg_src_ip6, 0, REJECT, - "invalid bpf_context access off=44 size=8"), + "invalid bpf_context access off=44 size=8", + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), BPF_SOCK_ADDR_STORE(msg_src_ip6, 1, ACCEPT, - NULL), + NULL, 0), BPF_SOCK_ADDR_STORE(msg_src_ip6, 2, REJECT, - "invalid bpf_context access off=52 size=8"), + "invalid bpf_context access off=52 size=8", + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), BPF_SOCK_ADDR_STORE(msg_src_ip6, 3, REJECT, - "invalid bpf_context access off=56 size=8"), + "invalid bpf_context access off=56 size=8", 0), #undef BPF_SOCK_ADDR_STORE -#define BPF_SOCK_ADDR_LOAD(field, off, res, err) \ +#define BPF_SOCK_ADDR_LOAD(field, off, res, err, flgs) \ { \ "wide load from bpf_sock_addr." 
#field "[" #off "]", \ .insns = { \ @@ -48,26 +53,31 @@ BPF_SOCK_ADDR_STORE(msg_src_ip6, 3, REJECT, .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, \ .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, \ .errstr = err, \ + .flags = flgs, \ } /* user_ip6[0] is u64 aligned */ BPF_SOCK_ADDR_LOAD(user_ip6, 0, ACCEPT, - NULL), + NULL, 0), BPF_SOCK_ADDR_LOAD(user_ip6, 1, REJECT, - "invalid bpf_context access off=12 size=8"), + "invalid bpf_context access off=12 size=8", + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), BPF_SOCK_ADDR_LOAD(user_ip6, 2, ACCEPT, - NULL), + NULL, 0), BPF_SOCK_ADDR_LOAD(user_ip6, 3, REJECT, - "invalid bpf_context access off=20 size=8"), + "invalid bpf_context access off=20 size=8", + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), /* msg_src_ip6[0] is _not_ u64 aligned */ BPF_SOCK_ADDR_LOAD(msg_src_ip6, 0, REJECT, - "invalid bpf_context access off=44 size=8"), + "invalid bpf_context access off=44 size=8", + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), BPF_SOCK_ADDR_LOAD(msg_src_ip6, 1, ACCEPT, - NULL), + NULL, 0), BPF_SOCK_ADDR_LOAD(msg_src_ip6, 2, REJECT, - "invalid bpf_context access off=52 size=8"), + "invalid bpf_context access off=52 size=8", + F_NEEDS_EFFICIENT_UNALIGNED_ACCESS), BPF_SOCK_ADDR_LOAD(msg_src_ip6, 3, REJECT, - "invalid bpf_context access off=56 size=8"), + "invalid bpf_context access off=56 size=8", 0), #undef BPF_SOCK_ADDR_LOAD From 450d060e8f752a6ce052a2bffd3f01633472e330 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 18 Nov 2020 23:30:39 -0800 Subject: [PATCH 015/118] bpftool: Add {i,d}tlb_misses support for bpftool profile Commit 47c09d6a9f67("bpftool: Introduce "prog profile" command") introduced "bpftool prog profile" command which can be used to profile bpf program with metrics like # of instructions, This patch added support for itlb_misses and dtlb_misses. During an internal bpf program performance evaluation, I found these two metrics are also very useful. The following is an example output: $ bpftool prog profile id 324 duration 3 cycles itlb_misses 1885029 run_cnt 5134686073 cycles 306893 itlb_misses $ bpftool prog profile id 324 duration 3 cycles dtlb_misses 1827382 run_cnt 4943593648 cycles 5975636 dtlb_misses $ bpftool prog profile id 324 duration 3 cycles llc_misses 1836527 run_cnt 5019612972 cycles 4161041 llc_misses From the above, we can see quite some dtlb misses, 3 dtlb misses perf prog run. This might be something worth further investigation. 
Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201119073039.4060095-1-yhs@fb.com --- tools/bpf/bpftool/prog.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index acdb2c245f0a..1fe3ba255bad 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1717,6 +1717,34 @@ struct profile_metric { .ratio_desc = "LLC misses per million insns", .ratio_mul = 1e6, }, + { + .name = "itlb_misses", + .attr = { + .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_ITLB | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + .exclude_user = 1 + }, + .ratio_metric = 2, + .ratio_desc = "itlb misses per million insns", + .ratio_mul = 1e6, + }, + { + .name = "dtlb_misses", + .attr = { + .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_DTLB | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + .exclude_user = 1 + }, + .ratio_metric = 2, + .ratio_desc = "dtlb misses per million insns", + .ratio_mul = 1e6, + }, }; static __u64 profile_total_count; @@ -2109,7 +2137,7 @@ static int do_help(int argc, char **argv) " struct_ops | fentry | fexit | freplace | sk_lookup }\n" " ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n" " flow_dissector }\n" - " METRIC := { cycles | instructions | l1d_loads | llc_misses }\n" + " METRIC := { cycles | instructions | l1d_loads | llc_misses | itlb_misses | dtlb_misses }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, argv[-2]); From 91b2db27d3ff9ad29e8b3108dfbf1e2f49fe9bd3 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 19 Nov 2020 16:28:33 -0800 Subject: [PATCH 016/118] bpf: Simplify task_file_seq_get_next() Simplify task_file_seq_get_next() by removing two in/out arguments: task and fstruct. Use info->task and info->files instead. Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201120002833.2481110-1-songliubraving@fb.com --- kernel/bpf/task_iter.c | 54 +++++++++++++----------------------------- 1 file changed, 17 insertions(+), 37 deletions(-) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 1fdb2fc196cd..0458a40edf10 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -136,8 +136,7 @@ struct bpf_iter_seq_task_file_info { }; static struct file * -task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info, - struct task_struct **task, struct files_struct **fstruct) +task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) { struct pid_namespace *ns = info->common.ns; u32 curr_tid = info->tid, max_fds; @@ -150,14 +149,17 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info, * Otherwise, it does not hold any reference. 
*/ again: - if (*task) { - curr_task = *task; - curr_files = *fstruct; + if (info->task) { + curr_task = info->task; + curr_files = info->files; curr_fd = info->fd; } else { curr_task = task_seq_get_next(ns, &curr_tid, true); - if (!curr_task) + if (!curr_task) { + info->task = NULL; + info->files = NULL; return NULL; + } curr_files = get_files_struct(curr_task); if (!curr_files) { @@ -167,9 +169,8 @@ again: goto again; } - /* set *fstruct, *task and info->tid */ - *fstruct = curr_files; - *task = curr_task; + info->files = curr_files; + info->task = curr_task; if (curr_tid == info->tid) { curr_fd = info->fd; } else { @@ -199,8 +200,8 @@ again: rcu_read_unlock(); put_files_struct(curr_files); put_task_struct(curr_task); - *task = NULL; - *fstruct = NULL; + info->task = NULL; + info->files = NULL; info->fd = 0; curr_tid = ++(info->tid); goto again; @@ -209,21 +210,13 @@ again: static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) { struct bpf_iter_seq_task_file_info *info = seq->private; - struct files_struct *files = NULL; - struct task_struct *task = NULL; struct file *file; - file = task_file_seq_get_next(info, &task, &files); - if (!file) { - info->files = NULL; - info->task = NULL; - return NULL; - } - - if (*pos == 0) + info->task = NULL; + info->files = NULL; + file = task_file_seq_get_next(info); + if (file && *pos == 0) ++*pos; - info->task = task; - info->files = files; return file; } @@ -231,24 +224,11 @@ static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_iter_seq_task_file_info *info = seq->private; - struct files_struct *files = info->files; - struct task_struct *task = info->task; - struct file *file; ++*pos; ++info->fd; fput((struct file *)v); - file = task_file_seq_get_next(info, &task, &files); - if (!file) { - info->files = NULL; - info->task = NULL; - return NULL; - } - - info->task = task; - info->files = files; - - return file; + return task_file_seq_get_next(info); } struct bpf_iter__task_file { From 05a98d7672731aeb5f9837b35cc7fe70444e70bd Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Sat, 21 Nov 2020 21:22:04 -0500 Subject: [PATCH 017/118] selftest/bpf: Fix link in readme The link was bad because of invalid rst; it was pointing to itself and was rendering badly. Signed-off-by: Andrei Matei Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201122022205.57229-1-andreimatei1@gmail.com --- tools/testing/selftests/bpf/README.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index ac9eda830187..3b8d8885892d 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -2,7 +2,10 @@ BPF Selftest Notes ================== General instructions on running selftests can be found in -`Documentation/bpf/bpf_devel_QA.rst`_. +`Documentation/bpf/bpf_devel_QA.rst`__. + +__ /Documentation/bpf/bpf_devel_QA.rst#q-how-to-run-bpf-selftests + Additional information about selftest failures are documented here. From 1c26ac6ab3ce47ee2e6342373681dedbb97e21a3 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Sat, 21 Nov 2020 21:22:05 -0500 Subject: [PATCH 018/118] selftest/bpf: Fix rst formatting in readme A couple of places in the readme had invalid rst formatting causing the rendering to be off. This patch fixes them with minimal edits. 
Signed-off-by: Andrei Matei Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201122022205.57229-2-andreimatei1@gmail.com --- tools/testing/selftests/bpf/README.rst | 28 ++++++++++++++------------ 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index 3b8d8885892d..ca064180d4d0 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -33,11 +33,12 @@ The verifier will reject such code with above error. At insn 18 the r7 is indeed unbounded. The later insn 19 checks the bounds and the insn 20 undoes map_value addition. It is currently impossible for the verifier to understand such speculative pointer arithmetic. -Hence - https://reviews.llvm.org/D85570 -addresses it on the compiler side. It was committed on llvm 12. +Hence `this patch`__ addresses it on the compiler side. It was committed on llvm 12. + +__ https://reviews.llvm.org/D85570 The corresponding C code + .. code-block:: c for (int i = 0; i < MAX_CGROUPS_PATH_DEPTH; i++) { @@ -80,10 +81,11 @@ The symptom for ``bpf_iter/netlink`` looks like 17: (7b) *(u64 *)(r7 +0) = r2 only read is supported -This is due to a llvm BPF backend bug. The fix - https://reviews.llvm.org/D78466 +This is due to a llvm BPF backend bug. `The fix`__ has been pushed to llvm 10.x release branch and will be -available in 10.0.1. The fix is available in llvm 11.0.0 trunk. +available in 10.0.1. The patch is available in llvm 11.0.0 trunk. + +__ https://reviews.llvm.org/D78466 BPF CO-RE-based tests and Clang version ======================================= @@ -97,11 +99,11 @@ them to Clang/LLVM. These sub-tests are going to be skipped if Clang is too old to support them, they shouldn't cause build failures or runtime test failures: - - __builtin_btf_type_id() ([0], [1], [2]); - - __builtin_preserve_type_info(), __builtin_preserve_enum_value() ([3], [4]). +- __builtin_btf_type_id() [0_, 1_, 2_]; +- __builtin_preserve_type_info(), __builtin_preserve_enum_value() [3_, 4_]. - [0] https://reviews.llvm.org/D74572 - [1] https://reviews.llvm.org/D74668 - [2] https://reviews.llvm.org/D85174 - [3] https://reviews.llvm.org/D83878 - [4] https://reviews.llvm.org/D83242 +.. _0: https://reviews.llvm.org/D74572 +.. _1: https://reviews.llvm.org/D74668 +.. _2: https://reviews.llvm.org/D85174 +.. _3: https://reviews.llvm.org/D83878 +.. _4: https://reviews.llvm.org/D83242 From e732b538f4557cd0a856bbce3cde55d2dfef3b03 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 20 Nov 2020 23:08:28 -0800 Subject: [PATCH 019/118] kbuild: Skip module BTF generation for out-of-tree external modules In some modes of operation, Kbuild allows building modules without having a vmlinux image around. In such a case, generating module BTF is impossible. This patch changes the behavior to emit a warning about the impossibility of generating kernel module BTF, instead of breaking the build. This is especially important for out-of-tree external module builds. In vmlinux-less mode: $ make clean $ make modules_prepare $ touch drivers/acpi/button.c $ make M=drivers/acpi ... CC [M] drivers/acpi/button.o MODPOST drivers/acpi/Module.symvers LD [M] drivers/acpi/button.ko BTF [M] drivers/acpi/button.ko Skipping BTF generation for drivers/acpi/button.ko due to unavailability of vmlinux ... $ readelf -S ~/linux-build/default/drivers/acpi/button.ko | grep BTF -A1 ... empty ... Now with normal build: $ make all ...
LD [M] drivers/acpi/button.ko BTF [M] drivers/acpi/button.ko ... $ readelf -S ~/linux-build/default/drivers/acpi/button.ko | grep BTF -A1 [60] .BTF PROGBITS 0000000000000000 00029310 000000000000ab3f 0000000000000000 0 0 1 Fixes: 5f9ae91f7c0d ("kbuild: Build kernel module BTFs if BTF is enabled and pahole supports it") Reported-by: Bruce Allan Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Cc: Jessica Yu Cc: Greg Kroah-Hartman Cc: Masahiro Yamada Link: https://lore.kernel.org/bpf/20201121070829.2612884-1-andrii@kernel.org --- scripts/Makefile.modfinal | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/Makefile.modfinal b/scripts/Makefile.modfinal index 02b892421f7a..d49ec001825d 100644 --- a/scripts/Makefile.modfinal +++ b/scripts/Makefile.modfinal @@ -38,7 +38,12 @@ quiet_cmd_ld_ko_o = LD [M] $@ $(if $(ARCH_POSTLINK), $(MAKE) -f $(ARCH_POSTLINK) $@, true) quiet_cmd_btf_ko = BTF [M] $@ - cmd_btf_ko = LLVM_OBJCOPY=$(OBJCOPY) $(PAHOLE) -J --btf_base vmlinux $@ + cmd_btf_ko = \ + if [ -f vmlinux ]; then \ + LLVM_OBJCOPY=$(OBJCOPY) $(PAHOLE) -J --btf_base vmlinux $@; \ + else \ + printf "Skipping BTF generation for %s due to unavailability of vmlinux\n" $@ 1>&2; \ + fi; # Same as newer-prereqs, but allows to exclude specified extra dependencies newer_prereqs_except = $(filter-out $(PHONY) $(1),$?) @@ -49,7 +54,7 @@ if_changed_except = $(if $(call newer_prereqs_except,$(2))$(cmd-check), \ printf '%s\n' 'cmd_$@ := $(make-cmd)' > $(dot-target).cmd, @:) # Re-generate module BTFs if either module's .ko or vmlinux changed -$(modules): %.ko: %.o %.mod.o scripts/module.lds vmlinux FORCE +$(modules): %.ko: %.o %.mod.o scripts/module.lds $(if $(KBUILD_BUILTIN),vmlinux) FORCE +$(call if_changed_except,ld_ko_o,vmlinux) ifdef CONFIG_DEBUG_INFO_BTF_MODULES +$(if $(newer-prereqs),$(call cmd,btf_ko)) From 607c543f939d8ca6fed7afe90b3a8d6f6684dd17 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 20 Nov 2020 23:08:29 -0800 Subject: [PATCH 020/118] bpf: Sanitize BTF data pointer after module is loaded Given the .BTF section is not allocatable, it will get trimmed after the module is loaded. The BPF system handles that properly by creating an independent copy of the data. But prevent any accidental misuse by resetting the pointer to the BTF data. Fixes: 36e68442d1af ("bpf: Load and verify kernel module BTFs") Suggested-by: Jessica Yu Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Jessica Yu Cc: Greg Kroah-Hartman Link: https://lore.kernel.org/bpf/20201121070829.2612884-2-andrii@kernel.org --- kernel/module.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/module.c b/kernel/module.c index f2996b02ab2e..18f259d61d14 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3709,6 +3709,11 @@ static noinline int do_init_module(struct module *mod) mod->init_layout.ro_size = 0; mod->init_layout.ro_after_init_size = 0; mod->init_layout.text_size = 0; +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + /* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */ + mod->btf_data = NULL; + mod->btf_data_size = 0; +#endif /* * We want to free module_init, but be aware that kallsyms may be * walking this with preempt disabled. In all the failure paths, we From 59e2e27d227a0a4e7ec0e22c63ca36a5ad1ab438 Mon Sep 17 00:00:00 2001 From: Wedson Almeida Filho Date: Sat, 21 Nov 2020 01:55:09 +0000 Subject: [PATCH 021/118] bpf: Refactor check_cfg to use a structured loop.
The current implementation uses a number of gotos to implement a loop and different paths within the loop, which makes the code less readable than it would be with an explicit while-loop. This patch also replaces a chain of if/if-elses keyed on the same expression with a switch statement. No change in behaviour is intended. Signed-off-by: Wedson Almeida Filho Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201121015509.3594191-1-wedsonaf@google.com --- kernel/bpf/verifier.c | 177 ++++++++++++++++++++++-------------------- 1 file changed, 94 insertions(+), 83 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fb2943ea715d..e333ce43f281 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8047,6 +8047,11 @@ static void init_explored_state(struct bpf_verifier_env *env, int idx) env->insn_aux_data[idx].prune_point = true; } +enum { + DONE_EXPLORING = 0, + KEEP_EXPLORING = 1, +}; + /* t, w, e - match pseudo-code above: * t - index of current instruction * w - next instruction @@ -8059,10 +8064,10 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, int *insn_state = env->cfg.insn_state; if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) - return 0; + return DONE_EXPLORING; if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH)) - return 0; + return DONE_EXPLORING; if (w < 0 || w >= env->prog->len) { verbose_linfo(env, t, "%d: ", t); @@ -8081,10 +8086,10 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, if (env->cfg.cur_stack >= env->prog->len) return -E2BIG; insn_stack[env->cfg.cur_stack++] = w; - return 1; + return KEEP_EXPLORING; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { if (loop_ok && env->bpf_capable) - return 0; + return DONE_EXPLORING; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); verbose(env, "back-edge from insn %d to %d\n", t, w); @@ -8096,7 +8101,74 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, verbose(env, "insn state internal bug\n"); return -EFAULT; } - return 0; + return DONE_EXPLORING; +} + +/* Visits the instruction at index t and returns one of the following: + * < 0 - an error occurred + * DONE_EXPLORING - the instruction was fully explored + * KEEP_EXPLORING - there is still work to be done before it is fully explored + */ +static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) +{ + struct bpf_insn *insns = env->prog->insnsi; + int ret; + + /* All non-branch instructions have a single fall-through edge. */ + if (BPF_CLASS(insns[t].code) != BPF_JMP && + BPF_CLASS(insns[t].code) != BPF_JMP32) + return push_insn(t, t + 1, FALLTHROUGH, env, false); + + switch (BPF_OP(insns[t].code)) { + case BPF_EXIT: + return DONE_EXPLORING; + + case BPF_CALL: + ret = push_insn(t, t + 1, FALLTHROUGH, env, false); + if (ret) + return ret; + + if (t + 1 < insn_cnt) + init_explored_state(env, t + 1); + if (insns[t].src_reg == BPF_PSEUDO_CALL) { + init_explored_state(env, t); + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, + env, false); + } + return ret; + + case BPF_JA: + if (BPF_SRC(insns[t].code) != BPF_K) + return -EINVAL; + + /* unconditional jump with single edge */ + ret = push_insn(t, t + insns[t].off + 1, FALLTHROUGH, env, + true); + if (ret) + return ret; + + /* unconditional jmp is not a good pruning point, + * but it's marked, since backtracking needs + * to record jmp history in is_state_visited(). 
+ */ + init_explored_state(env, t + insns[t].off + 1); + /* tell verifier to check for equivalent states + * after every call and jump + */ + if (t + 1 < insn_cnt) + init_explored_state(env, t + 1); + + return ret; + + default: + /* conditional jump with two edges */ + init_explored_state(env, t); + ret = push_insn(t, t + 1, FALLTHROUGH, env, true); + if (ret) + return ret; + + return push_insn(t, t + insns[t].off + 1, BRANCH, env, true); + } } /* non-recursive depth-first-search to detect loops in BPF program @@ -8104,11 +8176,10 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, */ static int check_cfg(struct bpf_verifier_env *env) { - struct bpf_insn *insns = env->prog->insnsi; int insn_cnt = env->prog->len; int *insn_stack, *insn_state; int ret = 0; - int i, t; + int i; insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) @@ -8124,92 +8195,32 @@ static int check_cfg(struct bpf_verifier_env *env) insn_stack[0] = 0; /* 0 is the first instruction */ env->cfg.cur_stack = 1; -peek_stack: - if (env->cfg.cur_stack == 0) - goto check_state; - t = insn_stack[env->cfg.cur_stack - 1]; + while (env->cfg.cur_stack > 0) { + int t = insn_stack[env->cfg.cur_stack - 1]; - if (BPF_CLASS(insns[t].code) == BPF_JMP || - BPF_CLASS(insns[t].code) == BPF_JMP32) { - u8 opcode = BPF_OP(insns[t].code); - - if (opcode == BPF_EXIT) { - goto mark_explored; - } else if (opcode == BPF_CALL) { - ret = push_insn(t, t + 1, FALLTHROUGH, env, false); - if (ret == 1) - goto peek_stack; - else if (ret < 0) - goto err_free; - if (t + 1 < insn_cnt) - init_explored_state(env, t + 1); - if (insns[t].src_reg == BPF_PSEUDO_CALL) { - init_explored_state(env, t); - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, - env, false); - if (ret == 1) - goto peek_stack; - else if (ret < 0) - goto err_free; + ret = visit_insn(t, insn_cnt, env); + switch (ret) { + case DONE_EXPLORING: + insn_state[t] = EXPLORED; + env->cfg.cur_stack--; + break; + case KEEP_EXPLORING: + break; + default: + if (ret > 0) { + verbose(env, "visit_insn internal bug\n"); + ret = -EFAULT; } - } else if (opcode == BPF_JA) { - if (BPF_SRC(insns[t].code) != BPF_K) { - ret = -EINVAL; - goto err_free; - } - /* unconditional jump with single edge */ - ret = push_insn(t, t + insns[t].off + 1, - FALLTHROUGH, env, true); - if (ret == 1) - goto peek_stack; - else if (ret < 0) - goto err_free; - /* unconditional jmp is not a good pruning point, - * but it's marked, since backtracking needs - * to record jmp history in is_state_visited(). 
- */ - init_explored_state(env, t + insns[t].off + 1); - /* tell verifier to check for equivalent states - * after every call and jump - */ - if (t + 1 < insn_cnt) - init_explored_state(env, t + 1); - } else { - /* conditional jump with two edges */ - init_explored_state(env, t); - ret = push_insn(t, t + 1, FALLTHROUGH, env, true); - if (ret == 1) - goto peek_stack; - else if (ret < 0) - goto err_free; - - ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true); - if (ret == 1) - goto peek_stack; - else if (ret < 0) - goto err_free; - } - } else { - /* all other non-branch instructions with single - * fall-through edge - */ - ret = push_insn(t, t + 1, FALLTHROUGH, env, false); - if (ret == 1) - goto peek_stack; - else if (ret < 0) goto err_free; + } } -mark_explored: - insn_state[t] = EXPLORED; - if (env->cfg.cur_stack-- <= 0) { + if (env->cfg.cur_stack < 0) { verbose(env, "pop stack internal bug\n"); ret = -EFAULT; goto err_free; } - goto peek_stack; -check_state: for (i = 0; i < insn_cnt; i++) { if (insn_state[i] != EXPLORED) { verbose(env, "unreachable insn %d\n", i); From db13db9f67fe5049159a05e870daedcee5879f8d Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 24 Nov 2020 15:21:14 +0800 Subject: [PATCH 022/118] libbpf: Add support for canceling cached_cons advance Add a new function for returning descriptors the user received after an xsk_ring_cons__peek call. After the application has gotten a number of descriptors from a ring, it might not be able to or want to process them all for various reasons. Therefore, it would be useful to have an interface for returning or cancelling a number of them so that they are returned to the ring. This patch adds a new function called xsk_ring_cons__cancel that performs this operation on nb descriptors counted from the end of the batch of descriptors that was received through the peek call. Signed-off-by: Li RongQing Signed-off-by: Daniel Borkmann [ Magnus Karlsson: rewrote changelog ] Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/1606202474-8119-1-git-send-email-lirongqing@baidu.com --- tools/lib/bpf/xsk.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h index 1069c46364ff..1719a327e5f9 100644 --- a/tools/lib/bpf/xsk.h +++ b/tools/lib/bpf/xsk.h @@ -153,6 +153,12 @@ static inline size_t xsk_ring_cons__peek(struct xsk_ring_cons *cons, return entries; } +static inline void xsk_ring_cons__cancel(struct xsk_ring_cons *cons, + size_t nb) +{ + cons->cached_cons -= nb; +} + static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, size_t nb) { /* Make sure data has been read before indicating we are done From 403319be5de51167cd70ddf594b76c95e6d26844 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 24 Nov 2020 15:12:08 +0000 Subject: [PATCH 023/118] ima: Implement ima_inode_hash This is in preparation to add a helper for BPF LSM programs to use IMA hashes when attached to LSM hooks. There are LSM hooks like inode_unlink which do not have a struct file * argument and cannot use the existing ima_file_hash API. An inode based API is, therefore, useful in LSM based detections like an executable trying to delete itself which rely on the inode_unlink LSM hook. Moreover, the ima_file_hash function does nothing with the struct file pointer apart from calling file_inode on it and converting it to an inode. 
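To make the intended use concrete, here is a hypothetical caller, not part of this patch: an inode-based LSM hook fingerprinting an inode. The 64-byte buffer is an assumption sized for SHA-512, the largest digest IMA stores (IMA_MAX_DIGEST_SIZE in IMA's internals).

#include <linux/fs.h>
#include <linux/ima.h>

/* Hypothetical example of consuming the new inode-based API. */
static int example_fingerprint_inode(struct inode *inode)
{
	char digest[64];	/* assumed large enough for SHA-512 */
	int algo;

	algo = ima_inode_hash(inode, digest, sizeof(digest));
	if (algo < 0)
		return algo;	/* -EOPNOTSUPP or -EINVAL, as documented */

	/* algo is an enum hash_algo value; digest now holds the hash,
	 * ready to compare against a known-good fingerprint.
	 */
	return 0;
}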
Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Mimi Zohar Link: https://lore.kernel.org/bpf/20201124151210.1081188-2-kpsingh@chromium.org --- include/linux/ima.h | 6 +++ security/integrity/ima/ima_main.c | 78 +++++++++++++++++++++---------- 2 files changed, 60 insertions(+), 24 deletions(-) diff --git a/include/linux/ima.h b/include/linux/ima.h index 8fa7bcfb2da2..7233a2751754 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -29,6 +29,7 @@ extern int ima_post_read_file(struct file *file, void *buf, loff_t size, enum kernel_read_file_id id); extern void ima_post_path_mknod(struct dentry *dentry); extern int ima_file_hash(struct file *file, char *buf, size_t buf_size); +extern int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size); extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size); #ifdef CONFIG_IMA_KEXEC @@ -115,6 +116,11 @@ static inline int ima_file_hash(struct file *file, char *buf, size_t buf_size) return -EOPNOTSUPP; } +static inline int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size) +{ + return -EOPNOTSUPP; +} + static inline void ima_kexec_cmdline(int kernel_fd, const void *buf, int size) {} #endif /* CONFIG_IMA */ diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 2d1af8899cab..cb2deaa188e7 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -501,37 +501,14 @@ int ima_file_check(struct file *file, int mask) } EXPORT_SYMBOL_GPL(ima_file_check); -/** - * ima_file_hash - return the stored measurement if a file has been hashed and - * is in the iint cache. - * @file: pointer to the file - * @buf: buffer in which to store the hash - * @buf_size: length of the buffer - * - * On success, return the hash algorithm (as defined in the enum hash_algo). - * If buf is not NULL, this function also outputs the hash into buf. - * If the hash is larger than buf_size, then only buf_size bytes will be copied. - * It generally just makes sense to pass a buffer capable of holding the largest - * possible hash: IMA_MAX_DIGEST_SIZE. - * The file hash returned is based on the entire file, including the appended - * signature. - * - * If IMA is disabled or if no measurement is available, return -EOPNOTSUPP. - * If the parameters are incorrect, return -EINVAL. - */ -int ima_file_hash(struct file *file, char *buf, size_t buf_size) +static int __ima_inode_hash(struct inode *inode, char *buf, size_t buf_size) { - struct inode *inode; struct integrity_iint_cache *iint; int hash_algo; - if (!file) - return -EINVAL; - if (!ima_policy_flag) return -EOPNOTSUPP; - inode = file_inode(file); iint = integrity_iint_find(inode); if (!iint) return -EOPNOTSUPP; @@ -558,8 +535,61 @@ int ima_file_hash(struct file *file, char *buf, size_t buf_size) return hash_algo; } + +/** + * ima_file_hash - return the stored measurement if a file has been hashed and + * is in the iint cache. + * @file: pointer to the file + * @buf: buffer in which to store the hash + * @buf_size: length of the buffer + * + * On success, return the hash algorithm (as defined in the enum hash_algo). + * If buf is not NULL, this function also outputs the hash into buf. + * If the hash is larger than buf_size, then only buf_size bytes will be copied. + * It generally just makes sense to pass a buffer capable of holding the largest + * possible hash: IMA_MAX_DIGEST_SIZE. + * The file hash returned is based on the entire file, including the appended + * signature. 
+ * + * If IMA is disabled or if no measurement is available, return -EOPNOTSUPP. + * If the parameters are incorrect, return -EINVAL. + */ +int ima_file_hash(struct file *file, char *buf, size_t buf_size) +{ + if (!file) + return -EINVAL; + + return __ima_inode_hash(file_inode(file), buf, buf_size); +} EXPORT_SYMBOL_GPL(ima_file_hash); +/** + * ima_inode_hash - return the stored measurement if the inode has been hashed + * and is in the iint cache. + * @inode: pointer to the inode + * @buf: buffer in which to store the hash + * @buf_size: length of the buffer + * + * On success, return the hash algorithm (as defined in the enum hash_algo). + * If buf is not NULL, this function also outputs the hash into buf. + * If the hash is larger than buf_size, then only buf_size bytes will be copied. + * It generally just makes sense to pass a buffer capable of holding the largest + * possible hash: IMA_MAX_DIGEST_SIZE. + * The hash returned is based on the entire contents, including the appended + * signature. + * + * If IMA is disabled or if no measurement is available, return -EOPNOTSUPP. + * If the parameters are incorrect, return -EINVAL. + */ +int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size) +{ + if (!inode) + return -EINVAL; + + return __ima_inode_hash(inode, buf, buf_size); +} +EXPORT_SYMBOL_GPL(ima_inode_hash); + /** * ima_post_create_tmpfile - mark newly created tmpfile as new * @file : newly created tmpfile From 27672f0d280a3f286a410a8db2004f46ace72a17 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 24 Nov 2020 15:12:09 +0000 Subject: [PATCH 024/118] bpf: Add a BPF helper for getting the IMA hash of an inode Provide a wrapper function to get the IMA hash of an inode. This helper is useful in fingerprinting files (e.g executables on execution) and using these fingerprints in detections like an executable unlinking itself. Since the ima_inode_hash can sleep, it's only allowed for sleepable LSM hooks. Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201124151210.1081188-3-kpsingh@chromium.org --- include/uapi/linux/bpf.h | 11 +++++++++++ kernel/bpf/bpf_lsm.c | 26 ++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 11 +++++++++++ 4 files changed, 50 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3ca6146f001a..c3458ec1f30a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3807,6 +3807,16 @@ union bpf_attr { * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**) * Return * Current *ktime*. + * + * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size) + * Description + * Returns the stored IMA hash of the *inode* (if it's avaialable). + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst* + * Return + * The **hash_algo** is returned on success, + * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if + * invalid arguments are passed. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3970,6 +3980,7 @@ union bpf_attr { FN(get_current_task_btf), \ FN(bprm_opts_set), \ FN(ktime_get_coarse_ns), \ + FN(ima_inode_hash), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index b4f27a874092..70e5e0b6d69d 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -15,6 +15,7 @@ #include #include #include +#include /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. @@ -75,6 +76,29 @@ const static struct bpf_func_proto bpf_bprm_opts_set_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_ima_inode_hash, struct inode *, inode, void *, dst, u32, size) +{ + return ima_inode_hash(inode, dst, size); +} + +static bool bpf_ima_inode_hash_allowed(const struct bpf_prog *prog) +{ + return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id); +} + +BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode) + +const static struct bpf_func_proto bpf_ima_inode_hash_proto = { + .func = bpf_ima_inode_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_ima_inode_hash_btf_ids[0], + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, + .allowed = bpf_ima_inode_hash_allowed, +}; + static const struct bpf_func_proto * bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -97,6 +121,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_task_storage_delete_proto; case BPF_FUNC_bprm_opts_set: return &bpf_bprm_opts_set_proto; + case BPF_FUNC_ima_inode_hash: + return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index c5bc947a70ad..8b829748d488 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -436,6 +436,7 @@ class PrinterHelpers(Printer): 'struct xdp_md', 'struct path', 'struct btf_ptr', + 'struct inode', ] known_types = { '...', @@ -480,6 +481,7 @@ class PrinterHelpers(Printer): 'struct task_struct', 'struct path', 'struct btf_ptr', + 'struct inode', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3ca6146f001a..c3458ec1f30a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3807,6 +3807,16 @@ union bpf_attr { * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**) * Return * Current *ktime*. + * + * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size) + * Description + * Returns the stored IMA hash of the *inode* (if it's avaialable). + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst* + * Return + * The **hash_algo** is returned on success, + * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if + * invalid arguments are passed. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3970,6 +3980,7 @@ union bpf_attr { FN(get_current_task_btf), \ FN(bprm_opts_set), \ FN(ktime_get_coarse_ns), \ + FN(ima_inode_hash), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper From 34b82d3ac1058653b3de7be4697b55f67533b1f1 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 24 Nov 2020 15:12:10 +0000 Subject: [PATCH 025/118] bpf: Add a selftest for bpf_ima_inode_hash The test does the following: - Mounts a loopback filesystem and appends the IMA policy to measure executions only on this file-system. Restricting the IMA policy to a particular filesystem prevents a system-wide IMA policy change. - Executes an executable copied to this loopback filesystem. - Calls the bpf_ima_inode_hash in the bprm_committed_creds hook and checks if the call succeeded and checks if a hash was calculated. The test shells out to the added ima_setup.sh script as the setup is better handled in a shell script and is more complicated to do in the test program or even shelling out individual commands from C. The list of required configs (i.e. IMA, SECURITYFS, IMA_{WRITE,READ}_POLICY) for running this test are also updated. Suggested-by: Mimi Zohar (limit policy rule to loopback mount) Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201124151210.1081188-4-kpsingh@chromium.org --- tools/testing/selftests/bpf/config | 4 + tools/testing/selftests/bpf/ima_setup.sh | 80 +++++++++++++++++++ .../selftests/bpf/prog_tests/test_ima.c | 74 +++++++++++++++++ tools/testing/selftests/bpf/progs/ima.c | 28 +++++++ 4 files changed, 186 insertions(+) create mode 100755 tools/testing/selftests/bpf/ima_setup.sh create mode 100644 tools/testing/selftests/bpf/prog_tests/test_ima.c create mode 100644 tools/testing/selftests/bpf/progs/ima.c diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 2118e23ac07a..365bf9771b07 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -39,3 +39,7 @@ CONFIG_BPF_JIT=y CONFIG_BPF_LSM=y CONFIG_SECURITY=y CONFIG_LIRC=y +CONFIG_IMA=y +CONFIG_SECURITYFS=y +CONFIG_IMA_WRITE_POLICY=y +CONFIG_IMA_READ_POLICY=y diff --git a/tools/testing/selftests/bpf/ima_setup.sh b/tools/testing/selftests/bpf/ima_setup.sh new file mode 100755 index 000000000000..15490ccc5e55 --- /dev/null +++ b/tools/testing/selftests/bpf/ima_setup.sh @@ -0,0 +1,80 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +set -e +set -u + +IMA_POLICY_FILE="/sys/kernel/security/ima/policy" +TEST_BINARY="/bin/true" + +usage() +{ + echo "Usage: $0 " + exit 1 +} + +setup() +{ + local tmp_dir="$1" + local mount_img="${tmp_dir}/test.img" + local mount_dir="${tmp_dir}/mnt" + local copied_bin_path="${mount_dir}/$(basename ${TEST_BINARY})" + mkdir -p ${mount_dir} + + dd if=/dev/zero of="${mount_img}" bs=1M count=10 + + local loop_device="$(losetup --find --show ${mount_img})" + + mkfs.ext4 "${loop_device}" + mount "${loop_device}" "${mount_dir}" + + cp "${TEST_BINARY}" "${mount_dir}" + local mount_uuid="$(blkid -s UUID -o value ${loop_device})" + echo "measure func=BPRM_CHECK fsuuid=${mount_uuid}" > ${IMA_POLICY_FILE} +} + +cleanup() { + local tmp_dir="$1" + local mount_img="${tmp_dir}/test.img" + local mount_dir="${tmp_dir}/mnt" + + local loop_devices=$(losetup -j ${mount_img} -O NAME --noheadings) + for loop_dev in "${loop_devices}"; do + losetup -d $loop_dev + done + + umount ${mount_dir} + rm -rf ${tmp_dir} +} + +run() +{ 
+ local tmp_dir="$1" + local mount_dir="${tmp_dir}/mnt" + local copied_bin_path="${mount_dir}/$(basename ${TEST_BINARY})" + + exec "${copied_bin_path}" +} + +main() +{ + [[ $# -ne 2 ]] && usage + + local action="$1" + local tmp_dir="$2" + + [[ ! -d "${tmp_dir}" ]] && echo "Directory ${tmp_dir} doesn't exist" && exit 1 + + if [[ "${action}" == "setup" ]]; then + setup "${tmp_dir}" + elif [[ "${action}" == "cleanup" ]]; then + cleanup "${tmp_dir}" + elif [[ "${action}" == "run" ]]; then + run "${tmp_dir}" + else + echo "Unknown action: ${action}" + exit 1 + fi +} + +main "$@" diff --git a/tools/testing/selftests/bpf/prog_tests/test_ima.c b/tools/testing/selftests/bpf/prog_tests/test_ima.c new file mode 100644 index 000000000000..61fca681d524 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_ima.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2020 Google LLC. + */ + +#include +#include +#include +#include +#include + +#include "ima.skel.h" + +static int run_measured_process(const char *measured_dir, u32 *monitored_pid) +{ + int child_pid, child_status; + + child_pid = fork(); + if (child_pid == 0) { + *monitored_pid = getpid(); + execlp("./ima_setup.sh", "./ima_setup.sh", "run", measured_dir, + NULL); + exit(errno); + + } else if (child_pid > 0) { + waitpid(child_pid, &child_status, 0); + return WEXITSTATUS(child_status); + } + + return -EINVAL; +} + +void test_test_ima(void) +{ + char measured_dir_template[] = "/tmp/ima_measuredXXXXXX"; + const char *measured_dir; + char cmd[256]; + + int err, duration = 0; + struct ima *skel = NULL; + + skel = ima__open_and_load(); + if (CHECK(!skel, "skel_load", "skeleton failed\n")) + goto close_prog; + + err = ima__attach(skel); + if (CHECK(err, "attach", "attach failed: %d\n", err)) + goto close_prog; + + measured_dir = mkdtemp(measured_dir_template); + if (CHECK(measured_dir == NULL, "mkdtemp", "err %d\n", errno)) + goto close_prog; + + snprintf(cmd, sizeof(cmd), "./ima_setup.sh setup %s", measured_dir); + if (CHECK_FAIL(system(cmd))) + goto close_clean; + + err = run_measured_process(measured_dir, &skel->bss->monitored_pid); + if (CHECK(err, "run_measured_process", "err = %d\n", err)) + goto close_clean; + + CHECK(skel->data->ima_hash_ret < 0, "ima_hash_ret", + "ima_hash_ret = %ld\n", skel->data->ima_hash_ret); + + CHECK(skel->bss->ima_hash == 0, "ima_hash", + "ima_hash = %lu\n", skel->bss->ima_hash); + +close_clean: + snprintf(cmd, sizeof(cmd), "./ima_setup.sh cleanup %s", measured_dir); + CHECK_FAIL(system(cmd)); +close_prog: + ima__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/ima.c b/tools/testing/selftests/bpf/progs/ima.c new file mode 100644 index 000000000000..86b21aff4bc5 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/ima.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2020 Google LLC. 
+ */ + +#include "vmlinux.h" +#include +#include +#include + +long ima_hash_ret = -1; +u64 ima_hash = 0; +u32 monitored_pid = 0; + +char _license[] SEC("license") = "GPL"; + +SEC("lsm.s/bprm_committed_creds") +int BPF_PROG(ima, struct linux_binprm *bprm) +{ + u32 pid = bpf_get_current_pid_tgid() >> 32; + + if (pid == monitored_pid) + ima_hash_ret = bpf_ima_inode_hash(bprm->file->f_inode, + &ima_hash, sizeof(ima_hash)); + + return 0; +} From fb3558127cb62ba2dea9e3d0efa1bb1d7e5eee2a Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Tue, 24 Nov 2020 22:52:55 -0500 Subject: [PATCH 026/118] bpf: Fix selftest compilation on clang 11 Before this patch, profiler.inc.h wouldn't compile with clang-11 (before the __builtin_preserve_enum_value LLVM builtin was introduced in https://reviews.llvm.org/D83242). Another test that uses this builtin (test_core_enumval) is conditionally skipped if the compiler is too old. In that spirit, this patch inhibits part of populate_cgroup_info(), which needs this CO-RE builtin. The selftests build again on clang-11. The affected test (the profiler test) doesn't pass on clang-11 because it's missing https://reviews.llvm.org/D85570, but at least the test suite as a whole compiles. The test's expected failure is already called out in the README. Signed-off-by: Andrei Matei Signed-off-by: Daniel Borkmann Tested-by: Florian Lehner Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201125035255.17970-1-andreimatei1@gmail.com --- tools/testing/selftests/bpf/progs/profiler.inc.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h index 30982a7e4d0f..4896fdf816f7 100644 --- a/tools/testing/selftests/bpf/progs/profiler.inc.h +++ b/tools/testing/selftests/bpf/progs/profiler.inc.h @@ -256,6 +256,7 @@ static INLINE void* populate_cgroup_info(struct cgroup_data_t* cgroup_data, BPF_CORE_READ(task, nsproxy, cgroup_ns, root_cset, dfl_cgrp, kn); struct kernfs_node* proc_kernfs = BPF_CORE_READ(task, cgroups, dfl_cgrp, kn); +#if __has_builtin(__builtin_preserve_enum_value) if (ENABLE_CGROUP_V1_RESOLVER && CONFIG_CGROUP_PIDS) { int cgrp_id = bpf_core_enum_value(enum cgroup_subsys_id___local, pids_cgrp_id___local); @@ -275,6 +276,7 @@ static INLINE void* populate_cgroup_info(struct cgroup_data_t* cgroup_data, } } } +#endif cgroup_data->cgroup_root_inode = get_inode_from_kernfs(root_kernfs); cgroup_data->cgroup_proc_inode = get_inode_from_kernfs(proc_kernfs); From c5815ac7e2aaff4f00b2b9e21d84b9f2fddddb48 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Tue, 24 Nov 2020 09:03:04 +0000 Subject: [PATCH 027/118] samples: bpf: Refactor hbm program with libbpf This commit refactors the existing cgroup programs with libbpf bpf loader. Since bpf_program__attach doesn't support cgroup program attachment, this explicitly attaches cgroup bpf program with bpf_program__attach_cgroup(bpf_prog, cg1). Also, to change attach_type of bpf program, this uses libbpf's bpf_program__set_expected_attach_type helper to switch EGRESS to INGRESS. To keep bpf program attached to the cgroup hierarchy even after the exit, this commit uses the BPF_LINK_PINNING to pin the link attachment even after it is closed. Besides, this program was broken due to the typo of BPF MAP definition. But this commit solves the problem by fixing this from 'queue_stats' map struct hvm_queue_stats -> hbm_queue_stats. 
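Condensed to its essence, the attach-and-pin flow described above looks roughly like the sketch below. This is a simplification, not the hbm.c code in the diff that follows; the object path, section title, and pin path are illustrative.

#include <bpf/libbpf.h>

/* Simplified sketch of the libbpf open/load/attach/pin pattern. */
static int attach_and_pin(const char *obj_path, int cgroup_fd,
			  const char *pin_path)
{
	struct bpf_object *obj;
	struct bpf_program *prog;
	struct bpf_link *link;

	obj = bpf_object__open_file(obj_path, NULL);
	if (libbpf_get_error(obj))
		return -1;
	if (bpf_object__load(obj))
		return -1;

	prog = bpf_object__find_program_by_title(obj, "cgroup_skb/egress");
	if (!prog)
		return -1;

	/* attach via a bpf_link rather than bpf_prog_attach() */
	link = bpf_program__attach_cgroup(prog, cgroup_fd);
	if (libbpf_get_error(link))
		return -1;

	/* pinning keeps the attachment alive after this process exits */
	return bpf_link__pin(link, pin_path);
}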
Fixes: 36b5d471135c ("selftests/bpf: samples/bpf: Split off legacy stuff from bpf_helpers.h") Signed-off-by: Daniel T. Lee Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201124090310.24374-2-danieltimlee@gmail.com --- samples/bpf/.gitignore | 3 + samples/bpf/Makefile | 2 +- samples/bpf/do_hbm_test.sh | 32 +++++------ samples/bpf/hbm.c | 113 ++++++++++++++++++++----------------- samples/bpf/hbm_kern.h | 2 +- 5 files changed, 79 insertions(+), 73 deletions(-) diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore index b2f29bc8dc43..0b9548ea8477 100644 --- a/samples/bpf/.gitignore +++ b/samples/bpf/.gitignore @@ -52,3 +52,6 @@ xdp_tx_iptunnel xdpsock xsk_fwd testfile.img +hbm_out.log +iperf.* +*.out diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index aeebf5d12f32..7c61118525f7 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -110,7 +110,7 @@ xdp_fwd-objs := xdp_fwd_user.o task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS) xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS) -hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS) +hbm-objs := hbm.o $(CGROUP_HELPERS) # Tell kbuild to always build the programs always-y := $(tprogs-y) diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh index ffe4c0607341..21790ea5c460 100755 --- a/samples/bpf/do_hbm_test.sh +++ b/samples/bpf/do_hbm_test.sh @@ -91,6 +91,16 @@ qdisc="" flags="" do_stats=0 +BPFFS=/sys/fs/bpf +function config_bpffs () { + if mount | grep $BPFFS > /dev/null; then + echo "bpffs already mounted" + else + echo "bpffs not mounted. Mounting..." + mount -t bpf none $BPFFS + fi +} + function start_hbm () { rm -f hbm.out echo "./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog" > hbm.out @@ -192,6 +202,7 @@ processArgs () { } processArgs +config_bpffs if [ $debug_flag -eq 1 ] ; then rm -f hbm_out.log @@ -201,7 +212,7 @@ hbm_pid=$(start_hbm) usleep 100000 host=`hostname` -cg_base_dir=/sys/fs/cgroup +cg_base_dir=/sys/fs/cgroup/unified cg_dir="$cg_base_dir/cgroup-test-work-dir/hbm$id" echo $$ >> $cg_dir/cgroup.procs @@ -411,23 +422,8 @@ fi sleep 1 -# Detach any BPF programs that may have lingered -ttx=`bpftool cgroup tree | grep hbm` -v=2 -for x in $ttx ; do - if [ "${x:0:36}" == "/sys/fs/cgroup/cgroup-test-work-dir/" ] ; then - cg=$x ; v=0 - else - if [ $v -eq 0 ] ; then - id=$x ; v=1 - else - if [ $v -eq 1 ] ; then - type=$x ; bpftool cgroup detach $cg $type id $id - v=0 - fi - fi - fi -done +# Detach any pinned BPF programs that may have lingered +rm -rf $BPFFS/hbm* if [ $use_netperf -ne 0 ] ; then if [ "$server" == "" ] ; then diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c index 400e741a56eb..b0c18efe7928 100644 --- a/samples/bpf/hbm.c +++ b/samples/bpf/hbm.c @@ -46,7 +46,6 @@ #include #include -#include "bpf_load.h" #include "bpf_rlimit.h" #include "cgroup_helpers.h" #include "hbm.h" @@ -70,9 +69,9 @@ static void do_error(char *msg, bool errno_flag); #define DEBUGFS "/sys/kernel/debug/tracing/" -struct bpf_object *obj; -int bpfprog_fd; -int cgroup_storage_fd; +static struct bpf_program *bpf_prog; +static struct bpf_object *obj; +static int queue_stats_fd; static void read_trace_pipe2(void) { @@ -121,56 +120,50 @@ static void do_error(char *msg, bool errno_flag) static int prog_load(char *prog) { - struct bpf_prog_load_attr prog_load_attr = { - .prog_type = BPF_PROG_TYPE_CGROUP_SKB, - .file = prog, - .expected_attach_type = BPF_CGROUP_INET_EGRESS, - }; - int map_fd; - struct bpf_map *map; 
- - int ret = 0; - - if (access(prog, O_RDONLY) < 0) { - printf("Error accessing file %s: %s\n", prog, strerror(errno)); + obj = bpf_object__open_file(prog, NULL); + if (libbpf_get_error(obj)) { + printf("ERROR: opening BPF object file failed\n"); return 1; } - if (bpf_prog_load_xattr(&prog_load_attr, &obj, &bpfprog_fd)) - ret = 1; - if (!ret) { - map = bpf_object__find_map_by_name(obj, "queue_stats"); - map_fd = bpf_map__fd(map); - if (map_fd < 0) { - printf("Map not found: %s\n", strerror(map_fd)); - ret = 1; - } + + /* load BPF program */ + if (bpf_object__load(obj)) { + printf("ERROR: loading BPF object file failed\n"); + goto err; } - if (ret) { - printf("ERROR: bpf_prog_load_xattr failed for: %s\n", prog); - printf(" Output from verifier:\n%s\n------\n", bpf_log_buf); - ret = -1; - } else { - ret = map_fd; + bpf_prog = bpf_object__find_program_by_title(obj, "cgroup_skb/egress"); + if (!bpf_prog) { + printf("ERROR: finding a prog in obj file failed\n"); + goto err; } - return ret; + queue_stats_fd = bpf_object__find_map_fd_by_name(obj, "queue_stats"); + if (queue_stats_fd < 0) { + printf("ERROR: finding a map in obj file failed\n"); + goto err; + } + + return 0; + +err: + bpf_object__close(obj); + return 1; } static int run_bpf_prog(char *prog, int cg_id) { - int map_fd; - int rc = 0; + struct hbm_queue_stats qstats = {0}; + char cg_dir[100], cg_pin_path[100]; + struct bpf_link *link = NULL; int key = 0; int cg1 = 0; - int type = BPF_CGROUP_INET_EGRESS; - char cg_dir[100]; - struct hbm_queue_stats qstats = {0}; + int rc = 0; sprintf(cg_dir, "/hbm%d", cg_id); - map_fd = prog_load(prog); - if (map_fd == -1) - return 1; + rc = prog_load(prog); + if (rc != 0) + return rc; if (setup_cgroup_environment()) { printf("ERROR: setting cgroup environment\n"); @@ -190,16 +183,24 @@ static int run_bpf_prog(char *prog, int cg_id) qstats.stats = stats_flag ? 1 : 0; qstats.loopback = loopback_flag ? 1 : 0; qstats.no_cn = no_cn_flag ? 
1 : 0; - if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) { + if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY)) { printf("ERROR: Could not update map element\n"); goto err; } if (!outFlag) - type = BPF_CGROUP_INET_INGRESS; - if (bpf_prog_attach(bpfprog_fd, cg1, type, 0)) { - printf("ERROR: bpf_prog_attach fails!\n"); - log_err("Attaching prog"); + bpf_program__set_expected_attach_type(bpf_prog, BPF_CGROUP_INET_INGRESS); + + link = bpf_program__attach_cgroup(bpf_prog, cg1); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach_cgroup failed\n"); + goto err; + } + + sprintf(cg_pin_path, "/sys/fs/bpf/hbm%d", cg_id); + rc = bpf_link__pin(link, cg_pin_path); + if (rc < 0) { + printf("ERROR: bpf_link__pin failed: %d\n", rc); goto err; } @@ -213,7 +214,7 @@ static int run_bpf_prog(char *prog, int cg_id) #define DELTA_RATE_CHECK 10000 /* in us */ #define RATE_THRESHOLD 9500000000 /* 9.5 Gbps */ - bpf_map_lookup_elem(map_fd, &key, &qstats); + bpf_map_lookup_elem(queue_stats_fd, &key, &qstats); if (gettimeofday(&t0, NULL) < 0) do_error("gettimeofday failed", true); t_last = t0; @@ -242,7 +243,7 @@ static int run_bpf_prog(char *prog, int cg_id) fclose(fin); printf(" new_eth_tx_bytes:%llu\n", new_eth_tx_bytes); - bpf_map_lookup_elem(map_fd, &key, &qstats); + bpf_map_lookup_elem(queue_stats_fd, &key, &qstats); new_cg_tx_bytes = qstats.bytes_total; delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes; last_eth_tx_bytes = new_eth_tx_bytes; @@ -289,14 +290,14 @@ static int run_bpf_prog(char *prog, int cg_id) rate = minRate; qstats.rate = rate; } - if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) + if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY)) do_error("update map element fails", false); } } else { sleep(dur); } // Get stats! - if (stats_flag && bpf_map_lookup_elem(map_fd, &key, &qstats)) { + if (stats_flag && bpf_map_lookup_elem(queue_stats_fd, &key, &qstats)) { char fname[100]; FILE *fout; @@ -394,14 +395,20 @@ static int run_bpf_prog(char *prog, int cg_id) if (debugFlag) read_trace_pipe2(); - return rc; + goto cleanup; + err: rc = 1; - if (cg1) - close(cg1); - cleanup_cgroup_environment(); +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); + if (cg1 != -1) + close(cg1); + + if (rc != 0) + cleanup_cgroup_environment(); return rc; } diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h index e00f26f6afba..722b3fadb467 100644 --- a/samples/bpf/hbm_kern.h +++ b/samples/bpf/hbm_kern.h @@ -69,7 +69,7 @@ struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 1); __type(key, u32); - __type(value, struct hvm_queue_stats); + __type(value, struct hbm_queue_stats); } queue_stats SEC(".maps"); struct hbm_pkt_info { From d89af13c92056c46dfc4bcb3d90efe88937c3381 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Tue, 24 Nov 2020 09:03:05 +0000 Subject: [PATCH 028/118] samples: bpf: Refactor test_cgrp2_sock2 program with libbpf This commit refactors the existing cgroup program with the libbpf bpf loader. The original test_cgrp2_sock2 kept the bpf program attached to the cgroup hierarchy even after the exit of the user program. To implement the same functionality with libbpf, this commit uses BPF_LINK_PINNING to pin the link attachment even after it is closed. Since this uses a LINK instead of ATTACH, the detach of the bpf program from the cgroup via 'test_cgrp2_sock' is no longer used. The code to mount the bpffs was added to the .sh file in case bpffs was not mounted on /sys/fs/bpf.
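One consequence of the link-based approach is worth spelling out (illustrative sketch, not code from this patch): with the link pinned, the pinned file holds the last reference once the loader exits, so detaching later is simply a matter of removing the pin.

#include <unistd.h>

/* Illustrative only: removing the pin drops the last reference to the
 * bpf_link, which detaches the program from the cgroup; the shell
 * script below does the same with rm -rf $LINK_PIN.
 */
static int detach_pinned_link(const char *pin_path)
{
	return unlink(pin_path);
}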
Additionally, to fix the problem that shell script cannot find the binary object from the current path, relative path './' has been added in front of binary. Fixes: 554ae6e792ef3 ("samples/bpf: add userspace example for prohibiting sockets") Signed-off-by: Daniel T. Lee Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201124090310.24374-3-danieltimlee@gmail.com --- samples/bpf/Makefile | 2 +- samples/bpf/test_cgrp2_sock2.c | 69 +++++++++++++++++++++++---------- samples/bpf/test_cgrp2_sock2.sh | 21 ++++++++-- 3 files changed, 66 insertions(+), 26 deletions(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 7c61118525f7..d31e082c369e 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -82,7 +82,7 @@ test_overhead-objs := bpf_load.o test_overhead_user.o test_cgrp2_array_pin-objs := test_cgrp2_array_pin.o test_cgrp2_attach-objs := test_cgrp2_attach.o test_cgrp2_sock-objs := test_cgrp2_sock.o -test_cgrp2_sock2-objs := bpf_load.o test_cgrp2_sock2.o +test_cgrp2_sock2-objs := test_cgrp2_sock2.o xdp1-objs := xdp1_user.o # reuse xdp1 source intentionally xdp2-objs := xdp1_user.o diff --git a/samples/bpf/test_cgrp2_sock2.c b/samples/bpf/test_cgrp2_sock2.c index a9277b118c33..e7060aaa2f5a 100644 --- a/samples/bpf/test_cgrp2_sock2.c +++ b/samples/bpf/test_cgrp2_sock2.c @@ -20,9 +20,9 @@ #include #include #include +#include #include "bpf_insn.h" -#include "bpf_load.h" static int usage(const char *argv0) { @@ -32,37 +32,64 @@ static int usage(const char *argv0) int main(int argc, char **argv) { - int cg_fd, ret, filter_id = 0; + int cg_fd, err, ret = EXIT_FAILURE, filter_id = 0, prog_cnt = 0; + const char *link_pin_path = "/sys/fs/bpf/test_cgrp2_sock2"; + struct bpf_link *link = NULL; + struct bpf_program *progs[2]; + struct bpf_program *prog; + struct bpf_object *obj; if (argc < 3) return usage(argv[0]); - cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY); - if (cg_fd < 0) { - printf("Failed to open cgroup path: '%s'\n", strerror(errno)); - return EXIT_FAILURE; - } - - if (load_bpf_file(argv[2])) - return EXIT_FAILURE; - - printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf); - if (argc > 3) filter_id = atoi(argv[3]); + cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY); + if (cg_fd < 0) { + printf("Failed to open cgroup path: '%s'\n", strerror(errno)); + return ret; + } + + obj = bpf_object__open_file(argv[2], NULL); + if (libbpf_get_error(obj)) { + printf("ERROR: opening BPF object file failed\n"); + return ret; + } + + bpf_object__for_each_program(prog, obj) { + progs[prog_cnt] = prog; + prog_cnt++; + } + if (filter_id >= prog_cnt) { printf("Invalid program id; program not found in file\n"); - return EXIT_FAILURE; + goto cleanup; } - ret = bpf_prog_attach(prog_fd[filter_id], cg_fd, - BPF_CGROUP_INET_SOCK_CREATE, 0); - if (ret < 0) { - printf("Failed to attach prog to cgroup: '%s'\n", - strerror(errno)); - return EXIT_FAILURE; + /* load BPF program */ + if (bpf_object__load(obj)) { + printf("ERROR: loading BPF object file failed\n"); + goto cleanup; } - return EXIT_SUCCESS; + link = bpf_program__attach_cgroup(progs[filter_id], cg_fd); + if (libbpf_get_error(link)) { + printf("ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; + } + + err = bpf_link__pin(link, link_pin_path); + if (err < 0) { + printf("ERROR: bpf_link__pin failed: %d\n", err); + goto cleanup; + } + + ret = EXIT_SUCCESS; + +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); + return ret; } diff --git a/samples/bpf/test_cgrp2_sock2.sh 
b/samples/bpf/test_cgrp2_sock2.sh index 0f396a86e0cb..6a3dbe642b2b 100755 --- a/samples/bpf/test_cgrp2_sock2.sh +++ b/samples/bpf/test_cgrp2_sock2.sh @@ -1,6 +1,9 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +BPFFS=/sys/fs/bpf +LINK_PIN=$BPFFS/test_cgrp2_sock2 + function config_device { ip netns add at_ns0 ip link add veth0 type veth peer name veth0b @@ -21,16 +24,22 @@ function config_cgroup { echo $$ >> /tmp/cgroupv2/foo/cgroup.procs } +function config_bpffs { + if mount | grep $BPFFS > /dev/null; then + echo "bpffs already mounted" + else + echo "bpffs not mounted. Mounting..." + mount -t bpf none $BPFFS + fi +} function attach_bpf { - test_cgrp2_sock2 /tmp/cgroupv2/foo sock_flags_kern.o $1 + ./test_cgrp2_sock2 /tmp/cgroupv2/foo sock_flags_kern.o $1 [ $? -ne 0 ] && exit 1 } function cleanup { - if [ -d /tmp/cgroupv2/foo ]; then - test_cgrp2_sock -d /tmp/cgroupv2/foo - fi + rm -rf $LINK_PIN ip link del veth0b ip netns delete at_ns0 umount /tmp/cgroupv2 @@ -42,6 +51,7 @@ cleanup 2>/dev/null set -e config_device config_cgroup +config_bpffs set +e # @@ -62,6 +72,9 @@ if [ $? -eq 0 ]; then exit 1 fi +rm -rf $LINK_PIN +sleep 1 # Wait for link detach + # # Test 2 - fail ping # From 4fe6641526dbee115d9d037e94a344f4f448aaa4 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Tue, 24 Nov 2020 09:03:06 +0000 Subject: [PATCH 029/118] samples: bpf: Refactor task_fd_query program with libbpf This commit refactors the existing kprobe program with the libbpf bpf loader. To attach a bpf program, this uses the generic bpf_program__attach() approach rather than bpf_load's load_bpf_file(). To attach bpf to a perf_event, instead of the previous ioctl method, this commit uses bpf_program__attach_perf_event(), since it manages both enabling the perf_event and attaching the BPF program to it, which is a much more intuitive approach. Also, the explicit close(fd) has been removed since the event is closed automatically inside bpf_link__destroy(). Furthermore, to prevent conflicts between identically named uprobe events, the O_TRUNC flag is used to clear the 'uprobe_events' interface. Signed-off-by: Daniel T.
Lee Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201124090310.24374-4-danieltimlee@gmail.com --- samples/bpf/Makefile | 2 +- samples/bpf/task_fd_query_user.c | 101 ++++++++++++++++++++++--------- 2 files changed, 75 insertions(+), 28 deletions(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index d31e082c369e..3bffd42e1482 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -107,7 +107,7 @@ xdp_adjust_tail-objs := xdp_adjust_tail_user.o xdpsock-objs := xdpsock_user.o xsk_fwd-objs := xsk_fwd.o xdp_fwd-objs := xdp_fwd_user.o -task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS) +task_fd_query-objs := task_fd_query_user.o $(TRACE_HELPERS) xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS) hbm-objs := hbm.o $(CGROUP_HELPERS) diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c index b68bd2f8fdc9..f6b772faa348 100644 --- a/samples/bpf/task_fd_query_user.c +++ b/samples/bpf/task_fd_query_user.c @@ -15,12 +15,15 @@ #include #include +#include #include -#include "bpf_load.h" #include "bpf_util.h" #include "perf-sys.h" #include "trace_helpers.h" +static struct bpf_program *progs[2]; +static struct bpf_link *links[2]; + #define CHECK_PERROR_RET(condition) ({ \ int __ret = !!(condition); \ if (__ret) { \ @@ -86,21 +89,22 @@ static int bpf_get_retprobe_bit(const char *event_type) return ret; } -static int test_debug_fs_kprobe(int prog_fd_idx, const char *fn_name, +static int test_debug_fs_kprobe(int link_idx, const char *fn_name, __u32 expected_fd_type) { __u64 probe_offset, probe_addr; __u32 len, prog_id, fd_type; + int err, event_fd; char buf[256]; - int err; len = sizeof(buf); - err = bpf_task_fd_query(getpid(), event_fd[prog_fd_idx], 0, buf, &len, + event_fd = bpf_link__fd(links[link_idx]); + err = bpf_task_fd_query(getpid(), event_fd, 0, buf, &len, &prog_id, &fd_type, &probe_offset, &probe_addr); if (err < 0) { printf("FAIL: %s, for event_fd idx %d, fn_name %s\n", - __func__, prog_fd_idx, fn_name); + __func__, link_idx, fn_name); perror(" :"); return -1; } @@ -108,7 +112,7 @@ static int test_debug_fs_kprobe(int prog_fd_idx, const char *fn_name, fd_type != expected_fd_type || probe_offset != 0x0 || probe_addr != 0x0) { printf("FAIL: bpf_trace_event_query(event_fd[%d]):\n", - prog_fd_idx); + link_idx); printf("buf: %s, fd_type: %u, probe_offset: 0x%llx," " probe_addr: 0x%llx\n", buf, fd_type, probe_offset, probe_addr); @@ -125,12 +129,13 @@ static int test_nondebug_fs_kuprobe_common(const char *event_type, int is_return_bit = bpf_get_retprobe_bit(event_type); int type = bpf_find_probe_type(event_type); struct perf_event_attr attr = {}; - int fd; + struct bpf_link *link; + int fd, err = -1; if (type < 0 || is_return_bit < 0) { printf("FAIL: %s incorrect type (%d) or is_return_bit (%d)\n", __func__, type, is_return_bit); - return -1; + return err; } attr.sample_period = 1; @@ -149,14 +154,21 @@ static int test_nondebug_fs_kuprobe_common(const char *event_type, attr.type = type; fd = sys_perf_event_open(&attr, -1, 0, -1, 0); - CHECK_PERROR_RET(fd < 0); + link = bpf_program__attach_perf_event(progs[0], fd); + if (libbpf_get_error(link)) { + printf("ERROR: bpf_program__attach_perf_event failed\n"); + link = NULL; + close(fd); + goto cleanup; + } - CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) < 0); - CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0); CHECK_PERROR_RET(bpf_task_fd_query(getpid(), fd, 0, buf, buf_len, 
prog_id, fd_type, probe_offset, probe_addr) < 0); + err = 0; - return 0; +cleanup: + bpf_link__destroy(link); + return err; } static int test_nondebug_fs_probe(const char *event_type, const char *name, @@ -215,17 +227,18 @@ static int test_nondebug_fs_probe(const char *event_type, const char *name, static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return) { + char buf[256], event_alias[sizeof("test_1234567890")]; const char *event_type = "uprobe"; struct perf_event_attr attr = {}; - char buf[256], event_alias[sizeof("test_1234567890")]; __u64 probe_offset, probe_addr; __u32 len, prog_id, fd_type; - int err, res, kfd, efd; + int err = -1, res, kfd, efd; + struct bpf_link *link; ssize_t bytes; snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type); - kfd = open(buf, O_WRONLY | O_APPEND, 0); + kfd = open(buf, O_WRONLY | O_TRUNC, 0); CHECK_PERROR_RET(kfd < 0); res = snprintf(event_alias, sizeof(event_alias), "test_%d", getpid()); @@ -254,10 +267,15 @@ static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return) attr.type = PERF_TYPE_TRACEPOINT; attr.sample_period = 1; attr.wakeup_events = 1; + kfd = sys_perf_event_open(&attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); - CHECK_PERROR_RET(kfd < 0); - CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0); - CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_ENABLE, 0) < 0); + link = bpf_program__attach_perf_event(progs[0], kfd); + if (libbpf_get_error(link)) { + printf("ERROR: bpf_program__attach_perf_event failed\n"); + link = NULL; + close(kfd); + goto cleanup; + } len = sizeof(buf); err = bpf_task_fd_query(getpid(), kfd, 0, buf, &len, @@ -283,9 +301,11 @@ static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return) probe_offset); return -1; } + err = 0; - close(kfd); - return 0; +cleanup: + bpf_link__destroy(link); + return err; } int main(int argc, char **argv) @@ -294,21 +314,42 @@ int main(int argc, char **argv) extern char __executable_start; char filename[256], buf[256]; __u64 uprobe_file_offset; + struct bpf_program *prog; + struct bpf_object *obj; + int i = 0, err = -1; - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); if (setrlimit(RLIMIT_MEMLOCK, &r)) { perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; + return err; } if (load_kallsyms()) { printf("failed to process /proc/kallsyms\n"); - return 1; + return err; } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return err; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + progs[i] = prog; + links[i] = bpf_program__attach(progs[i]); + if (libbpf_get_error(links[i])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[i] = NULL; + goto cleanup; + } + i++; } /* test two functions in the corresponding *_kern.c file */ @@ -378,6 +419,12 @@ int main(int argc, char **argv) false)); CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset, true)); + err = 0; - return 0; +cleanup: + for (i--; i >= 0; i--) + bpf_link__destroy(links[i]); + + bpf_object__close(obj); + return err; } From 763af200d6160b2f79f45cbf9a85b8dc6e20f2c7 Mon Sep 17 00:00:00 2001 From: "Daniel T. 
Lee" Date: Tue, 24 Nov 2020 09:03:07 +0000 Subject: [PATCH 030/118] samples: bpf: Refactor ibumad program with libbpf This commit refactors the existing ibumad program with the libbpf bpf loader. Attaching and detaching the tracepoint bpf programs is now managed with the generic bpf_program__attach() and bpf_link__destroy() from libbpf. Also, instead of using the previous BPF MAP definition, this commit refactors the ibumad map definitions into the new BTF-defined map format. To verify that this bpf program works without an infiniband device, try loading the ib_umad kernel module and test the program as follows: # modprobe ib_umad # ./ibumad Moreover, TRACE_HELPERS has been removed from the Makefile since it is not used by this program. Signed-off-by: Daniel T. Lee Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201124090310.24374-5-danieltimlee@gmail.com --- samples/bpf/Makefile | 2 +- samples/bpf/ibumad_kern.c | 24 ++++++------- samples/bpf/ibumad_user.c | 71 +++++++++++++++++++++++---------- 3 files changed, 67 insertions(+), 30 deletions(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 3bffd42e1482..09a249477554 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -109,7 +109,7 @@ xsk_fwd-objs := xsk_fwd.o xdp_fwd-objs := xdp_fwd_user.o task_fd_query-objs := task_fd_query_user.o $(TRACE_HELPERS) xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) -ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS) +ibumad-objs := ibumad_user.o hbm-objs := hbm.o $(CGROUP_HELPERS) # Tell kbuild to always build the programs diff --git a/samples/bpf/ibumad_kern.c b/samples/bpf/ibumad_kern.c index 3a91b4c1989a..26dcd4dde946 100644 --- a/samples/bpf/ibumad_kern.c +++ b/samples/bpf/ibumad_kern.c @@ -16,19 +16,19 @@ #include -struct bpf_map_def SEC("maps") read_count = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), /* class; u32 required */ - .value_size = sizeof(u64), /* count of mads read */ - .max_entries = 256, /* Room for all Classes */ -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); /* class; u32 required */ + __type(value, u64); /* count of mads read */ + __uint(max_entries, 256); /* Room for all Classes */ +} read_count SEC(".maps"); -struct bpf_map_def SEC("maps") write_count = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), /* class; u32 required */ - .value_size = sizeof(u64), /* count of mads written */ - .max_entries = 256, /* Room for all Classes */ -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); /* class; u32 required */ + __type(value, u64); /* count of mads written */ + __uint(max_entries, 256); /* Room for all Classes */ +} write_count SEC(".maps"); #undef DEBUG #ifndef DEBUG diff --git a/samples/bpf/ibumad_user.c b/samples/bpf/ibumad_user.c index fa06eef31a84..d83d8102f489 100644 --- a/samples/bpf/ibumad_user.c +++ b/samples/bpf/ibumad_user.c @@ -23,10 +23,15 @@ #include #include -#include "bpf_load.h" +#include #include "bpf_util.h" #include +static struct bpf_link *tp_links[3]; +static struct bpf_object *obj; +static int map_fd[2]; +static int tp_cnt; + static void dump_counts(int fd) { __u32 key; @@ -53,6 +58,11 @@ static void dump_all_counts(void) static void dump_exit(int sig) { dump_all_counts(); + /* Detach tracepoints */ + while (tp_cnt) + bpf_link__destroy(tp_links[--tp_cnt]); + + bpf_object__close(obj); exit(0); } @@ -73,19 +83,11 @@ static void usage(char *cmd) int main(int argc, char **argv) { + struct bpf_program *prog; unsigned long delay = 5; + char
filename[256]; int longindex = 0; - int opt; - char bpf_file[256]; - - /* Create the eBPF kernel code path name. - * This follows the pattern of all of the other bpf samples - */ - snprintf(bpf_file, sizeof(bpf_file), "%s_kern.o", argv[0]); - - /* Do one final dump when exiting */ - signal(SIGINT, dump_exit); - signal(SIGTERM, dump_exit); + int opt, err = -1; while ((opt = getopt_long(argc, argv, "hd:rSw", long_options, &longindex)) != -1) { @@ -107,16 +109,51 @@ int main(int argc, char **argv) } } - if (load_bpf_file(bpf_file)) { - fprintf(stderr, "ERROR: failed to load eBPF from file : %s\n", - bpf_file); - return 1; + /* Do one final dump when exiting */ + signal(SIGINT, dump_exit); + signal(SIGTERM, dump_exit); + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return err; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "read_count"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "write_count"); + if (map_fd[0] < 0 || map_fd[1] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + tp_links[tp_cnt] = bpf_program__attach(prog); + if (libbpf_get_error(tp_links[tp_cnt])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + tp_links[tp_cnt] = NULL; + goto cleanup; + } + tp_cnt++; } while (1) { sleep(delay); dump_all_counts(); } + err = 0; - return 0; +cleanup: + /* Detach tracepoints */ + while (tp_cnt) + bpf_link__destroy(tp_links[--tp_cnt]); + + bpf_object__close(obj); + return err; } From c6497df0ddc3363c2c940d92c3af0b2620477003 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Tue, 24 Nov 2020 09:03:08 +0000 Subject: [PATCH 031/118] samples: bpf: Refactor test_overhead program with libbpf This commit refactors the existing program with the libbpf bpf loader. Since kprobe, tracepoint and raw_tracepoint bpf programs can all be attached with the single bpf_program__attach() interface, the corresponding libbpf function is used here. Rather than specifying the number of cpus inside the code, this commit uses the number of available cpus with _SC_NPROCESSORS_ONLN. Signed-off-by: Daniel T.
Lee Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201124090310.24374-6-danieltimlee@gmail.com --- samples/bpf/Makefile | 2 +- samples/bpf/test_overhead_user.c | 82 +++++++++++++++++++++++--------- 2 files changed, 60 insertions(+), 24 deletions(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 09a249477554..25380e04897e 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -78,7 +78,7 @@ lathist-objs := lathist_user.o offwaketime-objs := offwaketime_user.o $(TRACE_HELPERS) spintest-objs := spintest_user.o $(TRACE_HELPERS) map_perf_test-objs := map_perf_test_user.o -test_overhead-objs := bpf_load.o test_overhead_user.o +test_overhead-objs := test_overhead_user.o test_cgrp2_array_pin-objs := test_cgrp2_array_pin.o test_cgrp2_attach-objs := test_cgrp2_attach.o test_cgrp2_sock-objs := test_cgrp2_sock.o diff --git a/samples/bpf/test_overhead_user.c b/samples/bpf/test_overhead_user.c index 94f74112a20e..819a6fe86f89 100644 --- a/samples/bpf/test_overhead_user.c +++ b/samples/bpf/test_overhead_user.c @@ -18,10 +18,14 @@ #include #include #include -#include "bpf_load.h" +#include #define MAX_CNT 1000000 +static struct bpf_link *links[2]; +static struct bpf_object *obj; +static int cnt; + static __u64 time_get_ns(void) { struct timespec ts; @@ -115,20 +119,54 @@ static void run_perf_test(int tasks, int flags) } } +static int load_progs(char *filename) +{ + struct bpf_program *prog; + int err = 0; + + obj = bpf_object__open_file(filename, NULL); + err = libbpf_get_error(obj); + if (err < 0) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return err; + } + + /* load BPF program */ + err = bpf_object__load(obj); + if (err < 0) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + return err; + } + + bpf_object__for_each_program(prog, obj) { + links[cnt] = bpf_program__attach(prog); + err = libbpf_get_error(links[cnt]); + if (err < 0) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[cnt] = NULL; + return err; + } + cnt++; + } + + return err; +} + static void unload_progs(void) { - close(prog_fd[0]); - close(prog_fd[1]); - close(event_fd[0]); - close(event_fd[1]); + while (cnt) + bpf_link__destroy(links[--cnt]); + + bpf_object__close(obj); } int main(int argc, char **argv) { struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; - char filename[256]; - int num_cpu = 8; + int num_cpu = sysconf(_SC_NPROCESSORS_ONLN); int test_flags = ~0; + char filename[256]; + int err = 0; setrlimit(RLIMIT_MEMLOCK, &r); @@ -145,38 +183,36 @@ int main(int argc, char **argv) if (test_flags & 0xC) { snprintf(filename, sizeof(filename), "%s_kprobe_kern.o", argv[0]); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; - } + printf("w/KPROBE\n"); - run_perf_test(num_cpu, test_flags >> 2); + err = load_progs(filename); + if (!err) + run_perf_test(num_cpu, test_flags >> 2); + unload_progs(); } if (test_flags & 0x30) { snprintf(filename, sizeof(filename), "%s_tp_kern.o", argv[0]); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; - } printf("w/TRACEPOINT\n"); - run_perf_test(num_cpu, test_flags >> 4); + err = load_progs(filename); + if (!err) + run_perf_test(num_cpu, test_flags >> 4); + unload_progs(); } if (test_flags & 0xC0) { snprintf(filename, sizeof(filename), "%s_raw_tp_kern.o", argv[0]); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; - } printf("w/RAW_TRACEPOINT\n"); - run_perf_test(num_cpu, test_flags >> 6); + err = 
load_progs(filename); + if (!err) + run_perf_test(num_cpu, test_flags >> 6); + unload_progs(); } - return 0; + return err; } From 0afe0a998c40085a6342e1aeb4c510cccba46caf Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Tue, 24 Nov 2020 09:03:09 +0000 Subject: [PATCH 032/118] samples: bpf: Fix lwt_len_hist reusing previous BPF map Currently, lwt_len_hist's map lwt_len_hist_map uses pinning, and the map isn't cleared at the end of the test. This leads to reuse of that map across tests, which prevents the results of each test from being accurate. This commit fixes the problem by removing the pinned map from bpffs. Also, this commit adds the executable permission to the shell script files. Fixes: f74599f7c5309 ("bpf: Add tests and samples for LWT-BPF") Signed-off-by: Daniel T. Lee Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201124090310.24374-7-danieltimlee@gmail.com --- samples/bpf/lwt_len_hist.sh | 2 ++ samples/bpf/test_lwt_bpf.sh | 0 2 files changed, 2 insertions(+) mode change 100644 => 100755 samples/bpf/lwt_len_hist.sh mode change 100644 => 100755 samples/bpf/test_lwt_bpf.sh diff --git a/samples/bpf/lwt_len_hist.sh b/samples/bpf/lwt_len_hist.sh old mode 100644 new mode 100755 index 090b96eaf7f7..0eda9754f50b --- a/samples/bpf/lwt_len_hist.sh +++ b/samples/bpf/lwt_len_hist.sh @@ -8,6 +8,8 @@ VETH1=tst_lwt1b TRACE_ROOT=/sys/kernel/debug/tracing function cleanup { + # To reset saved histogram, remove pinned map + rm /sys/fs/bpf/tc/globals/lwt_len_hist_map ip route del 192.168.253.2/32 dev $VETH0 2> /dev/null ip link del $VETH0 2> /dev/null ip link del $VETH1 2> /dev/null diff --git a/samples/bpf/test_lwt_bpf.sh b/samples/bpf/test_lwt_bpf.sh old mode 100644 new mode 100755 From ceb5dea5654354fb4e6e393c99f1d0bf4debab0e Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Tue, 24 Nov 2020 09:03:10 +0000 Subject: [PATCH 033/118] samples: bpf: Remove bpf_load loader completely The numerous refactorings that rewrote BPF programs written with bpf_load to use the libbpf loader are finally complete, so no BPF program in the kernel tree uses bpf_load anymore. This commit removes bpf_load, an outdated bpf loader that is difficult to keep in sync with the latest kernel BPF and causes confusion. Also, this commit removes the now-unused trace_helper and bpf_load entries from the samples/bpf target objects in the Makefile. Signed-off-by: Daniel T.
Lee Signed-off-by: Andrii Nakryiko Acked-by: Jesper Dangaard Brouer Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201124090310.24374-8-danieltimlee@gmail.com --- samples/bpf/Makefile | 10 +- samples/bpf/bpf_load.c | 667 -------------------------------- samples/bpf/bpf_load.h | 57 --- samples/bpf/xdp2skb_meta_kern.c | 2 +- 4 files changed, 5 insertions(+), 731 deletions(-) delete mode 100644 samples/bpf/bpf_load.c delete mode 100644 samples/bpf/bpf_load.h diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 25380e04897e..05db041f8b18 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -73,7 +73,7 @@ tracex5-objs := tracex5_user.o $(TRACE_HELPERS) tracex6-objs := tracex6_user.o tracex7-objs := tracex7_user.o test_probe_write_user-objs := test_probe_write_user_user.o -trace_output-objs := trace_output_user.o $(TRACE_HELPERS) +trace_output-objs := trace_output_user.o lathist-objs := lathist_user.o offwaketime-objs := offwaketime_user.o $(TRACE_HELPERS) spintest-objs := spintest_user.o $(TRACE_HELPERS) @@ -91,8 +91,8 @@ test_current_task_under_cgroup-objs := $(CGROUP_HELPERS) \ test_current_task_under_cgroup_user.o trace_event-objs := trace_event_user.o $(TRACE_HELPERS) sampleip-objs := sampleip_user.o $(TRACE_HELPERS) -tc_l2_redirect-objs := bpf_load.o tc_l2_redirect_user.o -lwt_len_hist-objs := bpf_load.o lwt_len_hist_user.o +tc_l2_redirect-objs := tc_l2_redirect_user.o +lwt_len_hist-objs := lwt_len_hist_user.o xdp_tx_iptunnel-objs := xdp_tx_iptunnel_user.o test_map_in_map-objs := test_map_in_map_user.o per_socket_stats_example-objs := cookie_uid_helper_example.o @@ -108,7 +108,7 @@ xdpsock-objs := xdpsock_user.o xsk_fwd-objs := xsk_fwd.o xdp_fwd-objs := xdp_fwd_user.o task_fd_query-objs := task_fd_query_user.o $(TRACE_HELPERS) -xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) +xdp_sample_pkts-objs := xdp_sample_pkts_user.o ibumad-objs := ibumad_user.o hbm-objs := hbm.o $(CGROUP_HELPERS) @@ -197,8 +197,6 @@ TPROGS_CFLAGS += --sysroot=$(SYSROOT) TPROGS_LDFLAGS := -L$(SYSROOT)/usr/lib endif -TPROGCFLAGS_bpf_load.o += -Wno-unused-variable - TPROGS_LDLIBS += $(LIBBPF) -lelf -lz TPROGLDLIBS_tracex4 += -lrt TPROGLDLIBS_trace_output += -lrt diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c deleted file mode 100644 index c5ad528f046e..000000000000 --- a/samples/bpf/bpf_load.c +++ /dev/null @@ -1,667 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "bpf_load.h" -#include "perf-sys.h" - -#define DEBUGFS "/sys/kernel/debug/tracing/" - -static char license[128]; -static int kern_version; -static bool processed_sec[128]; -char bpf_log_buf[BPF_LOG_BUF_SIZE]; -int map_fd[MAX_MAPS]; -int prog_fd[MAX_PROGS]; -int event_fd[MAX_PROGS]; -int prog_cnt; -int prog_array_fd = -1; - -struct bpf_map_data map_data[MAX_MAPS]; -int map_data_count; - -static int populate_prog_array(const char *event, int prog_fd) -{ - int ind = atoi(event), err; - - err = bpf_map_update_elem(prog_array_fd, &ind, &prog_fd, BPF_ANY); - if (err < 0) { - printf("failed to store prog_fd in prog_array\n"); - return -1; - } - return 0; -} - -static int write_kprobe_events(const char *val) -{ - int fd, ret, flags; - - if (val == NULL) - return -1; - else if (val[0] == '\0') - flags = O_WRONLY | O_TRUNC; - else - flags = O_WRONLY | 
O_APPEND; - - fd = open(DEBUGFS "kprobe_events", flags); - - ret = write(fd, val, strlen(val)); - close(fd); - - return ret; -} - -static int load_and_attach(const char *event, struct bpf_insn *prog, int size) -{ - bool is_socket = strncmp(event, "socket", 6) == 0; - bool is_kprobe = strncmp(event, "kprobe/", 7) == 0; - bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0; - bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0; - bool is_raw_tracepoint = strncmp(event, "raw_tracepoint/", 15) == 0; - bool is_xdp = strncmp(event, "xdp", 3) == 0; - bool is_perf_event = strncmp(event, "perf_event", 10) == 0; - bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0; - bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0; - bool is_sockops = strncmp(event, "sockops", 7) == 0; - bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0; - bool is_sk_msg = strncmp(event, "sk_msg", 6) == 0; - size_t insns_cnt = size / sizeof(struct bpf_insn); - enum bpf_prog_type prog_type; - char buf[256]; - int fd, efd, err, id; - struct perf_event_attr attr = {}; - - attr.type = PERF_TYPE_TRACEPOINT; - attr.sample_type = PERF_SAMPLE_RAW; - attr.sample_period = 1; - attr.wakeup_events = 1; - - if (is_socket) { - prog_type = BPF_PROG_TYPE_SOCKET_FILTER; - } else if (is_kprobe || is_kretprobe) { - prog_type = BPF_PROG_TYPE_KPROBE; - } else if (is_tracepoint) { - prog_type = BPF_PROG_TYPE_TRACEPOINT; - } else if (is_raw_tracepoint) { - prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT; - } else if (is_xdp) { - prog_type = BPF_PROG_TYPE_XDP; - } else if (is_perf_event) { - prog_type = BPF_PROG_TYPE_PERF_EVENT; - } else if (is_cgroup_skb) { - prog_type = BPF_PROG_TYPE_CGROUP_SKB; - } else if (is_cgroup_sk) { - prog_type = BPF_PROG_TYPE_CGROUP_SOCK; - } else if (is_sockops) { - prog_type = BPF_PROG_TYPE_SOCK_OPS; - } else if (is_sk_skb) { - prog_type = BPF_PROG_TYPE_SK_SKB; - } else if (is_sk_msg) { - prog_type = BPF_PROG_TYPE_SK_MSG; - } else { - printf("Unknown event '%s'\n", event); - return -1; - } - - if (prog_cnt == MAX_PROGS) - return -1; - - fd = bpf_load_program(prog_type, prog, insns_cnt, license, kern_version, - bpf_log_buf, BPF_LOG_BUF_SIZE); - if (fd < 0) { - printf("bpf_load_program() err=%d\n%s", errno, bpf_log_buf); - return -1; - } - - prog_fd[prog_cnt++] = fd; - - if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk) - return 0; - - if (is_socket || is_sockops || is_sk_skb || is_sk_msg) { - if (is_socket) - event += 6; - else - event += 7; - if (*event != '/') - return 0; - event++; - if (!isdigit(*event)) { - printf("invalid prog number\n"); - return -1; - } - return populate_prog_array(event, fd); - } - - if (is_raw_tracepoint) { - efd = bpf_raw_tracepoint_open(event + 15, fd); - if (efd < 0) { - printf("tracepoint %s %s\n", event + 15, strerror(errno)); - return -1; - } - event_fd[prog_cnt - 1] = efd; - return 0; - } - - if (is_kprobe || is_kretprobe) { - bool need_normal_check = true; - const char *event_prefix = ""; - - if (is_kprobe) - event += 7; - else - event += 10; - - if (*event == 0) { - printf("event name cannot be empty\n"); - return -1; - } - - if (isdigit(*event)) - return populate_prog_array(event, fd); - -#ifdef __x86_64__ - if (strncmp(event, "sys_", 4) == 0) { - snprintf(buf, sizeof(buf), "%c:__x64_%s __x64_%s", - is_kprobe ? 'p' : 'r', event, event); - err = write_kprobe_events(buf); - if (err >= 0) { - need_normal_check = false; - event_prefix = "__x64_"; - } - } -#endif - if (need_normal_check) { - snprintf(buf, sizeof(buf), "%c:%s %s", - is_kprobe ? 
'p' : 'r', event, event); - err = write_kprobe_events(buf); - if (err < 0) { - printf("failed to create kprobe '%s' error '%s'\n", - event, strerror(errno)); - return -1; - } - } - - strcpy(buf, DEBUGFS); - strcat(buf, "events/kprobes/"); - strcat(buf, event_prefix); - strcat(buf, event); - strcat(buf, "/id"); - } else if (is_tracepoint) { - event += 11; - - if (*event == 0) { - printf("event name cannot be empty\n"); - return -1; - } - strcpy(buf, DEBUGFS); - strcat(buf, "events/"); - strcat(buf, event); - strcat(buf, "/id"); - } - - efd = open(buf, O_RDONLY, 0); - if (efd < 0) { - printf("failed to open event %s\n", event); - return -1; - } - - err = read(efd, buf, sizeof(buf)); - if (err < 0 || err >= sizeof(buf)) { - printf("read from '%s' failed '%s'\n", event, strerror(errno)); - return -1; - } - - close(efd); - - buf[err] = 0; - id = atoi(buf); - attr.config = id; - - efd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0); - if (efd < 0) { - printf("event %d fd %d err %s\n", id, efd, strerror(errno)); - return -1; - } - event_fd[prog_cnt - 1] = efd; - err = ioctl(efd, PERF_EVENT_IOC_ENABLE, 0); - if (err < 0) { - printf("ioctl PERF_EVENT_IOC_ENABLE failed err %s\n", - strerror(errno)); - return -1; - } - err = ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd); - if (err < 0) { - printf("ioctl PERF_EVENT_IOC_SET_BPF failed err %s\n", - strerror(errno)); - return -1; - } - - return 0; -} - -static int load_maps(struct bpf_map_data *maps, int nr_maps, - fixup_map_cb fixup_map) -{ - int i, numa_node; - - for (i = 0; i < nr_maps; i++) { - if (fixup_map) { - fixup_map(&maps[i], i); - /* Allow userspace to assign map FD prior to creation */ - if (maps[i].fd != -1) { - map_fd[i] = maps[i].fd; - continue; - } - } - - numa_node = maps[i].def.map_flags & BPF_F_NUMA_NODE ? 
- maps[i].def.numa_node : -1; - - if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS || - maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) { - int inner_map_fd = map_fd[maps[i].def.inner_map_idx]; - - map_fd[i] = bpf_create_map_in_map_node(maps[i].def.type, - maps[i].name, - maps[i].def.key_size, - inner_map_fd, - maps[i].def.max_entries, - maps[i].def.map_flags, - numa_node); - } else { - map_fd[i] = bpf_create_map_node(maps[i].def.type, - maps[i].name, - maps[i].def.key_size, - maps[i].def.value_size, - maps[i].def.max_entries, - maps[i].def.map_flags, - numa_node); - } - if (map_fd[i] < 0) { - printf("failed to create map %d (%s): %d %s\n", - i, maps[i].name, errno, strerror(errno)); - return 1; - } - maps[i].fd = map_fd[i]; - - if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY) - prog_array_fd = map_fd[i]; - } - return 0; -} - -static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname, - GElf_Shdr *shdr, Elf_Data **data) -{ - Elf_Scn *scn; - - scn = elf_getscn(elf, i); - if (!scn) - return 1; - - if (gelf_getshdr(scn, shdr) != shdr) - return 2; - - *shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name); - if (!*shname || !shdr->sh_size) - return 3; - - *data = elf_getdata(scn, 0); - if (!*data || elf_getdata(scn, *data) != NULL) - return 4; - - return 0; -} - -static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols, - GElf_Shdr *shdr, struct bpf_insn *insn, - struct bpf_map_data *maps, int nr_maps) -{ - int i, nrels; - - nrels = shdr->sh_size / shdr->sh_entsize; - - for (i = 0; i < nrels; i++) { - GElf_Sym sym; - GElf_Rel rel; - unsigned int insn_idx; - bool match = false; - int j, map_idx; - - gelf_getrel(data, i, &rel); - - insn_idx = rel.r_offset / sizeof(struct bpf_insn); - - gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym); - - if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) { - printf("invalid relo for insn[%d].code 0x%x\n", - insn_idx, insn[insn_idx].code); - return 1; - } - insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD; - - /* Match FD relocation against recorded map_data[] offset */ - for (map_idx = 0; map_idx < nr_maps; map_idx++) { - if (maps[map_idx].elf_offset == sym.st_value) { - match = true; - break; - } - } - if (match) { - insn[insn_idx].imm = maps[map_idx].fd; - } else { - printf("invalid relo for insn[%d] no map_data match\n", - insn_idx); - return 1; - } - } - - return 0; -} - -static int cmp_symbols(const void *l, const void *r) -{ - const GElf_Sym *lsym = (const GElf_Sym *)l; - const GElf_Sym *rsym = (const GElf_Sym *)r; - - if (lsym->st_value < rsym->st_value) - return -1; - else if (lsym->st_value > rsym->st_value) - return 1; - else - return 0; -} - -static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx, - Elf *elf, Elf_Data *symbols, int strtabidx) -{ - int map_sz_elf, map_sz_copy; - bool validate_zero = false; - Elf_Data *data_maps; - int i, nr_maps; - GElf_Sym *sym; - Elf_Scn *scn; - int copy_sz; - - if (maps_shndx < 0) - return -EINVAL; - if (!symbols) - return -EINVAL; - - /* Get data for maps section via elf index */ - scn = elf_getscn(elf, maps_shndx); - if (scn) - data_maps = elf_getdata(scn, NULL); - if (!scn || !data_maps) { - printf("Failed to get Elf_Data from maps section %d\n", - maps_shndx); - return -EINVAL; - } - - /* For each map get corrosponding symbol table entry */ - sym = calloc(MAX_MAPS+1, sizeof(GElf_Sym)); - for (i = 0, nr_maps = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) { - assert(nr_maps < MAX_MAPS+1); - if (!gelf_getsym(symbols, i, &sym[nr_maps])) - continue; - if 
(sym[nr_maps].st_shndx != maps_shndx) - continue; - /* Only increment iif maps section */ - nr_maps++; - } - - /* Align to map_fd[] order, via sort on offset in sym.st_value */ - qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols); - - /* Keeping compatible with ELF maps section changes - * ------------------------------------------------ - * The program size of struct bpf_load_map_def is known by loader - * code, but struct stored in ELF file can be different. - * - * Unfortunately sym[i].st_size is zero. To calculate the - * struct size stored in the ELF file, assume all struct have - * the same size, and simply divide with number of map - * symbols. - */ - map_sz_elf = data_maps->d_size / nr_maps; - map_sz_copy = sizeof(struct bpf_load_map_def); - if (map_sz_elf < map_sz_copy) { - /* - * Backward compat, loading older ELF file with - * smaller struct, keeping remaining bytes zero. - */ - map_sz_copy = map_sz_elf; - } else if (map_sz_elf > map_sz_copy) { - /* - * Forward compat, loading newer ELF file with larger - * struct with unknown features. Assume zero means - * feature not used. Thus, validate rest of struct - * data is zero. - */ - validate_zero = true; - } - - /* Memcpy relevant part of ELF maps data to loader maps */ - for (i = 0; i < nr_maps; i++) { - struct bpf_load_map_def *def; - unsigned char *addr, *end; - const char *map_name; - size_t offset; - - map_name = elf_strptr(elf, strtabidx, sym[i].st_name); - maps[i].name = strdup(map_name); - if (!maps[i].name) { - printf("strdup(%s): %s(%d)\n", map_name, - strerror(errno), errno); - free(sym); - return -errno; - } - - /* Symbol value is offset into ELF maps section data area */ - offset = sym[i].st_value; - def = (struct bpf_load_map_def *)(data_maps->d_buf + offset); - maps[i].elf_offset = offset; - memset(&maps[i].def, 0, sizeof(struct bpf_load_map_def)); - memcpy(&maps[i].def, def, map_sz_copy); - - /* Verify no newer features were requested */ - if (validate_zero) { - addr = (unsigned char *) def + map_sz_copy; - end = (unsigned char *) def + map_sz_elf; - for (; addr < end; addr++) { - if (*addr != 0) { - free(sym); - return -EFBIG; - } - } - } - } - - free(sym); - return nr_maps; -} - -static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) -{ - int fd, i, ret, maps_shndx = -1, strtabidx = -1; - Elf *elf; - GElf_Ehdr ehdr; - GElf_Shdr shdr, shdr_prog; - Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL; - char *shname, *shname_prog; - int nr_maps = 0; - - /* reset global variables */ - kern_version = 0; - memset(license, 0, sizeof(license)); - memset(processed_sec, 0, sizeof(processed_sec)); - - if (elf_version(EV_CURRENT) == EV_NONE) - return 1; - - fd = open(path, O_RDONLY, 0); - if (fd < 0) - return 1; - - elf = elf_begin(fd, ELF_C_READ, NULL); - - if (!elf) - return 1; - - if (gelf_getehdr(elf, &ehdr) != &ehdr) - return 1; - - /* clear all kprobes */ - i = write_kprobe_events(""); - - /* scan over all elf sections to get license and map info */ - for (i = 1; i < ehdr.e_shnum; i++) { - - if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) - continue; - - if (0) /* helpful for llvm debugging */ - printf("section %d:%s data %p size %zd link %d flags %d\n", - i, shname, data->d_buf, data->d_size, - shdr.sh_link, (int) shdr.sh_flags); - - if (strcmp(shname, "license") == 0) { - processed_sec[i] = true; - memcpy(license, data->d_buf, data->d_size); - } else if (strcmp(shname, "version") == 0) { - processed_sec[i] = true; - if (data->d_size != sizeof(int)) { - printf("invalid size of version 
section %zd\n", - data->d_size); - return 1; - } - memcpy(&kern_version, data->d_buf, sizeof(int)); - } else if (strcmp(shname, "maps") == 0) { - int j; - - maps_shndx = i; - data_maps = data; - for (j = 0; j < MAX_MAPS; j++) - map_data[j].fd = -1; - } else if (shdr.sh_type == SHT_SYMTAB) { - strtabidx = shdr.sh_link; - symbols = data; - } - } - - ret = 1; - - if (!symbols) { - printf("missing SHT_SYMTAB section\n"); - goto done; - } - - if (data_maps) { - nr_maps = load_elf_maps_section(map_data, maps_shndx, - elf, symbols, strtabidx); - if (nr_maps < 0) { - printf("Error: Failed loading ELF maps (errno:%d):%s\n", - nr_maps, strerror(-nr_maps)); - goto done; - } - if (load_maps(map_data, nr_maps, fixup_map)) - goto done; - map_data_count = nr_maps; - - processed_sec[maps_shndx] = true; - } - - /* process all relo sections, and rewrite bpf insns for maps */ - for (i = 1; i < ehdr.e_shnum; i++) { - if (processed_sec[i]) - continue; - - if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) - continue; - - if (shdr.sh_type == SHT_REL) { - struct bpf_insn *insns; - - /* locate prog sec that need map fixup (relocations) */ - if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog, - &shdr_prog, &data_prog)) - continue; - - if (shdr_prog.sh_type != SHT_PROGBITS || - !(shdr_prog.sh_flags & SHF_EXECINSTR)) - continue; - - insns = (struct bpf_insn *) data_prog->d_buf; - processed_sec[i] = true; /* relo section */ - - if (parse_relo_and_apply(data, symbols, &shdr, insns, - map_data, nr_maps)) - continue; - } - } - - /* load programs */ - for (i = 1; i < ehdr.e_shnum; i++) { - - if (processed_sec[i]) - continue; - - if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) - continue; - - if (memcmp(shname, "kprobe/", 7) == 0 || - memcmp(shname, "kretprobe/", 10) == 0 || - memcmp(shname, "tracepoint/", 11) == 0 || - memcmp(shname, "raw_tracepoint/", 15) == 0 || - memcmp(shname, "xdp", 3) == 0 || - memcmp(shname, "perf_event", 10) == 0 || - memcmp(shname, "socket", 6) == 0 || - memcmp(shname, "cgroup/", 7) == 0 || - memcmp(shname, "sockops", 7) == 0 || - memcmp(shname, "sk_skb", 6) == 0 || - memcmp(shname, "sk_msg", 6) == 0) { - ret = load_and_attach(shname, data->d_buf, - data->d_size); - if (ret != 0) - goto done; - } - } - -done: - close(fd); - return ret; -} - -int load_bpf_file(char *path) -{ - return do_load_bpf_file(path, NULL); -} - -int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map) -{ - return do_load_bpf_file(path, fixup_map); -} diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h deleted file mode 100644 index 4fcd258c616f..000000000000 --- a/samples/bpf/bpf_load.h +++ /dev/null @@ -1,57 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __BPF_LOAD_H -#define __BPF_LOAD_H - -#include - -#define MAX_MAPS 32 -#define MAX_PROGS 32 - -struct bpf_load_map_def { - unsigned int type; - unsigned int key_size; - unsigned int value_size; - unsigned int max_entries; - unsigned int map_flags; - unsigned int inner_map_idx; - unsigned int numa_node; -}; - -struct bpf_map_data { - int fd; - char *name; - size_t elf_offset; - struct bpf_load_map_def def; -}; - -typedef void (*fixup_map_cb)(struct bpf_map_data *map, int idx); - -extern int prog_fd[MAX_PROGS]; -extern int event_fd[MAX_PROGS]; -extern char bpf_log_buf[BPF_LOG_BUF_SIZE]; -extern int prog_cnt; - -/* There is a one-to-one mapping between map_fd[] and map_data[]. - * The map_data[] just contains more rich info on the given map. 
- */ -extern int map_fd[MAX_MAPS]; -extern struct bpf_map_data map_data[MAX_MAPS]; -extern int map_data_count; - -/* parses elf file compiled by llvm .c->.o - * . parses 'maps' section and creates maps via BPF syscall - * . parses 'license' section and passes it to syscall - * . parses elf relocations for BPF maps and adjusts BPF_LD_IMM64 insns by - * storing map_fd into insn->imm and marking such insns as BPF_PSEUDO_MAP_FD - * . loads eBPF programs via BPF syscall - * - * One ELF file can contain multiple BPF programs which will be loaded - * and their FDs stored stored in prog_fd array - * - * returns zero on success - */ -int load_bpf_file(char *path); -int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map); - -int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags); -#endif diff --git a/samples/bpf/xdp2skb_meta_kern.c b/samples/bpf/xdp2skb_meta_kern.c index 9b783316e860..d5631014a176 100644 --- a/samples/bpf/xdp2skb_meta_kern.c +++ b/samples/bpf/xdp2skb_meta_kern.c @@ -6,7 +6,7 @@ * This uses the XDP data_meta infrastructure, and is a cooperation * between two bpf-programs (1) XDP and (2) clsact at TC-ingress hook. * - * Notice: This example does not use the BPF C-loader (bpf_load.c), + * Notice: This example does not use the BPF C-loader, * but instead rely on the iproute2 TC tool for loading BPF-objects. */ #include From 105c4e75feb411a60f5089f7a1e68b8523f986cc Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 26 Nov 2020 10:37:35 +0100 Subject: [PATCH 034/118] libbpf: Replace size_t with __u32 in xsk interfaces Replace size_t with __u32 in the xsk interfaces that contain this. There is no reason to have size_t since the internal variable that is manipulated is a __u32. The following APIs are affected: __u32 xsk_ring_prod__reserve(struct xsk_ring_prod *prod, __u32 nb, __u32 *idx) void xsk_ring_prod__submit(struct xsk_ring_prod *prod, __u32 nb) __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __u32 *idx) void xsk_ring_cons__cancel(struct xsk_ring_cons *cons, __u32 nb) void xsk_ring_cons__release(struct xsk_ring_cons *cons, __u32 nb) The "nb" variable and the return values have been changed from size_t to __u32. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/1606383455-8243-1-git-send-email-magnus.karlsson@gmail.com --- tools/lib/bpf/xsk.h | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h index 1719a327e5f9..5865e082ba0b 100644 --- a/tools/lib/bpf/xsk.h +++ b/tools/lib/bpf/xsk.h @@ -113,8 +113,7 @@ static inline __u32 xsk_cons_nb_avail(struct xsk_ring_cons *r, __u32 nb) return (entries > nb) ? nb : entries; } -static inline size_t xsk_ring_prod__reserve(struct xsk_ring_prod *prod, - size_t nb, __u32 *idx) +static inline __u32 xsk_ring_prod__reserve(struct xsk_ring_prod *prod, __u32 nb, __u32 *idx) { if (xsk_prod_nb_free(prod, nb) < nb) return 0; @@ -125,7 +124,7 @@ static inline size_t xsk_ring_prod__reserve(struct xsk_ring_prod *prod, return nb; } -static inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, size_t nb) +static inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, __u32 nb) { /* Make sure everything has been written to the ring before indicating * this to the kernel by writing the producer pointer. 
@@ -135,10 +134,9 @@ static void xsk_ring_prod__submit(struct xsk_ring_prod *prod, __u32 nb) *prod->producer += nb; } -static inline size_t xsk_ring_cons__peek(struct xsk_ring_cons *cons, - size_t nb, __u32 *idx) +static inline __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __u32 *idx) { - size_t entries = xsk_cons_nb_avail(cons, nb); + __u32 entries = xsk_cons_nb_avail(cons, nb); if (entries > 0) { /* Make sure we do not speculatively read the data before @@ -153,13 +151,12 @@ static inline size_t xsk_ring_cons__peek(struct xsk_ring_cons *cons, return entries; } -static inline void xsk_ring_cons__cancel(struct xsk_ring_cons *cons, - size_t nb) +static inline void xsk_ring_cons__cancel(struct xsk_ring_cons *cons, __u32 nb) { cons->cached_cons -= nb; } -static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, size_t nb) +static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, __u32 nb) { /* Make sure data has been read before indicating we are done * with the entries by updating the consumer pointer. From bb1b25cab04324d0749f7ae22653aff58157bf83 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Thu, 26 Nov 2020 23:03:18 +0800 Subject: [PATCH 035/118] xdp: Remove the functions xsk_map_inc and xsk_map_put The functions xsk_map_put() and xsk_map_inc() are simple wrappers; as such, replace them with direct calls to bpf_map_inc() and bpf_map_put() and remove some error-testing code. Signed-off-by: Zhu Yanjun Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/1606402998-12562-1-git-send-email-yanjunz@nvidia.com --- net/xdp/xsk.c | 4 ++-- net/xdp/xsk.h | 2 -- net/xdp/xskmap.c | 20 ++------------------ 3 files changed, 4 insertions(+), 22 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index b0141973f23e..3e33cb6e68e6 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -605,7 +605,7 @@ static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs, node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node, node); if (node) { - WARN_ON(xsk_map_inc(node->map)); + bpf_map_inc(&node->map->map); map = node->map; *map_entry = node->map_entry; } @@ -635,7 +635,7 @@ static void xsk_delete_from_maps(struct xdp_sock *xs) while ((map = xsk_get_map_list_entry(xs, &map_entry))) { xsk_map_try_sock_delete(map, xs, map_entry); - xsk_map_put(map); + bpf_map_put(&map->map); } } diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index b9e896cee5bb..edcf249ad1f1 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -41,8 +41,6 @@ static inline struct xdp_sock *xdp_sk(struct sock *sk) void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, struct xdp_sock **map_entry); -int xsk_map_inc(struct xsk_map *map); -void xsk_map_put(struct xsk_map *map); void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id); int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, u16 queue_id); diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 49da2b8ace8b..66231ba6c348 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -11,32 +11,16 @@ #include "xsk.h" -int xsk_map_inc(struct xsk_map *map) -{ - bpf_map_inc(&map->map); - return 0; -} - -void xsk_map_put(struct xsk_map *map) -{ - bpf_map_put(&map->map); -} - static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, struct xdp_sock **map_entry) { struct xsk_map_node *node; - int err; node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); if (!node) return ERR_PTR(-ENOMEM); - err = xsk_map_inc(map); - if (err) {
kfree(node); - return ERR_PTR(err); - } + bpf_map_inc(&map->map); node->map = map; node->map_entry = map_entry; @@ -45,7 +29,7 @@ static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, static void xsk_map_node_free(struct xsk_map_node *node) { - xsk_map_put(node->map); + bpf_map_put(&node->map->map); kfree(node); } From 854055c0cf30d732b3514ce7956976f60496b1a1 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Thu, 26 Nov 2020 18:49:46 +0000 Subject: [PATCH 036/118] selftests/bpf: Fix flavored variants of test_ima Flavored variants of test_progs (e.g. test_progs-no_alu32) change their working directory to the corresponding subdirectory (e.g. no_alu32). Since the setup script required by test_ima (ima_setup.sh) is not mentioned in the dependencies, it does not get copied to these subdirectories and causes flavored variants of test_ima to fail. Adding the script to TRUNNER_EXTRA_FILES ensures that the file is also copied to the subdirectories for the flavored variants of test_progs. Fixes: 34b82d3ac105 ("bpf: Add a selftest for bpf_ima_inode_hash") Reported-by: Yonghong Song Suggested-by: Yonghong Song Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20201126184946.1708213-1-kpsingh@chromium.org --- tools/testing/selftests/bpf/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 3d5940cd110d..894192c319fb 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -389,6 +389,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ network_helpers.c testing_helpers.c \ btf_helpers.c flow_dissector_load.h TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ + ima_setup.sh \ $(wildcard progs/btf_dump_test_case_*.c) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) From 7fd3253a7de6a317a0683f83739479fb880bffc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 30 Nov 2020 19:51:56 +0100 Subject: [PATCH 037/118] net: Introduce preferred busy-polling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing busy-polling mode, enabled by the SO_BUSY_POLL socket option or system-wide using the /proc/sys/net/core/busy_read knob, is opportunistic. That means that if the NAPI context is not scheduled, busy-polling will poll it. If, after busy-polling, the budget is exceeded, the busy-polling logic will schedule the NAPI onto the regular softirq handling. One implication of the behavior above is that a busy/heavily loaded NAPI context will never enter/allow busy-polling. Some applications prefer that most NAPI processing would be done by busy-polling. This series adds a new socket option, SO_PREFER_BUSY_POLL, that works in concert with the napi_defer_hard_irqs and gro_flush_timeout knobs. The napi_defer_hard_irqs and gro_flush_timeout knobs were introduced in commit 6f8b12d661d0 ("net: napi: add hard irqs deferral feature"), and allow a user to defer enabling interrupts and instead schedule the NAPI context from a watchdog timer. When a user enables SO_PREFER_BUSY_POLL, again with the other knobs enabled, and the NAPI context is being processed by a softirq, the softirq NAPI processing will exit early to allow the busy-polling to be performed.
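As an illustration (not part of the patch), a minimal user-space sketch of opting in to the new mode, assuming uapi headers new enough to define SO_PREFER_BUSY_POLL; the socket and the busy-read budget are placeholders and error checking is elided:

	/* needs <sys/socket.h> and updated <asm/socket.h> definitions */
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int busy_usecs = 100;	/* SO_BUSY_POLL busy-read budget, in us */
	int one = 1;

	setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &busy_usecs, sizeof(busy_usecs));
	/* new option introduced by this patch: prefer busy-polling
	 * over softirq processing for this socket's NAPI context
	 */
	setsockopt(fd, SOL_SOCKET, SO_PREFER_BUSY_POLL, &one, sizeof(one));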
If the application stops performing busy-polling via a system call, the watchdog timer defined by gro_flush_timeout will time out, and regular softirq handling will resume. In summary: heavy-traffic applications that prefer busy-polling over softirq processing should use this option. Example usage: $ echo 2 | sudo tee /sys/class/net/ens785f1/napi_defer_hard_irqs $ echo 200000 | sudo tee /sys/class/net/ens785f1/gro_flush_timeout Note that the timeout should be larger than the userspace processing window, otherwise the watchdog will time out and fall back to regular softirq processing. Enable the SO_BUSY_POLL/SO_PREFER_BUSY_POLL options on your socket. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/20201130185205.196029-2-bjorn.topel@gmail.com --- arch/alpha/include/uapi/asm/socket.h | 2 + arch/mips/include/uapi/asm/socket.h | 2 + arch/parisc/include/uapi/asm/socket.h | 2 + arch/sparc/include/uapi/asm/socket.h | 2 + fs/eventpoll.c | 2 +- include/linux/netdevice.h | 35 +++++++----- include/net/busy_poll.h | 5 +- include/net/sock.h | 4 ++ include/uapi/asm-generic/socket.h | 2 + net/core/dev.c | 78 +++++++++++++++++++++------ net/core/sock.c | 9 ++++ 11 files changed, 111 insertions(+), 32 deletions(-) diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index de6c4df61082..538359642554 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -124,6 +124,8 @@ #define SO_DETACH_REUSEPORT_BPF 68 +#define SO_PREFER_BUSY_POLL 69 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index d0a9ed2ca2d6..e406e73b5e6e 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -135,6 +135,8 @@ #define SO_DETACH_REUSEPORT_BPF 68 +#define SO_PREFER_BUSY_POLL 69 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 10173c32195e..1bc46200889d 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -116,6 +116,8 @@ #define SO_DETACH_REUSEPORT_BPF 0x4042 +#define SO_PREFER_BUSY_POLL 0x4043 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 8029b681fc7c..99688cf673a4 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -117,6 +117,8 @@ #define SO_DETACH_REUSEPORT_BPF 0x0047 +#define SO_PREFER_BUSY_POLL 0x0048 + #if !defined(__KERNEL__) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4df61129566d..e11fab3a0b9e 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -397,7 +397,7 @@ static void ep_busy_loop(struct eventpoll *ep, int nonblock) unsigned int napi_id = READ_ONCE(ep->napi_id); if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) - napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep); + napi_busy_loop(napi_id, nonblock ?
NULL : ep_busy_loop_end, ep, false); } static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7ce648a564f7..52d1cc2bd8a7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -350,23 +350,25 @@ struct napi_struct { }; enum { - NAPI_STATE_SCHED, /* Poll is scheduled */ - NAPI_STATE_MISSED, /* reschedule a napi */ - NAPI_STATE_DISABLE, /* Disable pending */ - NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ - NAPI_STATE_LISTED, /* NAPI added to system lists */ - NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */ - NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */ + NAPI_STATE_SCHED, /* Poll is scheduled */ + NAPI_STATE_MISSED, /* reschedule a napi */ + NAPI_STATE_DISABLE, /* Disable pending */ + NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ + NAPI_STATE_LISTED, /* NAPI added to system lists */ + NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */ + NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */ + NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/ }; enum { - NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED), - NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED), - NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE), - NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC), - NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED), - NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), - NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), + NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED), + NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED), + NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE), + NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC), + NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED), + NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), + NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), + NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL), }; enum gro_result { @@ -437,6 +439,11 @@ static inline bool napi_disable_pending(struct napi_struct *n) return test_bit(NAPI_STATE_DISABLE, &n->state); } +static inline bool napi_prefer_busy_poll(struct napi_struct *n) +{ + return test_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); +} + bool napi_schedule_prep(struct napi_struct *n); /** diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index b001fa91c14e..0292b8353d7e 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -43,7 +43,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time); void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), - void *loop_end_arg); + void *loop_end_arg, bool prefer_busy_poll); #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) @@ -105,7 +105,8 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) unsigned int napi_id = READ_ONCE(sk->sk_napi_id); if (napi_id >= MIN_NAPI_ID) - napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk); + napi_busy_loop(napi_id, nonblock ? 
NULL : sk_busy_loop_end, sk, + READ_ONCE(sk->sk_prefer_busy_poll)); #endif } diff --git a/include/net/sock.h b/include/net/sock.h index a5c6ae78df77..d49b89b071b6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -301,6 +301,7 @@ struct bpf_local_storage; * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner + * @sk_prefer_busy_poll: prefer busypolling over softirq processing * @sk_priority: %SO_PRIORITY setting * @sk_type: socket type (%SOCK_STREAM, etc) * @sk_protocol: which protocol this socket belongs in this network family @@ -479,6 +480,9 @@ struct sock { u32 sk_ack_backlog; u32 sk_max_ack_backlog; kuid_t sk_uid; +#ifdef CONFIG_NET_RX_BUSY_POLL + u8 sk_prefer_busy_poll; +#endif struct pid *sk_peer_pid; const struct cred *sk_peer_cred; long sk_rcvtimeo; diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 77f7c1638eb1..7dd02408b7ce 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -119,6 +119,8 @@ #define SO_DETACH_REUSEPORT_BPF 68 +#define SO_PREFER_BUSY_POLL 69 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) diff --git a/net/core/dev.c b/net/core/dev.c index 60d325bda0d7..6f8d2cffb7c5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6458,7 +6458,8 @@ bool napi_complete_done(struct napi_struct *n, int work_done) WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); - new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED); + new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | + NAPIF_STATE_PREFER_BUSY_POLL); /* If STATE_MISSED was set, leave STATE_SCHED set, * because we will call napi->poll() one more time. @@ -6497,8 +6498,29 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) #define BUSY_POLL_BUDGET 8 -static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) +static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) { + if (!skip_schedule) { + gro_normal_list(napi); + __napi_schedule(napi); + return; + } + + if (napi->gro_bitmask) { + /* flush too old packets + * If HZ < 1000, flush all packets. + */ + napi_gro_flush(napi, HZ >= 1000); + } + + gro_normal_list(napi); + clear_bit(NAPI_STATE_SCHED, &napi->state); +} + +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll) +{ + bool skip_schedule = false; + unsigned long timeout; int rc; /* Busy polling means there is a high chance device driver hard irq @@ -6515,6 +6537,15 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) local_bh_disable(); + if (prefer_busy_poll) { + napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); + timeout = READ_ONCE(napi->dev->gro_flush_timeout); + if (napi->defer_hard_irqs_count && timeout) { + hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); + skip_schedule = true; + } + } + /* All we really want here is to re-enable device interrupts. * Ideally, a new ndo_busy_poll_stop() could avoid another round. */ @@ -6525,19 +6556,14 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) */ trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); netpoll_poll_unlock(have_poll_lock); - if (rc == BUSY_POLL_BUDGET) { - /* As the whole budget was spent, we still own the napi so can - * safely handle the rx_list. 
- */ - gro_normal_list(napi); - __napi_schedule(napi); - } + if (rc == BUSY_POLL_BUDGET) + __busy_poll_stop(napi, skip_schedule); local_bh_enable(); } void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), - void *loop_end_arg) + void *loop_end_arg, bool prefer_busy_poll) { unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); @@ -6565,12 +6591,18 @@ restart: * we avoid dirtying napi->state as much as we can. */ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | - NAPIF_STATE_IN_BUSY_POLL)) + NAPIF_STATE_IN_BUSY_POLL)) { + if (prefer_busy_poll) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; + } if (cmpxchg(&napi->state, val, val | NAPIF_STATE_IN_BUSY_POLL | - NAPIF_STATE_SCHED) != val) + NAPIF_STATE_SCHED) != val) { + if (prefer_busy_poll) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; + } have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } @@ -6588,7 +6620,7 @@ count: if (unlikely(need_resched())) { if (napi_poll) - busy_poll_stop(napi, have_poll_lock); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll); preempt_enable(); rcu_read_unlock(); cond_resched(); @@ -6599,7 +6631,7 @@ count: cpu_relax(); } if (napi_poll) - busy_poll_stop(napi, have_poll_lock); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll); preempt_enable(); out: rcu_read_unlock(); @@ -6650,8 +6682,10 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ if (!napi_disable_pending(napi) && - !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) + !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) { + clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); __napi_schedule_irqoff(napi); + } return HRTIMER_NORESTART; } @@ -6709,6 +6743,7 @@ void napi_disable(struct napi_struct *n) hrtimer_cancel(&n->timer); + clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); clear_bit(NAPI_STATE_DISABLE, &n->state); } EXPORT_SYMBOL(napi_disable); @@ -6781,6 +6816,19 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) goto out_unlock; } + /* The NAPI context has more processing work, but busy-polling + * is preferred. Exit early. + */ + if (napi_prefer_busy_poll(n)) { + if (napi_complete_done(n, work)) { + /* If timeout is not set, we need to make sure + * that the NAPI is re-scheduled. + */ + napi_schedule(n); + } + goto out_unlock; + } + if (n->gro_bitmask) { /* flush too old packets * If HZ < 1000, flush all packets. 
diff --git a/net/core/sock.c b/net/core/sock.c index 727ea1cc633c..e05f2e52b5a8 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1159,6 +1159,12 @@ set_sndbuf: sk->sk_ll_usec = val; } break; + case SO_PREFER_BUSY_POLL: + if (valbool && !capable(CAP_NET_ADMIN)) + ret = -EPERM; + else + WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); + break; #endif case SO_MAX_PACING_RATE: @@ -1523,6 +1529,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname, case SO_BUSY_POLL: v.val = sk->sk_ll_usec; break; + case SO_PREFER_BUSY_POLL: + v.val = READ_ONCE(sk->sk_prefer_busy_poll); + break; #endif case SO_MAX_PACING_RATE: From 7c951cafc0cb2e575f1d58677b95ac387ac0a5bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 30 Nov 2020 19:51:57 +0100 Subject: [PATCH 038/118] net: Add SO_BUSY_POLL_BUDGET socket option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This option lets a user set a per-socket NAPI budget for busy-polling. If the option is not set, the default of 8 is used. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/20201130185205.196029-3-bjorn.topel@gmail.com --- arch/alpha/include/uapi/asm/socket.h | 1 + arch/mips/include/uapi/asm/socket.h | 1 + arch/parisc/include/uapi/asm/socket.h | 1 + arch/sparc/include/uapi/asm/socket.h | 1 + fs/eventpoll.c | 3 ++- include/net/busy_poll.h | 7 +++++-- include/net/sock.h | 2 ++ include/uapi/asm-generic/socket.h | 1 + net/core/dev.c | 21 ++++++++++----------- net/core/sock.c | 10 ++++++++++ 10 files changed, 34 insertions(+), 14 deletions(-) diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 538359642554..57420356ce4c 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -125,6 +125,7 @@ #define SO_DETACH_REUSEPORT_BPF 68 #define SO_PREFER_BUSY_POLL 69 +#define SO_BUSY_POLL_BUDGET 70 #if !defined(__KERNEL__) diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index e406e73b5e6e..2d949969313b 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -136,6 +136,7 @@ #define SO_DETACH_REUSEPORT_BPF 68 #define SO_PREFER_BUSY_POLL 69 +#define SO_BUSY_POLL_BUDGET 70 #if !defined(__KERNEL__) diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 1bc46200889d..f60904329bbc 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -117,6 +117,7 @@ #define SO_DETACH_REUSEPORT_BPF 0x4042 #define SO_PREFER_BUSY_POLL 0x4043 +#define SO_BUSY_POLL_BUDGET 0x4044 #if !defined(__KERNEL__) diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 99688cf673a4..848a22fbac20 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -118,6 +118,7 @@ #define SO_DETACH_REUSEPORT_BPF 0x0047 #define SO_PREFER_BUSY_POLL 0x0048 +#define SO_BUSY_POLL_BUDGET 0x0049 #if !defined(__KERNEL__) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index e11fab3a0b9e..73c346e503d7 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -397,7 +397,8 @@ static void ep_busy_loop(struct eventpoll *ep, int nonblock) unsigned int napi_id = READ_ONCE(ep->napi_id); if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) - napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false); + napi_busy_loop(napi_id, nonblock ?
NULL : ep_busy_loop_end, ep, false, + BUSY_POLL_BUDGET); } static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep) diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 0292b8353d7e..2f8f51807b83 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -23,6 +23,8 @@ */ #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) +#define BUSY_POLL_BUDGET 8 + #ifdef CONFIG_NET_RX_BUSY_POLL struct napi_struct; @@ -43,7 +45,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time); void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), - void *loop_end_arg, bool prefer_busy_poll); + void *loop_end_arg, bool prefer_busy_poll, u16 budget); #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) @@ -106,7 +108,8 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) if (napi_id >= MIN_NAPI_ID) napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk, - READ_ONCE(sk->sk_prefer_busy_poll)); + READ_ONCE(sk->sk_prefer_busy_poll), + READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET); #endif } diff --git a/include/net/sock.h b/include/net/sock.h index d49b89b071b6..77ba2c2737db 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -302,6 +302,7 @@ struct bpf_local_storage; * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner * @sk_prefer_busy_poll: prefer busypolling over softirq processing + * @sk_busy_poll_budget: napi processing budget when busypolling * @sk_priority: %SO_PRIORITY setting * @sk_type: socket type (%SOCK_STREAM, etc) * @sk_protocol: which protocol this socket belongs in this network family @@ -482,6 +483,7 @@ struct sock { kuid_t sk_uid; #ifdef CONFIG_NET_RX_BUSY_POLL u8 sk_prefer_busy_poll; + u16 sk_busy_poll_budget; #endif struct pid *sk_peer_pid; const struct cred *sk_peer_cred; diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 7dd02408b7ce..4dcd13d097a9 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -120,6 +120,7 @@ #define SO_DETACH_REUSEPORT_BPF 68 #define SO_PREFER_BUSY_POLL 69 +#define SO_BUSY_POLL_BUDGET 70 #if !defined(__KERNEL__) diff --git a/net/core/dev.c b/net/core/dev.c index 6f8d2cffb7c5..7a1e5936c67f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6496,8 +6496,6 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) #if defined(CONFIG_NET_RX_BUSY_POLL) -#define BUSY_POLL_BUDGET 8 - static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) { if (!skip_schedule) { @@ -6517,7 +6515,8 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) clear_bit(NAPI_STATE_SCHED, &napi->state); } -static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll) +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll, + u16 budget) { bool skip_schedule = false; unsigned long timeout; @@ -6549,21 +6548,21 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool /* All we really want here is to re-enable device interrupts. * Ideally, a new ndo_busy_poll_stop() could avoid another round. */ - rc = napi->poll(napi, BUSY_POLL_BUDGET); + rc = napi->poll(napi, budget); /* We can't gro_normal_list() here, because napi->poll() might have * rearmed the napi (napi_complete_done()) in which case it could * already be running on another CPU. 
*/ - trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); + trace_napi_poll(napi, rc, budget); netpoll_poll_unlock(have_poll_lock); - if (rc == BUSY_POLL_BUDGET) + if (rc == budget) __busy_poll_stop(napi, skip_schedule); local_bh_enable(); } void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), - void *loop_end_arg, bool prefer_busy_poll) + void *loop_end_arg, bool prefer_busy_poll, u16 budget) { unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); @@ -6606,8 +6605,8 @@ restart: have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } - work = napi_poll(napi, BUSY_POLL_BUDGET); - trace_napi_poll(napi, work, BUSY_POLL_BUDGET); + work = napi_poll(napi, budget); + trace_napi_poll(napi, work, budget); gro_normal_list(napi); count: if (work > 0) @@ -6620,7 +6619,7 @@ count: if (unlikely(need_resched())) { if (napi_poll) - busy_poll_stop(napi, have_poll_lock, prefer_busy_poll); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); preempt_enable(); rcu_read_unlock(); cond_resched(); @@ -6631,7 +6630,7 @@ count: cpu_relax(); } if (napi_poll) - busy_poll_stop(napi, have_poll_lock, prefer_busy_poll); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); preempt_enable(); out: rcu_read_unlock(); diff --git a/net/core/sock.c b/net/core/sock.c index e05f2e52b5a8..d422a6808405 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1165,6 +1165,16 @@ set_sndbuf: else WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); break; + case SO_BUSY_POLL_BUDGET: + if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) { + ret = -EPERM; + } else { + if (val < 0 || val > U16_MAX) + ret = -EINVAL; + else + WRITE_ONCE(sk->sk_busy_poll_budget, val); + } + break; #endif case SO_MAX_PACING_RATE: From 45a86681844e375bef6f6add272ccc309bb6a08d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 30 Nov 2020 19:51:58 +0100 Subject: [PATCH 039/118] xsk: Add support for recvmsg() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for non-blocking recvmsg() to XDP sockets. Previously, only sendmsg() was supported by XDP sockets. Now, for symmetry and the upcoming busy-polling support, recvmsg() is added.
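For illustration (not part of the patch), the user-space side reduces to a zero-length, non-blocking receive that only kicks the kernel Rx path; a minimal sketch, assuming fd is an already bound AF_XDP socket:

	#include <sys/socket.h>

	/* Kick Rx processing; no payload is copied. Blocking mode is
	 * rejected with -EOPNOTSUPP, so MSG_DONTWAIT is mandatory.
	 */
	static void xsk_kick_rx(int fd)
	{
		recvfrom(fd, NULL, 0, MSG_DONTWAIT, NULL, NULL);
	}

This is exactly the idiom the xdpsock sample adopts later in this series.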
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20201130185205.196029-4-bjorn.topel@gmail.com --- net/xdp/xsk.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 3e33cb6e68e6..1d1e0cd51a95 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -531,6 +531,26 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) return __xsk_sendmsg(sk); } +static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) +{ + bool need_wait = !(flags & MSG_DONTWAIT); + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + + if (unlikely(!(xs->dev->flags & IFF_UP))) + return -ENETDOWN; + if (unlikely(!xs->rx)) + return -ENOBUFS; + if (unlikely(!xsk_is_bound(xs))) + return -ENXIO; + if (unlikely(need_wait)) + return -EOPNOTSUPP; + + if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc) + return xsk_wakeup(xs, XDP_WAKEUP_RX); + return 0; +} + static __poll_t xsk_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { @@ -1191,7 +1211,7 @@ static const struct proto_ops xsk_proto_ops = { .setsockopt = xsk_setsockopt, .getsockopt = xsk_getsockopt, .sendmsg = xsk_sendmsg, - .recvmsg = sock_no_recvmsg, + .recvmsg = xsk_recvmsg, .mmap = xsk_mmap, .sendpage = sock_no_sendpage, }; From e392081837283fbe5df1837fd85012ae5bfae098 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 30 Nov 2020 19:51:59 +0100 Subject: [PATCH 040/118] xsk: Check need wakeup flag in sendmsg() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a check for the need wakeup flag in sendmsg(), so that a sendmsg() call does not trigger a wakeup when no wakeup is needed. To simplify the need wakeup check in the syscall, unconditionally enable the need wakeup flag for Tx. This has a side effect for poll(): if poll() is called for a socket without need wakeup enabled, a Tx wakeup is unconditionally performed.
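As a sketch of the intended user-space pattern (using libbpf's xsk helpers; xsk here is an xdpsock-style socket wrapper, not something this patch adds):

	/* Only kick the kernel when the driver asks for a wakeup. */
	if (xsk_ring_prod__needs_wakeup(&xsk->tx))
		sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0);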
The wakeup matrix for AF_XDP now looks like:

need wakeup | poll()       | sendmsg()   | recvmsg()
------------+--------------+-------------+------------
disabled    | wake Tx      | wake Tx     | nop
enabled     | check flag;  | check flag; | check flag;
            | wake Tx/Rx   | wake Tx     | wake Rx

Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20201130185205.196029-5-bjorn.topel@gmail.com --- net/xdp/xsk.c | 6 +++++- net/xdp/xsk_buff_pool.c | 13 ++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 1d1e0cd51a95..c72b4dba2878 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -522,13 +522,17 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) bool need_wait = !(m->msg_flags & MSG_DONTWAIT); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); + struct xsk_buff_pool *pool; if (unlikely(!xsk_is_bound(xs))) return -ENXIO; if (unlikely(need_wait)) return -EOPNOTSUPP; - return __xsk_sendmsg(sk); + pool = xs->pool; + if (pool->cached_need_wakeup & XDP_WAKEUP_TX) + return __xsk_sendmsg(sk); + return 0; } static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 8a3bf4e1318e..96bb607853ad 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -144,14 +144,13 @@ static int __xp_assign_dev(struct xsk_buff_pool *pool, if (err) return err; - if (flags & XDP_USE_NEED_WAKEUP) { + if (flags & XDP_USE_NEED_WAKEUP) pool->uses_need_wakeup = true; - /* Tx needs to be explicitly woken up the first time. - * Also for supporting drivers that do not implement this - * feature. They will always have to call sendto(). - */ - pool->cached_need_wakeup = XDP_WAKEUP_TX; - } + /* Tx needs to be explicitly woken up the first time. Also + * for supporting drivers that do not implement this + * feature. They will always have to call sendto() or poll(). + */ + pool->cached_need_wakeup = XDP_WAKEUP_TX; dev_hold(netdev); From a0731952d9cddc9c11a8352922f449e6ab2f7537 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 30 Nov 2020 19:52:00 +0100 Subject: [PATCH 041/118] xsk: Add busy-poll support for {recv,send}msg() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire up XDP socket busy-poll support for recvmsg() and sendmsg(). If the XDP socket prefers busy-polling, make sure that no wakeup/IPI is performed. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20201130185205.196029-6-bjorn.topel@gmail.com --- net/xdp/xsk.c | 24 ++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index c72b4dba2878..a8501a5477cf 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -23,6 +23,7 @@ #include #include #include +#include <net/busy_poll.h> #include #include "xsk_queue.h" @@ -517,6 +518,17 @@ static int __xsk_sendmsg(struct sock *sk) return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk); } +static bool xsk_no_wakeup(struct sock *sk) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + /* Prefer busy-polling, skip the wakeup.
*/ + return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) && + READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID; +#else + return false; +#endif +} + static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { bool need_wait = !(m->msg_flags & MSG_DONTWAIT); @@ -529,6 +541,12 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) if (unlikely(need_wait)) return -EOPNOTSUPP; + if (sk_can_busy_loop(sk)) + sk_busy_loop(sk, 1); /* only support non-blocking sockets */ + + if (xsk_no_wakeup(sk)) + return 0; + pool = xs->pool; if (pool->cached_need_wakeup & XDP_WAKEUP_TX) return __xsk_sendmsg(sk); @@ -550,6 +568,12 @@ static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int fl if (unlikely(need_wait)) return -EOPNOTSUPP; + if (sk_can_busy_loop(sk)) + sk_busy_loop(sk, 1); /* only support non-blocking sockets */ + + if (xsk_no_wakeup(sk)) + return 0; + if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc) return xsk_wakeup(xs, XDP_WAKEUP_RX); return 0; From b02e5a0ebb172c8276cea3151942aac681f7a4a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3%B6rn=20T=C3%B6pel?= Date: Mon, 30 Nov 2020 19:52:01 +0100 Subject: [PATCH 042/118] xsk: Propagate napi_id to XDP socket Rx path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add napi_id to the xdp_rxq_info structure, and make sure the XDP socket picks up the napi_id in the Rx path. The napi_id is used to find the corresponding NAPI structure for socket busy polling. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Acked-by: Ilias Apalodimas Acked-by: Michael S. Tsirkin Acked-by: Tariq Toukan Link: https://lore.kernel.org/bpf/20201130185205.196029-7-bjorn.topel@gmail.com --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- .../ethernet/cavium/thunder/nicvf_queues.c | 2 +- .../net/ethernet/freescale/dpaa2/dpaa2-eth.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 2 +- drivers/net/ethernet/intel/ice/ice_base.c | 4 ++-- drivers/net/ethernet/intel/ice/ice_txrx.c | 2 +- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 2 +- drivers/net/ethernet/marvell/mvneta.c | 2 +- .../net/ethernet/marvell/mvpp2/mvpp2_main.c | 4 ++-- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 2 +- .../net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- .../ethernet/netronome/nfp/nfp_net_common.c | 2 +- drivers/net/ethernet/qlogic/qede/qede_main.c | 2 +- drivers/net/ethernet/sfc/rx_common.c | 2 +- drivers/net/ethernet/socionext/netsec.c | 2 +- drivers/net/ethernet/ti/cpsw_priv.c | 2 +- drivers/net/hyperv/netvsc.c | 2 +- drivers/net/tun.c | 2 +- drivers/net/veth.c | 12 ++++++++---- drivers/net/virtio_net.c | 2 +- drivers/net/xen-netfront.c | 2 +- include/net/busy_poll.h | 19 +++++++++++++++---- include/net/xdp.h | 3 ++- net/core/dev.c | 2 +- net/core/xdp.c | 3 ++- net/xdp/xsk.c | 1 + 29 files changed, 54 insertions(+), 36 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index e8131dadc22c..6ad59f0068f6 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -416,7 +416,7 @@ static int ena_xdp_register_rxq_info(struct ena_ring *rx_ring) { int rc; - rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid); + rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
rx_ring->qid, 0); if (rc) { netif_err(rx_ring->adapter, ifup, rx_ring->netdev, diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 7975f59735d6..725d929eddb1 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -2884,7 +2884,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp) if (rc) return rc; - rc = xdp_rxq_info_reg(&rxr->xdp_rxq, bp->dev, i); + rc = xdp_rxq_info_reg(&rxr->xdp_rxq, bp->dev, i, 0); if (rc < 0) return rc; diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c index 7a141ce32e86..f782e6af45e9 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c @@ -770,7 +770,7 @@ static void nicvf_rcv_queue_config(struct nicvf *nic, struct queue_set *qs, rq->caching = 1; /* Driver have no proper error path for failed XDP RX-queue info reg */ - WARN_ON(xdp_rxq_info_reg(&rq->xdp_rxq, nic->netdev, qidx) < 0); + WARN_ON(xdp_rxq_info_reg(&rq->xdp_rxq, nic->netdev, qidx, 0) < 0); /* Send a mailbox msg to PF to config RQ */ mbx.rq.msg = NIC_MBOX_MSG_RQ_CFG; diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index cf9400a9886d..40953980e846 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -3334,7 +3334,7 @@ static int dpaa2_eth_setup_rx_flow(struct dpaa2_eth_priv *priv, return 0; err = xdp_rxq_info_reg(&fq->channel->xdp_rxq, priv->net_dev, - fq->flowid); + fq->flowid, 0); if (err) { dev_err(dev, "xdp_rxq_info_reg failed\n"); return err; diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index c21548c71bb1..9f73cd7aee09 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1447,7 +1447,7 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring) /* XDP RX-queue info only needed for RX rings exposed to XDP */ if (rx_ring->vsi->type == I40E_VSI_MAIN) { err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, - rx_ring->queue_index); + rx_ring->queue_index, rx_ring->q_vector->napi.napi_id); if (err < 0) return err; } diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index fe4320e2d1f2..3124a3bf519a 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -306,7 +306,7 @@ int ice_setup_rx_ctx(struct ice_ring *ring) if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) /* coverity[check_return] */ xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, - ring->q_index); + ring->q_index, ring->q_vector->napi.napi_id); ring->xsk_pool = ice_xsk_pool(ring); if (ring->xsk_pool) { @@ -333,7 +333,7 @@ int ice_setup_rx_ctx(struct ice_ring *ring) /* coverity[check_return] */ xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, - ring->q_index); + ring->q_index, ring->q_vector->napi.napi_id); err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, MEM_TYPE_PAGE_SHARED, diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index eae75260fe20..77d5eae6b4c2 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -483,7 +483,7 @@ int ice_setup_rx_ring(struct ice_ring *rx_ring) if (rx_ring->vsi->type == ICE_VSI_PF && !xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) if 
(xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, - rx_ring->q_index)) + rx_ring->q_index, rx_ring->q_vector->napi.napi_id)) goto err; return 0; diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 5fc2c381da55..6a4ef4934fcf 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -4352,7 +4352,7 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring) /* XDP RX-queue info */ if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, - rx_ring->queue_index) < 0) + rx_ring->queue_index, 0) < 0) goto err; return 0; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 45ae33e15303..50e6b8b6ba7b 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6577,7 +6577,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter, /* XDP RX-queue info */ if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, adapter->netdev, - rx_ring->queue_index) < 0) + rx_ring->queue_index, rx_ring->q_vector->napi.napi_id) < 0) goto err; rx_ring->xdp_prog = adapter->xdp_prog; diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 82fce27f682b..4061cd7db5dd 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -3493,7 +3493,7 @@ int ixgbevf_setup_rx_resources(struct ixgbevf_adapter *adapter, /* XDP RX-queue info */ if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, adapter->netdev, - rx_ring->queue_index) < 0) + rx_ring->queue_index, 0) < 0) goto err; rx_ring->xdp_prog = adapter->xdp_prog; diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 183530ed4d1d..ba6dcb19bb1d 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -3227,7 +3227,7 @@ static int mvneta_create_page_pool(struct mvneta_port *pp, return err; } - err = xdp_rxq_info_reg(&rxq->xdp_rxq, pp->dev, rxq->id); + err = xdp_rxq_info_reg(&rxq->xdp_rxq, pp->dev, rxq->id, 0); if (err < 0) goto err_free_pp; diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 3069e192d773..5504cbc24970 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -2614,11 +2614,11 @@ static int mvpp2_rxq_init(struct mvpp2_port *port, mvpp2_rxq_status_update(port, rxq->id, 0, rxq->size); if (priv->percpu_pools) { - err = xdp_rxq_info_reg(&rxq->xdp_rxq_short, port->dev, rxq->id); + err = xdp_rxq_info_reg(&rxq->xdp_rxq_short, port->dev, rxq->id, 0); if (err < 0) goto err_free_dma; - err = xdp_rxq_info_reg(&rxq->xdp_rxq_long, port->dev, rxq->id); + err = xdp_rxq_info_reg(&rxq->xdp_rxq_long, port->dev, rxq->id, 0); if (err < 0) goto err_unregister_rxq_short; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index b0f79a5151cf..40775cb8fb2a 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -283,7 +283,7 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv, ring->log_stride = ffs(ring->stride) - 1; ring->buf_size = ring->size * ring->stride + TXBB_SIZE; - if (xdp_rxq_info_reg(&ring->xdp_rxq, priv->dev, queue_index) < 0) + if (xdp_rxq_info_reg(&ring->xdp_rxq, priv->dev, queue_index, 0) < 0) goto err_ring; tmp = size * 
roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS * diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 527c5f12c5af..427fc376fe1a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -434,7 +434,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, rq_xdp_ix = rq->ix; if (xsk) rq_xdp_ix += params->num_channels * MLX5E_RQ_GROUP_XSK; - err = xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq_xdp_ix); + err = xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq_xdp_ix, 0); if (err < 0) goto err_rq_xdp_prog; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index b150da43adb2..b4acf2f41e84 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2533,7 +2533,7 @@ nfp_net_rx_ring_alloc(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring) if (dp->netdev) { err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, dp->netdev, - rx_ring->idx); + rx_ring->idx, rx_ring->r_vec->napi.napi_id); if (err < 0) return err; } diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 05e3a3b60269..9cf960a6d007 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -1762,7 +1762,7 @@ static void qede_init_fp(struct qede_dev *edev) /* Driver have no error path from here */ WARN_ON(xdp_rxq_info_reg(&fp->rxq->xdp_rxq, edev->ndev, - fp->rxq->rxq_id) < 0); + fp->rxq->rxq_id, 0) < 0); if (xdp_rxq_info_reg_mem_model(&fp->rxq->xdp_rxq, MEM_TYPE_PAGE_ORDER0, diff --git a/drivers/net/ethernet/sfc/rx_common.c b/drivers/net/ethernet/sfc/rx_common.c index 19cf7cac1e6e..68fc7d317693 100644 --- a/drivers/net/ethernet/sfc/rx_common.c +++ b/drivers/net/ethernet/sfc/rx_common.c @@ -262,7 +262,7 @@ void efx_init_rx_queue(struct efx_rx_queue *rx_queue) /* Initialise XDP queue information */ rc = xdp_rxq_info_reg(&rx_queue->xdp_rxq_info, efx->net_dev, - rx_queue->core_index); + rx_queue->core_index, 0); if (rc) { netif_err(efx, rx_err, efx->net_dev, diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index 1503cc9ec6e2..27d3c9d9210e 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -1304,7 +1304,7 @@ static int netsec_setup_rx_dring(struct netsec_priv *priv) goto err_out; } - err = xdp_rxq_info_reg(&dring->xdp_rxq, priv->ndev, 0); + err = xdp_rxq_info_reg(&dring->xdp_rxq, priv->ndev, 0, priv->napi.napi_id); if (err) goto err_out; diff --git a/drivers/net/ethernet/ti/cpsw_priv.c b/drivers/net/ethernet/ti/cpsw_priv.c index 31c5e36ff706..6dd73bd0f458 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.c +++ b/drivers/net/ethernet/ti/cpsw_priv.c @@ -1186,7 +1186,7 @@ static int cpsw_ndev_create_xdp_rxq(struct cpsw_priv *priv, int ch) pool = cpsw->page_pool[ch]; rxq = &priv->xdp_rxq[ch]; - ret = xdp_rxq_info_reg(rxq, priv->ndev, ch); + ret = xdp_rxq_info_reg(rxq, priv->ndev, ch, 0); if (ret) return ret; diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 0c3de94b5178..fa8341f8359a 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -1499,7 +1499,7 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device, u64_stats_init(&nvchan->tx_stats.syncp); u64_stats_init(&nvchan->rx_stats.syncp); - ret = xdp_rxq_info_reg(&nvchan->xdp_rxq, ndev, i); 
+ ret = xdp_rxq_info_reg(&nvchan->xdp_rxq, ndev, i, 0); if (ret) { netdev_err(ndev, "xdp_rxq_info_reg fail: %d\n", ret); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 3d45d56172cb..8867d39db6ac 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -780,7 +780,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file, } else { /* Setup XDP RX-queue info, for new tfile getting attached */ err = xdp_rxq_info_reg(&tfile->xdp_rxq, - tun->dev, tfile->queue_index); + tun->dev, tfile->queue_index, 0); if (err < 0) goto out; err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq, diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 8c737668008a..9bd37c7151f8 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -884,7 +884,6 @@ static int veth_napi_add(struct net_device *dev) for (i = 0; i < dev->real_num_rx_queues; i++) { struct veth_rq *rq = &priv->rq[i]; - netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); napi_enable(&rq->xdp_napi); } @@ -926,7 +925,8 @@ static int veth_enable_xdp(struct net_device *dev) for (i = 0; i < dev->real_num_rx_queues; i++) { struct veth_rq *rq = &priv->rq[i]; - err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i); + netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); + err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id); if (err < 0) goto err_rxq_reg; @@ -952,8 +952,12 @@ static int veth_enable_xdp(struct net_device *dev) err_reg_mem: xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); err_rxq_reg: - for (i--; i >= 0; i--) - xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); + for (i--; i >= 0; i--) { + struct veth_rq *rq = &priv->rq[i]; + + xdp_rxq_info_unreg(&rq->xdp_rxq); + netif_napi_del(&rq->xdp_napi); + } return err; } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 21b71148c532..052975ea0af4 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1485,7 +1485,7 @@ static int virtnet_open(struct net_device *dev) if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL)) schedule_delayed_work(&vi->refill, 0); - err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i); + err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i, vi->rq[i].napi.napi_id); if (err < 0) return err; diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 920cac4385bf..b01848ef4649 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -2014,7 +2014,7 @@ static int xennet_create_page_pool(struct netfront_queue *queue) } err = xdp_rxq_info_reg(&queue->xdp_rxq, queue->info->netdev, - queue->id); + queue->id, 0); if (err) { netdev_err(queue->info->netdev, "xdp_rxq_info_reg failed\n"); goto err_free_pp; diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 2f8f51807b83..45b3e04b99d3 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -135,14 +135,25 @@ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) sk_rx_queue_set(sk, skb); } +static inline void __sk_mark_napi_id_once_xdp(struct sock *sk, unsigned int napi_id) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + if (!READ_ONCE(sk->sk_napi_id)) + WRITE_ONCE(sk->sk_napi_id, napi_id); +#endif +} + /* variant used for unconnected sockets */ static inline void sk_mark_napi_id_once(struct sock *sk, const struct sk_buff *skb) { -#ifdef CONFIG_NET_RX_BUSY_POLL - if (!READ_ONCE(sk->sk_napi_id)) - WRITE_ONCE(sk->sk_napi_id, skb->napi_id); -#endif + __sk_mark_napi_id_once_xdp(sk, skb->napi_id); +} + +static inline void sk_mark_napi_id_once_xdp(struct sock *sk, + const struct xdp_buff *xdp) +{ + 
__sk_mark_napi_id_once_xdp(sk, xdp->rxq->napi_id); } #endif /* _LINUX_NET_BUSY_POLL_H */ diff --git a/include/net/xdp.h b/include/net/xdp.h index 7d48b2ae217a..700ad5db7f5d 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -59,6 +59,7 @@ struct xdp_rxq_info { u32 queue_index; u32 reg_state; struct xdp_mem_info mem; + unsigned int napi_id; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ struct xdp_txq_info { @@ -226,7 +227,7 @@ static inline void xdp_release_frame(struct xdp_frame *xdpf) } int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, - struct net_device *dev, u32 queue_index); + struct net_device *dev, u32 queue_index, unsigned int napi_id); void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); diff --git a/net/core/dev.c b/net/core/dev.c index 7a1e5936c67f..3b6b0e175fe7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9810,7 +9810,7 @@ static int netif_alloc_rx_queues(struct net_device *dev) rx[i].dev = dev; /* XDP RX-queue setup */ - err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i); + err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0); if (err < 0) goto err_rxq_info; } diff --git a/net/core/xdp.c b/net/core/xdp.c index 3d330ebda893..17ffd33c6b18 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -158,7 +158,7 @@ static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq) /* Returns 0 on success, negative on failure */ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, - struct net_device *dev, u32 queue_index) + struct net_device *dev, u32 queue_index, unsigned int napi_id) { if (xdp_rxq->reg_state == REG_STATE_UNUSED) { WARN(1, "Driver promised not to register this"); @@ -179,6 +179,7 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, xdp_rxq_info_init(xdp_rxq); xdp_rxq->dev = dev; xdp_rxq->queue_index = queue_index; + xdp_rxq->napi_id = napi_id; xdp_rxq->reg_state = REG_STATE_REGISTERED; return 0; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index a8501a5477cf..7588e599a048 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -233,6 +233,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; + sk_mark_napi_id_once_xdp(&xs->sk, xdp); len = xdp->data_end - xdp->data; return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? From f2d2728220ac6482c69c5f018ec09bafd688e7d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 30 Nov 2020 19:52:02 +0100 Subject: [PATCH 043/118] samples/bpf: Use recvfrom() in xdpsock/rxdrop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Start using recvfrom() in the rxdrop scenario.
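In short, the sample trades the generic poll() wakeup for a per-socket kick; a sketch of the idiom (fds, num_socks, opt_timeout and xsk as in the sample code below):

	/* before: one poll() syscall covering all sockets */
	ret = poll(fds, num_socks, opt_timeout);

	/* after: zero-length, non-blocking receive on the one socket
	 * that needs waking, which can also enter busy-polling
	 */
	recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL);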
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20201130185205.196029-8-bjorn.topel@gmail.com --- samples/bpf/xdpsock_user.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 2567f0db5aca..f90111b95b2e 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -1170,7 +1170,7 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk, } } -static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds) +static void rx_drop(struct xsk_socket_info *xsk) { unsigned int rcvd, i; u32 idx_rx = 0, idx_fq = 0; @@ -1180,7 +1180,7 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds) if (!rcvd) { if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { xsk->app_stats.rx_empty_polls++; - ret = poll(fds, num_socks, opt_timeout); + recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); } return; } @@ -1191,7 +1191,7 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds) exit_with_error(-ret); if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { xsk->app_stats.fill_fail_polls++; - ret = poll(fds, num_socks, opt_timeout); + recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); } ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); } @@ -1233,7 +1233,7 @@ static void rx_drop_all(void) } for (i = 0; i < num_socks; i++) - rx_drop(xsks[i], fds); + rx_drop(xsks[i]); if (benchmark_done) break; From 284cbc61f851bf86326b28acfe6d161691d4a4ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 30 Nov 2020 19:52:03 +0100 Subject: [PATCH 044/118] samples/bpf: Use recvfrom() in xdpsock/l2fwd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Start using recvfrom() in the l2fwd scenario, instead of poll(), which is more expensive and needs additional knobs for busy-polling.
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20201130185205.196029-9-bjorn.topel@gmail.com --- samples/bpf/xdpsock_user.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index f90111b95b2e..a1a3d6f02ba9 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -1098,8 +1098,7 @@ static void kick_tx(struct xsk_socket_info *xsk) exit_with_error(errno); } -static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, - struct pollfd *fds) +static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk) { struct xsk_umem_info *umem = xsk->umem; u32 idx_cq = 0, idx_fq = 0; @@ -1134,7 +1133,8 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, exit_with_error(-ret); if (xsk_ring_prod__needs_wakeup(&umem->fq)) { xsk->app_stats.fill_fail_polls++; - ret = poll(fds, num_socks, opt_timeout); + recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, + NULL); } ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); } @@ -1331,19 +1331,19 @@ static void tx_only_all(void) complete_tx_only_all(); } -static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) +static void l2fwd(struct xsk_socket_info *xsk) { unsigned int rcvd, i; u32 idx_rx = 0, idx_tx = 0; int ret; - complete_tx_l2fwd(xsk, fds); + complete_tx_l2fwd(xsk); rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); if (!rcvd) { if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { xsk->app_stats.rx_empty_polls++; - ret = poll(fds, num_socks, opt_timeout); + recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); } return; } @@ -1353,7 +1353,7 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) while (ret != rcvd) { if (ret < 0) exit_with_error(-ret); - complete_tx_l2fwd(xsk, fds); + complete_tx_l2fwd(xsk); if (xsk_ring_prod__needs_wakeup(&xsk->tx)) { xsk->app_stats.tx_wakeup_sendtos++; kick_tx(xsk); @@ -1388,22 +1388,20 @@ static void l2fwd_all(void) struct pollfd fds[MAX_SOCKS] = {}; int i, ret; - for (i = 0; i < num_socks; i++) { - fds[i].fd = xsk_socket__fd(xsks[i]->xsk); - fds[i].events = POLLOUT | POLLIN; - } - for (;;) { if (opt_poll) { - for (i = 0; i < num_socks; i++) + for (i = 0; i < num_socks; i++) { + fds[i].fd = xsk_socket__fd(xsks[i]->xsk); + fds[i].events = POLLOUT | POLLIN; xsks[i]->app_stats.opt_polls++; + } ret = poll(fds, num_socks, opt_timeout); if (ret <= 0) continue; } for (i = 0; i < num_socks; i++) - l2fwd(xsks[i], fds); + l2fwd(xsks[i]); if (benchmark_done) break; From b35fc1482ceb2f36bedd1587c3cfea3167baa2f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 30 Nov 2020 19:52:04 +0100 Subject: [PATCH 045/118] samples/bpf: Add busy-poll support to xdpsock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new option to xdpsock, 'B', for busy-polling. This option will also set the batching size, 'b' option, to the busy-poll budget. 
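Per socket, the new option boils down to two setsockopt() calls; a condensed sketch of the apply_setsockopt() helper added below (20 is the sample's busy-poll time in microseconds, and setting SO_PREFER_BUSY_POLL requires CAP_NET_ADMIN, as introduced earlier in this series):

	int one = 1, usecs = 20;

	if (setsockopt(fd, SOL_SOCKET, SO_PREFER_BUSY_POLL,
		       &one, sizeof(one)) < 0)
		exit_with_error(errno);
	if (setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL,
		       &usecs, sizeof(usecs)) < 0)
		exit_with_error(errno);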
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20201130185205.196029-10-bjorn.topel@gmail.com --- samples/bpf/xdpsock_user.c | 40 +++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index a1a3d6f02ba9..4622a17fafe1 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -95,6 +95,7 @@ static int opt_timeout = 1000; static bool opt_need_wakeup = true; static u32 opt_num_xsks = 1; static u32 prog_id; +static bool opt_busy_poll; struct xsk_ring_stats { unsigned long rx_npkts; @@ -911,6 +912,7 @@ static struct option long_options[] = { {"quiet", no_argument, 0, 'Q'}, {"app-stats", no_argument, 0, 'a'}, {"irq-string", no_argument, 0, 'I'}, + {"busy-poll", no_argument, 0, 'B'}, {0, 0, 0, 0} }; @@ -949,6 +951,7 @@ static void usage(const char *prog) " -Q, --quiet Do not display any stats.\n" " -a, --app-stats Display application (syscall) statistics.\n" " -I, --irq-string Display driver interrupt statistics for interface associated with irq-string.\n" + " -B, --busy-poll Busy poll.\n" "\n"; fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, @@ -964,7 +967,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQaI:", + c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQaI:B", long_options, &option_index); if (c == -1) break; @@ -1062,7 +1065,9 @@ static void parse_command_line(int argc, char **argv) fprintf(stderr, "ERROR: Failed to get irqs for %s\n", opt_irq_str); usage(basename(argv[0])); } - + break; + case 'B': + opt_busy_poll = 1; break; default: usage(basename(argv[0])); @@ -1131,7 +1136,7 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk) while (ret != rcvd) { if (ret < 0) exit_with_error(-ret); - if (xsk_ring_prod__needs_wakeup(&umem->fq)) { + if (opt_busy_poll || xsk_ring_prod__needs_wakeup(&umem->fq)) { xsk->app_stats.fill_fail_polls++; recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); @@ -1178,7 +1183,7 @@ static void rx_drop(struct xsk_socket_info *xsk) rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); if (!rcvd) { - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { + if (opt_busy_poll || xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { xsk->app_stats.rx_empty_polls++; recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); } @@ -1189,7 +1194,7 @@ static void rx_drop(struct xsk_socket_info *xsk) while (ret != rcvd) { if (ret < 0) exit_with_error(-ret); - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { + if (opt_busy_poll || xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { xsk->app_stats.fill_fail_polls++; recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); } @@ -1341,7 +1346,7 @@ static void l2fwd(struct xsk_socket_info *xsk) rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); if (!rcvd) { - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { + if (opt_busy_poll || xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { xsk->app_stats.rx_empty_polls++; recvfrom(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, NULL); } @@ -1354,7 +1359,7 @@ static void l2fwd(struct xsk_socket_info *xsk) if (ret < 0) exit_with_error(-ret); complete_tx_l2fwd(xsk); - if (xsk_ring_prod__needs_wakeup(&xsk->tx)) { + if (opt_busy_poll || xsk_ring_prod__needs_wakeup(&xsk->tx)) { 
xsk->app_stats.tx_wakeup_sendtos++; kick_tx(xsk); } @@ -1459,6 +1464,24 @@ static void enter_xsks_into_map(struct bpf_object *obj) } } +static void apply_setsockopt(struct xsk_socket_info *xsk) +{ + int sock_opt; + + if (!opt_busy_poll) + return; + + sock_opt = 1; + if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_PREFER_BUSY_POLL, + (void *)&sock_opt, sizeof(sock_opt)) < 0) + exit_with_error(errno); + + sock_opt = 20; + if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL, + (void *)&sock_opt, sizeof(sock_opt)) < 0) + exit_with_error(errno); +} + int main(int argc, char **argv) { struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; @@ -1500,6 +1523,9 @@ int main(int argc, char **argv) for (i = 0; i < opt_num_xsks; i++) xsks[num_socks++] = xsk_configure_socket(umem, rx, tx); + for (i = 0; i < opt_num_xsks; i++) + apply_setsockopt(xsks[i]); + if (opt_bench == BENCH_TXONLY) { gen_eth_hdr_data(); From 41bf900fe2a0cd56bdc3639ac73d509d52826149 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 30 Nov 2020 19:52:05 +0100 Subject: [PATCH 046/118] samples/bpf: Add option to set the busy-poll budget MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Support for the SO_BUSY_POLL_BUDGET setsockopt, via the batching option ('b'). Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20201130185205.196029-11-bjorn.topel@gmail.com --- samples/bpf/xdpsock_user.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 4622a17fafe1..036bd019e400 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -1480,6 +1480,11 @@ static void apply_setsockopt(struct xsk_socket_info *xsk) if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL, (void *)&sock_opt, sizeof(sock_opt)) < 0) exit_with_error(errno); + + sock_opt = opt_batch_size; + if (setsockopt(xsk_socket__fd(xsk->xsk), SOL_SOCKET, SO_BUSY_POLL_BUDGET, + (void *)&sock_opt, sizeof(sock_opt)) < 0) + exit_with_error(errno); } int main(int argc, char **argv) From ba0581749fec389e55c9d761f2716f8fcbefced5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 1 Dec 2020 15:22:59 +0100 Subject: [PATCH 047/118] net, xdp, xsk: fix __sk_mark_napi_id_once napi_id error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stephen reported the following build error for !CONFIG_NET_RX_BUSY_POLL built kernels: In file included from fs/select.c:32: include/net/busy_poll.h: In function 'sk_mark_napi_id_once': include/net/busy_poll.h:150:36: error: 'const struct sk_buff' has no member named 'napi_id' 150 | __sk_mark_napi_id_once_xdp(sk, skb->napi_id); | ^~ Fix it by wrapping a CONFIG_NET_RX_BUSY_POLL around the helpers. 
Fixes: b02e5a0ebb17 ("xsk: Propagate napi_id to XDP socket Rx path") Reported-by: Stephen Rothwell Signed-off-by: Stephen Rothwell Signed-off-by: Daniel Borkmann Cc: Björn Töpel Link: https://lore.kernel.org/linux-next/20201201190746.7d3357fb@canb.auug.org.au --- include/net/busy_poll.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 45b3e04b99d3..73af4a64a599 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -135,7 +135,7 @@ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) sk_rx_queue_set(sk, skb); } -static inline void __sk_mark_napi_id_once_xdp(struct sock *sk, unsigned int napi_id) +static inline void __sk_mark_napi_id_once(struct sock *sk, unsigned int napi_id) { #ifdef CONFIG_NET_RX_BUSY_POLL if (!READ_ONCE(sk->sk_napi_id)) @@ -147,13 +147,17 @@ static inline void __sk_mark_napi_id_once_xdp(struct sock *sk, unsigned int napi static inline void sk_mark_napi_id_once(struct sock *sk, const struct sk_buff *skb) { - __sk_mark_napi_id_once_xdp(sk, skb->napi_id); +#ifdef CONFIG_NET_RX_BUSY_POLL + __sk_mark_napi_id_once(sk, skb->napi_id); +#endif } static inline void sk_mark_napi_id_once_xdp(struct sock *sk, const struct xdp_buff *xdp) { - __sk_mark_napi_id_once_xdp(sk, xdp->rxq->napi_id); +#ifdef CONFIG_NET_RX_BUSY_POLL + __sk_mark_napi_id_once(sk, xdp->rxq->napi_id); +#endif } #endif /* _LINUX_NET_BUSY_POLL_H */ From a999696c547f1a8ef2ddbb9b0e77abc3f6db4ff1 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 2 Dec 2020 09:25:14 -0800 Subject: [PATCH 048/118] selftests/bpf: Rewrite test_sock_addr bind bpf into C I'm planning to extend it in the next patches. It's much easier to work with C than BPF assembly. 
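To give a flavor of the difference (a sketch mirroring the code movement below): a single narrow-load check that needs hand-counted jump offsets in BPF assembly,

	BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6,
		    offsetof(struct bpf_sock_addr, user_ip4)),
	BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[0], 26),

becomes ordinary C, where the compiler tracks the control flow (expected_byte is a hypothetical stand-in for the SERV4_IP byte checked in bind4_prog.c):

	if (((volatile __u8 *)&ctx->user_ip4)[0] != expected_byte)
		return 0;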
Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201202172516.3483656-2-sdf@google.com --- .../testing/selftests/bpf/progs/bind4_prog.c | 71 +++++++ .../testing/selftests/bpf/progs/bind6_prog.c | 88 ++++++++ tools/testing/selftests/bpf/test_sock_addr.c | 196 ++---------------- 3 files changed, 171 insertions(+), 184 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/bind4_prog.c create mode 100644 tools/testing/selftests/bpf/progs/bind6_prog.c diff --git a/tools/testing/selftests/bpf/progs/bind4_prog.c b/tools/testing/selftests/bpf/progs/bind4_prog.c new file mode 100644 index 000000000000..0951302a984a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bind4_prog.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define SERV4_IP 0xc0a801feU /* 192.168.1.254 */ +#define SERV4_PORT 4040 +#define SERV4_REWRITE_IP 0x7f000001U /* 127.0.0.1 */ +#define SERV4_REWRITE_PORT 4444 + +SEC("cgroup/bind4") +int bind_v4_prog(struct bpf_sock_addr *ctx) +{ + struct bpf_sock *sk; + __u32 user_ip4; + __u16 user_port; + + sk = ctx->sk; + if (!sk) + return 0; + + if (sk->family != AF_INET) + return 0; + + if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM) + return 0; + + if (ctx->user_ip4 != bpf_htonl(SERV4_IP) || + ctx->user_port != bpf_htons(SERV4_PORT)) + return 0; + + // u8 narrow loads: + user_ip4 = 0; + user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[0] << 0; + user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[1] << 8; + user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[2] << 16; + user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[3] << 24; + if (ctx->user_ip4 != user_ip4) + return 0; + + user_port = 0; + user_port |= ((volatile __u8 *)&ctx->user_port)[0] << 0; + user_port |= ((volatile __u8 *)&ctx->user_port)[1] << 8; + if (ctx->user_port != user_port) + return 0; + + // u16 narrow loads: + user_ip4 = 0; + user_ip4 |= ((volatile __u16 *)&ctx->user_ip4)[0] << 0; + user_ip4 |= ((volatile __u16 *)&ctx->user_ip4)[1] << 16; + if (ctx->user_ip4 != user_ip4) + return 0; + + ctx->user_ip4 = bpf_htonl(SERV4_REWRITE_IP); + ctx->user_port = bpf_htons(SERV4_REWRITE_PORT); + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bind6_prog.c b/tools/testing/selftests/bpf/progs/bind6_prog.c new file mode 100644 index 000000000000..16da1cf85418 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bind6_prog.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define SERV6_IP_0 0xfaceb00c /* face:b00c:1234:5678::abcd */ +#define SERV6_IP_1 0x12345678 +#define SERV6_IP_2 0x00000000 +#define SERV6_IP_3 0x0000abcd +#define SERV6_PORT 6060 +#define SERV6_REWRITE_IP_0 0x00000000 +#define SERV6_REWRITE_IP_1 0x00000000 +#define SERV6_REWRITE_IP_2 0x00000000 +#define SERV6_REWRITE_IP_3 0x00000001 +#define SERV6_REWRITE_PORT 6666 + +SEC("cgroup/bind6") +int bind_v6_prog(struct bpf_sock_addr *ctx) +{ + struct bpf_sock *sk; + __u32 user_ip6; + __u16 user_port; + int i; + + sk = ctx->sk; + if (!sk) + return 0; + + if (sk->family != AF_INET6) + return 0; + + if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM) + return 0; + + if (ctx->user_ip6[0] != bpf_htonl(SERV6_IP_0) || + ctx->user_ip6[1] != bpf_htonl(SERV6_IP_1) || + ctx->user_ip6[2] != 
bpf_htonl(SERV6_IP_2) || + ctx->user_ip6[3] != bpf_htonl(SERV6_IP_3) || + ctx->user_port != bpf_htons(SERV6_PORT)) + return 0; + + // u8 narrow loads: + for (i = 0; i < 4; i++) { + user_ip6 = 0; + user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[0] << 0; + user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[1] << 8; + user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[2] << 16; + user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[3] << 24; + if (ctx->user_ip6[i] != user_ip6) + return 0; + } + + user_port = 0; + user_port |= ((volatile __u8 *)&ctx->user_port)[0] << 0; + user_port |= ((volatile __u8 *)&ctx->user_port)[1] << 8; + if (ctx->user_port != user_port) + return 0; + + // u16 narrow loads: + for (i = 0; i < 4; i++) { + user_ip6 = 0; + user_ip6 |= ((volatile __u16 *)&ctx->user_ip6[i])[0] << 0; + user_ip6 |= ((volatile __u16 *)&ctx->user_ip6[i])[1] << 16; + if (ctx->user_ip6[i] != user_ip6) + return 0; + } + + ctx->user_ip6[0] = bpf_htonl(SERV6_REWRITE_IP_0); + ctx->user_ip6[1] = bpf_htonl(SERV6_REWRITE_IP_1); + ctx->user_ip6[2] = bpf_htonl(SERV6_REWRITE_IP_2); + ctx->user_ip6[3] = bpf_htonl(SERV6_REWRITE_IP_3); + ctx->user_port = bpf_htons(SERV6_REWRITE_PORT); + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index b8c72c1d9cf7..dcb83ab02919 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -31,6 +31,8 @@ #define CONNECT6_PROG_PATH "./connect6_prog.o" #define SENDMSG4_PROG_PATH "./sendmsg4_prog.o" #define SENDMSG6_PROG_PATH "./sendmsg6_prog.o" +#define BIND4_PROG_PATH "./bind4_prog.o" +#define BIND6_PROG_PATH "./bind6_prog.o" #define SERV4_IP "192.168.1.254" #define SERV4_REWRITE_IP "127.0.0.1" @@ -660,190 +662,6 @@ static int load_insns(const struct sock_addr_test *test, return ret; } -/* [1] These testing programs try to read different context fields, including - * narrow loads of different sizes from user_ip4 and user_ip6, and write to - * those allowed to be overridden. - * - * [2] BPF_LD_IMM64 & BPF_JMP_REG are used below whenever there is a need to - * compare a register with unsigned 32bit integer. BPF_JMP_IMM can't be used - * in such cases since it accepts only _signed_ 32bit integer as IMM - * argument. Also note that BPF_LD_IMM64 contains 2 instructions what matters - * to count jumps properly. - */ - -static int bind4_prog_load(const struct sock_addr_test *test) -{ - union { - uint8_t u4_addr8[4]; - uint16_t u4_addr16[2]; - uint32_t u4_addr32; - } ip4, port; - struct sockaddr_in addr4_rw; - - if (inet_pton(AF_INET, SERV4_IP, (void *)&ip4) != 1) { - log_err("Invalid IPv4: %s", SERV4_IP); - return -1; - } - - port.u4_addr32 = htons(SERV4_PORT); - - if (mk_sockaddr(AF_INET, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, - (struct sockaddr *)&addr4_rw, sizeof(addr4_rw)) == -1) - return -1; - - /* See [1]. 
*/ - struct bpf_insn insns[] = { - BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), - - /* if (sk.family == AF_INET && */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, family)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET, 32), - - /* (sk.type == SOCK_DGRAM || sk.type == SOCK_STREAM) && */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, type)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_DGRAM, 1), - BPF_JMP_A(1), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_STREAM, 28), - - /* 1st_byte_of_user_ip4 == expected && */ - BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_ip4)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[0], 26), - - /* 2nd_byte_of_user_ip4 == expected && */ - BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_ip4) + 1), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[1], 24), - - /* 3rd_byte_of_user_ip4 == expected && */ - BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_ip4) + 2), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[2], 22), - - /* 4th_byte_of_user_ip4 == expected && */ - BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_ip4) + 3), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[3], 20), - - /* 1st_half_of_user_ip4 == expected && */ - BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_ip4)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[0], 18), - - /* 2nd_half_of_user_ip4 == expected && */ - BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_ip4) + 2), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[1], 16), - - /* whole_user_ip4 == expected && */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_ip4)), - BPF_LD_IMM64(BPF_REG_8, ip4.u4_addr32), /* See [2]. */ - BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_8, 12), - - /* 1st_byte_of_user_port == expected && */ - BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_port)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, port.u4_addr8[0], 10), - - /* 1st_half_of_user_port == expected && */ - BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_port)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, port.u4_addr16[0], 8), - - /* user_port == expected) { */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_port)), - BPF_LD_IMM64(BPF_REG_8, port.u4_addr32), /* See [2]. */ - BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_8, 4), - - /* user_ip4 = addr4_rw.sin_addr */ - BPF_MOV32_IMM(BPF_REG_7, addr4_rw.sin_addr.s_addr), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, user_ip4)), - - /* user_port = addr4_rw.sin_port */ - BPF_MOV32_IMM(BPF_REG_7, addr4_rw.sin_port), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, user_port)), - /* } */ - - /* return 1 */ - BPF_MOV64_IMM(BPF_REG_0, 1), - BPF_EXIT_INSN(), - }; - - return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn)); -} - -static int bind6_prog_load(const struct sock_addr_test *test) -{ - struct sockaddr_in6 addr6_rw; - struct in6_addr ip6; - - if (inet_pton(AF_INET6, SERV6_IP, (void *)&ip6) != 1) { - log_err("Invalid IPv6: %s", SERV6_IP); - return -1; - } - - if (mk_sockaddr(AF_INET6, SERV6_REWRITE_IP, SERV6_REWRITE_PORT, - (struct sockaddr *)&addr6_rw, sizeof(addr6_rw)) == -1) - return -1; - - /* See [1]. 
*/ - struct bpf_insn insns[] = { - BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), - - /* if (sk.family == AF_INET6 && */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, family)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET6, 18), - - /* 5th_byte_of_user_ip6 == expected && */ - BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_ip6[1])), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip6.s6_addr[4], 16), - - /* 3rd_half_of_user_ip6 == expected && */ - BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_ip6[1])), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip6.s6_addr16[2], 14), - - /* last_word_of_user_ip6 == expected) { */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, user_ip6[3])), - BPF_LD_IMM64(BPF_REG_8, ip6.s6_addr32[3]), /* See [2]. */ - BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_8, 10), - - -#define STORE_IPV6_WORD(N) \ - BPF_MOV32_IMM(BPF_REG_7, addr6_rw.sin6_addr.s6_addr32[N]), \ - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, \ - offsetof(struct bpf_sock_addr, user_ip6[N])) - - /* user_ip6 = addr6_rw.sin6_addr */ - STORE_IPV6_WORD(0), - STORE_IPV6_WORD(1), - STORE_IPV6_WORD(2), - STORE_IPV6_WORD(3), - - /* user_port = addr6_rw.sin6_port */ - BPF_MOV32_IMM(BPF_REG_7, addr6_rw.sin6_port), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, user_port)), - - /* } */ - - /* return 1 */ - BPF_MOV64_IMM(BPF_REG_0, 1), - BPF_EXIT_INSN(), - }; - - return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn)); -} - static int load_path(const struct sock_addr_test *test, const char *path) { struct bpf_prog_load_attr attr; @@ -865,6 +683,16 @@ static int load_path(const struct sock_addr_test *test, const char *path) return prog_fd; } +static int bind4_prog_load(const struct sock_addr_test *test) +{ + return load_path(test, BIND4_PROG_PATH); +} + +static int bind6_prog_load(const struct sock_addr_test *test) +{ + return load_path(test, BIND6_PROG_PATH); +} + static int connect4_prog_load(const struct sock_addr_test *test) { return load_path(test, CONNECT4_PROG_PATH); From 427167c0b064ed898b848209add62b4322ec7840 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 2 Dec 2020 09:25:15 -0800 Subject: [PATCH 049/118] bpf: Allow bpf_{s,g}etsockopt from cgroup bind{4,6} hooks I now have to lock/unlock the socket for the bind hook execution. That shouldn't cause any overhead because the socket is unbound and shouldn't receive any traffic.
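Concretely, the hook now runs under the socket lock. The BPF_CGROUP_RUN_SA_PROG_LOCK() macro used below is the locked counterpart of BPF_CGROUP_RUN_SA_PROG(); a minimal sketch of its intended shape (the exact definition lives in include/linux/bpf-cgroup.h and may differ in detail):

	#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx)	\
	({								\
		int __ret = 0;						\
		if (cgroup_bpf_enabled) {				\
			/* take the socket lock around the prog run */	\
			lock_sock(sk);					\
			__ret = __cgroup_bpf_run_filter_sock_addr(sk,	\
					uaddr, type, t_ctx);		\
			release_sock(sk);				\
		}							\
		__ret;							\
	})

Holding the socket lock across the program run is what makes it legal for the program to call bpf_{s,g}etsockopt(), which asserts ownership of the socket lock.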
Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Andrey Ignatov Link: https://lore.kernel.org/bpf/20201202172516.3483656-3-sdf@google.com --- include/linux/bpf-cgroup.h | 12 ++++++------ net/core/filter.c | 4 ++++ net/ipv4/af_inet.c | 2 +- net/ipv6/af_inet6.c | 2 +- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index ed71bd1a0825..72e69a0e1e8c 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -246,11 +246,11 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, __ret; \ }) -#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND) +#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_BIND, NULL) -#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND) +#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_BIND, NULL) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \ sk->sk_prot->pre_connect) @@ -434,8 +434,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) diff --git a/net/core/filter.c b/net/core/filter.c index 2ca5eecebacf..21d91dcf0260 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6995,6 +6995,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_delete_proto; case BPF_FUNC_setsockopt: switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: return &bpf_sock_addr_setsockopt_proto; @@ -7003,6 +7005,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } case BPF_FUNC_getsockopt: switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: return &bpf_sock_addr_getsockopt_proto; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b7260c8cef2e..b94fa8eb831b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -450,7 +450,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. */ - err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr); + err = BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr); if (err) return err; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e648fbebb167..a7e3d170af51 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -451,7 +451,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. 
*/ - err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr); + err = BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr); if (err) return err; From a540c81a2bcb95227c3e24a4478956824858a6b0 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 2 Dec 2020 09:25:16 -0800 Subject: [PATCH 050/118] selftests/bpf: Extend bind{4,6} programs with a call to bpf_setsockopt To make sure it doesn't trigger sock_owned_by_me splat. Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201202172516.3483656-4-sdf@google.com --- .../testing/selftests/bpf/progs/bind4_prog.c | 31 +++++++++++++++++++ .../testing/selftests/bpf/progs/bind6_prog.c | 31 +++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/bind4_prog.c b/tools/testing/selftests/bpf/progs/bind4_prog.c index 0951302a984a..c6520f21f5f5 100644 --- a/tools/testing/selftests/bpf/progs/bind4_prog.c +++ b/tools/testing/selftests/bpf/progs/bind4_prog.c @@ -19,6 +19,33 @@ #define SERV4_REWRITE_IP 0x7f000001U /* 127.0.0.1 */ #define SERV4_REWRITE_PORT 4444 +#ifndef IFNAMSIZ +#define IFNAMSIZ 16 +#endif + +static __inline int bind_to_device(struct bpf_sock_addr *ctx) +{ + char veth1[IFNAMSIZ] = "test_sock_addr1"; + char veth2[IFNAMSIZ] = "test_sock_addr2"; + char missing[IFNAMSIZ] = "nonexistent_dev"; + char del_bind[IFNAMSIZ] = ""; + + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &veth1, sizeof(veth1))) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &veth2, sizeof(veth2))) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &missing, sizeof(missing)) != -ENODEV) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &del_bind, sizeof(del_bind))) + return 1; + + return 0; +} + SEC("cgroup/bind4") int bind_v4_prog(struct bpf_sock_addr *ctx) { @@ -62,6 +89,10 @@ int bind_v4_prog(struct bpf_sock_addr *ctx) if (ctx->user_ip4 != user_ip4) return 0; + /* Bind to device and unbind it. */ + if (bind_to_device(ctx)) + return 0; + ctx->user_ip4 = bpf_htonl(SERV4_REWRITE_IP); ctx->user_port = bpf_htons(SERV4_REWRITE_PORT); diff --git a/tools/testing/selftests/bpf/progs/bind6_prog.c b/tools/testing/selftests/bpf/progs/bind6_prog.c index 16da1cf85418..4358e44dcf47 100644 --- a/tools/testing/selftests/bpf/progs/bind6_prog.c +++ b/tools/testing/selftests/bpf/progs/bind6_prog.c @@ -25,6 +25,33 @@ #define SERV6_REWRITE_IP_3 0x00000001 #define SERV6_REWRITE_PORT 6666 +#ifndef IFNAMSIZ +#define IFNAMSIZ 16 +#endif + +static __inline int bind_to_device(struct bpf_sock_addr *ctx) +{ + char veth1[IFNAMSIZ] = "test_sock_addr1"; + char veth2[IFNAMSIZ] = "test_sock_addr2"; + char missing[IFNAMSIZ] = "nonexistent_dev"; + char del_bind[IFNAMSIZ] = ""; + + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &veth1, sizeof(veth1))) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &veth2, sizeof(veth2))) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &missing, sizeof(missing)) != -ENODEV) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &del_bind, sizeof(del_bind))) + return 1; + + return 0; +} + SEC("cgroup/bind6") int bind_v6_prog(struct bpf_sock_addr *ctx) { @@ -76,6 +103,10 @@ int bind_v6_prog(struct bpf_sock_addr *ctx) return 0; } + /* Bind to device and unbind it. 
*/ + if (bind_to_device(ctx)) + return 0; + ctx->user_ip6[0] = bpf_htonl(SERV6_REWRITE_IP_0); ctx->user_ip6[1] = bpf_htonl(SERV6_REWRITE_IP_1); ctx->user_ip6[2] = bpf_htonl(SERV6_REWRITE_IP_2); From bcfe06bf2622f7c4899468e427683aec49070687 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:27 -0800 Subject: [PATCH 051/118] mm: memcontrol: Use helpers to read page's memcg data Patch series "mm: allow mapping accounted kernel pages to userspace", v6. Currently a non-slab kernel page which has been charged to a memory cgroup can't be mapped to userspace. The underlying reason is simple: PageKmemcg flag is defined as a page type (like buddy, offline, etc), so it takes a bit from the page->mapped counter. Pages with a type set can't be mapped to userspace. But in general the kmemcg flag has nothing to do with mapping to userspace. It only means that the page has been accounted by the page allocator, so it has to be properly uncharged on release. Some bpf maps map vmalloc-based memory to userspace, and their memory can't be accounted because of this implementation detail. This patchset removes this limitation by moving the PageKmemcg flag into one of the free bits of the page->mem_cgroup pointer. It also formalizes accesses to page->mem_cgroup and page->obj_cgroups using new helpers, adds several checks and removes a couple of obsolete functions. As a result, the code became more robust with fewer open-coded bit tricks. This patch (of 4): Currently there are many open-coded reads of the page->mem_cgroup pointer, as well as a couple of read helpers, which are barely used. It creates an obstacle on the way to reusing some bits of the pointer for storing additional bits of information. In fact, we already do this for slab pages, where the last bit indicates that a pointer has an attached vector of objcg pointers instead of a regular memcg pointer. This commit uses 2 existing helpers and introduces a new one to convert all read sides to calls of these helpers: struct mem_cgroup *page_memcg(struct page *page); struct mem_cgroup *page_memcg_rcu(struct page *page); struct mem_cgroup *page_memcg_check(struct page *page); page_memcg_check() is intended to be used in cases when the page can be a slab page and have a memcg pointer pointing at an objcg vector. It checks the lowest bit and, if it is set, returns NULL. page_memcg() contains a VM_BUG_ON_PAGE() check for the page not being a slab page. To make sure nobody uses direct access, struct page's mem_cgroup/obj_cgroups is converted to unsigned long memcg_data.
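The scheme is easiest to see as a stripped-down model of the low-bit pointer tagging (illustrative only, not the kernel code; type names reused for readability):

	struct mem_cgroup;
	struct obj_cgroup;

	#define OBJCGS_BIT 0x1UL	/* pointers are word-aligned, so bit 0 is free */

	/* A slab page stores a tagged pointer to its objcg vector. */
	static unsigned long encode_objcgs(struct obj_cgroup **vec)
	{
		return (unsigned long)vec | OBJCGS_BIT;
	}

	/* page_memcg_check()-style read: a tagged value means "no single memcg". */
	static struct mem_cgroup *decode_memcg(unsigned long memcg_data)
	{
		if (memcg_data & OBJCGS_BIT)
			return NULL;
		return (struct mem_cgroup *)memcg_data;
	}

This is exactly the check page_memcg_check() performs in the hunks below, while page_memcg() instead asserts that it is never called on such a page.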
Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Alexei Starovoitov Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Michal Hocko Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com --- fs/buffer.c | 2 +- fs/iomap/buffered-io.c | 2 +- include/linux/memcontrol.h | 114 ++++++++++++++++++++++++++--- include/linux/mm.h | 22 ------ include/linux/mm_types.h | 5 +- include/trace/events/writeback.h | 2 +- kernel/fork.c | 7 +- mm/debug.c | 4 +- mm/huge_memory.c | 4 +- mm/memcontrol.c | 121 ++++++++++++++----------------- mm/page_alloc.c | 4 +- mm/page_io.c | 6 +- mm/slab.h | 9 +-- mm/workingset.c | 2 +- 14 files changed, 184 insertions(+), 120 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 23f645657488..b56f99f82b5b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -657,7 +657,7 @@ int __set_page_dirty_buffers(struct page *page) } while (bh != head); } /* - * Lock out page->mem_cgroup migration to keep PageDirty + * Lock out page's memcg migration to keep PageDirty * synchronized with per-memcg dirty page counters. */ lock_page_memcg(page); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 10cc7979ce38..16a1e82e3aeb 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -650,7 +650,7 @@ iomap_set_page_dirty(struct page *page) return !TestSetPageDirty(page); /* - * Lock out page->mem_cgroup migration to keep PageDirty + * Lock out page's memcg migration to keep PageDirty * synchronized with per-memcg dirty page counters. */ lock_page_memcg(page); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e391e3c56de5..f95c1433461c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -343,6 +343,79 @@ struct mem_cgroup { extern struct mem_cgroup *root_mem_cgroup; +/* + * page_memcg - get the memory cgroup associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. This function assumes that the page is known to have a + * proper memory cgroup pointer. It's not safe to call this function + * against some type of pages, e.g. slab pages or ex-slab pages. + * + * Any of the following ensures page and memcg binding stability: + * - the page lock + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference + */ +static inline struct mem_cgroup *page_memcg(struct page *page) +{ + VM_BUG_ON_PAGE(PageSlab(page), page); + return (struct mem_cgroup *)page->memcg_data; +} + +/* + * page_memcg_rcu - locklessly get the memory cgroup associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. This function assumes that the page is known to have a + * proper memory cgroup pointer. It's not safe to call this function + * against some type of pages, e.g. slab pages or ex-slab pages. + */ +static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +{ + VM_BUG_ON_PAGE(PageSlab(page), page); + WARN_ON_ONCE(!rcu_read_lock_held()); + + return (struct mem_cgroup *)READ_ONCE(page->memcg_data); +} + +/* + * page_memcg_check - get the memory cgroup associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. This function unlike page_memcg() can take any page + * as an argument. 
It has to be used in cases when it's not known if a page + * has an associated memory cgroup pointer or an object cgroups vector. + * + * Any of the following ensures page and memcg binding stability: + * - the page lock + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference + */ +static inline struct mem_cgroup *page_memcg_check(struct page *page) +{ + /* + * Because page->memcg_data might be changed asynchronously + * for slab pages, READ_ONCE() should be used here. + */ + unsigned long memcg_data = READ_ONCE(page->memcg_data); + + /* + * The lowest bit set means that memcg isn't a valid + * memcg pointer, but a obj_cgroups pointer. + * In this case the page is shared and doesn't belong + * to any specific memory cgroup. + */ + if (memcg_data & 0x1UL) + return NULL; + + return (struct mem_cgroup *)memcg_data; +} + static __always_inline bool memcg_stat_item_in_bytes(int idx) { if (idx == MEMCG_PERCPU_B) @@ -743,15 +816,19 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, static inline void __mod_memcg_page_state(struct page *page, int idx, int val) { - if (page->mem_cgroup) - __mod_memcg_state(page->mem_cgroup, idx, val); + struct mem_cgroup *memcg = page_memcg(page); + + if (memcg) + __mod_memcg_state(memcg, idx, val); } static inline void mod_memcg_page_state(struct page *page, int idx, int val) { - if (page->mem_cgroup) - mod_memcg_state(page->mem_cgroup, idx, val); + struct mem_cgroup *memcg = page_memcg(page); + + if (memcg) + mod_memcg_state(memcg, idx, val); } static inline unsigned long lruvec_page_state(struct lruvec *lruvec, @@ -834,16 +911,17 @@ static inline void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { struct page *head = compound_head(page); /* rmap on tail pages */ + struct mem_cgroup *memcg = page_memcg(head); pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; /* Untracked pages have no memcg, no lruvec. 
Update only the node */ - if (!head->mem_cgroup) { + if (!memcg) { __mod_node_page_state(pgdat, idx, val); return; } - lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat); + lruvec = mem_cgroup_lruvec(memcg, pgdat); __mod_lruvec_state(lruvec, idx, val); } @@ -878,8 +956,10 @@ static inline void count_memcg_events(struct mem_cgroup *memcg, static inline void count_memcg_page_event(struct page *page, enum vm_event_item idx) { - if (page->mem_cgroup) - count_memcg_events(page->mem_cgroup, idx, 1); + struct mem_cgroup *memcg = page_memcg(page); + + if (memcg) + count_memcg_events(memcg, idx, 1); } static inline void count_memcg_event_mm(struct mm_struct *mm, @@ -941,6 +1021,22 @@ void mem_cgroup_split_huge_fixup(struct page *head); struct mem_cgroup; +static inline struct mem_cgroup *page_memcg(struct page *page) +{ + return NULL; +} + +static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return NULL; +} + +static inline struct mem_cgroup *page_memcg_check(struct page *page) +{ + return NULL; +} + static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return true; @@ -1430,7 +1526,7 @@ static inline void mem_cgroup_track_foreign_dirty(struct page *page, if (mem_cgroup_disabled()) return; - if (unlikely(&page->mem_cgroup->css != wb->memcg_css)) + if (unlikely(&page_memcg(page)->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(page, wb); } diff --git a/include/linux/mm.h b/include/linux/mm.h index db6ae4d3fb4e..6b0c9d2c1d10 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1484,28 +1484,6 @@ static inline void set_page_links(struct page *page, enum zone_type zone, #endif } -#ifdef CONFIG_MEMCG -static inline struct mem_cgroup *page_memcg(struct page *page) -{ - return page->mem_cgroup; -} -static inline struct mem_cgroup *page_memcg_rcu(struct page *page) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return READ_ONCE(page->mem_cgroup); -} -#else -static inline struct mem_cgroup *page_memcg(struct page *page) -{ - return NULL; -} -static inline struct mem_cgroup *page_memcg_rcu(struct page *page) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return NULL; -} -#endif - /* * Some inline functions in vmstat.h depend on page_zone() */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5a9238f6caad..80f5d755c037 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -199,10 +199,7 @@ struct page { atomic_t _refcount; #ifdef CONFIG_MEMCG - union { - struct mem_cgroup *mem_cgroup; - struct obj_cgroup **obj_cgroups; - }; + unsigned long memcg_data; #endif /* diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index e7cbccc7c14c..39a40dfb578a 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -257,7 +257,7 @@ TRACE_EVENT(track_foreign_dirty, __entry->ino = inode ? 
inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); - __entry->page_cgroup_ino = cgroup_ino(page->mem_cgroup->css.cgroup); + __entry->page_cgroup_ino = cgroup_ino(page_memcg(page)->css.cgroup); ), TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", diff --git a/kernel/fork.c b/kernel/fork.c index 6d266388d380..cbd4f6f58409 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -404,9 +404,10 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk) for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { /* - * If memcg_kmem_charge_page() fails, page->mem_cgroup - * pointer is NULL, and memcg_kmem_uncharge_page() in - * free_thread_stack() will ignore this page. + * If memcg_kmem_charge_page() fails, page's + * memory cgroup pointer is NULL, and + * memcg_kmem_uncharge_page() in free_thread_stack() + * will ignore this page. */ ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); diff --git a/mm/debug.c b/mm/debug.c index ccca576b2899..8a40b3fefbeb 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -182,8 +182,8 @@ hex_only: pr_warn("page dumped because: %s\n", reason); #ifdef CONFIG_MEMCG - if (!page_poisoned && page->mem_cgroup) - pr_warn("page->mem_cgroup:%px\n", page->mem_cgroup); + if (!page_poisoned && page->memcg_data) + pr_warn("pages's memcg:%lx\n", page->memcg_data); #endif } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9474dbc150ed..cedfb3503411 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -470,7 +470,7 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) #ifdef CONFIG_MEMCG static inline struct deferred_split *get_deferred_split_queue(struct page *page) { - struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(compound_head(page)); struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); if (memcg) @@ -2765,7 +2765,7 @@ void deferred_split_huge_page(struct page *page) { struct deferred_split *ds_queue = get_deferred_split_queue(page); #ifdef CONFIG_MEMCG - struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(compound_head(page)); #endif unsigned long flags; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3dcbf24d2227..3968d68503cb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -533,7 +533,7 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) { struct mem_cgroup *memcg; - memcg = page->mem_cgroup; + memcg = page_memcg(page); if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) memcg = root_mem_cgroup; @@ -560,16 +560,7 @@ ino_t page_cgroup_ino(struct page *page) unsigned long ino = 0; rcu_read_lock(); - memcg = page->mem_cgroup; - - /* - * The lowest bit set means that memcg isn't a valid - * memcg pointer, but a obj_cgroups pointer. - * In this case the page is shared and doesn't belong - * to any specific memory cgroup. 
- */ - if ((unsigned long) memcg & 0x1UL) - memcg = NULL; + memcg = page_memcg_check(page); while (memcg && !(memcg->css.flags & CSS_ONLINE)) memcg = parent_mem_cgroup(memcg); @@ -1050,7 +1041,7 @@ EXPORT_SYMBOL(get_mem_cgroup_from_mm); */ struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) { - struct mem_cgroup *memcg = page->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(page); if (mem_cgroup_disabled()) return NULL; @@ -1349,7 +1340,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd goto out; } - memcg = page->mem_cgroup; + memcg = page_memcg(page); /* * Swapcache readahead pages are added to the LRU - and * possibly migrated - before they are charged. @@ -2109,7 +2100,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) } /** - * lock_page_memcg - lock a page->mem_cgroup binding + * lock_page_memcg - lock a page and memcg binding * @page: the page * * This function protects unlocked LRU pages from being moved to @@ -2141,7 +2132,7 @@ struct mem_cgroup *lock_page_memcg(struct page *page) if (mem_cgroup_disabled()) return NULL; again: - memcg = head->mem_cgroup; + memcg = page_memcg(head); if (unlikely(!memcg)) return NULL; @@ -2149,7 +2140,7 @@ again: return memcg; spin_lock_irqsave(&memcg->move_lock, flags); - if (memcg != head->mem_cgroup) { + if (memcg != page_memcg(head)) { spin_unlock_irqrestore(&memcg->move_lock, flags); goto again; } @@ -2187,14 +2178,14 @@ void __unlock_page_memcg(struct mem_cgroup *memcg) } /** - * unlock_page_memcg - unlock a page->mem_cgroup binding + * unlock_page_memcg - unlock a page and memcg binding * @page: the page */ void unlock_page_memcg(struct page *page) { struct page *head = compound_head(page); - __unlock_page_memcg(head->mem_cgroup); + __unlock_page_memcg(page_memcg(head)); } EXPORT_SYMBOL(unlock_page_memcg); @@ -2884,7 +2875,7 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) static void commit_charge(struct page *page, struct mem_cgroup *memcg) { - VM_BUG_ON_PAGE(page->mem_cgroup, page); + VM_BUG_ON_PAGE(page_memcg(page), page); /* * Any of the following ensures page->mem_cgroup stability: * @@ -2893,7 +2884,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg) * - lock_page_memcg() * - exclusive reference */ - page->mem_cgroup = memcg; + page->memcg_data = (unsigned long)memcg; } #ifdef CONFIG_MEMCG_KMEM @@ -2908,8 +2899,7 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, if (!vec) return -ENOMEM; - if (cmpxchg(&page->obj_cgroups, NULL, - (struct obj_cgroup **) ((unsigned long)vec | 0x1UL))) + if (cmpxchg(&page->memcg_data, 0, (unsigned long)vec | 0x1UL)) kfree(vec); else kmemleak_not_leak(vec); @@ -2920,6 +2910,12 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, /* * Returns a pointer to the memory cgroup to which the kernel object is charged. * + * A passed kernel object can be a slab object or a generic kernel page, so + * different mechanisms for getting the memory cgroup pointer should be used. + * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller + * can not know for sure how the kernel object is implemented. + * mem_cgroup_from_obj() can be safely used in such cases. + * * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), * cgroup_mutex, etc. 
*/ @@ -2932,17 +2928,6 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p) page = virt_to_head_page(p); - /* - * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer - * or a pointer to obj_cgroup vector. In the latter case the lowest - * bit of the pointer is set. - * The page->mem_cgroup pointer can be asynchronously changed - * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed - * from a valid memcg pointer to objcg vector or back. - */ - if (!page->mem_cgroup) - return NULL; - /* * Slab objects are accounted individually, not per-page. * Memcg membership data for each individual object is saved in @@ -2960,8 +2945,14 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p) return NULL; } - /* All other pages use page->mem_cgroup */ - return page->mem_cgroup; + /* + * page_memcg_check() is used here, because page_has_obj_cgroups() + * check above could fail because the object cgroups vector wasn't set + * at that moment, but it can be set concurrently. + * page_memcg_check(page) will guarantee that a proper memory + * cgroup pointer or NULL will be returned. + */ + return page_memcg_check(page); } __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) @@ -3099,7 +3090,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) if (memcg && !mem_cgroup_is_root(memcg)) { ret = __memcg_kmem_charge(memcg, gfp, 1 << order); if (!ret) { - page->mem_cgroup = memcg; + page->memcg_data = (unsigned long)memcg; __SetPageKmemcg(page); return 0; } @@ -3115,7 +3106,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) */ void __memcg_kmem_uncharge_page(struct page *page, int order) { - struct mem_cgroup *memcg = page->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(page); unsigned int nr_pages = 1 << order; if (!memcg) @@ -3123,7 +3114,7 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); __memcg_kmem_uncharge(memcg, nr_pages); - page->mem_cgroup = NULL; + page->memcg_data = 0; css_put(&memcg->css); /* slab pages do not have PageKmemcg flag set */ @@ -3274,7 +3265,7 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) */ void mem_cgroup_split_huge_fixup(struct page *head) { - struct mem_cgroup *memcg = head->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(head); int i; if (mem_cgroup_disabled()) @@ -3282,7 +3273,7 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) { css_get(&memcg->css); - head[i].mem_cgroup = memcg; + head[i].memcg_data = (unsigned long)memcg; } } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -4664,7 +4655,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, struct bdi_writeback *wb) { - struct mem_cgroup *memcg = page->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(page); struct memcg_cgwb_frn *frn; u64 now = get_jiffies_64(); u64 oldest_at = now; @@ -5641,14 +5632,14 @@ static int mem_cgroup_move_account(struct page *page, /* * Prevent mem_cgroup_migrate() from looking at - * page->mem_cgroup of its source page while we change it. + * page's memory cgroup of its source page while we change it. 
*/ ret = -EBUSY; if (!trylock_page(page)) goto out; ret = -EINVAL; - if (page->mem_cgroup != from) + if (page_memcg(page) != from) goto out_unlock; pgdat = page_pgdat(page); @@ -5703,13 +5694,13 @@ static int mem_cgroup_move_account(struct page *page, /* * All state has been migrated, let's switch to the new memcg. * - * It is safe to change page->mem_cgroup here because the page + * It is safe to change page's memcg here because the page * is referenced, charged, isolated, and locked: we can't race * with (un)charging, migration, LRU putback, or anything else - * that would rely on a stable page->mem_cgroup. + * that would rely on a stable page's memory cgroup. * * Note that lock_page_memcg is a memcg lock, not a page lock, - * to save space. As soon as we switch page->mem_cgroup to a + * to save space. As soon as we switch page's memory cgroup to a * new memcg that isn't locked, the above state can change * concurrently again. Make sure we're truly done with it. */ @@ -5718,7 +5709,7 @@ static int mem_cgroup_move_account(struct page *page, css_get(&to->css); css_put(&from->css); - page->mem_cgroup = to; + page->memcg_data = (unsigned long)to; __unlock_page_memcg(from); @@ -5784,7 +5775,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, * mem_cgroup_move_account() checks the page is valid or * not under LRU exclusion. */ - if (page->mem_cgroup == mc.from) { + if (page_memcg(page) == mc.from) { ret = MC_TARGET_PAGE; if (is_device_private_page(page)) ret = MC_TARGET_DEVICE; @@ -5828,7 +5819,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, VM_BUG_ON_PAGE(!page || !PageHead(page), page); if (!(mc.flags & MOVE_ANON)) return ret; - if (page->mem_cgroup == mc.from) { + if (page_memcg(page) == mc.from) { ret = MC_TARGET_PAGE; if (target) { get_page(page); @@ -6774,12 +6765,12 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) /* * Every swap fault against a single page tries to charge the * page, bail as early as possible. shmem_unuse() encounters - * already charged pages, too. page->mem_cgroup is protected - * by the page lock, which serializes swap cache removal, which - * in turn serializes uncharging. + * already charged pages, too. page and memcg binding is + * protected by the page lock, which serializes swap cache + * removal, which in turn serializes uncharging. */ VM_BUG_ON_PAGE(!PageLocked(page), page); - if (compound_head(page)->mem_cgroup) + if (page_memcg(compound_head(page))) goto out; id = lookup_swap_cgroup_id(ent); @@ -6863,21 +6854,21 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) VM_BUG_ON_PAGE(PageLRU(page), page); - if (!page->mem_cgroup) + if (!page_memcg(page)) return; /* * Nobody should be changing or seriously looking at - * page->mem_cgroup at this point, we have fully + * page_memcg(page) at this point, we have fully * exclusive access to the page. 
*/ - if (ug->memcg != page->mem_cgroup) { + if (ug->memcg != page_memcg(page)) { if (ug->memcg) { uncharge_batch(ug); uncharge_gather_clear(ug); } - ug->memcg = page->mem_cgroup; + ug->memcg = page_memcg(page); /* pairs with css_put in uncharge_batch */ css_get(&ug->memcg->css); @@ -6894,7 +6885,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) } ug->dummy_page = page; - page->mem_cgroup = NULL; + page->memcg_data = 0; css_put(&ug->memcg->css); } @@ -6937,7 +6928,7 @@ void mem_cgroup_uncharge(struct page *page) return; /* Don't touch page->lru of any random page, pre-check: */ - if (!page->mem_cgroup) + if (!page_memcg(page)) return; uncharge_gather_clear(&ug); @@ -6987,11 +6978,11 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) return; /* Page cache replacement: new page already charged? */ - if (newpage->mem_cgroup) + if (page_memcg(newpage)) return; /* Swapcache readahead pages can get replaced before being charged */ - memcg = oldpage->mem_cgroup; + memcg = page_memcg(oldpage); if (!memcg) return; @@ -7186,7 +7177,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return; - memcg = page->mem_cgroup; + memcg = page_memcg(page); /* Readahead page, never charged */ if (!memcg) @@ -7207,7 +7198,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) VM_BUG_ON_PAGE(oldid, page); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); - page->mem_cgroup = NULL; + page->memcg_data = 0; if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, nr_entries); @@ -7250,7 +7241,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; - memcg = page->mem_cgroup; + memcg = page_memcg(page); /* Readahead page, never charged */ if (!memcg) @@ -7331,7 +7322,7 @@ bool mem_cgroup_swap_full(struct page *page) if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return false; - memcg = page->mem_cgroup; + memcg = page_memcg(page); if (!memcg) return false; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 23f5066bd4a5..271133b8243b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1092,7 +1092,7 @@ static inline bool page_expected_state(struct page *page, if (unlikely((unsigned long)page->mapping | page_ref_count(page) | #ifdef CONFIG_MEMCG - (unsigned long)page->mem_cgroup | + (unsigned long)page_memcg(page) | #endif (page->flags & check_flags))) return false; @@ -1117,7 +1117,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; } #ifdef CONFIG_MEMCG - if (unlikely(page->mem_cgroup)) + if (unlikely(page_memcg(page))) bad_reason = "page still charged to cgroup"; #endif return bad_reason; diff --git a/mm/page_io.c b/mm/page_io.c index 433df1263349..9bca17ecc4df 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -291,12 +291,14 @@ static inline void count_swpout_vm_event(struct page *page) static void bio_associate_blkg_from_page(struct bio *bio, struct page *page) { struct cgroup_subsys_state *css; + struct mem_cgroup *memcg; - if (!page->mem_cgroup) + memcg = page_memcg(page); + if (!memcg) return; rcu_read_lock(); - css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); + css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys); bio_associate_blkg_from_css(bio, css); rcu_read_unlock(); } diff --git a/mm/slab.h b/mm/slab.h index 6d7c6a5056ba..e2535cee0d33 100644 --- a/mm/slab.h +++ b/mm/slab.h 
@@ -242,18 +242,17 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla static inline struct obj_cgroup **page_obj_cgroups(struct page *page) { /* - * page->mem_cgroup and page->obj_cgroups are sharing the same + * Page's memory cgroup and obj_cgroups vector are sharing the same * space. To distinguish between them in case we don't know for sure * that the page is a slab page (e.g. page_cgroup_ino()), let's * always set the lowest bit of obj_cgroups. */ - return (struct obj_cgroup **) - ((unsigned long)page->obj_cgroups & ~0x1UL); + return (struct obj_cgroup **)(page->memcg_data & ~0x1UL); } static inline bool page_has_obj_cgroups(struct page *page) { - return ((unsigned long)page->obj_cgroups & 0x1UL); + return page->memcg_data & 0x1UL; } int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, @@ -262,7 +261,7 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, static inline void memcg_free_page_obj_cgroups(struct page *page) { kfree(page_obj_cgroups(page)); - page->obj_cgroups = NULL; + page->memcg_data = 0; } static inline size_t obj_full_size(struct kmem_cache *s) diff --git a/mm/workingset.c b/mm/workingset.c index 975a4d2dd02e..130348cbf40a 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -257,7 +257,7 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) struct lruvec *lruvec; int memcgid; - /* Page is fully exclusive and pins page->mem_cgroup */ + /* Page is fully exclusive and pins page's memory cgroup pointer */ VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); From 270c6a71460e12b07b1dcadf7457ff95b6c6e8f4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:28 -0800 Subject: [PATCH 052/118] mm: memcontrol/slab: Use helpers to access slab page's memcg_data To gather all direct accesses to struct page's memcg_data field in one place, let's introduce 3 new helpers to use in the slab accounting code: struct obj_cgroup **page_objcgs(struct page *page); struct obj_cgroup **page_objcgs_check(struct page *page); bool set_page_objcgs(struct page *page, struct obj_cgroup **objcgs); They are similar to the corresponding API for generic pages, except that the setter can return false, indicating that the value has been already set from a different thread. Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Alexei Starovoitov Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20201027001657.3398190-3-guro@fb.com Link: https://lore.kernel.org/bpf/20201201215900.3569844-3-guro@fb.com --- include/linux/memcontrol.h | 64 ++++++++++++++++++++++++++++++++++++++ mm/memcontrol.c | 6 ++-- mm/slab.h | 35 +++++---------------- 3 files changed, 75 insertions(+), 30 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f95c1433461c..c7ac0a5b8989 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -416,6 +416,70 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) return (struct mem_cgroup *)memcg_data; } +#ifdef CONFIG_MEMCG_KMEM +/* + * page_objcgs - get the object cgroups vector associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the object cgroups vector associated with the page, + * or NULL. This function assumes that the page is known to have an + * associated object cgroups vector. 
It's not safe to call this function + * against pages, which might have an associated memory cgroup: e.g. + * kernel stack pages. + */ +static inline struct obj_cgroup **page_objcgs(struct page *page) +{ + return (struct obj_cgroup **)(READ_ONCE(page->memcg_data) & ~0x1UL); +} + +/* + * page_objcgs_check - get the object cgroups vector associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the object cgroups vector associated with the page, + * or NULL. This function is safe to use if the page can be directly associated + * with a memory cgroup. + */ +static inline struct obj_cgroup **page_objcgs_check(struct page *page) +{ + unsigned long memcg_data = READ_ONCE(page->memcg_data); + + if (memcg_data && (memcg_data & 0x1UL)) + return (struct obj_cgroup **)(memcg_data & ~0x1UL); + + return NULL; +} + +/* + * set_page_objcgs - associate a page with a object cgroups vector + * @page: a pointer to the page struct + * @objcgs: a pointer to the object cgroups vector + * + * Atomically associates a page with a vector of object cgroups. + */ +static inline bool set_page_objcgs(struct page *page, + struct obj_cgroup **objcgs) +{ + return !cmpxchg(&page->memcg_data, 0, (unsigned long)objcgs | 0x1UL); +} +#else +static inline struct obj_cgroup **page_objcgs(struct page *page) +{ + return NULL; +} + +static inline struct obj_cgroup **page_objcgs_check(struct page *page) +{ + return NULL; +} + +static inline bool set_page_objcgs(struct page *page, + struct obj_cgroup **objcgs) +{ + return true; +} +#endif + static __always_inline bool memcg_stat_item_in_bytes(int idx) { if (idx == MEMCG_PERCPU_B) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3968d68503cb..0054b4846770 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2899,7 +2899,7 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, if (!vec) return -ENOMEM; - if (cmpxchg(&page->memcg_data, 0, (unsigned long)vec | 0x1UL)) + if (!set_page_objcgs(page, vec)) kfree(vec); else kmemleak_not_leak(vec); @@ -2933,12 +2933,12 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p) * Memcg membership data for each individual object is saved in * the page->obj_cgroups. */ - if (page_has_obj_cgroups(page)) { + if (page_objcgs_check(page)) { struct obj_cgroup *objcg; unsigned int off; off = obj_to_index(page->slab_cache, page, p); - objcg = page_obj_cgroups(page)[off]; + objcg = page_objcgs(page)[off]; if (objcg) return obj_cgroup_memcg(objcg); diff --git a/mm/slab.h b/mm/slab.h index e2535cee0d33..9a54a0cb5cca 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -239,28 +239,12 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla } #ifdef CONFIG_MEMCG_KMEM -static inline struct obj_cgroup **page_obj_cgroups(struct page *page) -{ - /* - * Page's memory cgroup and obj_cgroups vector are sharing the same - * space. To distinguish between them in case we don't know for sure - * that the page is a slab page (e.g. page_cgroup_ino()), let's - * always set the lowest bit of obj_cgroups. 
- */ - return (struct obj_cgroup **)(page->memcg_data & ~0x1UL); -} - -static inline bool page_has_obj_cgroups(struct page *page) -{ - return page->memcg_data & 0x1UL; -} - int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, gfp_t gfp); static inline void memcg_free_page_obj_cgroups(struct page *page) { - kfree(page_obj_cgroups(page)); + kfree(page_objcgs(page)); page->memcg_data = 0; } @@ -322,7 +306,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, if (likely(p[i])) { page = virt_to_head_page(p[i]); - if (!page_has_obj_cgroups(page) && + if (!page_objcgs(page) && memcg_alloc_page_obj_cgroups(page, s, flags)) { obj_cgroup_uncharge(objcg, obj_full_size(s)); continue; @@ -330,7 +314,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, off = obj_to_index(s, page, p[i]); obj_cgroup_get(objcg); - page_obj_cgroups(page)[off] = objcg; + page_objcgs(page)[off] = objcg; mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s), obj_full_size(s)); } else { @@ -344,6 +328,7 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig, void **p, int objects) { struct kmem_cache *s; + struct obj_cgroup **objcgs; struct obj_cgroup *objcg; struct page *page; unsigned int off; @@ -357,7 +342,8 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig, continue; page = virt_to_head_page(p[i]); - if (!page_has_obj_cgroups(page)) + objcgs = page_objcgs(page); + if (!objcgs) continue; if (!s_orig) @@ -366,11 +352,11 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig, s = s_orig; off = obj_to_index(s, page, p[i]); - objcg = page_obj_cgroups(page)[off]; + objcg = objcgs[off]; if (!objcg) continue; - page_obj_cgroups(page)[off] = NULL; + objcgs[off] = NULL; obj_cgroup_uncharge(objcg, obj_full_size(s)); mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s), -obj_full_size(s)); @@ -379,11 +365,6 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig, } #else /* CONFIG_MEMCG_KMEM */ -static inline bool page_has_obj_cgroups(struct page *page) -{ - return false; -} - static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) { return NULL; From 87944e2992bd28098c6806086a1e96bb4d0e502b Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:29 -0800 Subject: [PATCH 053/118] mm: Introduce page memcg flags The lowest bit in page->memcg_data is used to distinguish between struct memory_cgroup pointer and a pointer to a objcgs array. All checks and modifications of this bit are open-coded. Let's formalize it using page memcg flags, defined in enum page_memcg_data_flags. Additional flags might be added later. 
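The mask arithmetic is worth spelling out: with the flags occupying the low bits, __NR_MEMCG_DATA_FLAGS is the first unused bit, and MEMCG_DATA_FLAGS_MASK = __NR_MEMCG_DATA_FLAGS - 1 covers all flag bits at once, so readers can strip every flag with a single AND. A worked example with hypothetical values (not from the kernel):

	/* One flag defined, as in this patch:                  */
	/* MEMCG_DATA_OBJCGS     = 1UL << 0  ->  0x1            */
	/* __NR_MEMCG_DATA_FLAGS = 1UL << 1  ->  0x2            */
	/* MEMCG_DATA_FLAGS_MASK = 0x2 - 1   ->  0x1            */

	unsigned long memcg_data = 0xffff8880045d2000UL | 0x1UL; /* tagged objcgs */
	void *ptr  = (void *)(memcg_data & ~0x1UL);	/* 0xffff8880045d2000 */
	int objcgs = !!(memcg_data & 0x1UL);		/* 1 */

Adding a flag only bumps __NR_MEMCG_DATA_FLAGS and the mask grows with it; pointer alignment guarantees the extra low bits are unused.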
Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Alexei Starovoitov Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Michal Hocko Link: https://lkml.kernel.org/r/20201027001657.3398190-4-guro@fb.com Link: https://lore.kernel.org/bpf/20201201215900.3569844-4-guro@fb.com --- include/linux/memcontrol.h | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c7ac0a5b8989..99a4841d658b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -343,6 +343,15 @@ struct mem_cgroup { extern struct mem_cgroup *root_mem_cgroup; +enum page_memcg_data_flags { + /* page->memcg_data is a pointer to an objcgs vector */ + MEMCG_DATA_OBJCGS = (1UL << 0), + /* the next bit after the last actual flag */ + __NR_MEMCG_DATA_FLAGS = (1UL << 1), +}; + +#define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1) + /* * page_memcg - get the memory cgroup associated with a page * @page: a pointer to the page struct @@ -404,13 +413,7 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) */ unsigned long memcg_data = READ_ONCE(page->memcg_data); - /* - * The lowest bit set means that memcg isn't a valid - * memcg pointer, but a obj_cgroups pointer. - * In this case the page is shared and doesn't belong - * to any specific memory cgroup. - */ - if (memcg_data & 0x1UL) + if (memcg_data & MEMCG_DATA_OBJCGS) return NULL; return (struct mem_cgroup *)memcg_data; @@ -429,7 +432,11 @@ static inline struct obj_cgroup **page_objcgs(struct page *page) */ static inline struct obj_cgroup **page_objcgs(struct page *page) { - return (struct obj_cgroup **)(READ_ONCE(page->memcg_data) & ~0x1UL); + unsigned long memcg_data = READ_ONCE(page->memcg_data); + + VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), page); + + return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } /* @@ -444,10 +451,10 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) { unsigned long memcg_data = READ_ONCE(page->memcg_data); - if (memcg_data && (memcg_data & 0x1UL)) - return (struct obj_cgroup **)(memcg_data & ~0x1UL); + if (!memcg_data || !(memcg_data & MEMCG_DATA_OBJCGS)) + return NULL; - return NULL; + return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } /* @@ -460,7 +467,8 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) static inline bool set_page_objcgs(struct page *page, struct obj_cgroup **objcgs) { - return !cmpxchg(&page->memcg_data, 0, (unsigned long)objcgs | 0x1UL); + return !cmpxchg(&page->memcg_data, 0, (unsigned long)objcgs | + MEMCG_DATA_OBJCGS); } #else static inline struct obj_cgroup **page_objcgs(struct page *page) From 18b2db3b0385226b71cb3288474fa5a6e4a45474 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:30 -0800 Subject: [PATCH 054/118] mm: Convert page kmemcg type to a page memcg flag PageKmemcg flag is currently defined as a page type (like buddy, offline, table and guard). Semantically it means that the page was accounted as kernel memory by the page allocator and has to be uncharged on release. As a side effect of defining the flag as a page type, the accounted page can't be mapped to userspace (look at page_has_type() and comments above). In particular, this blocks the accounting of vmalloc-backed memory used by some bpf maps, because these maps do map the memory to userspace.
One option is to fix it by complicating the access to page->mapcount, which provides some free bits for page->page_type. But it's way better to move this flag into page->memcg_data flags. Indeed, the flag makes no sense without memory cgroups enabled and, in particular, without the memory cgroup pointer set. This commit replaces PageKmemcg() and __SetPageKmemcg() with PageMemcgKmem() and an open-coded OR operation setting the memcg pointer with the MEMCG_DATA_KMEM bit. __ClearPageKmemcg() can be simply deleted, as the whole memcg_data is zeroed at once. As a bonus, on a !CONFIG_MEMCG build the PageMemcgKmem() check will be compiled out. Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Alexei Starovoitov Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Michal Hocko Link: https://lkml.kernel.org/r/20201027001657.3398190-5-guro@fb.com Link: https://lore.kernel.org/bpf/20201201215900.3569844-5-guro@fb.com --- include/linux/memcontrol.h | 37 +++++++++++++++++++++++++++++++++---- include/linux/page-flags.h | 11 ++--------- mm/memcontrol.c | 16 +++++----------- mm/page_alloc.c | 4 ++-- 4 files changed, 42 insertions(+), 26 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 99a4841d658b..7c9d43476166 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -346,8 +346,10 @@ enum page_memcg_data_flags { /* page->memcg_data is a pointer to an objcgs vector */ MEMCG_DATA_OBJCGS = (1UL << 0), + /* page has been accounted as a non-slab kernel page */ + MEMCG_DATA_KMEM = (1UL << 1), /* the next bit after the last actual flag */ - __NR_MEMCG_DATA_FLAGS = (1UL << 1), + __NR_MEMCG_DATA_FLAGS = (1UL << 2), }; #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1) @@ -369,8 +371,12 @@ enum page_memcg_data_flags { */ static inline struct mem_cgroup *page_memcg(struct page *page) { + unsigned long memcg_data = page->memcg_data; + VM_BUG_ON_PAGE(PageSlab(page), page); - return (struct mem_cgroup *)page->memcg_data; + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page); + + return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } /* @@ -387,7 +393,8 @@ static inline struct mem_cgroup *page_memcg_rcu(struct page *page) VM_BUG_ON_PAGE(PageSlab(page), page); WARN_ON_ONCE(!rcu_read_lock_held()); - return (struct mem_cgroup *)READ_ONCE(page->memcg_data); + return (struct mem_cgroup *)(READ_ONCE(page->memcg_data) & + ~MEMCG_DATA_FLAGS_MASK); } /* @@ -416,7 +423,21 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) if (memcg_data & MEMCG_DATA_OBJCGS) return NULL; - return (struct mem_cgroup *)memcg_data; + return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); +} + +/* + * PageMemcgKmem - check if the page has MemcgKmem flag set + * @page: a pointer to the page struct + * + * Checks if the page has MemcgKmem flag set. The caller must ensure that + * the page has an associated memory cgroup. It's not safe to call this function + * against some types of pages, e.g. slab pages.
+ */ +static inline bool PageMemcgKmem(struct page *page) +{ + VM_BUG_ON_PAGE(page->memcg_data & MEMCG_DATA_OBJCGS, page); + return page->memcg_data & MEMCG_DATA_KMEM; } #ifdef CONFIG_MEMCG_KMEM @@ -435,6 +456,7 @@ static inline struct obj_cgroup **page_objcgs(struct page *page) unsigned long memcg_data = READ_ONCE(page->memcg_data); VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), page); + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } @@ -454,6 +476,8 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) if (!memcg_data || !(memcg_data & MEMCG_DATA_OBJCGS)) return NULL; + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); + return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } @@ -1109,6 +1133,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) return NULL; } +static inline bool PageMemcgKmem(struct page *page) +{ + return false; +} + static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return true; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4f6ba9379112..fc0e1bd48e73 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -715,9 +715,8 @@ PAGEFLAG_FALSE(DoubleMap) #define PAGE_MAPCOUNT_RESERVE -128 #define PG_buddy 0x00000080 #define PG_offline 0x00000100 -#define PG_kmemcg 0x00000200 -#define PG_table 0x00000400 -#define PG_guard 0x00000800 +#define PG_table 0x00000200 +#define PG_guard 0x00000400 #define PageType(page, flag) \ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) @@ -768,12 +767,6 @@ PAGE_TYPE_OPS(Buddy, buddy) */ PAGE_TYPE_OPS(Offline, offline) -/* - * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on - * pages allocated with __GFP_ACCOUNT. It gets cleared on page free. - */ -PAGE_TYPE_OPS(Kmemcg, kmemcg) - /* * Marks pages in use as page tables. 
*/ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0054b4846770..e0366e306221 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3090,8 +3090,8 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) if (memcg && !mem_cgroup_is_root(memcg)) { ret = __memcg_kmem_charge(memcg, gfp, 1 << order); if (!ret) { - page->memcg_data = (unsigned long)memcg; - __SetPageKmemcg(page); + page->memcg_data = (unsigned long)memcg | + MEMCG_DATA_KMEM; return 0; } css_put(&memcg->css); @@ -3116,10 +3116,6 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) __memcg_kmem_uncharge(memcg, nr_pages); page->memcg_data = 0; css_put(&memcg->css); - - /* slab pages do not have PageKmemcg flag set */ - if (PageKmemcg(page)) - __ClearPageKmemcg(page); } static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) @@ -6877,12 +6873,10 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) nr_pages = compound_nr(page); ug->nr_pages += nr_pages; - if (!PageKmemcg(page)) { - ug->pgpgout++; - } else { + if (PageMemcgKmem(page)) ug->nr_kmem += nr_pages; - __ClearPageKmemcg(page); - } + else + ug->pgpgout++; ug->dummy_page = page; page->memcg_data = 0; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 271133b8243b..3c53018c9c61 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1214,7 +1214,7 @@ static __always_inline bool free_pages_prepare(struct page *page, * Do not let hwpoison pages hit pcplists/buddy * Untie memcg state and reset page's owner */ - if (memcg_kmem_enabled() && PageKmemcg(page)) + if (memcg_kmem_enabled() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); reset_page_owner(page, order); return false; @@ -1244,7 +1244,7 @@ static __always_inline bool free_pages_prepare(struct page *page, } if (PageMappingFlags(page)) page->mapping = NULL; - if (memcg_kmem_enabled() && PageKmemcg(page)) + if (memcg_kmem_enabled() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); if (check_free) bad += check_free_page(page); From ddf8503c7c434374a1112e89bcedfe1ccb3057df Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:31 -0800 Subject: [PATCH 055/118] bpf: Memcg-based memory accounting for bpf progs Include memory used by bpf programs into the memcg-based accounting. This includes the memory used by the programs themselves, auxiliary data, statistics and bpf line info. The memory cgroup of the process which loads the program gets charged.
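For context: GFP_KERNEL_ACCOUNT is defined in the gfp headers as (GFP_KERNEL | __GFP_ACCOUNT), and __GFP_ACCOUNT is the bit that makes an allocation visible to the memory controller. A minimal sketch of the behavior the one-flag conversion below relies on (illustrative only; alloc_charged is a made-up name, not part of the diff):

    #include <linux/slab.h>

    /* Charged to the memory cgroup of the allocating task; the charge
     * is dropped automatically when the object is freed with kfree().
     */
    static void *alloc_charged(size_t size)
    {
            return kzalloc(size, GFP_KERNEL_ACCOUNT);
    }

Program load always runs in the context of the loading process, so each converted allocation is charged to that process' memory cgroup, exactly as the commit message states.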
Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-6-guro@fb.com --- kernel/bpf/core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ff55cbcfbab4..2921f58c03a8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -77,7 +77,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; struct bpf_prog_aux *aux; struct bpf_prog *fp; @@ -86,7 +86,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag if (fp == NULL) return NULL; - aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); + aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT | gfp_extra_flags); if (aux == NULL) { vfree(fp); return NULL; } @@ -106,7 +106,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *prog; int cpu; @@ -138,7 +138,7 @@ int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo, sizeof(*prog->aux->jited_linfo), - GFP_KERNEL | __GFP_NOWARN); + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!prog->aux->jited_linfo) return -ENOMEM; @@ -219,7 +219,7 @@ void bpf_prog_free_linfo(struct bpf_prog *prog) struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; u32 pages, delta; int ret; From 48edc1f78aabeba35ed00e40c36f211de89e0090 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:32 -0800 Subject: [PATCH 056/118] bpf: Prepare for memcg-based memory accounting for bpf maps Bpf maps can be updated from an interrupt context and in such a case there is no process which can be charged. It makes the memory accounting of bpf maps non-trivial. Fortunately, after commit 4127c6504f25 ("mm: kmem: enable kernel memcg accounting from interrupt contexts") and commit b87d8cefe43c ("mm, memcg: rework remote charging API to support nesting") it's finally possible. To make the ownership model simple and consistent, when the map is created, the memory cgroup of the current process is recorded. All subsequent allocations related to the bpf map are charged to the same memory cgroup. It includes allocations made by any process (even if it belongs to a different cgroup) and from interrupts. This commit introduces 3 new helpers, which will be used by following commits to enable the accounting of bpf map memory: - bpf_map_kmalloc_node() - bpf_map_kzalloc() - bpf_map_alloc_percpu() They are wrapping popular memory allocation functions. They set the active memory cgroup to the map's memory cgroup and add __GFP_ACCOUNT to the passed gfp flags. Then they call into the corresponding memory allocation function and restore the original active memory cgroup. These helpers are supposed to be used everywhere except the map creation path.
During map creation, when the map structure itself is allocated, it cannot be passed to those helpers; in those cases the default memory allocation functions are used directly with the __GFP_ACCOUNT flag. Signed-off-by: Roman Gushchin Acked-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201201215900.3569844-7-guro@fb.com --- include/linux/bpf.h | 34 ++++++++++++++++++++++++ kernel/bpf/syscall.c | 63 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e1bcb6d7345c..e1f2c95c15ec 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -20,6 +20,8 @@ #include #include #include +#include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -37,6 +39,7 @@ struct bpf_iter_aux_info; struct bpf_local_storage; struct bpf_local_storage_map; struct kobject; +struct mem_cgroup; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -161,6 +164,9 @@ struct bpf_map { u32 btf_value_type_id; struct btf *btf; struct bpf_map_memory memory; +#ifdef CONFIG_MEMCG_KMEM + struct mem_cgroup *memcg; +#endif char name[BPF_OBJ_NAME_LEN]; u32 btf_vmlinux_value_type_id; bool bypass_spec_v1; @@ -1240,6 +1246,34 @@ int generic_map_delete_batch(struct bpf_map *map, struct bpf_map *bpf_map_get_curr_or_next(u32 *id); struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); +#ifdef CONFIG_MEMCG_KMEM +void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, + int node); +void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags); +void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, + size_t align, gfp_t flags); +#else +static inline void * +bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, + int node) +{ + return kmalloc_node(size, flags, node); +} + +static inline void * +bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) +{ + return kzalloc(size, flags); +} + +static inline void __percpu * +bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, + gfp_t flags) +{ + return __alloc_percpu_gfp(size, align, flags); +} +#endif + extern int sysctl_unprivileged_bpf_disabled; static inline bool bpf_allow_ptr_leaks(void) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f3fe9f53f93c..dedbf6d4cd84 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -31,6 +31,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -456,6 +457,65 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) __release(&map_idr_lock); } +#ifdef CONFIG_MEMCG_KMEM +static void bpf_map_save_memcg(struct bpf_map *map) +{ + map->memcg = get_mem_cgroup_from_mm(current->mm); +} + +static void bpf_map_release_memcg(struct bpf_map *map) +{ + mem_cgroup_put(map->memcg); +} + +void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, + int node) +{ + struct mem_cgroup *old_memcg; + void *ptr; + + old_memcg = set_active_memcg(map->memcg); + ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node); + set_active_memcg(old_memcg); + + return ptr; +} + +void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) +{ + struct mem_cgroup *old_memcg; + void *ptr; + + old_memcg = set_active_memcg(map->memcg); + ptr = kzalloc(size, flags | __GFP_ACCOUNT); + set_active_memcg(old_memcg); + + return ptr; +} + +void __percpu
*bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, + size_t align, gfp_t flags) +{ + struct mem_cgroup *old_memcg; + void __percpu *ptr; + + old_memcg = set_active_memcg(map->memcg); + ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT); + set_active_memcg(old_memcg); + + return ptr; +} + +#else +static void bpf_map_save_memcg(struct bpf_map *map) +{ +} + +static void bpf_map_release_memcg(struct bpf_map *map) +{ +} +#endif + /* called from workqueue */ static void bpf_map_free_deferred(struct work_struct *work) { @@ -464,6 +524,7 @@ static void bpf_map_free_deferred(struct work_struct *work) bpf_map_charge_move(&mem, &map->memory); security_bpf_map_free(map); + bpf_map_release_memcg(map); /* implementation dependent freeing */ map->ops->map_free(map); bpf_map_charge_finish(&mem); @@ -875,6 +936,8 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map_sec; + bpf_map_save_memcg(map); + err = bpf_map_new_fd(map, f_flags); if (err < 0) { /* failed to allocate fd. From d5299b67dd59445902cd30cbc60a03c869cf1adb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:33 -0800 Subject: [PATCH 057/118] bpf: Memcg-based memory accounting for bpf maps This patch enables memcg-based memory accounting for memory allocated by __bpf_map_area_alloc(), which is used by many types of bpf maps for large initial memory allocations. Please note, that __bpf_map_area_alloc() should not be used outside of map creation paths without setting the active memory cgroup to the map's memory cgroup. Following patches in the series will refine the accounting for some of the map types. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201201215900.3569844-8-guro@fb.com --- kernel/bpf/syscall.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dedbf6d4cd84..dff3a5f62d7a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -268,6 +268,10 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, return err; } +/* Please, do not use this function outside from the map creation path + * (e.g. in map update path) without taking care of setting the active + * memory cgroup (see at bpf_map_kmalloc_node() for example). + */ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) { /* We really just want to fail instead of triggering OOM killer @@ -280,7 +284,7 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) * __GFP_RETRY_MAYFAIL to avoid such situations. */ - const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO; + const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_ACCOUNT; unsigned int flags = 0; unsigned long align = 1; void *area; From 6d192c7938b7e53a6bb55b90b86bd02ea0153731 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:34 -0800 Subject: [PATCH 058/118] bpf: Refine memcg-based memory accounting for arraymap maps Include percpu arrays and auxiliary data into the memcg-based memory accounting. 
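This conversion pattern repeats for most map types in the rest of the series. The reason for going through the bpf_map_* wrappers rather than simply adding __GFP_ACCOUNT to the existing call is which cgroup receives the charge; a before/after sketch using the identifiers from the patches above (the "naive" line with __GFP_ACCOUNT is hypothetical, shown only for contrast):

    /* naive: charged to whatever task happens to be current, which is
     * wrong for allocations triggered by map updates performed from
     * other processes or from interrupt context
     */
    ptr = __alloc_percpu_gfp(array->elem_size, 8,
                             GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);

    /* via the wrapper: charged to the memcg recorded in map->memcg at
     * map creation time, independent of who triggers the allocation
     */
    ptr = bpf_map_alloc_percpu(&array->map, array->elem_size, 8,
                               GFP_USER | __GFP_NOWARN);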
Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201201215900.3569844-9-guro@fb.com --- kernel/bpf/arraymap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index c6c81eceb68f..d837e0603c89 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -34,8 +34,8 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) int i; for (i = 0; i < array->map.max_entries; i++) { - ptr = __alloc_percpu_gfp(array->elem_size, 8, - GFP_USER | __GFP_NOWARN); + ptr = bpf_map_alloc_percpu(&array->map, array->elem_size, 8, + GFP_USER | __GFP_NOWARN); if (!ptr) { bpf_array_free_percpu(array); return -ENOMEM; @@ -1018,7 +1018,7 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) struct bpf_array_aux *aux; struct bpf_map *map; - aux = kzalloc(sizeof(*aux), GFP_KERNEL); + aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT); if (!aux) return ERR_PTR(-ENOMEM); From e88cc05b61f3fe8bd4bd8ce1a0a2d03357225305 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:35 -0800 Subject: [PATCH 059/118] bpf: Refine memcg-based memory accounting for cpumap maps Include metadata and percpu data into the memcg-based memory accounting. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201201215900.3569844-10-guro@fb.com --- kernel/bpf/cpumap.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index c61a23b564aa..90b949666605 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -97,7 +97,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); - cmap = kzalloc(sizeof(*cmap), GFP_USER); + cmap = kzalloc(sizeof(*cmap), GFP_USER | __GFP_ACCOUNT); if (!cmap) return ERR_PTR(-ENOMEM); @@ -412,7 +412,8 @@ static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) } static struct bpf_cpu_map_entry * -__cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) +__cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, + u32 cpu) { int numa, err, i, fd = value->bpf_prog.fd; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; @@ -422,13 +423,13 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) /* Have map->numa_node, but choose node of redirect target CPU */ numa = cpu_to_node(cpu); - rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa); + rcpu = bpf_map_kmalloc_node(map, sizeof(*rcpu), gfp | __GFP_ZERO, numa); if (!rcpu) return NULL; /* Alloc percpu bulkq */ - rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq), - sizeof(void *), gfp); + rcpu->bulkq = bpf_map_alloc_percpu(map, sizeof(*rcpu->bulkq), + sizeof(void *), gfp); if (!rcpu->bulkq) goto free_rcu; @@ -438,7 +439,8 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) } /* Alloc queue */ - rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); + rcpu->queue = bpf_map_kmalloc_node(map, sizeof(*rcpu->queue), gfp, + numa); if (!rcpu->queue) goto free_bulkq; @@ -447,7 +449,7 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) goto free_queue; rcpu->cpu = cpu; - rcpu->map_id = map_id; + rcpu->map_id = map->id; rcpu->value.qsize = value->qsize; if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd)) @@ -455,7 +457,8 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) /* Setup kthread */ 
rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, - "cpumap/%d/map:%d", cpu, map_id); + "cpumap/%d/map:%d", cpu, + map->id); if (IS_ERR(rcpu->kthread)) goto free_prog; @@ -571,7 +574,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, rcpu = NULL; /* Same as deleting */ } else { /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ - rcpu = __cpu_map_entry_alloc(&cpumap_value, key_cpu, map->id); + rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu); if (!rcpu) return -ENOMEM; rcpu->cmap = cmap; From 3a61c7c58b3012ac28c166801842615ca99b49c5 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:36 -0800 Subject: [PATCH 060/118] bpf: Memcg-based memory accounting for cgroup storage maps Account memory used by cgroup storage maps including metadata structures. Account the percpu memory for the percpu flavor of cgroup storage. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201201215900.3569844-11-guro@fb.com --- kernel/bpf/local_storage.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 571bb351ed3b..74dcee8926e5 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -164,10 +164,10 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key, return 0; } - new = kmalloc_node(sizeof(struct bpf_storage_buffer) + - map->value_size, - __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, - map->numa_node); + new = bpf_map_kmalloc_node(map, sizeof(struct bpf_storage_buffer) + + map->value_size, + __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, + map->numa_node); if (!new) return -ENOMEM; @@ -313,7 +313,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) return ERR_PTR(ret); map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), - __GFP_ZERO | GFP_USER, numa_node); + __GFP_ZERO | GFP_USER | __GFP_ACCOUNT, numa_node); if (!map) { bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); @@ -496,9 +496,9 @@ static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages) struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { + const gfp_t gfp = __GFP_ZERO | GFP_USER; struct bpf_cgroup_storage *storage; struct bpf_map *map; - gfp_t flags; size_t size; u32 pages; @@ -511,20 +511,19 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, if (bpf_map_charge_memlock(map, pages)) return ERR_PTR(-EPERM); - storage = kmalloc_node(sizeof(struct bpf_cgroup_storage), - __GFP_ZERO | GFP_USER, map->numa_node); + storage = bpf_map_kmalloc_node(map, sizeof(struct bpf_cgroup_storage), + gfp, map->numa_node); if (!storage) goto enomem; - flags = __GFP_ZERO | GFP_USER; - if (stype == BPF_CGROUP_STORAGE_SHARED) { - storage->buf = kmalloc_node(size, flags, map->numa_node); + storage->buf = bpf_map_kmalloc_node(map, size, gfp, + map->numa_node); if (!storage->buf) goto enomem; check_and_init_map_lock(map, storage->buf->data); } else { - storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags); + storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp); if (!storage->percpu_buf) goto enomem; } From 1440290adf7bb27602bbb7d8b2dc3d903ed3c6c9 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:37 -0800 Subject: [PATCH 061/118] bpf: Refine memcg-based memory accounting for devmap maps Include map metadata and the node size (struct 
bpf_dtab_netdev) into the accounting. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201201215900.3569844-12-guro@fb.com --- kernel/bpf/devmap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 2b5ca93c17de..b43ab247302d 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -175,7 +175,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); - dtab = kzalloc(sizeof(*dtab), GFP_USER); + dtab = kzalloc(sizeof(*dtab), GFP_USER | __GFP_ACCOUNT); if (!dtab) return ERR_PTR(-ENOMEM); @@ -602,8 +602,9 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_prog *prog = NULL; struct bpf_dtab_netdev *dev; - dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, - dtab->map.numa_node); + dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev), + GFP_ATOMIC | __GFP_NOWARN, + dtab->map.numa_node); if (!dev) return ERR_PTR(-ENOMEM); From 881456811a33b9d3952897f4d01ee4d74fa2f30e Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:38 -0800 Subject: [PATCH 062/118] bpf: Refine memcg-based memory accounting for hashtab maps Include percpu objects and the size of map metadata into the accounting. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201201215900.3569844-13-guro@fb.com --- kernel/bpf/hashtab.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index ec46266aaf1c..bf70fb3ed9c1 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -292,7 +292,8 @@ static int prealloc_init(struct bpf_htab *htab) u32 size = round_up(htab->map.value_size, 8); void __percpu *pptr; - pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN); + pptr = bpf_map_alloc_percpu(&htab->map, size, 8, + GFP_USER | __GFP_NOWARN); if (!pptr) goto free_elems; htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, @@ -346,8 +347,8 @@ static int alloc_extra_elems(struct bpf_htab *htab) struct pcpu_freelist_node *l; int cpu; - pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8, - GFP_USER | __GFP_NOWARN); + pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8, + GFP_USER | __GFP_NOWARN); if (!pptr) return -ENOMEM; @@ -444,7 +445,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) int err, i; u64 cost; - htab = kzalloc(sizeof(*htab), GFP_USER); + htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT); if (!htab) return ERR_PTR(-ENOMEM); @@ -502,8 +503,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) goto free_charge; for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) { - htab->map_locked[i] = __alloc_percpu_gfp(sizeof(int), - sizeof(int), GFP_USER); + htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map, + sizeof(int), + sizeof(int), + GFP_USER); if (!htab->map_locked[i]) goto free_map_locked; } @@ -925,8 +928,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new = ERR_PTR(-E2BIG); goto dec_count; } - l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, - htab->map.numa_node); + l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, + GFP_ATOMIC | __GFP_NOWARN, + htab->map.numa_node); if (!l_new) { l_new = ERR_PTR(-ENOMEM); goto dec_count; @@ -942,8 +946,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, pptr = 
htab_elem_get_ptr(l_new, key_size); } else { /* alloc_percpu zero-fills */ - pptr = __alloc_percpu_gfp(size, 8, - GFP_ATOMIC | __GFP_NOWARN); + pptr = bpf_map_alloc_percpu(&htab->map, size, 8, + GFP_ATOMIC | __GFP_NOWARN); if (!pptr) { kfree(l_new); l_new = ERR_PTR(-ENOMEM); From 353e7af4bf5e7247c35e9ba5beab42904f1b3499 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:39 -0800 Subject: [PATCH 063/118] bpf: Memcg-based memory accounting for lpm_trie maps Include lpm trie and lpm trie node objects into the memcg-based memory accounting. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201201215900.3569844-14-guro@fb.com --- kernel/bpf/lpm_trie.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 00e32f2ec3e6..1a6981203d7f 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -282,8 +282,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, if (value) size += trie->map.value_size; - node = kmalloc_node(size, GFP_ATOMIC | __GFP_NOWARN, - trie->map.numa_node); + node = bpf_map_kmalloc_node(&trie->map, size, GFP_ATOMIC | __GFP_NOWARN, + trie->map.numa_node); if (!node) return NULL; @@ -557,7 +557,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) attr->value_size > LPM_VAL_SIZE_MAX) return ERR_PTR(-EINVAL); - trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN); + trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); if (!trie) return ERR_PTR(-ENOMEM); From be4035c734d12918866c5eb2c496d420aa80adeb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:40 -0800 Subject: [PATCH 064/118] bpf: Memcg-based memory accounting for bpf ringbuffer Enable the memcg-based memory accounting for the memory used by the bpf ringbuffer. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201201215900.3569844-15-guro@fb.com --- kernel/bpf/ringbuf.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 31cb04a4dd2d..8983a46f6580 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -60,8 +60,8 @@ struct bpf_ringbuf_hdr { static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) { - const gfp_t flags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | - __GFP_ZERO; + const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL | + __GFP_NOWARN | __GFP_ZERO; int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES; int nr_data_pages = data_sz >> PAGE_SHIFT; int nr_pages = nr_meta_pages + nr_data_pages; @@ -88,10 +88,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) * user-space implementations significantly. 
*/ array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages); - if (array_size > PAGE_SIZE) - pages = vmalloc_node(array_size, numa_node); - else - pages = kmalloc_node(array_size, flags, numa_node); + pages = bpf_map_area_alloc(array_size, numa_node); if (!pages) return NULL; @@ -167,7 +164,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) return ERR_PTR(-E2BIG); #endif - rb_map = kzalloc(sizeof(*rb_map), GFP_USER); + rb_map = kzalloc(sizeof(*rb_map), GFP_USER | __GFP_ACCOUNT); if (!rb_map) return ERR_PTR(-ENOMEM); From e9aae8beba825e4670463ddcf420b954f18d5ced Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:41 -0800 Subject: [PATCH 065/118] bpf: Memcg-based memory accounting for bpf local storage maps Account memory used by bpf local storage maps: per-socket, per-inode and per-task storages. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201201215900.3569844-16-guro@fb.com --- kernel/bpf/bpf_local_storage.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 5d3a7af9ba9b..023a3eaa4d74 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -67,7 +67,8 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, if (charge_mem && mem_charge(smap, owner, smap->elem_size)) return NULL; - selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN); + selem = bpf_map_kzalloc(&smap->map, smap->elem_size, + GFP_ATOMIC | __GFP_NOWARN); if (selem) { if (value) memcpy(SDATA(selem)->data, value, smap->map.value_size); @@ -264,7 +265,8 @@ int bpf_local_storage_alloc(void *owner, if (err) return err; - storage = kzalloc(sizeof(*storage), GFP_ATOMIC | __GFP_NOWARN); + storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), + GFP_ATOMIC | __GFP_NOWARN); if (!storage) { err = -ENOMEM; goto uncharge; @@ -546,7 +548,7 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) u64 cost; int ret; - smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN); + smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); if (!smap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); @@ -564,7 +566,7 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) } smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, - GFP_USER | __GFP_NOWARN); + GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); if (!smap->buckets) { bpf_map_charge_finish(&smap->map.memory); kfree(smap); From 7846dd9f835e248901a9f77a43745f8f1de04741 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:42 -0800 Subject: [PATCH 066/118] bpf: Refine memcg-based memory accounting for sockmap and sockhash maps Include internal metadata into the memcg-based memory accounting. Also include the memory allocated on updating an element. 
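The practical effect of charging update-path allocations to the map's memcg can be observed from user space with cgroup v2: the map creator's memory.current grows even when a process in a different cgroup performs the updates. A small sketch of such a check (the cgroup v2 mount point and the cgroup name are assumptions of the example, not of the patch):

    #include <stdio.h>

    /* Read a cgroup's memory.current, e.g. before and after filling a
     * sockhash from a task in another cgroup; returns -1 on error.
     */
    static long read_memory_current(const char *cgroup_name)
    {
            char path[256];
            long bytes = -1;
            FILE *f;

            snprintf(path, sizeof(path),
                     "/sys/fs/cgroup/%s/memory.current", cgroup_name);
            f = fopen(path, "r");
            if (!f)
                    return -1;
            if (fscanf(f, "%ld", &bytes) != 1)
                    bytes = -1;
            fclose(f);
            return bytes;
    }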
Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201201215900.3569844-17-guro@fb.com --- net/core/sock_map.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index ddc899e83313..153652a582ee 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -39,7 +39,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); - stab = kzalloc(sizeof(*stab), GFP_USER); + stab = kzalloc(sizeof(*stab), GFP_USER | __GFP_ACCOUNT); if (!stab) return ERR_PTR(-ENOMEM); @@ -975,8 +975,9 @@ static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab, } } - new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, - htab->map.numa_node); + new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, + GFP_ATOMIC | __GFP_NOWARN, + htab->map.numa_node); if (!new) { atomic_dec(&htab->count); return ERR_PTR(-ENOMEM); @@ -1116,7 +1117,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) if (attr->key_size > MAX_BPF_STACK) return ERR_PTR(-E2BIG); - htab = kzalloc(sizeof(*htab), GFP_USER); + htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT); if (!htab) return ERR_PTR(-ENOMEM); From 28e1dcdef0cbf5ff79aceb149c7ab14589598af0 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:43 -0800 Subject: [PATCH 067/118] bpf: Refine memcg-based memory accounting for xskmap maps Extend xskmap memory accounting to include the memory taken by the xsk_map_node structure. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201201215900.3569844-18-guro@fb.com --- net/xdp/xskmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 66231ba6c348..9fff1e6dc9cd 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -16,7 +16,8 @@ static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, { struct xsk_map_node *node; - node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); + node = bpf_map_kzalloc(&map->map, sizeof(*node), + GFP_ATOMIC | __GFP_NOWARN); if (!node) return ERR_PTR(-ENOMEM); From 1bc5975613ed155fc57ee321041d3463e580b4a3 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:44 -0800 Subject: [PATCH 068/118] bpf: Eliminate rlimit-based memory accounting for arraymap maps Do not use rlimit-based memory accounting for arraymap maps. It has been replaced with the memcg-based memory accounting. 
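This and the following rlimit-removal patches (through patch 81) all apply the same mechanical transformation, so a condensed sketch covers the whole run (the identifiers here are generic placeholders, not taken from any single patch):

    /* before: precompute a worst-case byte cost, charge it against
     * RLIMIT_MEMLOCK, and unwind the charge on every error path
     */
    cost = (u64)max_entries * elem_size + sizeof(*map);
    err = bpf_map_charge_init(&map->memory, cost);
    if (err)
            return ERR_PTR(err);
    ptr = bpf_map_area_alloc(cost, numa_node);
    if (!ptr) {
            bpf_map_charge_finish(&map->memory);
            return ERR_PTR(-ENOMEM);
    }

    /* after: the allocation is charged to the map's memcg as it
     * happens, so there is no cost to precompute and nothing to
     * unwind on failure
     */
    size = (u64)max_entries * elem_size + sizeof(*map);
    ptr = bpf_map_area_alloc(size, numa_node);
    if (!ptr)
            return ERR_PTR(-ENOMEM);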
Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-19-guro@fb.com --- kernel/bpf/arraymap.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index d837e0603c89..1f8453343bf2 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -81,11 +81,10 @@ int array_map_alloc_check(union bpf_attr *attr) static struct bpf_map *array_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; - int ret, numa_node = bpf_map_attr_numa_node(attr); + int numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; bool bypass_spec_v1 = bpf_bypass_spec_v1(); - u64 cost, array_size, mask64; - struct bpf_map_memory mem; + u64 array_size, mask64; struct bpf_array *array; elem_size = round_up(attr->value_size, 8); @@ -126,44 +125,29 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) } } - /* make sure there is no u32 overflow later in round_up() */ - cost = array_size; - if (percpu) - cost += (u64)attr->max_entries * elem_size * num_possible_cpus(); - - ret = bpf_map_charge_init(&mem, cost); - if (ret < 0) - return ERR_PTR(ret); - /* allocate all map elements and zero-initialize them */ if (attr->map_flags & BPF_F_MMAPABLE) { void *data; /* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */ data = bpf_map_area_mmapable_alloc(array_size, numa_node); - if (!data) { - bpf_map_charge_finish(&mem); + if (!data) return ERR_PTR(-ENOMEM); - } array = data + PAGE_ALIGN(sizeof(struct bpf_array)) - offsetof(struct bpf_array, value); } else { array = bpf_map_area_alloc(array_size, numa_node); } - if (!array) { - bpf_map_charge_finish(&mem); + if (!array) return ERR_PTR(-ENOMEM); - } array->index_mask = index_mask; array->map.bypass_spec_v1 = bypass_spec_v1; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); - bpf_map_charge_move(&array->map.memory, &mem); array->elem_size = elem_size; if (percpu && bpf_array_alloc_percpu(array)) { - bpf_map_charge_finish(&array->map.memory); bpf_map_area_free(array); return ERR_PTR(-ENOMEM); } From f043733f31e5e12c6254045a03e519290543fa1b Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:45 -0800 Subject: [PATCH 069/118] bpf: Eliminate rlimit-based memory accounting for bpf_struct_ops maps Do not use rlimit-based memory accounting for bpf_struct_ops maps. It has been replaced with the memcg-based memory accounting. 
Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-20-guro@fb.com --- kernel/bpf/bpf_struct_ops.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 4c3b543bb33b..1a666a975416 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -548,12 +548,10 @@ static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) { const struct bpf_struct_ops *st_ops; - size_t map_total_size, st_map_size; + size_t st_map_size; struct bpf_struct_ops_map *st_map; const struct btf_type *t, *vt; - struct bpf_map_memory mem; struct bpf_map *map; - int err; if (!bpf_capable()) return ERR_PTR(-EPERM); @@ -573,20 +571,11 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) * struct bpf_struct_ops_tcp_congestions_ops */ (vt->size - sizeof(struct bpf_struct_ops_value)); - map_total_size = st_map_size + - /* uvalue */ - sizeof(vt->size) + - /* struct bpf_progs **progs */ - btf_type_vlen(t) * sizeof(struct bpf_prog *); - err = bpf_map_charge_init(&mem, map_total_size); - if (err < 0) - return ERR_PTR(err); st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE); - if (!st_map) { - bpf_map_charge_finish(&mem); + if (!st_map) return ERR_PTR(-ENOMEM); - } + st_map->st_ops = st_ops; map = &st_map->map; @@ -597,14 +586,12 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); if (!st_map->uvalue || !st_map->progs || !st_map->image) { bpf_struct_ops_map_free(map); - bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); } mutex_init(&st_map->lock); set_vm_flush_reset_perms(st_map->image); bpf_map_init_from_attr(map, attr); - bpf_map_charge_move(&map->memory, &mem); return map; } From 711cabaf1432fbec4a5f9ffcfbfe2ed7a78cd096 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:46 -0800 Subject: [PATCH 070/118] bpf: Eliminate rlimit-based memory accounting for cpumap maps Do not use rlimit-based memory accounting for cpumap maps. It has been replaced with the memcg-based memory accounting. 
Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-21-guro@fb.com --- kernel/bpf/cpumap.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 90b949666605..747313698178 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -84,8 +84,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) u32 value_size = attr->value_size; struct bpf_cpu_map *cmap; int err = -ENOMEM; - u64 cost; - int ret; if (!bpf_capable()) return ERR_PTR(-EPERM); @@ -109,26 +107,14 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) goto free_cmap; } - /* make sure page count doesn't overflow */ - cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); - - /* Notice returns -EPERM on if map size is larger than memlock limit */ - ret = bpf_map_charge_init(&cmap->map.memory, cost); - if (ret) { - err = ret; - goto free_cmap; - } - /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *), cmap->map.numa_node); if (!cmap->cpu_map) - goto free_charge; + goto free_cmap; return &cmap->map; -free_charge: - bpf_map_charge_finish(&cmap->map.memory); free_cmap: kfree(cmap); return ERR_PTR(err); From 087b0d39fe22dcc2ddcef7ed699c658f0e725bac Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:47 -0800 Subject: [PATCH 071/118] bpf: Eliminate rlimit-based memory accounting for cgroup storage maps Do not use rlimit-based memory accounting for cgroup storage maps. It has been replaced with the memcg-based memory accounting. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-22-guro@fb.com --- kernel/bpf/local_storage.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 74dcee8926e5..2d4f9ac12377 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -287,8 +287,6 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct bpf_cgroup_storage_map *map; - struct bpf_map_memory mem; - int ret; if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) && attr->key_size != sizeof(__u64)) @@ -308,18 +306,10 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) /* max_entries is not used and enforced to be 0 */ return ERR_PTR(-EINVAL); - ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map)); - if (ret < 0) - return ERR_PTR(ret); - map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), __GFP_ZERO | GFP_USER | __GFP_ACCOUNT, numa_node); - if (!map) { - bpf_map_charge_finish(&mem); + if (!map) return ERR_PTR(-ENOMEM); - } - - bpf_map_charge_move(&map->map.memory, &mem); /* copy mandatory map attributes */ bpf_map_init_from_attr(&map->map, attr); @@ -508,9 +498,6 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, size = bpf_cgroup_storage_calculate_size(map, &pages); - if (bpf_map_charge_memlock(map, pages)) - return ERR_PTR(-EPERM); - storage = bpf_map_kmalloc_node(map, sizeof(struct bpf_cgroup_storage), gfp, map->numa_node); if (!storage) @@ -533,7 +520,6 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, return storage; enomem: - bpf_map_uncharge_memlock(map, 
pages); kfree(storage); return ERR_PTR(-ENOMEM); } @@ -560,16 +546,11 @@ void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) { enum bpf_cgroup_storage_type stype; struct bpf_map *map; - u32 pages; if (!storage) return; map = &storage->map->map; - - bpf_cgroup_storage_calculate_size(map, &pages); - bpf_map_uncharge_memlock(map, pages); - stype = cgroup_storage_type(map); if (stype == BPF_CGROUP_STORAGE_SHARED) call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu); From 844f157f6c0a905d039d2e20212ab3231f2e5eaf Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:48 -0800 Subject: [PATCH 072/118] bpf: Eliminate rlimit-based memory accounting for devmap maps Do not use rlimit-based memory accounting for devmap maps. It has been replaced with the memcg-based memory accounting. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-23-guro@fb.com --- kernel/bpf/devmap.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index b43ab247302d..f6e9c68afdd4 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -109,8 +109,6 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { u32 valsize = attr->value_size; - u64 cost = 0; - int err; /* check sanity of attributes. 2 value sizes supported: * 4 bytes: ifindex @@ -135,21 +133,13 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (!dtab->n_buckets) /* Overflow check */ return -EINVAL; - cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets; - } else { - cost += (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); } - /* if map size is larger than memlock limit, reject it */ - err = bpf_map_charge_init(&dtab->map.memory, cost); - if (err) - return -EINVAL; - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets, dtab->map.numa_node); if (!dtab->dev_index_head) - goto free_charge; + return -ENOMEM; spin_lock_init(&dtab->index_lock); } else { @@ -157,14 +147,10 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) - goto free_charge; + return -ENOMEM; } return 0; - -free_charge: - bpf_map_charge_finish(&dtab->map.memory); - return -ENOMEM; } static struct bpf_map *dev_map_alloc(union bpf_attr *attr) From 755e5d55367af5ff75a4db9b6cf439416878e2c7 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:49 -0800 Subject: [PATCH 073/118] bpf: Eliminate rlimit-based memory accounting for hashtab maps Do not use rlimit-based memory accounting for hashtab maps. It has been replaced with the memcg-based memory accounting. 
Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-24-guro@fb.com --- kernel/bpf/hashtab.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index bf70fb3ed9c1..fe7a0733a63a 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -443,7 +443,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; int err, i; - u64 cost; htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT); if (!htab) @@ -481,26 +480,12 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->n_buckets > U32_MAX / sizeof(struct bucket)) goto free_htab; - cost = (u64) htab->n_buckets * sizeof(struct bucket) + - (u64) htab->elem_size * htab->map.max_entries; - - if (percpu) - cost += (u64) round_up(htab->map.value_size, 8) * - num_possible_cpus() * htab->map.max_entries; - else - cost += (u64) htab->elem_size * num_possible_cpus(); - - /* if map size is larger than memlock limit, reject it */ - err = bpf_map_charge_init(&htab->map.memory, cost); - if (err) - goto free_htab; - err = -ENOMEM; htab->buckets = bpf_map_area_alloc(htab->n_buckets * sizeof(struct bucket), htab->map.numa_node); if (!htab->buckets) - goto free_charge; + goto free_htab; for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) { htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map, @@ -541,8 +526,6 @@ free_map_locked: for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) free_percpu(htab->map_locked[i]); bpf_map_area_free(htab->buckets); -free_charge: - bpf_map_charge_finish(&htab->map.memory); free_htab: lockdep_unregister_key(&htab->lockdep_key); kfree(htab); From cbddcb574d419fd5b70c5f87ba733feec6147aeb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:50 -0800 Subject: [PATCH 074/118] bpf: Eliminate rlimit-based memory accounting for lpm_trie maps Do not use rlimit-based memory accounting for lpm_trie maps. It has been replaced with the memcg-based memory accounting. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-25-guro@fb.com --- kernel/bpf/lpm_trie.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1a6981203d7f..cec792a17e5f 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -540,8 +540,6 @@ out: static struct bpf_map *trie_alloc(union bpf_attr *attr) { struct lpm_trie *trie; - u64 cost = sizeof(*trie), cost_per_node; - int ret; if (!bpf_capable()) return ERR_PTR(-EPERM); @@ -567,20 +565,9 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) offsetof(struct bpf_lpm_trie_key, data); trie->max_prefixlen = trie->data_size * 8; - cost_per_node = sizeof(struct lpm_trie_node) + - attr->value_size + trie->data_size; - cost += (u64) attr->max_entries * cost_per_node; - - ret = bpf_map_charge_init(&trie->map.memory, cost); - if (ret) - goto out_err; - spin_lock_init(&trie->lock); return &trie->map; -out_err: - kfree(trie); - return ERR_PTR(ret); } static void trie_free(struct bpf_map *map) From a37fb7ef24a475012547fa28f0148d2e538ad5d4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:51 -0800 Subject: [PATCH 075/118] bpf: Eliminate rlimit-based memory accounting for queue_stack_maps maps Do not use rlimit-based memory accounting for queue_stack maps. 
It has been replaced with the memcg-based memory accounting. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-26-guro@fb.com --- kernel/bpf/queue_stack_maps.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 0ee2347ba510..f9c734aaa990 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -66,29 +66,21 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr) static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) { - int ret, numa_node = bpf_map_attr_numa_node(attr); - struct bpf_map_memory mem = {0}; + int numa_node = bpf_map_attr_numa_node(attr); struct bpf_queue_stack *qs; - u64 size, queue_size, cost; + u64 size, queue_size; size = (u64) attr->max_entries + 1; - cost = queue_size = sizeof(*qs) + size * attr->value_size; - - ret = bpf_map_charge_init(&mem, cost); - if (ret < 0) - return ERR_PTR(ret); + queue_size = sizeof(*qs) + size * attr->value_size; qs = bpf_map_area_alloc(queue_size, numa_node); - if (!qs) { - bpf_map_charge_finish(&mem); + if (!qs) return ERR_PTR(-ENOMEM); - } memset(qs, 0, sizeof(*qs)); bpf_map_init_from_attr(&qs->map, attr); - bpf_map_charge_move(&qs->map.memory, &mem); qs->size = size; raw_spin_lock_init(&qs->lock); From db54330d3e137c23bea26784cecf5ae17e72e4c6 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:52 -0800 Subject: [PATCH 076/118] bpf: Eliminate rlimit-based memory accounting for reuseport_array maps Do not use rlimit-based memory accounting for reuseport_array maps. It has been replaced with the memcg-based memory accounting. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-27-guro@fb.com --- kernel/bpf/reuseport_array.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index a55cd542f2ce..4838922f723d 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -150,9 +150,8 @@ static void reuseport_array_free(struct bpf_map *map) static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) { - int err, numa_node = bpf_map_attr_numa_node(attr); + int numa_node = bpf_map_attr_numa_node(attr); struct reuseport_array *array; - struct bpf_map_memory mem; u64 array_size; if (!bpf_capable()) @@ -161,20 +160,13 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) array_size = sizeof(*array); array_size += (u64)attr->max_entries * sizeof(struct sock *); - err = bpf_map_charge_init(&mem, array_size); - if (err) - return ERR_PTR(err); - /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(array_size, numa_node); - if (!array) { - bpf_map_charge_finish(&mem); + if (!array) return ERR_PTR(-ENOMEM); - } /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); - bpf_map_charge_move(&array->map.memory, &mem); return &array->map; } From abbdd0813f347f9d1eea376409a68295318b2ef5 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:53 -0800 Subject: [PATCH 077/118] bpf: Eliminate rlimit-based memory accounting for bpf ringbuffer Do not use rlimit-based memory accounting for bpf ringbuffer. It has been replaced with the memcg-based memory accounting. 
bpf_ringbuf_alloc() can't return anything except ERR_PTR(-ENOMEM) and a valid pointer, so to simplify the code make it return NULL in the first case. This allows dropping a couple of lines in ringbuf_map_alloc() and also makes it look similar to other memory allocating functions like kmalloc(). Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201201215900.3569844-28-guro@fb.com --- kernel/bpf/ringbuf.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 8983a46f6580..f25b719ac786 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -48,7 +48,6 @@ struct bpf_ringbuf { struct bpf_ringbuf_map { struct bpf_map map; - struct bpf_map_memory memory; struct bpf_ringbuf *rb; }; @@ -131,7 +130,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) rb = bpf_ringbuf_area_alloc(data_sz, numa_node); if (!rb) - return ERR_PTR(-ENOMEM); + return NULL; spin_lock_init(&rb->spinlock); init_waitqueue_head(&rb->waitq); @@ -147,8 +146,6 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) { struct bpf_ringbuf_map *rb_map; - u64 cost; - int err; if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); @@ -170,26 +167,13 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) bpf_map_init_from_attr(&rb_map->map, attr); - cost = sizeof(struct bpf_ringbuf_map) + - sizeof(struct bpf_ringbuf) + - attr->max_entries; - err = bpf_map_charge_init(&rb_map->map.memory, cost); - if (err) - goto err_free_map; - rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); - if (IS_ERR(rb_map->rb)) { - err = PTR_ERR(rb_map->rb); - goto err_uncharge; + if (!rb_map->rb) { + kfree(rb_map); + return ERR_PTR(-ENOMEM); } return &rb_map->map; - -err_uncharge: - bpf_map_charge_finish(&rb_map->map.memory); -err_free_map: - kfree(rb_map); - return ERR_PTR(err); } static void bpf_ringbuf_free(struct bpf_ringbuf *rb) From 0d2c4f9640501ff57ba0be1f5644a02c29a02fa1 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:54 -0800 Subject: [PATCH 078/118] bpf: Eliminate rlimit-based memory accounting for sockmap and sockhash maps Do not use rlimit-based memory accounting for sockmap and sockhash maps. It has been replaced with the memcg-based memory accounting. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-29-guro@fb.com --- net/core/sock_map.c | 33 ++++++--------------------------- 1 file changed, 6 insertions(+), 27 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 153652a582ee..64b5ec14ff50 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -27,8 +27,6 @@ struct bpf_stab { static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; - u64 cost; - int err; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -46,22 +44,15 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) bpf_map_init_from_attr(&stab->map, attr); raw_spin_lock_init(&stab->lock); - /* Make sure page count doesn't overflow.
*/ - cost = (u64) stab->map.max_entries * sizeof(struct sock *); - err = bpf_map_charge_init(&stab->map.memory, cost); - if (err) - goto free_stab; - stab->sks = bpf_map_area_alloc(stab->map.max_entries * sizeof(struct sock *), stab->map.numa_node); - if (stab->sks) - return &stab->map; - err = -ENOMEM; - bpf_map_charge_finish(&stab->map.memory); -free_stab: - kfree(stab); - return ERR_PTR(err); + if (!stab->sks) { + kfree(stab); + return ERR_PTR(-ENOMEM); + } + + return &stab->map; } int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) @@ -1104,7 +1095,6 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) { struct bpf_shtab *htab; int i, err; - u64 cost; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -1132,21 +1122,10 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) goto free_htab; } - cost = (u64) htab->buckets_num * sizeof(struct bpf_shtab_bucket) + - (u64) htab->elem_size * htab->map.max_entries; - if (cost >= U32_MAX - PAGE_SIZE) { - err = -EINVAL; - goto free_htab; - } - err = bpf_map_charge_init(&htab->map.memory, cost); - if (err) - goto free_htab; - htab->buckets = bpf_map_area_alloc(htab->buckets_num * sizeof(struct bpf_shtab_bucket), htab->map.numa_node); if (!htab->buckets) { - bpf_map_charge_finish(&htab->map.memory); err = -ENOMEM; goto free_htab; } From 370868107bf6624cc104038bf38be2ca153eeb2e Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:55 -0800 Subject: [PATCH 079/118] bpf: Eliminate rlimit-based memory accounting for stackmap maps Do not use rlimit-based memory accounting for stackmap maps. It has been replaced with the memcg-based memory accounting. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-30-guro@fb.com --- kernel/bpf/stackmap.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 06065fa27124..3325add8e629 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -90,7 +90,6 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) { u32 value_size = attr->value_size; struct bpf_stack_map *smap; - struct bpf_map_memory mem; u64 cost, n_buckets; int err; @@ -119,15 +118,9 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); - err = bpf_map_charge_init(&mem, cost); - if (err) - return ERR_PTR(err); - smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); - if (!smap) { - bpf_map_charge_finish(&mem); + if (!smap) return ERR_PTR(-ENOMEM); - } bpf_map_init_from_attr(&smap->map, attr); smap->map.value_size = value_size; @@ -135,20 +128,17 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) err = get_callchain_buffers(sysctl_perf_event_max_stack); if (err) - goto free_charge; + goto free_smap; err = prealloc_elems_and_freelist(smap); if (err) goto put_buffers; - bpf_map_charge_move(&smap->map.memory, &mem); - return &smap->map; put_buffers: put_callchain_buffers(); -free_charge: - bpf_map_charge_finish(&mem); +free_smap: bpf_map_area_free(smap); return ERR_PTR(err); } From 819a4f323579b56cd942f14718edd1f308adbbe2 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:56 -0800 Subject: [PATCH 080/118] bpf: Eliminate rlimit-based memory accounting for xskmap maps Do not use rlimit-based memory accounting for xskmap 
maps. It has been replaced with the memcg-based memory accounting. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-31-guro@fb.com --- net/xdp/xskmap.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 9fff1e6dc9cd..113fd9017203 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -58,9 +58,8 @@ static void xsk_map_sock_delete(struct xdp_sock *xs, static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { - struct bpf_map_memory mem; - int err, numa_node; struct xsk_map *m; + int numa_node; u64 size; if (!capable(CAP_NET_ADMIN)) @@ -74,18 +73,11 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) numa_node = bpf_map_attr_numa_node(attr); size = struct_size(m, xsk_map, attr->max_entries); - err = bpf_map_charge_init(&mem, size); - if (err < 0) - return ERR_PTR(err); - m = bpf_map_area_alloc(size, numa_node); - if (!m) { - bpf_map_charge_finish(&mem); + if (!m) return ERR_PTR(-ENOMEM); - } bpf_map_init_from_attr(&m->map, attr); - bpf_map_charge_move(&m->map.memory, &mem); spin_lock_init(&m->lock); return &m->map; From ab31be378a63a8bc1868c9890d28b0206f81396e Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:57 -0800 Subject: [PATCH 081/118] bpf: Eliminate rlimit-based memory accounting for bpf local storage maps Do not use rlimit-based memory accounting for bpf local storage maps. It has been replaced with the memcg-based memory accounting. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-32-guro@fb.com --- kernel/bpf/bpf_local_storage.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 023a3eaa4d74..dd5aedee99e7 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -545,8 +545,6 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) struct bpf_local_storage_map *smap; unsigned int i; u32 nbuckets; - u64 cost; - int ret; smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); if (!smap) @@ -557,18 +555,10 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ nbuckets = max_t(u32, 2, nbuckets); smap->bucket_log = ilog2(nbuckets); - cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); - - ret = bpf_map_charge_init(&smap->map.memory, cost); - if (ret < 0) { - kfree(smap); - return ERR_PTR(ret); - } smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); if (!smap->buckets) { - bpf_map_charge_finish(&smap->map.memory); kfree(smap); return ERR_PTR(-ENOMEM); } From 80ee81e0403c48f4eb342f7c8d40477c89b8836a Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:58 -0800 Subject: [PATCH 082/118] bpf: Eliminate rlimit-based memory accounting infra for bpf maps Remove rlimit-based accounting infrastructure code, which is not used anymore. To provide a backward compatibility, use an approximation of the bpf map memory footprint as a "memlock" value, available to a user via map info. The approximation is based on the maximal number of elements and key and value sizes. 
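As a rough stand-alone sketch of that estimate: one 8-byte-aligned key+value record per possible element, rounded up to whole pages. The 4096-byte page size and the plain key/value addition are assumptions of this example; the kernel's own helper additionally handles per-CPU value sizing:

/* User-space model of the fdinfo "memlock" approximation. */
#include <stdio.h>

#define EXAMPLE_PAGE_SIZE 4096UL

static unsigned long round_up_ul(unsigned long n, unsigned long align)
{
	return (n + align - 1) / align * align;
}

static unsigned long map_footprint(unsigned int key_size,
				   unsigned int value_size,
				   unsigned int max_entries)
{
	/* one aligned record per possible element */
	unsigned long elem = round_up_ul(key_size + value_size, 8);

	return round_up_ul((unsigned long)max_entries * elem,
			   EXAMPLE_PAGE_SIZE);
}

int main(void)
{
	/* e.g. a map with 4-byte keys, 8-byte values, 10000 entries */
	printf("approx memlock: %lu bytes\n", map_footprint(4, 8, 10000));
	return 0;
}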
Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-33-guro@fb.com --- include/linux/bpf.h | 12 --- kernel/bpf/syscall.c | 96 ++++--------------- .../selftests/bpf/progs/bpf_iter_bpf_map.c | 2 +- .../selftests/bpf/progs/map_ptr_kern.c | 7 -- 4 files changed, 17 insertions(+), 100 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e1f2c95c15ec..61331a148cde 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -138,11 +138,6 @@ struct bpf_map_ops { const struct bpf_iter_seq_info *iter_seq_info; }; -struct bpf_map_memory { - u32 pages; - struct user_struct *user; -}; - struct bpf_map { /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). @@ -163,7 +158,6 @@ struct bpf_map { u32 btf_key_type_id; u32 btf_value_type_id; struct btf *btf; - struct bpf_map_memory memory; #ifdef CONFIG_MEMCG_KMEM struct mem_cgroup *memcg; #endif @@ -1224,12 +1218,6 @@ void bpf_map_inc_with_uref(struct bpf_map *map); struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); -int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); -void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages); -int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size); -void bpf_map_charge_finish(struct bpf_map_memory *mem); -void bpf_map_charge_move(struct bpf_map_memory *dst, - struct bpf_map_memory *src); void *bpf_map_area_alloc(u64 size, int numa_node); void *bpf_map_area_mmapable_alloc(u64 size, int numa_node); void bpf_map_area_free(void *base); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dff3a5f62d7a..29096d96d989 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -128,7 +128,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } -static u32 bpf_map_value_size(struct bpf_map *map) +static u32 bpf_map_value_size(const struct bpf_map *map) { if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || @@ -346,77 +346,6 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) map->numa_node = bpf_map_attr_numa_node(attr); } -static int bpf_charge_memlock(struct user_struct *user, u32 pages) -{ - unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - if (atomic_long_add_return(pages, &user->locked_vm) > memlock_limit) { - atomic_long_sub(pages, &user->locked_vm); - return -EPERM; - } - return 0; -} - -static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) -{ - if (user) - atomic_long_sub(pages, &user->locked_vm); -} - -int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size) -{ - u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT; - struct user_struct *user; - int ret; - - if (size >= U32_MAX - PAGE_SIZE) - return -E2BIG; - - user = get_current_user(); - ret = bpf_charge_memlock(user, pages); - if (ret) { - free_uid(user); - return ret; - } - - mem->pages = pages; - mem->user = user; - - return 0; -} - -void bpf_map_charge_finish(struct bpf_map_memory *mem) -{ - bpf_uncharge_memlock(mem->user, mem->pages); - free_uid(mem->user); -} - -void bpf_map_charge_move(struct bpf_map_memory *dst, - struct bpf_map_memory *src) -{ - *dst = *src; - - /* Make sure src will not be used for the redundant uncharging. 
*/ - memset(src, 0, sizeof(struct bpf_map_memory)); -} - -int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) -{ - int ret; - - ret = bpf_charge_memlock(map->memory.user, pages); - if (ret) - return ret; - map->memory.pages += pages; - return ret; -} - -void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages) -{ - bpf_uncharge_memlock(map->memory.user, pages); - map->memory.pages -= pages; -} - static int bpf_map_alloc_id(struct bpf_map *map) { int id; @@ -524,14 +453,11 @@ static void bpf_map_release_memcg(struct bpf_map *map) static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); - struct bpf_map_memory mem; - bpf_map_charge_move(&mem, &map->memory); security_bpf_map_free(map); bpf_map_release_memcg(map); /* implementation dependent freeing */ map->ops->map_free(map); - bpf_map_charge_finish(&mem); } static void bpf_map_put_uref(struct bpf_map *map) @@ -592,6 +518,19 @@ static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) } #ifdef CONFIG_PROC_FS +/* Provides an approximation of the map's memory footprint. + * Used only to provide a backward compatibility and display + * a reasonable "memlock" info. + */ +static unsigned long bpf_map_memory_footprint(const struct bpf_map *map) +{ + unsigned long size; + + size = round_up(map->key_size + bpf_map_value_size(map), 8); + + return round_up(map->max_entries * size, PAGE_SIZE); +} + static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_map *map = filp->private_data; @@ -610,7 +549,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) "value_size:\t%u\n" "max_entries:\t%u\n" "map_flags:\t%#x\n" - "memlock:\t%llu\n" + "memlock:\t%lu\n" "map_id:\t%u\n" "frozen:\t%u\n", map->map_type, @@ -618,7 +557,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) map->value_size, map->max_entries, map->map_flags, - map->memory.pages * 1ULL << PAGE_SHIFT, + bpf_map_memory_footprint(map), map->id, READ_ONCE(map->frozen)); if (type) { @@ -861,7 +800,6 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, static int map_create(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); - struct bpf_map_memory mem; struct bpf_map *map; int f_flags; int err; @@ -960,9 +898,7 @@ free_map_sec: security_bpf_map_free(map); free_map: btf_put(map->btf); - bpf_map_charge_move(&mem, &map->memory); map->ops->map_free(map); - bpf_map_charge_finish(&mem); return err; } diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c index 08651b23edba..b83b5d2e17dc 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c @@ -23,6 +23,6 @@ int dump_bpf_map(struct bpf_iter__bpf_map *ctx) BPF_SEQ_PRINTF(seq, "%8u %8ld %8ld %10lu\n", map->id, map->refcnt.counter, map->usercnt.counter, - map->memory.user->locked_vm.counter); + 0LLU); return 0; } diff --git a/tools/testing/selftests/bpf/progs/map_ptr_kern.c b/tools/testing/selftests/bpf/progs/map_ptr_kern.c index c325405751e2..d8850bc6a9f1 100644 --- a/tools/testing/selftests/bpf/progs/map_ptr_kern.c +++ b/tools/testing/selftests/bpf/progs/map_ptr_kern.c @@ -26,17 +26,12 @@ __u32 g_line = 0; return 0; \ }) -struct bpf_map_memory { - __u32 pages; -} __attribute__((preserve_access_index)); - struct bpf_map { enum bpf_map_type map_type; __u32 key_size; __u32 value_size; __u32 max_entries; __u32 id; - struct 
bpf_map_memory memory; } __attribute__((preserve_access_index)); static inline int check_bpf_map_fields(struct bpf_map *map, __u32 key_size, @@ -47,7 +42,6 @@ static inline int check_bpf_map_fields(struct bpf_map *map, __u32 key_size, VERIFY(map->value_size == value_size); VERIFY(map->max_entries == max_entries); VERIFY(map->id > 0); - VERIFY(map->memory.pages > 0); return 1; } @@ -60,7 +54,6 @@ static inline int check_bpf_map_ptr(struct bpf_map *indirect, VERIFY(indirect->value_size == direct->value_size); VERIFY(indirect->max_entries == direct->max_entries); VERIFY(indirect->id == direct->id); - VERIFY(indirect->memory.pages == direct->memory.pages); return 1; } From 3ac1f01b43b6e2759cc34d3a715ba5eed04c5805 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:59 -0800 Subject: [PATCH 083/118] bpf: Eliminate rlimit-based memory accounting for bpf progs Do not use rlimit-based memory accounting for bpf progs. It has been replaced with memcg-based memory accounting. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-34-guro@fb.com --- include/linux/bpf.h | 11 ------- kernel/bpf/core.c | 12 ++------ kernel/bpf/syscall.c | 69 +++++++------------------------------------- 3 files changed, 12 insertions(+), 80 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 61331a148cde..a9de5711b23f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1202,8 +1202,6 @@ void bpf_prog_sub(struct bpf_prog *prog, int i); void bpf_prog_inc(struct bpf_prog *prog); struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); -int __bpf_prog_charge(struct user_struct *user, u32 pages); -void __bpf_prog_uncharge(struct user_struct *user, u32 pages); void __bpf_free_used_maps(struct bpf_prog_aux *aux, struct bpf_map **used_maps, u32 len); @@ -1512,15 +1510,6 @@ bpf_prog_inc_not_zero(struct bpf_prog *prog) return ERR_PTR(-EOPNOTSUPP); } -static inline int __bpf_prog_charge(struct user_struct *user, u32 pages) -{ - return 0; -} - -static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages) -{ -} - static inline void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2921f58c03a8..261f8692d0d2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -221,23 +221,15 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, { gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; - u32 pages, delta; - int ret; + u32 pages; size = round_up(size, PAGE_SIZE); pages = size / PAGE_SIZE; if (pages <= fp_old->pages) return fp_old; - delta = pages - fp_old->pages; - ret = __bpf_prog_charge(fp_old->aux->user, delta); - if (ret) - return NULL; - fp = __vmalloc(size, gfp_flags); - if (fp == NULL) { - __bpf_prog_uncharge(fp_old->aux->user, delta); - } else { + if (fp) { memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = pages; fp->aux->prog = fp; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 29096d96d989..d16dd4945100 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1632,51 +1632,6 @@ static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) audit_log_end(ab); } -int __bpf_prog_charge(struct user_struct *user, u32 pages) -{ - unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> 
PAGE_SHIFT; - unsigned long user_bufs; - - if (user) { - user_bufs = atomic_long_add_return(pages, &user->locked_vm); - if (user_bufs > memlock_limit) { - atomic_long_sub(pages, &user->locked_vm); - return -EPERM; - } - } - - return 0; -} - -void __bpf_prog_uncharge(struct user_struct *user, u32 pages) -{ - if (user) - atomic_long_sub(pages, &user->locked_vm); -} - -static int bpf_prog_charge_memlock(struct bpf_prog *prog) -{ - struct user_struct *user = get_current_user(); - int ret; - - ret = __bpf_prog_charge(user, prog->pages); - if (ret) { - free_uid(user); - return ret; - } - - prog->aux->user = user; - return 0; -} - -static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) -{ - struct user_struct *user = prog->aux->user; - - __bpf_prog_uncharge(user, prog->pages); - free_uid(user); -} - static int bpf_prog_alloc_id(struct bpf_prog *prog) { int id; @@ -1726,7 +1681,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) kvfree(aux->func_info); kfree(aux->func_info_aux); - bpf_prog_uncharge_memlock(aux->prog); + free_uid(aux->user); security_bpf_prog_free(aux); bpf_prog_free(aux->prog); } @@ -2164,7 +2119,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) dst_prog = bpf_prog_get(attr->attach_prog_fd); if (IS_ERR(dst_prog)) { err = PTR_ERR(dst_prog); - goto free_prog_nouncharge; + goto free_prog; } prog->aux->dst_prog = dst_prog; } @@ -2174,18 +2129,15 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) err = security_bpf_prog_alloc(prog->aux); if (err) - goto free_prog_nouncharge; - - err = bpf_prog_charge_memlock(prog); - if (err) - goto free_prog_sec; + goto free_prog; + prog->aux->user = get_current_user(); prog->len = attr->insn_cnt; err = -EFAULT; if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns), bpf_prog_insn_size(prog)) != 0) - goto free_prog; + goto free_prog_sec; prog->orig_prog = NULL; prog->jited = 0; @@ -2196,19 +2148,19 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (bpf_prog_is_dev_bound(prog->aux)) { err = bpf_prog_offload_init(prog, attr); if (err) - goto free_prog; + goto free_prog_sec; } /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); if (err < 0) - goto free_prog; + goto free_prog_sec; prog->aux->load_time = ktime_get_boottime_ns(); err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, sizeof(attr->prog_name)); if (err < 0) - goto free_prog; + goto free_prog_sec; /* run eBPF verifier */ err = bpf_check(&prog, attr, uattr); @@ -2253,11 +2205,10 @@ free_used_maps: */ __bpf_prog_put_noref(prog, prog->aux->func_cnt); return err; -free_prog: - bpf_prog_uncharge_memlock(prog); free_prog_sec: + free_uid(prog->aux->user); security_bpf_prog_free(prog->aux); -free_prog_nouncharge: +free_prog: bpf_prog_free(prog); return err; } From 5b0764b2d34510bc87d33a580da98f77789ac36f Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:59:00 -0800 Subject: [PATCH 084/118] bpf: samples: Do not touch RLIMIT_MEMLOCK Since bpf is not using rlimit memlock for the memory accounting and control, do not change the limit in sample applications. 
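For reference, the boilerplate being removed across the samples below looks like this in isolation. Only applications that must still run on kernels predating memcg-based accounting would keep such a call; this is a sketch of the legacy pattern, not current guidance:

#include <stdio.h>
#include <sys/resource.h>

/* Legacy pattern: lift RLIMIT_MEMLOCK before creating BPF objects.
 * Redundant once BPF memory is charged to the memory cgroup instead.
 */
static int bump_memlock_rlimit(void)
{
	struct rlimit r = { RLIM_INFINITY, RLIM_INFINITY };

	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
		perror("setrlimit(RLIMIT_MEMLOCK)");
		return -1;
	}
	return 0;
}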
Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-35-guro@fb.com --- samples/bpf/map_perf_test_user.c | 6 ------ samples/bpf/offwaketime_user.c | 6 ------ samples/bpf/sockex2_user.c | 2 -- samples/bpf/sockex3_user.c | 2 -- samples/bpf/spintest_user.c | 6 ------ samples/bpf/syscall_tp_user.c | 2 -- samples/bpf/task_fd_query_user.c | 6 ------ samples/bpf/test_lru_dist.c | 3 --- samples/bpf/test_map_in_map_user.c | 6 ------ samples/bpf/test_overhead_user.c | 2 -- samples/bpf/trace_event_user.c | 2 -- samples/bpf/tracex2_user.c | 6 ------ samples/bpf/tracex3_user.c | 6 ------ samples/bpf/tracex4_user.c | 6 ------ samples/bpf/tracex5_user.c | 3 --- samples/bpf/tracex6_user.c | 3 --- samples/bpf/xdp1_user.c | 6 ------ samples/bpf/xdp_adjust_tail_user.c | 6 ------ samples/bpf/xdp_monitor_user.c | 5 ----- samples/bpf/xdp_redirect_cpu_user.c | 6 ------ samples/bpf/xdp_redirect_map_user.c | 6 ------ samples/bpf/xdp_redirect_user.c | 6 ------ samples/bpf/xdp_router_ipv4_user.c | 6 ------ samples/bpf/xdp_rxq_info_user.c | 6 ------ samples/bpf/xdp_sample_pkts_user.c | 6 ------ samples/bpf/xdp_tx_iptunnel_user.c | 6 ------ samples/bpf/xdpsock_user.c | 7 ------- 27 files changed, 133 deletions(-) diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c index 8b13230b4c46..9db949290a78 100644 --- a/samples/bpf/map_perf_test_user.c +++ b/samples/bpf/map_perf_test_user.c @@ -421,7 +421,6 @@ static void fixup_map(struct bpf_object *obj) int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; int nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); struct bpf_link *links[8]; struct bpf_program *prog; @@ -430,11 +429,6 @@ int main(int argc, char **argv) char filename[256]; int i = 0; - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; - } - if (argc > 1) test_flags = atoi(argv[1]) ? 
: test_flags; diff --git a/samples/bpf/offwaketime_user.c b/samples/bpf/offwaketime_user.c index 5734cfdaaacb..73a986876c1a 100644 --- a/samples/bpf/offwaketime_user.c +++ b/samples/bpf/offwaketime_user.c @@ -95,18 +95,12 @@ static void int_exit(int sig) int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_object *obj = NULL; struct bpf_link *links[2]; struct bpf_program *prog; int delay = 1, i = 0; char filename[256]; - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; - } - if (load_kallsyms()) { printf("failed to process /proc/kallsyms\n"); return 2; diff --git a/samples/bpf/sockex2_user.c b/samples/bpf/sockex2_user.c index af925a5afd1d..bafa567b840c 100644 --- a/samples/bpf/sockex2_user.c +++ b/samples/bpf/sockex2_user.c @@ -16,7 +16,6 @@ struct pair { int main(int ac, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_object *obj; int map_fd, prog_fd; char filename[256]; @@ -24,7 +23,6 @@ int main(int ac, char **argv) FILE *f; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - setrlimit(RLIMIT_MEMLOCK, &r); if (bpf_prog_load(filename, BPF_PROG_TYPE_SOCKET_FILTER, &obj, &prog_fd)) diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c index 7793f6a6ae7e..6ae99ecc766c 100644 --- a/samples/bpf/sockex3_user.c +++ b/samples/bpf/sockex3_user.c @@ -26,7 +26,6 @@ struct pair { int main(int argc, char **argv) { int i, sock, key, fd, main_prog_fd, jmp_table_fd, hash_map_fd; - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_program *prog; struct bpf_object *obj; const char *section; @@ -34,7 +33,6 @@ int main(int argc, char **argv) FILE *f; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - setrlimit(RLIMIT_MEMLOCK, &r); obj = bpf_object__open_file(filename, NULL); if (libbpf_get_error(obj)) { diff --git a/samples/bpf/spintest_user.c b/samples/bpf/spintest_user.c index f090d0dc60d6..0d7e1e5a8658 100644 --- a/samples/bpf/spintest_user.c +++ b/samples/bpf/spintest_user.c @@ -10,7 +10,6 @@ int main(int ac, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; char filename[256], symbol[256]; struct bpf_object *obj = NULL; struct bpf_link *links[20]; @@ -20,11 +19,6 @@ int main(int ac, char **argv) const char *section; struct ksym *sym; - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; - } - if (load_kallsyms()) { printf("failed to process /proc/kallsyms\n"); return 2; diff --git a/samples/bpf/syscall_tp_user.c b/samples/bpf/syscall_tp_user.c index 76a1d00128fb..a0ebf1833ed3 100644 --- a/samples/bpf/syscall_tp_user.c +++ b/samples/bpf/syscall_tp_user.c @@ -115,7 +115,6 @@ cleanup: int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; int opt, num_progs = 1; char filename[256]; @@ -131,7 +130,6 @@ int main(int argc, char **argv) } } - setrlimit(RLIMIT_MEMLOCK, &r); snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); return test(filename, num_progs); diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c index f6b772faa348..a78025b0026b 100644 --- a/samples/bpf/task_fd_query_user.c +++ b/samples/bpf/task_fd_query_user.c @@ -310,7 +310,6 @@ cleanup: int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; extern char __executable_start; char filename[256], buf[256]; __u64 uprobe_file_offset; @@ -318,11 +317,6 @@ int main(int argc, char **argv) struct bpf_object *obj; int i = 0, err = -1; - if 
(setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return err; - } - if (load_kallsyms()) { printf("failed to process /proc/kallsyms\n"); return err; diff --git a/samples/bpf/test_lru_dist.c b/samples/bpf/test_lru_dist.c index b313dba4111b..c92c5c06b965 100644 --- a/samples/bpf/test_lru_dist.c +++ b/samples/bpf/test_lru_dist.c @@ -489,7 +489,6 @@ static void test_parallel_lru_loss(int map_type, int map_flags, int nr_tasks) int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; int map_flags[] = {0, BPF_F_NO_COMMON_LRU}; const char *dist_file; int nr_tasks = 1; @@ -508,8 +507,6 @@ int main(int argc, char **argv) setbuf(stdout, NULL); - assert(!setrlimit(RLIMIT_MEMLOCK, &r)); - srand(time(NULL)); nr_cpus = bpf_num_possible_cpus(); diff --git a/samples/bpf/test_map_in_map_user.c b/samples/bpf/test_map_in_map_user.c index 98656de56b83..472d65c70354 100644 --- a/samples/bpf/test_map_in_map_user.c +++ b/samples/bpf/test_map_in_map_user.c @@ -114,17 +114,11 @@ static void test_map_in_map(void) int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_link *link = NULL; struct bpf_program *prog; struct bpf_object *obj; char filename[256]; - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; - } - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); obj = bpf_object__open_file(filename, NULL); if (libbpf_get_error(obj)) { diff --git a/samples/bpf/test_overhead_user.c b/samples/bpf/test_overhead_user.c index 819a6fe86f89..4821f9d99c1f 100644 --- a/samples/bpf/test_overhead_user.c +++ b/samples/bpf/test_overhead_user.c @@ -162,13 +162,11 @@ static void unload_progs(void) int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; int num_cpu = sysconf(_SC_NPROCESSORS_ONLN); int test_flags = ~0; char filename[256]; int err = 0; - setrlimit(RLIMIT_MEMLOCK, &r); if (argc > 1) test_flags = atoi(argv[1]) ? 
: test_flags; diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c index ac1ba368195c..9664749bf618 100644 --- a/samples/bpf/trace_event_user.c +++ b/samples/bpf/trace_event_user.c @@ -294,13 +294,11 @@ static void test_bpf_perf_event(void) int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_object *obj = NULL; char filename[256]; int error = 1; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - setrlimit(RLIMIT_MEMLOCK, &r); signal(SIGINT, err_exit); signal(SIGTERM, err_exit); diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c index 3d6eab711d23..1626d51dfffd 100644 --- a/samples/bpf/tracex2_user.c +++ b/samples/bpf/tracex2_user.c @@ -116,7 +116,6 @@ static void int_exit(int sig) int main(int ac, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; long key, next_key, value; struct bpf_link *links[2]; struct bpf_program *prog; @@ -125,11 +124,6 @@ int main(int ac, char **argv) int i, j = 0; FILE *f; - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; - } - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); obj = bpf_object__open_file(filename, NULL); if (libbpf_get_error(obj)) { diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c index 83e0fecbb01a..33e16ba39f25 100644 --- a/samples/bpf/tracex3_user.c +++ b/samples/bpf/tracex3_user.c @@ -107,7 +107,6 @@ static void print_hist(int fd) int main(int ac, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_link *links[2]; struct bpf_program *prog; struct bpf_object *obj; @@ -127,11 +126,6 @@ int main(int ac, char **argv) } } - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; - } - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); obj = bpf_object__open_file(filename, NULL); if (libbpf_get_error(obj)) { diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c index e8faf8f184ae..cea399424bca 100644 --- a/samples/bpf/tracex4_user.c +++ b/samples/bpf/tracex4_user.c @@ -48,18 +48,12 @@ static void print_old_objects(int fd) int main(int ac, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_link *links[2]; struct bpf_program *prog; struct bpf_object *obj; char filename[256]; int map_fd, i, j = 0; - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)"); - return 1; - } - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); obj = bpf_object__open_file(filename, NULL); if (libbpf_get_error(obj)) { diff --git a/samples/bpf/tracex5_user.c b/samples/bpf/tracex5_user.c index c17d3fb5fd64..08dfdc77ad2a 100644 --- a/samples/bpf/tracex5_user.c +++ b/samples/bpf/tracex5_user.c @@ -34,7 +34,6 @@ static void install_accept_all_seccomp(void) int main(int ac, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_link *link = NULL; struct bpf_program *prog; struct bpf_object *obj; @@ -43,8 +42,6 @@ int main(int ac, char **argv) char filename[256]; FILE *f; - setrlimit(RLIMIT_MEMLOCK, &r); - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); obj = bpf_object__open_file(filename, NULL); if (libbpf_get_error(obj)) { diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c index 33df9784775d..28296f40c133 100644 --- a/samples/bpf/tracex6_user.c +++ b/samples/bpf/tracex6_user.c @@ -175,15 +175,12 @@ static void test_bpf_perf_event(void) int main(int argc, char **argv) { - struct rlimit r = 
{RLIM_INFINITY, RLIM_INFINITY}; struct bpf_link *links[2]; struct bpf_program *prog; struct bpf_object *obj; char filename[256]; int i = 0; - setrlimit(RLIMIT_MEMLOCK, &r); - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); obj = bpf_object__open_file(filename, NULL); if (libbpf_get_error(obj)) { diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c index c447ad9e3a1d..116e39f6b666 100644 --- a/samples/bpf/xdp1_user.c +++ b/samples/bpf/xdp1_user.c @@ -79,7 +79,6 @@ static void usage(const char *prog) int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_prog_load_attr prog_load_attr = { .prog_type = BPF_PROG_TYPE_XDP, }; @@ -117,11 +116,6 @@ int main(int argc, char **argv) return 1; } - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; - } - ifindex = if_nametoindex(argv[optind]); if (!ifindex) { perror("if_nametoindex"); diff --git a/samples/bpf/xdp_adjust_tail_user.c b/samples/bpf/xdp_adjust_tail_user.c index ba482dc3da33..a70b094c8ec5 100644 --- a/samples/bpf/xdp_adjust_tail_user.c +++ b/samples/bpf/xdp_adjust_tail_user.c @@ -82,7 +82,6 @@ static void usage(const char *cmd) int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_prog_load_attr prog_load_attr = { .prog_type = BPF_PROG_TYPE_XDP, }; @@ -143,11 +142,6 @@ int main(int argc, char **argv) } } - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)"); - return 1; - } - if (!ifindex) { fprintf(stderr, "Invalid ifname\n"); return 1; diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c index 03d0a182913f..49ebc49aefc3 100644 --- a/samples/bpf/xdp_monitor_user.c +++ b/samples/bpf/xdp_monitor_user.c @@ -687,7 +687,6 @@ static void print_bpf_prog_info(void) int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_program *prog; int longindex = 0, opt; int ret = EXIT_FAILURE; @@ -719,10 +718,6 @@ int main(int argc, char **argv) } snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return ret; - } /* Remove tracepoint program when program is interrupted or killed */ signal(SIGINT, int_exit); diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index f78cb18319aa..576411612523 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -765,7 +765,6 @@ static int load_cpumap_prog(char *file_name, char *prog_name, int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; char *prog_name = "xdp_cpu_map5_lb_hash_ip_pairs"; char *mprog_filename = "xdp_redirect_kern.o"; char *redir_interface = NULL, *redir_map = NULL; @@ -804,11 +803,6 @@ int main(int argc, char **argv) snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); prog_load_attr.file = filename; - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; - } - if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) return err; diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c index 35e16dee613e..31131b6e7782 100644 --- a/samples/bpf/xdp_redirect_map_user.c +++ b/samples/bpf/xdp_redirect_map_user.c @@ -96,7 +96,6 @@ static void usage(const char *prog) int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_prog_load_attr prog_load_attr = { .prog_type = 
BPF_PROG_TYPE_XDP, }; @@ -135,11 +134,6 @@ int main(int argc, char **argv) return 1; } - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; - } - ifindex_in = if_nametoindex(argv[optind]); if (!ifindex_in) ifindex_in = strtoul(argv[optind], NULL, 0); diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c index 9ca2bf457cda..41d705c3a1f7 100644 --- a/samples/bpf/xdp_redirect_user.c +++ b/samples/bpf/xdp_redirect_user.c @@ -97,7 +97,6 @@ static void usage(const char *prog) int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_prog_load_attr prog_load_attr = { .prog_type = BPF_PROG_TYPE_XDP, }; @@ -136,11 +135,6 @@ int main(int argc, char **argv) return 1; } - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; - } - ifindex_in = if_nametoindex(argv[optind]); if (!ifindex_in) ifindex_in = strtoul(argv[optind], NULL, 0); diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c index c2da1b51ff95..b5f03cb17a3c 100644 --- a/samples/bpf/xdp_router_ipv4_user.c +++ b/samples/bpf/xdp_router_ipv4_user.c @@ -625,7 +625,6 @@ static void usage(const char *prog) int main(int ac, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_prog_load_attr prog_load_attr = { .prog_type = BPF_PROG_TYPE_XDP, }; @@ -670,11 +669,6 @@ int main(int ac, char **argv) return 1; } - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; - } - if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) return 1; diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c index 93fa1bc54f13..74a2926eba08 100644 --- a/samples/bpf/xdp_rxq_info_user.c +++ b/samples/bpf/xdp_rxq_info_user.c @@ -450,7 +450,6 @@ static void stats_poll(int interval, int action, __u32 cfg_opt) int main(int argc, char **argv) { __u32 cfg_options= NO_TOUCH ; /* Default: Don't touch packet memory */ - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_prog_load_attr prog_load_attr = { .prog_type = BPF_PROG_TYPE_XDP, }; @@ -474,11 +473,6 @@ int main(int argc, char **argv) snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); prog_load_attr.file = filename; - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; - } - if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) return EXIT_FAIL; diff --git a/samples/bpf/xdp_sample_pkts_user.c b/samples/bpf/xdp_sample_pkts_user.c index 4b2a300c750c..706475e004cb 100644 --- a/samples/bpf/xdp_sample_pkts_user.c +++ b/samples/bpf/xdp_sample_pkts_user.c @@ -109,7 +109,6 @@ static void usage(const char *prog) int main(int argc, char **argv) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_prog_load_attr prog_load_attr = { .prog_type = BPF_PROG_TYPE_XDP, }; @@ -143,11 +142,6 @@ int main(int argc, char **argv) return 1; } - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; - } - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); prog_load_attr.file = filename; diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c index a419bee151a8..1d4f305d02aa 100644 --- a/samples/bpf/xdp_tx_iptunnel_user.c +++ b/samples/bpf/xdp_tx_iptunnel_user.c @@ -155,7 +155,6 @@ int main(int argc, char **argv) struct bpf_prog_load_attr prog_load_attr = { .prog_type = BPF_PROG_TYPE_XDP, }; - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; int min_port = 0, 
max_port = 0, vip2tnl_map_fd;
const char *optstr = "i:a:p:s:d:m:T:P:FSNh";
unsigned char opt_flags[256] = {};
@@ -254,11 +253,6 @@ int main(int argc, char **argv)
}
}

- if (setrlimit(RLIMIT_MEMLOCK, &r)) {
- perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)");
- return 1;
- }
-
if (!ifindex) {
fprintf(stderr, "Invalid ifname\n");
return 1;
diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index 036bd019e400..909f77647deb 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -1489,7 +1489,6 @@ static void apply_setsockopt(struct xsk_socket_info *xsk)

int main(int argc, char **argv)
{
- struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
bool rx = false, tx = false;
struct xsk_umem_info *umem;
struct bpf_object *obj;
@@ -1499,12 +1498,6 @@ int main(int argc, char **argv)

parse_command_line(argc, argv);

- if (setrlimit(RLIMIT_MEMLOCK, &r)) {
- fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n",
- strerror(errno));
- exit(EXIT_FAILURE);
- }
-
if (opt_num_xsks > 1)
load_xdp_program(argv, &obj);

From 71ccb50074f31a50a1da4c1d8306d54da0907b00 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Tue, 1 Dec 2020 22:52:41 -0800
Subject: [PATCH 085/118] tools/bpftool: Emit name <anon> for anonymous BTFs

For consistency of output, emit "name <anon>" for BTFs without a
name. This keeps the output consistent and unambiguous.

Suggested-by: Song Liu
Signed-off-by: Andrii Nakryiko
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/20201202065244.530571-2-andrii@kernel.org
---
tools/bpf/bpftool/btf.c | 2 ++
1 file changed, 2 insertions(+)

diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c
index ed5e97157241..bd46af6a61cc 100644
--- a/tools/bpf/bpftool/btf.c
+++ b/tools/bpf/bpftool/btf.c
@@ -750,6 +750,8 @@ show_btf_plain(struct bpf_btf_info *info, int fd,
printf("name [%s] ", name);
else if (name && name[0])
printf("name %s ", name);
+ else
+ printf("name <anon> ");
printf("size %uB", info->btf_size);

n = 0;

From 0cfdcd6378071f383c900e3d8862347e2af1d1ca Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Tue, 1 Dec 2020 22:52:42 -0800
Subject: [PATCH 086/118] libbpf: Add base BTF accessor

Add the ability to get the base BTF. It can also be used to check
whether a given BTF is split BTF.
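A minimal caller-side sketch (the module path and error handling here are illustrative only; btf__parse_split() and btf__get_nr_types() are the existing libbpf APIs seen earlier in this series):

#include <stdio.h>
#include <bpf/btf.h>
#include <bpf/libbpf.h>

int main(void)
{
	struct btf *base, *mod;

	base = btf__parse("/sys/kernel/btf/vmlinux", NULL);
	if (libbpf_get_error(base))
		return 1;

	/* "ixgbe" is only an example module name */
	mod = btf__parse_split("/sys/kernel/btf/ixgbe", base);
	if (libbpf_get_error(mod)) {
		btf__free(base);
		return 1;
	}

	/* a non-NULL base BTF identifies 'mod' as split BTF */
	if (btf__base_btf(mod))
		printf("split BTF with %u types total\n",
		       btf__get_nr_types(mod));

	btf__free(mod);
	btf__free(base);
	return 0;
}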
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201202065244.530571-3-andrii@kernel.org --- tools/lib/bpf/btf.c | 5 +++++ tools/lib/bpf/btf.h | 1 + tools/lib/bpf/libbpf.map | 1 + 3 files changed, 7 insertions(+) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 8ff46cd30ca1..1935e83d309c 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -432,6 +432,11 @@ __u32 btf__get_nr_types(const struct btf *btf) return btf->start_id + btf->nr_types - 1; } +const struct btf *btf__base_btf(const struct btf *btf) +{ + return btf->base_btf; +} + /* internal helper returning non-const pointer to a type */ static struct btf_type *btf_type_by_id(struct btf *btf, __u32 type_id) { diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 1093f6fe6800..1237bcd1dd17 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -51,6 +51,7 @@ LIBBPF_API __s32 btf__find_by_name(const struct btf *btf, LIBBPF_API __s32 btf__find_by_name_kind(const struct btf *btf, const char *type_name, __u32 kind); LIBBPF_API __u32 btf__get_nr_types(const struct btf *btf); +LIBBPF_API const struct btf *btf__base_btf(const struct btf *btf); LIBBPF_API const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 id); LIBBPF_API size_t btf__pointer_size(const struct btf *btf); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 29ff4807b909..ed55498c4122 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -340,6 +340,7 @@ LIBBPF_0.2.0 { LIBBPF_0.3.0 { global: + btf__base_btf; btf__parse_elf_split; btf__parse_raw_split; btf__parse_split; From fa4528379a51ff8d5271e1bcfa0d5ef71657869f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 1 Dec 2020 22:52:43 -0800 Subject: [PATCH 087/118] tools/bpftool: Auto-detect split BTFs in common cases In case of working with module's split BTF from /sys/kernel/btf/*, auto-substitute /sys/kernel/btf/vmlinux as the base BTF. This makes using bpftool with module BTFs faster and simpler. 
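The auto-detection itself boils down to a path-prefix check; a stand-alone sketch of the same rule (sysfs paths as used in the patch below):

#include <stdbool.h>
#include <string.h>

/* Any file under /sys/kernel/btf/ other than vmlinux itself is
 * treated as module split BTF that needs vmlinux as its base.
 */
static bool wants_vmlinux_base(const char *path)
{
	static const char prefix[] = "/sys/kernel/btf/";
	static const char vmlinux[] = "/sys/kernel/btf/vmlinux";

	return strncmp(path, prefix, sizeof(prefix) - 1) == 0 &&
	       strcmp(path, vmlinux) != 0;
}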
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201202065244.530571-4-andrii@kernel.org --- tools/bpf/bpftool/btf.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index bd46af6a61cc..94cd3bad5430 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -357,11 +357,13 @@ static int dump_btf_raw(const struct btf *btf, dump_btf_type(btf, root_type_ids[i], t); } } else { + const struct btf *base; int cnt = btf__get_nr_types(btf); int start_id = 1; - if (base_btf) - start_id = btf__get_nr_types(base_btf) + 1; + base = btf__base_btf(btf); + if (base) + start_id = btf__get_nr_types(base) + 1; for (i = start_id; i <= cnt; i++) { t = btf__type_by_id(btf, i); @@ -428,7 +430,7 @@ done: static int do_dump(int argc, char **argv) { - struct btf *btf = NULL; + struct btf *btf = NULL, *base = NULL; __u32 root_type_ids[2]; int root_type_cnt = 0; bool dump_c = false; @@ -502,7 +504,21 @@ static int do_dump(int argc, char **argv) } NEXT_ARG(); } else if (is_prefix(src, "file")) { - btf = btf__parse_split(*argv, base_btf); + const char sysfs_prefix[] = "/sys/kernel/btf/"; + const char sysfs_vmlinux[] = "/sys/kernel/btf/vmlinux"; + + if (!base_btf && + strncmp(*argv, sysfs_prefix, sizeof(sysfs_prefix) - 1) == 0 && + strcmp(*argv, sysfs_vmlinux) != 0) { + base = btf__parse(sysfs_vmlinux, NULL); + if (libbpf_get_error(base)) { + p_err("failed to parse vmlinux BTF at '%s': %ld\n", + sysfs_vmlinux, libbpf_get_error(base)); + base = NULL; + } + } + + btf = btf__parse_split(*argv, base ?: base_btf); if (IS_ERR(btf)) { err = -PTR_ERR(btf); btf = NULL; @@ -567,6 +583,7 @@ static int do_dump(int argc, char **argv) done: close(fd); btf__free(btf); + btf__free(base); return err; } From a874c8c389a12b9f5ab67ba01995f06bf82e94fe Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 2 Dec 2020 09:49:47 -0800 Subject: [PATCH 088/118] selftests/bpf: Copy file using read/write in local storage test Splice (copy_file_range) doesn't work on all filesystems. I'm running test kernels on top of my read-only disk image and it uses plan9 under the hood. This prevents test_local_storage from successfully passing. There is really no technical reason to use splice, so lets do old-school read/write to copy file; this should work in all environments. 
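The equivalent of the change below as a self-contained sketch (a fixed 4 KiB buffer is assumed here instead of st_blksize, and short writes are not retried, for brevity):

#include <fcntl.h>
#include <unistd.h>

/* Portable copy loop: plain read()/write() works on filesystems
 * (such as 9p) where copy_file_range()/splice() are unsupported.
 */
static int copy_file(const char *src, const char *dst)
{
	char buf[4096];
	ssize_t n = 0;
	int ret = -1;
	int in, out;

	in = open(src, O_RDONLY);
	if (in < 0)
		return -1;
	out = open(dst, O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (out < 0) {
		close(in);
		return -1;
	}

	while ((n = read(in, buf, sizeof(buf))) > 0)
		if (write(out, buf, n) != n)
			goto out;
	if (n == 0)
		ret = 0; /* clean EOF */
out:
	close(in);
	close(out);
	return ret;
}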
Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201202174947.3621989-1-sdf@google.com --- .../bpf/prog_tests/test_local_storage.c | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c index fcca7ba1f368..c0fe73a17ed1 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c +++ b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c @@ -21,14 +21,6 @@ static inline int sys_pidfd_open(pid_t pid, unsigned int flags) return syscall(__NR_pidfd_open, pid, flags); } -static inline ssize_t copy_file_range(int fd_in, loff_t *off_in, int fd_out, - loff_t *off_out, size_t len, - unsigned int flags) -{ - return syscall(__NR_copy_file_range, fd_in, off_in, fd_out, off_out, - len, flags); -} - static unsigned int duration; #define TEST_STORAGE_VALUE 0xbeefdead @@ -47,6 +39,7 @@ static int copy_rm(char *dest) { int fd_in, fd_out = -1, ret = 0; struct stat stat; + char *buf = NULL; fd_in = open("/bin/rm", O_RDONLY); if (fd_in < 0) @@ -64,18 +57,33 @@ static int copy_rm(char *dest) goto out; } - ret = copy_file_range(fd_in, NULL, fd_out, NULL, stat.st_size, 0); - if (ret == -1) { + buf = malloc(stat.st_blksize); + if (!buf) { ret = -errno; goto out; } + while (ret = read(fd_in, buf, stat.st_blksize), ret > 0) { + ret = write(fd_out, buf, ret); + if (ret < 0) { + ret = -errno; + goto out; + + } + } + if (ret < 0) { + ret = -errno; + goto out; + + } + /* Set executable permission on the copied file */ ret = chmod(dest, 0100); if (ret == -1) ret = -errno; out: + free(buf); close(fd_in); close(fd_out); return ret; From 22e8ebe35a2e30ee19e02c41cacc99c2f896bc4b Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 3 Dec 2020 10:22:34 +0000 Subject: [PATCH 089/118] tools/resolve_btfids: Fix some error messages Add missing newlines and fix polarity of strerror argument. Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20201203102234.648540-1-jackmanb@google.com --- tools/bpf/resolve_btfids/main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index dfa540d8a02d..e3ea569ee125 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -454,7 +454,7 @@ static int symbols_collect(struct object *obj) return -ENOMEM; if (id->addr_cnt >= ADDR_CNT) { - pr_err("FAILED symbol %s crossed the number of allowed lists", + pr_err("FAILED symbol %s crossed the number of allowed lists\n", id->name); return -1; } @@ -477,8 +477,8 @@ static int symbols_resolve(struct object *obj) btf = btf__parse(obj->btf ?: obj->path, NULL); err = libbpf_get_error(btf); if (err) { - pr_err("FAILED: load BTF from %s: %s", - obj->path, strerror(err)); + pr_err("FAILED: load BTF from %s: %s\n", + obj->path, strerror(-err)); return -1; } From e459f49b4394e2630ea55d5ac7a49402686848fe Mon Sep 17 00:00:00 2001 From: Mariusz Dudek Date: Thu, 3 Dec 2020 10:05:45 +0100 Subject: [PATCH 090/118] libbpf: Separate XDP program load with xsk socket creation Add support for separation of eBPF program load and xsk socket creation. This is needed for use-case when you want to privide as little privileges as possible to the data plane application that will handle xsk socket creation and incoming traffic. 
With this patch the data plane entity's container can be run with
only the CAP_NET_RAW capability to fulfill its purpose of creating
the xsk socket and handling packets. If your umem is larger than or
equal to the process limit for MEMLOCK, you need to either increase
the limit or add the CAP_IPC_LOCK capability.

To resolve the privileges issue, two APIs are introduced:

- xsk_setup_xdp_prog - loads the built-in XDP program. It can also
return xsks_map_fd, which is needed by the unprivileged process to
update xsks_map with the AF_XDP socket "fd"

- xsk_socket__update_xskmap - inserts an AF_XDP socket into an xskmap
for a particular xsk_socket

Signed-off-by: Mariusz Dudek
Signed-off-by: Alexei Starovoitov
Acked-by: Magnus Karlsson
Link: https://lore.kernel.org/bpf/20201203090546.11976-2-mariuszx.dudek@intel.com
---
tools/lib/bpf/libbpf.map | 2 +
tools/lib/bpf/xsk.c | 92 ++++++++++++++++++++++++++++++++++++----
tools/lib/bpf/xsk.h | 5 +++
3 files changed, 90 insertions(+), 9 deletions(-)

diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index ed55498c4122..7c4126542e2b 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -346,4 +346,6 @@ LIBBPF_0.3.0 {
btf__parse_split;
btf__new_empty_split;
btf__new_split;
+ xsk_setup_xdp_prog;
+ xsk_socket__update_xskmap;
} LIBBPF_0.2.0;
diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
index 9bc537d0b92d..4b051ec7cfbb 100644
--- a/tools/lib/bpf/xsk.c
+++ b/tools/lib/bpf/xsk.c
@@ -566,8 +566,35 @@ static int xsk_set_bpf_maps(struct xsk_socket *xsk)
&xsk->fd, 0);
}

-static int xsk_setup_xdp_prog(struct xsk_socket *xsk)
+static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk)
{
+ char ifname[IFNAMSIZ];
+ struct xsk_ctx *ctx;
+ char *interface;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx)
+ return -ENOMEM;
+
+ interface = if_indextoname(ifindex, &ifname[0]);
+ if (!interface) {
+ free(ctx);
+ return -errno;
+ }
+
+ ctx->ifindex = ifindex;
+ strncpy(ctx->ifname, ifname, IFNAMSIZ - 1);
+ ctx->ifname[IFNAMSIZ - 1] = 0;
+
+ xsk->ctx = ctx;
+
+ return 0;
+}
+
+static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp,
+ int *xsks_map_fd)
+{
+ struct xsk_socket *xsk = _xdp;
struct xsk_ctx *ctx = xsk->ctx;
__u32 prog_id = 0;
int err;
@@ -584,8 +611,7 @@ static int xsk_setup_xdp_prog(struct xsk_socket *xsk)

err = xsk_load_xdp_prog(xsk);
if (err) {
- xsk_delete_bpf_maps(xsk);
- return err;
+ goto err_load_xdp_prog;
}
} else {
ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id);
@@ -598,15 +624,29 @@ static int xsk_setup_xdp_prog(struct xsk_socket *xsk)
}
}

- if (xsk->rx)
+ if (xsk->rx) {
err = xsk_set_bpf_maps(xsk);
- if (err) {
- xsk_delete_bpf_maps(xsk);
- close(ctx->prog_fd);
- return err;
+ if (err) {
+ if (!prog_id) {
+ goto err_set_bpf_maps;
+ } else {
+ close(ctx->prog_fd);
+ return err;
+ }
+ }
}
+ if (xsks_map_fd)
+ *xsks_map_fd = ctx->xsks_map_fd;

return 0;
+
+err_set_bpf_maps:
+ close(ctx->prog_fd);
+ bpf_set_link_xdp_fd(ctx->ifindex, -1, 0);
+err_load_xdp_prog:
+ xsk_delete_bpf_maps(xsk);
+
+ return err;
}

static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex,
@@ -689,6 +729,40 @@ static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk,
return ctx;
}

+static void xsk_destroy_xsk_struct(struct xsk_socket *xsk)
+{
+ free(xsk->ctx);
+ free(xsk);
+}
+
+int xsk_socket__update_xskmap(struct xsk_socket *xsk, int fd)
+{
+ xsk->ctx->xsks_map_fd = fd;
+ return xsk_set_bpf_maps(xsk);
+}
+
+int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd)
+{
+ struct xsk_socket *xsk;
+ int res;
+
+ xsk = calloc(1, sizeof(*xsk));
+ if (!xsk)
+ return -ENOMEM;
+
+ res = xsk_create_xsk_struct(ifindex, xsk);
+ if (res) {
+ free(xsk);
+ return -EINVAL;
+ }
+
+ res = __xsk_setup_xdp_prog(xsk, xsks_map_fd);
+
+ xsk_destroy_xsk_struct(xsk);
+
+ return res;
+}
+
int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, const char *ifname,
__u32 queue_id, struct xsk_umem *umem,
@@ -838,7 +912,7 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
ctx->prog_fd = -1;

if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
- err = xsk_setup_xdp_prog(xsk);
+ err = __xsk_setup_xdp_prog(xsk, NULL);
if (err)
goto out_mmap_tx;
}
diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h
index 5865e082ba0b..e9f121f5d129 100644
--- a/tools/lib/bpf/xsk.h
+++ b/tools/lib/bpf/xsk.h
@@ -204,6 +204,11 @@ struct xsk_umem_config {
__u32 flags;
};

+LIBBPF_API int xsk_setup_xdp_prog(int ifindex,
+ int *xsks_map_fd);
+LIBBPF_API int xsk_socket__update_xskmap(struct xsk_socket *xsk,
+ int xsks_map_fd);
+
/* Flags for the libbpf_flags field. */
#define XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD (1 << 0)

From 3627d9702d789804a1f4c5a52eabdae810cd9def Mon Sep 17 00:00:00 2001
From: Mariusz Dudek
Date: Thu, 3 Dec 2020 10:05:46 +0100
Subject: [PATCH 091/118] samples/bpf: Sample application for eBPF load and
 socket creation split

Introduce a sample program to demonstrate the control and data plane
split. For the control plane part a new program called
xdpsock_ctrl_proc is introduced. For the data plane part, some code
was added to xdpsock_user.c to act as the data plane entity.

Application xdpsock_ctrl_proc works as the control entity with sudo
privileges (CAP_SYS_ADMIN and CAP_NET_ADMIN are sufficient) and the
extended xdpsock as the data plane entity with the CAP_NET_RAW
capability only.

Usage example:
sudo ./samples/bpf/xdpsock_ctrl_proc -i <interface>
sudo ./samples/bpf/xdpsock -i <interface> -q <queue_id> -n <interval> -N -l -R

Signed-off-by: Mariusz Dudek
Signed-off-by: Alexei Starovoitov
Acked-by: Magnus Karlsson
Link: https://lore.kernel.org/bpf/20201203090546.11976-3-mariuszx.dudek@intel.com
---
samples/bpf/Makefile | 4 +-
samples/bpf/xdpsock.h | 8 ++
samples/bpf/xdpsock_ctrl_proc.c | 187 ++++++++++++++++++++++++++++++++
samples/bpf/xdpsock_user.c | 144 +++++++++++++++++++++++-
4 files changed, 337 insertions(+), 6 deletions(-)
create mode 100644 samples/bpf/xdpsock_ctrl_proc.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 05db041f8b18..26fc96ca619e 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -48,6 +48,7 @@ tprogs-y += syscall_tp
tprogs-y += cpustat
tprogs-y += xdp_adjust_tail
tprogs-y += xdpsock
+tprogs-y += xdpsock_ctrl_proc
tprogs-y += xsk_fwd
tprogs-y += xdp_fwd
tprogs-y += task_fd_query
@@ -105,6 +106,7 @@ syscall_tp-objs := syscall_tp_user.o
cpustat-objs := cpustat_user.o
xdp_adjust_tail-objs := xdp_adjust_tail_user.o
xdpsock-objs := xdpsock_user.o
+xdpsock_ctrl_proc-objs := xdpsock_ctrl_proc.o
xsk_fwd-objs := xsk_fwd.o
xdp_fwd-objs := xdp_fwd_user.o
task_fd_query-objs := task_fd_query_user.o $(TRACE_HELPERS)
@@ -202,7 +204,7 @@ TPROGLDLIBS_tracex4 += -lrt
TPROGLDLIBS_trace_output += -lrt
TPROGLDLIBS_map_perf_test += -lrt
TPROGLDLIBS_test_overhead += -lrt
-TPROGLDLIBS_xdpsock += -pthread
+TPROGLDLIBS_xdpsock += -pthread -lcap
TPROGLDLIBS_xsk_fwd += -pthread

# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
diff --git a/samples/bpf/xdpsock.h b/samples/bpf/xdpsock.h
index b7eca15c78cc..fd70cce60712 100644
--- a/samples/bpf/xdpsock.h
+++ b/samples/bpf/xdpsock.h
@@ -8,4 +8,12 @@

#define MAX_SOCKS 4

+#define SOCKET_NAME "sock_cal_bpf_fd"
+#define MAX_NUM_OF_CLIENTS 10
+
+#define CLOSE_CONN 1
+
+typedef __u64 u64;
+typedef __u32 u32;
+
#endif /* XDPSOCK_H */
diff --git a/samples/bpf/xdpsock_ctrl_proc.c b/samples/bpf/xdpsock_ctrl_proc.c
new file mode 100644
index 000000000000..384e62e3c6d6
--- /dev/null
+++ b/samples/bpf/xdpsock_ctrl_proc.c
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2017 - 2018 Intel Corporation. */
+
+#include <errno.h>
+#include <getopt.h>
+#include <libgen.h>
+#include <net/if.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#include <bpf/libbpf.h>
+#include <bpf/xsk.h>
+#include "xdpsock.h"
+
+static const char *opt_if = "";
+
+static struct option long_options[] = {
+ {"interface", required_argument, 0, 'i'},
+ {0, 0, 0, 0}
+};
+
+static void usage(const char *prog)
+{
+ const char *str =
+ " Usage: %s [OPTIONS]\n"
+ " Options:\n"
+ " -i, --interface=n Run on interface n\n"
+ "\n";
+ fprintf(stderr, "%s\n", str);
+
+ exit(0);
+}
+
+static void parse_command_line(int argc, char **argv)
+{
+ int option_index, c;
+
+ opterr = 0;
+
+ for (;;) {
+ c = getopt_long(argc, argv, "i:",
+ long_options, &option_index);
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 'i':
+ opt_if = optarg;
+ break;
+ default:
+ usage(basename(argv[0]));
+ }
+ }
+}
+
+static int send_xsks_map_fd(int sock, int fd)
+{
+ char cmsgbuf[CMSG_SPACE(sizeof(int))];
+ struct msghdr msg;
+ struct iovec iov;
+ int value = 0;
+
+ if (fd == -1) {
+ fprintf(stderr, "Incorrect fd = %d\n", fd);
+ return -1;
+ }
+ iov.iov_base = &value;
+ iov.iov_len = sizeof(int);
+
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_flags = 0;
+ msg.msg_control = cmsgbuf;
+ msg.msg_controllen = CMSG_LEN(sizeof(int));
+
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+
+ *(int *)CMSG_DATA(cmsg) = fd;
+ int ret = sendmsg(sock, &msg, 0);
+
+ if (ret == -1) {
+ fprintf(stderr, "Sendmsg failed with %s", strerror(errno));
+ return -errno;
+ }
+
+ return ret;
+}
+
+int
+main(int argc, char **argv)
+{
+ struct sockaddr_un server;
+ int listening = 1;
+ int rval, msgsock;
+ int ifindex = 0;
+ int flag = 1;
+ int cmd = 0;
+ int sock;
+ int err;
+ int xsks_map_fd;
+
+ parse_command_line(argc, argv);
+
+ ifindex = if_nametoindex(opt_if);
+ if (ifindex == 0) {
+ fprintf(stderr, "Unable to get ifindex for Interface %s. Reason:%s",
Reason:%s", + opt_if, strerror(errno)); + return -errno; + } + + sock = socket(AF_UNIX, SOCK_STREAM, 0); + if (sock < 0) { + fprintf(stderr, "Opening socket stream failed: %s", strerror(errno)); + return -errno; + } + + server.sun_family = AF_UNIX; + strcpy(server.sun_path, SOCKET_NAME); + + setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &flag, sizeof(int)); + + if (bind(sock, (struct sockaddr *)&server, sizeof(struct sockaddr_un))) { + fprintf(stderr, "Binding to socket stream failed: %s", strerror(errno)); + return -errno; + } + + listen(sock, MAX_NUM_OF_CLIENTS); + + err = xsk_setup_xdp_prog(ifindex, &xsks_map_fd); + if (err) { + fprintf(stderr, "Setup of xdp program failed\n"); + goto close_sock; + } + + while (listening) { + msgsock = accept(sock, 0, 0); + if (msgsock == -1) { + fprintf(stderr, "Error accepting connection: %s", strerror(errno)); + err = -errno; + goto close_sock; + } + err = send_xsks_map_fd(msgsock, xsks_map_fd); + if (err <= 0) { + fprintf(stderr, "Error %d sending xsks_map_fd\n", err); + goto cleanup; + } + do { + rval = read(msgsock, &cmd, sizeof(int)); + if (rval < 0) { + fprintf(stderr, "Error reading stream message"); + } else { + if (cmd != CLOSE_CONN) + fprintf(stderr, "Recv unknown cmd = %d\n", cmd); + listening = 0; + break; + } + } while (rval > 0); + } + close(msgsock); + close(sock); + unlink(SOCKET_NAME); + + /* Unset fd for given ifindex */ + err = bpf_set_link_xdp_fd(ifindex, -1, 0); + if (err) { + fprintf(stderr, "Error when unsetting bpf prog_fd for ifindex(%d)\n", ifindex); + return err; + } + + return 0; + +cleanup: + close(msgsock); +close_sock: + close(sock); + unlink(SOCKET_NAME); + return err; +} diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 909f77647deb..74a578f880b5 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -24,10 +24,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include @@ -96,6 +98,7 @@ static bool opt_need_wakeup = true; static u32 opt_num_xsks = 1; static u32 prog_id; static bool opt_busy_poll; +static bool opt_reduced_cap; struct xsk_ring_stats { unsigned long rx_npkts; @@ -154,6 +157,7 @@ struct xsk_socket_info { static int num_socks; struct xsk_socket_info *xsks[MAX_SOCKS]; +int sock; static unsigned long get_nsecs(void) { @@ -461,6 +465,7 @@ static void *poller(void *arg) static void remove_xdp_program(void) { u32 curr_prog_id = 0; + int cmd = CLOSE_CONN; if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) { printf("bpf_get_link_xdp_id failed\n"); @@ -472,6 +477,13 @@ static void remove_xdp_program(void) printf("couldn't find a prog id on a given interface\n"); else printf("program on interface changed, not removing\n"); + + if (opt_reduced_cap) { + if (write(sock, &cmd, sizeof(int)) < 0) { + fprintf(stderr, "Error writing into stream socket: %s", strerror(errno)); + exit(EXIT_FAILURE); + } + } } static void int_exit(int sig) @@ -854,7 +866,7 @@ static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem, xsk->umem = umem; cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; - if (opt_num_xsks > 1) + if (opt_num_xsks > 1 || opt_reduced_cap) cfg.libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD; else cfg.libbpf_flags = 0; @@ -913,6 +925,7 @@ static struct option long_options[] = { {"app-stats", no_argument, 0, 'a'}, {"irq-string", no_argument, 0, 'I'}, {"busy-poll", no_argument, 0, 'B'}, + {"reduce-cap", no_argument, 0, 'R'}, {0, 0, 0, 0} }; @@ 
-935,7 +948,7 @@ static void usage(const char *prog) " -m, --no-need-wakeup Turn off use of driver need wakeup flag.\n" " -f, --frame-size=n Set the frame size (must be a power of two in aligned mode, default is %d).\n" " -u, --unaligned Enable unaligned chunk placement\n" - " -M, --shared-umem Enable XDP_SHARED_UMEM\n" + " -M, --shared-umem Enable XDP_SHARED_UMEM (cannot be used with -R)\n" " -F, --force Force loading the XDP prog\n" " -d, --duration=n Duration in secs to run command.\n" " Default: forever.\n" @@ -952,6 +965,7 @@ static void usage(const char *prog) " -a, --app-stats Display application (syscall) statistics.\n" " -I, --irq-string Display driver interrupt statistics for interface associated with irq-string.\n" " -B, --busy-poll Busy poll.\n" + " -R, --reduce-cap Use reduced capabilities (cannot be used with -M)\n" "\n"; fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, @@ -967,7 +981,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQaI:B", + c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQaI:BR", long_options, &option_index); if (c == -1) break; @@ -1069,6 +1083,9 @@ static void parse_command_line(int argc, char **argv) case 'B': opt_busy_poll = 1; break; + case 'R': + opt_reduced_cap = true; + break; default: usage(basename(argv[0])); } @@ -1090,6 +1107,11 @@ static void parse_command_line(int argc, char **argv) opt_xsk_frame_size); usage(basename(argv[0])); } + + if (opt_reduced_cap && opt_num_xsks > 1) { + fprintf(stderr, "ERROR: -M and -R cannot be used together\n"); + usage(basename(argv[0])); + } } static void kick_tx(struct xsk_socket_info *xsk) @@ -1487,19 +1509,116 @@ static void apply_setsockopt(struct xsk_socket_info *xsk) exit_with_error(errno); } +static int recv_xsks_map_fd_from_ctrl_node(int sock, int *_fd) +{ + char cms[CMSG_SPACE(sizeof(int))]; + struct cmsghdr *cmsg; + struct msghdr msg; + struct iovec iov; + int value; + int len; + + iov.iov_base = &value; + iov.iov_len = sizeof(int); + + msg.msg_name = 0; + msg.msg_namelen = 0; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_flags = 0; + msg.msg_control = (caddr_t)cms; + msg.msg_controllen = sizeof(cms); + + len = recvmsg(sock, &msg, 0); + + if (len < 0) { + fprintf(stderr, "Recvmsg failed length incorrect.\n"); + return -EINVAL; + } + + if (len == 0) { + fprintf(stderr, "Recvmsg failed no data\n"); + return -EINVAL; + } + + cmsg = CMSG_FIRSTHDR(&msg); + *_fd = *(int *)CMSG_DATA(cmsg); + + return 0; +} + +static int +recv_xsks_map_fd(int *xsks_map_fd) +{ + struct sockaddr_un server; + int err; + + sock = socket(AF_UNIX, SOCK_STREAM, 0); + if (sock < 0) { + fprintf(stderr, "Error opening socket stream: %s", strerror(errno)); + return errno; + } + + server.sun_family = AF_UNIX; + strcpy(server.sun_path, SOCKET_NAME); + + if (connect(sock, (struct sockaddr *)&server, sizeof(struct sockaddr_un)) < 0) { + close(sock); + fprintf(stderr, "Error connecting stream socket: %s", strerror(errno)); + return errno; + } + + err = recv_xsks_map_fd_from_ctrl_node(sock, xsks_map_fd); + if (err) { + fprintf(stderr, "Error %d recieving fd\n", err); + return err; + } + return 0; +} + int main(int argc, char **argv) { + struct __user_cap_header_struct hdr = { _LINUX_CAPABILITY_VERSION_3, 0 }; + struct __user_cap_data_struct data[2] = { { 0 } }; + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; bool rx = false, tx = false; struct xsk_umem_info *umem; struct 
bpf_object *obj;
+	int xsks_map_fd = 0;
 	pthread_t pt;
 	int i, ret;
 	void *bufs;

 	parse_command_line(argc, argv);

-	if (opt_num_xsks > 1)
-		load_xdp_program(argv, &obj);
+	if (opt_reduced_cap) {
+		if (capget(&hdr, data) < 0)
+			fprintf(stderr, "Error getting capabilities\n");
+
+		data->effective &= CAP_TO_MASK(CAP_NET_RAW);
+		data->permitted &= CAP_TO_MASK(CAP_NET_RAW);
+
+		if (capset(&hdr, data) < 0)
+			fprintf(stderr, "Setting capabilities failed\n");
+
+		if (capget(&hdr, data) < 0) {
+			fprintf(stderr, "Error getting capabilities\n");
+		} else {
+			fprintf(stderr, "Capabilities EFF %x Caps INH %x Caps Per %x\n",
+				data[0].effective, data[0].inheritable, data[0].permitted);
+			fprintf(stderr, "Capabilities EFF %x Caps INH %x Caps Per %x\n",
+				data[1].effective, data[1].inheritable, data[1].permitted);
+		}
+	} else {
+		if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+			fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n",
+				strerror(errno));
+			exit(EXIT_FAILURE);
+		}
+
+		if (opt_num_xsks > 1)
+			load_xdp_program(argv, &obj);
+	}

 	/* Reserve memory for the umem. Use hugepages if unaligned chunk mode */
 	bufs = mmap(NULL, NUM_FRAMES * opt_xsk_frame_size,
@@ -1534,6 +1653,21 @@ int main(int argc, char **argv)
 	if (opt_num_xsks > 1 && opt_bench != BENCH_TXONLY)
 		enter_xsks_into_map(obj);

+	if (opt_reduced_cap) {
+		ret = recv_xsks_map_fd(&xsks_map_fd);
+		if (ret) {
+			fprintf(stderr, "Error %d receiving xsks_map_fd\n", ret);
+			exit_with_error(ret);
+		}
+		if (xsks[0]->xsk) {
+			ret = xsk_socket__update_xskmap(xsks[0]->xsk, xsks_map_fd);
+			if (ret) {
+				fprintf(stderr, "Update of BPF map failed(%d)\n", ret);
+				exit_with_error(ret);
+			}
+		}
+	}
+
 	signal(SIGINT, int_exit);
 	signal(SIGTERM, int_exit);
 	signal(SIGABRT, int_exit);

From 3db980449bc3b9765c78210787bcbf4305636982 Mon Sep 17 00:00:00 2001
From: KP Singh
Date: Thu, 3 Dec 2020 19:14:34 +0000
Subject: [PATCH 092/118] selftests/bpf: Update ima_setup.sh for busybox

losetup on busybox does not output the name of the loop device when
using -f together with --show. It also doesn't support -j to find the
loop devices for a given backing file. losetup is therefore updated to
use "-a", which is available on busybox.

blkid does not support the options (-s and -o) to display only the
uuid, so parse the output instead.

Not all environments have mkfs.ext4; the test only requires a loop
device with a backing image file, which could be formatted with any
filesystem. Update to using mkfs.ext2, which is available on busybox.
Fixes: 34b82d3ac105 ("bpf: Add a selftest for bpf_ima_inode_hash") Reported-by: Andrii Nakryiko Signed-off-by: KP Singh Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201203191437.666737-2-kpsingh@chromium.org --- tools/testing/selftests/bpf/ima_setup.sh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/ima_setup.sh b/tools/testing/selftests/bpf/ima_setup.sh index 15490ccc5e55..137f2d32598f 100755 --- a/tools/testing/selftests/bpf/ima_setup.sh +++ b/tools/testing/selftests/bpf/ima_setup.sh @@ -3,6 +3,7 @@ set -e set -u +set -o pipefail IMA_POLICY_FILE="/sys/kernel/security/ima/policy" TEST_BINARY="/bin/true" @@ -23,13 +24,15 @@ setup() dd if=/dev/zero of="${mount_img}" bs=1M count=10 - local loop_device="$(losetup --find --show ${mount_img})" + losetup -f "${mount_img}" + local loop_device=$(losetup -a | grep ${mount_img:?} | cut -d ":" -f1) - mkfs.ext4 "${loop_device}" + mkfs.ext2 "${loop_device:?}" mount "${loop_device}" "${mount_dir}" cp "${TEST_BINARY}" "${mount_dir}" - local mount_uuid="$(blkid -s UUID -o value ${loop_device})" + local mount_uuid="$(blkid ${loop_device} | sed 's/.*UUID="\([^"]*\)".*/\1/')" + echo "measure func=BPRM_CHECK fsuuid=${mount_uuid}" > ${IMA_POLICY_FILE} } @@ -38,7 +41,8 @@ cleanup() { local mount_img="${tmp_dir}/test.img" local mount_dir="${tmp_dir}/mnt" - local loop_devices=$(losetup -j ${mount_img} -O NAME --noheadings) + local loop_devices=$(losetup -a | grep ${mount_img:?} | cut -d ":" -f1) + for loop_dev in "${loop_devices}"; do losetup -d $loop_dev done From 1ee076719d4e14c005f375c50731ed44eb48fee4 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Thu, 3 Dec 2020 19:14:35 +0000 Subject: [PATCH 093/118] selftests/bpf: Ensure securityfs mount before writing ima policy SecurityFS may not be mounted even if it is enabled in the kernel config. So, check if the mount exists in /proc/mounts by parsing the file and, if not, mount it on /sys/kernel/security. Fixes: 34b82d3ac105 ("bpf: Add a selftest for bpf_ima_inode_hash") Reported-by: Andrii Nakryiko Signed-off-by: KP Singh Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201203191437.666737-3-kpsingh@chromium.org --- tools/testing/selftests/bpf/ima_setup.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tools/testing/selftests/bpf/ima_setup.sh b/tools/testing/selftests/bpf/ima_setup.sh index 137f2d32598f..b1ee4bf06996 100755 --- a/tools/testing/selftests/bpf/ima_setup.sh +++ b/tools/testing/selftests/bpf/ima_setup.sh @@ -14,6 +14,20 @@ usage() exit 1 } +ensure_mount_securityfs() +{ + local securityfs_dir=$(grep "securityfs" /proc/mounts | awk '{print $2}') + + if [ -z "${securityfs_dir}" ]; then + securityfs_dir=/sys/kernel/security + mount -t securityfs security "${securityfs_dir}" + fi + + if [ ! 
-d "${securityfs_dir}" ]; then + echo "${securityfs_dir}: securityfs is not mounted" && exit 1 + fi +} + setup() { local tmp_dir="$1" @@ -33,6 +47,7 @@ setup() cp "${TEST_BINARY}" "${mount_dir}" local mount_uuid="$(blkid ${loop_device} | sed 's/.*UUID="\([^"]*\)".*/\1/')" + ensure_mount_securityfs echo "measure func=BPRM_CHECK fsuuid=${mount_uuid}" > ${IMA_POLICY_FILE} } From d932e043b9d6d60113e90267ae2fbe4e946d7b08 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Thu, 3 Dec 2020 19:14:36 +0000 Subject: [PATCH 094/118] selftests/bpf: Add config dependency on BLK_DEV_LOOP The ima selftest restricts its scope to a test filesystem image mounted on a loop device and prevents permanent ima policy changes for the whole system. Fixes: 34b82d3ac105 ("bpf: Add a selftest for bpf_ima_inode_hash") Reported-by: Andrii Nakryiko Signed-off-by: KP Singh Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201203191437.666737-4-kpsingh@chromium.org --- tools/testing/selftests/bpf/config | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 365bf9771b07..37e1f303fc11 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -43,3 +43,4 @@ CONFIG_IMA=y CONFIG_SECURITYFS=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_READ_POLICY=y +CONFIG_BLK_DEV_LOOP=y From ffebecd9d49542046c5ecbb410af01e016636e19 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Thu, 3 Dec 2020 19:14:37 +0000 Subject: [PATCH 095/118] selftests/bpf: Indent ima_setup.sh with tabs. The file was formatted with spaces instead of tabs and went unnoticed as checkpatch.pl did not complain (probably because this is a shell script). Re-indent it with tabs to be consistent with other scripts. Signed-off-by: KP Singh Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201203191437.666737-5-kpsingh@chromium.org --- tools/testing/selftests/bpf/ima_setup.sh | 102 +++++++++++------------ 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/tools/testing/selftests/bpf/ima_setup.sh b/tools/testing/selftests/bpf/ima_setup.sh index b1ee4bf06996..2bfc646bc230 100755 --- a/tools/testing/selftests/bpf/ima_setup.sh +++ b/tools/testing/selftests/bpf/ima_setup.sh @@ -10,90 +10,90 @@ TEST_BINARY="/bin/true" usage() { - echo "Usage: $0 " - exit 1 + echo "Usage: $0 " + exit 1 } ensure_mount_securityfs() { - local securityfs_dir=$(grep "securityfs" /proc/mounts | awk '{print $2}') + local securityfs_dir=$(grep "securityfs" /proc/mounts | awk '{print $2}') - if [ -z "${securityfs_dir}" ]; then - securityfs_dir=/sys/kernel/security - mount -t securityfs security "${securityfs_dir}" - fi + if [ -z "${securityfs_dir}" ]; then + securityfs_dir=/sys/kernel/security + mount -t securityfs security "${securityfs_dir}" + fi - if [ ! -d "${securityfs_dir}" ]; then - echo "${securityfs_dir}: securityfs is not mounted" && exit 1 - fi + if [ ! 
-d "${securityfs_dir}" ]; then + echo "${securityfs_dir}: securityfs is not mounted" && exit 1 + fi } setup() { - local tmp_dir="$1" - local mount_img="${tmp_dir}/test.img" - local mount_dir="${tmp_dir}/mnt" - local copied_bin_path="${mount_dir}/$(basename ${TEST_BINARY})" - mkdir -p ${mount_dir} + local tmp_dir="$1" + local mount_img="${tmp_dir}/test.img" + local mount_dir="${tmp_dir}/mnt" + local copied_bin_path="${mount_dir}/$(basename ${TEST_BINARY})" + mkdir -p ${mount_dir} - dd if=/dev/zero of="${mount_img}" bs=1M count=10 + dd if=/dev/zero of="${mount_img}" bs=1M count=10 - losetup -f "${mount_img}" - local loop_device=$(losetup -a | grep ${mount_img:?} | cut -d ":" -f1) + losetup -f "${mount_img}" + local loop_device=$(losetup -a | grep ${mount_img:?} | cut -d ":" -f1) - mkfs.ext2 "${loop_device:?}" - mount "${loop_device}" "${mount_dir}" + mkfs.ext2 "${loop_device:?}" + mount "${loop_device}" "${mount_dir}" - cp "${TEST_BINARY}" "${mount_dir}" - local mount_uuid="$(blkid ${loop_device} | sed 's/.*UUID="\([^"]*\)".*/\1/')" + cp "${TEST_BINARY}" "${mount_dir}" + local mount_uuid="$(blkid ${loop_device} | sed 's/.*UUID="\([^"]*\)".*/\1/')" - ensure_mount_securityfs - echo "measure func=BPRM_CHECK fsuuid=${mount_uuid}" > ${IMA_POLICY_FILE} + ensure_mount_securityfs + echo "measure func=BPRM_CHECK fsuuid=${mount_uuid}" > ${IMA_POLICY_FILE} } cleanup() { - local tmp_dir="$1" - local mount_img="${tmp_dir}/test.img" - local mount_dir="${tmp_dir}/mnt" + local tmp_dir="$1" + local mount_img="${tmp_dir}/test.img" + local mount_dir="${tmp_dir}/mnt" - local loop_devices=$(losetup -a | grep ${mount_img:?} | cut -d ":" -f1) + local loop_devices=$(losetup -a | grep ${mount_img:?} | cut -d ":" -f1) - for loop_dev in "${loop_devices}"; do - losetup -d $loop_dev - done + for loop_dev in "${loop_devices}"; do + losetup -d $loop_dev + done - umount ${mount_dir} - rm -rf ${tmp_dir} + umount ${mount_dir} + rm -rf ${tmp_dir} } run() { - local tmp_dir="$1" - local mount_dir="${tmp_dir}/mnt" - local copied_bin_path="${mount_dir}/$(basename ${TEST_BINARY})" + local tmp_dir="$1" + local mount_dir="${tmp_dir}/mnt" + local copied_bin_path="${mount_dir}/$(basename ${TEST_BINARY})" - exec "${copied_bin_path}" + exec "${copied_bin_path}" } main() { - [[ $# -ne 2 ]] && usage + [[ $# -ne 2 ]] && usage - local action="$1" - local tmp_dir="$2" + local action="$1" + local tmp_dir="$2" - [[ ! -d "${tmp_dir}" ]] && echo "Directory ${tmp_dir} doesn't exist" && exit 1 + [[ ! -d "${tmp_dir}" ]] && echo "Directory ${tmp_dir} doesn't exist" && exit 1 - if [[ "${action}" == "setup" ]]; then - setup "${tmp_dir}" - elif [[ "${action}" == "cleanup" ]]; then - cleanup "${tmp_dir}" - elif [[ "${action}" == "run" ]]; then - run "${tmp_dir}" - else - echo "Unknown action: ${action}" - exit 1 - fi + if [[ "${action}" == "setup" ]]; then + setup "${tmp_dir}" + elif [[ "${action}" == "cleanup" ]]; then + cleanup "${tmp_dir}" + elif [[ "${action}" == "run" ]]; then + run "${tmp_dir}" + else + echo "Unknown action: ${action}" + exit 1 + fi } main "$@" From 80b2b5c3a701d56de98d00d99bc9cc384fb316d9 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Wed, 2 Dec 2020 23:34:10 -0500 Subject: [PATCH 096/118] libbpf: Fail early when loading programs with unspecified type Before this patch, a program with unspecified type (BPF_PROG_TYPE_UNSPEC) would be passed to the BPF syscall, only to have the kernel reject it with an opaque invalid argument error. 
This patch makes libbpf reject such programs with a nicer error message - in particular libbpf now tries to diagnose bad ELF section names at both open time and load time. Signed-off-by: Andrei Matei Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201203043410.59699-1-andreimatei1@gmail.com --- tools/lib/bpf/libbpf.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 313034117070..d6f45538444d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6629,6 +6629,16 @@ load_program(struct bpf_program *prog, struct bpf_insn *insns, int insns_cnt, char *log_buf = NULL; int btf_fd, ret; + if (prog->type == BPF_PROG_TYPE_UNSPEC) { + /* + * The program type must be set. Most likely we couldn't find a proper + * section definition at load time, and thus we didn't infer the type. + */ + pr_warn("prog '%s': missing BPF prog type, check ELF section name '%s'\n", + prog->name, prog->sec_name); + return -EINVAL; + } + if (!insns || !insns_cnt) return -EINVAL; @@ -6920,9 +6930,12 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, bpf_object__for_each_program(prog, obj) { prog->sec_def = find_sec_def(prog->sec_name); - if (!prog->sec_def) + if (!prog->sec_def) { /* couldn't guess, but user might manually specify */ + pr_debug("prog '%s': unrecognized ELF section name '%s'\n", + prog->name, prog->sec_name); continue; + } if (prog->sec_def->is_sleepable) prog->prog_flags |= BPF_F_SLEEPABLE; From 9cf309c56f7910a81fbe053b6f11c3b1f0987b12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 3 Dec 2020 10:33:06 +0100 Subject: [PATCH 097/118] libbpf: Sanitise map names before pinning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we added sanitising of map names before loading programs to libbpf, we still allowed periods in the name. While the kernel will accept these for the map names themselves, they are not allowed in file names when pinning maps. This means that bpf_object__pin_maps() will fail if called on an object that contains internal maps (such as sections .rodata). Fix this by replacing periods with underscores when constructing map pin paths. This only affects the paths generated by libbpf when bpf_object__pin_maps() is called with a path argument. Any pin paths set by bpf_map__set_pin_path() are unaffected, and it will still be up to the caller to avoid invalid characters in those. 
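A minimal sketch of the affected call pattern (object name and pin root
are illustrative): an object with a .rodata section gets an internal
map whose name contains a period, so the pin path derived from the path
argument used to fail at the BPF_OBJ_PIN step:

/* Sketch only: with an internal map named e.g. "prog_ro.rodata", the
 * derived path "/sys/fs/bpf/prog_ro.rodata" was rejected by bpffs;
 * after this fix it is sanitized to "/sys/fs/bpf/prog_ro_rodata".
 */
struct bpf_object *obj = bpf_object__open_file("prog.bpf.o", NULL);
int err;

if (!libbpf_get_error(obj) && !bpf_object__load(obj))
	err = bpf_object__pin_maps(obj, "/sys/fs/bpf");
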
Fixes: 113e6b7e15e2 ("libbpf: Sanitise internal map names so they are not rejected by the kernel")
Signed-off-by: Toke Høiland-Jørgensen
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20201203093306.107676-1-toke@redhat.com
---
 tools/lib/bpf/libbpf.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index d6f45538444d..b2e16efabde0 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -7659,6 +7659,16 @@ bool bpf_map__is_pinned(const struct bpf_map *map)
 	return map->pinned;
 }

+static void sanitize_pin_path(char *s)
+{
+	/* bpffs disallows periods in path names */
+	while (*s) {
+		if (*s == '.')
+			*s = '_';
+		s++;
+	}
+}
+
 int bpf_object__pin_maps(struct bpf_object *obj, const char *path)
 {
 	struct bpf_map *map;
@@ -7688,6 +7698,7 @@ int bpf_object__pin_maps(struct bpf_object *obj, const char *path)
 			err = -ENAMETOOLONG;
 			goto err_unpin_maps;
 		}
+		sanitize_pin_path(buf);
 		pin_path = buf;
 	} else if (!map->pin_path) {
 		continue;
@@ -7732,6 +7743,7 @@ int bpf_object__unpin_maps(struct bpf_object *obj, const char *path)
 			return -EINVAL;
 		else if (len >= PATH_MAX)
 			return -ENAMETOOLONG;
+		sanitize_pin_path(buf);
 		pin_path = buf;
 	} else if (!map->pin_path) {
 		continue;

From d6d418bd8f92aaa4c7c26d606188147c2ee0dae9 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev
Date: Wed, 2 Dec 2020 15:13:32 -0800
Subject: [PATCH 098/118] libbpf: Cap retries in sys_bpf_prog_load

I've seen a situation where a process that's under pprof constantly
generates SIGPROF, which prevents program loading indefinitely.

The right thing to do is probably to disable signals in the upper
layers while loading, but it would still be nice to get some error from
libbpf instead of an endless loop.

Let's add a small retry limit to program loading: try loading the
program 5 (arbitrary) times and give up.

v2:
* 10 -> 5 retries (Andrii Nakryiko)

Signed-off-by: Stanislav Fomichev
Signed-off-by: Andrii Nakryiko
Acked-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20201202231332.3923644-1-sdf@google.com
---
 tools/lib/bpf/bpf.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index d27e34133973..4025266d0fb0 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -67,11 +67,12 @@ static inline int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr,

 static inline int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size)
 {
+	int retries = 5;
 	int fd;

 	do {
 		fd = sys_bpf(BPF_PROG_LOAD, attr, size);
-	} while (fd < 0 && errno == EAGAIN);
+	} while (fd < 0 && errno == EAGAIN && retries-- > 0);

 	return fd;
 }

From 58c185b85d0c1753b0b6a9390294bd883faf4d77 Mon Sep 17 00:00:00 2001
From: Brendan Jackman
Date: Thu, 3 Dec 2020 12:08:50 +0000
Subject: [PATCH 099/118] bpf: Fix cold build of test_progs-no_alu32

This object lives inside the trunner output dir, i.e.
tools/testing/selftests/bpf/no_alu32/btf_data.o

At some point it gets copied into the parent directory during another
part of the build, but that doesn't happen when building
test_progs-no_alu32 from clean.
Signed-off-by: Brendan Jackman Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20201203120850.859170-1-jackmanb@google.com --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 894192c319fb..371b022d932c 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -378,7 +378,7 @@ $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ | $(TRUNNER_BINARY)-extras $$(call msg,BINARY,,$$@) $(Q)$$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) -o $$@ - $(Q)$(RESOLVE_BTFIDS) --no-fail --btf btf_data.o $$@ + $(Q)$(RESOLVE_BTFIDS) --no-fail --btf $(TRUNNER_OUTPUT)/btf_data.o $$@ endef From 2faa7328f53b36b2b171501154bba3fd66d8f5da Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 3 Dec 2020 11:44:52 +0000 Subject: [PATCH 100/118] samples/bpf: Fix spelling mistake "recieving" -> "receiving" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a spelling mistake in an error message. Fix it. Signed-off-by: Colin Ian King Signed-off-by: Andrii Nakryiko Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20201203114452.1060017-1-colin.king@canonical.com --- samples/bpf/xdpsock_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 74a578f880b5..568f9815bb1b 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -1570,7 +1570,7 @@ recv_xsks_map_fd(int *xsks_map_fd) err = recv_xsks_map_fd_from_ctrl_node(sock, xsks_map_fd); if (err) { - fprintf(stderr, "Error %d recieving fd\n", err); + fprintf(stderr, "Error %d receiving fd\n", err); return err; } return 0; From cb81110997d1f5097f29dd8e49d32a1fc55cbf86 Mon Sep 17 00:00:00 2001 From: Prankur gupta Date: Wed, 2 Dec 2020 13:31:51 -0800 Subject: [PATCH 101/118] bpf: Adds support for setting window clamp Adds a new bpf_setsockopt for TCP sockets, TCP_BPF_WINDOW_CLAMP, which sets the maximum receiver window size. It will be useful for limiting receiver window based on RTT. 
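For context, the intended use from a sockops program mirrors the
selftest added two patches later in this series (9216 is the clamp
value that selftest uses); a minimal sketch:

SEC("sockops")
int clamp_win(struct bpf_sock_ops *skops)
{
	int clamp = 9216;	/* upper bound for the advertised window */

	if (skops->op == BPF_SOCK_OPS_TCP_CONNECT_CB)
		bpf_setsockopt(skops, SOL_TCP, TCP_WINDOW_CLAMP,
			       &clamp, sizeof(clamp));
	return 1;
}
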
Signed-off-by: Prankur gupta
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/20201202213152.435886-2-prankgup@fb.com
---
 include/net/tcp.h |  1 +
 net/core/filter.c |  3 +++
 net/ipv4/tcp.c    | 25 ++++++++++++++++---------
 3 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4aba0f069b05..347a76f176b4 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -406,6 +406,7 @@ void tcp_syn_ack_timeout(const struct request_sock *req);
 int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		int nonblock, int flags, int *addr_len);
 int tcp_set_rcvlowat(struct sock *sk, int val);
+int tcp_set_window_clamp(struct sock *sk, int val);
 void tcp_data_ready(struct sock *sk);
 #ifdef CONFIG_MMU
 int tcp_mmap(struct file *file, struct socket *sock,
diff --git a/net/core/filter.c b/net/core/filter.c
index 21d91dcf0260..77001a35768f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4910,6 +4910,9 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
 			tp->notsent_lowat = val;
 			sk->sk_write_space(sk);
 			break;
+		case TCP_WINDOW_CLAMP:
+			ret = tcp_set_window_clamp(sk, val);
+			break;
 		default:
 			ret = -EINVAL;
 		}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b2bc3d7fe9e8..17379f6dd955 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3022,6 +3022,21 @@ int tcp_sock_set_keepcnt(struct sock *sk, int val)
 }
 EXPORT_SYMBOL(tcp_sock_set_keepcnt);

+int tcp_set_window_clamp(struct sock *sk, int val)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!val) {
+		if (sk->sk_state != TCP_CLOSE)
+			return -EINVAL;
+		tp->window_clamp = 0;
+	} else {
+		tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
+			SOCK_MIN_RCVBUF / 2 : val;
+	}
+	return 0;
+}
+
 /*
  *	Socket option code for TCP.
  */
@@ -3235,15 +3250,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 		break;

 	case TCP_WINDOW_CLAMP:
-		if (!val) {
-			if (sk->sk_state != TCP_CLOSE) {
-				err = -EINVAL;
-				break;
-			}
-			tp->window_clamp = 0;
-		} else
-			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
-				SOCK_MIN_RCVBUF / 2 : val;
+		err = tcp_set_window_clamp(sk, val);
 		break;

 	case TCP_QUICKACK:

From 55144f31f0d2fdd3e74ead67f1649bf577961eaa Mon Sep 17 00:00:00 2001
From: Prankur gupta
Date: Wed, 2 Dec 2020 13:31:52 -0800
Subject: [PATCH 102/118] selftests/bpf: Add Userspace tests for TCP_WINDOW_CLAMP

Adding selftests for the newly added functionality to set
TCP_WINDOW_CLAMP from bpf setsockopt.
Signed-off-by: Prankur gupta
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/20201202213152.435886-3-prankgup@fb.com
---
 tools/testing/selftests/bpf/bpf_tcp_helpers.h |  1 +
 .../selftests/bpf/prog_tests/tcpbpf_user.c    |  4 +++
 .../selftests/bpf/progs/test_tcpbpf_kern.c    | 33 +++++++++++++++++++
 tools/testing/selftests/bpf/test_tcpbpf.h     |  2 ++
 4 files changed, 40 insertions(+)

diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
index 2915664c335d..6a9053162cf2 100644
--- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
@@ -56,6 +56,7 @@ struct tcp_sock {
 	__u32 rcv_nxt;
 	__u32 snd_nxt;
 	__u32 snd_una;
+	__u32 window_clamp;
 	__u8  ecn_flags;
 	__u32 delivered;
 	__u32 delivered_ce;
diff --git a/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c b/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c
index ab5281475f44..87923d2865b7 100644
--- a/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c
+++ b/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c
@@ -42,6 +42,10 @@ static void verify_result(struct tcpbpf_globals *result)

 	/* check getsockopt for SAVED_SYN */
 	ASSERT_EQ(result->tcp_saved_syn, 1, "tcp_saved_syn");
+
+	/* check getsockopt for window_clamp */
+	ASSERT_EQ(result->window_clamp_client, 9216, "window_clamp_client");
+	ASSERT_EQ(result->window_clamp_server, 9216, "window_clamp_server");
 }

 static void run_test(struct tcpbpf_globals *result)
diff --git a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c
index e85e49deba70..94f50f7e94d6 100644
--- a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c
@@ -12,17 +12,41 @@
 #include
 #include
 #include
+#include "bpf_tcp_helpers.h"
 #include "test_tcpbpf.h"

 struct tcpbpf_globals global = {};
 int _version SEC("version") = 1;

+/**
+ * SOL_TCP is defined in <netinet/tcp.h> while
+ * TCP_SAVED_SYN is defined in <linux/tcp.h>, already included
+ */
+#ifndef SOL_TCP
+#define SOL_TCP 6
+#endif
+
+static __always_inline int get_tp_window_clamp(struct bpf_sock_ops *skops)
+{
+	struct bpf_sock *sk;
+	struct tcp_sock *tp;
+
+	sk = skops->sk;
+	if (!sk)
+		return -1;
+	tp = bpf_skc_to_tcp_sock(sk);
+	if (!tp)
+		return -1;
+	return tp->window_clamp;
+}
+
 SEC("sockops")
 int bpf_testcb(struct bpf_sock_ops *skops)
 {
 	char header[sizeof(struct ipv6hdr) + sizeof(struct tcphdr)];
 	struct bpf_sock_ops *reuse = skops;
 	struct tcphdr *thdr;
+	int window_clamp = 9216;
 	int good_call_rv = 0;
 	int bad_call_rv = 0;
 	int save_syn = 1;
@@ -75,6 +99,11 @@ int bpf_testcb(struct bpf_sock_ops *skops)
 	global.event_map |= (1 << op);

 	switch (op) {
+	case BPF_SOCK_OPS_TCP_CONNECT_CB:
+		rv = bpf_setsockopt(skops, SOL_TCP, TCP_WINDOW_CLAMP,
+				    &window_clamp, sizeof(window_clamp));
+		global.window_clamp_client = get_tp_window_clamp(skops);
+		break;
 	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
 		/* Test failure to set largest cb flag (assumes not defined) */
 		global.bad_cb_test_rv = bpf_sock_ops_cb_flags_set(skops, 0x80);
@@ -100,6 +129,10 @@ int bpf_testcb(struct bpf_sock_ops *skops)
 				global.tcp_saved_syn = v;
 			}
 		}
+		rv = bpf_setsockopt(skops, SOL_TCP, TCP_WINDOW_CLAMP,
+				    &window_clamp, sizeof(window_clamp));
+
+		global.window_clamp_server = get_tp_window_clamp(skops);
 		break;
 	case BPF_SOCK_OPS_RTO_CB:
 		break;
diff --git a/tools/testing/selftests/bpf/test_tcpbpf.h b/tools/testing/selftests/bpf/test_tcpbpf.h
index 0ed33521cbbb..9dd9b5590f9d 100644
--- a/tools/testing/selftests/bpf/test_tcpbpf.h
+++ b/tools/testing/selftests/bpf/test_tcpbpf.h
@@ -16,5 +16,7 @@ struct tcpbpf_globals {
 	__u32 num_close_events;
 	__u32 tcp_save_syn;
 	__u32 tcp_saved_syn;
+	__u32 window_clamp_client;
+	__u32 window_clamp_server;
 };
 #endif

From 12cc126df82c96c89706aa207ad27c56f219047c Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Thu, 3 Dec 2020 12:46:21 -0800
Subject: [PATCH 103/118] bpf: Fix bpf_put_raw_tracepoint()'s use of __module_address()

__module_address() needs to be called with preemption disabled or with
module_mutex taken. preempt_disable() is enough for read-only uses,
which is what this fix does. Also, module_put() does an internal NULL
check, so the explicit one is dropped as well.

Fixes: a38d1107f937 ("bpf: support raw tracepoints in modules")
Signed-off-by: Andrii Nakryiko
Signed-off-by: Alexei Starovoitov
Acked-by: Martin KaFai Lau
Link: https://lore.kernel.org/bpf/20201203204634.1325171-2-andrii@kernel.org
---
 kernel/trace/bpf_trace.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d255bc9b2bfa..23a390aac524 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2060,10 +2060,12 @@ struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name)

 void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
 {
-	struct module *mod = __module_address((unsigned long)btp);
+	struct module *mod;

-	if (mod)
-		module_put(mod);
+	preempt_disable();
+	mod = __module_address((unsigned long)btp);
+	module_put(mod);
+	preempt_enable();
 }

 static __always_inline

From 2fe8890848c799515a881502339a0a7b2b555988 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Thu, 3 Dec 2020 12:46:22 -0800
Subject: [PATCH 104/118] bpf: Keep module's btf_data_size intact after load

Having the real btf_data_size stored in struct module is beneficial for
quickly determining which kernel modules have an associated BTF object
and which don't. There is no harm in keeping this info, as opposed to
keeping an invalid pointer.

Fixes: 607c543f939d ("bpf: Sanitize BTF data pointer after module is loaded")
Signed-off-by: Andrii Nakryiko
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/20201203204634.1325171-3-andrii@kernel.org
---
 kernel/module.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/module.c b/kernel/module.c
index 18f259d61d14..c3a9e972d3b2 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3712,7 +3712,6 @@ static noinline int do_init_module(struct module *mod)
 #ifdef CONFIG_DEBUG_INFO_BTF_MODULES
 	/* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */
 	mod->btf_data = NULL;
-	mod->btf_data_size = 0;
 #endif
 	/*
 	 * We want to free module_init, but be aware that kallsyms may be

From a19f93cfafdf85851c69bc9f677aa4f40c53610f Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Thu, 3 Dec 2020 12:46:23 -0800
Subject: [PATCH 105/118] libbpf: Add internal helper to load BTF data by FD

Add a btf_get_from_fd() helper, which constructs struct btf from
in-kernel BTF data by FD. This is used for loading module BTFs.
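The helper follows the usual two-call bpf_obj_get_info_by_fd()
protocol; a condensed sketch of that pattern (btf_fd is assumed to be a
valid BTF object FD, error handling trimmed):

/* Probe with a default-sized buffer; if the kernel reports a bigger
 * btf_size, grow the buffer and issue the call once more.
 */
struct bpf_btf_info info = {};
__u32 len = sizeof(info);
void *ptr = malloc(4096);
int err;

info.btf = ptr_to_u64(ptr);	/* libbpf's internal pointer-to-u64 cast */
info.btf_size = 4096;
err = bpf_obj_get_info_by_fd(btf_fd, &info, &len);
if (!err && info.btf_size > 4096) {
	ptr = realloc(ptr, info.btf_size);
	/* re-zero info, point it at the grown buffer, call again */
}
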
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201203204634.1325171-4-andrii@kernel.org --- tools/lib/bpf/btf.c | 61 +++++++++++++++++++-------------- tools/lib/bpf/libbpf_internal.h | 1 + 2 files changed, 36 insertions(+), 26 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 1935e83d309c..3c3f2bc6c652 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1323,35 +1323,27 @@ const char *btf__name_by_offset(const struct btf *btf, __u32 offset) return btf__str_by_offset(btf, offset); } -int btf__get_from_id(__u32 id, struct btf **btf) +struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf) { - struct bpf_btf_info btf_info = { 0 }; + struct bpf_btf_info btf_info; __u32 len = sizeof(btf_info); __u32 last_size; - int btf_fd; + struct btf *btf; void *ptr; int err; - err = 0; - *btf = NULL; - btf_fd = bpf_btf_get_fd_by_id(id); - if (btf_fd < 0) - return 0; - /* we won't know btf_size until we call bpf_obj_get_info_by_fd(). so * let's start with a sane default - 4KiB here - and resize it only if * bpf_obj_get_info_by_fd() needs a bigger buffer. */ - btf_info.btf_size = 4096; - last_size = btf_info.btf_size; + last_size = 4096; ptr = malloc(last_size); - if (!ptr) { - err = -ENOMEM; - goto exit_free; - } + if (!ptr) + return ERR_PTR(-ENOMEM); - memset(ptr, 0, last_size); + memset(&btf_info, 0, sizeof(btf_info)); btf_info.btf = ptr_to_u64(ptr); + btf_info.btf_size = last_size; err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len); if (!err && btf_info.btf_size > last_size) { @@ -1360,31 +1352,48 @@ int btf__get_from_id(__u32 id, struct btf **btf) last_size = btf_info.btf_size; temp_ptr = realloc(ptr, last_size); if (!temp_ptr) { - err = -ENOMEM; + btf = ERR_PTR(-ENOMEM); goto exit_free; } ptr = temp_ptr; - memset(ptr, 0, last_size); + + len = sizeof(btf_info); + memset(&btf_info, 0, sizeof(btf_info)); btf_info.btf = ptr_to_u64(ptr); + btf_info.btf_size = last_size; + err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len); } if (err || btf_info.btf_size > last_size) { - err = errno; + btf = err ? 
ERR_PTR(-errno) : ERR_PTR(-E2BIG); goto exit_free; } - *btf = btf__new((__u8 *)(long)btf_info.btf, btf_info.btf_size); - if (IS_ERR(*btf)) { - err = PTR_ERR(*btf); - *btf = NULL; - } + btf = btf_new(ptr, btf_info.btf_size, base_btf); exit_free: - close(btf_fd); free(ptr); + return btf; +} - return err; +int btf__get_from_id(__u32 id, struct btf **btf) +{ + struct btf *res; + int btf_fd; + + *btf = NULL; + btf_fd = bpf_btf_get_fd_by_id(id); + if (btf_fd < 0) + return -errno; + + res = btf_get_from_fd(btf_fd, NULL); + close(btf_fd); + if (IS_ERR(res)) + return PTR_ERR(res); + + *btf = res; + return 0; } int btf__get_map_kv_tids(const struct btf *btf, const char *map_name, diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index d99bc847bf84..e569ae63808e 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -155,6 +155,7 @@ int bpf_object__section_size(const struct bpf_object *obj, const char *name, __u32 *size); int bpf_object__variable_offset(const struct bpf_object *obj, const char *name, __u32 *off); +struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf); struct btf_ext_info { /* From 0f7515ca7cddadabe04e28e20a257b1bbb6cb98a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Dec 2020 12:46:24 -0800 Subject: [PATCH 106/118] libbpf: Refactor CO-RE relocs to not assume a single BTF object Refactor CO-RE relocation candidate search to not expect a single BTF, rather return all candidate types with their corresponding BTF objects. This will allow to extend CO-RE relocations to accommodate kernel module BTFs. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201203204634.1325171-5-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 187 ++++++++++++++++++++++++----------------- 1 file changed, 111 insertions(+), 76 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index b2e16efabde0..2e4fa3ce6b94 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -462,11 +462,14 @@ struct bpf_object { struct list_head list; struct btf *btf; + struct btf_ext *btf_ext; + /* Parse and load BTF vmlinux if any of the programs in the object need * it at load time. 
*/ struct btf *btf_vmlinux; - struct btf_ext *btf_ext; + /* vmlinux BTF override for CO-RE relocations */ + struct btf *btf_vmlinux_override; void *priv; bpf_object_clear_priv_t clear_priv; @@ -4600,46 +4603,43 @@ static size_t bpf_core_essential_name_len(const char *name) return n; } -/* dynamically sized list of type IDs */ -struct ids_vec { - __u32 *data; +struct core_cand +{ + const struct btf *btf; + const struct btf_type *t; + const char *name; + __u32 id; +}; + +/* dynamically sized list of type IDs and its associated struct btf */ +struct core_cand_list { + struct core_cand *cands; int len; }; -static void bpf_core_free_cands(struct ids_vec *cand_ids) +static void bpf_core_free_cands(struct core_cand_list *cands) { - free(cand_ids->data); - free(cand_ids); + free(cands->cands); + free(cands); } -static struct ids_vec *bpf_core_find_cands(const struct btf *local_btf, - __u32 local_type_id, - const struct btf *targ_btf) +static int bpf_core_add_cands(struct core_cand *local_cand, + size_t local_essent_len, + const struct btf *targ_btf, + const char *targ_btf_name, + int targ_start_id, + struct core_cand_list *cands) { - size_t local_essent_len, targ_essent_len; - const char *local_name, *targ_name; - const struct btf_type *t, *local_t; - struct ids_vec *cand_ids; - __u32 *new_ids; - int i, err, n; - - local_t = btf__type_by_id(local_btf, local_type_id); - if (!local_t) - return ERR_PTR(-EINVAL); - - local_name = btf__name_by_offset(local_btf, local_t->name_off); - if (str_is_empty(local_name)) - return ERR_PTR(-EINVAL); - local_essent_len = bpf_core_essential_name_len(local_name); - - cand_ids = calloc(1, sizeof(*cand_ids)); - if (!cand_ids) - return ERR_PTR(-ENOMEM); + struct core_cand *new_cands, *cand; + const struct btf_type *t; + const char *targ_name; + size_t targ_essent_len; + int n, i; n = btf__get_nr_types(targ_btf); - for (i = 1; i <= n; i++) { + for (i = targ_start_id; i <= n; i++) { t = btf__type_by_id(targ_btf, i); - if (btf_kind(t) != btf_kind(local_t)) + if (btf_kind(t) != btf_kind(local_cand->t)) continue; targ_name = btf__name_by_offset(targ_btf, t->name_off); @@ -4650,25 +4650,62 @@ static struct ids_vec *bpf_core_find_cands(const struct btf *local_btf, if (targ_essent_len != local_essent_len) continue; - if (strncmp(local_name, targ_name, local_essent_len) == 0) { - pr_debug("CO-RE relocating [%d] %s %s: found target candidate [%d] %s %s\n", - local_type_id, btf_kind_str(local_t), - local_name, i, btf_kind_str(t), targ_name); - new_ids = libbpf_reallocarray(cand_ids->data, - cand_ids->len + 1, - sizeof(*cand_ids->data)); - if (!new_ids) { - err = -ENOMEM; - goto err_out; - } - cand_ids->data = new_ids; - cand_ids->data[cand_ids->len++] = i; - } + if (strncmp(local_cand->name, targ_name, local_essent_len) != 0) + continue; + + pr_debug("CO-RE relocating [%d] %s %s: found target candidate [%d] %s %s in [%s]\n", + local_cand->id, btf_kind_str(local_cand->t), + local_cand->name, i, btf_kind_str(t), targ_name, + targ_btf_name); + new_cands = libbpf_reallocarray(cands->cands, cands->len + 1, + sizeof(*cands->cands)); + if (!new_cands) + return -ENOMEM; + + cand = &new_cands[cands->len]; + cand->btf = targ_btf; + cand->t = t; + cand->name = targ_name; + cand->id = i; + + cands->cands = new_cands; + cands->len++; } - return cand_ids; -err_out: - bpf_core_free_cands(cand_ids); - return ERR_PTR(err); + return 0; +} + +static struct core_cand_list * +bpf_core_find_cands(struct bpf_object *obj, const struct btf *local_btf, __u32 local_type_id) +{ + struct core_cand local_cand 
= {}; + struct core_cand_list *cands; + size_t local_essent_len; + int err; + + local_cand.btf = local_btf; + local_cand.t = btf__type_by_id(local_btf, local_type_id); + if (!local_cand.t) + return ERR_PTR(-EINVAL); + + local_cand.name = btf__name_by_offset(local_btf, local_cand.t->name_off); + if (str_is_empty(local_cand.name)) + return ERR_PTR(-EINVAL); + local_essent_len = bpf_core_essential_name_len(local_cand.name); + + cands = calloc(1, sizeof(*cands)); + if (!cands) + return ERR_PTR(-ENOMEM); + + /* Attempt to find target candidates in vmlinux BTF first */ + err = bpf_core_add_cands(&local_cand, local_essent_len, + obj->btf_vmlinux_override ?: obj->btf_vmlinux, + "vmlinux", 1, cands); + if (err) { + bpf_core_free_cands(cands); + return ERR_PTR(err); + } + + return cands; } /* Check two types for compatibility for the purpose of field access @@ -5661,7 +5698,6 @@ static int bpf_core_apply_relo(struct bpf_program *prog, const struct bpf_core_relo *relo, int relo_idx, const struct btf *local_btf, - const struct btf *targ_btf, struct hashmap *cand_cache) { struct bpf_core_spec local_spec, cand_spec, targ_spec = {}; @@ -5669,8 +5705,8 @@ static int bpf_core_apply_relo(struct bpf_program *prog, struct bpf_core_relo_res cand_res, targ_res; const struct btf_type *local_type; const char *local_name; - struct ids_vec *cand_ids; - __u32 local_id, cand_id; + struct core_cand_list *cands = NULL; + __u32 local_id; const char *spec_str; int i, j, err; @@ -5717,24 +5753,24 @@ static int bpf_core_apply_relo(struct bpf_program *prog, return -EOPNOTSUPP; } - if (!hashmap__find(cand_cache, type_key, (void **)&cand_ids)) { - cand_ids = bpf_core_find_cands(local_btf, local_id, targ_btf); - if (IS_ERR(cand_ids)) { + if (!hashmap__find(cand_cache, type_key, (void **)&cands)) { + cands = bpf_core_find_cands(prog->obj, local_btf, local_id); + if (IS_ERR(cands)) { pr_warn("prog '%s': relo #%d: target candidate search failed for [%d] %s %s: %ld", prog->name, relo_idx, local_id, btf_kind_str(local_type), - local_name, PTR_ERR(cand_ids)); - return PTR_ERR(cand_ids); + local_name, PTR_ERR(cands)); + return PTR_ERR(cands); } - err = hashmap__set(cand_cache, type_key, cand_ids, NULL, NULL); + err = hashmap__set(cand_cache, type_key, cands, NULL, NULL); if (err) { - bpf_core_free_cands(cand_ids); + bpf_core_free_cands(cands); return err; } } - for (i = 0, j = 0; i < cand_ids->len; i++) { - cand_id = cand_ids->data[i]; - err = bpf_core_spec_match(&local_spec, targ_btf, cand_id, &cand_spec); + for (i = 0, j = 0; i < cands->len; i++) { + err = bpf_core_spec_match(&local_spec, cands->cands[i].btf, + cands->cands[i].id, &cand_spec); if (err < 0) { pr_warn("prog '%s': relo #%d: error matching candidate #%d ", prog->name, relo_idx, i); @@ -5778,7 +5814,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, return -EINVAL; } - cand_ids->data[j++] = cand_spec.root_type_id; + cands->cands[j++] = cands->cands[i]; } /* @@ -5790,7 +5826,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, * depending on relo's kind. 
 */
 	if (j > 0)
-		cand_ids->len = j;
+		cands->len = j;

 	/*
 	 * If no candidates were found, it might be both a programmer error,
@@ -5834,20 +5870,19 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path)
 	struct hashmap_entry *entry;
 	struct hashmap *cand_cache = NULL;
 	struct bpf_program *prog;
-	struct btf *targ_btf;
 	const char *sec_name;
 	int i, err = 0, insn_idx, sec_idx;

 	if (obj->btf_ext->core_relo_info.len == 0)
 		return 0;

-	if (targ_btf_path)
-		targ_btf = btf__parse(targ_btf_path, NULL);
-	else
-		targ_btf = obj->btf_vmlinux;
-	if (IS_ERR_OR_NULL(targ_btf)) {
-		pr_warn("failed to get target BTF: %ld\n", PTR_ERR(targ_btf));
-		return PTR_ERR(targ_btf);
+	if (targ_btf_path) {
+		obj->btf_vmlinux_override = btf__parse(targ_btf_path, NULL);
+		if (IS_ERR_OR_NULL(obj->btf_vmlinux_override)) {
+			err = PTR_ERR(obj->btf_vmlinux_override);
+			pr_warn("failed to parse target BTF: %d\n", err);
+			return err;
+		}
 	}

 	cand_cache = hashmap__new(bpf_core_hash_fn, bpf_core_equal_fn, NULL);
@@ -5899,8 +5934,7 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path)
 			if (!prog->load)
 				continue;

-			err = bpf_core_apply_relo(prog, rec, i, obj->btf,
-						  targ_btf, cand_cache);
+			err = bpf_core_apply_relo(prog, rec, i, obj->btf, cand_cache);
 			if (err) {
 				pr_warn("prog '%s': relo #%d: failed to relocate: %d\n",
 					prog->name, i, err);
@@ -5911,8 +5945,9 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path)

 out:
 	/* obj->btf_vmlinux is freed at the end of object load phase */
-	if (targ_btf != obj->btf_vmlinux)
-		btf__free(targ_btf);
+	btf__free(obj->btf_vmlinux_override);
+	obj->btf_vmlinux_override = NULL;
+
 	if (!IS_ERR_OR_NULL(cand_cache)) {
 		hashmap__for_each_entry(cand_cache, entry, i) {
 			bpf_core_free_cands(entry->value);

From 4f33a53d56000cfa67e2e4e8a5dac08f084a979b Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Thu, 3 Dec 2020 12:46:25 -0800
Subject: [PATCH 107/118] libbpf: Add kernel module BTF support for CO-RE relocations

Teach libbpf to search for candidate types for CO-RE relocations across
kernel module BTFs, in addition to vmlinux BTF. If at least one
candidate type is found in vmlinux BTF, kernel module BTFs are not
iterated. If vmlinux BTF has no matching candidates, then find all
kernel module BTFs and search for all matching candidates across all of
them.

The kernel's support for module BTFs is inferred from the support for
the BTF name pointer in the BPF UAPI.
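The discovery step relies on the kernel's BTF object iteration API; a
simplified sketch of the loop that load_module_btfs() below is built
around:

/* Walk all BTF objects in the kernel; the ones whose info has
 * kernel_btf set and a name other than "vmlinux" are module BTFs.
 */
__u32 id = 0;
int fd;

while (!bpf_btf_get_next_id(id, &id)) {
	fd = bpf_btf_get_fd_by_id(id);
	if (fd < 0)
		continue;	/* tolerate races with module unload */
	/* bpf_obj_get_info_by_fd() then yields name and kernel_btf */
	close(fd);
}
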
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201203204634.1325171-6-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 179 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 169 insertions(+), 10 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 2e4fa3ce6b94..ca20e493726d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -176,6 +176,8 @@ enum kern_feature_id { FEAT_PROBE_READ_KERN, /* BPF_PROG_BIND_MAP is supported */ FEAT_PROG_BIND_MAP, + /* Kernel support for module BTFs */ + FEAT_MODULE_BTF, __FEAT_CNT, }; @@ -402,6 +404,12 @@ struct extern_desc { static LIST_HEAD(bpf_objects_list); +struct module_btf { + struct btf *btf; + char *name; + __u32 id; +}; + struct bpf_object { char name[BPF_OBJ_NAME_LEN]; char license[64]; @@ -470,6 +478,11 @@ struct bpf_object { struct btf *btf_vmlinux; /* vmlinux BTF override for CO-RE relocations */ struct btf *btf_vmlinux_override; + /* Lazily initialized kernel module BTFs */ + struct module_btf *btf_modules; + bool btf_modules_loaded; + size_t btf_module_cnt; + size_t btf_module_cap; void *priv; bpf_object_clear_priv_t clear_priv; @@ -3960,6 +3973,35 @@ static int probe_prog_bind_map(void) return ret >= 0; } +static int probe_module_btf(void) +{ + static const char strs[] = "\0int"; + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), + }; + struct bpf_btf_info info; + __u32 len = sizeof(info); + char name[16]; + int fd, err; + + fd = libbpf__load_raw_btf((char *)types, sizeof(types), strs, sizeof(strs)); + if (fd < 0) + return 0; /* BTF not supported at all */ + + memset(&info, 0, sizeof(info)); + info.name = ptr_to_u64(name); + info.name_len = sizeof(name); + + /* check that BPF_OBJ_GET_INFO_BY_FD supports specifying name pointer; + * kernel's module BTF support coincides with support for + * name/name_len fields in struct bpf_btf_info. 
+ */ + err = bpf_obj_get_info_by_fd(fd, &info, &len); + close(fd); + return !err; +} + enum kern_feature_result { FEAT_UNKNOWN = 0, FEAT_SUPPORTED = 1, @@ -4003,7 +4045,10 @@ static struct kern_feature_desc { }, [FEAT_PROG_BIND_MAP] = { "BPF_PROG_BIND_MAP support", probe_prog_bind_map, - } + }, + [FEAT_MODULE_BTF] = { + "module BTF support", probe_module_btf, + }, }; static bool kernel_supports(enum kern_feature_id feat_id) @@ -4674,13 +4719,96 @@ static int bpf_core_add_cands(struct core_cand *local_cand, return 0; } +static int load_module_btfs(struct bpf_object *obj) +{ + struct bpf_btf_info info; + struct module_btf *mod_btf; + struct btf *btf; + char name[64]; + __u32 id = 0, len; + int err, fd; + + if (obj->btf_modules_loaded) + return 0; + + /* don't do this again, even if we find no module BTFs */ + obj->btf_modules_loaded = true; + + /* kernel too old to support module BTFs */ + if (!kernel_supports(FEAT_MODULE_BTF)) + return 0; + + while (true) { + err = bpf_btf_get_next_id(id, &id); + if (err && errno == ENOENT) + return 0; + if (err) { + err = -errno; + pr_warn("failed to iterate BTF objects: %d\n", err); + return err; + } + + fd = bpf_btf_get_fd_by_id(id); + if (fd < 0) { + if (errno == ENOENT) + continue; /* expected race: BTF was unloaded */ + err = -errno; + pr_warn("failed to get BTF object #%d FD: %d\n", id, err); + return err; + } + + len = sizeof(info); + memset(&info, 0, sizeof(info)); + info.name = ptr_to_u64(name); + info.name_len = sizeof(name); + + err = bpf_obj_get_info_by_fd(fd, &info, &len); + if (err) { + err = -errno; + pr_warn("failed to get BTF object #%d info: %d\n", id, err); + close(fd); + return err; + } + + /* ignore non-module BTFs */ + if (!info.kernel_btf || strcmp(name, "vmlinux") == 0) { + close(fd); + continue; + } + + btf = btf_get_from_fd(fd, obj->btf_vmlinux); + close(fd); + if (IS_ERR(btf)) { + pr_warn("failed to load module [%s]'s BTF object #%d: %ld\n", + name, id, PTR_ERR(btf)); + return PTR_ERR(btf); + } + + err = btf_ensure_mem((void **)&obj->btf_modules, &obj->btf_module_cap, + sizeof(*obj->btf_modules), obj->btf_module_cnt + 1); + if (err) + return err; + + mod_btf = &obj->btf_modules[obj->btf_module_cnt++]; + + mod_btf->btf = btf; + mod_btf->id = id; + mod_btf->name = strdup(name); + if (!mod_btf->name) + return -ENOMEM; + } + + return 0; +} + static struct core_cand_list * bpf_core_find_cands(struct bpf_object *obj, const struct btf *local_btf, __u32 local_type_id) { struct core_cand local_cand = {}; struct core_cand_list *cands; + const struct btf *main_btf; size_t local_essent_len; - int err; + int err, i; local_cand.btf = local_btf; local_cand.t = btf__type_by_id(local_btf, local_type_id); @@ -4697,15 +4825,38 @@ bpf_core_find_cands(struct bpf_object *obj, const struct btf *local_btf, __u32 l return ERR_PTR(-ENOMEM); /* Attempt to find target candidates in vmlinux BTF first */ - err = bpf_core_add_cands(&local_cand, local_essent_len, - obj->btf_vmlinux_override ?: obj->btf_vmlinux, - "vmlinux", 1, cands); - if (err) { - bpf_core_free_cands(cands); - return ERR_PTR(err); + main_btf = obj->btf_vmlinux_override ?: obj->btf_vmlinux; + err = bpf_core_add_cands(&local_cand, local_essent_len, main_btf, "vmlinux", 1, cands); + if (err) + goto err_out; + + /* if vmlinux BTF has any candidate, don't got for module BTFs */ + if (cands->len) + return cands; + + /* if vmlinux BTF was overridden, don't attempt to load module BTFs */ + if (obj->btf_vmlinux_override) + return cands; + + /* now look through module BTFs, trying to still find 
candidates */ + err = load_module_btfs(obj); + if (err) + goto err_out; + + for (i = 0; i < obj->btf_module_cnt; i++) { + err = bpf_core_add_cands(&local_cand, local_essent_len, + obj->btf_modules[i].btf, + obj->btf_modules[i].name, + btf__get_nr_types(obj->btf_vmlinux) + 1, + cands); + if (err) + goto err_out; } return cands; +err_out: + bpf_core_free_cands(cands); + return ERR_PTR(err); } /* Check two types for compatibility for the purpose of field access @@ -5756,7 +5907,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, if (!hashmap__find(cand_cache, type_key, (void **)&cands)) { cands = bpf_core_find_cands(prog->obj, local_btf, local_id); if (IS_ERR(cands)) { - pr_warn("prog '%s': relo #%d: target candidate search failed for [%d] %s %s: %ld", + pr_warn("prog '%s': relo #%d: target candidate search failed for [%d] %s %s: %ld\n", prog->name, relo_idx, local_id, btf_kind_str(local_type), local_name, PTR_ERR(cands)); return PTR_ERR(cands); @@ -5944,7 +6095,7 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path) } out: - /* obj->btf_vmlinux is freed at the end of object load phase */ + /* obj->btf_vmlinux and module BTFs are freed after object load */ btf__free(obj->btf_vmlinux_override); obj->btf_vmlinux_override = NULL; @@ -7316,6 +7467,14 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr) err = err ? : bpf_object__relocate(obj, attr->target_btf_path); err = err ? : bpf_object__load_progs(obj, attr->log_level); + /* clean up module BTFs */ + for (i = 0; i < obj->btf_module_cnt; i++) { + btf__free(obj->btf_modules[i].btf); + free(obj->btf_modules[i].name); + } + free(obj->btf_modules); + + /* clean up vmlinux BTF */ btf__free(obj->btf_vmlinux); obj->btf_vmlinux = NULL; From 9f7fa225894c7fcb014f3699a402fcc4d896cb1c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Dec 2020 12:46:26 -0800 Subject: [PATCH 108/118] selftests/bpf: Add bpf_testmod kernel module for testing Add bpf_testmod module, which is conceptually out-of-tree module and provides ways for selftests/bpf to test various kernel module-related functionality: raw tracepoint, fentry/fexit/fmod_ret, etc. This module will be auto-loaded by test_progs test runner and expected by some of selftests to be present and loaded. Pahole currently isn't able to generate BTF for static functions in kernel modules, so make sure traced function is global. 
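As a sketch of what the module enables (the handler is illustrative and
not part of this patch), a selftest BPF program can attach to the
module's raw tracepoint by name:

/* Illustrative handler for the tracepoint exported by bpf_testmod;
 * the context struct comes from bpf_testmod.h added below.
 */
__u64 last_len = 0;

SEC("raw_tp/bpf_testmod_test_read")
int BPF_PROG(handle_testmod_read, struct task_struct *task,
	     struct bpf_testmod_test_read_ctx *ctx)
{
	last_len = ctx->len;
	return 0;
}
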
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201203204634.1325171-7-andrii@kernel.org --- tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/Makefile | 12 +++- .../selftests/bpf/bpf_testmod/.gitignore | 6 ++ .../selftests/bpf/bpf_testmod/Makefile | 20 +++++++ .../bpf/bpf_testmod/bpf_testmod-events.h | 36 +++++++++++ .../selftests/bpf/bpf_testmod/bpf_testmod.c | 52 ++++++++++++++++ .../selftests/bpf/bpf_testmod/bpf_testmod.h | 14 +++++ tools/testing/selftests/bpf/test_progs.c | 59 +++++++++++++++++++ tools/testing/selftests/bpf/test_progs.h | 1 + 9 files changed, 198 insertions(+), 3 deletions(-) create mode 100644 tools/testing/selftests/bpf/bpf_testmod/.gitignore create mode 100644 tools/testing/selftests/bpf/bpf_testmod/Makefile create mode 100644 tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h create mode 100644 tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c create mode 100644 tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 395ae040ce1f..752d8edddc66 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -35,3 +35,4 @@ test_cpp /tools /runqslower /bench +*.ko diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 371b022d932c..ac25ba5d0d6c 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -80,7 +80,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ # Compile but not part of 'make run_tests' TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ - test_lirc_mode2_user xdping test_cpp runqslower bench + test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko TEST_CUSTOM_PROGS = urandom_read @@ -104,6 +104,7 @@ OVERRIDE_TARGETS := 1 override define CLEAN $(call msg,CLEAN) $(Q)$(RM) -r $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(EXTRA_CLEAN) + $(Q)$(MAKE) -C bpf_testmod clean endef include ../lib.mk @@ -136,6 +137,11 @@ $(OUTPUT)/urandom_read: urandom_read.c $(call msg,BINARY,,$@) $(Q)$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS) -Wl,--build-id=sha1 +$(OUTPUT)/bpf_testmod.ko: $(VMLINUX_BTF) $(wildcard bpf_testmod/Makefile bpf_testmod/*.[ch]) + $(call msg,MOD,,$@) + $(Q)$(MAKE) $(submake_extras) -C bpf_testmod + $(Q)cp bpf_testmod/bpf_testmod.ko $@ + $(OUTPUT)/test_stub.o: test_stub.c $(BPFOBJ) $(call msg,CC,,$@) $(Q)$(CC) -c $(CFLAGS) -o $@ $< @@ -388,7 +394,7 @@ TRUNNER_BPF_PROGS_DIR := progs TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ network_helpers.c testing_helpers.c \ btf_helpers.c flow_dissector_load.h -TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ +TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \ ima_setup.sh \ $(wildcard progs/btf_dump_test_case_*.c) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE @@ -460,4 +466,4 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o $(OUTPUT)/testing_helpers.o \ EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ feature \ - $(addprefix $(OUTPUT)/,*.o *.skel.h no_alu32 bpf_gcc) + $(addprefix $(OUTPUT)/,*.o *.skel.h no_alu32 bpf_gcc bpf_testmod.ko) diff --git a/tools/testing/selftests/bpf/bpf_testmod/.gitignore b/tools/testing/selftests/bpf/bpf_testmod/.gitignore new file mode 100644 index 
000000000000..ded513777281 --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_testmod/.gitignore @@ -0,0 +1,6 @@ +*.mod +*.mod.c +*.o +.ko +/Module.symvers +/modules.order diff --git a/tools/testing/selftests/bpf/bpf_testmod/Makefile b/tools/testing/selftests/bpf/bpf_testmod/Makefile new file mode 100644 index 000000000000..15cb36c4483a --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_testmod/Makefile @@ -0,0 +1,20 @@ +BPF_TESTMOD_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST))))) +KDIR ?= $(abspath $(BPF_TESTMOD_DIR)/../../../../..) + +ifeq ($(V),1) +Q = +else +Q = @ +endif + +MODULES = bpf_testmod.ko + +obj-m += bpf_testmod.o +CFLAGS_bpf_testmod.o = -I$(src) + +all: + +$(Q)make -C $(KDIR) M=$(BPF_TESTMOD_DIR) modules + +clean: + +$(Q)make -C $(KDIR) M=$(BPF_TESTMOD_DIR) clean + diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h new file mode 100644 index 000000000000..b83ea448bc79 --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2020 Facebook */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bpf_testmod + +#if !defined(_BPF_TESTMOD_EVENTS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _BPF_TESTMOD_EVENTS_H + +#include +#include "bpf_testmod.h" + +TRACE_EVENT(bpf_testmod_test_read, + TP_PROTO(struct task_struct *task, struct bpf_testmod_test_read_ctx *ctx), + TP_ARGS(task, ctx), + TP_STRUCT__entry( + __field(pid_t, pid) + __array(char, comm, TASK_COMM_LEN) + __field(loff_t, off) + __field(size_t, len) + ), + TP_fast_assign( + __entry->pid = task->pid; + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->off = ctx->off; + __entry->len = ctx->len; + ), + TP_printk("pid=%d comm=%s off=%llu len=%zu", + __entry->pid, __entry->comm, __entry->off, __entry->len) +); + +#endif /* _BPF_TESTMOD_EVENTS_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_FILE bpf_testmod-events +#include diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c new file mode 100644 index 000000000000..2df19d73ca49 --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include +#include +#include +#include +#include +#include "bpf_testmod.h" + +#define CREATE_TRACE_POINTS +#include "bpf_testmod-events.h" + +noinline ssize_t +bpf_testmod_test_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) +{ + struct bpf_testmod_test_read_ctx ctx = { + .buf = buf, + .off = off, + .len = len, + }; + + trace_bpf_testmod_test_read(current, &ctx); + + return -EIO; /* always fail */ +} +EXPORT_SYMBOL(bpf_testmod_test_read); +ALLOW_ERROR_INJECTION(bpf_testmod_test_read, ERRNO); + +static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = { + .attr = { .name = "bpf_testmod", .mode = 0444, }, + .read = bpf_testmod_test_read, +}; + +static int bpf_testmod_init(void) +{ + return sysfs_create_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); +} + +static void bpf_testmod_exit(void) +{ + return sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); +} + +module_init(bpf_testmod_init); +module_exit(bpf_testmod_exit); + +MODULE_AUTHOR("Andrii Nakryiko"); +MODULE_DESCRIPTION("BPF selftests module"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h new file mode 100644 index 000000000000..b81adfedb4f6 --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2020 Facebook */ +#ifndef _BPF_TESTMOD_H +#define _BPF_TESTMOD_H + +#include + +struct bpf_testmod_test_read_ctx { + char *buf; + loff_t off; + size_t len; +}; + +#endif /* _BPF_TESTMOD_H */ diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 22943b58d752..17587754b7a7 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -360,6 +360,58 @@ err: return -1; } +static int finit_module(int fd, const char *param_values, int flags) +{ + return syscall(__NR_finit_module, fd, param_values, flags); +} + +static int delete_module(const char *name, int flags) +{ + return syscall(__NR_delete_module, name, flags); +} + +static void unload_bpf_testmod(void) +{ + if (delete_module("bpf_testmod", 0)) { + if (errno == ENOENT) { + if (env.verbosity > VERBOSE_NONE) + fprintf(stdout, "bpf_testmod.ko is already unloaded.\n"); + return; + } + fprintf(env.stderr, "Failed to unload bpf_testmod.ko from kernel: %d\n", -errno); + exit(1); + } + if (env.verbosity > VERBOSE_NONE) + fprintf(stdout, "Successfully unloaded bpf_testmod.ko.\n"); +} + +static int load_bpf_testmod(void) +{ + int fd; + + /* ensure previous instance of the module is unloaded */ + unload_bpf_testmod(); + + if (env.verbosity > VERBOSE_NONE) + fprintf(stdout, "Loading bpf_testmod.ko...\n"); + + fd = open("bpf_testmod.ko", O_RDONLY); + if (fd < 0) { + fprintf(env.stderr, "Can't find bpf_testmod.ko kernel module: %d\n", -errno); + return -ENOENT; + } + if (finit_module(fd, "", 0)) { + fprintf(env.stderr, "Failed to load bpf_testmod.ko into the kernel: %d\n", -errno); + close(fd); + return -EINVAL; + } + close(fd); 
+ + if (env.verbosity > VERBOSE_NONE) + fprintf(stdout, "Successfully loaded bpf_testmod.ko.\n"); + return 0; +} + /* extern declarations for test funcs */ #define DEFINE_TEST(name) extern void test_##name(void); #include @@ -678,6 +730,11 @@ int main(int argc, char **argv) save_netns(); stdio_hijack(); + env.has_testmod = true; + if (load_bpf_testmod()) { + fprintf(env.stderr, "WARNING! Selftests relying on bpf_testmod.ko will be skipped.\n"); + env.has_testmod = false; + } for (i = 0; i < prog_test_cnt; i++) { struct prog_test_def *test = &prog_test_defs[i]; @@ -722,6 +779,8 @@ int main(int argc, char **argv) if (test->need_cgroup_cleanup) cleanup_cgroup_environment(); } + if (env.has_testmod) + unload_bpf_testmod(); stdio_restore(); if (env.get_test_cnt) { diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index d6b14853f3bc..115953243f62 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -66,6 +66,7 @@ struct test_env { enum verbosity verbosity; bool jit_enabled; + bool has_testmod; bool get_test_cnt; bool list_test_names; From 5ed31472b9ad6373a0a24bc21186b5eac999213d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Dec 2020 12:46:27 -0800 Subject: [PATCH 109/118] selftests/bpf: Add support for marking sub-tests as skipped Previously, skipped sub-tests would be counted as passing, with ":OK" appended in the log. Change that so they are accounted for as ":SKIP". Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201203204634.1325171-8-andrii@kernel.org --- tools/testing/selftests/bpf/test_progs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 17587754b7a7..5ef081bdae4e 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -149,15 +149,15 @@ void test__end_subtest() if (sub_error_cnt) env.fail_cnt++; - else + else if (test->skip_cnt == 0) env.sub_succ_cnt++; skip_account(); dump_test_log(test, sub_error_cnt); fprintf(env.stdout, "#%d/%d %s:%s\n", - test->test_num, test->subtest_num, - test->subtest_name, sub_error_cnt ? "FAIL" : "OK"); + test->test_num, test->subtest_num, test->subtest_name, + sub_error_cnt ? "FAIL" : (test->skip_cnt ? "SKIP" : "OK")); free(test->subtest_name); test->subtest_name = NULL; From 6bcd39d366b64318562785d5b47c2837e3a53ae5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Dec 2020 12:46:28 -0800 Subject: [PATCH 110/118] selftests/bpf: Add CO-RE relocs selftest relying on kernel module BTF Add a self-test validating that libbpf is able to perform CO-RE relocations against a type defined in kernel module BTF. If bpf_testmod.ko is not supported by the kernel (e.g., due to a version mismatch), skip the tests instead of failing.
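A hedged user-space sketch of how these tests fire the module tracepoint (it mirrors trigger_module_test_read() below; the helper name here is made up): reading from the sysfs file exported by bpf_testmod calls bpf_testmod_test_read(), which emits the raw tracepoint, while the read itself intentionally fails:

	#include <fcntl.h>
	#include <unistd.h>

	/* Trigger one bpf_testmod_test_read tracepoint invocation,
	 * requesting 'len' bytes. Returns 0 if the file could be opened.
	 */
	static int trigger_testmod_read(size_t len)
	{
		int fd = open("/sys/kernel/bpf_testmod", O_RDONLY);

		if (fd < 0)
			return -1;
		/* the module's .read handler always returns -EIO,
		 * but the tracepoint fires before that */
		read(fd, NULL, len);
		close(fd);
		return 0;
	}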
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201203204634.1325171-9-andrii@kernel.org --- .../selftests/bpf/prog_tests/core_reloc.c | 79 ++++++++++++++++--- .../selftests/bpf/progs/core_reloc_types.h | 17 ++++ .../bpf/progs/test_core_reloc_module.c | 66 ++++++++++++++++ 3 files changed, 151 insertions(+), 11 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/test_core_reloc_module.c diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 30e40ff4b0d8..bb980848cd77 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include "progs/core_reloc_types.h" +#include "bpf_testmod/bpf_testmod.h" #include #include #include @@ -9,6 +10,30 @@ static int duration = 0; #define STRUCT_TO_CHAR_PTR(struct_name) (const char *)&(struct struct_name) +#define MODULES_CASE(name, sec_name, tp_name) { \ + .case_name = name, \ + .bpf_obj_file = "test_core_reloc_module.o", \ + .btf_src_file = NULL, /* find in kernel module BTFs */ \ + .input = "", \ + .input_len = 0, \ + .output = STRUCT_TO_CHAR_PTR(core_reloc_module_output) { \ + .read_ctx_sz = sizeof(struct bpf_testmod_test_read_ctx),\ + .read_ctx_exists = true, \ + .buf_exists = true, \ + .len_exists = true, \ + .off_exists = true, \ + .len = 123, \ + .off = 0, \ + .comm = "test_progs", \ + .comm_len = sizeof("test_progs"), \ + }, \ + .output_len = sizeof(struct core_reloc_module_output), \ + .prog_sec_name = sec_name, \ + .raw_tp_name = tp_name, \ + .trigger = trigger_module_test_read, \ + .needs_testmod = true, \ +} + #define FLAVORS_DATA(struct_name) STRUCT_TO_CHAR_PTR(struct_name) { \ .a = 42, \ .b = 0xc001, \ @@ -211,7 +236,7 @@ static int duration = 0; .output = STRUCT_TO_CHAR_PTR(core_reloc_bitfields_output) \ __VA_ARGS__, \ .output_len = sizeof(struct core_reloc_bitfields_output), \ - .direct_raw_tp = true, \ + .prog_sec_name = "tp_btf/sys_enter", \ } @@ -222,7 +247,7 @@ static int duration = 0; }, { \ BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_direct.o", \ "direct:", name), \ - .direct_raw_tp = true, \ + .prog_sec_name = "tp_btf/sys_enter", \ .fails = true, \ } @@ -309,6 +334,7 @@ static int duration = 0; struct core_reloc_test_case; typedef int (*setup_test_fn)(struct core_reloc_test_case *test); +typedef int (*trigger_test_fn)(const struct core_reloc_test_case *test); struct core_reloc_test_case { const char *case_name; @@ -319,9 +345,12 @@ struct core_reloc_test_case { const char *output; int output_len; bool fails; + bool needs_testmod; bool relaxed_core_relocs; - bool direct_raw_tp; + const char *prog_sec_name; + const char *raw_tp_name; setup_test_fn setup; + trigger_test_fn trigger; }; static int find_btf_type(const struct btf *btf, const char *name, __u32 kind) @@ -451,6 +480,23 @@ static int setup_type_id_case_failure(struct core_reloc_test_case *test) return 0; } +static int trigger_module_test_read(const struct core_reloc_test_case *test) +{ + struct core_reloc_module_output *exp = (void *)test->output; + int fd, err; + + fd = open("/sys/kernel/bpf_testmod", O_RDONLY); + err = -errno; + if (CHECK(fd < 0, "testmod_file_open", "failed: %d\n", err)) + return err; + + read(fd, NULL, exp->len); /* request expected number of bytes */ + close(fd); + + return 0; +} + + static struct core_reloc_test_case test_cases[] = { /* validate we can find kernel image and use its BTF for 
relocs */ { @@ -467,6 +513,9 @@ static struct core_reloc_test_case test_cases[] = { .output_len = sizeof(struct core_reloc_kernel_output), }, + /* validate we can find kernel module BTF types for relocs/attach */ + MODULES_CASE("module", "raw_tp/bpf_testmod_test_read", "bpf_testmod_test_read"), + /* validate BPF program can use multiple flavors to match against * single target BTF type */ @@ -779,6 +828,11 @@ void test_core_reloc(void) if (!test__start_subtest(test_case->case_name)) continue; + if (test_case->needs_testmod && !env.has_testmod) { + test__skip(); + continue; + } + if (test_case->setup) { err = test_case->setup(test_case); if (CHECK(err, "test_setup", "test #%d setup failed: %d\n", i, err)) @@ -790,13 +844,11 @@ void test_core_reloc(void) test_case->bpf_obj_file, PTR_ERR(obj))) continue; - /* for typed raw tracepoints, NULL should be specified */ - if (test_case->direct_raw_tp) { - probe_name = "tp_btf/sys_enter"; - tp_name = NULL; - } else { - probe_name = "raw_tracepoint/sys_enter"; - tp_name = "sys_enter"; + probe_name = "raw_tracepoint/sys_enter"; + tp_name = "sys_enter"; + if (test_case->prog_sec_name) { + probe_name = test_case->prog_sec_name; + tp_name = test_case->raw_tp_name; /* NULL for tp_btf */ } prog = bpf_object__find_program_by_title(obj, probe_name); @@ -837,7 +889,12 @@ void test_core_reloc(void) goto cleanup; /* trigger test run */ - usleep(1); + if (test_case->trigger) { + if (!ASSERT_OK(test_case->trigger(test_case), "test_trigger")) + goto cleanup; + } else { + usleep(1); + } if (data->skip) { test__skip(); diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index e6e616cb7bc9..9a2850850121 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -15,6 +15,23 @@ struct core_reloc_kernel_output { int comm_len; }; +/* + * MODULE + */ + +struct core_reloc_module_output { + long long len; + long long off; + int read_ctx_sz; + bool read_ctx_exists; + bool buf_exists; + bool len_exists; + bool off_exists; + /* we have test_progs[-flavor], so cut flavor part */ + char comm[sizeof("test_progs")]; + int comm_len; +}; + /* * FLAVORS */ diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_module.c b/tools/testing/selftests/bpf/progs/test_core_reloc_module.c new file mode 100644 index 000000000000..d1840c1a9d36 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_module.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include "vmlinux.h" +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct bpf_testmod_test_read_ctx { + /* field order is mixed up */ + size_t len; + char *buf; + loff_t off; +} __attribute__((preserve_access_index)); + +struct { + char in[256]; + char out[256]; + bool skip; + uint64_t my_pid_tgid; +} data = {}; + +struct core_reloc_module_output { + long long len; + long long off; + int read_ctx_sz; + bool read_ctx_exists; + bool buf_exists; + bool len_exists; + bool off_exists; + /* we have test_progs[-flavor], so cut flavor part */ + char comm[sizeof("test_progs")]; + int comm_len; +}; + +SEC("raw_tp/bpf_testmod_test_read") +int BPF_PROG(test_core_module, + struct task_struct *task, + struct bpf_testmod_test_read_ctx *read_ctx) +{ + struct core_reloc_module_output *out = (void *)&data.out; + __u64 pid_tgid = bpf_get_current_pid_tgid(); + __u32 real_tgid = (__u32)(pid_tgid >> 32); + __u32 real_pid = 
(__u32)pid_tgid; + + if (data.my_pid_tgid != pid_tgid) + return 0; + + if (BPF_CORE_READ(task, pid) != real_pid || BPF_CORE_READ(task, tgid) != real_tgid) + return 0; + + out->len = BPF_CORE_READ(read_ctx, len); + out->off = BPF_CORE_READ(read_ctx, off); + + out->read_ctx_sz = bpf_core_type_size(struct bpf_testmod_test_read_ctx); + out->read_ctx_exists = bpf_core_type_exists(struct bpf_testmod_test_read_ctx); + out->buf_exists = bpf_core_field_exists(read_ctx->buf); + out->off_exists = bpf_core_field_exists(read_ctx->off); + out->len_exists = bpf_core_field_exists(read_ctx->len); + + out->comm_len = BPF_CORE_READ_STR_INTO(&out->comm, task, comm); + + return 0; +} From 22dc4a0f5ed11b6dc8fd73a0892fa0ea1a4c3cdf Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Dec 2020 12:46:29 -0800 Subject: [PATCH 111/118] bpf: Remove hard-coded btf_vmlinux assumption from BPF verifier Remove the pervasive assumption throughout the BPF verifier that only vmlinux BTF is used. Instead, wherever BTF type IDs are involved, also track the instance of struct btf that goes along with the type ID. This allows gradually adding support for kernel module BTFs and for using/tracking module types across BPF helper calls and registers. This patch also renames the btf_id() function to btf_obj_id() to minimize the naming clash with btf_id denoting a BTF *type* ID rather than a BTF *object*'s ID. Also, although btf_vmlinux can't get destroyed and thus doesn't need refcounting, module BTFs do, so apply BTF refcounting universally when a BPF program is using BTF-powered attachment (tp_btf, fentry/fexit, etc.). This makes for simpler cleanup code. Now that a BTF type ID alone is not enough to uniquely identify a BTF type, extend the BPF trampoline key to include the BTF object ID. To differentiate it from a target program's BPF ID, set the 31st bit of the type ID. BTF type IDs (at least currently) are not allowed to use the full 32 bits, so there is no danger of confusing that bit with a valid BTF type ID.
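To make the key layout concrete, here is a small standalone sketch of the computation described above (it mirrors the bpf_trampoline_compute_key() change in this patch; the helper name and standalone form are just for illustration):

	#include <stdint.h>

	/* 64-bit trampoline key: the upper half carries either the target
	 * BPF program's ID or the BTF object's ID; in the latter case, bit 31
	 * of the lower (type ID) half is set so the two key spaces can't
	 * collide, since valid BTF type IDs never use that bit.
	 */
	static uint64_t trampoline_key(uint32_t tgt_prog_id, uint32_t btf_obj_id,
				       uint32_t btf_type_id)
	{
		if (tgt_prog_id)
			return ((uint64_t)tgt_prog_id << 32) | btf_type_id;
		return ((uint64_t)btf_obj_id << 32) | 0x80000000u | btf_type_id;
	}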
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201203204634.1325171-10-andrii@kernel.org --- include/linux/bpf.h | 13 ++++-- include/linux/bpf_verifier.h | 28 +++++++++---- include/linux/btf.h | 5 ++- kernel/bpf/btf.c | 65 ++++++++++++++++++++---------- kernel/bpf/syscall.c | 24 +++++++++-- kernel/bpf/verifier.c | 77 ++++++++++++++++++++++-------------- net/ipv4/bpf_tcp_ca.c | 3 +- 7 files changed, 148 insertions(+), 67 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a9de5711b23f..d05e75ed8c1b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -421,7 +421,10 @@ struct bpf_insn_access_aux { enum bpf_reg_type reg_type; union { int ctx_field_size; - u32 btf_id; + struct { + struct btf *btf; + u32 btf_id; + }; }; struct bpf_verifier_log *log; /* for verbose logs */ }; @@ -458,6 +461,7 @@ struct bpf_verifier_ops { struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); int (*btf_struct_access)(struct bpf_verifier_log *log, + const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); @@ -771,6 +775,7 @@ struct bpf_prog_aux { u32 ctx_arg_info_size; u32 max_rdonly_access; u32 max_rdwr_access; + struct btf *attach_btf; const struct bpf_ctx_arg_aux *ctx_arg_info; struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */ struct bpf_prog *dst_prog; @@ -1005,7 +1010,6 @@ struct bpf_event_entry { bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); int bpf_prog_calc_tag(struct bpf_prog *fp); -const char *kernel_type_name(u32 btf_type_id); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); @@ -1450,12 +1454,13 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); -int btf_struct_access(struct bpf_verifier_log *log, +int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); bool btf_struct_ids_match(struct bpf_verifier_log *log, - int off, u32 id, u32 need_type_id); + const struct btf *btf, u32 id, int off, + const struct btf *need_btf, u32 need_type_id); int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf *btf, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 306869d4743b..e941fe1484e5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -5,6 +5,7 @@ #define _LINUX_BPF_VERIFIER_H 1 #include /* for enum bpf_reg_type */ +#include /* for struct btf and btf_id() */ #include /* for MAX_BPF_STACK */ #include @@ -43,6 +44,8 @@ enum bpf_reg_liveness { struct bpf_reg_state { /* Ordering of fields matters. See states_equal() */ enum bpf_reg_type type; + /* Fixed part of pointer offset, pointer types only */ + s32 off; union { /* valid when type == PTR_TO_PACKET */ int range; @@ -52,15 +55,20 @@ struct bpf_reg_state { */ struct bpf_map *map_ptr; - u32 btf_id; /* for PTR_TO_BTF_ID */ + /* for PTR_TO_BTF_ID */ + struct { + struct btf *btf; + u32 btf_id; + }; u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */ /* Max size from any of the above. 
*/ - unsigned long raw; + struct { + unsigned long raw1; + unsigned long raw2; + } raw; }; - /* Fixed part of pointer offset, pointer types only */ - s32 off; /* For PTR_TO_PACKET, used to find other pointers with the same variable * offset, so they can share range knowledge. * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we @@ -311,7 +319,10 @@ struct bpf_insn_aux_data { struct { enum bpf_reg_type reg_type; /* type of pseudo_btf_id */ union { - u32 btf_id; /* btf_id for struct typed var */ + struct { + struct btf *btf; + u32 btf_id; /* btf_id for struct typed var */ + }; u32 mem_size; /* mem_size for non-struct typed var */ }; } btf_var; @@ -459,9 +470,12 @@ int check_ctx_reg(struct bpf_verifier_env *env, /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, - u32 btf_id) + struct btf *btf, u32 btf_id) { - return tgt_prog ? (((u64)tgt_prog->aux->id) << 32 | btf_id) : btf_id; + if (tgt_prog) + return ((u64)tgt_prog->aux->id << 32) | btf_id; + else + return ((u64)btf_obj_id(btf) << 32) | 0x80000000 | btf_id; } int bpf_check_attach_target(struct bpf_verifier_log *log, diff --git a/include/linux/btf.h b/include/linux/btf.h index 2bf641829664..fb608e4de076 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -18,6 +18,7 @@ struct btf_show; extern const struct file_operations btf_fops; +void btf_get(struct btf *btf); void btf_put(struct btf *btf); int btf_new_fd(const union bpf_attr *attr); struct btf *btf_get_by_fd(int fd); @@ -88,7 +89,7 @@ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, char *buf, int len, u64 flags); int btf_get_fd_by_id(u32 id); -u32 btf_id(const struct btf *btf); +u32 btf_obj_id(const struct btf *btf); bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size); @@ -206,6 +207,8 @@ static inline const struct btf_var_secinfo *btf_type_var_secinfo( } #ifdef CONFIG_BPF_SYSCALL +struct bpf_prog; + const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); struct btf *btf_parse_vmlinux(void); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6b2d508b33d4..7a19bf5bfe97 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1524,6 +1524,11 @@ static void btf_free_rcu(struct rcu_head *rcu) btf_free(btf); } +void btf_get(struct btf *btf) +{ + refcount_inc(&btf->refcnt); +} + void btf_put(struct btf *btf) { if (btf && refcount_dec_and_test(&btf->refcnt)) { @@ -4555,11 +4560,10 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) { struct bpf_prog *tgt_prog = prog->aux->dst_prog; - if (tgt_prog) { + if (tgt_prog) return tgt_prog->aux->btf; - } else { - return btf_vmlinux; - } + else + return prog->aux->attach_btf; } static bool is_string_ptr(struct btf *btf, const struct btf_type *t) @@ -4700,6 +4704,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (ctx_arg_info->offset == off) { info->reg_type = ctx_arg_info->reg_type; + info->btf = btf_vmlinux; info->btf_id = ctx_arg_info->btf_id; return true; } @@ -4716,6 +4721,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, ret = btf_translate_to_vmlinux(log, btf, t, tgt_type, arg); if (ret > 0) { + info->btf = btf_vmlinux; info->btf_id = ret; return true; } else { @@ -4723,6 +4729,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, 
} } + info->btf = btf; info->btf_id = t->type; t = btf_type_by_id(btf, t->type); /* skip modifiers */ @@ -4749,7 +4756,7 @@ enum bpf_struct_walk_result { WALK_STRUCT, }; -static int btf_struct_walk(struct bpf_verifier_log *log, +static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, u32 *next_btf_id) { @@ -4760,7 +4767,7 @@ static int btf_struct_walk(struct bpf_verifier_log *log, u32 vlen, elem_id, mid; again: - tname = __btf_name_by_offset(btf_vmlinux, t->name_off); + tname = __btf_name_by_offset(btf, t->name_off); if (!btf_type_is_struct(t)) { bpf_log(log, "Type '%s' is not a struct\n", tname); return -EINVAL; @@ -4777,7 +4784,7 @@ again: goto error; member = btf_type_member(t) + vlen - 1; - mtype = btf_type_skip_modifiers(btf_vmlinux, member->type, + mtype = btf_type_skip_modifiers(btf, member->type, NULL); if (!btf_type_is_array(mtype)) goto error; @@ -4793,7 +4800,7 @@ again: /* Only allow structure for now, can be relaxed for * other types later. */ - t = btf_type_skip_modifiers(btf_vmlinux, array_elem->type, + t = btf_type_skip_modifiers(btf, array_elem->type, NULL); if (!btf_type_is_struct(t)) goto error; @@ -4851,10 +4858,10 @@ error: /* type of the field */ mid = member->type; - mtype = btf_type_by_id(btf_vmlinux, member->type); - mname = __btf_name_by_offset(btf_vmlinux, member->name_off); + mtype = btf_type_by_id(btf, member->type); + mname = __btf_name_by_offset(btf, member->name_off); - mtype = __btf_resolve_size(btf_vmlinux, mtype, &msize, + mtype = __btf_resolve_size(btf, mtype, &msize, &elem_type, &elem_id, &total_nelems, &mid); if (IS_ERR(mtype)) { @@ -4949,7 +4956,7 @@ error: mname, moff, tname, off, size); return -EACCES; } - stype = btf_type_skip_modifiers(btf_vmlinux, mtype->type, &id); + stype = btf_type_skip_modifiers(btf, mtype->type, &id); if (btf_type_is_struct(stype)) { *next_btf_id = id; return WALK_PTR; @@ -4975,7 +4982,7 @@ error: return -EINVAL; } -int btf_struct_access(struct bpf_verifier_log *log, +int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype __maybe_unused, u32 *next_btf_id) @@ -4984,7 +4991,7 @@ int btf_struct_access(struct bpf_verifier_log *log, u32 id; do { - err = btf_struct_walk(log, t, off, size, &id); + err = btf_struct_walk(log, btf, t, off, size, &id); switch (err) { case WALK_PTR: @@ -5000,7 +5007,7 @@ int btf_struct_access(struct bpf_verifier_log *log, * by diving in it. At this point the offset is * aligned with the new type, so set it to 0. */ - t = btf_type_by_id(btf_vmlinux, id); + t = btf_type_by_id(btf, id); off = 0; break; default: @@ -5016,21 +5023,37 @@ int btf_struct_access(struct bpf_verifier_log *log, return -EINVAL; } +/* Check that two BTF types, each specified as an BTF object + id, are exactly + * the same. Trivial ID check is not enough due to module BTFs, because we can + * end up with two different module BTFs, but IDs point to the common type in + * vmlinux BTF. + */ +static bool btf_types_are_same(const struct btf *btf1, u32 id1, + const struct btf *btf2, u32 id2) +{ + if (id1 != id2) + return false; + if (btf1 == btf2) + return true; + return btf_type_by_id(btf1, id1) == btf_type_by_id(btf2, id2); +} + bool btf_struct_ids_match(struct bpf_verifier_log *log, - int off, u32 id, u32 need_type_id) + const struct btf *btf, u32 id, int off, + const struct btf *need_btf, u32 need_type_id) { const struct btf_type *type; int err; /* Are we already done? 
*/ - if (need_type_id == id && off == 0) + if (off == 0 && btf_types_are_same(btf, id, need_btf, need_type_id)) return true; again: - type = btf_type_by_id(btf_vmlinux, id); + type = btf_type_by_id(btf, id); if (!type) return false; - err = btf_struct_walk(log, type, off, 1, &id); + err = btf_struct_walk(log, btf, type, off, 1, &id); if (err != WALK_STRUCT) return false; @@ -5039,7 +5062,7 @@ again: * continue the search with offset 0 in the new * type. */ - if (need_type_id != id) { + if (!btf_types_are_same(btf, id, need_btf, need_type_id)) { off = 0; goto again; } @@ -5710,7 +5733,7 @@ int btf_get_fd_by_id(u32 id) return fd; } -u32 btf_id(const struct btf *btf) +u32 btf_obj_id(const struct btf *btf) { return btf->id; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d16dd4945100..184204169949 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1691,6 +1691,8 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) bpf_prog_kallsyms_del_all(prog); btf_put(prog->aux->btf); bpf_prog_free_linfo(prog); + if (prog->aux->attach_btf) + btf_put(prog->aux->attach_btf); if (deferred) { if (prog->aux->sleepable) @@ -2113,6 +2115,20 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) prog->expected_attach_type = attr->expected_attach_type; prog->aux->attach_btf_id = attr->attach_btf_id; + + if (attr->attach_btf_id && !attr->attach_prog_fd) { + struct btf *btf; + + btf = bpf_get_btf_vmlinux(); + if (IS_ERR(btf)) + return PTR_ERR(btf); + if (!btf) + return -EINVAL; + + btf_get(btf); + prog->aux->attach_btf = btf; + } + if (attr->attach_prog_fd) { struct bpf_prog *dst_prog; @@ -2209,6 +2225,8 @@ free_prog_sec: free_uid(prog->aux->user); security_bpf_prog_free(prog->aux); free_prog: + if (prog->aux->attach_btf) + btf_put(prog->aux->attach_btf); bpf_prog_free(prog); return err; } @@ -2566,7 +2584,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, goto out_put_prog; } - key = bpf_trampoline_compute_key(tgt_prog, btf_id); + key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); } link = kzalloc(sizeof(*link), GFP_USER); @@ -3543,7 +3561,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, } if (prog->aux->btf) - info.btf_id = btf_id(prog->aux->btf); + info.btf_id = btf_obj_id(prog->aux->btf); ulen = info.nr_func_info; info.nr_func_info = prog->aux->func_info_cnt; @@ -3646,7 +3664,7 @@ static int bpf_map_get_info_by_fd(struct file *file, memcpy(info.name, map->name, sizeof(map->name)); if (map->btf) { - info.btf_id = btf_id(map->btf); + info.btf_id = btf_obj_id(map->btf); info.btf_key_type_id = map->btf_key_type_id; info.btf_value_type_id = map->btf_value_type_id; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e333ce43f281..2f3950839b85 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -238,7 +238,9 @@ struct bpf_call_arg_meta { u64 msize_max_value; int ref_obj_id; int func_id; + struct btf *btf; u32 btf_id; + struct btf *ret_btf; u32 ret_btf_id; }; @@ -556,10 +558,9 @@ static struct bpf_func_state *func(struct bpf_verifier_env *env, return cur->frame[reg->frameno]; } -const char *kernel_type_name(u32 id) +static const char *kernel_type_name(const struct btf* btf, u32 id) { - return btf_name_by_offset(btf_vmlinux, - btf_type_by_id(btf_vmlinux, id)->name_off); + return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); } static void print_verifier_state(struct bpf_verifier_env *env, @@ -589,7 +590,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (t 
== PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL || t == PTR_TO_PERCPU_BTF_ID) - verbose(env, "%s", kernel_type_name(reg->btf_id)); + verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) verbose(env, ",ref_obj_id=%d", reg->ref_obj_id); @@ -1383,7 +1384,8 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, static void mark_btf_ld_reg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno, - enum bpf_reg_type reg_type, u32 btf_id) + enum bpf_reg_type reg_type, + struct btf *btf, u32 btf_id) { if (reg_type == SCALAR_VALUE) { mark_reg_unknown(env, regs, regno); @@ -1391,6 +1393,7 @@ static void mark_btf_ld_reg(struct bpf_verifier_env *env, } mark_reg_known_zero(env, regs, regno); regs[regno].type = PTR_TO_BTF_ID; + regs[regno].btf = btf; regs[regno].btf_id = btf_id; } @@ -2764,7 +2767,7 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, /* check access to 'struct bpf_context' fields. Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type, - u32 *btf_id) + struct btf **btf, u32 *btf_id) { struct bpf_insn_access_aux info = { .reg_type = *reg_type, @@ -2782,10 +2785,12 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type; - if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) + if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) { + *btf = info.btf; *btf_id = info.btf_id; - else + } else { env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; + } /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) env->prog->aux->max_ctx_offset = off + size; @@ -3297,8 +3302,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, int value_regno) { struct bpf_reg_state *reg = regs + regno; - const struct btf_type *t = btf_type_by_id(btf_vmlinux, reg->btf_id); - const char *tname = btf_name_by_offset(btf_vmlinux, t->name_off); + const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id); + const char *tname = btf_name_by_offset(reg->btf, t->name_off); u32 btf_id; int ret; @@ -3319,23 +3324,23 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } if (env->ops->btf_struct_access) { - ret = env->ops->btf_struct_access(&env->log, t, off, size, - atype, &btf_id); + ret = env->ops->btf_struct_access(&env->log, reg->btf, t, + off, size, atype, &btf_id); } else { if (atype != BPF_READ) { verbose(env, "only read is supported\n"); return -EACCES; } - ret = btf_struct_access(&env->log, t, off, size, atype, - &btf_id); + ret = btf_struct_access(&env->log, reg->btf, t, off, size, + atype, &btf_id); } if (ret < 0) return ret; if (atype == BPF_READ && value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, btf_id); + mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id); return 0; } @@ -3385,12 +3390,12 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, return -EACCES; } - ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id); + ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id); if (ret < 0) return ret; if (value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, btf_id); + mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id); return 0; } @@ -3466,6 +3471,7 @@ static int 
check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; + struct btf *btf = NULL; u32 btf_id = 0; if (t == BPF_WRITE && value_regno >= 0 && @@ -3478,7 +3484,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err < 0) return err; - err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf_id); + err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf, &btf_id); if (err) verbose_linfo(env, insn_idx, "; "); if (!err && t == BPF_READ && value_regno >= 0) { @@ -3500,8 +3506,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn */ regs[value_regno].subreg_def = DEF_NOT_SUBREG; if (reg_type == PTR_TO_BTF_ID || - reg_type == PTR_TO_BTF_ID_OR_NULL) + reg_type == PTR_TO_BTF_ID_OR_NULL) { + regs[value_regno].btf = btf; regs[value_regno].btf_id = btf_id; + } } regs[value_regno].type = reg_type; } @@ -4118,11 +4126,11 @@ found: arg_btf_id = compatible->btf_id; } - if (!btf_struct_ids_match(&env->log, reg->off, reg->btf_id, - *arg_btf_id)) { + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, + btf_vmlinux, *arg_btf_id)) { verbose(env, "R%d is of type %s but %s is expected\n", - regno, kernel_type_name(reg->btf_id), - kernel_type_name(*arg_btf_id)); + regno, kernel_type_name(reg->btf, reg->btf_id), + kernel_type_name(btf_vmlinux, *arg_btf_id)); return -EACCES; } @@ -4244,6 +4252,7 @@ skip_type_check: verbose(env, "Helper has invalid btf_id in R%d\n", regno); return -EACCES; } + meta->ret_btf = reg->btf; meta->ret_btf_id = reg->btf_id; } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { if (meta->func_id == BPF_FUNC_spin_lock) { @@ -5190,16 +5199,16 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn const struct btf_type *t; mark_reg_known_zero(env, regs, BPF_REG_0); - t = btf_type_skip_modifiers(btf_vmlinux, meta.ret_btf_id, NULL); + t = btf_type_skip_modifiers(meta.ret_btf, meta.ret_btf_id, NULL); if (!btf_type_is_struct(t)) { u32 tsize; const struct btf_type *ret; const char *tname; /* resolve the type size of ksym. */ - ret = btf_resolve_size(btf_vmlinux, t, &tsize); + ret = btf_resolve_size(meta.ret_btf, t, &tsize); if (IS_ERR(ret)) { - tname = btf_name_by_offset(btf_vmlinux, t->name_off); + tname = btf_name_by_offset(meta.ret_btf, t->name_off); verbose(env, "unable to resolve the size of type '%s': %ld\n", tname, PTR_ERR(ret)); return -EINVAL; @@ -5212,6 +5221,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? 
PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].btf = meta.ret_btf; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL || @@ -5228,6 +5238,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn fn->ret_type, func_id_name(func_id), func_id); return -EINVAL; } + /* current BPF helper definitions are only coming from + * built-in code with type IDs from vmlinux BTF + */ + regs[BPF_REG_0].btf = btf_vmlinux; regs[BPF_REG_0].btf_id = ret_btf_id; } else { verbose(env, "unknown return type %d of func %s#%d\n", @@ -5627,7 +5641,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ - dst_reg->raw = 0; + memset(&dst_reg->raw, 0, sizeof(dst_reg->raw)); } break; case BPF_SUB: @@ -5692,7 +5706,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ if (smin_val < 0) - dst_reg->raw = 0; + memset(&dst_reg->raw, 0, sizeof(dst_reg->raw)); } break; case BPF_AND: @@ -7744,6 +7758,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) break; case PTR_TO_BTF_ID: case PTR_TO_PERCPU_BTF_ID: + dst_reg->btf = aux->btf_var.btf; dst_reg->btf_id = aux->btf_var.btf_id; break; default: @@ -9739,6 +9754,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); if (percpu) { aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; + aux->btf_var.btf = btf_vmlinux; aux->btf_var.btf_id = type; } else if (!btf_type_is_struct(t)) { const struct btf_type *ret; @@ -9757,6 +9773,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, aux->btf_var.mem_size = tsize; } else { aux->btf_var.reg_type = PTR_TO_BTF_ID; + aux->btf_var.btf = btf_vmlinux; aux->btf_var.btf_id = type; } return 0; @@ -11609,7 +11626,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, bpf_log(log, "Tracing programs must provide btf_id\n"); return -EINVAL; } - btf = tgt_prog ? tgt_prog->aux->btf : btf_vmlinux; + btf = tgt_prog ? 
tgt_prog->aux->btf : prog->aux->attach_btf; if (!btf) { bpf_log(log, "FENTRY/FEXIT program can only be attached to another program annotated with BTF\n"); return -EINVAL; @@ -11885,7 +11902,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return ret; } - key = bpf_trampoline_compute_key(tgt_prog, btf_id); + key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id); tr = bpf_trampoline_get(key, &tgt_info); if (!tr) return -ENOMEM; diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 618954f82764..d520e61649c8 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -95,6 +95,7 @@ static bool bpf_tcp_ca_is_valid_access(int off, int size, } static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id) @@ -102,7 +103,7 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, size_t end; if (atype == BPF_READ) - return btf_struct_access(log, t, off, size, atype, next_btf_id); + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id); if (t != tcp_sock_type) { bpf_log(log, "only read is supported\n"); From 290248a5b7d829871b3ea3c62578613a580a1744 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Dec 2020 12:46:30 -0800 Subject: [PATCH 112/118] bpf: Allow to specify kernel module BTFs when attaching BPF programs Add the ability for user-space programs to specify non-vmlinux BTF when attaching BTF-powered BPF programs: raw_tp, fentry/fexit/fmod_ret, LSM, etc. For this, attach_prog_fd (now with the alias name attach_btf_obj_fd) should specify the FD of a module or vmlinux BTF object. For backwards compatibility, 0 denotes vmlinux BTF. Only kernel BTF (vmlinux or module) can be specified.
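A hedged sketch of how the new field could be used from raw user space (libbpf support comes later in this series). The wrapper function name is made up; the attr field names follow the uapi change below:

	#include <linux/bpf.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Load a tracing program attached to a BTF type that lives in a
	 * module BTF object. Passing 0 as module_btf_fd falls back to
	 * vmlinux BTF, preserving the old behavior.
	 */
	static int load_tracing_prog(const struct bpf_insn *insns, __u32 insn_cnt,
				     int module_btf_fd, __u32 attach_btf_id)
	{
		union bpf_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.prog_type = BPF_PROG_TYPE_TRACING;
		attr.expected_attach_type = BPF_TRACE_RAW_TP;
		attr.insns = (__u64)(unsigned long)insns;
		attr.insn_cnt = insn_cnt;
		attr.license = (__u64)(unsigned long)"GPL";
		attr.attach_btf_id = attach_btf_id;
		attr.attach_btf_obj_fd = module_btf_fd; /* aliases attach_prog_fd */

		return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
	}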
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201203204634.1325171-11-andrii@kernel.org --- include/linux/btf.h | 1 + include/uapi/linux/bpf.h | 7 ++- kernel/bpf/btf.c | 5 +++ kernel/bpf/syscall.c | 82 +++++++++++++++++++++------------- tools/include/uapi/linux/bpf.h | 7 ++- 5 files changed, 69 insertions(+), 33 deletions(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index fb608e4de076..4c200f5d242b 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -90,6 +90,7 @@ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, int btf_get_fd_by_id(u32 id); u32 btf_obj_id(const struct btf *btf); +bool btf_is_kernel(const struct btf *btf); bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c3458ec1f30a..1233f14f659f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -557,7 +557,12 @@ union bpf_attr { __aligned_u64 line_info; /* line info */ __u32 line_info_cnt; /* number of bpf_line_info records */ __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ - __u32 attach_prog_fd; /* 0 to attach to vmlinux */ + union { + /* valid prog_fd to attach to bpf prog */ + __u32 attach_prog_fd; + /* or valid module BTF object fd or 0 to attach to vmlinux */ + __u32 attach_btf_obj_fd; + }; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7a19bf5bfe97..8d6bdb4f4d61 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5738,6 +5738,11 @@ u32 btf_obj_id(const struct btf *btf) return btf->id; } +bool btf_is_kernel(const struct btf *btf) +{ + return btf->kernel_btf; +} + static int btf_id_cmp_func(const void *a, const void *b) { const int *pa = a, *pb = b; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 184204169949..0cd3cc2af9c1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1926,12 +1926,16 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) static int bpf_prog_load_check_attach(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type, - u32 btf_id, u32 prog_fd) + struct btf *attach_btf, u32 btf_id, + struct bpf_prog *dst_prog) { if (btf_id) { if (btf_id > BTF_MAX_TYPE) return -EINVAL; + if (!attach_btf && !dst_prog) + return -EINVAL; + switch (prog_type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_LSM: @@ -1943,7 +1947,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, } } - if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING && + if (attach_btf && (!btf_id || dst_prog)) + return -EINVAL; + + if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING && prog_type != BPF_PROG_TYPE_EXT) return -EINVAL; @@ -2060,7 +2067,8 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) { enum bpf_prog_type type = attr->prog_type; - struct bpf_prog *prog; + struct bpf_prog *prog, *dst_prog = NULL; + struct btf *attach_btf = NULL; int err; char license[128]; bool is_gpl; @@ -2102,44 +2110,56 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; + /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog + * or btf, we need to check which one it is + */ + if (attr->attach_prog_fd) { + dst_prog = 
bpf_prog_get(attr->attach_prog_fd); + if (IS_ERR(dst_prog)) { + dst_prog = NULL; + attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd); + if (IS_ERR(attach_btf)) + return -EINVAL; + if (!btf_is_kernel(attach_btf)) { + btf_put(attach_btf); + return -EINVAL; + } + } + } else if (attr->attach_btf_id) { + /* fall back to vmlinux BTF, if BTF type ID is specified */ + attach_btf = bpf_get_btf_vmlinux(); + if (IS_ERR(attach_btf)) + return PTR_ERR(attach_btf); + if (!attach_btf) + return -EINVAL; + btf_get(attach_btf); + } + bpf_prog_load_fixup_attach_type(attr); if (bpf_prog_load_check_attach(type, attr->expected_attach_type, - attr->attach_btf_id, - attr->attach_prog_fd)) + attach_btf, attr->attach_btf_id, + dst_prog)) { + if (dst_prog) + bpf_prog_put(dst_prog); + if (attach_btf) + btf_put(attach_btf); return -EINVAL; + } /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); - if (!prog) + if (!prog) { + if (dst_prog) + bpf_prog_put(dst_prog); + if (attach_btf) + btf_put(attach_btf); return -ENOMEM; + } prog->expected_attach_type = attr->expected_attach_type; + prog->aux->attach_btf = attach_btf; prog->aux->attach_btf_id = attr->attach_btf_id; - - if (attr->attach_btf_id && !attr->attach_prog_fd) { - struct btf *btf; - - btf = bpf_get_btf_vmlinux(); - if (IS_ERR(btf)) - return PTR_ERR(btf); - if (!btf) - return -EINVAL; - - btf_get(btf); - prog->aux->attach_btf = btf; - } - - if (attr->attach_prog_fd) { - struct bpf_prog *dst_prog; - - dst_prog = bpf_prog_get(attr->attach_prog_fd); - if (IS_ERR(dst_prog)) { - err = PTR_ERR(dst_prog); - goto free_prog; - } - prog->aux->dst_prog = dst_prog; - } - + prog->aux->dst_prog = dst_prog; prog->aux->offload_requested = !!attr->prog_ifindex; prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c3458ec1f30a..1233f14f659f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -557,7 +557,12 @@ union bpf_attr { __aligned_u64 line_info; /* line info */ __u32 line_info_cnt; /* number of bpf_line_info records */ __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ - __u32 attach_prog_fd; /* 0 to attach to vmlinux */ + union { + /* valid prog_fd to attach to bpf prog */ + __u32 attach_prog_fd; + /* or valid module BTF object fd or 0 to attach to vmlinux */ + __u32 attach_btf_obj_fd; + }; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ From 6aef10a481a3f42c8021fe410e07440c0d71a5fc Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Dec 2020 12:46:31 -0800 Subject: [PATCH 113/118] libbpf: Factor out low-level BPF program loading helper Refactor the low-level API for BPF program loading to not rely on public API types. This allows painless extension without constant effort to cleverly avoid breaking backwards compatibility.
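A hedged sketch of the resulting internal calling convention (the params struct and helper are declared in libbpf_internal.h below, so this is only usable from within libbpf; the wrapper function here is illustrative):

	#include <linux/bpf.h>
	#include "libbpf_internal.h"

	/* Minimal internal-style load: only the fields a caller cares about
	 * are set; everything else stays zero, which is why the struct can
	 * grow new fields without breaking existing call sites.
	 */
	static int load_min_prog(const struct bpf_insn *insns, size_t insn_cnt,
				 char *log_buf, size_t log_buf_sz)
	{
		struct bpf_prog_load_params p = {
			.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
			.insns = insns,
			.insn_cnt = insn_cnt,
			.license = "GPL",
			.log_level = log_buf ? 1 : 0,
			.log_buf = log_buf,
			.log_buf_sz = log_buf_sz,
		};

		return libbpf__bpf_prog_load(&p);
	}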
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201203204634.1325171-12-andrii@kernel.org --- tools/lib/bpf/bpf.c | 100 ++++++++++++++++++++++---------- tools/lib/bpf/libbpf.c | 34 +++++------ tools/lib/bpf/libbpf_internal.h | 29 +++++++++ 3 files changed, 113 insertions(+), 50 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 4025266d0fb0..5d681ce32b37 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -215,59 +215,52 @@ alloc_zero_tailing_info(const void *orecord, __u32 cnt, return info; } -int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, - char *log_buf, size_t log_buf_sz) +int libbpf__bpf_prog_load(const struct bpf_prog_load_params *load_attr) { void *finfo = NULL, *linfo = NULL; union bpf_attr attr; - __u32 log_level; int fd; - if (!load_attr || !log_buf != !log_buf_sz) + if (!load_attr->log_buf != !load_attr->log_buf_sz) return -EINVAL; - log_level = load_attr->log_level; - if (log_level > (4 | 2 | 1) || (log_level && !log_buf)) + if (load_attr->log_level > (4 | 2 | 1) || (load_attr->log_level && !load_attr->log_buf)) return -EINVAL; memset(&attr, 0, sizeof(attr)); attr.prog_type = load_attr->prog_type; attr.expected_attach_type = load_attr->expected_attach_type; - if (attr.prog_type == BPF_PROG_TYPE_STRUCT_OPS || - attr.prog_type == BPF_PROG_TYPE_LSM) { - attr.attach_btf_id = load_attr->attach_btf_id; - } else if (attr.prog_type == BPF_PROG_TYPE_TRACING || - attr.prog_type == BPF_PROG_TYPE_EXT) { - attr.attach_btf_id = load_attr->attach_btf_id; - attr.attach_prog_fd = load_attr->attach_prog_fd; - } else { - attr.prog_ifindex = load_attr->prog_ifindex; - attr.kern_version = load_attr->kern_version; - } - attr.insn_cnt = (__u32)load_attr->insns_cnt; + + attr.attach_btf_id = load_attr->attach_btf_id; + attr.attach_prog_fd = load_attr->attach_prog_fd; + + attr.prog_ifindex = load_attr->prog_ifindex; + attr.kern_version = load_attr->kern_version; + + attr.insn_cnt = (__u32)load_attr->insn_cnt; attr.insns = ptr_to_u64(load_attr->insns); attr.license = ptr_to_u64(load_attr->license); - attr.log_level = log_level; - if (log_level) { - attr.log_buf = ptr_to_u64(log_buf); - attr.log_size = log_buf_sz; - } else { - attr.log_buf = ptr_to_u64(NULL); - attr.log_size = 0; + attr.log_level = load_attr->log_level; + if (attr.log_level) { + attr.log_buf = ptr_to_u64(load_attr->log_buf); + attr.log_size = load_attr->log_buf_sz; } attr.prog_btf_fd = load_attr->prog_btf_fd; + attr.prog_flags = load_attr->prog_flags; + attr.func_info_rec_size = load_attr->func_info_rec_size; attr.func_info_cnt = load_attr->func_info_cnt; attr.func_info = ptr_to_u64(load_attr->func_info); + attr.line_info_rec_size = load_attr->line_info_rec_size; attr.line_info_cnt = load_attr->line_info_cnt; attr.line_info = ptr_to_u64(load_attr->line_info); + if (load_attr->name) memcpy(attr.prog_name, load_attr->name, - min(strlen(load_attr->name), BPF_OBJ_NAME_LEN - 1)); - attr.prog_flags = load_attr->prog_flags; + min(strlen(load_attr->name), (size_t)BPF_OBJ_NAME_LEN - 1)); fd = sys_bpf_prog_load(&attr, sizeof(attr)); if (fd >= 0) @@ -307,19 +300,19 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, } fd = sys_bpf_prog_load(&attr, sizeof(attr)); - if (fd >= 0) goto done; } - if (log_level || !log_buf) + if (load_attr->log_level || !load_attr->log_buf) goto done; /* Try again with log */ - attr.log_buf = ptr_to_u64(log_buf); - attr.log_size = log_buf_sz; + attr.log_buf = 
ptr_to_u64(load_attr->log_buf); + attr.log_size = load_attr->log_buf_sz; attr.log_level = 1; - log_buf[0] = 0; + load_attr->log_buf[0] = 0; + fd = sys_bpf_prog_load(&attr, sizeof(attr)); done: free(finfo); @@ -327,6 +320,49 @@ done: return fd; } +int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, + char *log_buf, size_t log_buf_sz) +{ + struct bpf_prog_load_params p = {}; + + if (!load_attr || !log_buf != !log_buf_sz) + return -EINVAL; + + p.prog_type = load_attr->prog_type; + p.expected_attach_type = load_attr->expected_attach_type; + switch (p.prog_type) { + case BPF_PROG_TYPE_STRUCT_OPS: + case BPF_PROG_TYPE_LSM: + p.attach_btf_id = load_attr->attach_btf_id; + break; + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_EXT: + p.attach_btf_id = load_attr->attach_btf_id; + p.attach_prog_fd = load_attr->attach_prog_fd; + break; + default: + p.prog_ifindex = load_attr->prog_ifindex; + p.kern_version = load_attr->kern_version; + } + p.insn_cnt = load_attr->insns_cnt; + p.insns = load_attr->insns; + p.license = load_attr->license; + p.log_level = load_attr->log_level; + p.log_buf = log_buf; + p.log_buf_sz = log_buf_sz; + p.prog_btf_fd = load_attr->prog_btf_fd; + p.func_info_rec_size = load_attr->func_info_rec_size; + p.func_info_cnt = load_attr->func_info_cnt; + p.func_info = load_attr->func_info; + p.line_info_rec_size = load_attr->line_info_rec_size; + p.line_info_cnt = load_attr->line_info_cnt; + p.line_info = load_attr->line_info; + p.name = load_attr->name; + p.prog_flags = load_attr->prog_flags; + + return libbpf__bpf_prog_load(&p); +} + int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, size_t insns_cnt, const char *license, __u32 kern_version, char *log_buf, diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index ca20e493726d..103d66e27406 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6809,7 +6809,7 @@ static int load_program(struct bpf_program *prog, struct bpf_insn *insns, int insns_cnt, char *license, __u32 kern_version, int *pfd) { - struct bpf_load_program_attr load_attr; + struct bpf_prog_load_params load_attr = {}; char *cp, errmsg[STRERR_BUFSIZE]; size_t log_buf_size = 0; char *log_buf = NULL; @@ -6828,7 +6828,6 @@ load_program(struct bpf_program *prog, struct bpf_insn *insns, int insns_cnt, if (!insns || !insns_cnt) return -EINVAL; - memset(&load_attr, 0, sizeof(struct bpf_load_program_attr)); load_attr.prog_type = prog->type; /* old kernels might not support specifying expected_attach_type */ if (!kernel_supports(FEAT_EXP_ATTACH_TYPE) && prog->sec_def && @@ -6839,19 +6838,14 @@ load_program(struct bpf_program *prog, struct bpf_insn *insns, int insns_cnt, if (kernel_supports(FEAT_PROG_NAME)) load_attr.name = prog->name; load_attr.insns = insns; - load_attr.insns_cnt = insns_cnt; + load_attr.insn_cnt = insns_cnt; load_attr.license = license; - if (prog->type == BPF_PROG_TYPE_STRUCT_OPS || - prog->type == BPF_PROG_TYPE_LSM) { - load_attr.attach_btf_id = prog->attach_btf_id; - } else if (prog->type == BPF_PROG_TYPE_TRACING || - prog->type == BPF_PROG_TYPE_EXT) { - load_attr.attach_prog_fd = prog->attach_prog_fd; - load_attr.attach_btf_id = prog->attach_btf_id; - } else { - load_attr.kern_version = kern_version; - load_attr.prog_ifindex = prog->prog_ifindex; - } + load_attr.attach_btf_id = prog->attach_btf_id; + load_attr.attach_prog_fd = prog->attach_prog_fd; + load_attr.attach_btf_id = prog->attach_btf_id; + load_attr.kern_version = kern_version; + load_attr.prog_ifindex = prog->prog_ifindex; + /* 
specify func_info/line_info only if kernel supports them */ btf_fd = bpf_object__btf_fd(prog->obj); if (btf_fd >= 0 && kernel_supports(FEAT_BTF_FUNC)) { @@ -6875,7 +6869,9 @@ retry_load: *log_buf = 0; } - ret = bpf_load_program_xattr(&load_attr, log_buf, log_buf_size); + load_attr.log_buf = log_buf; + load_attr.log_buf_sz = log_buf_size; + ret = libbpf__bpf_prog_load(&load_attr); if (ret >= 0) { if (log_buf && load_attr.log_level) @@ -6916,9 +6912,9 @@ retry_load: pr_warn("-- BEGIN DUMP LOG ---\n"); pr_warn("\n%s\n", log_buf); pr_warn("-- END LOG --\n"); - } else if (load_attr.insns_cnt >= BPF_MAXINSNS) { + } else if (load_attr.insn_cnt >= BPF_MAXINSNS) { pr_warn("Program too large (%zu insns), at most %d insns\n", - load_attr.insns_cnt, BPF_MAXINSNS); + load_attr.insn_cnt, BPF_MAXINSNS); ret = -LIBBPF_ERRNO__PROG2BIG; } else if (load_attr.prog_type != BPF_PROG_TYPE_KPROBE) { /* Wrong program type? */ @@ -6926,7 +6922,9 @@ retry_load: load_attr.prog_type = BPF_PROG_TYPE_KPROBE; load_attr.expected_attach_type = 0; - fd = bpf_load_program_xattr(&load_attr, NULL, 0); + load_attr.log_buf = NULL; + load_attr.log_buf_sz = 0; + fd = libbpf__bpf_prog_load(&load_attr); if (fd >= 0) { close(fd); ret = -LIBBPF_ERRNO__PROGTYPE; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index e569ae63808e..681073a67ae3 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -151,6 +151,35 @@ int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz); int libbpf__load_raw_btf(const char *raw_types, size_t types_len, const char *str_sec, size_t str_len); +struct bpf_prog_load_params { + enum bpf_prog_type prog_type; + enum bpf_attach_type expected_attach_type; + const char *name; + const struct bpf_insn *insns; + size_t insn_cnt; + const char *license; + __u32 kern_version; + __u32 attach_prog_fd; + __u32 attach_btf_id; + __u32 prog_ifindex; + __u32 prog_btf_fd; + __u32 prog_flags; + + __u32 func_info_rec_size; + const void *func_info; + __u32 func_info_cnt; + + __u32 line_info_rec_size; + const void *line_info; + __u32 line_info_cnt; + + __u32 log_level; + char *log_buf; + size_t log_buf_sz; +}; + +int libbpf__bpf_prog_load(const struct bpf_prog_load_params *load_attr); + int bpf_object__section_size(const struct bpf_object *obj, const char *name, __u32 *size); int bpf_object__variable_offset(const struct bpf_object *obj, const char *name, From 91abb4a6d79df6c4dcd86d85632df53c8cca2dcf Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Dec 2020 12:46:32 -0800 Subject: [PATCH 114/118] libbpf: Support attachment of BPF tracing programs to kernel modules Teach libbpf to search for BTF types in kernel modules for tracing BPF programs. This allows attachment of raw_tp/fentry/fexit/fmod_ret/etc BPF program types to tracepoints and functions in kernel modules. 
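For example (a hedged sketch, assuming bpf_testmod.ko from earlier in this series is loaded), with this change libbpf can resolve an attach target that only exists in a module's BTF, so a section name can point straight at a module function; the program name is made up:

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	char _license[] SEC("license") = "GPL";

	/* fentry programs get BTF-typed, directly accessible arguments;
	 * libbpf finds bpf_testmod_test_read in the module's BTF and fills
	 * in attach_btf_obj_fd/attach_btf_id at load time.
	 */
	SEC("fentry/bpf_testmod_test_read")
	int BPF_PROG(on_testmod_read, struct file *file, struct kobject *kobj,
		     struct bin_attribute *bin_attr, char *buf, loff_t off,
		     size_t len)
	{
		bpf_printk("bpf_testmod_test_read, len=%lu", (unsigned long)len);
		return 0;
	}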
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201203204634.1325171-13-andrii@kernel.org --- tools/lib/bpf/bpf.c | 5 +- tools/lib/bpf/libbpf.c | 138 +++++++++++++++++++++++++------- tools/lib/bpf/libbpf_internal.h | 1 + 3 files changed, 112 insertions(+), 32 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 5d681ce32b37..bba48ff4c5c0 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -231,8 +231,11 @@ int libbpf__bpf_prog_load(const struct bpf_prog_load_params *load_attr) attr.prog_type = load_attr->prog_type; attr.expected_attach_type = load_attr->expected_attach_type; + if (load_attr->attach_prog_fd) + attr.attach_prog_fd = load_attr->attach_prog_fd; + else + attr.attach_btf_obj_fd = load_attr->attach_btf_obj_fd; attr.attach_btf_id = load_attr->attach_btf_id; - attr.attach_prog_fd = load_attr->attach_prog_fd; attr.prog_ifindex = load_attr->prog_ifindex; attr.kern_version = load_attr->kern_version; diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 103d66e27406..912e01b946fe 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -278,6 +278,7 @@ struct bpf_program { enum bpf_prog_type type; enum bpf_attach_type expected_attach_type; int prog_ifindex; + __u32 attach_btf_obj_fd; __u32 attach_btf_id; __u32 attach_prog_fd; void *func_info; @@ -408,6 +409,7 @@ struct module_btf { struct btf *btf; char *name; __u32 id; + int fd; }; struct bpf_object { @@ -4766,8 +4768,7 @@ static int load_module_btfs(struct bpf_object *obj) if (err) { err = -errno; pr_warn("failed to get BTF object #%d info: %d\n", id, err); - close(fd); - return err; + goto err_out; } /* ignore non-module BTFs */ @@ -4777,25 +4778,33 @@ static int load_module_btfs(struct bpf_object *obj) } btf = btf_get_from_fd(fd, obj->btf_vmlinux); - close(fd); if (IS_ERR(btf)) { pr_warn("failed to load module [%s]'s BTF object #%d: %ld\n", name, id, PTR_ERR(btf)); - return PTR_ERR(btf); + err = PTR_ERR(btf); + goto err_out; } err = btf_ensure_mem((void **)&obj->btf_modules, &obj->btf_module_cap, sizeof(*obj->btf_modules), obj->btf_module_cnt + 1); if (err) - return err; + goto err_out; mod_btf = &obj->btf_modules[obj->btf_module_cnt++]; mod_btf->btf = btf; mod_btf->id = id; + mod_btf->fd = fd; mod_btf->name = strdup(name); - if (!mod_btf->name) - return -ENOMEM; + if (!mod_btf->name) { + err = -ENOMEM; + goto err_out; + } + continue; + +err_out: + close(fd); + return err; } return 0; @@ -6841,7 +6850,10 @@ load_program(struct bpf_program *prog, struct bpf_insn *insns, int insns_cnt, load_attr.insn_cnt = insns_cnt; load_attr.license = license; load_attr.attach_btf_id = prog->attach_btf_id; - load_attr.attach_prog_fd = prog->attach_prog_fd; + if (prog->attach_prog_fd) + load_attr.attach_prog_fd = prog->attach_prog_fd; + else + load_attr.attach_btf_obj_fd = prog->attach_btf_obj_fd; load_attr.attach_btf_id = prog->attach_btf_id; load_attr.kern_version = kern_version; load_attr.prog_ifindex = prog->prog_ifindex; @@ -6937,11 +6949,11 @@ out: return ret; } -static int libbpf_find_attach_btf_id(struct bpf_program *prog); +static int libbpf_find_attach_btf_id(struct bpf_program *prog, int *btf_obj_fd, int *btf_type_id); int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver) { - int err = 0, fd, i, btf_id; + int err = 0, fd, i; if (prog->obj->loaded) { pr_warn("prog '%s': can't load after object was loaded\n", prog->name); @@ -6951,10 +6963,14 @@ int bpf_program__load(struct bpf_program *prog, char *license, 
__u32 kern_ver)
 	if ((prog->type == BPF_PROG_TYPE_TRACING ||
 	     prog->type == BPF_PROG_TYPE_LSM ||
 	     prog->type == BPF_PROG_TYPE_EXT) && !prog->attach_btf_id) {
-		btf_id = libbpf_find_attach_btf_id(prog);
-		if (btf_id <= 0)
-			return btf_id;
-		prog->attach_btf_id = btf_id;
+		int btf_obj_fd = 0, btf_type_id = 0;
+
+		err = libbpf_find_attach_btf_id(prog, &btf_obj_fd, &btf_type_id);
+		if (err)
+			return err;
+
+		prog->attach_btf_obj_fd = btf_obj_fd;
+		prog->attach_btf_id = btf_type_id;
 	}
 
 	if (prog->instances.nr < 0 || !prog->instances.fds) {
@@ -7467,6 +7483,7 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr)
 
 	/* clean up module BTFs */
 	for (i = 0; i < obj->btf_module_cnt; i++) {
+		close(obj->btf_modules[i].fd);
 		btf__free(obj->btf_modules[i].btf);
 		free(obj->btf_modules[i].name);
 	}
@@ -8821,8 +8838,8 @@ static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix,
 	return btf__find_by_name_kind(btf, btf_type_name, kind);
 }
 
-static inline int __find_vmlinux_btf_id(struct btf *btf, const char *name,
-					enum bpf_attach_type attach_type)
+static inline int find_attach_btf_id(struct btf *btf, const char *name,
+				     enum bpf_attach_type attach_type)
 {
 	int err;
 
@@ -8838,9 +8855,6 @@ static inline int __find_vmlinux_btf_id(struct btf *btf, const char *name,
 	else
 		err = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC);
 
-	if (err <= 0)
-		pr_warn("%s is not found in vmlinux BTF\n", name);
-
 	return err;
 }
 
@@ -8856,7 +8870,10 @@ int libbpf_find_vmlinux_btf_id(const char *name,
 		return -EINVAL;
 	}
 
-	err = __find_vmlinux_btf_id(btf, name, attach_type);
+	err = find_attach_btf_id(btf, name, attach_type);
+	if (err <= 0)
+		pr_warn("%s is not found in vmlinux BTF\n", name);
+
 	btf__free(btf);
 	return err;
 }
@@ -8894,11 +8911,49 @@ out:
 	return err;
 }
 
-static int libbpf_find_attach_btf_id(struct bpf_program *prog)
+static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name,
+			      enum bpf_attach_type attach_type,
+			      int *btf_obj_fd, int *btf_type_id)
+{
+	int ret, i;
+
+	ret = find_attach_btf_id(obj->btf_vmlinux, attach_name, attach_type);
+	if (ret > 0) {
+		*btf_obj_fd = 0; /* vmlinux BTF */
+		*btf_type_id = ret;
+		return 0;
+	}
+	if (ret != -ENOENT)
+		return ret;
+
+	ret = load_module_btfs(obj);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < obj->btf_module_cnt; i++) {
+		const struct module_btf *mod = &obj->btf_modules[i];
+
+		ret = find_attach_btf_id(mod->btf, attach_name, attach_type);
+		if (ret > 0) {
+			*btf_obj_fd = mod->fd;
+			*btf_type_id = ret;
+			return 0;
+		}
+		if (ret == -ENOENT)
+			continue;
+
+		return ret;
+	}
+
+	return -ESRCH;
+}
+
+static int libbpf_find_attach_btf_id(struct bpf_program *prog, int *btf_obj_fd, int *btf_type_id)
 {
 	enum bpf_attach_type attach_type = prog->expected_attach_type;
 	__u32 attach_prog_fd = prog->attach_prog_fd;
-	const char *name = prog->sec_name;
+	const char *name = prog->sec_name, *attach_name;
+	const struct bpf_sec_def *sec = NULL;
 	int i, err;
 
 	if (!name)
@@ -8909,17 +8964,37 @@ static int libbpf_find_attach_btf_id(struct bpf_program *prog)
 			continue;
 		if (strncmp(name, section_defs[i].sec, section_defs[i].len))
 			continue;
-		if (attach_prog_fd)
-			err = libbpf_find_prog_btf_id(name + section_defs[i].len,
-						      attach_prog_fd);
-		else
-			err = __find_vmlinux_btf_id(prog->obj->btf_vmlinux,
-						    name + section_defs[i].len,
-						    attach_type);
+
+		sec = &section_defs[i];
+		break;
+	}
+
+	if (!sec) {
+		pr_warn("failed to identify BTF ID based on ELF section name '%s'\n", name);
+		return -ESRCH;
+	}
+	attach_name = name + sec->len;
+
+	/* BPF program's BTF ID */
+	if
(attach_prog_fd) { + err = libbpf_find_prog_btf_id(attach_name, attach_prog_fd); + if (err < 0) { + pr_warn("failed to find BPF program (FD %d) BTF ID for '%s': %d\n", + attach_prog_fd, attach_name, err); + return err; + } + *btf_obj_fd = 0; + *btf_type_id = err; + return 0; + } + + /* kernel/module BTF ID */ + err = find_kernel_btf_id(prog->obj, attach_name, attach_type, btf_obj_fd, btf_type_id); + if (err) { + pr_warn("failed to find kernel BTF type ID of '%s': %d\n", attach_name, err); return err; } - pr_warn("failed to identify btf_id based on ELF section name '%s'\n", name); - return -ESRCH; + return 0; } int libbpf_attach_type_by_name(const char *name, @@ -10808,6 +10883,7 @@ int bpf_program__set_attach_target(struct bpf_program *prog, return btf_id; prog->attach_btf_id = btf_id; + prog->attach_btf_obj_fd = 0; prog->attach_prog_fd = attach_prog_fd; return 0; } diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 681073a67ae3..969d0ac592ba 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -160,6 +160,7 @@ struct bpf_prog_load_params { const char *license; __u32 kern_version; __u32 attach_prog_fd; + __u32 attach_btf_obj_fd; __u32 attach_btf_id; __u32 prog_ifindex; __u32 prog_btf_fd; From bc9ed69c79ae7577314a24e09c5b0d1c1c314ced Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Dec 2020 12:46:33 -0800 Subject: [PATCH 115/118] selftests/bpf: Add tp_btf CO-RE reloc test for modules Add another CO-RE relocation test for kernel module relocations. This time for tp_btf with direct memory reads. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201203204634.1325171-14-andrii@kernel.org --- .../selftests/bpf/prog_tests/core_reloc.c | 3 +- .../bpf/progs/test_core_reloc_module.c | 32 ++++++++++++++++++- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index bb980848cd77..06eb956ff7bb 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -514,7 +514,8 @@ static struct core_reloc_test_case test_cases[] = { }, /* validate we can find kernel module BTF types for relocs/attach */ - MODULES_CASE("module", "raw_tp/bpf_testmod_test_read", "bpf_testmod_test_read"), + MODULES_CASE("module_probed", "raw_tp/bpf_testmod_test_read", "bpf_testmod_test_read"), + MODULES_CASE("module_direct", "tp_btf/bpf_testmod_test_read", NULL), /* validate BPF program can use multiple flavors to match against * single target BTF type diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_module.c b/tools/testing/selftests/bpf/progs/test_core_reloc_module.c index d1840c1a9d36..56363959f7b0 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_module.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_module.c @@ -36,7 +36,7 @@ struct core_reloc_module_output { }; SEC("raw_tp/bpf_testmod_test_read") -int BPF_PROG(test_core_module, +int BPF_PROG(test_core_module_probed, struct task_struct *task, struct bpf_testmod_test_read_ctx *read_ctx) { @@ -64,3 +64,33 @@ int BPF_PROG(test_core_module, return 0; } + +SEC("tp_btf/bpf_testmod_test_read") +int BPF_PROG(test_core_module_direct, + struct task_struct *task, + struct bpf_testmod_test_read_ctx *read_ctx) +{ + struct core_reloc_module_output *out = (void *)&data.out; + __u64 pid_tgid = bpf_get_current_pid_tgid(); + __u32 real_tgid = 
(__u32)(pid_tgid >> 32);
+	__u32 real_pid = (__u32)pid_tgid;
+
+	if (data.my_pid_tgid != pid_tgid)
+		return 0;
+
+	if (task->pid != real_pid || task->tgid != real_tgid)
+		return 0;
+
+	out->len = read_ctx->len;
+	out->off = read_ctx->off;
+
+	out->read_ctx_sz = bpf_core_type_size(struct bpf_testmod_test_read_ctx);
+	out->read_ctx_exists = bpf_core_type_exists(struct bpf_testmod_test_read_ctx);
+	out->buf_exists = bpf_core_field_exists(read_ctx->buf);
+	out->off_exists = bpf_core_field_exists(read_ctx->off);
+	out->len_exists = bpf_core_field_exists(read_ctx->len);
+
+	out->comm_len = BPF_CORE_READ_STR_INTO(&out->comm, task, comm);
+
+	return 0;
+}

From 1e38abefcfd65f3ef7b12895dfd48db80aca28da Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Thu, 3 Dec 2020 12:46:34 -0800
Subject: [PATCH 116/118] selftests/bpf: Add fentry/fexit/fmod_ret selftest for kernel module

Add a new selftest checking attachment of fentry/fexit/fmod_ret (and raw
tracepoint ones for completeness) BPF programs to a kernel module function.

Signed-off-by: Andrii Nakryiko
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/20201203204634.1325171-15-andrii@kernel.org
---
 .../selftests/bpf/prog_tests/module_attach.c | 53 +++++++++++++++
 .../selftests/bpf/progs/test_module_attach.c | 66 +++++++++++++++++++
 2 files changed, 119 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/module_attach.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_module_attach.c

diff --git a/tools/testing/selftests/bpf/prog_tests/module_attach.c b/tools/testing/selftests/bpf/prog_tests/module_attach.c
new file mode 100644
index 000000000000..4b65e9918764
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/module_attach.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#include <test_progs.h>
+#include "test_module_attach.skel.h"
+
+static int duration;
+
+static int trigger_module_test_read(int read_sz)
+{
+	int fd, err;
+
+	fd = open("/sys/kernel/bpf_testmod", O_RDONLY);
+	err = -errno;
+	if (CHECK(fd < 0, "testmod_file_open", "failed: %d\n", err))
+		return err;
+
+	read(fd, NULL, read_sz);
+	close(fd);
+
+	return 0;
+}
+
+void test_module_attach(void)
+{
+	const int READ_SZ = 456;
+	struct test_module_attach* skel;
+	struct test_module_attach__bss *bss;
+	int err;
+
+	skel = test_module_attach__open_and_load();
+	if (CHECK(!skel, "skel_open", "failed to open skeleton\n"))
+		return;
+
+	bss = skel->bss;
+
+	err = test_module_attach__attach(skel);
+	if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err))
+		goto cleanup;
+
+	/* trigger tracepoint */
+	ASSERT_OK(trigger_module_test_read(READ_SZ), "trigger_read");
+
+	ASSERT_EQ(bss->raw_tp_read_sz, READ_SZ, "raw_tp");
+	ASSERT_EQ(bss->tp_btf_read_sz, READ_SZ, "tp_btf");
+	ASSERT_EQ(bss->fentry_read_sz, READ_SZ, "fentry");
+	ASSERT_EQ(bss->fexit_read_sz, READ_SZ, "fexit");
+	ASSERT_EQ(bss->fexit_ret, -EIO, "fexit_tet");
+	ASSERT_EQ(bss->fmod_ret_read_sz, READ_SZ, "fmod_ret");
+
+cleanup:
+	test_module_attach__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_module_attach.c b/tools/testing/selftests/bpf/progs/test_module_attach.c
new file mode 100644
index 000000000000..b563563df172
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_module_attach.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include "../bpf_testmod/bpf_testmod.h"
+
+__u32 raw_tp_read_sz = 0;
+
+SEC("raw_tp/bpf_testmod_test_read") +int BPF_PROG(handle_raw_tp, + struct task_struct *task, struct bpf_testmod_test_read_ctx *read_ctx) +{ + raw_tp_read_sz = BPF_CORE_READ(read_ctx, len); + return 0; +} + +__u32 tp_btf_read_sz = 0; + +SEC("tp_btf/bpf_testmod_test_read") +int BPF_PROG(handle_tp_btf, + struct task_struct *task, struct bpf_testmod_test_read_ctx *read_ctx) +{ + tp_btf_read_sz = read_ctx->len; + return 0; +} + +__u32 fentry_read_sz = 0; + +SEC("fentry/bpf_testmod_test_read") +int BPF_PROG(handle_fentry, + struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) +{ + fentry_read_sz = len; + return 0; +} + +__u32 fexit_read_sz = 0; +int fexit_ret = 0; + +SEC("fexit/bpf_testmod_test_read") +int BPF_PROG(handle_fexit, + struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len, + int ret) +{ + fexit_read_sz = len; + fexit_ret = ret; + return 0; +} + +__u32 fmod_ret_read_sz = 0; + +SEC("fmod_ret/bpf_testmod_test_read") +int BPF_PROG(handle_fmod_ret, + struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) +{ + fmod_ret_read_sz = len; + return 0; /* don't override the exit code */ +} + +char _license[] SEC("license") = "GPL"; From 3015b500ae42356936b9b4a8b660eacaee7a6147 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Dec 2020 15:54:39 -0800 Subject: [PATCH 117/118] libbpf: Use memcpy instead of strncpy to please GCC Some versions of GCC are really nit-picky about strncpy() use. Use memcpy(), as they are pretty much equivalent for the case of fixed length strings. Fixes: e459f49b4394 ("libbpf: Separate XDP program load with xsk socket creation") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201203235440.2302137-1-andrii@kernel.org --- tools/lib/bpf/xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index 4b051ec7cfbb..e3e41ceeb1bc 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -583,7 +583,7 @@ static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk) } ctx->ifindex = ifindex; - strncpy(ctx->ifname, ifname, IFNAMSIZ - 1); + memcpy(ctx->ifname, ifname, IFNAMSIZ -1); ctx->ifname[IFNAMSIZ - 1] = 0; xsk->ctx = ctx; From eceae70bdeaeb6b8ceb662983cf663ff352fbc96 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Dec 2020 15:54:40 -0800 Subject: [PATCH 118/118] selftests/bpf: Fix invalid use of strncat in test_sockmap strncat()'s third argument is how many bytes will be added *in addition* to already existing bytes in destination. Plus extra zero byte will be added after that. So existing use in test_sockmap has many opportunities to overflow the string and cause memory corruptions. And in this case, GCC complains for a good reason. 
Fixes: 16962b2404ac ("bpf: sockmap, add selftests") Fixes: 73563aa3d977 ("selftests/bpf: test_sockmap, print additional test options") Fixes: 1ade9abadfca ("bpf: test_sockmap, add options for msg_pop_data() helper") Fixes: 463bac5f1ca7 ("bpf, selftests: Add test for ktls with skb bpf ingress policy") Fixes: e9dd904708c4 ("bpf: add tls support for testing in test_sockmap") Fixes: 753fb2ee0934 ("bpf: sockmap, add msg_peek tests to test_sockmap") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201203235440.2302137-2-andrii@kernel.org --- tools/testing/selftests/bpf/test_sockmap.c | 36 ++++++++++++++-------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 0fa1e421c3d7..427ca00a3217 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -1273,6 +1273,16 @@ static char *test_to_str(int test) return "unknown"; } +static void append_str(char *dst, const char *src, size_t dst_cap) +{ + size_t avail = dst_cap - strlen(dst); + + if (avail <= 1) /* just zero byte could be written */ + return; + + strncat(dst, src, avail - 1); /* strncat() adds + 1 for zero byte */ +} + #define OPTSTRING 60 static void test_options(char *options) { @@ -1281,42 +1291,42 @@ static void test_options(char *options) memset(options, 0, OPTSTRING); if (txmsg_pass) - strncat(options, "pass,", OPTSTRING); + append_str(options, "pass,", OPTSTRING); if (txmsg_redir) - strncat(options, "redir,", OPTSTRING); + append_str(options, "redir,", OPTSTRING); if (txmsg_drop) - strncat(options, "drop,", OPTSTRING); + append_str(options, "drop,", OPTSTRING); if (txmsg_apply) { snprintf(tstr, OPTSTRING, "apply %d,", txmsg_apply); - strncat(options, tstr, OPTSTRING); + append_str(options, tstr, OPTSTRING); } if (txmsg_cork) { snprintf(tstr, OPTSTRING, "cork %d,", txmsg_cork); - strncat(options, tstr, OPTSTRING); + append_str(options, tstr, OPTSTRING); } if (txmsg_start) { snprintf(tstr, OPTSTRING, "start %d,", txmsg_start); - strncat(options, tstr, OPTSTRING); + append_str(options, tstr, OPTSTRING); } if (txmsg_end) { snprintf(tstr, OPTSTRING, "end %d,", txmsg_end); - strncat(options, tstr, OPTSTRING); + append_str(options, tstr, OPTSTRING); } if (txmsg_start_pop) { snprintf(tstr, OPTSTRING, "pop (%d,%d),", txmsg_start_pop, txmsg_start_pop + txmsg_pop); - strncat(options, tstr, OPTSTRING); + append_str(options, tstr, OPTSTRING); } if (txmsg_ingress) - strncat(options, "ingress,", OPTSTRING); + append_str(options, "ingress,", OPTSTRING); if (txmsg_redir_skb) - strncat(options, "redir_skb,", OPTSTRING); + append_str(options, "redir_skb,", OPTSTRING); if (txmsg_ktls_skb) - strncat(options, "ktls_skb,", OPTSTRING); + append_str(options, "ktls_skb,", OPTSTRING); if (ktls) - strncat(options, "ktls,", OPTSTRING); + append_str(options, "ktls,", OPTSTRING); if (peek_flag) - strncat(options, "peek,", OPTSTRING); + append_str(options, "peek,", OPTSTRING); } static int __test_exec(int cgrp, int test, struct sockmap_options *opt)
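
For reference, a hedged usage sketch of the new append_str() helper
(option values hypothetical): option strings now saturate at the buffer
capacity instead of overflowing it:

  char options[OPTSTRING] = "";

  append_str(options, "pass,", OPTSTRING);	/* appended in full */
  append_str(options, "redir,", OPTSTRING);	/* appended while space remains */
  /* once only the terminating NUL fits, further calls become no-ops */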