From 2f99619820c2269534eb2c0cde44870313c6d353 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson
Date: Thu, 17 Jun 2021 11:22:55 +0200
Subject: [PATCH 01/14] xsk: Fix missing validation for skb and unaligned mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix a missing validation of a Tx descriptor when executing in skb mode
and the umem is in unaligned mode. A descriptor could point to a buffer
straddling the end of the umem, thus effectively tricking the kernel
into reading outside the allowed umem region. This could lead to a
kernel crash if that part of memory is not mapped.

In zero-copy mode, the descriptor validation code rejects such
descriptors by checking a bit in the DMA address that tells us if the
next page is physically contiguous or not. For the last page in the
umem, this bit is not set, therefore any descriptor pointing to a
packet straddling this last page boundary will be rejected. However,
the skb path does not use this bit since it copies out data and can do
so to two different pages. (It also does not have the array of DMA
addresses, so it cannot even store this bit.) The code just returned
that the packet is always physically contiguous. But this was
unfortunately also returned for the last page in the umem, which means
that packets crossing the end of the umem were being allowed when they
should not be.

Fix this by introducing a check for this case in the skb path only, so
as not to penalize the zero-copy path.

Fixes: 2b43470add8c ("xsk: Introduce AF_XDP buffer allocation API")
Signed-off-by: Magnus Karlsson
Signed-off-by: Daniel Borkmann
Acked-by: Björn Töpel
Link: https://lore.kernel.org/bpf/20210617092255.3487-1-magnus.karlsson@gmail.com
---
 include/net/xsk_buff_pool.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index eaa8386dbc63..7a9a23e7a604 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -147,11 +147,16 @@ static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool,
 {
 	bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE;
 
-	if (pool->dma_pages_cnt && cross_pg) {
+	if (likely(!cross_pg))
+		return false;
+
+	if (pool->dma_pages_cnt) {
 		return !(pool->dma_pages[addr >> PAGE_SHIFT] &
 			 XSK_NEXT_PG_CONTIG_MASK);
 	}
-	return false;
+
+	/* skb path */
+	return addr + len > pool->addrs_cnt;
 }
 
 static inline u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr)

From f654fae47e83e56b454fbbfd0af0a4f232e356d6 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson
Date: Fri, 18 Jun 2021 09:58:05 +0200
Subject: [PATCH 02/14] xsk: Fix broken Tx ring validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix broken Tx ring validation for AF_XDP. The commit under the Fixes
tag fixed an off-by-one error in the validation, but introduced another
error. Descriptors are now let through even if they straddle a chunk
boundary, which they are not allowed to do in aligned mode. Worse, they
are let through even if they straddle the end of the umem itself,
tricking the kernel into reading data outside the allowed umem region,
which might or might not be mapped at all.

Fix this by reintroducing the old code, but subtracting one from the
length to address the off-by-one error that the original patch was
fixing. The test chunk != chunk_end makes sure packets do not straddle
chunk boundaries. Note that packets of zero length are allowed in the
interface, hence the test that the length is non-zero.
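[ Editor's note: a worked example of the check above, for illustration
  only; it is not part of the patch. Assume aligned mode with a
  2048-byte chunk size. A descriptor with addr == 2047 and len == 2
  ends in the next chunk, so it is rejected; with len == 1 it stays
  inside chunk 0 and passes, and len == 0 skips the straddle test
  entirely. Below is a minimal userspace sketch of just the straddle
  and bounds checks, assuming the power-of-two chunk sizes that
  aligned mode requires; all names are stand-ins, not kernel code. ]

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Userspace model of xp_aligned_extract_addr(): mask off the in-chunk
 * offset, leaving the base address of the chunk the address falls in.
 */
static uint64_t extract_chunk(uint64_t addr, uint64_t chunk_mask)
{
	return addr & chunk_mask;
}

static bool aligned_desc_ok(uint64_t addr, uint32_t len,
			    uint64_t chunk_size, uint64_t addrs_cnt)
{
	uint64_t chunk_mask = ~(chunk_size - 1);
	uint64_t chunk, chunk_end;

	chunk = extract_chunk(addr, chunk_mask);
	if (len) {	/* zero-length packets skip the straddle test */
		chunk_end = extract_chunk(addr + len - 1, chunk_mask);
		if (chunk != chunk_end)
			return false;	/* straddles a chunk boundary */
	}
	return chunk < addrs_cnt;	/* must lie inside the umem */
}

int main(void)
{
	printf("%d\n", aligned_desc_ok(2047, 2, 2048, 1 << 20));	/* 0 */
	printf("%d\n", aligned_desc_ok(2047, 1, 2048, 1 << 20));	/* 1 */
	return 0;
}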
Fixes: ac31565c2193 ("xsk: Fix for xp_aligned_validate_desc() when len == chunk_size")
Signed-off-by: Magnus Karlsson
Signed-off-by: Daniel Borkmann
Reviewed-by: Xuan Zhuo
Acked-by: Björn Töpel
Link: https://lore.kernel.org/bpf/20210618075805.14412-1-magnus.karlsson@gmail.com
---
 net/xdp/xsk_queue.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 9d2a89d793c0..9ae13cccfb28 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -128,12 +128,15 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr)
 static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
 					    struct xdp_desc *desc)
 {
-	u64 chunk;
-
-	if (desc->len > pool->chunk_size)
-		return false;
+	u64 chunk, chunk_end;
 
 	chunk = xp_aligned_extract_addr(pool, desc->addr);
+	if (likely(desc->len)) {
+		chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len - 1);
+		if (chunk != chunk_end)
+			return false;
+	}
+
 	if (chunk >= pool->addrs_cnt)
 		return false;
 

From 61e8aeda9398925f8c6fc290585bdd9727d154c4 Mon Sep 17 00:00:00 2001
From: Tony Ambardar
Date: Thu, 17 Jun 2021 23:14:04 -0700
Subject: [PATCH 03/14] bpf: Fix libelf endian handling in resolv_btfids

The vmlinux ".BTF_ids" ELF section is declared in btf_ids.h to hold a
list of zero-filled BTF IDs, which is then patched at link-time with
correct values by resolve_btfids. The section is flagged as "allocable"
to preclude compression, but notably the section contents (BTF IDs) are
untyped.

When patching the BTF IDs, resolve_btfids writes in host-native
endianness and relies on libelf for any required translation on reading
and updating vmlinux. However, since the type of the .BTF_ids section
content defaults to ELF_T_BYTE (i.e. unsigned char), no translation
occurs. This results in incorrect patched values when cross-compiling
to non-native endianness, and can manifest as kernel Oopses and test
failures which are difficult to troubleshoot [1].

Explicitly set the type of the patched data to ELF_T_WORD, the
architecture-neutral ELF type corresponding to the u32 BTF IDs. This
enables libelf to transparently perform any needed endian conversions.

Fixes: fbbb68de80a4 ("bpf: Add resolve_btfids tool to resolve BTF IDs in ELF object")
Signed-off-by: Tony Ambardar
Signed-off-by: Daniel Borkmann
Acked-by: Jiri Olsa
Cc: Frank Eigler
Cc: Mark Wielaard
Cc: Jiri Olsa
Cc: Yonghong Song
Link: https://lore.kernel.org/bpf/CAPGftE_eY-Zdi3wBcgDfkz_iOr1KF10n=9mJHm1_a_PykcsoeA@mail.gmail.com [1]
Link: https://lore.kernel.org/bpf/20210618061404.818569-1-Tony.Ambardar@gmail.com
---
 tools/bpf/resolve_btfids/main.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c
index 7550fd9c3188..3ad9301b0f00 100644
--- a/tools/bpf/resolve_btfids/main.c
+++ b/tools/bpf/resolve_btfids/main.c
@@ -655,6 +655,9 @@ static int symbols_patch(struct object *obj)
 	if (sets_patch(obj))
 		return -1;
 
+	/* Set type to ensure endian translation occurs. */
+	obj->efile.idlist->d_type = ELF_T_WORD;
+
 	elf_flagdata(obj->efile.idlist, ELF_C_SET, ELF_F_DIRTY);
 
 	err = elf_update(obj->efile.elf, ELF_C_WRITE);

From 9f2470fbc4cb4583c080bb729a998933ba61aca4 Mon Sep 17 00:00:00 2001
From: Cong Wang
Date: Mon, 14 Jun 2021 19:13:35 -0700
Subject: [PATCH 04/14] skmsg: Improve udp_bpf_recvmsg() accuracy

I tried to reuse sk_msg_wait_data() for different protocols, but it
turns out that it cannot simply be reused. For example, UDP actually
uses two queues to receive skbs: udp_sk(sk)->reader_queue and
sk->sk_receive_queue. So we have to check both of them to know whether
we have received any packet.

Also, UDP does not lock the sock during the BH Rx path, so it makes no
sense for its ->recvmsg() to lock the sock. It is always possible for
->recvmsg() to be called before packets actually arrive in the receive
queue; we just use best effort to make it accurate here.

Fixes: 1f5be6b3b063 ("udp: Implement udp_bpf_recvmsg() for sockmap")
Signed-off-by: Cong Wang
Signed-off-by: Daniel Borkmann
Acked-by: John Fastabend
Acked-by: Jakub Sitnicki
Link: https://lore.kernel.org/bpf/20210615021342.7416-2-xiyou.wangcong@gmail.com
---
 include/linux/skmsg.h |  2 --
 net/core/skmsg.c      | 23 ---------------------
 net/ipv4/tcp_bpf.c    | 24 +++++++++++++++++++++-
 net/ipv4/udp_bpf.c    | 47 ++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 65 insertions(+), 31 deletions(-)

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index aba0f0f429be..e3d080c299f6 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -126,8 +126,6 @@ int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
 			      struct sk_msg *msg, u32 bytes);
 int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
 			     struct sk_msg *msg, u32 bytes);
-int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags,
-		     long timeo, int *err);
 int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
 		   int len, int flags);
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 43ce17a6a585..f9a81b314e4c 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -399,29 +399,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
 
-int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags,
-		     long timeo, int *err)
-{
-	DEFINE_WAIT_FUNC(wait, woken_wake_function);
-	int ret = 0;
-
-	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		return 1;
-
-	if (!timeo)
-		return ret;
-
-	add_wait_queue(sk_sleep(sk), &wait);
-	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
-	ret = sk_wait_event(sk, &timeo,
-			    !list_empty(&psock->ingress_msg) ||
-			    !skb_queue_empty(&sk->sk_receive_queue), &wait);
-	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
-	remove_wait_queue(sk_sleep(sk), &wait);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(sk_msg_wait_data);
-
 /* Receive sk_msg from psock->ingress_msg to @msg. */
 int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
 		   int len, int flags)
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index ad9d17923fc5..bb49b52d7be8 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -163,6 +163,28 @@ static bool tcp_bpf_stream_read(const struct sock *sk)
 	return !empty;
 }
 
+static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags,
+			     long timeo, int *err)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	int ret = 0;
+
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		return 1;
+
+	if (!timeo)
+		return ret;
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	ret = sk_wait_event(sk, &timeo,
+			    !list_empty(&psock->ingress_msg) ||
+			    !skb_queue_empty(&sk->sk_receive_queue), &wait);
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return ret;
+}
+
 static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			   int nonblock, int flags, int *addr_len)
 {
@@ -188,7 +210,7 @@ msg_bytes_ready:
 		long timeo;
 
 		timeo = sock_rcvtimeo(sk, nonblock);
-		data = sk_msg_wait_data(sk, psock, flags, timeo, &err);
+		data = tcp_msg_wait_data(sk, psock, flags, timeo, &err);
 		if (data) {
 			if (!sk_psock_queue_empty(psock))
 				goto msg_bytes_ready;
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
index 954c4591a6fd..565a70040c57 100644
--- a/net/ipv4/udp_bpf.c
+++ b/net/ipv4/udp_bpf.c
@@ -21,6 +21,45 @@ static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	return udp_prot.recvmsg(sk, msg, len, noblock, flags, addr_len);
 }
 
+static bool udp_sk_has_data(struct sock *sk)
+{
+	return !skb_queue_empty(&udp_sk(sk)->reader_queue) ||
+	       !skb_queue_empty(&sk->sk_receive_queue);
+}
+
+static bool psock_has_data(struct sk_psock *psock)
+{
+	return !skb_queue_empty(&psock->ingress_skb) ||
+	       !sk_psock_queue_empty(psock);
+}
+
+#define udp_msg_has_data(__sk, __psock)	\
+		({ udp_sk_has_data(__sk) || psock_has_data(__psock); })
+
+static int udp_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags,
+			     long timeo, int *err)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	int ret = 0;
+
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		return 1;
+
+	if (!timeo)
+		return ret;
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	ret = udp_msg_has_data(sk, psock);
+	if (!ret) {
+		wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+		ret = udp_msg_has_data(sk, psock);
+	}
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return ret;
+}
+
 static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			   int nonblock, int flags, int *addr_len)
 {
@@ -34,8 +73,7 @@ static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	if (unlikely(!psock))
 		return sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
 
-	lock_sock(sk);
-	if (sk_psock_queue_empty(psock)) {
+	if (!psock_has_data(psock)) {
 		ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
 		goto out;
 	}
@@ -47,9 +85,9 @@ msg_bytes_ready:
 		long timeo;
 
 		timeo = sock_rcvtimeo(sk, nonblock);
-		data = sk_msg_wait_data(sk, psock, flags, timeo, &err);
+		data = udp_msg_wait_data(sk, psock, flags, timeo, &err);
 		if (data) {
-			if (!sk_psock_queue_empty(psock))
+			if (psock_has_data(psock))
 				goto msg_bytes_ready;
 			ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
 			goto out;
@@ -62,7 +100,6 @@ msg_bytes_ready:
 	}
 	ret = copied;
 out:
-	release_sock(sk);
 	sk_psock_put(sk, psock);
 	return ret;
 }
From a7e65fe7d8201527129206754db1a2db6a6b2fde Mon Sep 17 00:00:00 2001
From: Cong Wang
Date: Mon, 14 Jun 2021 19:13:36 -0700
Subject: [PATCH 05/14] selftests/bpf: Retry for EAGAIN in udp_redir_to_connected()

We use non-blocking sockets for testing sockmap redirections, and have
been getting some random EAGAIN errors from the UDP tests.

There is no guarantee that a packet will be immediately available to
receive as soon as it is sent out, even on the local host. For UDP,
this is especially true because it does not lock the sock during BH
(unlike the TCP path). This is probably why we have only seen this
error in the UDP cases.

No matter how hard we try to make the queue-empty check accurate, it
is always possible for recvmsg() to beat ->sk_data_ready(). Therefore,
we should just retry in case of EAGAIN.

Fixes: d6378af615275 ("selftests/bpf: Add a test case for udp sockmap")
Reported-by: Jiang Wang
Signed-off-by: Cong Wang
Signed-off-by: Daniel Borkmann
Acked-by: John Fastabend
Acked-by: Jakub Sitnicki
Link: https://lore.kernel.org/bpf/20210615021342.7416-3-xiyou.wangcong@gmail.com
---
 tools/testing/selftests/bpf/prog_tests/sockmap_listen.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
index 648d9ae898d2..01ab11259809 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
@@ -1610,6 +1610,7 @@ static void udp_redir_to_connected(int family, int sotype, int sock_mapfd,
 	struct sockaddr_storage addr;
 	int c0, c1, p0, p1;
 	unsigned int pass;
+	int retries = 100;
 	socklen_t len;
 	int err, n;
 	u64 value;
@@ -1686,9 +1687,13 @@ static void udp_redir_to_connected(int family, int sotype, int sock_mapfd,
 	if (pass != 1)
 		FAIL("%s: want pass count 1, have %d", log_prefix, pass);
 
+again:
 	n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1);
-	if (n < 0)
+	if (n < 0) {
+		if (errno == EAGAIN && retries--)
+			goto again;
 		FAIL_ERRNO("%s: read", log_prefix);
+	}
 	if (n == 0)
 		FAIL("%s: incomplete read", log_prefix);

From e00a5c331bf57f41fcfdc5da4f5caeafe5e54c1d Mon Sep 17 00:00:00 2001
From: Cong Wang
Date: Mon, 14 Jun 2021 19:13:37 -0700
Subject: [PATCH 06/14] udp: Fix a memory leak in udp_read_sock()

sk_psock_verdict_recv() clones the skb and uses the clone afterward, so
udp_read_sock() should free the skb after using it, regardless of
whether an error occurred or not. This fixes a real kmemleak.
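[ Editor's note: an illustration of the ownership rule this fix
  enforces, not part of the patch. sk_psock_verdict_recv() keeps only
  a clone of the skb, so udp_read_sock() still owns its reference and
  must free it on every path: error, partial use, or full use. Below
  is a minimal userspace model; all names (recv_actor, read_sock) are
  hypothetical stand-ins, not the kernel API. ]

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct skb { int len; };

/* Stand-in for sk_psock_verdict_recv(): consumes a clone, never the
 * caller's reference, and returns the bytes used (<= 0 on error).
 */
static int recv_actor(const struct skb *skb)
{
	struct skb *clone = malloc(sizeof(*clone));

	if (!clone)
		return -1;
	memcpy(clone, skb, sizeof(*clone));
	/* ... in the kernel, the clone is queued and freed elsewhere ... */
	free(clone);
	return skb->len;
}

/* Stand-in for the udp_read_sock() loop after the fix: the caller
 * frees its own skb no matter what verdict the actor returns.
 */
static int read_sock(struct skb *skb)
{
	int copied = 0;
	int used = recv_actor(skb);

	if (used <= 0) {
		if (!copied)
			copied = used;
		free(skb);		/* the fix: free on error, too */
		return copied;
	}
	copied += used;
	free(skb);			/* and free after successful use */
	return copied;
}

int main(void)
{
	struct skb *skb = malloc(sizeof(*skb));

	skb->len = 64;
	printf("copied %d\n", read_sock(skb));	/* copied 64 */
	return 0;
}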
Fixes: d7f571188ecf ("udp: Implement ->read_sock() for sockmap")
Signed-off-by: Cong Wang
Signed-off-by: Daniel Borkmann
Acked-by: John Fastabend
Acked-by: Jakub Sitnicki
Link: https://lore.kernel.org/bpf/20210615021342.7416-4-xiyou.wangcong@gmail.com
---
 net/ipv4/udp.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1307ad0d3b9e..8091276cb85b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1798,11 +1798,13 @@ int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
 		if (used <= 0) {
 			if (!copied)
 				copied = used;
+			kfree_skb(skb);
 			break;
 		} else if (used <= skb->len) {
 			copied += used;
 		}
 
+		kfree_skb(skb);
 		if (!desc->count)
 			break;
 	}

From 30b9c54a707db4155735cf71f4600241c1b7b6ff Mon Sep 17 00:00:00 2001
From: Cong Wang
Date: Mon, 14 Jun 2021 19:13:38 -0700
Subject: [PATCH 07/14] skmsg: Clear skb redirect pointer before dropping it

When we drop an skb inside sk_psock_skb_redirect(), we have to clear
its skb->_sk_redir pointer too; otherwise kfree_skb() would
misinterpret it as a valid skb->_skb_refdst and dst_release() would
eventually complain.

Fixes: e3526bb92a20 ("skmsg: Move sk_redir from TCP_SKB_CB to skb")
Reported-by: Jiang Wang
Signed-off-by: Cong Wang
Signed-off-by: Daniel Borkmann
Acked-by: John Fastabend
Acked-by: Jakub Sitnicki
Link: https://lore.kernel.org/bpf/20210615021342.7416-5-xiyou.wangcong@gmail.com
---
 net/core/skmsg.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index f9a81b314e4c..4334720e2a04 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -843,12 +843,14 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
 	 * a socket that is in this state so we drop the skb.
 	 */
 	if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) {
+		skb_bpf_redirect_clear(skb);
 		kfree_skb(skb);
 		return;
 	}
 	spin_lock_bh(&psock_other->ingress_lock);
 	if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
 		spin_unlock_bh(&psock_other->ingress_lock);
+		skb_bpf_redirect_clear(skb);
 		kfree_skb(skb);
 		return;
 	}

From 0cf6672b23c8aa9d9274798dd63cbf6ede77ef90 Mon Sep 17 00:00:00 2001
From: Cong Wang
Date: Mon, 14 Jun 2021 19:13:39 -0700
Subject: [PATCH 08/14] skmsg: Fix a memory leak in sk_psock_verdict_apply()

If the dest psock does not set SK_PSOCK_TX_ENABLED, the skb can't be
queued anywhere, so it must be dropped. This one was found during code
review.

Fixes: 799aa7f98d53 ("skmsg: Avoid lock_sock() in sk_psock_backlog()")
Signed-off-by: Cong Wang
Signed-off-by: Daniel Borkmann
Acked-by: John Fastabend
Acked-by: Jakub Sitnicki
Link: https://lore.kernel.org/bpf/20210615021342.7416-6-xiyou.wangcong@gmail.com
---
 net/core/skmsg.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 4334720e2a04..5464477e2d3d 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -924,8 +924,13 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
 		if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
 			skb_queue_tail(&psock->ingress_skb, skb);
 			schedule_work(&psock->work);
+			err = 0;
 		}
 		spin_unlock_bh(&psock->ingress_lock);
+		if (err < 0) {
+			skb_bpf_redirect_clear(skb);
+			goto out_free;
+		}
 	}
 	break;
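[ Editor's note: for illustration only, a compact userspace model of
  the leak the patch above closes. If the TX_ENABLED branch is not
  taken, the skb is neither queued nor freed unless the caller checks
  err; all names here are hypothetical stand-ins, not kernel code, and
  only the failure path is exercised since the queue is fictional. ]

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct skb { int len; };

/* Model of the __SK_PASS ingress branch: queueing only happens when
 * the psock is enabled, and err records whether it actually did.
 */
static int verdict_apply(struct skb *skb, bool tx_enabled)
{
	int err = -EIO;

	if (tx_enabled) {
		/* skb_queue_tail(&psock->ingress_skb, skb); */
		err = 0;
	}
	if (err < 0)
		free(skb);	/* the fix: drop what we could not queue */
	return err;
}

int main(void)
{
	struct skb *skb = malloc(sizeof(*skb));

	skb->len = 1;
	printf("err %d\n", verdict_apply(skb, false));	/* err -5, no leak */
	return 0;
}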
From 1581a6c1c3291a8320b080f4411345f60229976d Mon Sep 17 00:00:00 2001
From: Cong Wang
Date: Mon, 14 Jun 2021 19:13:40 -0700
Subject: [PATCH 09/14] skmsg: Teach sk_psock_verdict_apply() to return errors

Currently sk_psock_verdict_apply() is void, but it handles some error
conditions too. Its callers have no way of learning whether it succeeds
or fails, especially sk_psock_verdict_recv(). Make it return int to
indicate the error cases and propagate errors to callers properly.

Fixes: ef5659280eb1 ("bpf, sockmap: Allow skipping sk_skb parser program")
Signed-off-by: Cong Wang
Signed-off-by: Daniel Borkmann
Acked-by: John Fastabend
Acked-by: Jakub Sitnicki
Link: https://lore.kernel.org/bpf/20210615021342.7416-7-xiyou.wangcong@gmail.com
---
 net/core/skmsg.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 5464477e2d3d..e3d210811db4 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -824,7 +824,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
 
-static void sk_psock_skb_redirect(struct sk_buff *skb)
+static int sk_psock_skb_redirect(struct sk_buff *skb)
 {
 	struct sk_psock *psock_other;
 	struct sock *sk_other;
@@ -835,7 +835,7 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
 	 */
 	if (unlikely(!sk_other)) {
 		kfree_skb(skb);
-		return;
+		return -EIO;
 	}
 	psock_other = sk_psock(sk_other);
 	/* This error indicates the socket is being torn down or had another
@@ -845,19 +845,20 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
 	if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) {
 		skb_bpf_redirect_clear(skb);
 		kfree_skb(skb);
-		return;
+		return -EIO;
 	}
 	spin_lock_bh(&psock_other->ingress_lock);
 	if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
 		spin_unlock_bh(&psock_other->ingress_lock);
 		skb_bpf_redirect_clear(skb);
 		kfree_skb(skb);
-		return;
+		return -EIO;
 	}
 
 	skb_queue_tail(&psock_other->ingress_skb, skb);
 	schedule_work(&psock_other->work);
 	spin_unlock_bh(&psock_other->ingress_lock);
+	return 0;
 }
 
 static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict)
@@ -894,14 +895,15 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
 
-static void sk_psock_verdict_apply(struct sk_psock *psock,
-				   struct sk_buff *skb, int verdict)
+static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
+				  int verdict)
 {
 	struct sock *sk_other;
-	int err = -EIO;
+	int err = 0;
 
 	switch (verdict) {
 	case __SK_PASS:
+		err = -EIO;
 		sk_other = psock->sk;
 		if (sock_flag(sk_other, SOCK_DEAD) ||
 		    !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
@@ -934,13 +936,15 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
 		}
 		break;
 	case __SK_REDIRECT:
-		sk_psock_skb_redirect(skb);
+		err = sk_psock_skb_redirect(skb);
 		break;
 	case __SK_DROP:
 	default:
 out_free:
 		kfree_skb(skb);
 	}
+
+	return err;
 }
 
 static void sk_psock_write_space(struct sock *sk)
@@ -1107,7 +1111,8 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
 		ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
 		skb->sk = NULL;
 	}
-	sk_psock_verdict_apply(psock, skb, ret);
+	if (sk_psock_verdict_apply(psock, skb, ret) < 0)
+		len = 0;
 out:
 	rcu_read_unlock();
 	return len;

From 42830571f1fd9751b3fbf38084bbb253320e185f Mon Sep 17 00:00:00 2001
From: Cong Wang
Date: Mon, 14 Jun 2021 19:13:41 -0700
Subject: [PATCH 10/14] skmsg: Pass source psock to sk_psock_skb_redirect()

sk_psock_skb_redirect() only takes the skb as a parameter, but we will
need to know where this skb is from, so just pass the source psock to
this function as a new parameter. This patch prepares for the next one.
Signed-off-by: Cong Wang
Signed-off-by: Daniel Borkmann
Acked-by: John Fastabend
Acked-by: Jakub Sitnicki
Link: https://lore.kernel.org/bpf/20210615021342.7416-8-xiyou.wangcong@gmail.com
---
 net/core/skmsg.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index e3d210811db4..3aa9065811ad 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -824,7 +824,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
 
-static int sk_psock_skb_redirect(struct sk_buff *skb)
+static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb)
 {
 	struct sk_psock *psock_other;
 	struct sock *sk_other;
@@ -861,11 +861,12 @@ static int sk_psock_skb_redirect(struct sk_buff *skb)
 	return 0;
 }
 
-static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict)
+static void sk_psock_tls_verdict_apply(struct sk_buff *skb,
+				       struct sk_psock *from, int verdict)
 {
 	switch (verdict) {
 	case __SK_REDIRECT:
-		sk_psock_skb_redirect(skb);
+		sk_psock_skb_redirect(from, skb);
 		break;
 	case __SK_PASS:
 	case __SK_DROP:
@@ -889,7 +890,7 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
 		ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
 		skb->sk = NULL;
 	}
-	sk_psock_tls_verdict_apply(skb, psock->sk, ret);
+	sk_psock_tls_verdict_apply(skb, psock, ret);
 	rcu_read_unlock();
 	return ret;
 }
@@ -936,7 +937,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
 		}
 		break;
 	case __SK_REDIRECT:
-		err = sk_psock_skb_redirect(skb);
+		err = sk_psock_skb_redirect(psock, skb);
 		break;
 	case __SK_DROP:
 	default:

From 781dd0431eb549f9cb1fdddf91a50d985febe884 Mon Sep 17 00:00:00 2001
From: Cong Wang
Date: Mon, 14 Jun 2021 19:13:42 -0700
Subject: [PATCH 11/14] skmsg: Increase sk->sk_drops when dropping packets

It is hard to observe packet drops without increasing the relevant drop
counters; here we should increase sk->sk_drops, which is a
protocol-independent counter. Fortunately, a psock is always associated
with a struct sock, so we can just use psock->sk.

Suggested-by: John Fastabend
Signed-off-by: Cong Wang
Signed-off-by: Daniel Borkmann
Acked-by: John Fastabend
Acked-by: Jakub Sitnicki
Link: https://lore.kernel.org/bpf/20210615021342.7416-9-xiyou.wangcong@gmail.com
---
 net/core/skmsg.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 3aa9065811ad..9b6160a191f8 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -578,6 +578,12 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
 	return sk_psock_skb_ingress(psock, skb);
 }
 
+static void sock_drop(struct sock *sk, struct sk_buff *skb)
+{
+	sk_drops_add(sk, skb);
+	kfree_skb(skb);
+}
+
 static void sk_psock_backlog(struct work_struct *work)
 {
 	struct sk_psock *psock = container_of(work, struct sk_psock, work);
@@ -617,7 +623,7 @@ start:
 			/* Hard errors break pipe and stop xmit. */
 			sk_psock_report_error(psock, ret ? -ret : EPIPE);
 			sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
-			kfree_skb(skb);
+			sock_drop(psock->sk, skb);
 			goto end;
 		}
 		off += ret;
@@ -708,7 +714,7 @@ static void __sk_psock_zap_ingress(struct sk_psock *psock)
 
 	while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) {
 		skb_bpf_redirect_clear(skb);
-		kfree_skb(skb);
+		sock_drop(psock->sk, skb);
 	}
 	__sk_psock_purge_ingress_msg(psock);
 }
@@ -834,7 +840,7 @@ static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb)
 	 * return code, but then didn't set a redirect interface.
 	 */
 	if (unlikely(!sk_other)) {
-		kfree_skb(skb);
+		sock_drop(from->sk, skb);
 		return -EIO;
 	}
 	psock_other = sk_psock(sk_other);
@@ -844,14 +850,14 @@ static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb)
 	/* This error indicates the socket is being torn down or had another
 	 * error that caused the pipe to break. We can't send a packet on
 	 * a socket that is in this state so we drop the skb.
 	 */
 	if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) {
 		skb_bpf_redirect_clear(skb);
-		kfree_skb(skb);
+		sock_drop(from->sk, skb);
 		return -EIO;
 	}
 	spin_lock_bh(&psock_other->ingress_lock);
 	if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
 		spin_unlock_bh(&psock_other->ingress_lock);
 		skb_bpf_redirect_clear(skb);
-		kfree_skb(skb);
+		sock_drop(from->sk, skb);
 		return -EIO;
 	}
@@ -942,7 +948,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
 	case __SK_DROP:
 	default:
 out_free:
-		kfree_skb(skb);
+		sock_drop(psock->sk, skb);
 	}
 
 	return err;
@@ -977,7 +983,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
 	sk = strp->sk;
 	psock = sk_psock(sk);
 	if (unlikely(!psock)) {
-		kfree_skb(skb);
+		sock_drop(sk, skb);
 		goto out;
 	}
 	prog = READ_ONCE(psock->progs.stream_verdict);
@@ -1098,7 +1104,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
 	psock = sk_psock(sk);
 	if (unlikely(!psock)) {
 		len = 0;
-		kfree_skb(skb);
+		sock_drop(sk, skb);
 		goto out;
 	}
 	prog = READ_ONCE(psock->progs.stream_verdict);

From 5dec6d96d12d33900ec315972c8e47a73bcc378d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?=
Date: Fri, 18 Jun 2021 03:55:26 -0700
Subject: [PATCH 12/14] bpf: Fix regression on BPF_OBJ_GET with non-O_RDWR flags
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit d37300ed1821 ("bpf: program: Refuse non-O_RDWR flags
in BPF_OBJ_GET"). It breaks Android userspace, which expects to be able
to fetch programs with just read permissions. See:
https://cs.android.com/android/platform/superproject/+/master:frameworks/libs/net/common/native/bpf_syscall_wrappers/include/BpfSyscallWrappers.h;drc=7005c764be23d31fa1d69e826b4a2f6689a8c81e;l=124

Side-note: another option to fix it would be to extend bpf_prog_new_fd()
and to pass in the used file mode flags in the same way as we do for
maps via bpf_map_new_fd(). Meaning, they'd end up in anon_inode_getfd()
and thus would be retained for prog fd operations with the bpf()
syscall. Right now these flags are not checked with progs since they are
immutable for their lifetime (as opposed to maps, which can be updated
from user space). In the future this could potentially change with new
features, but at that point it's still fine to do the bpf_prog_new_fd()
extension when needed. For a simple stable fix, a revert is less churn.

Fixes: d37300ed1821 ("bpf: program: Refuse non-O_RDWR flags in BPF_OBJ_GET")
Signed-off-by: Maciej Żenczykowski
[ Daniel: added side-note to commit message ]
Signed-off-by: Daniel Borkmann
Acked-by: Lorenz Bauer
Acked-by: Greg Kroah-Hartman
Link: https://lore.kernel.org/bpf/20210618105526.265003-1-zenczykowski@gmail.com
---
 kernel/bpf/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index b4ebd60a6c16..80da1db47c68 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -543,7 +543,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
 		return PTR_ERR(raw);
 
 	if (type == BPF_TYPE_PROG)
-		ret = (f_flags != O_RDWR) ? -EINVAL : bpf_prog_new_fd(raw);
+		ret = bpf_prog_new_fd(raw);
 	else if (type == BPF_TYPE_MAP)
 		ret = bpf_map_new_fd(raw, f_flags);
 	else if (type == BPF_TYPE_LINK)

From 7dd5d437c258bbf4cc15b35229e5208b87b8b4e0 Mon Sep 17 00:00:00 2001
From: Bui Quang Minh
Date: Sun, 13 Jun 2021 21:34:39 +0700
Subject: [PATCH 13/14] bpf: Fix integer overflow in argument calculation for
 bpf_map_area_alloc

On 32-bit architectures, the result of sizeof() is a 32-bit integer, so
the expression becomes a multiplication between two 32-bit integers,
which can potentially overflow. As a result, bpf_map_area_alloc()
allocates less memory than needed. Fix this by casting one operand to
u64.

Fixes: 0d2c4f964050 ("bpf: Eliminate rlimit-based memory accounting for sockmap and sockhash maps")
Fixes: 99c51064fb06 ("devmap: Use bpf_map_area_alloc() for allocating hash buckets")
Fixes: 546ac1ffb70d ("bpf: add devmap, a map for storing net device references")
Signed-off-by: Bui Quang Minh
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/20210613143440.71975-1-minhquangbui99@gmail.com
---
 kernel/bpf/devmap.c | 4 ++--
 net/core/sock_map.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index aa516472ce46..3b45c23286c0 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -92,7 +92,7 @@ static struct hlist_head *dev_map_create_hash(unsigned int entries,
 	int i;
 	struct hlist_head *hash;
 
-	hash = bpf_map_area_alloc(entries * sizeof(*hash), numa_node);
+	hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node);
 	if (hash != NULL)
 		for (i = 0; i < entries; i++)
 			INIT_HLIST_HEAD(&hash[i]);
@@ -143,7 +143,7 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
 
 		spin_lock_init(&dtab->index_lock);
 	} else {
-		dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
+		dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries *
 						      sizeof(struct bpf_dtab_netdev *),
 						      dtab->map.numa_node);
 		if (!dtab->netdev_map)
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 6f1b82b8ad49..60decd6420ca 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -48,7 +48,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 	bpf_map_init_from_attr(&stab->map, attr);
 	raw_spin_lock_init(&stab->lock);
 
-	stab->sks = bpf_map_area_alloc(stab->map.max_entries *
+	stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries *
 				       sizeof(struct sock *),
 				       stab->map.numa_node);
 	if (!stab->sks) {

From 7506d211b932870155bcb39e3dd9e39fab45a7c7 Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Wed, 16 Jun 2021 15:55:00 -0700
Subject: [PATCH 14/14] bpf: Fix null ptr deref with mixed tail calls and
 subprogs

The sub-program's prog->aux->poke_tab[] is populated in jit_subprogs()
and then used when emitting 'BPF_JMP|BPF_TAIL_CALL' insn->code from the
individual JITs. The poke_tab[] entry to use is stored in insn->imm by
the code adding it to that array slot. The JIT then uses imm to find the
right entry for an individual instruction. In the x86 bpf_jit_comp.c
this is done by calling emit_bpf_tail_call_direct() with the poke_tab[]
entry at the imm value.

However, we observed the below null-ptr-deref when mixing tail call
programs with subprog programs. For this to happen we just need to mix
bpf-to-bpf calls and tailcalls with some extra calls or instructions
that would be patched later by one of the fixup routines. So what's
happening?
Before fixup_call_args() -- where the JIT op is done -- various code
patching is done by do_misc_fixups(). This may increase the insn count,
for example when we patch map_lookup_elem using the map_gen_lookup hook.
This does two things. First, it means the instruction index, the
insn_idx field, of a tail call instruction will move by a 'delta'. In
the verifier code,

  struct bpf_jit_poke_descriptor desc = {
   .reason = BPF_POKE_REASON_TAIL_CALL,
   .tail_call.map = BPF_MAP_PTR(aux->map_ptr_state),
   .tail_call.key = bpf_map_key_immediate(aux),
   .insn_idx = i + delta,
  };

Then the subprog start values subprog_info[i].start will be updated
with the delta, and any poke descriptor index will also be updated with
the delta in adjust_poke_descs(). If we look at the adjusting of
subprog starts, though, we see a start is only adjusted when the delta
occurs before its instructions,

 /* NOTE: fake 'exit' subprog should be updated as well. */
 for (i = 0; i <= env->subprog_cnt; i++) {
  if (env->subprog_info[i].start <= off)
   continue;

Earlier subprograms are not changed because their start values are not
moved. But adjust_poke_descs() applied the offset + delta
indiscriminately. The result is that poke descriptors are potentially
corrupted.

Then, in jit_subprogs(), we only populate poke_tab[] when the above
insn_idx is less than the next subprogram start. Because we corrupted
the insn_idx, we might incorrectly assume a poke descriptor is not used
in a subprogram and omit it from that subprogram. And finally, when the
JIT runs, it dereferences poke_tab while emitting the instruction and
crashes as shown below, because the earlier step omitted the poke
descriptor.

The fix is straightforward with the above context: simply move the same
logic from adjust_subprog_starts() into adjust_poke_descs() and only
adjust insn_idx when needed.

[ 82.396354] bpf_testmod: version magic '5.12.0-rc2alu+ SMP preempt mod_unload ' should be '5.12.0+ SMP preempt mod_unload '
[ 82.623001] loop10: detected capacity change from 0 to 8
[ 88.487424] ==================================================================
[ 88.487438] BUG: KASAN: null-ptr-deref in do_jit+0x184a/0x3290
[ 88.487455] Write of size 8 at addr 0000000000000008 by task test_progs/5295
[ 88.487471] CPU: 7 PID: 5295 Comm: test_progs Tainted: G I 5.12.0+ #386
[ 88.487483] Hardware name: Dell Inc. Precision 5820 Tower/002KVM, BIOS 1.9.2 01/24/2019
[ 88.487490] Call Trace:
[ 88.487498]  dump_stack+0x93/0xc2
[ 88.487515]  kasan_report.cold+0x5f/0xd8
[ 88.487530]  ? do_jit+0x184a/0x3290
[ 88.487542]  do_jit+0x184a/0x3290
...
[ 88.487709]  bpf_int_jit_compile+0x248/0x810
...
[ 88.487765]  bpf_check+0x3718/0x5140
...
[ 88.487920]  bpf_prog_load+0xa22/0xf10

Fixes: a748c6975dea3 ("bpf: propagate poke descriptors to subprograms")
Reported-by: Jussi Maki
Signed-off-by: John Fastabend
Signed-off-by: Alexei Starovoitov
Reviewed-by: Daniel Borkmann
---
 kernel/bpf/verifier.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c6a27574242d..6e2ebcb0d66f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -11459,7 +11459,7 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len
 	}
 }
 
-static void adjust_poke_descs(struct bpf_prog *prog, u32 len)
+static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len)
 {
 	struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
 	int i, sz = prog->aux->size_poke_tab;
@@ -11467,6 +11467,8 @@ static void adjust_poke_descs(struct bpf_prog *prog, u32 len)
 
 	for (i = 0; i < sz; i++) {
 		desc = &tab[i];
+		if (desc->insn_idx <= off)
+			continue;
 		desc->insn_idx += len - 1;
 	}
 }
@@ -11487,7 +11489,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
 	if (adjust_insn_aux_data(env, new_prog, off, len))
 		return NULL;
 	adjust_subprog_starts(env, off, len);
-	adjust_poke_descs(new_prog, len);
+	adjust_poke_descs(new_prog, off, len);
 
 	return new_prog;
 }
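[ Editor's note: for illustration only, a userspace model of the
  adjustment logic after the fix above, using hypothetical values:
  patching at off = 5 with a replacement of len = 3 (delta = 2) must
  move only descriptors that sit after the patch site, mirroring how
  adjust_subprog_starts() already treats subprog start values. ]

#include <stdio.h>

/* Userspace model of adjust_poke_descs() after the fix: a descriptor's
 * insn_idx moves by (len - 1) only if it lies after the patch site at
 * 'off'; earlier descriptors, like earlier subprog starts, stay put.
 */
struct poke_desc { unsigned int insn_idx; };

static void adjust_poke_descs(struct poke_desc *tab, int sz,
			      unsigned int off, unsigned int len)
{
	for (int i = 0; i < sz; i++) {
		if (tab[i].insn_idx <= off)
			continue;	/* before the patch: unchanged */
		tab[i].insn_idx += len - 1;
	}
}

int main(void)
{
	/* Tail calls at insn 3 (before the patch site) and insn 9. */
	struct poke_desc tab[] = { { 3 }, { 9 } };

	adjust_poke_descs(tab, 2, 5, 3);	/* patch at 5, delta = 2 */
	printf("%u %u\n", tab[0].insn_idx, tab[1].insn_idx);	/* 3 11 */
	return 0;
}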