From 27e23836ce22a3e5d89712ef832ab72e47ce9f43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20M=C3=BCller?= Date: Wed, 10 Aug 2022 20:07:10 +0000 Subject: [PATCH 01/56] selftests/bpf: Add lru_bug to s390x deny list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The lru_bug BPF selftest is failing execution on s390x machines. The failure is due to program attachment failing in turn, similar to a bunch of other tests. Those other tests have already been deny-listed and with this change we do the same for the lru_bug test, adding it to the corresponding file. Fixes: de7b9927105b ("selftests/bpf: Add test for prealloc_lru_pop bug") Signed-off-by: Daniel Müller Acked-by: Mykola Lysenko Link: https://lore.kernel.org/r/20220810200710.1300299-1-deso@posteo.net Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/DENYLIST.s390x | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index e33cab34d22f..db9810611788 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -65,3 +65,4 @@ send_signal # intermittently fails to receive signa select_reuseport # intermittently fails on new s390x setup xdp_synproxy # JIT does not support calling kernel function (kfunc) unpriv_bpf_disabled # fentry +lru_bug # prog 'printk': failed to auto-attach: -524 From 58ca14ed98c87cfe0d1408cc65a9745d9e9b7a56 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 12 Aug 2022 13:32:59 +0200 Subject: [PATCH 02/56] xsk: Fix corrupted packets for XDP_SHARED_UMEM Fix an issue in XDP_SHARED_UMEM mode together with aligned mode where packets are corrupted for the second and any further sockets bound to the same umem. In other words, this does not affect the first socket bound to the umem. The culprit for this bug is that the initialization of the DMA addresses for the pre-populated xsk buffer pool entries was not performed for any socket but the first one bound to the umem. Only the linear array of DMA addresses was populated. Fix this by populating the DMA addresses in the xsk buffer pool for every socket bound to the same umem. Fixes: 94033cd8e73b8 ("xsk: Optimize for aligned case") Reported-by: Alasdair McWilliam Reported-by: Intrusion Shield Team Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Tested-by: Alasdair McWilliam Acked-by: Maciej Fijalkowski Link: https://lore.kernel.org/xdp-newbies/6205E10C-292E-4995-9D10-409649354226@outlook.com/ Link: https://lore.kernel.org/bpf/20220812113259.531-1-magnus.karlsson@gmail.com --- net/xdp/xsk_buff_pool.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index f70112176b7c..a71a8c6edf55 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -379,6 +379,16 @@ static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map) static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_map) { + if (!pool->unaligned) { + u32 i; + + for (i = 0; i < pool->heads_cnt; i++) { + struct xdp_buff_xsk *xskb = &pool->heads[i]; + + xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, xskb->orig_addr); + } + } + pool->dma_pages = kvcalloc(dma_map->dma_pages_cnt, sizeof(*pool->dma_pages), GFP_KERNEL); if (!pool->dma_pages) return -ENOMEM; @@ -428,12 +438,6 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, if (pool->unaligned) xp_check_dma_contiguity(dma_map); - else - for (i = 0; i < pool->heads_cnt; i++) { - struct xdp_buff_xsk *xskb = &pool->heads[i]; - - xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, xskb->orig_addr); - } err = xp_init_dma_info(pool, dma_map); if (err) { From 583585e48d965338e73e1eb383768d16e0922d73 Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Tue, 9 Aug 2022 17:49:15 +0800 Subject: [PATCH 03/56] skmsg: Fix wrong last sg check in sk_msg_recvmsg() Fix one kernel NULL pointer dereference as below: [ 224.462334] Call Trace: [ 224.462394] __tcp_bpf_recvmsg+0xd3/0x380 [ 224.462441] ? sock_has_perm+0x78/0xa0 [ 224.462463] tcp_bpf_recvmsg+0x12e/0x220 [ 224.462494] inet_recvmsg+0x5b/0xd0 [ 224.462534] __sys_recvfrom+0xc8/0x130 [ 224.462574] ? syscall_trace_enter+0x1df/0x2e0 [ 224.462606] ? __do_page_fault+0x2de/0x500 [ 224.462635] __x64_sys_recvfrom+0x24/0x30 [ 224.462660] do_syscall_64+0x5d/0x1d0 [ 224.462709] entry_SYSCALL_64_after_hwframe+0x65/0xca In commit 9974d37ea75f ("skmsg: Fix invalid last sg check in sk_msg_recvmsg()"), we change last sg check to sg_is_last(), but in sockmap redirection case (without stream_parser/stream_verdict/ skb_verdict), we did not mark the end of the scatterlist. Check the sk_msg_alloc, sk_msg_page_add, and bpf_msg_push_data functions, they all do not mark the end of sg. They are expected to use sg.end for end judgment. So the judgment of '(i != msg_rx->sg.end)' is added back here. Fixes: 9974d37ea75f ("skmsg: Fix invalid last sg check in sk_msg_recvmsg()") Signed-off-by: Liu Jian Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20220809094915.150391-1-liujian56@huawei.com --- net/core/skmsg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index f47338d89d5d..5be4485ff07d 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -461,7 +461,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, if (copied == len) break; - } while (!sg_is_last(sge)); + } while ((i != msg_rx->sg.end) && !sg_is_last(sge)); if (unlikely(peek)) { msg_rx = sk_psock_next_msg(psock, msg_rx); @@ -471,7 +471,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, } msg_rx->sg.start = i; - if (!sge->length && sg_is_last(sge)) { + if (!sge->length && (i == msg_rx->sg.end || sg_is_last(sge))) { msg_rx = sk_psock_dequeue_msg(psock); kfree_sk_msg(msg_rx); } From 3024d95a4c521c278a7504ee9e80c57c3a9750e0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 17 Aug 2022 23:32:09 +0200 Subject: [PATCH 04/56] bpf: Partially revert flexible-array member replacement Partially revert 94dfc73e7cf4 ("treewide: uapi: Replace zero-length arrays with flexible-array members") given it breaks BPF UAPI. For example, BPF CI run reveals build breakage under LLVM: [...] CLNG-BPF [test_maps] map_ptr_kern.o CLNG-BPF [test_maps] btf__core_reloc_arrays___diff_arr_val_sz.o CLNG-BPF [test_maps] test_bpf_cookie.o progs/map_ptr_kern.c:314:26: error: field 'trie_key' with variable sized type 'struct bpf_lpm_trie_key' not at the end of a struct or class is a GNU extension [-Werror,-Wgnu-variable-sized-type-not-at-end] struct bpf_lpm_trie_key trie_key; ^ CLNG-BPF [test_maps] btf__core_reloc_type_based___diff.o 1 error generated. make: *** [Makefile:521: /tmp/runner/work/bpf/bpf/tools/testing/selftests/bpf/map_ptr_kern.o] Error 1 make: *** Waiting for unfinished jobs.... [...] Typical usage of the bpf_lpm_trie_key is that the struct gets embedded into a user defined key for the LPM BPF map, from the selftest example: struct bpf_lpm_trie_key { <-- UAPI exported struct __u32 prefixlen; __u8 data[]; }; struct lpm_key { <-- BPF program defined struct struct bpf_lpm_trie_key trie_key; __u32 data; }; Undo this for BPF until a different solution can be found. It's the only flexible- array member case in the UAPI header. This was discovered in BPF CI after Dave reported that the include/uapi/linux/bpf.h header was out of sync with tools/include/uapi/linux/bpf.h after 94dfc73e7cf4. And the subsequent sync attempt failed CI. Fixes: 94dfc73e7cf4 ("treewide: uapi: Replace zero-length arrays with flexible-array members") Reported-by: Dave Marchevsky Signed-off-by: Daniel Borkmann Cc: Gustavo A. R. Silva Link: https://lore.kernel.org/bpf/22aebc88-da67-f086-e620-dd4a16e2bc69@iogearbox.net --- include/uapi/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7bf9ba1329be..59a217ca2dfd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -79,7 +79,7 @@ struct bpf_insn { /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ struct bpf_lpm_trie_key { __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ - __u8 data[]; /* Arbitrary size */ + __u8 data[0]; /* Arbitrary size */ }; struct bpf_cgroup_storage_key { From 14b20b784f59bdd95f6f1cfb112c9818bcec4d84 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Tue, 16 Aug 2022 20:55:16 +0000 Subject: [PATCH 05/56] bpf: Restrict bpf_sys_bpf to CAP_PERFMON The verifier cannot perform sufficient validation of any pointers passed into bpf_attr and treats them as integers rather than pointers. The helper will then read from arbitrary pointers passed into it. Restrict the helper to CAP_PERFMON since the security model in BPF of arbitrary kernel read is CAP_BPF + CAP_PERFMON. Fixes: af2ac3e13e45 ("bpf: Prepare bpf syscall to be used from kernel and user space.") Signed-off-by: YiFei Zhu Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220816205517.682470-1-zhuyifei@google.com --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a4d40d98428a..27760627370d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5197,7 +5197,7 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_sys_bpf: - return &bpf_sys_bpf_proto; + return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto; case BPF_FUNC_btf_find_by_name_kind: return &bpf_btf_find_by_name_kind_proto; case BPF_FUNC_sys_close: From 7ec9fce4b31604f8415136a4c07f7dc8ad431aec Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Thu, 18 Aug 2022 10:41:18 +0300 Subject: [PATCH 06/56] ip_tunnel: Respect tunnel key's "flow_flags" in IP tunnels Commit 451ef36bd229 ("ip_tunnels: Add new flow flags field to ip_tunnel_key") added a "flow_flags" member to struct ip_tunnel_key which was later used by the commit in the fixes tag to avoid dropping packets with sources that aren't locally configured when set in bpf_set_tunnel_key(). VXLAN and GENEVE were made to respect this flag, ip tunnels like IPIP and GRE were not. This commit fixes this omission by making ip_tunnel_init_flow() receive the flow flags from the tunnel key in the relevant collect_md paths. Fixes: b8fff748521c ("bpf: Set flow flag to allow any source IP in bpf_tunnel_key") Signed-off-by: Eyal Birger Signed-off-by: Daniel Borkmann Reviewed-by: Paul Chaignon Link: https://lore.kernel.org/bpf/20220818074118.726639-1-eyal.birger@gmail.com --- drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c | 3 ++- include/net/ip_tunnels.h | 4 +++- net/ipv4/ip_gre.c | 2 +- net/ipv4/ip_tunnel.c | 7 ++++--- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c index 39904dacf4f0..b3472fb94617 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -423,7 +423,8 @@ mlxsw_sp_span_gretap4_route(const struct net_device *to_dev, parms = mlxsw_sp_ipip_netdev_parms4(to_dev); ip_tunnel_init_flow(&fl4, parms.iph.protocol, *daddrp, *saddrp, - 0, 0, dev_net(to_dev), parms.link, tun->fwmark, 0); + 0, 0, dev_net(to_dev), parms.link, tun->fwmark, 0, + 0); rt = ip_route_output_key(tun->net, &fl4); if (IS_ERR(rt)) diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 63fac94f9ace..ced80e2f8b58 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -246,7 +246,8 @@ static inline void ip_tunnel_init_flow(struct flowi4 *fl4, __be32 daddr, __be32 saddr, __be32 key, __u8 tos, struct net *net, int oif, - __u32 mark, __u32 tun_inner_hash) + __u32 mark, __u32 tun_inner_hash, + __u8 flow_flags) { memset(fl4, 0, sizeof(*fl4)); @@ -263,6 +264,7 @@ static inline void ip_tunnel_init_flow(struct flowi4 *fl4, fl4->fl4_gre_key = key; fl4->flowi4_mark = mark; fl4->flowi4_multipath_hash = tun_inner_hash; + fl4->flowi4_flags = flow_flags; } int ip_tunnel_init(struct net_device *dev); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 5c58e21f724e..f866d6282b2b 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -609,7 +609,7 @@ static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src, tunnel_id_to_key32(key->tun_id), key->tos & ~INET_ECN_MASK, dev_net(dev), 0, - skb->mark, skb_get_hash(skb)); + skb->mark, skb_get_hash(skb), key->flow_flags); rt = ip_route_output_key(dev_net(dev), &fl4); if (IS_ERR(rt)) return PTR_ERR(rt); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index e65e948cab9f..019f3b0839c5 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -295,7 +295,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr, iph->saddr, tunnel->parms.o_key, RT_TOS(iph->tos), dev_net(dev), - tunnel->parms.link, tunnel->fwmark, 0); + tunnel->parms.link, tunnel->fwmark, 0, 0); rt = ip_route_output_key(tunnel->net, &fl4); if (!IS_ERR(rt)) { @@ -570,7 +570,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, tunnel_id_to_key32(key->tun_id), RT_TOS(tos), - dev_net(dev), 0, skb->mark, skb_get_hash(skb)); + dev_net(dev), 0, skb->mark, skb_get_hash(skb), + key->flow_flags); if (tunnel->encap.type != TUNNEL_ENCAP_NONE) goto tx_error; @@ -729,7 +730,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, tunnel->parms.o_key, RT_TOS(tos), dev_net(dev), tunnel->parms.link, - tunnel->fwmark, skb_get_hash(skb)); + tunnel->fwmark, skb_get_hash(skb), 0); if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) goto tx_error; From 7d6620f107bae6ed687ff07668e8e8f855487aa9 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Sat, 13 Aug 2022 21:40:30 +0800 Subject: [PATCH 07/56] bpf, cgroup: Fix kernel BUG in purge_effective_progs Syzkaller reported a triggered kernel BUG as follows: ------------[ cut here ]------------ kernel BUG at kernel/bpf/cgroup.c:925! invalid opcode: 0000 [#1] PREEMPT SMP NOPTI CPU: 1 PID: 194 Comm: detach Not tainted 5.19.0-14184-g69dac8e431af #8 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 RIP: 0010:__cgroup_bpf_detach+0x1f2/0x2a0 Code: 00 e8 92 60 30 00 84 c0 75 d8 4c 89 e0 31 f6 85 f6 74 19 42 f6 84 28 48 05 00 00 02 75 0e 48 8b 80 c0 00 00 00 48 85 c0 75 e5 <0f> 0b 48 8b 0c5 RSP: 0018:ffffc9000055bdb0 EFLAGS: 00000246 RAX: 0000000000000000 RBX: ffff888100ec0800 RCX: ffffc900000f1000 RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff888100ec4578 RBP: 0000000000000000 R08: ffff888100ec0800 R09: 0000000000000040 R10: 0000000000000000 R11: 0000000000000000 R12: ffff888100ec4000 R13: 000000000000000d R14: ffffc90000199000 R15: ffff888100effb00 FS: 00007f68213d2b80(0000) GS:ffff88813bc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055f74a0e5850 CR3: 0000000102836000 CR4: 00000000000006e0 Call Trace: cgroup_bpf_prog_detach+0xcc/0x100 __sys_bpf+0x2273/0x2a00 __x64_sys_bpf+0x17/0x20 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f68214dbcb9 Code: 08 44 89 e0 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff8 RSP: 002b:00007ffeb487db68 EFLAGS: 00000246 ORIG_RAX: 0000000000000141 RAX: ffffffffffffffda RBX: 000000000000000b RCX: 00007f68214dbcb9 RDX: 0000000000000090 RSI: 00007ffeb487db70 RDI: 0000000000000009 RBP: 0000000000000003 R08: 0000000000000012 R09: 0000000b00000003 R10: 00007ffeb487db70 R11: 0000000000000246 R12: 00007ffeb487dc20 R13: 0000000000000004 R14: 0000000000000001 R15: 000055f74a1011b0 Modules linked in: ---[ end trace 0000000000000000 ]--- Repetition steps: For the following cgroup tree, root | cg1 | cg2 1. attach prog2 to cg2, and then attach prog1 to cg1, both bpf progs attach type is NONE or OVERRIDE. 2. write 1 to /proc/thread-self/fail-nth for failslab. 3. detach prog1 for cg1, and then kernel BUG occur. Failslab injection will cause kmalloc fail and fall back to purge_effective_progs. The problem is that cg2 have attached another prog, so when go through cg2 layer, iteration will add pos to 1, and subsequent operations will be skipped by the following condition, and cg will meet NULL in the end. `if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))` The NULL cg means no link or prog match, this is as expected, and it's not a bug. So here just skip the no match situation. Fixes: 4c46091ee985 ("bpf: Fix KASAN use-after-free Read in compute_effective_progs") Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220813134030.1972696-1-pulehui@huawei.com --- kernel/bpf/cgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 59b7eb60d5b4..4a400cd63731 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -921,8 +921,10 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog, pos++; } } + + /* no link or prog match, skip the cgroup of this layer */ + continue; found: - BUG_ON(!cg); progs = rcu_dereference_protected( desc->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); From afe7116f6d3b888778ed6d95e3cf724767b9aedf Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Mon, 8 Aug 2022 11:42:24 +0800 Subject: [PATCH 08/56] ieee802154/adf7242: defer destroy_workqueue call There is a possible race condition (use-after-free) like below (FREE) | (USE) adf7242_remove | adf7242_channel cancel_delayed_work_sync | destroy_workqueue (1) | adf7242_cmd_rx | mod_delayed_work (2) | The root cause for this race is that the upper layer (ieee802154) is unaware of this detaching event and the function adf7242_channel can be called without any checks. To fix this, we can add a flag write at the beginning of adf7242_remove and add flag check in adf7242_channel. Or we can just defer the destructive operation like other commit 3e0588c291d6 ("hamradio: defer ax25 kfree after unregister_netdev") which let the ieee802154_unregister_hw() to handle the synchronization. This patch takes the second option. Fixes: 58e9683d1475 ("net: ieee802154: adf7242: Fix OCL calibration runs") Signed-off-by: Lin Ma Acked-by: Michael Hennerich Link: https://lore.kernel.org/r/20220808034224.12642-1-linma@zju.edu.cn Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/adf7242.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ieee802154/adf7242.c b/drivers/net/ieee802154/adf7242.c index 6afdf1622944..5cf218c674a5 100644 --- a/drivers/net/ieee802154/adf7242.c +++ b/drivers/net/ieee802154/adf7242.c @@ -1310,10 +1310,11 @@ static void adf7242_remove(struct spi_device *spi) debugfs_remove_recursive(lp->debugfs_root); + ieee802154_unregister_hw(lp->hw); + cancel_delayed_work_sync(&lp->work); destroy_workqueue(lp->wqueue); - ieee802154_unregister_hw(lp->hw); mutex_destroy(&lp->bmux); ieee802154_free_hw(lp->hw); } From b5a990209d72615e3cac2b3b0d8ddd445c020cf5 Mon Sep 17 00:00:00 2001 From: Jilin Yuan Date: Fri, 8 Jul 2022 23:15:38 +0800 Subject: [PATCH 09/56] net/ieee802154: fix repeated words in comments Delete the redundant word 'was'. Signed-off-by: Jilin Yuan Link: https://lore.kernel.org/r/20220708151538.51483-1-yuanjilin@cdjrlc.com Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/ca8210.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ieee802154/ca8210.c b/drivers/net/ieee802154/ca8210.c index 42c0b451088d..450b16ad40a4 100644 --- a/drivers/net/ieee802154/ca8210.c +++ b/drivers/net/ieee802154/ca8210.c @@ -2293,7 +2293,7 @@ static int ca8210_set_csma_params( * @retries: Number of retries * * Sets the number of times to retry a transmission if no acknowledgment was - * was received from the other end when one was requested. + * received from the other end when one was requested. * * Return: 0 or linux error code */ From 0947ae1121083d363d522ff7518ee72b55bd8d29 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Aug 2022 14:58:04 -0700 Subject: [PATCH 10/56] bpf: Fix a data-race around bpf_jit_limit. While reading bpf_jit_limit, it can be changed concurrently via sysctl, WRITE_ONCE() in __do_proc_doulongvec_minmax(). The size of bpf_jit_limit is long, so we need to add a paired READ_ONCE() to avoid load-tearing. Fixes: ede95a63b5e8 ("bpf: add bpf_jit_limit knob to restrict unpriv allocations") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220823215804.2177-1-kuniyu@amazon.com --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c1e10d088dbb..3d9eb3ae334c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -971,7 +971,7 @@ pure_initcall(bpf_jit_charge_init); int bpf_jit_charge_modmem(u32 size) { - if (atomic_long_add_return(size, &bpf_jit_current) > bpf_jit_limit) { + if (atomic_long_add_return(size, &bpf_jit_current) > READ_ONCE(bpf_jit_limit)) { if (!bpf_capable()) { atomic_long_sub(size, &bpf_jit_current); return -EPERM; From 60deb9f10eec5c6a20252ed36238b55d8b614a2c Mon Sep 17 00:00:00 2001 From: Siddh Raman Pant Date: Sat, 20 Aug 2022 01:33:40 +0530 Subject: [PATCH 11/56] wifi: mac80211: Fix UAF in ieee80211_scan_rx() ieee80211_scan_rx() tries to access scan_req->flags after a null check, but a UAF is observed when the scan is completed and __ieee80211_scan_completed() executes, which then calls cfg80211_scan_done() leading to the freeing of scan_req. Since scan_req is rcu_dereference()'d, prevent the racing in __ieee80211_scan_completed() by ensuring that from mac80211's POV it is no longer accessed from an RCU read critical section before we call cfg80211_scan_done(). Cc: stable@vger.kernel.org Link: https://syzkaller.appspot.com/bug?extid=f9acff9bf08a845f225d Reported-by: syzbot+f9acff9bf08a845f225d@syzkaller.appspotmail.com Suggested-by: Johannes Berg Signed-off-by: Siddh Raman Pant Link: https://lore.kernel.org/r/20220819200340.34826-1-code@siddh.me Signed-off-by: Johannes Berg --- net/mac80211/scan.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index fa8ddf576bc1..c4f2aeb31da3 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -469,16 +469,19 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->mtx)); - if (scan_req != local->int_scan_req) { - local->scan_info.aborted = aborted; - cfg80211_scan_done(scan_req, &local->scan_info); - } RCU_INIT_POINTER(local->scan_req, NULL); RCU_INIT_POINTER(local->scan_sdata, NULL); local->scanning = 0; local->scan_chandef.chan = NULL; + synchronize_rcu(); + + if (scan_req != local->int_scan_req) { + local->scan_info.aborted = aborted; + cfg80211_scan_done(scan_req, &local->scan_info); + } + /* Set power back to normal operating levels. */ ieee80211_hw_config(local, 0); From 36fe8e4e5cb02131719612aea1e64379670d1846 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 23 Aug 2022 15:22:23 +0200 Subject: [PATCH 12/56] wifi: mac80211: always free sta in __sta_info_alloc in case of error Free sta pointer in __sta_info_alloc routine if sta_info_alloc_link() fails. Fixes: 246b39e4a1ba5 ("wifi: mac80211: refactor some sta_info link handling") Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/a3d079208684cddbc25289f7f7e0fed795b0cad4.1661260857.git.lorenzo@kernel.org Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index cb23da9aff1e..330dab41f2fe 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -494,7 +494,7 @@ __sta_info_alloc(struct ieee80211_sub_if_data *sdata, sta->sdata = sdata; if (sta_info_alloc_link(local, &sta->deflink, gfp)) - return NULL; + goto free; if (link_id >= 0) { sta_info_add_link(sta, link_id, &sta->deflink, From 62b03f45c6352410d13bf0710d24ef6290632eb1 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 18 Aug 2022 12:33:49 +0800 Subject: [PATCH 13/56] wifi: mac80211: fix possible leak in ieee80211_tx_control_port() Add missing dev_kfree_skb() in an error path in ieee80211_tx_control_port() to avoid a memory leak. Fixes: dd820ed6336a ("wifi: mac80211: return error from control port TX for drops") Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20220818043349.4168835-1-yangyingliang@huawei.com Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 45df9932d0ba..594bd70ee641 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -5885,6 +5885,7 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, rcu_read_lock(); err = ieee80211_lookup_ra_sta(sdata, skb, &sta); if (err) { + dev_kfree_skb(skb); rcu_read_unlock(); return err; } From 15bc8966b6d3a5b9bfe4c9facfa02f2b69b1e5f0 Mon Sep 17 00:00:00 2001 From: Siddh Raman Pant Date: Sun, 14 Aug 2022 20:45:12 +0530 Subject: [PATCH 14/56] wifi: mac80211: Don't finalize CSA in IBSS mode if state is disconnected When we are not connected to a channel, sending channel "switch" announcement doesn't make any sense. The BSS list is empty in that case. This causes the for loop in cfg80211_get_bss() to be bypassed, so the function returns NULL (check line 1424 of net/wireless/scan.c), causing the WARN_ON() in ieee80211_ibss_csa_beacon() to get triggered (check line 500 of net/mac80211/ibss.c), which was consequently reported on the syzkaller dashboard. Thus, check if we have an existing connection before generating the CSA beacon in ieee80211_ibss_finish_csa(). Cc: stable@vger.kernel.org Fixes: cd7760e62c2a ("mac80211: add support for CSA in IBSS mode") Link: https://syzkaller.appspot.com/bug?id=05603ef4ae8926761b678d2939a3b2ad28ab9ca6 Reported-by: syzbot+b6c9fe29aefe68e4ad34@syzkaller.appspotmail.com Signed-off-by: Siddh Raman Pant Tested-by: syzbot+b6c9fe29aefe68e4ad34@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20220814151512.9985-1-code@siddh.me Signed-off-by: Johannes Berg --- net/mac80211/ibss.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index d56890e3fabb..9b283bbc7bb4 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -530,6 +530,10 @@ int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata) sdata_assert_lock(sdata); + /* When not connected/joined, sending CSA doesn't make sense. */ + if (ifibss->state != IEEE80211_IBSS_MLME_JOINED) + return -ENOLINK; + /* update cfg80211 bss information with the new channel */ if (!is_zero_ether_addr(ifibss->bssid)) { cbss = cfg80211_get_bss(sdata->local->hw.wiphy, From d776763f48084926b5d9e25507a3ddb7c9243d5e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 4 Aug 2022 10:03:21 +0300 Subject: [PATCH 15/56] wifi: cfg80211: debugfs: fix return type in ht40allow_map_read() The return type is supposed to be ssize_t, which is signed long, but "r" was declared as unsigned int. This means that on 64 bit systems we return positive values instead of negative error codes. Fixes: 80a3511d70e8 ("cfg80211: add debugfs HT40 allow map") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/YutvOQeJm0UjLhwU@kili Signed-off-by: Johannes Berg --- net/wireless/debugfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c index aab43469a2f0..0878b162890a 100644 --- a/net/wireless/debugfs.c +++ b/net/wireless/debugfs.c @@ -65,9 +65,10 @@ static ssize_t ht40allow_map_read(struct file *file, { struct wiphy *wiphy = file->private_data; char *buf; - unsigned int offset = 0, buf_size = PAGE_SIZE, i, r; + unsigned int offset = 0, buf_size = PAGE_SIZE, i; enum nl80211_band band; struct ieee80211_supported_band *sband; + ssize_t r; buf = kzalloc(buf_size, GFP_KERNEL); if (!buf) From 55f0a4894484e8d6ddf662f5aebbf3b4cb028541 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 28 Jul 2022 17:25:16 +0300 Subject: [PATCH 16/56] wifi: mac80211: potential NULL dereference in ieee80211_tx_control_port() The ieee80211_lookup_ra_sta() function will sometimes set "sta" to NULL so add this NULL check to prevent an Oops. Fixes: 9dd1953846c7 ("wifi: nl80211/mac80211: clarify link ID in control port TX") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/YuKcTAyO94YOy0Bu@kili Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 594bd70ee641..bf7fe6cd9dfc 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -5900,7 +5900,7 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, * for MLO STA, the SA should be the AP MLD address, but * the link ID has been selected already */ - if (sta->sta.mlo) + if (sta && sta->sta.mlo) memcpy(ehdr->h_source, sdata->vif.addr, ETH_ALEN); } rcu_read_unlock(); From 2fc31465c5373b5ca4edf2e5238558cb62902311 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 23 Aug 2022 20:52:59 +0200 Subject: [PATCH 17/56] bpf: Do mark_chain_precision for ARG_CONST_ALLOC_SIZE_OR_ZERO Precision markers need to be propagated whenever we have an ARG_CONST_* style argument, as the verifier cannot consider imprecise scalars to be equivalent for the purposes of states_equal check when such arguments refine the return value (in this case, set mem_size for PTR_TO_MEM). The resultant mem_size for the R0 is derived from the constant value, and if the verifier incorrectly prunes states considering them equivalent where such arguments exist (by seeing that both registers have reg->precise as false in regsafe), we can end up with invalid programs passing the verifier which can do access beyond what should have been the correct mem_size in that explored state. To show a concrete example of the problem: 0000000000000000 : 0: r2 = *(u32 *)(r1 + 80) 1: r1 = *(u32 *)(r1 + 76) 2: r3 = r1 3: r3 += 4 4: if r3 > r2 goto +18 5: w2 = 0 6: *(u32 *)(r1 + 0) = r2 7: r1 = *(u32 *)(r1 + 0) 8: r2 = 1 9: if w1 == 0 goto +1 10: r2 = -1 0000000000000058 : 11: r1 = 0 ll 13: r3 = 0 14: call bpf_ringbuf_reserve 15: if r0 == 0 goto +7 16: r1 = r0 17: r1 += 16777215 18: w2 = 0 19: *(u8 *)(r1 + 0) = r2 20: r1 = r0 21: r2 = 0 22: call bpf_ringbuf_submit 00000000000000b8 : 23: w0 = 0 24: exit For the first case, the single line execution's exploration will prune the search at insn 14 for the branch insn 9's second leg as it will be verified first using r2 = -1 (UINT_MAX), while as w1 at insn 9 will always be 0 so at runtime we don't get error for being greater than UINT_MAX/4 from bpf_ringbuf_reserve. The verifier during regsafe just sees reg->precise as false for both r2 registers in both states, hence considers them equal for purposes of states_equal. If we propagated precise markers using the backtracking support, we would use the precise marking to then ensure that old r2 (UINT_MAX) was within the new r2 (1) and this would never be true, so the verification would rightfully fail. The end result is that the out of bounds access at instruction 19 would be permitted without this fix. Note that reg->precise is always set to true when user does not have CAP_BPF (or when subprog count is greater than 1 (i.e. use of any static or global functions)), hence this is only a problem when precision marks need to be explicitly propagated (i.e. privileged users with CAP_BPF). A simplified test case has been included in the next patch to prevent future regressions. Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220823185300.406-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 096fdac70165..30c6eebce146 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6066,6 +6066,9 @@ skip_type_check: return -EACCES; } meta->mem_size = reg->var_off.value; + err = mark_chain_precision(env, regno); + if (err) + return err; break; case ARG_PTR_TO_INT: case ARG_PTR_TO_LONG: From 1800b2ac96d8bc4ccdddc2ea9e83ecaffd54d3f2 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 23 Aug 2022 20:55:00 +0200 Subject: [PATCH 18/56] selftests/bpf: Add regression test for pruning fix Add a test to ensure we do mark_chain_precision for the argument type ARG_CONST_ALLOC_SIZE_OR_ZERO. For other argument types, this was already done, but propagation for missing for this case. Without the fix, this test case loads successfully. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220823185500.467-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/verifier/precise.c | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c index 9e754423fa8b..6c03a7d805f9 100644 --- a/tools/testing/selftests/bpf/verifier/precise.c +++ b/tools/testing/selftests/bpf/verifier/precise.c @@ -192,3 +192,28 @@ .result = VERBOSE_ACCEPT, .retval = -1, }, +{ + "precise: mark_chain_precision for ARG_CONST_ALLOC_SIZE_OR_ZERO", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, offsetof(struct xdp_md, ingress_ifindex)), + BPF_LD_MAP_FD(BPF_REG_6, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_2, 1), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_4, 0, 1), + BPF_MOV64_IMM(BPF_REG_2, 0x1000), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 42), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_ringbuf = { 1 }, + .prog_type = BPF_PROG_TYPE_XDP, + .flags = BPF_F_TEST_STATE_FREQ, + .errstr = "invalid access to memory, mem_size=1 off=42 size=8", + .result = REJECT, +}, From a657182a5c5150cdfacb6640aad1d2712571a409 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 Aug 2022 23:26:47 +0200 Subject: [PATCH 19/56] bpf: Don't use tnum_range on array range checking for poke descriptors Hsin-Wei reported a KASAN splat triggered by their BPF runtime fuzzer which is based on a customized syzkaller: BUG: KASAN: slab-out-of-bounds in bpf_int_jit_compile+0x1257/0x13f0 Read of size 8 at addr ffff888004e90b58 by task syz-executor.0/1489 CPU: 1 PID: 1489 Comm: syz-executor.0 Not tainted 5.19.0 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Call Trace: dump_stack_lvl+0x9c/0xc9 print_address_description.constprop.0+0x1f/0x1f0 ? bpf_int_jit_compile+0x1257/0x13f0 kasan_report.cold+0xeb/0x197 ? kvmalloc_node+0x170/0x200 ? bpf_int_jit_compile+0x1257/0x13f0 bpf_int_jit_compile+0x1257/0x13f0 ? arch_prepare_bpf_dispatcher+0xd0/0xd0 ? rcu_read_lock_sched_held+0x43/0x70 bpf_prog_select_runtime+0x3e8/0x640 ? bpf_obj_name_cpy+0x149/0x1b0 bpf_prog_load+0x102f/0x2220 ? __bpf_prog_put.constprop.0+0x220/0x220 ? find_held_lock+0x2c/0x110 ? __might_fault+0xd6/0x180 ? lock_downgrade+0x6e0/0x6e0 ? lock_is_held_type+0xa6/0x120 ? __might_fault+0x147/0x180 __sys_bpf+0x137b/0x6070 ? bpf_perf_link_attach+0x530/0x530 ? new_sync_read+0x600/0x600 ? __fget_files+0x255/0x450 ? lock_downgrade+0x6e0/0x6e0 ? fput+0x30/0x1a0 ? ksys_write+0x1a8/0x260 __x64_sys_bpf+0x7a/0xc0 ? syscall_enter_from_user_mode+0x21/0x70 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f917c4e2c2d The problem here is that a range of tnum_range(0, map->max_entries - 1) has limited ability to represent the concrete tight range with the tnum as the set of resulting states from value + mask can result in a superset of the actual intended range, and as such a tnum_in(range, reg->var_off) check may yield true when it shouldn't, for example tnum_range(0, 2) would result in 00XX -> v = 0000, m = 0011 such that the intended set of {0, 1, 2} is here represented by a less precise superset of {0, 1, 2, 3}. As the register is known const scalar, really just use the concrete reg->var_off.value for the upper index check. Fixes: d2e4c1e6c294 ("bpf: Constant map key tracking for prog array pokes") Reported-by: Hsin-Wei Hung Signed-off-by: Daniel Borkmann Cc: Shung-Hsi Yu Acked-by: John Fastabend Link: https://lore.kernel.org/r/984b37f9fdf7ac36831d2137415a4a915744c1b6.1661462653.git.daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 30c6eebce146..3eadb14e090b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7033,8 +7033,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; struct bpf_reg_state *regs = cur_regs(env), *reg; struct bpf_map *map = meta->map_ptr; - struct tnum range; - u64 val; + u64 val, max; int err; if (func_id != BPF_FUNC_tail_call) @@ -7044,10 +7043,11 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return -EINVAL; } - range = tnum_range(0, map->max_entries - 1); reg = ®s[BPF_REG_3]; + val = reg->var_off.value; + max = map->max_entries; - if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) { + if (!(register_is_const(reg) && val < max)) { bpf_map_key_store(aux, BPF_MAP_KEY_POISON); return 0; } @@ -7055,8 +7055,6 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, err = mark_chain_precision(env, BPF_REG_3); if (err) return err; - - val = reg->var_off.value; if (bpf_map_key_unseen(aux)) bpf_map_key_store(aux, val); else if (!bpf_map_key_poisoned(aux) && From b82a26d8633cc89367fac75beb3ec33061bea44a Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 7 Aug 2022 22:57:40 +0200 Subject: [PATCH 20/56] Bluetooth: hci_event: Fix vendor (unknown) opcode status handling Commit c8992cffbe74 ("Bluetooth: hci_event: Use of a function table to handle Command Complete") was (presumably) meant to only refactor things without any functional changes. But it does have one undesirable side-effect, before *status would always be set to skb->data[0] and it might be overridden by some of the opcode specific handling. While now it always set by the opcode specific handlers. This means that if the opcode is not known *status does not get set any more at all! This behavior change has broken bluetooth support for BCM4343A0 HCIs, the hci_bcm.c code tries to configure UART attached HCIs at a higher baudraute using vendor specific opcodes. The BCM4343A0 does not support this and this used to simply fail: [ 25.646442] Bluetooth: hci0: BCM: failed to write clock (-56) [ 25.646481] Bluetooth: hci0: Failed to set baudrate After which things would continue with the initial baudraute. But now that hci_cmd_complete_evt() no longer sets status for unknown opcodes *status is left at 0. This causes the hci_bcm.c code to think the baudraute has been changed on the HCI side and to also adjust the UART baudrate, after which communication with the HCI is broken, leading to: [ 28.579042] Bluetooth: hci0: command 0x0c03 tx timeout [ 36.961601] Bluetooth: hci0: BCM: Reset failed (-110) And non working bluetooth. Fix this by restoring the previous default "*status = skb->data[0]" handling for unknown opcodes. Fixes: c8992cffbe74 ("Bluetooth: hci_event: Use of a function table to handle Command Complete") Signed-off-by: Hans de Goede Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 485c814cf44a..73aa9ee9d21a 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4179,6 +4179,17 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, void *data, } } + if (i == ARRAY_SIZE(hci_cc_table)) { + /* Unknown opcode, assume byte 0 contains the status, so + * that e.g. __hci_cmd_sync() properly returns errors + * for vendor specific commands send by HCI drivers. + * If a vendor doesn't actually follow this convention we may + * need to introduce a vendor CC table in order to properly set + * the status. + */ + *status = skb->data[0]; + } + handle_cmd_cnt_and_timer(hdev, ev->ncmd); hci_req_cmd_complete(hdev, *opcode, *status, req_complete, From 1fd02d56dae35b08e4cba8e6bd2c2e7ccff68ecc Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 11 Aug 2022 14:20:46 -0700 Subject: [PATCH 21/56] Bluetooth: hci_sync: Fix suspend performance regression This attempts to fix suspend performance when there is no connections by not updating the event mask. Fixes: ef61b6ea1544 ("Bluetooth: Always set event mask on suspend") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index e6d804b82b67..5fe440cdc68d 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -5288,17 +5288,21 @@ int hci_suspend_sync(struct hci_dev *hdev) /* Prevent disconnects from causing scanning to be re-enabled */ hci_pause_scan_sync(hdev); - /* Soft disconnect everything (power off) */ - err = hci_disconnect_all_sync(hdev, HCI_ERROR_REMOTE_POWER_OFF); - if (err) { - /* Set state to BT_RUNNING so resume doesn't notify */ - hdev->suspend_state = BT_RUNNING; - hci_resume_sync(hdev); - return err; - } + if (hci_conn_count(hdev)) { + /* Soft disconnect everything (power off) */ + err = hci_disconnect_all_sync(hdev, HCI_ERROR_REMOTE_POWER_OFF); + if (err) { + /* Set state to BT_RUNNING so resume doesn't notify */ + hdev->suspend_state = BT_RUNNING; + hci_resume_sync(hdev); + return err; + } - /* Update event mask so only the allowed event can wakeup the host */ - hci_set_event_mask_sync(hdev); + /* Update event mask so only the allowed event can wakeup the + * host. + */ + hci_set_event_mask_sync(hdev); + } /* Only configure accept list if disconnect succeeded and wake * isn't being prevented. From b840304fb46cdf7012722f456bce06f151b3e81b Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 12 Aug 2022 15:33:57 -0700 Subject: [PATCH 22/56] Bluetooth: L2CAP: Fix build errors in some archs This attempts to fix the follow errors: In function 'memcmp', inlined from 'bacmp' at ./include/net/bluetooth/bluetooth.h:347:9, inlined from 'l2cap_global_chan_by_psm' at net/bluetooth/l2cap_core.c:2003:15: ./include/linux/fortify-string.h:44:33: error: '__builtin_memcmp' specified bound 6 exceeds source size 0 [-Werror=stringop-overread] 44 | #define __underlying_memcmp __builtin_memcmp | ^ ./include/linux/fortify-string.h:420:16: note: in expansion of macro '__underlying_memcmp' 420 | return __underlying_memcmp(p, q, size); | ^~~~~~~~~~~~~~~~~~~ In function 'memcmp', inlined from 'bacmp' at ./include/net/bluetooth/bluetooth.h:347:9, inlined from 'l2cap_global_chan_by_psm' at net/bluetooth/l2cap_core.c:2004:15: ./include/linux/fortify-string.h:44:33: error: '__builtin_memcmp' specified bound 6 exceeds source size 0 [-Werror=stringop-overread] 44 | #define __underlying_memcmp __builtin_memcmp | ^ ./include/linux/fortify-string.h:420:16: note: in expansion of macro '__underlying_memcmp' 420 | return __underlying_memcmp(p, q, size); | ^~~~~~~~~~~~~~~~~~~ Fixes: 332f1795ca20 ("Bluetooth: L2CAP: Fix l2cap_global_chan_by_psm regression") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index cbe0cae73434..2c9de67daadc 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1992,11 +1992,11 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, src_match = !bacmp(&c->src, src); dst_match = !bacmp(&c->dst, dst); if (src_match && dst_match) { - c = l2cap_chan_hold_unless_zero(c); - if (c) { - read_unlock(&chan_list_lock); - return c; - } + if (!l2cap_chan_hold_unless_zero(c)) + continue; + + read_unlock(&chan_list_lock); + return c; } /* Closest match */ From 23b72814da1a094b4c065e0bb598249f310c5577 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 15 Aug 2022 16:14:32 -0700 Subject: [PATCH 23/56] Bluetooth: MGMT: Fix Get Device Flags Get Device Flags don't check if device does actually use an RPA in which case it shall only set HCI_CONN_FLAG_REMOTE_WAKEUP if LL Privacy is enabled. Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 73 ++++++++++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 30 deletions(-) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 6e31023b84f5..566b6291b38e 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -4547,6 +4547,22 @@ static int set_exp_feature(struct sock *sk, struct hci_dev *hdev, MGMT_STATUS_NOT_SUPPORTED); } +static u32 get_params_flags(struct hci_dev *hdev, + struct hci_conn_params *params) +{ + u32 flags = hdev->conn_flags; + + /* Devices using RPAs can only be programmed in the acceptlist if + * LL Privacy has been enable otherwise they cannot mark + * HCI_CONN_FLAG_REMOTE_WAKEUP. + */ + if ((flags & HCI_CONN_FLAG_REMOTE_WAKEUP) && !use_ll_privacy(hdev) && + hci_find_irk_by_addr(hdev, ¶ms->addr, params->addr_type)) + flags &= ~HCI_CONN_FLAG_REMOTE_WAKEUP; + + return flags; +} + static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { @@ -4578,10 +4594,10 @@ static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, } else { params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, le_addr_type(cp->addr.type)); - if (!params) goto done; + supported_flags = get_params_flags(hdev, params); current_flags = params->flags; } @@ -4649,38 +4665,35 @@ static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, bt_dev_warn(hdev, "No such BR/EDR device %pMR (0x%x)", &cp->addr.bdaddr, cp->addr.type); } - } else { - params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, - le_addr_type(cp->addr.type)); - if (params) { - /* Devices using RPAs can only be programmed in the - * acceptlist LL Privacy has been enable otherwise they - * cannot mark HCI_CONN_FLAG_REMOTE_WAKEUP. - */ - if ((current_flags & HCI_CONN_FLAG_REMOTE_WAKEUP) && - !use_ll_privacy(hdev) && - hci_find_irk_by_addr(hdev, ¶ms->addr, - params->addr_type)) { - bt_dev_warn(hdev, - "Cannot set wakeable for RPA"); - goto unlock; - } - params->flags = current_flags; - status = MGMT_STATUS_SUCCESS; - - /* Update passive scan if HCI_CONN_FLAG_DEVICE_PRIVACY - * has been set. - */ - if (params->flags & HCI_CONN_FLAG_DEVICE_PRIVACY) - hci_update_passive_scan(hdev); - } else { - bt_dev_warn(hdev, "No such LE device %pMR (0x%x)", - &cp->addr.bdaddr, - le_addr_type(cp->addr.type)); - } + goto unlock; } + params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, + le_addr_type(cp->addr.type)); + if (!params) { + bt_dev_warn(hdev, "No such LE device %pMR (0x%x)", + &cp->addr.bdaddr, le_addr_type(cp->addr.type)); + goto unlock; + } + + supported_flags = get_params_flags(hdev, params); + + if ((supported_flags | current_flags) != supported_flags) { + bt_dev_warn(hdev, "Bad flag given (0x%x) vs supported (0x%0x)", + current_flags, supported_flags); + goto unlock; + } + + params->flags = current_flags; + status = MGMT_STATUS_SUCCESS; + + /* Update passive scan if HCI_CONN_FLAG_DEVICE_PRIVACY + * has been set. + */ + if (params->flags & HCI_CONN_FLAG_DEVICE_PRIVACY) + hci_update_passive_scan(hdev); + unlock: hci_dev_unlock(hdev); From 3cfbc6ac22d62d0a06be9ce2996ba9fed75436cd Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 17 Aug 2022 20:14:36 +0900 Subject: [PATCH 24/56] Bluetooth: hci_sync: fix double mgmt_pending_free() in remove_adv_monitor() syzbot is reporting double kfree() at remove_adv_monitor() [1], for commit 7cf5c2978f23fdbb ("Bluetooth: hci_sync: Refactor remove Adv Monitor") forgot to remove duplicated mgmt_pending_remove() when merging "if (err) {" path and "if (!pending) {" path. Link: https://syzkaller.appspot.com/bug?extid=915a8416bf15895b8e07 [1] Reported-by: syzbot Fixes: 7cf5c2978f23fdbb ("Bluetooth: hci_sync: Refactor remove Adv Monitor") Signed-off-by: Tetsuo Handa Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 566b6291b38e..72e6595a71cc 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -5067,7 +5067,6 @@ static int remove_adv_monitor(struct sock *sk, struct hci_dev *hdev, else status = MGMT_STATUS_FAILED; - mgmt_pending_remove(cmd); goto unlock; } From c572909376673d8c126ae2ee53ba754ff9327d98 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 18 Aug 2022 14:31:42 -0700 Subject: [PATCH 25/56] Bluetooth: ISO: Fix not handling shutdown condition In order to properly handle shutdown syscall the code shall not assume that the how argument is always SHUT_RDWR resulting in SHUTDOWN_MASK as that would result in poll to immediately report EPOLLHUP instead of properly waiting for disconnect_cfm (Disconnect Complete) which is rather important for the likes of BAP as the CIG may need to be reprogrammed. Fixes: ccf74f2390d6 ("Bluetooth: Add BTPROTO_ISO socket type") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index ced8ad4fed4f..613039ba5dbf 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1309,7 +1309,7 @@ static int iso_sock_shutdown(struct socket *sock, int how) struct sock *sk = sock->sk; int err = 0; - BT_DBG("sock %p, sk %p", sock, sk); + BT_DBG("sock %p, sk %p, how %d", sock, sk, how); if (!sk) return 0; @@ -1317,17 +1317,32 @@ static int iso_sock_shutdown(struct socket *sock, int how) sock_hold(sk); lock_sock(sk); - if (!sk->sk_shutdown) { - sk->sk_shutdown = SHUTDOWN_MASK; - iso_sock_clear_timer(sk); - __iso_sock_close(sk); - - if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && - !(current->flags & PF_EXITING)) - err = bt_sock_wait_state(sk, BT_CLOSED, - sk->sk_lingertime); + switch (how) { + case SHUT_RD: + if (sk->sk_shutdown & RCV_SHUTDOWN) + goto unlock; + sk->sk_shutdown |= RCV_SHUTDOWN; + break; + case SHUT_WR: + if (sk->sk_shutdown & SEND_SHUTDOWN) + goto unlock; + sk->sk_shutdown |= SEND_SHUTDOWN; + break; + case SHUT_RDWR: + if (sk->sk_shutdown & SHUTDOWN_MASK) + goto unlock; + sk->sk_shutdown |= SHUTDOWN_MASK; + break; } + iso_sock_clear_timer(sk); + __iso_sock_close(sk); + + if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && + !(current->flags & PF_EXITING)) + err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); + +unlock: release_sock(sk); sock_put(sk); From f48735a9aaf8258f39918e13adf464ccd7dce33b Mon Sep 17 00:00:00 2001 From: Archie Pusaka Date: Tue, 23 Aug 2022 12:39:22 +0800 Subject: [PATCH 26/56] Bluetooth: hci_event: Fix checking conn for le_conn_complete_evt To prevent multiple conn complete events, we shouldn't look up the conn with hci_lookup_le_connect, since it requires the state to be BT_CONNECT. By the time the duplicate event is processed, the state might have changed, so we end up processing the new event anyway. Change the lookup function to hci_conn_hash_lookup_ba. Fixes: d5ebaa7c5f6f6 ("Bluetooth: hci_event: Ignore multiple conn complete events") Signed-off-by: Archie Pusaka Reviewed-by: Sonny Sasaka Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 73aa9ee9d21a..6643c9c20fa4 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5801,7 +5801,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, */ hci_dev_clear_flag(hdev, HCI_LE_ADV); - conn = hci_lookup_le_connect(hdev); + conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, bdaddr); if (!conn) { /* In case of error status and there is no connection pending * just unlock as there is nothing to cleanup. From cb0d160f813b23df2f4fe7af3237a026c44ed1aa Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 23:02:07 +0200 Subject: [PATCH 27/56] Bluetooth: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Signed-off-by: Wolfram Sang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hidp/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 5940744a8cd8..cc20e706c639 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -83,14 +83,14 @@ static void hidp_copy_session(struct hidp_session *session, struct hidp_conninfo ci->product = session->input->id.product; ci->version = session->input->id.version; if (session->input->name) - strlcpy(ci->name, session->input->name, 128); + strscpy(ci->name, session->input->name, 128); else - strlcpy(ci->name, "HID Boot Device", 128); + strscpy(ci->name, "HID Boot Device", 128); } else if (session->hid) { ci->vendor = session->hid->vendor; ci->product = session->hid->product; ci->version = session->hid->version; - strlcpy(ci->name, session->hid->name, 128); + strscpy(ci->name, session->hid->name, 128); } } From 2da8eb834b775a9d1acea6214d3e4a78ac841e6e Mon Sep 17 00:00:00 2001 From: Zhengping Jiang Date: Tue, 23 Aug 2022 10:28:08 -0700 Subject: [PATCH 28/56] Bluetooth: hci_sync: hold hdev->lock when cleanup hci_conn When disconnecting all devices, hci_conn_failed is used to cleanup hci_conn object when the hci_conn object cannot be aborted. The function hci_conn_failed requires the caller holds hdev->lock. Fixes: 9b3628d79b46f ("Bluetooth: hci_sync: Cleanup hci_conn if it cannot be aborted") Signed-off-by: Zhengping Jiang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 5fe440cdc68d..187786454d98 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -4773,9 +4773,11 @@ int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) /* Cleanup hci_conn object if it cannot be cancelled as it * likelly means the controller and host stack are out of sync. */ - if (err) + if (err) { + hci_dev_lock(hdev); hci_conn_failed(conn, err); - + hci_dev_unlock(hdev); + } return err; case BT_CONNECT2: return hci_reject_conn_sync(hdev, conn, reason); From 7498a457ecf7ff2c4d379360aa8f24566bb1543e Mon Sep 17 00:00:00 2001 From: Casper Andersson Date: Thu, 25 Aug 2022 10:49:55 +0200 Subject: [PATCH 29/56] net: sparx5: fix handling uneven length packets in manual extraction Packets that are not of length divisible by 4 (e.g. 77, 78, 79) would have the checksum included up to next multiple of 4 (a 77 bytes packet would have 3 bytes of ethernet checksum included). The check for the value expects it in host (Little) endian. Fixes: f3cad2611a77 ("net: sparx5: add hostmode with phylink support") Signed-off-by: Casper Andersson Reviewed-by: Steen Hegelund Link: https://lore.kernel.org/r/20220825084955.684637-1-casper.casan@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/sparx5/sparx5_packet.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c b/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c index 304f84aadc36..21844beba72d 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c @@ -113,6 +113,8 @@ static void sparx5_xtr_grp(struct sparx5 *sparx5, u8 grp, bool byte_swap) /* This assumes STATUS_WORD_POS == 1, Status * just after last data */ + if (!byte_swap) + val = ntohl((__force __be32)val); byte_cnt -= (4 - XTR_VALID_BYTES(val)); eof_flag = true; break; From 2ca1c94ce0b65a2ce7512b718f3d8a0fe6224bca Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Fri, 26 Aug 2022 08:25:30 +0800 Subject: [PATCH 30/56] tg3: Disable tg3 device on system reboot to avoid triggering AER Commit d60cd06331a3 ("PM: ACPI: reboot: Use S5 for reboot") caused a reboot hang on one Dell servers so the commit was reverted. Someone managed to collect the AER log and it's caused by MSI: [ 148.762067] ACPI: Preparing to enter system sleep state S5 [ 148.794638] {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 5 [ 148.803731] {1}[Hardware Error]: event severity: recoverable [ 148.810191] {1}[Hardware Error]: Error 0, type: fatal [ 148.816088] {1}[Hardware Error]: section_type: PCIe error [ 148.822391] {1}[Hardware Error]: port_type: 0, PCIe end point [ 148.829026] {1}[Hardware Error]: version: 3.0 [ 148.834266] {1}[Hardware Error]: command: 0x0006, status: 0x0010 [ 148.841140] {1}[Hardware Error]: device_id: 0000:04:00.0 [ 148.847309] {1}[Hardware Error]: slot: 0 [ 148.852077] {1}[Hardware Error]: secondary_bus: 0x00 [ 148.857876] {1}[Hardware Error]: vendor_id: 0x14e4, device_id: 0x165f [ 148.865145] {1}[Hardware Error]: class_code: 020000 [ 148.870845] {1}[Hardware Error]: aer_uncor_status: 0x00100000, aer_uncor_mask: 0x00010000 [ 148.879842] {1}[Hardware Error]: aer_uncor_severity: 0x000ef030 [ 148.886575] {1}[Hardware Error]: TLP Header: 40000001 0000030f 90028090 00000000 [ 148.894823] tg3 0000:04:00.0: AER: aer_status: 0x00100000, aer_mask: 0x00010000 [ 148.902795] tg3 0000:04:00.0: AER: [20] UnsupReq (First) [ 148.910234] tg3 0000:04:00.0: AER: aer_layer=Transaction Layer, aer_agent=Requester ID [ 148.918806] tg3 0000:04:00.0: AER: aer_uncor_severity: 0x000ef030 [ 148.925558] tg3 0000:04:00.0: AER: TLP Header: 40000001 0000030f 90028090 00000000 The MSI is probably raised by incoming packets, so power down the device and disable bus mastering to stop the traffic, as user confirmed this approach works. In addition to that, be extra safe and cancel reset task if it's running. Cc: Josef Bacik Link: https://lore.kernel.org/all/b8db79e6857c41dab4ef08bdf826ea7c47e3bafc.1615947283.git.josef@toxicpanda.com/ BugLink: https://bugs.launchpad.net/bugs/1917471 Signed-off-by: Kai-Heng Feng Reviewed-by: Michael Chan Link: https://lore.kernel.org/r/20220826002530.1153296-1-kai.heng.feng@canonical.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/tg3.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index db1e9d810b41..89889d8150da 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -18076,16 +18076,20 @@ static void tg3_shutdown(struct pci_dev *pdev) struct net_device *dev = pci_get_drvdata(pdev); struct tg3 *tp = netdev_priv(dev); + tg3_reset_task_cancel(tp); + rtnl_lock(); + netif_device_detach(dev); if (netif_running(dev)) dev_close(dev); - if (system_state == SYSTEM_POWER_OFF) - tg3_power_down(tp); + tg3_power_down(tp); rtnl_unlock(); + + pci_disable_device(pdev); } /** From 3ce9f2bef75528936c78a7053301f5725f622f3a Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 24 Aug 2022 19:39:51 -0700 Subject: [PATCH 31/56] net: smsc911x: Stop and start PHY during suspend and resume Commit 744d23c71af3 ("net: phy: Warn about incorrect mdio_bus_phy_resume() state") unveiled that the smsc911x driver was not properly stopping and restarting the PHY during suspend/resume. Correct that by indicating that the MAC is in charge of PHY PM operations and ensure that all MDIO bus activity is quiescent during suspend. Tested-by: Geert Uytterhoeven Tested-by: Marek Szyprowski Fixes: fba863b81604 ("net: phy: make PHY PM ops a no-op if MAC driver manages PHY PM") Fixes: 2aa70f864955 ("net: smsc911x: Quieten netif during suspend") Signed-off-by: Florian Fainelli Link: https://lore.kernel.org/r/20220825023951.3220-1-f.fainelli@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/smsc/smsc911x.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/smsc/smsc911x.c b/drivers/net/ethernet/smsc/smsc911x.c index 3bf20211cceb..3829c2805b16 100644 --- a/drivers/net/ethernet/smsc/smsc911x.c +++ b/drivers/net/ethernet/smsc/smsc911x.c @@ -1037,6 +1037,8 @@ static int smsc911x_mii_probe(struct net_device *dev) return ret; } + /* Indicate that the MAC is responsible for managing PHY PM */ + phydev->mac_managed_pm = true; phy_attached_info(phydev); phy_set_max_speed(phydev, SPEED_100); @@ -2587,6 +2589,8 @@ static int smsc911x_suspend(struct device *dev) if (netif_running(ndev)) { netif_stop_queue(ndev); netif_device_detach(ndev); + if (!device_may_wakeup(dev)) + phy_stop(ndev->phydev); } /* enable wake on LAN, energy detection and the external PME @@ -2628,6 +2632,8 @@ static int smsc911x_resume(struct device *dev) if (netif_running(ndev)) { netif_device_attach(ndev); netif_start_queue(ndev); + if (!device_may_wakeup(dev)) + phy_start(ndev->phydev); } return 0; From a87406f4adee9c53b311d8a1ba2849c69e29a6d0 Mon Sep 17 00:00:00 2001 From: Andrey Zhadchenko Date: Thu, 25 Aug 2022 05:03:26 +0300 Subject: [PATCH 32/56] openvswitch: fix memory leak at failed datapath creation ovs_dp_cmd_new()->ovs_dp_change()->ovs_dp_set_upcall_portids() allocates array via kmalloc. If for some reason new_vport() fails during ovs_dp_cmd_new() dp->upcall_portids must be freed. Add missing kfree. Kmemleak example: unreferenced object 0xffff88800c382500 (size 64): comm "dump_state", pid 323, jiffies 4294955418 (age 104.347s) hex dump (first 32 bytes): 5e c2 79 e4 1f 7a 38 c7 09 21 38 0c 80 88 ff ff ^.y..z8..!8..... 03 00 00 00 0a 00 00 00 14 00 00 00 28 00 00 00 ............(... backtrace: [<0000000071bebc9f>] ovs_dp_set_upcall_portids+0x38/0xa0 [<000000000187d8bd>] ovs_dp_change+0x63/0xe0 [<000000002397e446>] ovs_dp_cmd_new+0x1f0/0x380 [<00000000aa06f36e>] genl_family_rcv_msg_doit+0xea/0x150 [<000000008f583bc4>] genl_rcv_msg+0xdc/0x1e0 [<00000000fa10e377>] netlink_rcv_skb+0x50/0x100 [<000000004959cece>] genl_rcv+0x24/0x40 [<000000004699ac7f>] netlink_unicast+0x23e/0x360 [<00000000c153573e>] netlink_sendmsg+0x24e/0x4b0 [<000000006f4aa380>] sock_sendmsg+0x62/0x70 [<00000000d0068654>] ____sys_sendmsg+0x230/0x270 [<0000000012dacf7d>] ___sys_sendmsg+0x88/0xd0 [<0000000011776020>] __sys_sendmsg+0x59/0xa0 [<000000002e8f2dc1>] do_syscall_64+0x3b/0x90 [<000000003243e7cb>] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: b83d23a2a38b ("openvswitch: Introduce per-cpu upcall dispatch") Acked-by: Aaron Conole Signed-off-by: Andrey Zhadchenko Link: https://lore.kernel.org/r/20220825020326.664073-1-andrey.zhadchenko@virtuozzo.com Signed-off-by: Jakub Kicinski --- net/openvswitch/datapath.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 7e8a39a35627..6c9d153afbee 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1802,7 +1802,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_dp_reset_user_features(skb, info); } - goto err_unlock_and_destroy_meters; + goto err_destroy_portids; } err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, @@ -1817,6 +1817,8 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_notify(&dp_datapath_genl_family, reply, info); return 0; +err_destroy_portids: + kfree(rcu_dereference_raw(dp->upcall_portids)); err_unlock_and_destroy_meters: ovs_unlock(); ovs_meters_exit(dp); From ebe5555c2f34505cdb1ae5c3de8b24e33740b3e0 Mon Sep 17 00:00:00 2001 From: Tianyu Yuan Date: Thu, 25 Aug 2022 10:08:45 +0200 Subject: [PATCH 33/56] nfp: flower: fix ingress police using matchall filter Referenced commit introduced nfp_policer_validate in the progress installing rate limiter. This validate check the action id and will reject police with CONTINUE, which is required to support ingress police offload. Fix this issue by allowing FLOW_ACTION_CONTINUE as notexceed action id in nfp_policer_validate Fixes: d97b4b105ce7 ("flow_offload: reject offload for all drivers with invalid police parameters") Signed-off-by: Tianyu Yuan Reviewed-by: Baowen Zheng Reviewed-by: Louis Peens Signed-off-by: Simon Horman Link: https://lore.kernel.org/r/20220825080845.507534-1-simon.horman@corigine.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/flower/qos_conf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/qos_conf.c b/drivers/net/ethernet/netronome/nfp/flower/qos_conf.c index 4e5df9f2c372..7b92026e1a6f 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/qos_conf.c +++ b/drivers/net/ethernet/netronome/nfp/flower/qos_conf.c @@ -127,10 +127,11 @@ static int nfp_policer_validate(const struct flow_action *action, return -EOPNOTSUPP; } - if (act->police.notexceed.act_id != FLOW_ACTION_PIPE && + if (act->police.notexceed.act_id != FLOW_ACTION_CONTINUE && + act->police.notexceed.act_id != FLOW_ACTION_PIPE && act->police.notexceed.act_id != FLOW_ACTION_ACCEPT) { NL_SET_ERR_MSG_MOD(extack, - "Offload not supported when conform action is not pipe or ok"); + "Offload not supported when conform action is not continue, pipe or ok"); return -EOPNOTSUPP; } From 1bd3a383075c64d638e65d263c9267b08ee7733c Mon Sep 17 00:00:00 2001 From: Jean-Francois Le Fillatre Date: Wed, 24 Aug 2022 21:14:36 +0200 Subject: [PATCH 34/56] r8152: add PID for the Lenovo OneLink+ Dock The Lenovo OneLink+ Dock contains an RTL8153 controller that behaves as a broken CDC device by default. Add the custom Lenovo PID to the r8152 driver to support it properly. Also, systems compatible with this dock provide a BIOS option to enable MAC address passthrough (as per Lenovo document "ThinkPad Docking Solutions 2017"). Add the custom PID to the MAC passthrough list too. Tested on a ThinkPad 13 1st gen with the expected results: passthrough disabled: Invalid header when reading pass-thru MAC addr passthrough enabled: Using pass-thru MAC addr XX:XX:XX:XX:XX:XX Signed-off-by: Jean-Francois Le Fillatre Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ether.c | 7 +++++++ drivers/net/usb/r8152.c | 3 +++ 2 files changed, 10 insertions(+) diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c index 2de09ad5bac0..e11f70911acc 100644 --- a/drivers/net/usb/cdc_ether.c +++ b/drivers/net/usb/cdc_ether.c @@ -777,6 +777,13 @@ static const struct usb_device_id products[] = { }, #endif +/* Lenovo ThinkPad OneLink+ Dock (based on Realtek RTL8153) */ +{ + USB_DEVICE_AND_INTERFACE_INFO(LENOVO_VENDOR_ID, 0x3054, USB_CLASS_COMM, + USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), + .driver_info = 0, +}, + /* ThinkPad USB-C Dock (based on Realtek RTL8153) */ { USB_DEVICE_AND_INTERFACE_INFO(LENOVO_VENDOR_ID, 0x3062, USB_CLASS_COMM, diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index d142ac8fcf6e..688905ea0a6d 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -770,6 +770,7 @@ enum rtl8152_flags { RX_EPROTO, }; +#define DEVICE_ID_THINKPAD_ONELINK_PLUS_DOCK 0x3054 #define DEVICE_ID_THINKPAD_THUNDERBOLT3_DOCK_GEN2 0x3082 #define DEVICE_ID_THINKPAD_USB_C_DONGLE 0x720c #define DEVICE_ID_THINKPAD_USB_C_DOCK_GEN2 0xa387 @@ -9581,6 +9582,7 @@ static bool rtl8152_supports_lenovo_macpassthru(struct usb_device *udev) if (vendor_id == VENDOR_ID_LENOVO) { switch (product_id) { + case DEVICE_ID_THINKPAD_ONELINK_PLUS_DOCK: case DEVICE_ID_THINKPAD_THUNDERBOLT3_DOCK_GEN2: case DEVICE_ID_THINKPAD_USB_C_DOCK_GEN2: case DEVICE_ID_THINKPAD_USB_C_DOCK_GEN3: @@ -9828,6 +9830,7 @@ static const struct usb_device_id rtl8152_table[] = { REALTEK_USB_DEVICE(VENDOR_ID_MICROSOFT, 0x0927), REALTEK_USB_DEVICE(VENDOR_ID_SAMSUNG, 0xa101), REALTEK_USB_DEVICE(VENDOR_ID_LENOVO, 0x304f), + REALTEK_USB_DEVICE(VENDOR_ID_LENOVO, 0x3054), REALTEK_USB_DEVICE(VENDOR_ID_LENOVO, 0x3062), REALTEK_USB_DEVICE(VENDOR_ID_LENOVO, 0x3069), REALTEK_USB_DEVICE(VENDOR_ID_LENOVO, 0x3082), From f0da47118c7e93cdbbc6fb403dd729a5f2c90ee3 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 26 Aug 2022 16:29:54 +0200 Subject: [PATCH 35/56] net: mac802154: Fix a condition in the receive path Upon reception, a packet must be categorized, either it's destination is the host, or it is another host. A packet with no destination addressing fields may be valid in two situations: - the packet has no source field: only ACKs are built like that, we consider the host as the destination. - the packet has a valid source field: it is directed to the PAN coordinator, as for know we don't have this information we consider we are not the PAN coordinator. There was likely a copy/paste error made during a previous cleanup because the if clause is now containing exactly the same condition as in the switch case, which can never be true. In the past the destination address was used in the switch and the source address was used in the if, which matches what the spec says. Cc: stable@vger.kernel.org Fixes: ae531b9475f6 ("ieee802154: use ieee802154_addr instead of *_sa variants") Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/r/20220826142954.254853-1-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- net/mac802154/rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index b8ce84618a55..c439125ef2b9 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -44,7 +44,7 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, switch (mac_cb(skb)->dest.mode) { case IEEE802154_ADDR_NONE: - if (mac_cb(skb)->dest.mode != IEEE802154_ADDR_NONE) + if (hdr->source.mode != IEEE802154_ADDR_NONE) /* FIXME: check if we are PAN coordinator */ skb->pkt_type = PACKET_OTHERHOST; else From ffd7bdddaab193c38416fd5dd416d065517d266e Mon Sep 17 00:00:00 2001 From: Li Qiong Date: Mon, 29 Aug 2022 15:12:59 +0800 Subject: [PATCH 36/56] ieee802154: cc2520: add rc code in cc2520_tx() The rc code is 0 at the error path "status & CC2520_STATUS_TX_UNDERFLOW". Assign rc code with '-EINVAL' at this error path to fix it. Signed-off-by: Li Qiong Link: https://lore.kernel.org/r/20220829071259.18330-1-liqiong@nfschina.com Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/cc2520.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ieee802154/cc2520.c b/drivers/net/ieee802154/cc2520.c index 1e1f40f628a0..c69b87d3837d 100644 --- a/drivers/net/ieee802154/cc2520.c +++ b/drivers/net/ieee802154/cc2520.c @@ -504,6 +504,7 @@ cc2520_tx(struct ieee802154_hw *hw, struct sk_buff *skb) goto err_tx; if (status & CC2520_STATUS_TX_UNDERFLOW) { + rc = -EINVAL; dev_err(&priv->spi->dev, "cc2520 tx underflow exception\n"); goto err_tx; } From 3f8ae9fe0409698799e173f698b714f34570b64b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 25 Aug 2022 13:36:44 +0200 Subject: [PATCH 37/56] net: dsa: xrs700x: Use irqsave variant for u64 stats update xrs700x_read_port_counters() updates the stats from a worker using the u64_stats_update_begin() version. This is okay on 32-UP since on the reader side preemption is disabled. On 32bit-SMP the writer can be preempted by the reader at which point the reader will spin on the seqcount until writer continues and completes the update. Assigning the mib_mutex mutex to the underlying seqcount would ensure proper synchronisation. The API for that on the u64_stats_init() side isn't available. Since it is the only user, just use disable interrupts during the update. Use u64_stats_update_begin_irqsave() on the writer side to ensure an uninterrupted update. Fixes: ee00b24f32eb8 ("net: dsa: add Arrow SpeedChips XRS700x driver") Cc: Andrew Lunn Cc: Florian Fainelli Cc: George McCollister Cc: Vivien Didelot Cc: Vladimir Oltean Signed-off-by: Sebastian Andrzej Siewior Acked-by: George McCollister Signed-off-by: David S. Miller --- drivers/net/dsa/xrs700x/xrs700x.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/xrs700x/xrs700x.c b/drivers/net/dsa/xrs700x/xrs700x.c index 3887ed33c5fe..fa622639d640 100644 --- a/drivers/net/dsa/xrs700x/xrs700x.c +++ b/drivers/net/dsa/xrs700x/xrs700x.c @@ -109,6 +109,7 @@ static void xrs700x_read_port_counters(struct xrs700x *priv, int port) { struct xrs700x_port *p = &priv->ports[port]; struct rtnl_link_stats64 stats; + unsigned long flags; int i; memset(&stats, 0, sizeof(stats)); @@ -138,9 +139,9 @@ static void xrs700x_read_port_counters(struct xrs700x *priv, int port) */ stats.rx_packets += stats.multicast; - u64_stats_update_begin(&p->syncp); + flags = u64_stats_update_begin_irqsave(&p->syncp); p->stats64 = stats; - u64_stats_update_end(&p->syncp); + u64_stats_update_end_irqrestore(&p->syncp, flags); mutex_unlock(&p->mib_mutex); } From 278d3ba61563ceed3cb248383ced19e14ec7bc1f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 25 Aug 2022 13:36:45 +0200 Subject: [PATCH 38/56] net: Use u64_stats_fetch_begin_irq() for stats fetch. On 32bit-UP u64_stats_fetch_begin() disables only preemption. If the reader is in preemptible context and the writer side (u64_stats_update_begin*()) runs in an interrupt context (IRQ or softirq) then the writer can update the stats during the read operation. This update remains undetected. Use u64_stats_fetch_begin_irq() to ensure the stats fetch on 32bit-UP are not interrupted by a writer. 32bit-SMP remains unaffected by this change. Cc: "David S. Miller" Cc: Catherine Sullivan Cc: David Awogbemila Cc: Dimitris Michailidis Cc: Eric Dumazet Cc: Hans Ulli Kroll Cc: Jakub Kicinski Cc: Jeroen de Borst Cc: Johannes Berg Cc: Linus Walleij Cc: Paolo Abeni Cc: Simon Horman Cc: linux-arm-kernel@lists.infradead.org Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org Cc: oss-drivers@corigine.com Cc: stable@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/cortina/gemini.c | 24 +++++++++---------- .../ethernet/fungible/funeth/funeth_txrx.h | 4 ++-- drivers/net/ethernet/google/gve/gve_ethtool.c | 16 ++++++------- drivers/net/ethernet/google/gve/gve_main.c | 12 +++++----- drivers/net/ethernet/huawei/hinic/hinic_rx.c | 4 ++-- drivers/net/ethernet/huawei/hinic/hinic_tx.c | 4 ++-- .../ethernet/netronome/nfp/nfp_net_common.c | 8 +++---- .../ethernet/netronome/nfp/nfp_net_ethtool.c | 8 +++---- drivers/net/netdevsim/netdev.c | 4 ++-- net/mac80211/sta_info.c | 8 +++---- net/mpls/af_mpls.c | 4 ++-- 11 files changed, 48 insertions(+), 48 deletions(-) diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index 9e6de2f968fa..6dae768671e3 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -1919,7 +1919,7 @@ static void gmac_get_stats64(struct net_device *netdev, /* Racing with RX NAPI */ do { - start = u64_stats_fetch_begin(&port->rx_stats_syncp); + start = u64_stats_fetch_begin_irq(&port->rx_stats_syncp); stats->rx_packets = port->stats.rx_packets; stats->rx_bytes = port->stats.rx_bytes; @@ -1931,11 +1931,11 @@ static void gmac_get_stats64(struct net_device *netdev, stats->rx_crc_errors = port->stats.rx_crc_errors; stats->rx_frame_errors = port->stats.rx_frame_errors; - } while (u64_stats_fetch_retry(&port->rx_stats_syncp, start)); + } while (u64_stats_fetch_retry_irq(&port->rx_stats_syncp, start)); /* Racing with MIB and TX completion interrupts */ do { - start = u64_stats_fetch_begin(&port->ir_stats_syncp); + start = u64_stats_fetch_begin_irq(&port->ir_stats_syncp); stats->tx_errors = port->stats.tx_errors; stats->tx_packets = port->stats.tx_packets; @@ -1945,15 +1945,15 @@ static void gmac_get_stats64(struct net_device *netdev, stats->rx_missed_errors = port->stats.rx_missed_errors; stats->rx_fifo_errors = port->stats.rx_fifo_errors; - } while (u64_stats_fetch_retry(&port->ir_stats_syncp, start)); + } while (u64_stats_fetch_retry_irq(&port->ir_stats_syncp, start)); /* Racing with hard_start_xmit */ do { - start = u64_stats_fetch_begin(&port->tx_stats_syncp); + start = u64_stats_fetch_begin_irq(&port->tx_stats_syncp); stats->tx_dropped = port->stats.tx_dropped; - } while (u64_stats_fetch_retry(&port->tx_stats_syncp, start)); + } while (u64_stats_fetch_retry_irq(&port->tx_stats_syncp, start)); stats->rx_dropped += stats->rx_missed_errors; } @@ -2031,18 +2031,18 @@ static void gmac_get_ethtool_stats(struct net_device *netdev, /* Racing with MIB interrupt */ do { p = values; - start = u64_stats_fetch_begin(&port->ir_stats_syncp); + start = u64_stats_fetch_begin_irq(&port->ir_stats_syncp); for (i = 0; i < RX_STATS_NUM; i++) *p++ = port->hw_stats[i]; - } while (u64_stats_fetch_retry(&port->ir_stats_syncp, start)); + } while (u64_stats_fetch_retry_irq(&port->ir_stats_syncp, start)); values = p; /* Racing with RX NAPI */ do { p = values; - start = u64_stats_fetch_begin(&port->rx_stats_syncp); + start = u64_stats_fetch_begin_irq(&port->rx_stats_syncp); for (i = 0; i < RX_STATUS_NUM; i++) *p++ = port->rx_stats[i]; @@ -2050,13 +2050,13 @@ static void gmac_get_ethtool_stats(struct net_device *netdev, *p++ = port->rx_csum_stats[i]; *p++ = port->rx_napi_exits; - } while (u64_stats_fetch_retry(&port->rx_stats_syncp, start)); + } while (u64_stats_fetch_retry_irq(&port->rx_stats_syncp, start)); values = p; /* Racing with TX start_xmit */ do { p = values; - start = u64_stats_fetch_begin(&port->tx_stats_syncp); + start = u64_stats_fetch_begin_irq(&port->tx_stats_syncp); for (i = 0; i < TX_MAX_FRAGS; i++) { *values++ = port->tx_frag_stats[i]; @@ -2065,7 +2065,7 @@ static void gmac_get_ethtool_stats(struct net_device *netdev, *values++ = port->tx_frags_linearized; *values++ = port->tx_hw_csummed; - } while (u64_stats_fetch_retry(&port->tx_stats_syncp, start)); + } while (u64_stats_fetch_retry_irq(&port->tx_stats_syncp, start)); } static int gmac_get_ksettings(struct net_device *netdev, diff --git a/drivers/net/ethernet/fungible/funeth/funeth_txrx.h b/drivers/net/ethernet/fungible/funeth/funeth_txrx.h index 53b7e95213a8..671f51135c26 100644 --- a/drivers/net/ethernet/fungible/funeth/funeth_txrx.h +++ b/drivers/net/ethernet/fungible/funeth/funeth_txrx.h @@ -206,9 +206,9 @@ struct funeth_rxq { #define FUN_QSTAT_READ(q, seq, stats_copy) \ do { \ - seq = u64_stats_fetch_begin(&(q)->syncp); \ + seq = u64_stats_fetch_begin_irq(&(q)->syncp); \ stats_copy = (q)->stats; \ - } while (u64_stats_fetch_retry(&(q)->syncp, (seq))) + } while (u64_stats_fetch_retry_irq(&(q)->syncp, (seq))) #define FUN_INT_NAME_LEN (IFNAMSIZ + 16) diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c index 50b384910c83..7b9a2d9d9624 100644 --- a/drivers/net/ethernet/google/gve/gve_ethtool.c +++ b/drivers/net/ethernet/google/gve/gve_ethtool.c @@ -177,14 +177,14 @@ gve_get_ethtool_stats(struct net_device *netdev, struct gve_rx_ring *rx = &priv->rx[ring]; start = - u64_stats_fetch_begin(&priv->rx[ring].statss); + u64_stats_fetch_begin_irq(&priv->rx[ring].statss); tmp_rx_pkts = rx->rpackets; tmp_rx_bytes = rx->rbytes; tmp_rx_skb_alloc_fail = rx->rx_skb_alloc_fail; tmp_rx_buf_alloc_fail = rx->rx_buf_alloc_fail; tmp_rx_desc_err_dropped_pkt = rx->rx_desc_err_dropped_pkt; - } while (u64_stats_fetch_retry(&priv->rx[ring].statss, + } while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss, start)); rx_pkts += tmp_rx_pkts; rx_bytes += tmp_rx_bytes; @@ -198,10 +198,10 @@ gve_get_ethtool_stats(struct net_device *netdev, if (priv->tx) { do { start = - u64_stats_fetch_begin(&priv->tx[ring].statss); + u64_stats_fetch_begin_irq(&priv->tx[ring].statss); tmp_tx_pkts = priv->tx[ring].pkt_done; tmp_tx_bytes = priv->tx[ring].bytes_done; - } while (u64_stats_fetch_retry(&priv->tx[ring].statss, + } while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss, start)); tx_pkts += tmp_tx_pkts; tx_bytes += tmp_tx_bytes; @@ -259,13 +259,13 @@ gve_get_ethtool_stats(struct net_device *netdev, data[i++] = rx->fill_cnt - rx->cnt; do { start = - u64_stats_fetch_begin(&priv->rx[ring].statss); + u64_stats_fetch_begin_irq(&priv->rx[ring].statss); tmp_rx_bytes = rx->rbytes; tmp_rx_skb_alloc_fail = rx->rx_skb_alloc_fail; tmp_rx_buf_alloc_fail = rx->rx_buf_alloc_fail; tmp_rx_desc_err_dropped_pkt = rx->rx_desc_err_dropped_pkt; - } while (u64_stats_fetch_retry(&priv->rx[ring].statss, + } while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss, start)); data[i++] = tmp_rx_bytes; data[i++] = rx->rx_cont_packet_cnt; @@ -331,9 +331,9 @@ gve_get_ethtool_stats(struct net_device *netdev, } do { start = - u64_stats_fetch_begin(&priv->tx[ring].statss); + u64_stats_fetch_begin_irq(&priv->tx[ring].statss); tmp_tx_bytes = tx->bytes_done; - } while (u64_stats_fetch_retry(&priv->tx[ring].statss, + } while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss, start)); data[i++] = tmp_tx_bytes; data[i++] = tx->wake_queue; diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 6cafee55efc3..044db3ebb071 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -51,10 +51,10 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) for (ring = 0; ring < priv->rx_cfg.num_queues; ring++) { do { start = - u64_stats_fetch_begin(&priv->rx[ring].statss); + u64_stats_fetch_begin_irq(&priv->rx[ring].statss); packets = priv->rx[ring].rpackets; bytes = priv->rx[ring].rbytes; - } while (u64_stats_fetch_retry(&priv->rx[ring].statss, + } while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss, start)); s->rx_packets += packets; s->rx_bytes += bytes; @@ -64,10 +64,10 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) for (ring = 0; ring < priv->tx_cfg.num_queues; ring++) { do { start = - u64_stats_fetch_begin(&priv->tx[ring].statss); + u64_stats_fetch_begin_irq(&priv->tx[ring].statss); packets = priv->tx[ring].pkt_done; bytes = priv->tx[ring].bytes_done; - } while (u64_stats_fetch_retry(&priv->tx[ring].statss, + } while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss, start)); s->tx_packets += packets; s->tx_bytes += bytes; @@ -1274,9 +1274,9 @@ void gve_handle_report_stats(struct gve_priv *priv) } do { - start = u64_stats_fetch_begin(&priv->tx[idx].statss); + start = u64_stats_fetch_begin_irq(&priv->tx[idx].statss); tx_bytes = priv->tx[idx].bytes_done; - } while (u64_stats_fetch_retry(&priv->tx[idx].statss, start)); + } while (u64_stats_fetch_retry_irq(&priv->tx[idx].statss, start)); stats[stats_idx++] = (struct stats) { .stat_name = cpu_to_be32(TX_WAKE_CNT), .value = cpu_to_be64(priv->tx[idx].wake_queue), diff --git a/drivers/net/ethernet/huawei/hinic/hinic_rx.c b/drivers/net/ethernet/huawei/hinic/hinic_rx.c index a866bea65110..e5828a658caf 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_rx.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_rx.c @@ -74,14 +74,14 @@ void hinic_rxq_get_stats(struct hinic_rxq *rxq, struct hinic_rxq_stats *stats) unsigned int start; do { - start = u64_stats_fetch_begin(&rxq_stats->syncp); + start = u64_stats_fetch_begin_irq(&rxq_stats->syncp); stats->pkts = rxq_stats->pkts; stats->bytes = rxq_stats->bytes; stats->errors = rxq_stats->csum_errors + rxq_stats->other_errors; stats->csum_errors = rxq_stats->csum_errors; stats->other_errors = rxq_stats->other_errors; - } while (u64_stats_fetch_retry(&rxq_stats->syncp, start)); + } while (u64_stats_fetch_retry_irq(&rxq_stats->syncp, start)); } /** diff --git a/drivers/net/ethernet/huawei/hinic/hinic_tx.c b/drivers/net/ethernet/huawei/hinic/hinic_tx.c index 5051cdff2384..3b6c7b585737 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_tx.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_tx.c @@ -99,14 +99,14 @@ void hinic_txq_get_stats(struct hinic_txq *txq, struct hinic_txq_stats *stats) unsigned int start; do { - start = u64_stats_fetch_begin(&txq_stats->syncp); + start = u64_stats_fetch_begin_irq(&txq_stats->syncp); stats->pkts = txq_stats->pkts; stats->bytes = txq_stats->bytes; stats->tx_busy = txq_stats->tx_busy; stats->tx_wake = txq_stats->tx_wake; stats->tx_dropped = txq_stats->tx_dropped; stats->big_frags_pkts = txq_stats->big_frags_pkts; - } while (u64_stats_fetch_retry(&txq_stats->syncp, start)); + } while (u64_stats_fetch_retry_irq(&txq_stats->syncp, start)); } /** diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index cf4d6f1129fa..349a2b1a19a2 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1630,21 +1630,21 @@ static void nfp_net_stat64(struct net_device *netdev, unsigned int start; do { - start = u64_stats_fetch_begin(&r_vec->rx_sync); + start = u64_stats_fetch_begin_irq(&r_vec->rx_sync); data[0] = r_vec->rx_pkts; data[1] = r_vec->rx_bytes; data[2] = r_vec->rx_drops; - } while (u64_stats_fetch_retry(&r_vec->rx_sync, start)); + } while (u64_stats_fetch_retry_irq(&r_vec->rx_sync, start)); stats->rx_packets += data[0]; stats->rx_bytes += data[1]; stats->rx_dropped += data[2]; do { - start = u64_stats_fetch_begin(&r_vec->tx_sync); + start = u64_stats_fetch_begin_irq(&r_vec->tx_sync); data[0] = r_vec->tx_pkts; data[1] = r_vec->tx_bytes; data[2] = r_vec->tx_errors; - } while (u64_stats_fetch_retry(&r_vec->tx_sync, start)); + } while (u64_stats_fetch_retry_irq(&r_vec->tx_sync, start)); stats->tx_packets += data[0]; stats->tx_bytes += data[1]; stats->tx_errors += data[2]; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c index eeb1455a4e5d..b1b1b648e40c 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c @@ -649,7 +649,7 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data) unsigned int start; do { - start = u64_stats_fetch_begin(&nn->r_vecs[i].rx_sync); + start = u64_stats_fetch_begin_irq(&nn->r_vecs[i].rx_sync); data[0] = nn->r_vecs[i].rx_pkts; tmp[0] = nn->r_vecs[i].hw_csum_rx_ok; tmp[1] = nn->r_vecs[i].hw_csum_rx_inner_ok; @@ -657,10 +657,10 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data) tmp[3] = nn->r_vecs[i].hw_csum_rx_error; tmp[4] = nn->r_vecs[i].rx_replace_buf_alloc_fail; tmp[5] = nn->r_vecs[i].hw_tls_rx; - } while (u64_stats_fetch_retry(&nn->r_vecs[i].rx_sync, start)); + } while (u64_stats_fetch_retry_irq(&nn->r_vecs[i].rx_sync, start)); do { - start = u64_stats_fetch_begin(&nn->r_vecs[i].tx_sync); + start = u64_stats_fetch_begin_irq(&nn->r_vecs[i].tx_sync); data[1] = nn->r_vecs[i].tx_pkts; data[2] = nn->r_vecs[i].tx_busy; tmp[6] = nn->r_vecs[i].hw_csum_tx; @@ -670,7 +670,7 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data) tmp[10] = nn->r_vecs[i].hw_tls_tx; tmp[11] = nn->r_vecs[i].tls_tx_fallback; tmp[12] = nn->r_vecs[i].tls_tx_no_fallback; - } while (u64_stats_fetch_retry(&nn->r_vecs[i].tx_sync, start)); + } while (u64_stats_fetch_retry_irq(&nn->r_vecs[i].tx_sync, start)); data += NN_RVEC_PER_Q_STATS; diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index e470e3398abc..9a1a5b203624 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -67,10 +67,10 @@ nsim_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) unsigned int start; do { - start = u64_stats_fetch_begin(&ns->syncp); + start = u64_stats_fetch_begin_irq(&ns->syncp); stats->tx_bytes = ns->tx_bytes; stats->tx_packets = ns->tx_packets; - } while (u64_stats_fetch_retry(&ns->syncp, start)); + } while (u64_stats_fetch_retry_irq(&ns->syncp, start)); } static int diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 330dab41f2fe..58998d821778 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2316,9 +2316,9 @@ static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats, u64 value; do { - start = u64_stats_fetch_begin(&rxstats->syncp); + start = u64_stats_fetch_begin_irq(&rxstats->syncp); value = rxstats->msdu[tid]; - } while (u64_stats_fetch_retry(&rxstats->syncp, start)); + } while (u64_stats_fetch_retry_irq(&rxstats->syncp, start)); return value; } @@ -2384,9 +2384,9 @@ static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats) u64 value; do { - start = u64_stats_fetch_begin(&rxstats->syncp); + start = u64_stats_fetch_begin_irq(&rxstats->syncp); value = rxstats->bytes; - } while (u64_stats_fetch_retry(&rxstats->syncp, start)); + } while (u64_stats_fetch_retry_irq(&rxstats->syncp, start)); return value; } diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 35b5f806fdda..b52afe316dc4 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1079,9 +1079,9 @@ static void mpls_get_stats(struct mpls_dev *mdev, p = per_cpu_ptr(mdev->stats, i); do { - start = u64_stats_fetch_begin(&p->syncp); + start = u64_stats_fetch_begin_irq(&p->syncp); local = p->stats; - } while (u64_stats_fetch_retry(&p->syncp, start)); + } while (u64_stats_fetch_retry_irq(&p->syncp, start)); stats->rx_packets += local.rx_packets; stats->rx_bytes += local.rx_bytes; From b05972f01e7d30419987a1f221b5593668fd6448 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Fri, 26 Aug 2022 09:39:30 +0800 Subject: [PATCH 39/56] net: sched: tbf: don't call qdisc_put() while holding tree lock The issue is the same to commit c2999f7fb05b ("net: sched: multiq: don't call qdisc_put() while holding tree lock"). Qdiscs call qdisc_put() while holding sch tree spinlock, which results sleeping-while-atomic BUG. Fixes: c266f64dbfa2 ("net: sched: protect block state with mutex") Signed-off-by: Zhengchao Shao Link: https://lore.kernel.org/r/20220826013930.340121-1-shaozhengchao@huawei.com Signed-off-by: Paolo Abeni --- net/sched/sch_tbf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 72102277449e..36079fdde2cb 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -356,6 +356,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, struct nlattr *tb[TCA_TBF_MAX + 1]; struct tc_tbf_qopt *qopt; struct Qdisc *child = NULL; + struct Qdisc *old = NULL; struct psched_ratecfg rate; struct psched_ratecfg peak; u64 max_size; @@ -447,7 +448,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); if (child) { qdisc_tree_flush_backlog(q->qdisc); - qdisc_put(q->qdisc); + old = q->qdisc; q->qdisc = child; } q->limit = qopt->limit; @@ -467,6 +468,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg)); sch_tree_unlock(sch); + qdisc_put(old); err = 0; tbf_offload_change(sch); From f612466ebecb12a00d9152344ddda6f6345f04dc Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Fri, 26 Aug 2022 17:00:55 +0800 Subject: [PATCH 40/56] net/sched: fix netdevice reference leaks in attach_default_qdiscs() In attach_default_qdiscs(), if a dev has multiple queues and queue 0 fails to attach qdisc because there is no memory in attach_one_default_qdisc(). Then dev->qdisc will be noop_qdisc by default. But the other queues may be able to successfully attach to default qdisc. In this case, the fallback to noqueue process will be triggered. If the original attached qdisc is not released and a new one is directly attached, this will cause netdevice reference leaks. The following is the bug log: veth0: default qdisc (fq_codel) fail, fallback to noqueue unregister_netdevice: waiting for veth0 to become free. Usage count = 32 leaked reference. qdisc_alloc+0x12e/0x210 qdisc_create_dflt+0x62/0x140 attach_one_default_qdisc.constprop.41+0x44/0x70 dev_activate+0x128/0x290 __dev_open+0x12a/0x190 __dev_change_flags+0x1a2/0x1f0 dev_change_flags+0x23/0x60 do_setlink+0x332/0x1150 __rtnl_newlink+0x52f/0x8e0 rtnl_newlink+0x43/0x70 rtnetlink_rcv_msg+0x140/0x3b0 netlink_rcv_skb+0x50/0x100 netlink_unicast+0x1bb/0x290 netlink_sendmsg+0x37c/0x4e0 sock_sendmsg+0x5f/0x70 ____sys_sendmsg+0x208/0x280 Fix this bug by clearing any non-noop qdiscs that may have been assigned before trying to re-attach. Fixes: bf6dba76d278 ("net: sched: fallback to qdisc noqueue if default qdisc setup fail") Signed-off-by: Wang Hai Link: https://lore.kernel.org/r/20220826090055.24424-1-wanghai38@huawei.com Signed-off-by: Paolo Abeni --- net/sched/sch_generic.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 99b697ad2b98..7a8ea03f673d 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1122,6 +1122,21 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, } EXPORT_SYMBOL(dev_graft_qdisc); +static void shutdown_scheduler_queue(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_qdisc_default) +{ + struct Qdisc *qdisc = dev_queue->qdisc_sleeping; + struct Qdisc *qdisc_default = _qdisc_default; + + if (qdisc) { + rcu_assign_pointer(dev_queue->qdisc, qdisc_default); + dev_queue->qdisc_sleeping = qdisc_default; + + qdisc_put(qdisc); + } +} + static void attach_one_default_qdisc(struct net_device *dev, struct netdev_queue *dev_queue, void *_unused) @@ -1169,6 +1184,7 @@ static void attach_default_qdiscs(struct net_device *dev) if (qdisc == &noop_qdisc) { netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n", default_qdisc_ops->id, noqueue_qdisc_ops.id); + netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); dev->priv_flags |= IFF_NO_QUEUE; netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); qdisc = txq->qdisc_sleeping; @@ -1447,21 +1463,6 @@ void dev_init_scheduler(struct net_device *dev) timer_setup(&dev->watchdog_timer, dev_watchdog, 0); } -static void shutdown_scheduler_queue(struct net_device *dev, - struct netdev_queue *dev_queue, - void *_qdisc_default) -{ - struct Qdisc *qdisc = dev_queue->qdisc_sleeping; - struct Qdisc *qdisc_default = _qdisc_default; - - if (qdisc) { - rcu_assign_pointer(dev_queue->qdisc, qdisc_default); - dev_queue->qdisc_sleeping = qdisc_default; - - qdisc_put(qdisc); - } -} - void dev_shutdown(struct net_device *dev) { netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); From fce1c23f629173e0db78b79a74f2052044a00e65 Mon Sep 17 00:00:00 2001 From: Alvaro Karsz Date: Tue, 23 Aug 2022 10:39:47 +0300 Subject: [PATCH 41/56] net: virtio_net: fix notification coalescing comments Fix wording in comments for the notifications coalescing feature. Signed-off-by: Alvaro Karsz Acked-by: Michael S. Tsirkin Link: https://lore.kernel.org/r/20220823073947.14774-1-alvaro.karsz@solid-run.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/virtio_net.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 29ced55514d4..6cb842ea8979 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -56,7 +56,7 @@ #define VIRTIO_NET_F_MQ 22 /* Device supports Receive Flow * Steering */ #define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */ -#define VIRTIO_NET_F_NOTF_COAL 53 /* Guest can handle notifications coalescing */ +#define VIRTIO_NET_F_NOTF_COAL 53 /* Device supports notifications coalescing */ #define VIRTIO_NET_F_HASH_REPORT 57 /* Supports hash report */ #define VIRTIO_NET_F_RSS 60 /* Supports RSS RX steering */ #define VIRTIO_NET_F_RSC_EXT 61 /* extended coalescing info */ @@ -364,24 +364,24 @@ struct virtio_net_hash_config { */ #define VIRTIO_NET_CTRL_NOTF_COAL 6 /* - * Set the tx-usecs/tx-max-packets patameters. - * tx-usecs - Maximum number of usecs to delay a TX notification. - * tx-max-packets - Maximum number of packets to send before a TX notification. + * Set the tx-usecs/tx-max-packets parameters. */ struct virtio_net_ctrl_coal_tx { + /* Maximum number of packets to send before a TX notification */ __le32 tx_max_packets; + /* Maximum number of usecs to delay a TX notification */ __le32 tx_usecs; }; #define VIRTIO_NET_CTRL_NOTF_COAL_TX_SET 0 /* - * Set the rx-usecs/rx-max-packets patameters. - * rx-usecs - Maximum number of usecs to delay a RX notification. - * rx-max-frames - Maximum number of packets to receive before a RX notification. + * Set the rx-usecs/rx-max-packets parameters. */ struct virtio_net_ctrl_coal_rx { + /* Maximum number of packets to receive before a RX notification */ __le32 rx_max_packets; + /* Maximum number of usecs to delay a RX notification */ __le32 rx_usecs; }; From 4a4ce82212ef014d70f486a427005b2b5bab8e34 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 30 Aug 2022 08:40:55 +0200 Subject: [PATCH 42/56] net: phy: micrel: Make the GPIO to be non-exclusive The same GPIO line can be shared by multiple phys for the coma mode pin. If that is the case then, all the other phys that share the same line will failed to be probed because the access to the gpio line is not non-exclusive. Fix this by making access to the gpio line to be nonexclusive using flag GPIOD_FLAGS_BIT_NONEXCLUSIVE. This allows all the other PHYs to be probed. Fixes: 738871b09250ee ("net: phy: micrel: add coma mode GPIO") Reviewed-by: Andrew Lunn Signed-off-by: Horatiu Vultur Link: https://lore.kernel.org/r/20220830064055.2340403-1-horatiu.vultur@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/micrel.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index e78d0bf69bc3..6f52b4fb6888 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -2873,12 +2873,18 @@ static int lan8814_config_init(struct phy_device *phydev) return 0; } +/* It is expected that there will not be any 'lan8814_take_coma_mode' + * function called in suspend. Because the GPIO line can be shared, so if one of + * the phys goes back in coma mode, then all the other PHYs will go, which is + * wrong. + */ static int lan8814_release_coma_mode(struct phy_device *phydev) { struct gpio_desc *gpiod; gpiod = devm_gpiod_get_optional(&phydev->mdio.dev, "coma-mode", - GPIOD_OUT_HIGH_OPEN_DRAIN); + GPIOD_OUT_HIGH_OPEN_DRAIN | + GPIOD_FLAGS_BIT_NONEXCLUSIVE); if (IS_ERR(gpiod)) return PTR_ERR(gpiod); From 642b2122c5df332a6df52dd1f91e552bc4356951 Mon Sep 17 00:00:00 2001 From: Gao Xiao Date: Mon, 29 Aug 2022 12:16:51 +0200 Subject: [PATCH 43/56] nfp: fix the access to management firmware hanging When running `ethtool -p` with the old management firmware, the management firmware resource is not correctly released, which causes firmware related malfunction: all the access to management firmware hangs. It releases the management firmware resource when set id mode operation is not supported. Fixes: ccb9bc1dfa44 ("nfp: add 'ethtool --identify' support") Signed-off-by: Gao Xiao Reviewed-by: Louis Peens Signed-off-by: Simon Horman Reviewed-by: Jesse Brandeburg Link: https://lore.kernel.org/r/20220829101651.633840-1-simon.horman@corigine.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c index edd300033735..4cc38799eabc 100644 --- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c +++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c @@ -507,6 +507,7 @@ int nfp_eth_set_idmode(struct nfp_cpp *cpp, unsigned int idx, bool state) if (nfp_nsp_get_abi_ver_minor(nsp) < 32) { nfp_err(nfp_nsp_cpp(nsp), "set id mode operation not supported, please update flash\n"); + nfp_eth_config_cleanup_end(nsp); return -EOPNOTSUPP; } From 13a9d08c296228d18289de60b83792c586e1d073 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 26 Aug 2022 18:00:30 +0300 Subject: [PATCH 44/56] net: lan966x: improve error handle in lan966x_fdma_rx_get_frame() Don't just print a warning. Clean up and return an error as well. Fixes: c8349639324a ("net: lan966x: Add FDMA functionality") Signed-off-by: Dan Carpenter Reviewed-by: Horatiu Vultur Link: https://lore.kernel.org/r/YwjgDm/SVd5c1tQU@kili Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c index 6dea7f8c1481..51f8a0816377 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c @@ -425,7 +425,8 @@ static struct sk_buff *lan966x_fdma_rx_get_frame(struct lan966x_rx *rx) lan966x_ifh_get_src_port(skb->data, &src_port); lan966x_ifh_get_timestamp(skb->data, ×tamp); - WARN_ON(src_port >= lan966x->num_phys_ports); + if (WARN_ON(src_port >= lan966x->num_phys_ports)) + goto free_skb; skb->dev = lan966x->ports[src_port]->dev; skb_pull(skb, IFH_LEN * sizeof(u32)); @@ -449,6 +450,8 @@ static struct sk_buff *lan966x_fdma_rx_get_frame(struct lan966x_rx *rx) return skb; +free_skb: + kfree_skb(skb); unmap_page: dma_unmap_page(lan966x->dev, (dma_addr_t)db->dataptr, FDMA_DCB_STATUS_BLOCKL(db->status), From c0955bf957be4bead01fae1d791476260da7325d Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Sat, 27 Aug 2022 23:38:15 +0800 Subject: [PATCH 45/56] ethernet: rocker: fix sleep in atomic context bug in neigh_timer_handler The function neigh_timer_handler() is a timer handler that runs in an atomic context. When used by rocker, neigh_timer_handler() calls "kzalloc(.., GFP_KERNEL)" that may sleep. As a result, the sleep in atomic context bug will happen. One of the processes is shown below: ofdpa_fib4_add() ... neigh_add_timer() (wait a timer) neigh_timer_handler() neigh_release() neigh_destroy() rocker_port_neigh_destroy() rocker_world_port_neigh_destroy() ofdpa_port_neigh_destroy() ofdpa_port_ipv4_neigh() kzalloc(sizeof(.., GFP_KERNEL) //may sleep This patch changes the gfp_t parameter of kzalloc() from GFP_KERNEL to GFP_ATOMIC in order to mitigate the bug. Fixes: 00fc0c51e35b ("rocker: Change world_ops API and implementation to be switchdev independant") Signed-off-by: Duoming Zhou Signed-off-by: David S. Miller --- drivers/net/ethernet/rocker/rocker_ofdpa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c index bc70c6abd6a5..58cf7cc54f40 100644 --- a/drivers/net/ethernet/rocker/rocker_ofdpa.c +++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c @@ -1273,7 +1273,7 @@ static int ofdpa_port_ipv4_neigh(struct ofdpa_port *ofdpa_port, bool removing; int err = 0; - entry = kzalloc(sizeof(*entry), GFP_KERNEL); + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return -ENOMEM; From 3a1a274e933fca73fdc960cb1f60636cd285a265 Mon Sep 17 00:00:00 2001 From: David Thompson Date: Fri, 26 Aug 2022 11:59:16 -0400 Subject: [PATCH 46/56] mlxbf_gige: compute MDIO period based on i1clk This patch adds logic to compute the MDIO period based on the i1clk, and thereafter write the MDIO period into the YU MDIO config register. The i1clk resource from the ACPI table is used to provide addressing to YU bootrecord PLL registers. The values in these registers are used to compute MDIO period. If the i1clk resource is not present in the ACPI table, then the current default hardcorded value of 430Mhz is used. The i1clk clock value of 430MHz is only accurate for boards with BF2 mid bin and main bin SoCs. The BF2 high bin SoCs have i1clk = 500MHz, but can support a slower MDIO period. Fixes: f92e1869d74e ("Add Mellanox BlueField Gigabit Ethernet driver") Reviewed-by: Asmaa Mnebhi Signed-off-by: David Thompson Link: https://lore.kernel.org/r/20220826155916.12491-1-davthompson@nvidia.com Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlxbf_gige/mlxbf_gige.h | 4 +- .../mellanox/mlxbf_gige/mlxbf_gige_mdio.c | 128 +++++++++++++++--- .../mellanox/mlxbf_gige/mlxbf_gige_regs.h | 2 + 3 files changed, 113 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige.h b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige.h index 5fdf9b7179f5..5a1027b07215 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige.h +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige.h @@ -75,6 +75,7 @@ struct mlxbf_gige { struct net_device *netdev; struct platform_device *pdev; void __iomem *mdio_io; + void __iomem *clk_io; struct mii_bus *mdiobus; spinlock_t lock; /* for packet processing indices */ u16 rx_q_entries; @@ -137,7 +138,8 @@ enum mlxbf_gige_res { MLXBF_GIGE_RES_MDIO9, MLXBF_GIGE_RES_GPIO0, MLXBF_GIGE_RES_LLU, - MLXBF_GIGE_RES_PLU + MLXBF_GIGE_RES_PLU, + MLXBF_GIGE_RES_CLK }; /* Version of register data returned by mlxbf_gige_get_regs() */ diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_mdio.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_mdio.c index 2e6c1b7af096..85155cd9405c 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_mdio.c +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_mdio.c @@ -22,10 +22,23 @@ #include #include "mlxbf_gige.h" +#include "mlxbf_gige_regs.h" #define MLXBF_GIGE_MDIO_GW_OFFSET 0x0 #define MLXBF_GIGE_MDIO_CFG_OFFSET 0x4 +#define MLXBF_GIGE_MDIO_FREQ_REFERENCE 156250000ULL +#define MLXBF_GIGE_MDIO_COREPLL_CONST 16384ULL +#define MLXBF_GIGE_MDC_CLK_NS 400 +#define MLXBF_GIGE_MDIO_PLL_I1CLK_REG1 0x4 +#define MLXBF_GIGE_MDIO_PLL_I1CLK_REG2 0x8 +#define MLXBF_GIGE_MDIO_CORE_F_SHIFT 0 +#define MLXBF_GIGE_MDIO_CORE_F_MASK GENMASK(25, 0) +#define MLXBF_GIGE_MDIO_CORE_R_SHIFT 26 +#define MLXBF_GIGE_MDIO_CORE_R_MASK GENMASK(31, 26) +#define MLXBF_GIGE_MDIO_CORE_OD_SHIFT 0 +#define MLXBF_GIGE_MDIO_CORE_OD_MASK GENMASK(3, 0) + /* Support clause 22 */ #define MLXBF_GIGE_MDIO_CL22_ST1 0x1 #define MLXBF_GIGE_MDIO_CL22_WRITE 0x1 @@ -50,28 +63,77 @@ #define MLXBF_GIGE_MDIO_CFG_MDIO_IN_SAMP_MASK GENMASK(23, 16) #define MLXBF_GIGE_MDIO_CFG_MDIO_OUT_SAMP_MASK GENMASK(31, 24) -/* Formula for encoding the MDIO period. The encoded value is - * passed to the MDIO config register. - * - * mdc_clk = 2*(val + 1)*i1clk - * - * 400 ns = 2*(val + 1)*(((1/430)*1000) ns) - * - * val = (((400 * 430 / 1000) / 2) - 1) - */ -#define MLXBF_GIGE_I1CLK_MHZ 430 -#define MLXBF_GIGE_MDC_CLK_NS 400 - -#define MLXBF_GIGE_MDIO_PERIOD (((MLXBF_GIGE_MDC_CLK_NS * MLXBF_GIGE_I1CLK_MHZ / 1000) / 2) - 1) - #define MLXBF_GIGE_MDIO_CFG_VAL (FIELD_PREP(MLXBF_GIGE_MDIO_CFG_MDIO_MODE_MASK, 1) | \ FIELD_PREP(MLXBF_GIGE_MDIO_CFG_MDIO3_3_MASK, 1) | \ FIELD_PREP(MLXBF_GIGE_MDIO_CFG_MDIO_FULL_DRIVE_MASK, 1) | \ - FIELD_PREP(MLXBF_GIGE_MDIO_CFG_MDC_PERIOD_MASK, \ - MLXBF_GIGE_MDIO_PERIOD) | \ FIELD_PREP(MLXBF_GIGE_MDIO_CFG_MDIO_IN_SAMP_MASK, 6) | \ FIELD_PREP(MLXBF_GIGE_MDIO_CFG_MDIO_OUT_SAMP_MASK, 13)) +#define MLXBF_GIGE_BF2_COREPLL_ADDR 0x02800c30 +#define MLXBF_GIGE_BF2_COREPLL_SIZE 0x0000000c + +static struct resource corepll_params[] = { + [MLXBF_GIGE_VERSION_BF2] = { + .start = MLXBF_GIGE_BF2_COREPLL_ADDR, + .end = MLXBF_GIGE_BF2_COREPLL_ADDR + MLXBF_GIGE_BF2_COREPLL_SIZE - 1, + .name = "COREPLL_RES" + }, +}; + +/* Returns core clock i1clk in Hz */ +static u64 calculate_i1clk(struct mlxbf_gige *priv) +{ + u8 core_od, core_r; + u64 freq_output; + u32 reg1, reg2; + u32 core_f; + + reg1 = readl(priv->clk_io + MLXBF_GIGE_MDIO_PLL_I1CLK_REG1); + reg2 = readl(priv->clk_io + MLXBF_GIGE_MDIO_PLL_I1CLK_REG2); + + core_f = (reg1 & MLXBF_GIGE_MDIO_CORE_F_MASK) >> + MLXBF_GIGE_MDIO_CORE_F_SHIFT; + core_r = (reg1 & MLXBF_GIGE_MDIO_CORE_R_MASK) >> + MLXBF_GIGE_MDIO_CORE_R_SHIFT; + core_od = (reg2 & MLXBF_GIGE_MDIO_CORE_OD_MASK) >> + MLXBF_GIGE_MDIO_CORE_OD_SHIFT; + + /* Compute PLL output frequency as follow: + * + * CORE_F / 16384 + * freq_output = freq_reference * ---------------------------- + * (CORE_R + 1) * (CORE_OD + 1) + */ + freq_output = div_u64((MLXBF_GIGE_MDIO_FREQ_REFERENCE * core_f), + MLXBF_GIGE_MDIO_COREPLL_CONST); + freq_output = div_u64(freq_output, (core_r + 1) * (core_od + 1)); + + return freq_output; +} + +/* Formula for encoding the MDIO period. The encoded value is + * passed to the MDIO config register. + * + * mdc_clk = 2*(val + 1)*(core clock in sec) + * + * i1clk is in Hz: + * 400 ns = 2*(val + 1)*(1/i1clk) + * + * val = (((400/10^9) / (1/i1clk) / 2) - 1) + * val = (400/2 * i1clk)/10^9 - 1 + */ +static u8 mdio_period_map(struct mlxbf_gige *priv) +{ + u8 mdio_period; + u64 i1clk; + + i1clk = calculate_i1clk(priv); + + mdio_period = div_u64((MLXBF_GIGE_MDC_CLK_NS >> 1) * i1clk, 1000000000) - 1; + + return mdio_period; +} + static u32 mlxbf_gige_mdio_create_cmd(u16 data, int phy_add, int phy_reg, u32 opcode) { @@ -124,9 +186,9 @@ static int mlxbf_gige_mdio_write(struct mii_bus *bus, int phy_add, int phy_reg, u16 val) { struct mlxbf_gige *priv = bus->priv; + u32 temp; u32 cmd; int ret; - u32 temp; if (phy_reg & MII_ADDR_C45) return -EOPNOTSUPP; @@ -144,18 +206,44 @@ static int mlxbf_gige_mdio_write(struct mii_bus *bus, int phy_add, return ret; } +static void mlxbf_gige_mdio_cfg(struct mlxbf_gige *priv) +{ + u8 mdio_period; + u32 val; + + mdio_period = mdio_period_map(priv); + + val = MLXBF_GIGE_MDIO_CFG_VAL; + val |= FIELD_PREP(MLXBF_GIGE_MDIO_CFG_MDC_PERIOD_MASK, mdio_period); + writel(val, priv->mdio_io + MLXBF_GIGE_MDIO_CFG_OFFSET); +} + int mlxbf_gige_mdio_probe(struct platform_device *pdev, struct mlxbf_gige *priv) { struct device *dev = &pdev->dev; + struct resource *res; int ret; priv->mdio_io = devm_platform_ioremap_resource(pdev, MLXBF_GIGE_RES_MDIO9); if (IS_ERR(priv->mdio_io)) return PTR_ERR(priv->mdio_io); - /* Configure mdio parameters */ - writel(MLXBF_GIGE_MDIO_CFG_VAL, - priv->mdio_io + MLXBF_GIGE_MDIO_CFG_OFFSET); + /* clk resource shared with other drivers so cannot use + * devm_platform_ioremap_resource + */ + res = platform_get_resource(pdev, IORESOURCE_MEM, MLXBF_GIGE_RES_CLK); + if (!res) { + /* For backward compatibility with older ACPI tables, also keep + * CLK resource internal to the driver. + */ + res = &corepll_params[MLXBF_GIGE_VERSION_BF2]; + } + + priv->clk_io = devm_ioremap(dev, res->start, resource_size(res)); + if (IS_ERR(priv->clk_io)) + return PTR_ERR(priv->clk_io); + + mlxbf_gige_mdio_cfg(priv); priv->mdiobus = devm_mdiobus_alloc(dev); if (!priv->mdiobus) { diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_regs.h b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_regs.h index 5fb33c9294bf..7be3a793984d 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_regs.h +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_regs.h @@ -8,6 +8,8 @@ #ifndef __MLXBF_GIGE_REGS_H__ #define __MLXBF_GIGE_REGS_H__ +#define MLXBF_GIGE_VERSION 0x0000 +#define MLXBF_GIGE_VERSION_BF2 0x0 #define MLXBF_GIGE_STATUS 0x0010 #define MLXBF_GIGE_STATUS_READY BIT(0) #define MLXBF_GIGE_INT_STATUS 0x0028 From 8fc29ff3910f3af08a7c40a75d436b5720efe2bf Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 27 Aug 2022 11:13:14 -0700 Subject: [PATCH 47/56] kcm: fix strp_init() order and cleanup strp_init() is called just a few lines above this csk->sk_user_data check, it also initializes strp->work etc., therefore, it is unnecessary to call strp_done() to cancel the freshly initialized work. And if sk_user_data is already used by KCM, psock->strp should not be touched, particularly strp->work state, so we need to move strp_init() after the csk->sk_user_data check. This also makes a lockdep warning reported by syzbot go away. Reported-and-tested-by: syzbot+9fc084a4348493ef65d2@syzkaller.appspotmail.com Reported-by: syzbot+e696806ef96cdd2d87cd@syzkaller.appspotmail.com Fixes: e5571240236c ("kcm: Check if sk_user_data already set in kcm_attach") Fixes: dff8baa26117 ("kcm: Call strp_stop before strp_done in kcm_attach") Cc: Tom Herbert Signed-off-by: Cong Wang Link: https://lore.kernel.org/r/20220827181314.193710-1-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- net/kcm/kcmsock.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 71899e5a5a11..1215c863e1c4 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1412,12 +1412,6 @@ static int kcm_attach(struct socket *sock, struct socket *csock, psock->sk = csk; psock->bpf_prog = prog; - err = strp_init(&psock->strp, csk, &cb); - if (err) { - kmem_cache_free(kcm_psockp, psock); - goto out; - } - write_lock_bh(&csk->sk_callback_lock); /* Check if sk_user_data is already by KCM or someone else. @@ -1425,13 +1419,18 @@ static int kcm_attach(struct socket *sock, struct socket *csock, */ if (csk->sk_user_data) { write_unlock_bh(&csk->sk_callback_lock); - strp_stop(&psock->strp); - strp_done(&psock->strp); kmem_cache_free(kcm_psockp, psock); err = -EALREADY; goto out; } + err = strp_init(&psock->strp, csk, &cb); + if (err) { + write_unlock_bh(&csk->sk_callback_lock); + kmem_cache_free(kcm_psockp, psock); + goto out; + } + psock->save_data_ready = csk->sk_data_ready; psock->save_write_space = csk->sk_write_space; psock->save_state_change = csk->sk_state_change; From 404a5ad72011f5bd2bb90f0a035be7635e2bd839 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 29 Aug 2022 16:54:14 -0700 Subject: [PATCH 48/56] Documentation: networking: correct possessive "its" Change occurrences of "it's" that are possessive to "its" so that they don't read as "it is". Signed-off-by: Randy Dunlap Cc: Jonathan Corbet Cc: Eric Dumazet Cc: Paolo Abeni Cc: Jiri Pirko Link: https://lore.kernel.org/r/20220829235414.17110-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/netdevsim.rst | 2 +- Documentation/networking/driver.rst | 2 +- Documentation/networking/ipvlan.rst | 2 +- Documentation/networking/l2tp.rst | 2 +- Documentation/networking/switchdev.rst | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/networking/devlink/netdevsim.rst b/Documentation/networking/devlink/netdevsim.rst index 8a292fb5aaea..ec5e6d79b2e2 100644 --- a/Documentation/networking/devlink/netdevsim.rst +++ b/Documentation/networking/devlink/netdevsim.rst @@ -67,7 +67,7 @@ The ``netdevsim`` driver supports rate objects management, which includes: - setting tx_share and tx_max rate values for any rate object type; - setting parent node for any rate object type. -Rate nodes and it's parameters are exposed in ``netdevsim`` debugfs in RO mode. +Rate nodes and their parameters are exposed in ``netdevsim`` debugfs in RO mode. For example created rate node with name ``some_group``: .. code:: shell diff --git a/Documentation/networking/driver.rst b/Documentation/networking/driver.rst index c8f59dbda46f..64f7236ff10b 100644 --- a/Documentation/networking/driver.rst +++ b/Documentation/networking/driver.rst @@ -8,7 +8,7 @@ Transmit path guidelines: 1) The ndo_start_xmit method must not return NETDEV_TX_BUSY under any normal circumstances. It is considered a hard error unless - there is no way your device can tell ahead of time when it's + there is no way your device can tell ahead of time when its transmit function will become busy. Instead it must maintain the queue properly. For example, diff --git a/Documentation/networking/ipvlan.rst b/Documentation/networking/ipvlan.rst index 694adcba36b0..0000c1d383bc 100644 --- a/Documentation/networking/ipvlan.rst +++ b/Documentation/networking/ipvlan.rst @@ -11,7 +11,7 @@ Initial Release: ================ This is conceptually very similar to the macvlan driver with one major exception of using L3 for mux-ing /demux-ing among slaves. This property makes -the master device share the L2 with it's slave devices. I have developed this +the master device share the L2 with its slave devices. I have developed this driver in conjunction with network namespaces and not sure if there is use case outside of it. diff --git a/Documentation/networking/l2tp.rst b/Documentation/networking/l2tp.rst index 498b382d25a0..7f383e99dbad 100644 --- a/Documentation/networking/l2tp.rst +++ b/Documentation/networking/l2tp.rst @@ -530,7 +530,7 @@ its tunnel close actions. For L2TPIP sockets, the socket's close handler initiates the same tunnel close actions. All sessions are first closed. Each session drops its tunnel ref. When the tunnel ref reaches zero, the tunnel puts its socket ref. When the socket is -eventually destroyed, it's sk_destruct finally frees the L2TP tunnel +eventually destroyed, its sk_destruct finally frees the L2TP tunnel context. Sessions diff --git a/Documentation/networking/switchdev.rst b/Documentation/networking/switchdev.rst index f1f4e6a85a29..bbf272e9d607 100644 --- a/Documentation/networking/switchdev.rst +++ b/Documentation/networking/switchdev.rst @@ -159,7 +159,7 @@ tools such as iproute2. The switchdev driver can know a particular port's position in the topology by monitoring NETDEV_CHANGEUPPER notifications. For example, a port moved into a -bond will see it's upper master change. If that bond is moved into a bridge, +bond will see its upper master change. If that bond is moved into a bridge, the bond's upper master will change. And so on. The driver will track such movements to know what position a port is in in the overall topology by registering for netdevice events and acting on NETDEV_CHANGEUPPER. From 5a3a59981027b53ec0f729ad76a43ce2b64ad968 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Mon, 29 Aug 2022 11:47:48 -0700 Subject: [PATCH 49/56] selftests: net: sort .gitignore file This is the result of `sort tools/testing/selftests/net/.gitignore`, but preserving the comment at the top. Suggested-by: Jakub Kicinski Signed-off-by: Axel Rasmussen Link: https://lore.kernel.org/r/20220829184748.1535580-1-axelrasmussen@google.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/.gitignore | 52 +++++++++++++------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 0e5751af6247..de7d5cc15f85 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -1,42 +1,42 @@ # SPDX-License-Identifier: GPL-2.0-only +cmsg_sender +fin_ack_lat +gro +hwtstamp_config +ioam6_parser +ip_defrag ipsec +ipv6_flowlabel +ipv6_flowlabel_mgr msg_zerocopy -socket +nettest psock_fanout psock_snd psock_tpacket -stress_reuseport_listen +reuseaddr_conflict +reuseaddr_ports_exhausted reuseport_addr_any reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa reuseport_dualstack -reuseaddr_conflict +rxtimestamp +socket +so_netns_cookie +so_txtime +stress_reuseport_listen +tap +tcp_fastopen_backup_key +tcp_inq tcp_mmap +test_unix_oob +timestamping +tls +toeplitz +tun +txring_overwrite +txtimestamp udpgso udpgso_bench_rx udpgso_bench_tx -tcp_inq -tls -txring_overwrite -ip_defrag -ipv6_flowlabel -ipv6_flowlabel_mgr -so_txtime -tcp_fastopen_backup_key -nettest -fin_ack_lat -reuseaddr_ports_exhausted -hwtstamp_config -rxtimestamp -timestamping -txtimestamp -so_netns_cookie -test_unix_oob -gro -ioam6_parser -toeplitz -tun -cmsg_sender unix_connect -tap \ No newline at end of file From 90fabae8a2c225c4e4936723c38857887edde5cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 31 Aug 2022 11:21:03 +0200 Subject: [PATCH 50/56] sch_cake: Return __NET_XMIT_STOLEN when consuming enqueued skb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the GSO splitting feature of sch_cake is enabled, GSO superpackets will be broken up and the resulting segments enqueued in place of the original skb. In this case, CAKE calls consume_skb() on the original skb, but still returns NET_XMIT_SUCCESS. This can confuse parent qdiscs into assuming the original skb still exists, when it really has been freed. Fix this by adding the __NET_XMIT_STOLEN flag to the return value in this case. Fixes: 0c850344d388 ("sch_cake: Conditionally split GSO segments") Signed-off-by: Toke Høiland-Jørgensen Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-18231 Link: https://lore.kernel.org/r/20220831092103.442868-1-toke@toke.dk Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index a43a58a73d09..a04928082e4a 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1713,6 +1713,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, } idx--; flow = &b->flows[idx]; + ret = NET_XMIT_SUCCESS; /* ensure shaper state isn't stale */ if (!b->tin_backlog) { @@ -1771,6 +1772,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen); consume_skb(skb); + ret |= __NET_XMIT_STOLEN; } else { /* not splitting */ cobalt_set_enqueue_time(skb, now); @@ -1904,7 +1906,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, } b->drop_overlimit += dropped; } - return NET_XMIT_SUCCESS; + return ret; } static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) From eb55dc09b5dd040232d5de32812cc83001a23da6 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 29 Aug 2022 12:01:21 +0200 Subject: [PATCH 51/56] ip: fix triggering of 'icmp redirect' __mkroute_input() uses fib_validate_source() to trigger an icmp redirect. My understanding is that fib_validate_source() is used to know if the src address and the gateway address are on the same link. For that, fib_validate_source() returns 1 (same link) or 0 (not the same network). __mkroute_input() is the only user of these positive values, all other callers only look if the returned value is negative. Since the below patch, fib_validate_source() didn't return anymore 1 when both addresses are on the same network, because the route lookup returns RT_SCOPE_LINK instead of RT_SCOPE_HOST. But this is, in fact, right. Let's adapat the test to return 1 again when both addresses are on the same link. CC: stable@vger.kernel.org Fixes: 747c14307214 ("ip: fix dflt addr selection for connected nexthop") Reported-by: kernel test robot Reported-by: Heng Qi Signed-off-by: Nicolas Dichtel Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220829100121.3821-1-nicolas.dichtel@6wind.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_frontend.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index f361d3d56be2..943edf4ad4db 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -389,7 +389,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dev_match = dev_match || (res.type == RTN_LOCAL && dev == net->loopback_dev); if (dev_match) { - ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; + ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_LINK; return ret; } if (no_addr) @@ -401,7 +401,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, ret = 0; if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) { if (res.type == RTN_UNICAST) - ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; + ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_LINK; } return ret; From 52267ce25f60f37ae40ccbca0b21328ebae5ae75 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Tue, 30 Aug 2022 18:34:48 +0200 Subject: [PATCH 52/56] net: dsa: hellcreek: Print warning only once In case the source port cannot be decoded, print the warning only once. This still brings attention to the user and does not spam the logs at the same time. Signed-off-by: Kurt Kanzenbach Reviewed-by: Andrew Lunn Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20220830163448.8921-1-kurt@linutronix.de Signed-off-by: Jakub Kicinski --- net/dsa/tag_hellcreek.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c index eb204ad36eee..846588c0070a 100644 --- a/net/dsa/tag_hellcreek.c +++ b/net/dsa/tag_hellcreek.c @@ -45,7 +45,7 @@ static struct sk_buff *hellcreek_rcv(struct sk_buff *skb, skb->dev = dsa_master_find_slave(dev, 0, port); if (!skb->dev) { - netdev_warn(dev, "Failed to get source port: %d\n", port); + netdev_warn_once(dev, "Failed to get source port: %d\n", port); return NULL; } From 8c70521238b7863c2af607e20bcba20f974c969b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Aug 2022 11:56:55 -0700 Subject: [PATCH 53/56] tcp: annotate data-race around challenge_timestamp challenge_timestamp can be read an written by concurrent threads. This was expected, but we need to annotate the race to avoid potential issues. Following patch moves challenge_timestamp and challenge_count to per-netns storage to provide better isolation. Fixes: 354e4aa391ed ("tcp: RFC 5961 5.2 Blind Data Injection Attack Mitigation") Reported-by: syzbot Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ab5f0ea166f1..c184e15397a2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3629,11 +3629,11 @@ static void tcp_send_challenge_ack(struct sock *sk) /* Then check host-wide RFC 5961 rate limit. */ now = jiffies / HZ; - if (now != challenge_timestamp) { + if (now != READ_ONCE(challenge_timestamp)) { u32 ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit); u32 half = (ack_limit + 1) >> 1; - challenge_timestamp = now; + WRITE_ONCE(challenge_timestamp, now); WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit)); } count = READ_ONCE(challenge_count); From 79e3602caa6f9d59c4f66a268407080496dae408 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Aug 2022 11:56:56 -0700 Subject: [PATCH 54/56] tcp: make global challenge ack rate limitation per net-ns and default disabled Because per host rate limiting has been proven problematic (side channel attacks can be based on it), per host rate limiting of challenge acks ideally should be per netns and turned off by default. This is a long due followup of following commits: 083ae308280d ("tcp: enable per-socket rate limiting of all 'challenge acks'") f2b2c582e824 ("tcp: mitigate ACK loops for connections as tcp_sock") 75ff39ccc1bd ("tcp: make challenge acks less predictable") Signed-off-by: Eric Dumazet Cc: Jason Baron Acked-by: Neal Cardwell Signed-off-by: Jakub Kicinski --- Documentation/networking/ip-sysctl.rst | 5 ++++- include/net/netns/ipv4.h | 2 ++ net/ipv4/tcp_input.c | 21 +++++++++++---------- net/ipv4/tcp_ipv4.c | 6 ++++-- 4 files changed, 21 insertions(+), 13 deletions(-) diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 56cd4ea059b2..a759872a2883 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1035,7 +1035,10 @@ tcp_limit_output_bytes - INTEGER tcp_challenge_ack_limit - INTEGER Limits number of Challenge ACK sent per second, as recommended in RFC 5961 (Improving TCP's Robustness to Blind In-Window Attacks) - Default: 1000 + Note that this per netns rate limit can allow some side channel + attacks and probably should not be enabled. + TCP stack implements per TCP socket limits anyway. + Default: INT_MAX (unlimited) UDP variables ============= diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index c7320ef356d9..6320a76cefdc 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -179,6 +179,8 @@ struct netns_ipv4 { unsigned int sysctl_tcp_fastopen_blackhole_timeout; atomic_t tfo_active_disable_times; unsigned long tfo_active_disable_stamp; + u32 tcp_challenge_timestamp; + u32 tcp_challenge_count; int sysctl_udp_wmem_min; int sysctl_udp_rmem_min; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c184e15397a2..b85a9f755da4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3614,12 +3614,9 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, /* RFC 5961 7 [ACK Throttling] */ static void tcp_send_challenge_ack(struct sock *sk) { - /* unprotected vars, we dont care of overwrites */ - static u32 challenge_timestamp; - static unsigned int challenge_count; struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); - u32 count, now; + u32 count, now, ack_limit; /* First check our per-socket dupack rate limit. */ if (__tcp_oow_rate_limited(net, @@ -3627,18 +3624,22 @@ static void tcp_send_challenge_ack(struct sock *sk) &tp->last_oow_ack_time)) return; + ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit); + if (ack_limit == INT_MAX) + goto send_ack; + /* Then check host-wide RFC 5961 rate limit. */ now = jiffies / HZ; - if (now != READ_ONCE(challenge_timestamp)) { - u32 ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit); + if (now != READ_ONCE(net->ipv4.tcp_challenge_timestamp)) { u32 half = (ack_limit + 1) >> 1; - WRITE_ONCE(challenge_timestamp, now); - WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit)); + WRITE_ONCE(net->ipv4.tcp_challenge_timestamp, now); + WRITE_ONCE(net->ipv4.tcp_challenge_count, half + prandom_u32_max(ack_limit)); } - count = READ_ONCE(challenge_count); + count = READ_ONCE(net->ipv4.tcp_challenge_count); if (count > 0) { - WRITE_ONCE(challenge_count, count - 1); + WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1); +send_ack: NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK); tcp_send_ack(sk); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0c83780dc9bf..5b019ba2b9d2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3139,8 +3139,10 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_tso_win_divisor = 3; /* Default TSQ limit of 16 TSO segments */ net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; - /* rfc5961 challenge ack rate limiting */ - net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; + + /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */ + net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; + net->ipv4.sysctl_tcp_min_tso_segs = 2; net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ net->ipv4.sysctl_tcp_min_rtt_wlen = 300; From 0b4f688d53fdc2a731b9d9cdf0c96255bc024ea6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 31 Aug 2022 20:01:32 -0700 Subject: [PATCH 55/56] Revert "sch_cake: Return __NET_XMIT_STOLEN when consuming enqueued skb" This reverts commit 90fabae8a2c225c4e4936723c38857887edde5cc. Patch was applied hastily, revert and let the v2 be reviewed. Fixes: 90fabae8a2c2 ("sch_cake: Return __NET_XMIT_STOLEN when consuming enqueued skb") Link: https://lore.kernel.org/all/87wnao2ha3.fsf@toke.dk/ Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index a04928082e4a..a43a58a73d09 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1713,7 +1713,6 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, } idx--; flow = &b->flows[idx]; - ret = NET_XMIT_SUCCESS; /* ensure shaper state isn't stale */ if (!b->tin_backlog) { @@ -1772,7 +1771,6 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen); consume_skb(skb); - ret |= __NET_XMIT_STOLEN; } else { /* not splitting */ cobalt_set_enqueue_time(skb, now); @@ -1906,7 +1904,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, } b->drop_overlimit += dropped; } - return ret; + return NET_XMIT_SUCCESS; } static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) From a8424a9b4522a3ab9f32175ad6d848739079071f Mon Sep 17 00:00:00 2001 From: Yacan Liu Date: Tue, 30 Aug 2022 23:23:14 +0800 Subject: [PATCH 56/56] net/smc: Remove redundant refcount increase For passive connections, the refcount increment has been done in smc_clcsock_accept()-->smc_sock_alloc(). Fixes: 3b2dec2603d5 ("net/smc: restructure client and server code in af_smc") Signed-off-by: Yacan Liu Reviewed-by: Tony Lu Link: https://lore.kernel.org/r/20220830152314.838736-1-liuyacan@corp.netease.com Signed-off-by: Paolo Abeni --- net/smc/af_smc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 79c1318af1fe..0939cc3b915a 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1855,7 +1855,6 @@ static void smc_listen_out_connected(struct smc_sock *new_smc) { struct sock *newsmcsk = &new_smc->sk; - sk_refcnt_debug_inc(newsmcsk); if (newsmcsk->sk_state == SMC_INIT) newsmcsk->sk_state = SMC_ACTIVE;