From 27e23836ce22a3e5d89712ef832ab72e47ce9f43 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20M=C3=BCller?= <deso@posteo.net>
Date: Wed, 10 Aug 2022 20:07:10 +0000
Subject: [PATCH 01/56] selftests/bpf: Add lru_bug to s390x deny list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The lru_bug BPF selftest is failing execution on s390x machines. The
failure is due to program attachment failing in turn, similar to a bunch
of other tests. Those other tests have already been deny-listed and with
this change we do the same for the lru_bug test, adding it to the
corresponding file.

Fixes: de7b9927105b ("selftests/bpf: Add test for prealloc_lru_pop bug")
Signed-off-by: Daniel Müller <deso@posteo.net>
Acked-by: Mykola Lysenko <mykolal@fb.com>
Link: https://lore.kernel.org/r/20220810200710.1300299-1-deso@posteo.net
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/DENYLIST.s390x | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x
index e33cab34d22f..db9810611788 100644
--- a/tools/testing/selftests/bpf/DENYLIST.s390x
+++ b/tools/testing/selftests/bpf/DENYLIST.s390x
@@ -65,3 +65,4 @@ send_signal                              # intermittently fails to receive signa
 select_reuseport                         # intermittently fails on new s390x setup
 xdp_synproxy                             # JIT does not support calling kernel function                                (kfunc)
 unpriv_bpf_disabled                      # fentry
+lru_bug                                  # prog 'printk': failed to auto-attach: -524

From 58ca14ed98c87cfe0d1408cc65a9745d9e9b7a56 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Fri, 12 Aug 2022 13:32:59 +0200
Subject: [PATCH 02/56] xsk: Fix corrupted packets for XDP_SHARED_UMEM

Fix an issue in XDP_SHARED_UMEM mode together with aligned mode where
packets are corrupted for the second and any further sockets bound to
the same umem. In other words, this does not affect the first socket
bound to the umem. The culprit for this bug is that the initialization
of the DMA addresses for the pre-populated xsk buffer pool entries was
not performed for any socket but the first one bound to the umem. Only
the linear array of DMA addresses was populated. Fix this by populating
the DMA addresses in the xsk buffer pool for every socket bound to the
same umem.

Fixes: 94033cd8e73b8 ("xsk: Optimize for aligned case")
Reported-by: Alasdair McWilliam <alasdair.mcwilliam@outlook.com>
Reported-by: Intrusion Shield Team <dnevil@intrusion.com>
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Alasdair McWilliam <alasdair.mcwilliam@outlook.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/xdp-newbies/6205E10C-292E-4995-9D10-409649354226@outlook.com/
Link: https://lore.kernel.org/bpf/20220812113259.531-1-magnus.karlsson@gmail.com
---
 net/xdp/xsk_buff_pool.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index f70112176b7c..a71a8c6edf55 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -379,6 +379,16 @@ static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map)
 
 static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_map)
 {
+	if (!pool->unaligned) {
+		u32 i;
+
+		for (i = 0; i < pool->heads_cnt; i++) {
+			struct xdp_buff_xsk *xskb = &pool->heads[i];
+
+			xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, xskb->orig_addr);
+		}
+	}
+
 	pool->dma_pages = kvcalloc(dma_map->dma_pages_cnt, sizeof(*pool->dma_pages), GFP_KERNEL);
 	if (!pool->dma_pages)
 		return -ENOMEM;
@@ -428,12 +438,6 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
 
 	if (pool->unaligned)
 		xp_check_dma_contiguity(dma_map);
-	else
-		for (i = 0; i < pool->heads_cnt; i++) {
-			struct xdp_buff_xsk *xskb = &pool->heads[i];
-
-			xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, xskb->orig_addr);
-		}
 
 	err = xp_init_dma_info(pool, dma_map);
 	if (err) {

From 583585e48d965338e73e1eb383768d16e0922d73 Mon Sep 17 00:00:00 2001
From: Liu Jian <liujian56@huawei.com>
Date: Tue, 9 Aug 2022 17:49:15 +0800
Subject: [PATCH 03/56] skmsg: Fix wrong last sg check in sk_msg_recvmsg()

Fix one kernel NULL pointer dereference as below:

[  224.462334] Call Trace:
[  224.462394]  __tcp_bpf_recvmsg+0xd3/0x380
[  224.462441]  ? sock_has_perm+0x78/0xa0
[  224.462463]  tcp_bpf_recvmsg+0x12e/0x220
[  224.462494]  inet_recvmsg+0x5b/0xd0
[  224.462534]  __sys_recvfrom+0xc8/0x130
[  224.462574]  ? syscall_trace_enter+0x1df/0x2e0
[  224.462606]  ? __do_page_fault+0x2de/0x500
[  224.462635]  __x64_sys_recvfrom+0x24/0x30
[  224.462660]  do_syscall_64+0x5d/0x1d0
[  224.462709]  entry_SYSCALL_64_after_hwframe+0x65/0xca

In commit 9974d37ea75f ("skmsg: Fix invalid last sg check in
sk_msg_recvmsg()"), we change last sg check to sg_is_last(),
but in sockmap redirection case (without stream_parser/stream_verdict/
skb_verdict), we did not mark the end of the scatterlist. Check the
sk_msg_alloc, sk_msg_page_add, and bpf_msg_push_data functions, they all
do not mark the end of sg. They are expected to use sg.end for end
judgment. So the judgment of '(i != msg_rx->sg.end)' is added back here.

Fixes: 9974d37ea75f ("skmsg: Fix invalid last sg check in sk_msg_recvmsg()")
Signed-off-by: Liu Jian <liujian56@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20220809094915.150391-1-liujian56@huawei.com
---
 net/core/skmsg.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index f47338d89d5d..5be4485ff07d 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -461,7 +461,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
 
 			if (copied == len)
 				break;
-		} while (!sg_is_last(sge));
+		} while ((i != msg_rx->sg.end) && !sg_is_last(sge));
 
 		if (unlikely(peek)) {
 			msg_rx = sk_psock_next_msg(psock, msg_rx);
@@ -471,7 +471,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
 		}
 
 		msg_rx->sg.start = i;
-		if (!sge->length && sg_is_last(sge)) {
+		if (!sge->length && (i == msg_rx->sg.end || sg_is_last(sge))) {
 			msg_rx = sk_psock_dequeue_msg(psock);
 			kfree_sk_msg(msg_rx);
 		}

From 3024d95a4c521c278a7504ee9e80c57c3a9750e0 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 17 Aug 2022 23:32:09 +0200
Subject: [PATCH 04/56] bpf: Partially revert flexible-array member replacement

Partially revert 94dfc73e7cf4 ("treewide: uapi: Replace zero-length arrays
with flexible-array members") given it breaks BPF UAPI.

For example, BPF CI run reveals build breakage under LLVM:

  [...]
    CLNG-BPF [test_maps] map_ptr_kern.o
    CLNG-BPF [test_maps] btf__core_reloc_arrays___diff_arr_val_sz.o
    CLNG-BPF [test_maps] test_bpf_cookie.o
  progs/map_ptr_kern.c:314:26: error: field 'trie_key' with variable sized type 'struct bpf_lpm_trie_key' not at the end of a struct or class is a GNU extension [-Werror,-Wgnu-variable-sized-type-not-at-end]
           struct bpf_lpm_trie_key trie_key;
                                   ^
    CLNG-BPF [test_maps] btf__core_reloc_type_based___diff.o
  1 error generated.
  make: *** [Makefile:521: /tmp/runner/work/bpf/bpf/tools/testing/selftests/bpf/map_ptr_kern.o] Error 1
  make: *** Waiting for unfinished jobs....
  [...]

Typical usage of the bpf_lpm_trie_key is that the struct gets embedded into
a user defined key for the LPM BPF map, from the selftest example:

  struct bpf_lpm_trie_key {                 <-- UAPI exported struct
         __u32   prefixlen;
         __u8    data[];
  };

  struct lpm_key {                          <-- BPF program defined struct
         struct bpf_lpm_trie_key trie_key;
         __u32 data;
  };

Undo this for BPF until a different solution can be found. It's the only flexible-
array member case in the UAPI header.

This was discovered in BPF CI after Dave reported that the include/uapi/linux/bpf.h
header was out of sync with tools/include/uapi/linux/bpf.h after 94dfc73e7cf4. And
the subsequent sync attempt failed CI.

Fixes: 94dfc73e7cf4 ("treewide: uapi: Replace zero-length arrays with flexible-array members")
Reported-by: Dave Marchevsky <davemarchevsky@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/bpf/22aebc88-da67-f086-e620-dd4a16e2bc69@iogearbox.net
---
 include/uapi/linux/bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7bf9ba1329be..59a217ca2dfd 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -79,7 +79,7 @@ struct bpf_insn {
 /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */
 struct bpf_lpm_trie_key {
 	__u32	prefixlen;	/* up to 32 for AF_INET, 128 for AF_INET6 */
-	__u8	data[];	/* Arbitrary size */
+	__u8	data[0];	/* Arbitrary size */
 };
 
 struct bpf_cgroup_storage_key {

From 14b20b784f59bdd95f6f1cfb112c9818bcec4d84 Mon Sep 17 00:00:00 2001
From: YiFei Zhu <zhuyifei@google.com>
Date: Tue, 16 Aug 2022 20:55:16 +0000
Subject: [PATCH 05/56] bpf: Restrict bpf_sys_bpf to CAP_PERFMON

The verifier cannot perform sufficient validation of any pointers passed
into bpf_attr and treats them as integers rather than pointers. The helper
will then read from arbitrary pointers passed into it. Restrict the helper
to CAP_PERFMON since the security model in BPF of arbitrary kernel read is
CAP_BPF + CAP_PERFMON.

Fixes: af2ac3e13e45 ("bpf: Prepare bpf syscall to be used from kernel and user space.")
Signed-off-by: YiFei Zhu <zhuyifei@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220816205517.682470-1-zhuyifei@google.com
---
 kernel/bpf/syscall.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a4d40d98428a..27760627370d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5197,7 +5197,7 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
 	case BPF_FUNC_sys_bpf:
-		return &bpf_sys_bpf_proto;
+		return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto;
 	case BPF_FUNC_btf_find_by_name_kind:
 		return &bpf_btf_find_by_name_kind_proto;
 	case BPF_FUNC_sys_close:

From 7ec9fce4b31604f8415136a4c07f7dc8ad431aec Mon Sep 17 00:00:00 2001
From: Eyal Birger <eyal.birger@gmail.com>
Date: Thu, 18 Aug 2022 10:41:18 +0300
Subject: [PATCH 06/56] ip_tunnel: Respect tunnel key's "flow_flags" in IP
 tunnels

Commit 451ef36bd229 ("ip_tunnels: Add new flow flags field to ip_tunnel_key")
added a "flow_flags" member to struct ip_tunnel_key which was later used by
the commit in the fixes tag to avoid dropping packets with sources that
aren't locally configured when set in bpf_set_tunnel_key().

VXLAN and GENEVE were made to respect this flag, ip tunnels like IPIP and GRE
were not.

This commit fixes this omission by making ip_tunnel_init_flow() receive
the flow flags from the tunnel key in the relevant collect_md paths.

Fixes: b8fff748521c ("bpf: Set flow flag to allow any source IP in bpf_tunnel_key")
Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Paul Chaignon <paul@isovalent.com>
Link: https://lore.kernel.org/bpf/20220818074118.726639-1-eyal.birger@gmail.com
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c | 3 ++-
 include/net/ip_tunnels.h                            | 4 +++-
 net/ipv4/ip_gre.c                                   | 2 +-
 net/ipv4/ip_tunnel.c                                | 7 ++++---
 4 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
index 39904dacf4f0..b3472fb94617 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
@@ -423,7 +423,8 @@ mlxsw_sp_span_gretap4_route(const struct net_device *to_dev,
 
 	parms = mlxsw_sp_ipip_netdev_parms4(to_dev);
 	ip_tunnel_init_flow(&fl4, parms.iph.protocol, *daddrp, *saddrp,
-			    0, 0, dev_net(to_dev), parms.link, tun->fwmark, 0);
+			    0, 0, dev_net(to_dev), parms.link, tun->fwmark, 0,
+			    0);
 
 	rt = ip_route_output_key(tun->net, &fl4);
 	if (IS_ERR(rt))
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 63fac94f9ace..ced80e2f8b58 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -246,7 +246,8 @@ static inline void ip_tunnel_init_flow(struct flowi4 *fl4,
 				       __be32 daddr, __be32 saddr,
 				       __be32 key, __u8 tos,
 				       struct net *net, int oif,
-				       __u32 mark, __u32 tun_inner_hash)
+				       __u32 mark, __u32 tun_inner_hash,
+				       __u8 flow_flags)
 {
 	memset(fl4, 0, sizeof(*fl4));
 
@@ -263,6 +264,7 @@ static inline void ip_tunnel_init_flow(struct flowi4 *fl4,
 	fl4->fl4_gre_key = key;
 	fl4->flowi4_mark = mark;
 	fl4->flowi4_multipath_hash = tun_inner_hash;
+	fl4->flowi4_flags = flow_flags;
 }
 
 int ip_tunnel_init(struct net_device *dev);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 5c58e21f724e..f866d6282b2b 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -609,7 +609,7 @@ static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 	ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src,
 			    tunnel_id_to_key32(key->tun_id),
 			    key->tos & ~INET_ECN_MASK, dev_net(dev), 0,
-			    skb->mark, skb_get_hash(skb));
+			    skb->mark, skb_get_hash(skb), key->flow_flags);
 	rt = ip_route_output_key(dev_net(dev), &fl4);
 	if (IS_ERR(rt))
 		return PTR_ERR(rt);
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index e65e948cab9f..019f3b0839c5 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -295,7 +295,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
 		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
 				    iph->saddr, tunnel->parms.o_key,
 				    RT_TOS(iph->tos), dev_net(dev),
-				    tunnel->parms.link, tunnel->fwmark, 0);
+				    tunnel->parms.link, tunnel->fwmark, 0, 0);
 		rt = ip_route_output_key(tunnel->net, &fl4);
 
 		if (!IS_ERR(rt)) {
@@ -570,7 +570,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 	}
 	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
 			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
-			    dev_net(dev), 0, skb->mark, skb_get_hash(skb));
+			    dev_net(dev), 0, skb->mark, skb_get_hash(skb),
+			    key->flow_flags);
 	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
 		goto tx_error;
 
@@ -729,7 +730,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
 			    tunnel->parms.o_key, RT_TOS(tos),
 			    dev_net(dev), tunnel->parms.link,
-			    tunnel->fwmark, skb_get_hash(skb));
+			    tunnel->fwmark, skb_get_hash(skb), 0);
 
 	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
 		goto tx_error;

From 7d6620f107bae6ed687ff07668e8e8f855487aa9 Mon Sep 17 00:00:00 2001
From: Pu Lehui <pulehui@huawei.com>
Date: Sat, 13 Aug 2022 21:40:30 +0800
Subject: [PATCH 07/56] bpf, cgroup: Fix kernel BUG in purge_effective_progs

Syzkaller reported a triggered kernel BUG as follows:

  ------------[ cut here ]------------
  kernel BUG at kernel/bpf/cgroup.c:925!
  invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
  CPU: 1 PID: 194 Comm: detach Not tainted 5.19.0-14184-g69dac8e431af #8
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
  rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
  RIP: 0010:__cgroup_bpf_detach+0x1f2/0x2a0
  Code: 00 e8 92 60 30 00 84 c0 75 d8 4c 89 e0 31 f6 85 f6 74 19 42 f6 84
  28 48 05 00 00 02 75 0e 48 8b 80 c0 00 00 00 48 85 c0 75 e5 <0f> 0b 48
  8b 0c5
  RSP: 0018:ffffc9000055bdb0 EFLAGS: 00000246
  RAX: 0000000000000000 RBX: ffff888100ec0800 RCX: ffffc900000f1000
  RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff888100ec4578
  RBP: 0000000000000000 R08: ffff888100ec0800 R09: 0000000000000040
  R10: 0000000000000000 R11: 0000000000000000 R12: ffff888100ec4000
  R13: 000000000000000d R14: ffffc90000199000 R15: ffff888100effb00
  FS:  00007f68213d2b80(0000) GS:ffff88813bc80000(0000)
  knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 000055f74a0e5850 CR3: 0000000102836000 CR4: 00000000000006e0
  Call Trace:
   <TASK>
   cgroup_bpf_prog_detach+0xcc/0x100
   __sys_bpf+0x2273/0x2a00
   __x64_sys_bpf+0x17/0x20
   do_syscall_64+0x3b/0x90
   entry_SYSCALL_64_after_hwframe+0x63/0xcd
  RIP: 0033:0x7f68214dbcb9
  Code: 08 44 89 e0 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 48 89 f8 48 89
  f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01
  f0 ff8
  RSP: 002b:00007ffeb487db68 EFLAGS: 00000246 ORIG_RAX: 0000000000000141
  RAX: ffffffffffffffda RBX: 000000000000000b RCX: 00007f68214dbcb9
  RDX: 0000000000000090 RSI: 00007ffeb487db70 RDI: 0000000000000009
  RBP: 0000000000000003 R08: 0000000000000012 R09: 0000000b00000003
  R10: 00007ffeb487db70 R11: 0000000000000246 R12: 00007ffeb487dc20
  R13: 0000000000000004 R14: 0000000000000001 R15: 000055f74a1011b0
   </TASK>
  Modules linked in:
  ---[ end trace 0000000000000000 ]---

Repetition steps:

For the following cgroup tree,

  root
   |
  cg1
   |
  cg2

  1. attach prog2 to cg2, and then attach prog1 to cg1, both bpf progs
     attach type is NONE or OVERRIDE.
  2. write 1 to /proc/thread-self/fail-nth for failslab.
  3. detach prog1 for cg1, and then kernel BUG occur.

Failslab injection will cause kmalloc fail and fall back to
purge_effective_progs. The problem is that cg2 have attached another prog,
so when go through cg2 layer, iteration will add pos to 1, and subsequent
operations will be skipped by the following condition, and cg will meet
NULL in the end.

  `if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))`

The NULL cg means no link or prog match, this is as expected, and it's not
a bug. So here just skip the no match situation.

Fixes: 4c46091ee985 ("bpf: Fix KASAN use-after-free Read in compute_effective_progs")
Signed-off-by: Pu Lehui <pulehui@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220813134030.1972696-1-pulehui@huawei.com
---
 kernel/bpf/cgroup.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 59b7eb60d5b4..4a400cd63731 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -921,8 +921,10 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
 				pos++;
 			}
 		}
+
+		/* no link or prog match, skip the cgroup of this layer */
+		continue;
 found:
-		BUG_ON(!cg);
 		progs = rcu_dereference_protected(
 				desc->bpf.effective[atype],
 				lockdep_is_held(&cgroup_mutex));

From afe7116f6d3b888778ed6d95e3cf724767b9aedf Mon Sep 17 00:00:00 2001
From: Lin Ma <linma@zju.edu.cn>
Date: Mon, 8 Aug 2022 11:42:24 +0800
Subject: [PATCH 08/56] ieee802154/adf7242: defer destroy_workqueue call

There is a possible race condition (use-after-free) like below

  (FREE)                     |  (USE)
  adf7242_remove             |  adf7242_channel
   cancel_delayed_work_sync  |
    destroy_workqueue (1)    |   adf7242_cmd_rx
                             |    mod_delayed_work (2)
                             |

The root cause for this race is that the upper layer (ieee802154) is
unaware of this detaching event and the function adf7242_channel can
be called without any checks.

To fix this, we can add a flag write at the beginning of adf7242_remove
and add flag check in adf7242_channel. Or we can just defer the
destructive operation like other commit 3e0588c291d6 ("hamradio: defer
ax25 kfree after unregister_netdev") which let the
ieee802154_unregister_hw() to handle the synchronization. This patch
takes the second option.

Fixes: 58e9683d1475 ("net: ieee802154: adf7242: Fix OCL calibration
runs")
Signed-off-by: Lin Ma <linma@zju.edu.cn>
Acked-by: Michael Hennerich <michael.hennerich@analog.com>
Link: https://lore.kernel.org/r/20220808034224.12642-1-linma@zju.edu.cn
Signed-off-by: Stefan Schmidt <stefan@datenfreihafen.org>
---
 drivers/net/ieee802154/adf7242.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ieee802154/adf7242.c b/drivers/net/ieee802154/adf7242.c
index 6afdf1622944..5cf218c674a5 100644
--- a/drivers/net/ieee802154/adf7242.c
+++ b/drivers/net/ieee802154/adf7242.c
@@ -1310,10 +1310,11 @@ static void adf7242_remove(struct spi_device *spi)
 
 	debugfs_remove_recursive(lp->debugfs_root);
 
+	ieee802154_unregister_hw(lp->hw);
+
 	cancel_delayed_work_sync(&lp->work);
 	destroy_workqueue(lp->wqueue);
 
-	ieee802154_unregister_hw(lp->hw);
 	mutex_destroy(&lp->bmux);
 	ieee802154_free_hw(lp->hw);
 }

From b5a990209d72615e3cac2b3b0d8ddd445c020cf5 Mon Sep 17 00:00:00 2001
From: Jilin Yuan <yuanjilin@cdjrlc.com>
Date: Fri, 8 Jul 2022 23:15:38 +0800
Subject: [PATCH 09/56] net/ieee802154: fix repeated words in comments

Delete the redundant word 'was'.

Signed-off-by: Jilin Yuan <yuanjilin@cdjrlc.com>
Link: https://lore.kernel.org/r/20220708151538.51483-1-yuanjilin@cdjrlc.com
Signed-off-by: Stefan Schmidt <stefan@datenfreihafen.org>
---
 drivers/net/ieee802154/ca8210.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ieee802154/ca8210.c b/drivers/net/ieee802154/ca8210.c
index 42c0b451088d..450b16ad40a4 100644
--- a/drivers/net/ieee802154/ca8210.c
+++ b/drivers/net/ieee802154/ca8210.c
@@ -2293,7 +2293,7 @@ static int ca8210_set_csma_params(
  * @retries:  Number of retries
  *
  * Sets the number of times to retry a transmission if no acknowledgment was
- * was received from the other end when one was requested.
+ * received from the other end when one was requested.
  *
  * Return: 0 or linux error code
  */

From 0947ae1121083d363d522ff7518ee72b55bd8d29 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Tue, 23 Aug 2022 14:58:04 -0700
Subject: [PATCH 10/56] bpf: Fix a data-race around bpf_jit_limit.

While reading bpf_jit_limit, it can be changed concurrently via sysctl,
WRITE_ONCE() in __do_proc_doulongvec_minmax(). The size of bpf_jit_limit
is long, so we need to add a paired READ_ONCE() to avoid load-tearing.

Fixes: ede95a63b5e8 ("bpf: add bpf_jit_limit knob to restrict unpriv allocations")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220823215804.2177-1-kuniyu@amazon.com
---
 kernel/bpf/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c1e10d088dbb..3d9eb3ae334c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -971,7 +971,7 @@ pure_initcall(bpf_jit_charge_init);
 
 int bpf_jit_charge_modmem(u32 size)
 {
-	if (atomic_long_add_return(size, &bpf_jit_current) > bpf_jit_limit) {
+	if (atomic_long_add_return(size, &bpf_jit_current) > READ_ONCE(bpf_jit_limit)) {
 		if (!bpf_capable()) {
 			atomic_long_sub(size, &bpf_jit_current);
 			return -EPERM;

From 60deb9f10eec5c6a20252ed36238b55d8b614a2c Mon Sep 17 00:00:00 2001
From: Siddh Raman Pant <code@siddh.me>
Date: Sat, 20 Aug 2022 01:33:40 +0530
Subject: [PATCH 11/56] wifi: mac80211: Fix UAF in ieee80211_scan_rx()

ieee80211_scan_rx() tries to access scan_req->flags after a
null check, but a UAF is observed when the scan is completed
and __ieee80211_scan_completed() executes, which then calls
cfg80211_scan_done() leading to the freeing of scan_req.

Since scan_req is rcu_dereference()'d, prevent the racing in
__ieee80211_scan_completed() by ensuring that from mac80211's
POV it is no longer accessed from an RCU read critical section
before we call cfg80211_scan_done().

Cc: stable@vger.kernel.org
Link: https://syzkaller.appspot.com/bug?extid=f9acff9bf08a845f225d
Reported-by: syzbot+f9acff9bf08a845f225d@syzkaller.appspotmail.com
Suggested-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Siddh Raman Pant <code@siddh.me>
Link: https://lore.kernel.org/r/20220819200340.34826-1-code@siddh.me
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/scan.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index fa8ddf576bc1..c4f2aeb31da3 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -469,16 +469,19 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
 	scan_req = rcu_dereference_protected(local->scan_req,
 					     lockdep_is_held(&local->mtx));
 
-	if (scan_req != local->int_scan_req) {
-		local->scan_info.aborted = aborted;
-		cfg80211_scan_done(scan_req, &local->scan_info);
-	}
 	RCU_INIT_POINTER(local->scan_req, NULL);
 	RCU_INIT_POINTER(local->scan_sdata, NULL);
 
 	local->scanning = 0;
 	local->scan_chandef.chan = NULL;
 
+	synchronize_rcu();
+
+	if (scan_req != local->int_scan_req) {
+		local->scan_info.aborted = aborted;
+		cfg80211_scan_done(scan_req, &local->scan_info);
+	}
+
 	/* Set power back to normal operating levels. */
 	ieee80211_hw_config(local, 0);
 

From 36fe8e4e5cb02131719612aea1e64379670d1846 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Tue, 23 Aug 2022 15:22:23 +0200
Subject: [PATCH 12/56] wifi: mac80211: always free sta in __sta_info_alloc in
 case of error

Free sta pointer in __sta_info_alloc routine if sta_info_alloc_link()
fails.

Fixes: 246b39e4a1ba5 ("wifi: mac80211: refactor some sta_info link handling")
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/a3d079208684cddbc25289f7f7e0fed795b0cad4.1661260857.git.lorenzo@kernel.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/sta_info.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index cb23da9aff1e..330dab41f2fe 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -494,7 +494,7 @@ __sta_info_alloc(struct ieee80211_sub_if_data *sdata,
 	sta->sdata = sdata;
 
 	if (sta_info_alloc_link(local, &sta->deflink, gfp))
-		return NULL;
+		goto free;
 
 	if (link_id >= 0) {
 		sta_info_add_link(sta, link_id, &sta->deflink,

From 62b03f45c6352410d13bf0710d24ef6290632eb1 Mon Sep 17 00:00:00 2001
From: Yang Yingliang <yangyingliang@huawei.com>
Date: Thu, 18 Aug 2022 12:33:49 +0800
Subject: [PATCH 13/56] wifi: mac80211: fix possible leak in
 ieee80211_tx_control_port()

Add missing dev_kfree_skb() in an error path in
ieee80211_tx_control_port() to avoid a memory leak.

Fixes: dd820ed6336a ("wifi: mac80211: return error from control port TX for drops")
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Link: https://lore.kernel.org/r/20220818043349.4168835-1-yangyingliang@huawei.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/tx.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 45df9932d0ba..594bd70ee641 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -5885,6 +5885,7 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
 	rcu_read_lock();
 	err = ieee80211_lookup_ra_sta(sdata, skb, &sta);
 	if (err) {
+		dev_kfree_skb(skb);
 		rcu_read_unlock();
 		return err;
 	}

From 15bc8966b6d3a5b9bfe4c9facfa02f2b69b1e5f0 Mon Sep 17 00:00:00 2001
From: Siddh Raman Pant <code@siddh.me>
Date: Sun, 14 Aug 2022 20:45:12 +0530
Subject: [PATCH 14/56] wifi: mac80211: Don't finalize CSA in IBSS mode if
 state is disconnected

When we are not connected to a channel, sending channel "switch"
announcement doesn't make any sense.

The BSS list is empty in that case. This causes the for loop in
cfg80211_get_bss() to be bypassed, so the function returns NULL
(check line 1424 of net/wireless/scan.c), causing the WARN_ON()
in ieee80211_ibss_csa_beacon() to get triggered (check line 500
of net/mac80211/ibss.c), which was consequently reported on the
syzkaller dashboard.

Thus, check if we have an existing connection before generating
the CSA beacon in ieee80211_ibss_finish_csa().

Cc: stable@vger.kernel.org
Fixes: cd7760e62c2a ("mac80211: add support for CSA in IBSS mode")
Link: https://syzkaller.appspot.com/bug?id=05603ef4ae8926761b678d2939a3b2ad28ab9ca6
Reported-by: syzbot+b6c9fe29aefe68e4ad34@syzkaller.appspotmail.com
Signed-off-by: Siddh Raman Pant <code@siddh.me>
Tested-by: syzbot+b6c9fe29aefe68e4ad34@syzkaller.appspotmail.com
Link: https://lore.kernel.org/r/20220814151512.9985-1-code@siddh.me
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/ibss.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index d56890e3fabb..9b283bbc7bb4 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -530,6 +530,10 @@ int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata)
 
 	sdata_assert_lock(sdata);
 
+	/* When not connected/joined, sending CSA doesn't make sense. */
+	if (ifibss->state != IEEE80211_IBSS_MLME_JOINED)
+		return -ENOLINK;
+
 	/* update cfg80211 bss information with the new channel */
 	if (!is_zero_ether_addr(ifibss->bssid)) {
 		cbss = cfg80211_get_bss(sdata->local->hw.wiphy,

From d776763f48084926b5d9e25507a3ddb7c9243d5e Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 4 Aug 2022 10:03:21 +0300
Subject: [PATCH 15/56] wifi: cfg80211: debugfs: fix return type in
 ht40allow_map_read()

The return type is supposed to be ssize_t, which is signed long,
but "r" was declared as unsigned int.  This means that on 64 bit systems
we return positive values instead of negative error codes.

Fixes: 80a3511d70e8 ("cfg80211: add debugfs HT40 allow map")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Link: https://lore.kernel.org/r/YutvOQeJm0UjLhwU@kili
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/debugfs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c
index aab43469a2f0..0878b162890a 100644
--- a/net/wireless/debugfs.c
+++ b/net/wireless/debugfs.c
@@ -65,9 +65,10 @@ static ssize_t ht40allow_map_read(struct file *file,
 {
 	struct wiphy *wiphy = file->private_data;
 	char *buf;
-	unsigned int offset = 0, buf_size = PAGE_SIZE, i, r;
+	unsigned int offset = 0, buf_size = PAGE_SIZE, i;
 	enum nl80211_band band;
 	struct ieee80211_supported_band *sband;
+	ssize_t r;
 
 	buf = kzalloc(buf_size, GFP_KERNEL);
 	if (!buf)

From 55f0a4894484e8d6ddf662f5aebbf3b4cb028541 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 28 Jul 2022 17:25:16 +0300
Subject: [PATCH 16/56] wifi: mac80211: potential NULL dereference in
 ieee80211_tx_control_port()

The ieee80211_lookup_ra_sta() function will sometimes set "sta" to NULL
so add this NULL check to prevent an Oops.

Fixes: 9dd1953846c7 ("wifi: nl80211/mac80211: clarify link ID in control port TX")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Link: https://lore.kernel.org/r/YuKcTAyO94YOy0Bu@kili
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/tx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 594bd70ee641..bf7fe6cd9dfc 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -5900,7 +5900,7 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
 		 * for MLO STA, the SA should be the AP MLD address, but
 		 * the link ID has been selected already
 		 */
-		if (sta->sta.mlo)
+		if (sta && sta->sta.mlo)
 			memcpy(ehdr->h_source, sdata->vif.addr, ETH_ALEN);
 	}
 	rcu_read_unlock();

From 2fc31465c5373b5ca4edf2e5238558cb62902311 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Tue, 23 Aug 2022 20:52:59 +0200
Subject: [PATCH 17/56] bpf: Do mark_chain_precision for
 ARG_CONST_ALLOC_SIZE_OR_ZERO

Precision markers need to be propagated whenever we have an ARG_CONST_*
style argument, as the verifier cannot consider imprecise scalars to be
equivalent for the purposes of states_equal check when such arguments
refine the return value (in this case, set mem_size for PTR_TO_MEM). The
resultant mem_size for the R0 is derived from the constant value, and if
the verifier incorrectly prunes states considering them equivalent where
such arguments exist (by seeing that both registers have reg->precise as
false in regsafe), we can end up with invalid programs passing the
verifier which can do access beyond what should have been the correct
mem_size in that explored state.

To show a concrete example of the problem:

0000000000000000 <prog>:
       0:       r2 = *(u32 *)(r1 + 80)
       1:       r1 = *(u32 *)(r1 + 76)
       2:       r3 = r1
       3:       r3 += 4
       4:       if r3 > r2 goto +18 <LBB5_5>
       5:       w2 = 0
       6:       *(u32 *)(r1 + 0) = r2
       7:       r1 = *(u32 *)(r1 + 0)
       8:       r2 = 1
       9:       if w1 == 0 goto +1 <LBB5_3>
      10:       r2 = -1

0000000000000058 <LBB5_3>:
      11:       r1 = 0 ll
      13:       r3 = 0
      14:       call bpf_ringbuf_reserve
      15:       if r0 == 0 goto +7 <LBB5_5>
      16:       r1 = r0
      17:       r1 += 16777215
      18:       w2 = 0
      19:       *(u8 *)(r1 + 0) = r2
      20:       r1 = r0
      21:       r2 = 0
      22:       call bpf_ringbuf_submit

00000000000000b8 <LBB5_5>:
      23:       w0 = 0
      24:       exit

For the first case, the single line execution's exploration will prune
the search at insn 14 for the branch insn 9's second leg as it will be
verified first using r2 = -1 (UINT_MAX), while as w1 at insn 9 will
always be 0 so at runtime we don't get error for being greater than
UINT_MAX/4 from bpf_ringbuf_reserve. The verifier during regsafe just
sees reg->precise as false for both r2 registers in both states, hence
considers them equal for purposes of states_equal.

If we propagated precise markers using the backtracking support, we
would use the precise marking to then ensure that old r2 (UINT_MAX) was
within the new r2 (1) and this would never be true, so the verification
would rightfully fail.

The end result is that the out of bounds access at instruction 19 would
be permitted without this fix.

Note that reg->precise is always set to true when user does not have
CAP_BPF (or when subprog count is greater than 1 (i.e. use of any static
or global functions)), hence this is only a problem when precision marks
need to be explicitly propagated (i.e. privileged users with CAP_BPF).

A simplified test case has been included in the next patch to prevent
future regressions.

Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it")
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220823185300.406-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 096fdac70165..30c6eebce146 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6066,6 +6066,9 @@ skip_type_check:
 			return -EACCES;
 		}
 		meta->mem_size = reg->var_off.value;
+		err = mark_chain_precision(env, regno);
+		if (err)
+			return err;
 		break;
 	case ARG_PTR_TO_INT:
 	case ARG_PTR_TO_LONG:

From 1800b2ac96d8bc4ccdddc2ea9e83ecaffd54d3f2 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Tue, 23 Aug 2022 20:55:00 +0200
Subject: [PATCH 18/56] selftests/bpf: Add regression test for pruning fix

Add a test to ensure we do mark_chain_precision for the argument type
ARG_CONST_ALLOC_SIZE_OR_ZERO. For other argument types, this was already
done, but propagation for missing for this case. Without the fix, this
test case loads successfully.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220823185500.467-1-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../testing/selftests/bpf/verifier/precise.c  | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c
index 9e754423fa8b..6c03a7d805f9 100644
--- a/tools/testing/selftests/bpf/verifier/precise.c
+++ b/tools/testing/selftests/bpf/verifier/precise.c
@@ -192,3 +192,28 @@
 	.result = VERBOSE_ACCEPT,
 	.retval = -1,
 },
+{
+	"precise: mark_chain_precision for ARG_CONST_ALLOC_SIZE_OR_ZERO",
+	.insns = {
+	BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, offsetof(struct xdp_md, ingress_ifindex)),
+	BPF_LD_MAP_FD(BPF_REG_6, 0),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+	BPF_MOV64_IMM(BPF_REG_2, 1),
+	BPF_MOV64_IMM(BPF_REG_3, 0),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_4, 0, 1),
+	BPF_MOV64_IMM(BPF_REG_2, 0x1000),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve),
+	BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+	BPF_EXIT_INSN(),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+	BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 42),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_ringbuf = { 1 },
+	.prog_type = BPF_PROG_TYPE_XDP,
+	.flags = BPF_F_TEST_STATE_FREQ,
+	.errstr = "invalid access to memory, mem_size=1 off=42 size=8",
+	.result = REJECT,
+},

From a657182a5c5150cdfacb6640aad1d2712571a409 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 25 Aug 2022 23:26:47 +0200
Subject: [PATCH 19/56] bpf: Don't use tnum_range on array range checking for
 poke descriptors

Hsin-Wei reported a KASAN splat triggered by their BPF runtime fuzzer which
is based on a customized syzkaller:

  BUG: KASAN: slab-out-of-bounds in bpf_int_jit_compile+0x1257/0x13f0
  Read of size 8 at addr ffff888004e90b58 by task syz-executor.0/1489
  CPU: 1 PID: 1489 Comm: syz-executor.0 Not tainted 5.19.0 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
  1.13.0-1ubuntu1.1 04/01/2014
  Call Trace:
   <TASK>
   dump_stack_lvl+0x9c/0xc9
   print_address_description.constprop.0+0x1f/0x1f0
   ? bpf_int_jit_compile+0x1257/0x13f0
   kasan_report.cold+0xeb/0x197
   ? kvmalloc_node+0x170/0x200
   ? bpf_int_jit_compile+0x1257/0x13f0
   bpf_int_jit_compile+0x1257/0x13f0
   ? arch_prepare_bpf_dispatcher+0xd0/0xd0
   ? rcu_read_lock_sched_held+0x43/0x70
   bpf_prog_select_runtime+0x3e8/0x640
   ? bpf_obj_name_cpy+0x149/0x1b0
   bpf_prog_load+0x102f/0x2220
   ? __bpf_prog_put.constprop.0+0x220/0x220
   ? find_held_lock+0x2c/0x110
   ? __might_fault+0xd6/0x180
   ? lock_downgrade+0x6e0/0x6e0
   ? lock_is_held_type+0xa6/0x120
   ? __might_fault+0x147/0x180
   __sys_bpf+0x137b/0x6070
   ? bpf_perf_link_attach+0x530/0x530
   ? new_sync_read+0x600/0x600
   ? __fget_files+0x255/0x450
   ? lock_downgrade+0x6e0/0x6e0
   ? fput+0x30/0x1a0
   ? ksys_write+0x1a8/0x260
   __x64_sys_bpf+0x7a/0xc0
   ? syscall_enter_from_user_mode+0x21/0x70
   do_syscall_64+0x3b/0x90
   entry_SYSCALL_64_after_hwframe+0x63/0xcd
  RIP: 0033:0x7f917c4e2c2d

The problem here is that a range of tnum_range(0, map->max_entries - 1) has
limited ability to represent the concrete tight range with the tnum as the
set of resulting states from value + mask can result in a superset of the
actual intended range, and as such a tnum_in(range, reg->var_off) check may
yield true when it shouldn't, for example tnum_range(0, 2) would result in
00XX -> v = 0000, m = 0011 such that the intended set of {0, 1, 2} is here
represented by a less precise superset of {0, 1, 2, 3}. As the register is
known const scalar, really just use the concrete reg->var_off.value for the
upper index check.

Fixes: d2e4c1e6c294 ("bpf: Constant map key tracking for prog array pokes")
Reported-by: Hsin-Wei Hung <hsinweih@uci.edu>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/r/984b37f9fdf7ac36831d2137415a4a915744c1b6.1661462653.git.daniel@iogearbox.net
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 30c6eebce146..3eadb14e090b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7033,8 +7033,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 	struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
 	struct bpf_reg_state *regs = cur_regs(env), *reg;
 	struct bpf_map *map = meta->map_ptr;
-	struct tnum range;
-	u64 val;
+	u64 val, max;
 	int err;
 
 	if (func_id != BPF_FUNC_tail_call)
@@ -7044,10 +7043,11 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 		return -EINVAL;
 	}
 
-	range = tnum_range(0, map->max_entries - 1);
 	reg = &regs[BPF_REG_3];
+	val = reg->var_off.value;
+	max = map->max_entries;
 
-	if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) {
+	if (!(register_is_const(reg) && val < max)) {
 		bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
 		return 0;
 	}
@@ -7055,8 +7055,6 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 	err = mark_chain_precision(env, BPF_REG_3);
 	if (err)
 		return err;
-
-	val = reg->var_off.value;
 	if (bpf_map_key_unseen(aux))
 		bpf_map_key_store(aux, val);
 	else if (!bpf_map_key_poisoned(aux) &&

From b82a26d8633cc89367fac75beb3ec33061bea44a Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Sun, 7 Aug 2022 22:57:40 +0200
Subject: [PATCH 20/56] Bluetooth: hci_event: Fix vendor (unknown) opcode
 status handling

Commit c8992cffbe74 ("Bluetooth: hci_event: Use of a function table to
handle Command Complete") was (presumably) meant to only refactor things
without any functional changes.

But it does have one undesirable side-effect, before *status would always
be set to skb->data[0] and it might be overridden by some of the opcode
specific handling. While now it always set by the opcode specific handlers.
This means that if the opcode is not known *status does not get set any
more at all!

This behavior change has broken bluetooth support for BCM4343A0 HCIs,
the hci_bcm.c code tries to configure UART attached HCIs at a higher
baudraute using vendor specific opcodes. The BCM4343A0 does not
support this and this used to simply fail:

[   25.646442] Bluetooth: hci0: BCM: failed to write clock (-56)
[   25.646481] Bluetooth: hci0: Failed to set baudrate

After which things would continue with the initial baudraute. But now
that hci_cmd_complete_evt() no longer sets status for unknown opcodes
*status is left at 0. This causes the hci_bcm.c code to think the baudraute
has been changed on the HCI side and to also adjust the UART baudrate,
after which communication with the HCI is broken, leading to:

[   28.579042] Bluetooth: hci0: command 0x0c03 tx timeout
[   36.961601] Bluetooth: hci0: BCM: Reset failed (-110)

And non working bluetooth. Fix this by restoring the previous
default "*status = skb->data[0]" handling for unknown opcodes.

Fixes: c8992cffbe74 ("Bluetooth: hci_event: Use of a function table to handle Command Complete")
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_event.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 485c814cf44a..73aa9ee9d21a 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -4179,6 +4179,17 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, void *data,
 		}
 	}
 
+	if (i == ARRAY_SIZE(hci_cc_table)) {
+		/* Unknown opcode, assume byte 0 contains the status, so
+		 * that e.g. __hci_cmd_sync() properly returns errors
+		 * for vendor specific commands send by HCI drivers.
+		 * If a vendor doesn't actually follow this convention we may
+		 * need to introduce a vendor CC table in order to properly set
+		 * the status.
+		 */
+		*status = skb->data[0];
+	}
+
 	handle_cmd_cnt_and_timer(hdev, ev->ncmd);
 
 	hci_req_cmd_complete(hdev, *opcode, *status, req_complete,

From 1fd02d56dae35b08e4cba8e6bd2c2e7ccff68ecc Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Thu, 11 Aug 2022 14:20:46 -0700
Subject: [PATCH 21/56] Bluetooth: hci_sync: Fix suspend performance regression

This attempts to fix suspend performance when there is no connections by
not updating the event mask.

Fixes: ef61b6ea1544 ("Bluetooth: Always set event mask on suspend")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_sync.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index e6d804b82b67..5fe440cdc68d 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -5288,17 +5288,21 @@ int hci_suspend_sync(struct hci_dev *hdev)
 	/* Prevent disconnects from causing scanning to be re-enabled */
 	hci_pause_scan_sync(hdev);
 
-	/* Soft disconnect everything (power off) */
-	err = hci_disconnect_all_sync(hdev, HCI_ERROR_REMOTE_POWER_OFF);
-	if (err) {
-		/* Set state to BT_RUNNING so resume doesn't notify */
-		hdev->suspend_state = BT_RUNNING;
-		hci_resume_sync(hdev);
-		return err;
-	}
+	if (hci_conn_count(hdev)) {
+		/* Soft disconnect everything (power off) */
+		err = hci_disconnect_all_sync(hdev, HCI_ERROR_REMOTE_POWER_OFF);
+		if (err) {
+			/* Set state to BT_RUNNING so resume doesn't notify */
+			hdev->suspend_state = BT_RUNNING;
+			hci_resume_sync(hdev);
+			return err;
+		}
 
-	/* Update event mask so only the allowed event can wakeup the host */
-	hci_set_event_mask_sync(hdev);
+		/* Update event mask so only the allowed event can wakeup the
+		 * host.
+		 */
+		hci_set_event_mask_sync(hdev);
+	}
 
 	/* Only configure accept list if disconnect succeeded and wake
 	 * isn't being prevented.

From b840304fb46cdf7012722f456bce06f151b3e81b Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Fri, 12 Aug 2022 15:33:57 -0700
Subject: [PATCH 22/56] Bluetooth: L2CAP: Fix build errors in some archs

This attempts to fix the follow errors:

In function 'memcmp',
    inlined from 'bacmp' at ./include/net/bluetooth/bluetooth.h:347:9,
    inlined from 'l2cap_global_chan_by_psm' at
    net/bluetooth/l2cap_core.c:2003:15:
./include/linux/fortify-string.h:44:33: error: '__builtin_memcmp'
specified bound 6 exceeds source size 0 [-Werror=stringop-overread]
   44 | #define __underlying_memcmp     __builtin_memcmp
      |                                 ^
./include/linux/fortify-string.h:420:16: note: in expansion of macro
'__underlying_memcmp'
  420 |         return __underlying_memcmp(p, q, size);
      |                ^~~~~~~~~~~~~~~~~~~
In function 'memcmp',
    inlined from 'bacmp' at ./include/net/bluetooth/bluetooth.h:347:9,
    inlined from 'l2cap_global_chan_by_psm' at
    net/bluetooth/l2cap_core.c:2004:15:
./include/linux/fortify-string.h:44:33: error: '__builtin_memcmp'
specified bound 6 exceeds source size 0 [-Werror=stringop-overread]
   44 | #define __underlying_memcmp     __builtin_memcmp
      |                                 ^
./include/linux/fortify-string.h:420:16: note: in expansion of macro
'__underlying_memcmp'
  420 |         return __underlying_memcmp(p, q, size);
      |                ^~~~~~~~~~~~~~~~~~~

Fixes: 332f1795ca20 ("Bluetooth: L2CAP: Fix l2cap_global_chan_by_psm regression")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/l2cap_core.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index cbe0cae73434..2c9de67daadc 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -1992,11 +1992,11 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm,
 			src_match = !bacmp(&c->src, src);
 			dst_match = !bacmp(&c->dst, dst);
 			if (src_match && dst_match) {
-				c = l2cap_chan_hold_unless_zero(c);
-				if (c) {
-					read_unlock(&chan_list_lock);
-					return c;
-				}
+				if (!l2cap_chan_hold_unless_zero(c))
+					continue;
+
+				read_unlock(&chan_list_lock);
+				return c;
 			}
 
 			/* Closest match */

From 23b72814da1a094b4c065e0bb598249f310c5577 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Mon, 15 Aug 2022 16:14:32 -0700
Subject: [PATCH 23/56] Bluetooth: MGMT: Fix Get Device Flags

Get Device Flags don't check if device does actually use an RPA in which
case it shall only set HCI_CONN_FLAG_REMOTE_WAKEUP if LL Privacy is
enabled.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/mgmt.c | 73 ++++++++++++++++++++++++++------------------
 1 file changed, 43 insertions(+), 30 deletions(-)

diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 6e31023b84f5..566b6291b38e 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -4547,6 +4547,22 @@ static int set_exp_feature(struct sock *sk, struct hci_dev *hdev,
 			       MGMT_STATUS_NOT_SUPPORTED);
 }
 
+static u32 get_params_flags(struct hci_dev *hdev,
+			    struct hci_conn_params *params)
+{
+	u32 flags = hdev->conn_flags;
+
+	/* Devices using RPAs can only be programmed in the acceptlist if
+	 * LL Privacy has been enable otherwise they cannot mark
+	 * HCI_CONN_FLAG_REMOTE_WAKEUP.
+	 */
+	if ((flags & HCI_CONN_FLAG_REMOTE_WAKEUP) && !use_ll_privacy(hdev) &&
+	    hci_find_irk_by_addr(hdev, &params->addr, params->addr_type))
+		flags &= ~HCI_CONN_FLAG_REMOTE_WAKEUP;
+
+	return flags;
+}
+
 static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data,
 			    u16 data_len)
 {
@@ -4578,10 +4594,10 @@ static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data,
 	} else {
 		params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr,
 						le_addr_type(cp->addr.type));
-
 		if (!params)
 			goto done;
 
+		supported_flags = get_params_flags(hdev, params);
 		current_flags = params->flags;
 	}
 
@@ -4649,38 +4665,35 @@ static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data,
 			bt_dev_warn(hdev, "No such BR/EDR device %pMR (0x%x)",
 				    &cp->addr.bdaddr, cp->addr.type);
 		}
-	} else {
-		params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr,
-						le_addr_type(cp->addr.type));
-		if (params) {
-			/* Devices using RPAs can only be programmed in the
-			 * acceptlist LL Privacy has been enable otherwise they
-			 * cannot mark HCI_CONN_FLAG_REMOTE_WAKEUP.
-			 */
-			if ((current_flags & HCI_CONN_FLAG_REMOTE_WAKEUP) &&
-			    !use_ll_privacy(hdev) &&
-			    hci_find_irk_by_addr(hdev, &params->addr,
-						 params->addr_type)) {
-				bt_dev_warn(hdev,
-					    "Cannot set wakeable for RPA");
-				goto unlock;
-			}
 
-			params->flags = current_flags;
-			status = MGMT_STATUS_SUCCESS;
-
-			/* Update passive scan if HCI_CONN_FLAG_DEVICE_PRIVACY
-			 * has been set.
-			 */
-			if (params->flags & HCI_CONN_FLAG_DEVICE_PRIVACY)
-				hci_update_passive_scan(hdev);
-		} else {
-			bt_dev_warn(hdev, "No such LE device %pMR (0x%x)",
-				    &cp->addr.bdaddr,
-				    le_addr_type(cp->addr.type));
-		}
+		goto unlock;
 	}
 
+	params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr,
+					le_addr_type(cp->addr.type));
+	if (!params) {
+		bt_dev_warn(hdev, "No such LE device %pMR (0x%x)",
+			    &cp->addr.bdaddr, le_addr_type(cp->addr.type));
+		goto unlock;
+	}
+
+	supported_flags = get_params_flags(hdev, params);
+
+	if ((supported_flags | current_flags) != supported_flags) {
+		bt_dev_warn(hdev, "Bad flag given (0x%x) vs supported (0x%0x)",
+			    current_flags, supported_flags);
+		goto unlock;
+	}
+
+	params->flags = current_flags;
+	status = MGMT_STATUS_SUCCESS;
+
+	/* Update passive scan if HCI_CONN_FLAG_DEVICE_PRIVACY
+	 * has been set.
+	 */
+	if (params->flags & HCI_CONN_FLAG_DEVICE_PRIVACY)
+		hci_update_passive_scan(hdev);
+
 unlock:
 	hci_dev_unlock(hdev);
 

From 3cfbc6ac22d62d0a06be9ce2996ba9fed75436cd Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Wed, 17 Aug 2022 20:14:36 +0900
Subject: [PATCH 24/56] Bluetooth: hci_sync: fix double mgmt_pending_free() in
 remove_adv_monitor()

syzbot is reporting double kfree() at remove_adv_monitor() [1], for
commit 7cf5c2978f23fdbb ("Bluetooth: hci_sync: Refactor remove Adv
Monitor") forgot to remove duplicated mgmt_pending_remove() when
merging "if (err) {" path and "if (!pending) {" path.

Link: https://syzkaller.appspot.com/bug?extid=915a8416bf15895b8e07 [1]
Reported-by: syzbot <syzbot+915a8416bf15895b8e07@syzkaller.appspotmail.com>
Fixes: 7cf5c2978f23fdbb ("Bluetooth: hci_sync: Refactor remove Adv Monitor")
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/mgmt.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 566b6291b38e..72e6595a71cc 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -5067,7 +5067,6 @@ static int remove_adv_monitor(struct sock *sk, struct hci_dev *hdev,
 		else
 			status = MGMT_STATUS_FAILED;
 
-		mgmt_pending_remove(cmd);
 		goto unlock;
 	}
 

From c572909376673d8c126ae2ee53ba754ff9327d98 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Thu, 18 Aug 2022 14:31:42 -0700
Subject: [PATCH 25/56] Bluetooth: ISO: Fix not handling shutdown condition

In order to properly handle shutdown syscall the code shall not assume
that the how argument is always SHUT_RDWR resulting in SHUTDOWN_MASK as
that would result in poll to immediately report EPOLLHUP instead of
properly waiting for disconnect_cfm (Disconnect Complete) which is
rather important for the likes of BAP as the CIG may need to be
reprogrammed.

Fixes: ccf74f2390d6 ("Bluetooth: Add BTPROTO_ISO socket type")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/iso.c | 35 +++++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index ced8ad4fed4f..613039ba5dbf 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -1309,7 +1309,7 @@ static int iso_sock_shutdown(struct socket *sock, int how)
 	struct sock *sk = sock->sk;
 	int err = 0;
 
-	BT_DBG("sock %p, sk %p", sock, sk);
+	BT_DBG("sock %p, sk %p, how %d", sock, sk, how);
 
 	if (!sk)
 		return 0;
@@ -1317,17 +1317,32 @@ static int iso_sock_shutdown(struct socket *sock, int how)
 	sock_hold(sk);
 	lock_sock(sk);
 
-	if (!sk->sk_shutdown) {
-		sk->sk_shutdown = SHUTDOWN_MASK;
-		iso_sock_clear_timer(sk);
-		__iso_sock_close(sk);
-
-		if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
-		    !(current->flags & PF_EXITING))
-			err = bt_sock_wait_state(sk, BT_CLOSED,
-						 sk->sk_lingertime);
+	switch (how) {
+	case SHUT_RD:
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			goto unlock;
+		sk->sk_shutdown |= RCV_SHUTDOWN;
+		break;
+	case SHUT_WR:
+		if (sk->sk_shutdown & SEND_SHUTDOWN)
+			goto unlock;
+		sk->sk_shutdown |= SEND_SHUTDOWN;
+		break;
+	case SHUT_RDWR:
+		if (sk->sk_shutdown & SHUTDOWN_MASK)
+			goto unlock;
+		sk->sk_shutdown |= SHUTDOWN_MASK;
+		break;
 	}
 
+	iso_sock_clear_timer(sk);
+	__iso_sock_close(sk);
+
+	if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
+	    !(current->flags & PF_EXITING))
+		err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime);
+
+unlock:
 	release_sock(sk);
 	sock_put(sk);
 

From f48735a9aaf8258f39918e13adf464ccd7dce33b Mon Sep 17 00:00:00 2001
From: Archie Pusaka <apusaka@chromium.org>
Date: Tue, 23 Aug 2022 12:39:22 +0800
Subject: [PATCH 26/56] Bluetooth: hci_event: Fix checking conn for
 le_conn_complete_evt

To prevent multiple conn complete events, we shouldn't look up the
conn with hci_lookup_le_connect, since it requires the state to be
BT_CONNECT. By the time the duplicate event is processed, the state
might have changed, so we end up processing the new event anyway.

Change the lookup function to hci_conn_hash_lookup_ba.

Fixes: d5ebaa7c5f6f6 ("Bluetooth: hci_event: Ignore multiple conn complete events")
Signed-off-by: Archie Pusaka <apusaka@chromium.org>
Reviewed-by: Sonny Sasaka <sonnysasaka@chromium.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 73aa9ee9d21a..6643c9c20fa4 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -5801,7 +5801,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
 	 */
 	hci_dev_clear_flag(hdev, HCI_LE_ADV);
 
-	conn = hci_lookup_le_connect(hdev);
+	conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, bdaddr);
 	if (!conn) {
 		/* In case of error status and there is no connection pending
 		 * just unlock as there is nothing to cleanup.

From cb0d160f813b23df2f4fe7af3237a026c44ed1aa Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Thu, 18 Aug 2022 23:02:07 +0200
Subject: [PATCH 27/56] Bluetooth: move from strlcpy with unused retval to
 strscpy

Follow the advice of the below link and prefer 'strscpy' in this
subsystem. Conversion is 1:1 because the return value is not used.
Generated by a coccinelle script.

Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hidp/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 5940744a8cd8..cc20e706c639 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -83,14 +83,14 @@ static void hidp_copy_session(struct hidp_session *session, struct hidp_conninfo
 		ci->product = session->input->id.product;
 		ci->version = session->input->id.version;
 		if (session->input->name)
-			strlcpy(ci->name, session->input->name, 128);
+			strscpy(ci->name, session->input->name, 128);
 		else
-			strlcpy(ci->name, "HID Boot Device", 128);
+			strscpy(ci->name, "HID Boot Device", 128);
 	} else if (session->hid) {
 		ci->vendor  = session->hid->vendor;
 		ci->product = session->hid->product;
 		ci->version = session->hid->version;
-		strlcpy(ci->name, session->hid->name, 128);
+		strscpy(ci->name, session->hid->name, 128);
 	}
 }
 

From 2da8eb834b775a9d1acea6214d3e4a78ac841e6e Mon Sep 17 00:00:00 2001
From: Zhengping Jiang <jiangzp@google.com>
Date: Tue, 23 Aug 2022 10:28:08 -0700
Subject: [PATCH 28/56] Bluetooth: hci_sync: hold hdev->lock when cleanup
 hci_conn

When disconnecting all devices, hci_conn_failed is used to cleanup
hci_conn object when the hci_conn object cannot be aborted.
The function hci_conn_failed requires the caller holds hdev->lock.

Fixes: 9b3628d79b46f ("Bluetooth: hci_sync: Cleanup hci_conn if it cannot be aborted")
Signed-off-by: Zhengping Jiang <jiangzp@google.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_sync.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 5fe440cdc68d..187786454d98 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -4773,9 +4773,11 @@ int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason)
 		/* Cleanup hci_conn object if it cannot be cancelled as it
 		 * likelly means the controller and host stack are out of sync.
 		 */
-		if (err)
+		if (err) {
+			hci_dev_lock(hdev);
 			hci_conn_failed(conn, err);
-
+			hci_dev_unlock(hdev);
+		}
 		return err;
 	case BT_CONNECT2:
 		return hci_reject_conn_sync(hdev, conn, reason);

From 7498a457ecf7ff2c4d379360aa8f24566bb1543e Mon Sep 17 00:00:00 2001
From: Casper Andersson <casper.casan@gmail.com>
Date: Thu, 25 Aug 2022 10:49:55 +0200
Subject: [PATCH 29/56] net: sparx5: fix handling uneven length packets in
 manual extraction

Packets that are not of length divisible by 4 (e.g. 77, 78, 79) would
have the checksum included up to next multiple of 4 (a 77 bytes packet
would have 3 bytes of ethernet checksum included). The check for the
value expects it in host (Little) endian.

Fixes: f3cad2611a77 ("net: sparx5: add hostmode with phylink support")
Signed-off-by: Casper Andersson <casper.casan@gmail.com>
Reviewed-by: Steen Hegelund <Steen.Hegelund@microchip.com>
Link: https://lore.kernel.org/r/20220825084955.684637-1-casper.casan@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/microchip/sparx5/sparx5_packet.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c b/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c
index 304f84aadc36..21844beba72d 100644
--- a/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c
+++ b/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c
@@ -113,6 +113,8 @@ static void sparx5_xtr_grp(struct sparx5 *sparx5, u8 grp, bool byte_swap)
 			/* This assumes STATUS_WORD_POS == 1, Status
 			 * just after last data
 			 */
+			if (!byte_swap)
+				val = ntohl((__force __be32)val);
 			byte_cnt -= (4 - XTR_VALID_BYTES(val));
 			eof_flag = true;
 			break;

From 2ca1c94ce0b65a2ce7512b718f3d8a0fe6224bca Mon Sep 17 00:00:00 2001
From: Kai-Heng Feng <kai.heng.feng@canonical.com>
Date: Fri, 26 Aug 2022 08:25:30 +0800
Subject: [PATCH 30/56] tg3: Disable tg3 device on system reboot to avoid
 triggering AER

Commit d60cd06331a3 ("PM: ACPI: reboot: Use S5 for reboot") caused a
reboot hang on one Dell servers so the commit was reverted.

Someone managed to collect the AER log and it's caused by MSI:
[ 148.762067] ACPI: Preparing to enter system sleep state S5
[ 148.794638] {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 5
[ 148.803731] {1}[Hardware Error]: event severity: recoverable
[ 148.810191] {1}[Hardware Error]: Error 0, type: fatal
[ 148.816088] {1}[Hardware Error]: section_type: PCIe error
[ 148.822391] {1}[Hardware Error]: port_type: 0, PCIe end point
[ 148.829026] {1}[Hardware Error]: version: 3.0
[ 148.834266] {1}[Hardware Error]: command: 0x0006, status: 0x0010
[ 148.841140] {1}[Hardware Error]: device_id: 0000:04:00.0
[ 148.847309] {1}[Hardware Error]: slot: 0
[ 148.852077] {1}[Hardware Error]: secondary_bus: 0x00
[ 148.857876] {1}[Hardware Error]: vendor_id: 0x14e4, device_id: 0x165f
[ 148.865145] {1}[Hardware Error]: class_code: 020000
[ 148.870845] {1}[Hardware Error]: aer_uncor_status: 0x00100000, aer_uncor_mask: 0x00010000
[ 148.879842] {1}[Hardware Error]: aer_uncor_severity: 0x000ef030
[ 148.886575] {1}[Hardware Error]: TLP Header: 40000001 0000030f 90028090 00000000
[ 148.894823] tg3 0000:04:00.0: AER: aer_status: 0x00100000, aer_mask: 0x00010000
[ 148.902795] tg3 0000:04:00.0: AER: [20] UnsupReq (First)
[ 148.910234] tg3 0000:04:00.0: AER: aer_layer=Transaction Layer, aer_agent=Requester ID
[ 148.918806] tg3 0000:04:00.0: AER: aer_uncor_severity: 0x000ef030
[ 148.925558] tg3 0000:04:00.0: AER: TLP Header: 40000001 0000030f 90028090 00000000

The MSI is probably raised by incoming packets, so power down the device
and disable bus mastering to stop the traffic, as user confirmed this
approach works.

In addition to that, be extra safe and cancel reset task if it's running.

Cc: Josef Bacik <josef@toxicpanda.com>
Link: https://lore.kernel.org/all/b8db79e6857c41dab4ef08bdf826ea7c47e3bafc.1615947283.git.josef@toxicpanda.com/
BugLink: https://bugs.launchpad.net/bugs/1917471
Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Reviewed-by: Michael Chan <michael.chan@broadcom.com>
Link: https://lore.kernel.org/r/20220826002530.1153296-1-kai.heng.feng@canonical.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/broadcom/tg3.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index db1e9d810b41..89889d8150da 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -18076,16 +18076,20 @@ static void tg3_shutdown(struct pci_dev *pdev)
 	struct net_device *dev = pci_get_drvdata(pdev);
 	struct tg3 *tp = netdev_priv(dev);
 
+	tg3_reset_task_cancel(tp);
+
 	rtnl_lock();
+
 	netif_device_detach(dev);
 
 	if (netif_running(dev))
 		dev_close(dev);
 
-	if (system_state == SYSTEM_POWER_OFF)
-		tg3_power_down(tp);
+	tg3_power_down(tp);
 
 	rtnl_unlock();
+
+	pci_disable_device(pdev);
 }
 
 /**

From 3ce9f2bef75528936c78a7053301f5725f622f3a Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 24 Aug 2022 19:39:51 -0700
Subject: [PATCH 31/56] net: smsc911x: Stop and start PHY during suspend and
 resume

Commit 744d23c71af3 ("net: phy: Warn about incorrect
mdio_bus_phy_resume() state") unveiled that the smsc911x driver was not
properly stopping and restarting the PHY during suspend/resume. Correct
that by indicating that the MAC is in charge of PHY PM operations and
ensure that all MDIO bus activity is quiescent during suspend.

Tested-by: Geert Uytterhoeven <geert+renesas@glider.be>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Fixes: fba863b81604 ("net: phy: make PHY PM ops a no-op if MAC driver manages PHY PM")
Fixes: 2aa70f864955 ("net: smsc911x: Quieten netif during suspend")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Link: https://lore.kernel.org/r/20220825023951.3220-1-f.fainelli@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/smsc/smsc911x.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/smsc/smsc911x.c b/drivers/net/ethernet/smsc/smsc911x.c
index 3bf20211cceb..3829c2805b16 100644
--- a/drivers/net/ethernet/smsc/smsc911x.c
+++ b/drivers/net/ethernet/smsc/smsc911x.c
@@ -1037,6 +1037,8 @@ static int smsc911x_mii_probe(struct net_device *dev)
 		return ret;
 	}
 
+	/* Indicate that the MAC is responsible for managing PHY PM */
+	phydev->mac_managed_pm = true;
 	phy_attached_info(phydev);
 
 	phy_set_max_speed(phydev, SPEED_100);
@@ -2587,6 +2589,8 @@ static int smsc911x_suspend(struct device *dev)
 	if (netif_running(ndev)) {
 		netif_stop_queue(ndev);
 		netif_device_detach(ndev);
+		if (!device_may_wakeup(dev))
+			phy_stop(ndev->phydev);
 	}
 
 	/* enable wake on LAN, energy detection and the external PME
@@ -2628,6 +2632,8 @@ static int smsc911x_resume(struct device *dev)
 	if (netif_running(ndev)) {
 		netif_device_attach(ndev);
 		netif_start_queue(ndev);
+		if (!device_may_wakeup(dev))
+			phy_start(ndev->phydev);
 	}
 
 	return 0;

From a87406f4adee9c53b311d8a1ba2849c69e29a6d0 Mon Sep 17 00:00:00 2001
From: Andrey Zhadchenko <andrey.zhadchenko@virtuozzo.com>
Date: Thu, 25 Aug 2022 05:03:26 +0300
Subject: [PATCH 32/56] openvswitch: fix memory leak at failed datapath
 creation

ovs_dp_cmd_new()->ovs_dp_change()->ovs_dp_set_upcall_portids()
allocates array via kmalloc.
If for some reason new_vport() fails during ovs_dp_cmd_new()
dp->upcall_portids must be freed.
Add missing kfree.

Kmemleak example:
unreferenced object 0xffff88800c382500 (size 64):
  comm "dump_state", pid 323, jiffies 4294955418 (age 104.347s)
  hex dump (first 32 bytes):
    5e c2 79 e4 1f 7a 38 c7 09 21 38 0c 80 88 ff ff  ^.y..z8..!8.....
    03 00 00 00 0a 00 00 00 14 00 00 00 28 00 00 00  ............(...
  backtrace:
    [<0000000071bebc9f>] ovs_dp_set_upcall_portids+0x38/0xa0
    [<000000000187d8bd>] ovs_dp_change+0x63/0xe0
    [<000000002397e446>] ovs_dp_cmd_new+0x1f0/0x380
    [<00000000aa06f36e>] genl_family_rcv_msg_doit+0xea/0x150
    [<000000008f583bc4>] genl_rcv_msg+0xdc/0x1e0
    [<00000000fa10e377>] netlink_rcv_skb+0x50/0x100
    [<000000004959cece>] genl_rcv+0x24/0x40
    [<000000004699ac7f>] netlink_unicast+0x23e/0x360
    [<00000000c153573e>] netlink_sendmsg+0x24e/0x4b0
    [<000000006f4aa380>] sock_sendmsg+0x62/0x70
    [<00000000d0068654>] ____sys_sendmsg+0x230/0x270
    [<0000000012dacf7d>] ___sys_sendmsg+0x88/0xd0
    [<0000000011776020>] __sys_sendmsg+0x59/0xa0
    [<000000002e8f2dc1>] do_syscall_64+0x3b/0x90
    [<000000003243e7cb>] entry_SYSCALL_64_after_hwframe+0x63/0xcd

Fixes: b83d23a2a38b ("openvswitch: Introduce per-cpu upcall dispatch")
Acked-by: Aaron Conole <aconole@redhat.com>
Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko@virtuozzo.com>
Link: https://lore.kernel.org/r/20220825020326.664073-1-andrey.zhadchenko@virtuozzo.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/openvswitch/datapath.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 7e8a39a35627..6c9d153afbee 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1802,7 +1802,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
 				ovs_dp_reset_user_features(skb, info);
 		}
 
-		goto err_unlock_and_destroy_meters;
+		goto err_destroy_portids;
 	}
 
 	err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
@@ -1817,6 +1817,8 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
 	ovs_notify(&dp_datapath_genl_family, reply, info);
 	return 0;
 
+err_destroy_portids:
+	kfree(rcu_dereference_raw(dp->upcall_portids));
 err_unlock_and_destroy_meters:
 	ovs_unlock();
 	ovs_meters_exit(dp);

From ebe5555c2f34505cdb1ae5c3de8b24e33740b3e0 Mon Sep 17 00:00:00 2001
From: Tianyu Yuan <tianyu.yuan@corigine.com>
Date: Thu, 25 Aug 2022 10:08:45 +0200
Subject: [PATCH 33/56] nfp: flower: fix ingress police using matchall filter

Referenced commit introduced nfp_policer_validate in the progress
installing rate limiter. This validate check the action id and will
reject police with CONTINUE, which is required to support ingress
police offload.

Fix this issue by allowing FLOW_ACTION_CONTINUE as notexceed action
id in nfp_policer_validate

Fixes: d97b4b105ce7 ("flow_offload: reject offload for all drivers with invalid police parameters")
Signed-off-by: Tianyu Yuan <tianyu.yuan@corigine.com>
Reviewed-by: Baowen Zheng <baowen.zheng@corigine.com>
Reviewed-by: Louis Peens <louis.peens@corigine.com>
Signed-off-by: Simon Horman <simon.horman@corigine.com>
Link: https://lore.kernel.org/r/20220825080845.507534-1-simon.horman@corigine.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/netronome/nfp/flower/qos_conf.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/qos_conf.c b/drivers/net/ethernet/netronome/nfp/flower/qos_conf.c
index 4e5df9f2c372..7b92026e1a6f 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/qos_conf.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/qos_conf.c
@@ -127,10 +127,11 @@ static int nfp_policer_validate(const struct flow_action *action,
 		return -EOPNOTSUPP;
 	}
 
-	if (act->police.notexceed.act_id != FLOW_ACTION_PIPE &&
+	if (act->police.notexceed.act_id != FLOW_ACTION_CONTINUE &&
+	    act->police.notexceed.act_id != FLOW_ACTION_PIPE &&
 	    act->police.notexceed.act_id != FLOW_ACTION_ACCEPT) {
 		NL_SET_ERR_MSG_MOD(extack,
-				   "Offload not supported when conform action is not pipe or ok");
+				   "Offload not supported when conform action is not continue, pipe or ok");
 		return -EOPNOTSUPP;
 	}
 

From 1bd3a383075c64d638e65d263c9267b08ee7733c Mon Sep 17 00:00:00 2001
From: Jean-Francois Le Fillatre <jflf_kernel@gmx.com>
Date: Wed, 24 Aug 2022 21:14:36 +0200
Subject: [PATCH 34/56] r8152: add PID for the Lenovo OneLink+ Dock

The Lenovo OneLink+ Dock contains an RTL8153 controller that behaves as
a broken CDC device by default. Add the custom Lenovo PID to the r8152
driver to support it properly.

Also, systems compatible with this dock provide a BIOS option to enable
MAC address passthrough (as per Lenovo document "ThinkPad Docking
Solutions 2017"). Add the custom PID to the MAC passthrough list too.

Tested on a ThinkPad 13 1st gen with the expected results:

passthrough disabled: Invalid header when reading pass-thru MAC addr
passthrough enabled:  Using pass-thru MAC addr XX:XX:XX:XX:XX:XX

Signed-off-by: Jean-Francois Le Fillatre <jflf_kernel@gmx.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/cdc_ether.c | 7 +++++++
 drivers/net/usb/r8152.c     | 3 +++
 2 files changed, 10 insertions(+)

diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c
index 2de09ad5bac0..e11f70911acc 100644
--- a/drivers/net/usb/cdc_ether.c
+++ b/drivers/net/usb/cdc_ether.c
@@ -777,6 +777,13 @@ static const struct usb_device_id	products[] = {
 },
 #endif
 
+/* Lenovo ThinkPad OneLink+ Dock (based on Realtek RTL8153) */
+{
+	USB_DEVICE_AND_INTERFACE_INFO(LENOVO_VENDOR_ID, 0x3054, USB_CLASS_COMM,
+			USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE),
+	.driver_info = 0,
+},
+
 /* ThinkPad USB-C Dock (based on Realtek RTL8153) */
 {
 	USB_DEVICE_AND_INTERFACE_INFO(LENOVO_VENDOR_ID, 0x3062, USB_CLASS_COMM,
diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index d142ac8fcf6e..688905ea0a6d 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -770,6 +770,7 @@ enum rtl8152_flags {
 	RX_EPROTO,
 };
 
+#define DEVICE_ID_THINKPAD_ONELINK_PLUS_DOCK		0x3054
 #define DEVICE_ID_THINKPAD_THUNDERBOLT3_DOCK_GEN2	0x3082
 #define DEVICE_ID_THINKPAD_USB_C_DONGLE			0x720c
 #define DEVICE_ID_THINKPAD_USB_C_DOCK_GEN2		0xa387
@@ -9581,6 +9582,7 @@ static bool rtl8152_supports_lenovo_macpassthru(struct usb_device *udev)
 
 	if (vendor_id == VENDOR_ID_LENOVO) {
 		switch (product_id) {
+		case DEVICE_ID_THINKPAD_ONELINK_PLUS_DOCK:
 		case DEVICE_ID_THINKPAD_THUNDERBOLT3_DOCK_GEN2:
 		case DEVICE_ID_THINKPAD_USB_C_DOCK_GEN2:
 		case DEVICE_ID_THINKPAD_USB_C_DOCK_GEN3:
@@ -9828,6 +9830,7 @@ static const struct usb_device_id rtl8152_table[] = {
 	REALTEK_USB_DEVICE(VENDOR_ID_MICROSOFT, 0x0927),
 	REALTEK_USB_DEVICE(VENDOR_ID_SAMSUNG, 0xa101),
 	REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x304f),
+	REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x3054),
 	REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x3062),
 	REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x3069),
 	REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x3082),

From f0da47118c7e93cdbbc6fb403dd729a5f2c90ee3 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Fri, 26 Aug 2022 16:29:54 +0200
Subject: [PATCH 35/56] net: mac802154: Fix a condition in the receive path

Upon reception, a packet must be categorized, either it's destination is
the host, or it is another host. A packet with no destination addressing
fields may be valid in two situations:
- the packet has no source field: only ACKs are built like that, we
  consider the host as the destination.
- the packet has a valid source field: it is directed to the PAN
  coordinator, as for know we don't have this information we consider we
  are not the PAN coordinator.

There was likely a copy/paste error made during a previous cleanup
because the if clause is now containing exactly the same condition as in
the switch case, which can never be true. In the past the destination
address was used in the switch and the source address was used in the
if, which matches what the spec says.

Cc: stable@vger.kernel.org
Fixes: ae531b9475f6 ("ieee802154: use ieee802154_addr instead of *_sa variants")
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/r/20220826142954.254853-1-miquel.raynal@bootlin.com
Signed-off-by: Stefan Schmidt <stefan@datenfreihafen.org>
---
 net/mac802154/rx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c
index b8ce84618a55..c439125ef2b9 100644
--- a/net/mac802154/rx.c
+++ b/net/mac802154/rx.c
@@ -44,7 +44,7 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata,
 
 	switch (mac_cb(skb)->dest.mode) {
 	case IEEE802154_ADDR_NONE:
-		if (mac_cb(skb)->dest.mode != IEEE802154_ADDR_NONE)
+		if (hdr->source.mode != IEEE802154_ADDR_NONE)
 			/* FIXME: check if we are PAN coordinator */
 			skb->pkt_type = PACKET_OTHERHOST;
 		else

From ffd7bdddaab193c38416fd5dd416d065517d266e Mon Sep 17 00:00:00 2001
From: Li Qiong <liqiong@nfschina.com>
Date: Mon, 29 Aug 2022 15:12:59 +0800
Subject: [PATCH 36/56] ieee802154: cc2520: add rc code in cc2520_tx()

The rc code is 0 at the error path "status & CC2520_STATUS_TX_UNDERFLOW".
Assign rc code with '-EINVAL' at this error path to fix it.

Signed-off-by: Li Qiong <liqiong@nfschina.com>
Link: https://lore.kernel.org/r/20220829071259.18330-1-liqiong@nfschina.com
Signed-off-by: Stefan Schmidt <stefan@datenfreihafen.org>
---
 drivers/net/ieee802154/cc2520.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ieee802154/cc2520.c b/drivers/net/ieee802154/cc2520.c
index 1e1f40f628a0..c69b87d3837d 100644
--- a/drivers/net/ieee802154/cc2520.c
+++ b/drivers/net/ieee802154/cc2520.c
@@ -504,6 +504,7 @@ cc2520_tx(struct ieee802154_hw *hw, struct sk_buff *skb)
 		goto err_tx;
 
 	if (status & CC2520_STATUS_TX_UNDERFLOW) {
+		rc = -EINVAL;
 		dev_err(&priv->spi->dev, "cc2520 tx underflow exception\n");
 		goto err_tx;
 	}

From 3f8ae9fe0409698799e173f698b714f34570b64b Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 25 Aug 2022 13:36:44 +0200
Subject: [PATCH 37/56] net: dsa: xrs700x: Use irqsave variant for u64 stats
 update

xrs700x_read_port_counters() updates the stats from a worker using the
u64_stats_update_begin() version. This is okay on 32-UP since on the
reader side preemption is disabled.
On 32bit-SMP the writer can be preempted by the reader at which point
the reader will spin on the seqcount until writer continues and
completes the update.

Assigning the mib_mutex mutex to the underlying seqcount would ensure
proper synchronisation. The API for that on the u64_stats_init() side
isn't available. Since it is the only user, just use disable interrupts
during the update.

Use u64_stats_update_begin_irqsave() on the writer side to ensure an
uninterrupted update.

Fixes: ee00b24f32eb8 ("net: dsa: add Arrow SpeedChips XRS700x driver")
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: George McCollister <george.mccollister@gmail.com>
Cc: Vivien Didelot <vivien.didelot@gmail.com>
Cc: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: George McCollister <george.mccollister@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/xrs700x/xrs700x.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/dsa/xrs700x/xrs700x.c b/drivers/net/dsa/xrs700x/xrs700x.c
index 3887ed33c5fe..fa622639d640 100644
--- a/drivers/net/dsa/xrs700x/xrs700x.c
+++ b/drivers/net/dsa/xrs700x/xrs700x.c
@@ -109,6 +109,7 @@ static void xrs700x_read_port_counters(struct xrs700x *priv, int port)
 {
 	struct xrs700x_port *p = &priv->ports[port];
 	struct rtnl_link_stats64 stats;
+	unsigned long flags;
 	int i;
 
 	memset(&stats, 0, sizeof(stats));
@@ -138,9 +139,9 @@ static void xrs700x_read_port_counters(struct xrs700x *priv, int port)
 	 */
 	stats.rx_packets += stats.multicast;
 
-	u64_stats_update_begin(&p->syncp);
+	flags = u64_stats_update_begin_irqsave(&p->syncp);
 	p->stats64 = stats;
-	u64_stats_update_end(&p->syncp);
+	u64_stats_update_end_irqrestore(&p->syncp, flags);
 
 	mutex_unlock(&p->mib_mutex);
 }

From 278d3ba61563ceed3cb248383ced19e14ec7bc1f Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 25 Aug 2022 13:36:45 +0200
Subject: [PATCH 38/56] net: Use u64_stats_fetch_begin_irq() for stats fetch.

On 32bit-UP u64_stats_fetch_begin() disables only preemption. If the
reader is in preemptible context and the writer side
(u64_stats_update_begin*()) runs in an interrupt context (IRQ or
softirq) then the writer can update the stats during the read operation.
This update remains undetected.

Use u64_stats_fetch_begin_irq() to ensure the stats fetch on 32bit-UP
are not interrupted by a writer. 32bit-SMP remains unaffected by this
change.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Catherine Sullivan <csully@google.com>
Cc: David Awogbemila <awogbemila@google.com>
Cc: Dimitris Michailidis <dmichail@fungible.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Hans Ulli Kroll <ulli.kroll@googlemail.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Jeroen de Borst <jeroendb@google.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Simon Horman <simon.horman@corigine.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-wireless@vger.kernel.org
Cc: netdev@vger.kernel.org
Cc: oss-drivers@corigine.com
Cc: stable@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cortina/gemini.c         | 24 +++++++++----------
 .../ethernet/fungible/funeth/funeth_txrx.h    |  4 ++--
 drivers/net/ethernet/google/gve/gve_ethtool.c | 16 ++++++-------
 drivers/net/ethernet/google/gve/gve_main.c    | 12 +++++-----
 drivers/net/ethernet/huawei/hinic/hinic_rx.c  |  4 ++--
 drivers/net/ethernet/huawei/hinic/hinic_tx.c  |  4 ++--
 .../ethernet/netronome/nfp/nfp_net_common.c   |  8 +++----
 .../ethernet/netronome/nfp/nfp_net_ethtool.c  |  8 +++----
 drivers/net/netdevsim/netdev.c                |  4 ++--
 net/mac80211/sta_info.c                       |  8 +++----
 net/mpls/af_mpls.c                            |  4 ++--
 11 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c
index 9e6de2f968fa..6dae768671e3 100644
--- a/drivers/net/ethernet/cortina/gemini.c
+++ b/drivers/net/ethernet/cortina/gemini.c
@@ -1919,7 +1919,7 @@ static void gmac_get_stats64(struct net_device *netdev,
 
 	/* Racing with RX NAPI */
 	do {
-		start = u64_stats_fetch_begin(&port->rx_stats_syncp);
+		start = u64_stats_fetch_begin_irq(&port->rx_stats_syncp);
 
 		stats->rx_packets = port->stats.rx_packets;
 		stats->rx_bytes = port->stats.rx_bytes;
@@ -1931,11 +1931,11 @@ static void gmac_get_stats64(struct net_device *netdev,
 		stats->rx_crc_errors = port->stats.rx_crc_errors;
 		stats->rx_frame_errors = port->stats.rx_frame_errors;
 
-	} while (u64_stats_fetch_retry(&port->rx_stats_syncp, start));
+	} while (u64_stats_fetch_retry_irq(&port->rx_stats_syncp, start));
 
 	/* Racing with MIB and TX completion interrupts */
 	do {
-		start = u64_stats_fetch_begin(&port->ir_stats_syncp);
+		start = u64_stats_fetch_begin_irq(&port->ir_stats_syncp);
 
 		stats->tx_errors = port->stats.tx_errors;
 		stats->tx_packets = port->stats.tx_packets;
@@ -1945,15 +1945,15 @@ static void gmac_get_stats64(struct net_device *netdev,
 		stats->rx_missed_errors = port->stats.rx_missed_errors;
 		stats->rx_fifo_errors = port->stats.rx_fifo_errors;
 
-	} while (u64_stats_fetch_retry(&port->ir_stats_syncp, start));
+	} while (u64_stats_fetch_retry_irq(&port->ir_stats_syncp, start));
 
 	/* Racing with hard_start_xmit */
 	do {
-		start = u64_stats_fetch_begin(&port->tx_stats_syncp);
+		start = u64_stats_fetch_begin_irq(&port->tx_stats_syncp);
 
 		stats->tx_dropped = port->stats.tx_dropped;
 
-	} while (u64_stats_fetch_retry(&port->tx_stats_syncp, start));
+	} while (u64_stats_fetch_retry_irq(&port->tx_stats_syncp, start));
 
 	stats->rx_dropped += stats->rx_missed_errors;
 }
@@ -2031,18 +2031,18 @@ static void gmac_get_ethtool_stats(struct net_device *netdev,
 	/* Racing with MIB interrupt */
 	do {
 		p = values;
-		start = u64_stats_fetch_begin(&port->ir_stats_syncp);
+		start = u64_stats_fetch_begin_irq(&port->ir_stats_syncp);
 
 		for (i = 0; i < RX_STATS_NUM; i++)
 			*p++ = port->hw_stats[i];
 
-	} while (u64_stats_fetch_retry(&port->ir_stats_syncp, start));
+	} while (u64_stats_fetch_retry_irq(&port->ir_stats_syncp, start));
 	values = p;
 
 	/* Racing with RX NAPI */
 	do {
 		p = values;
-		start = u64_stats_fetch_begin(&port->rx_stats_syncp);
+		start = u64_stats_fetch_begin_irq(&port->rx_stats_syncp);
 
 		for (i = 0; i < RX_STATUS_NUM; i++)
 			*p++ = port->rx_stats[i];
@@ -2050,13 +2050,13 @@ static void gmac_get_ethtool_stats(struct net_device *netdev,
 			*p++ = port->rx_csum_stats[i];
 		*p++ = port->rx_napi_exits;
 
-	} while (u64_stats_fetch_retry(&port->rx_stats_syncp, start));
+	} while (u64_stats_fetch_retry_irq(&port->rx_stats_syncp, start));
 	values = p;
 
 	/* Racing with TX start_xmit */
 	do {
 		p = values;
-		start = u64_stats_fetch_begin(&port->tx_stats_syncp);
+		start = u64_stats_fetch_begin_irq(&port->tx_stats_syncp);
 
 		for (i = 0; i < TX_MAX_FRAGS; i++) {
 			*values++ = port->tx_frag_stats[i];
@@ -2065,7 +2065,7 @@ static void gmac_get_ethtool_stats(struct net_device *netdev,
 		*values++ = port->tx_frags_linearized;
 		*values++ = port->tx_hw_csummed;
 
-	} while (u64_stats_fetch_retry(&port->tx_stats_syncp, start));
+	} while (u64_stats_fetch_retry_irq(&port->tx_stats_syncp, start));
 }
 
 static int gmac_get_ksettings(struct net_device *netdev,
diff --git a/drivers/net/ethernet/fungible/funeth/funeth_txrx.h b/drivers/net/ethernet/fungible/funeth/funeth_txrx.h
index 53b7e95213a8..671f51135c26 100644
--- a/drivers/net/ethernet/fungible/funeth/funeth_txrx.h
+++ b/drivers/net/ethernet/fungible/funeth/funeth_txrx.h
@@ -206,9 +206,9 @@ struct funeth_rxq {
 
 #define FUN_QSTAT_READ(q, seq, stats_copy) \
 	do { \
-		seq = u64_stats_fetch_begin(&(q)->syncp); \
+		seq = u64_stats_fetch_begin_irq(&(q)->syncp); \
 		stats_copy = (q)->stats; \
-	} while (u64_stats_fetch_retry(&(q)->syncp, (seq)))
+	} while (u64_stats_fetch_retry_irq(&(q)->syncp, (seq)))
 
 #define FUN_INT_NAME_LEN (IFNAMSIZ + 16)
 
diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c
index 50b384910c83..7b9a2d9d9624 100644
--- a/drivers/net/ethernet/google/gve/gve_ethtool.c
+++ b/drivers/net/ethernet/google/gve/gve_ethtool.c
@@ -177,14 +177,14 @@ gve_get_ethtool_stats(struct net_device *netdev,
 				struct gve_rx_ring *rx = &priv->rx[ring];
 
 				start =
-				  u64_stats_fetch_begin(&priv->rx[ring].statss);
+				  u64_stats_fetch_begin_irq(&priv->rx[ring].statss);
 				tmp_rx_pkts = rx->rpackets;
 				tmp_rx_bytes = rx->rbytes;
 				tmp_rx_skb_alloc_fail = rx->rx_skb_alloc_fail;
 				tmp_rx_buf_alloc_fail = rx->rx_buf_alloc_fail;
 				tmp_rx_desc_err_dropped_pkt =
 					rx->rx_desc_err_dropped_pkt;
-			} while (u64_stats_fetch_retry(&priv->rx[ring].statss,
+			} while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss,
 						       start));
 			rx_pkts += tmp_rx_pkts;
 			rx_bytes += tmp_rx_bytes;
@@ -198,10 +198,10 @@ gve_get_ethtool_stats(struct net_device *netdev,
 		if (priv->tx) {
 			do {
 				start =
-				  u64_stats_fetch_begin(&priv->tx[ring].statss);
+				  u64_stats_fetch_begin_irq(&priv->tx[ring].statss);
 				tmp_tx_pkts = priv->tx[ring].pkt_done;
 				tmp_tx_bytes = priv->tx[ring].bytes_done;
-			} while (u64_stats_fetch_retry(&priv->tx[ring].statss,
+			} while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss,
 						       start));
 			tx_pkts += tmp_tx_pkts;
 			tx_bytes += tmp_tx_bytes;
@@ -259,13 +259,13 @@ gve_get_ethtool_stats(struct net_device *netdev,
 			data[i++] = rx->fill_cnt - rx->cnt;
 			do {
 				start =
-				  u64_stats_fetch_begin(&priv->rx[ring].statss);
+				  u64_stats_fetch_begin_irq(&priv->rx[ring].statss);
 				tmp_rx_bytes = rx->rbytes;
 				tmp_rx_skb_alloc_fail = rx->rx_skb_alloc_fail;
 				tmp_rx_buf_alloc_fail = rx->rx_buf_alloc_fail;
 				tmp_rx_desc_err_dropped_pkt =
 					rx->rx_desc_err_dropped_pkt;
-			} while (u64_stats_fetch_retry(&priv->rx[ring].statss,
+			} while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss,
 						       start));
 			data[i++] = tmp_rx_bytes;
 			data[i++] = rx->rx_cont_packet_cnt;
@@ -331,9 +331,9 @@ gve_get_ethtool_stats(struct net_device *netdev,
 			}
 			do {
 				start =
-				  u64_stats_fetch_begin(&priv->tx[ring].statss);
+				  u64_stats_fetch_begin_irq(&priv->tx[ring].statss);
 				tmp_tx_bytes = tx->bytes_done;
-			} while (u64_stats_fetch_retry(&priv->tx[ring].statss,
+			} while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss,
 						       start));
 			data[i++] = tmp_tx_bytes;
 			data[i++] = tx->wake_queue;
diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
index 6cafee55efc3..044db3ebb071 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -51,10 +51,10 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s)
 		for (ring = 0; ring < priv->rx_cfg.num_queues; ring++) {
 			do {
 				start =
-				  u64_stats_fetch_begin(&priv->rx[ring].statss);
+				  u64_stats_fetch_begin_irq(&priv->rx[ring].statss);
 				packets = priv->rx[ring].rpackets;
 				bytes = priv->rx[ring].rbytes;
-			} while (u64_stats_fetch_retry(&priv->rx[ring].statss,
+			} while (u64_stats_fetch_retry_irq(&priv->rx[ring].statss,
 						       start));
 			s->rx_packets += packets;
 			s->rx_bytes += bytes;
@@ -64,10 +64,10 @@ static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s)
 		for (ring = 0; ring < priv->tx_cfg.num_queues; ring++) {
 			do {
 				start =
-				  u64_stats_fetch_begin(&priv->tx[ring].statss);
+				  u64_stats_fetch_begin_irq(&priv->tx[ring].statss);
 				packets = priv->tx[ring].pkt_done;
 				bytes = priv->tx[ring].bytes_done;
-			} while (u64_stats_fetch_retry(&priv->tx[ring].statss,
+			} while (u64_stats_fetch_retry_irq(&priv->tx[ring].statss,
 						       start));
 			s->tx_packets += packets;
 			s->tx_bytes += bytes;
@@ -1274,9 +1274,9 @@ void gve_handle_report_stats(struct gve_priv *priv)
 			}
 
 			do {
-				start = u64_stats_fetch_begin(&priv->tx[idx].statss);
+				start = u64_stats_fetch_begin_irq(&priv->tx[idx].statss);
 				tx_bytes = priv->tx[idx].bytes_done;
-			} while (u64_stats_fetch_retry(&priv->tx[idx].statss, start));
+			} while (u64_stats_fetch_retry_irq(&priv->tx[idx].statss, start));
 			stats[stats_idx++] = (struct stats) {
 				.stat_name = cpu_to_be32(TX_WAKE_CNT),
 				.value = cpu_to_be64(priv->tx[idx].wake_queue),
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_rx.c b/drivers/net/ethernet/huawei/hinic/hinic_rx.c
index a866bea65110..e5828a658caf 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_rx.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_rx.c
@@ -74,14 +74,14 @@ void hinic_rxq_get_stats(struct hinic_rxq *rxq, struct hinic_rxq_stats *stats)
 	unsigned int start;
 
 	do {
-		start = u64_stats_fetch_begin(&rxq_stats->syncp);
+		start = u64_stats_fetch_begin_irq(&rxq_stats->syncp);
 		stats->pkts = rxq_stats->pkts;
 		stats->bytes = rxq_stats->bytes;
 		stats->errors = rxq_stats->csum_errors +
 				rxq_stats->other_errors;
 		stats->csum_errors = rxq_stats->csum_errors;
 		stats->other_errors = rxq_stats->other_errors;
-	} while (u64_stats_fetch_retry(&rxq_stats->syncp, start));
+	} while (u64_stats_fetch_retry_irq(&rxq_stats->syncp, start));
 }
 
 /**
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_tx.c b/drivers/net/ethernet/huawei/hinic/hinic_tx.c
index 5051cdff2384..3b6c7b585737 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_tx.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_tx.c
@@ -99,14 +99,14 @@ void hinic_txq_get_stats(struct hinic_txq *txq, struct hinic_txq_stats *stats)
 	unsigned int start;
 
 	do {
-		start = u64_stats_fetch_begin(&txq_stats->syncp);
+		start = u64_stats_fetch_begin_irq(&txq_stats->syncp);
 		stats->pkts    = txq_stats->pkts;
 		stats->bytes   = txq_stats->bytes;
 		stats->tx_busy = txq_stats->tx_busy;
 		stats->tx_wake = txq_stats->tx_wake;
 		stats->tx_dropped = txq_stats->tx_dropped;
 		stats->big_frags_pkts = txq_stats->big_frags_pkts;
-	} while (u64_stats_fetch_retry(&txq_stats->syncp, start));
+	} while (u64_stats_fetch_retry_irq(&txq_stats->syncp, start));
 }
 
 /**
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index cf4d6f1129fa..349a2b1a19a2 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1630,21 +1630,21 @@ static void nfp_net_stat64(struct net_device *netdev,
 		unsigned int start;
 
 		do {
-			start = u64_stats_fetch_begin(&r_vec->rx_sync);
+			start = u64_stats_fetch_begin_irq(&r_vec->rx_sync);
 			data[0] = r_vec->rx_pkts;
 			data[1] = r_vec->rx_bytes;
 			data[2] = r_vec->rx_drops;
-		} while (u64_stats_fetch_retry(&r_vec->rx_sync, start));
+		} while (u64_stats_fetch_retry_irq(&r_vec->rx_sync, start));
 		stats->rx_packets += data[0];
 		stats->rx_bytes += data[1];
 		stats->rx_dropped += data[2];
 
 		do {
-			start = u64_stats_fetch_begin(&r_vec->tx_sync);
+			start = u64_stats_fetch_begin_irq(&r_vec->tx_sync);
 			data[0] = r_vec->tx_pkts;
 			data[1] = r_vec->tx_bytes;
 			data[2] = r_vec->tx_errors;
-		} while (u64_stats_fetch_retry(&r_vec->tx_sync, start));
+		} while (u64_stats_fetch_retry_irq(&r_vec->tx_sync, start));
 		stats->tx_packets += data[0];
 		stats->tx_bytes += data[1];
 		stats->tx_errors += data[2];
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
index eeb1455a4e5d..b1b1b648e40c 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
@@ -649,7 +649,7 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data)
 		unsigned int start;
 
 		do {
-			start = u64_stats_fetch_begin(&nn->r_vecs[i].rx_sync);
+			start = u64_stats_fetch_begin_irq(&nn->r_vecs[i].rx_sync);
 			data[0] = nn->r_vecs[i].rx_pkts;
 			tmp[0] = nn->r_vecs[i].hw_csum_rx_ok;
 			tmp[1] = nn->r_vecs[i].hw_csum_rx_inner_ok;
@@ -657,10 +657,10 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data)
 			tmp[3] = nn->r_vecs[i].hw_csum_rx_error;
 			tmp[4] = nn->r_vecs[i].rx_replace_buf_alloc_fail;
 			tmp[5] = nn->r_vecs[i].hw_tls_rx;
-		} while (u64_stats_fetch_retry(&nn->r_vecs[i].rx_sync, start));
+		} while (u64_stats_fetch_retry_irq(&nn->r_vecs[i].rx_sync, start));
 
 		do {
-			start = u64_stats_fetch_begin(&nn->r_vecs[i].tx_sync);
+			start = u64_stats_fetch_begin_irq(&nn->r_vecs[i].tx_sync);
 			data[1] = nn->r_vecs[i].tx_pkts;
 			data[2] = nn->r_vecs[i].tx_busy;
 			tmp[6] = nn->r_vecs[i].hw_csum_tx;
@@ -670,7 +670,7 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data)
 			tmp[10] = nn->r_vecs[i].hw_tls_tx;
 			tmp[11] = nn->r_vecs[i].tls_tx_fallback;
 			tmp[12] = nn->r_vecs[i].tls_tx_no_fallback;
-		} while (u64_stats_fetch_retry(&nn->r_vecs[i].tx_sync, start));
+		} while (u64_stats_fetch_retry_irq(&nn->r_vecs[i].tx_sync, start));
 
 		data += NN_RVEC_PER_Q_STATS;
 
diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
index e470e3398abc..9a1a5b203624 100644
--- a/drivers/net/netdevsim/netdev.c
+++ b/drivers/net/netdevsim/netdev.c
@@ -67,10 +67,10 @@ nsim_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	unsigned int start;
 
 	do {
-		start = u64_stats_fetch_begin(&ns->syncp);
+		start = u64_stats_fetch_begin_irq(&ns->syncp);
 		stats->tx_bytes = ns->tx_bytes;
 		stats->tx_packets = ns->tx_packets;
-	} while (u64_stats_fetch_retry(&ns->syncp, start));
+	} while (u64_stats_fetch_retry_irq(&ns->syncp, start));
 }
 
 static int
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 330dab41f2fe..58998d821778 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -2316,9 +2316,9 @@ static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats,
 	u64 value;
 
 	do {
-		start = u64_stats_fetch_begin(&rxstats->syncp);
+		start = u64_stats_fetch_begin_irq(&rxstats->syncp);
 		value = rxstats->msdu[tid];
-	} while (u64_stats_fetch_retry(&rxstats->syncp, start));
+	} while (u64_stats_fetch_retry_irq(&rxstats->syncp, start));
 
 	return value;
 }
@@ -2384,9 +2384,9 @@ static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats)
 	u64 value;
 
 	do {
-		start = u64_stats_fetch_begin(&rxstats->syncp);
+		start = u64_stats_fetch_begin_irq(&rxstats->syncp);
 		value = rxstats->bytes;
-	} while (u64_stats_fetch_retry(&rxstats->syncp, start));
+	} while (u64_stats_fetch_retry_irq(&rxstats->syncp, start));
 
 	return value;
 }
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 35b5f806fdda..b52afe316dc4 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -1079,9 +1079,9 @@ static void mpls_get_stats(struct mpls_dev *mdev,
 
 		p = per_cpu_ptr(mdev->stats, i);
 		do {
-			start = u64_stats_fetch_begin(&p->syncp);
+			start = u64_stats_fetch_begin_irq(&p->syncp);
 			local = p->stats;
-		} while (u64_stats_fetch_retry(&p->syncp, start));
+		} while (u64_stats_fetch_retry_irq(&p->syncp, start));
 
 		stats->rx_packets	+= local.rx_packets;
 		stats->rx_bytes		+= local.rx_bytes;

From b05972f01e7d30419987a1f221b5593668fd6448 Mon Sep 17 00:00:00 2001
From: Zhengchao Shao <shaozhengchao@huawei.com>
Date: Fri, 26 Aug 2022 09:39:30 +0800
Subject: [PATCH 39/56] net: sched: tbf: don't call qdisc_put() while holding
 tree lock

The issue is the same to commit c2999f7fb05b ("net: sched: multiq: don't
call qdisc_put() while holding tree lock"). Qdiscs call qdisc_put() while
holding sch tree spinlock, which results sleeping-while-atomic BUG.

Fixes: c266f64dbfa2 ("net: sched: protect block state with mutex")
Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
Link: https://lore.kernel.org/r/20220826013930.340121-1-shaozhengchao@huawei.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/sched/sch_tbf.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 72102277449e..36079fdde2cb 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -356,6 +356,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
 	struct nlattr *tb[TCA_TBF_MAX + 1];
 	struct tc_tbf_qopt *qopt;
 	struct Qdisc *child = NULL;
+	struct Qdisc *old = NULL;
 	struct psched_ratecfg rate;
 	struct psched_ratecfg peak;
 	u64 max_size;
@@ -447,7 +448,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
 	sch_tree_lock(sch);
 	if (child) {
 		qdisc_tree_flush_backlog(q->qdisc);
-		qdisc_put(q->qdisc);
+		old = q->qdisc;
 		q->qdisc = child;
 	}
 	q->limit = qopt->limit;
@@ -467,6 +468,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
 	memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));
 
 	sch_tree_unlock(sch);
+	qdisc_put(old);
 	err = 0;
 
 	tbf_offload_change(sch);

From f612466ebecb12a00d9152344ddda6f6345f04dc Mon Sep 17 00:00:00 2001
From: Wang Hai <wanghai38@huawei.com>
Date: Fri, 26 Aug 2022 17:00:55 +0800
Subject: [PATCH 40/56] net/sched: fix netdevice reference leaks in
 attach_default_qdiscs()

In attach_default_qdiscs(), if a dev has multiple queues and queue 0 fails
to attach qdisc because there is no memory in attach_one_default_qdisc().
Then dev->qdisc will be noop_qdisc by default. But the other queues may be
able to successfully attach to default qdisc.

In this case, the fallback to noqueue process will be triggered. If the
original attached qdisc is not released and a new one is directly
attached, this will cause netdevice reference leaks.

The following is the bug log:

veth0: default qdisc (fq_codel) fail, fallback to noqueue
unregister_netdevice: waiting for veth0 to become free. Usage count = 32
leaked reference.
 qdisc_alloc+0x12e/0x210
 qdisc_create_dflt+0x62/0x140
 attach_one_default_qdisc.constprop.41+0x44/0x70
 dev_activate+0x128/0x290
 __dev_open+0x12a/0x190
 __dev_change_flags+0x1a2/0x1f0
 dev_change_flags+0x23/0x60
 do_setlink+0x332/0x1150
 __rtnl_newlink+0x52f/0x8e0
 rtnl_newlink+0x43/0x70
 rtnetlink_rcv_msg+0x140/0x3b0
 netlink_rcv_skb+0x50/0x100
 netlink_unicast+0x1bb/0x290
 netlink_sendmsg+0x37c/0x4e0
 sock_sendmsg+0x5f/0x70
 ____sys_sendmsg+0x208/0x280

Fix this bug by clearing any non-noop qdiscs that may have been assigned
before trying to re-attach.

Fixes: bf6dba76d278 ("net: sched: fallback to qdisc noqueue if default qdisc setup fail")
Signed-off-by: Wang Hai <wanghai38@huawei.com>
Link: https://lore.kernel.org/r/20220826090055.24424-1-wanghai38@huawei.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/sched/sch_generic.c | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 99b697ad2b98..7a8ea03f673d 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1122,6 +1122,21 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
 }
 EXPORT_SYMBOL(dev_graft_qdisc);
 
+static void shutdown_scheduler_queue(struct net_device *dev,
+				     struct netdev_queue *dev_queue,
+				     void *_qdisc_default)
+{
+	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
+	struct Qdisc *qdisc_default = _qdisc_default;
+
+	if (qdisc) {
+		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
+		dev_queue->qdisc_sleeping = qdisc_default;
+
+		qdisc_put(qdisc);
+	}
+}
+
 static void attach_one_default_qdisc(struct net_device *dev,
 				     struct netdev_queue *dev_queue,
 				     void *_unused)
@@ -1169,6 +1184,7 @@ static void attach_default_qdiscs(struct net_device *dev)
 	if (qdisc == &noop_qdisc) {
 		netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n",
 			    default_qdisc_ops->id, noqueue_qdisc_ops.id);
+		netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
 		dev->priv_flags |= IFF_NO_QUEUE;
 		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
 		qdisc = txq->qdisc_sleeping;
@@ -1447,21 +1463,6 @@ void dev_init_scheduler(struct net_device *dev)
 	timer_setup(&dev->watchdog_timer, dev_watchdog, 0);
 }
 
-static void shutdown_scheduler_queue(struct net_device *dev,
-				     struct netdev_queue *dev_queue,
-				     void *_qdisc_default)
-{
-	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
-	struct Qdisc *qdisc_default = _qdisc_default;
-
-	if (qdisc) {
-		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
-		dev_queue->qdisc_sleeping = qdisc_default;
-
-		qdisc_put(qdisc);
-	}
-}
-
 void dev_shutdown(struct net_device *dev)
 {
 	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);

From fce1c23f629173e0db78b79a74f2052044a00e65 Mon Sep 17 00:00:00 2001
From: Alvaro Karsz <alvaro.karsz@solid-run.com>
Date: Tue, 23 Aug 2022 10:39:47 +0300
Subject: [PATCH 41/56] net: virtio_net: fix notification coalescing comments

Fix wording in comments for the notifications coalescing feature.

Signed-off-by: Alvaro Karsz <alvaro.karsz@solid-run.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Link: https://lore.kernel.org/r/20220823073947.14774-1-alvaro.karsz@solid-run.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/virtio_net.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index 29ced55514d4..6cb842ea8979 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -56,7 +56,7 @@
 #define VIRTIO_NET_F_MQ	22	/* Device supports Receive Flow
 					 * Steering */
 #define VIRTIO_NET_F_CTRL_MAC_ADDR 23	/* Set MAC address */
-#define VIRTIO_NET_F_NOTF_COAL	53	/* Guest can handle notifications coalescing */
+#define VIRTIO_NET_F_NOTF_COAL	53	/* Device supports notifications coalescing */
 #define VIRTIO_NET_F_HASH_REPORT  57	/* Supports hash report */
 #define VIRTIO_NET_F_RSS	  60	/* Supports RSS RX steering */
 #define VIRTIO_NET_F_RSC_EXT	  61	/* extended coalescing info */
@@ -364,24 +364,24 @@ struct virtio_net_hash_config {
  */
 #define VIRTIO_NET_CTRL_NOTF_COAL		6
 /*
- * Set the tx-usecs/tx-max-packets patameters.
- * tx-usecs - Maximum number of usecs to delay a TX notification.
- * tx-max-packets - Maximum number of packets to send before a TX notification.
+ * Set the tx-usecs/tx-max-packets parameters.
  */
 struct virtio_net_ctrl_coal_tx {
+	/* Maximum number of packets to send before a TX notification */
 	__le32 tx_max_packets;
+	/* Maximum number of usecs to delay a TX notification */
 	__le32 tx_usecs;
 };
 
 #define VIRTIO_NET_CTRL_NOTF_COAL_TX_SET		0
 
 /*
- * Set the rx-usecs/rx-max-packets patameters.
- * rx-usecs - Maximum number of usecs to delay a RX notification.
- * rx-max-frames - Maximum number of packets to receive before a RX notification.
+ * Set the rx-usecs/rx-max-packets parameters.
  */
 struct virtio_net_ctrl_coal_rx {
+	/* Maximum number of packets to receive before a RX notification */
 	__le32 rx_max_packets;
+	/* Maximum number of usecs to delay a RX notification */
 	__le32 rx_usecs;
 };
 

From 4a4ce82212ef014d70f486a427005b2b5bab8e34 Mon Sep 17 00:00:00 2001
From: Horatiu Vultur <horatiu.vultur@microchip.com>
Date: Tue, 30 Aug 2022 08:40:55 +0200
Subject: [PATCH 42/56] net: phy: micrel: Make the GPIO to be non-exclusive

The same GPIO line can be shared by multiple phys for the coma mode pin.
If that is the case then, all the other phys that share the same line
will failed to be probed because the access to the gpio line is not
non-exclusive.
Fix this by making access to the gpio line to be nonexclusive using flag
GPIOD_FLAGS_BIT_NONEXCLUSIVE. This allows all the other PHYs to be
probed.

Fixes: 738871b09250ee ("net: phy: micrel: add coma mode GPIO")
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Link: https://lore.kernel.org/r/20220830064055.2340403-1-horatiu.vultur@microchip.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/micrel.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index e78d0bf69bc3..6f52b4fb6888 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -2873,12 +2873,18 @@ static int lan8814_config_init(struct phy_device *phydev)
 	return 0;
 }
 
+/* It is expected that there will not be any 'lan8814_take_coma_mode'
+ * function called in suspend. Because the GPIO line can be shared, so if one of
+ * the phys goes back in coma mode, then all the other PHYs will go, which is
+ * wrong.
+ */
 static int lan8814_release_coma_mode(struct phy_device *phydev)
 {
 	struct gpio_desc *gpiod;
 
 	gpiod = devm_gpiod_get_optional(&phydev->mdio.dev, "coma-mode",
-					GPIOD_OUT_HIGH_OPEN_DRAIN);
+					GPIOD_OUT_HIGH_OPEN_DRAIN |
+					GPIOD_FLAGS_BIT_NONEXCLUSIVE);
 	if (IS_ERR(gpiod))
 		return PTR_ERR(gpiod);
 

From 642b2122c5df332a6df52dd1f91e552bc4356951 Mon Sep 17 00:00:00 2001
From: Gao Xiao <gao.xiao@corigine.com>
Date: Mon, 29 Aug 2022 12:16:51 +0200
Subject: [PATCH 43/56] nfp: fix the access to management firmware hanging

When running `ethtool -p` with the old management firmware,
the management firmware resource is not correctly released,
which causes firmware related malfunction: all the access
to management firmware hangs.

It releases the management firmware resource when set id
mode operation is not supported.

Fixes: ccb9bc1dfa44 ("nfp: add 'ethtool --identify' support")
Signed-off-by: Gao Xiao <gao.xiao@corigine.com>
Reviewed-by: Louis Peens <louis.peens@corigine.com>
Signed-off-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Link: https://lore.kernel.org/r/20220829101651.633840-1-simon.horman@corigine.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c
index edd300033735..4cc38799eabc 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c
@@ -507,6 +507,7 @@ int nfp_eth_set_idmode(struct nfp_cpp *cpp, unsigned int idx, bool state)
 	if (nfp_nsp_get_abi_ver_minor(nsp) < 32) {
 		nfp_err(nfp_nsp_cpp(nsp),
 			"set id mode operation not supported, please update flash\n");
+		nfp_eth_config_cleanup_end(nsp);
 		return -EOPNOTSUPP;
 	}
 

From 13a9d08c296228d18289de60b83792c586e1d073 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Fri, 26 Aug 2022 18:00:30 +0300
Subject: [PATCH 44/56] net: lan966x: improve error handle in
 lan966x_fdma_rx_get_frame()

Don't just print a warning.  Clean up and return an error as well.

Fixes: c8349639324a ("net: lan966x: Add FDMA functionality")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Link: https://lore.kernel.org/r/YwjgDm/SVd5c1tQU@kili
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c
index 6dea7f8c1481..51f8a0816377 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c
@@ -425,7 +425,8 @@ static struct sk_buff *lan966x_fdma_rx_get_frame(struct lan966x_rx *rx)
 	lan966x_ifh_get_src_port(skb->data, &src_port);
 	lan966x_ifh_get_timestamp(skb->data, &timestamp);
 
-	WARN_ON(src_port >= lan966x->num_phys_ports);
+	if (WARN_ON(src_port >= lan966x->num_phys_ports))
+		goto free_skb;
 
 	skb->dev = lan966x->ports[src_port]->dev;
 	skb_pull(skb, IFH_LEN * sizeof(u32));
@@ -449,6 +450,8 @@ static struct sk_buff *lan966x_fdma_rx_get_frame(struct lan966x_rx *rx)
 
 	return skb;
 
+free_skb:
+	kfree_skb(skb);
 unmap_page:
 	dma_unmap_page(lan966x->dev, (dma_addr_t)db->dataptr,
 		       FDMA_DCB_STATUS_BLOCKL(db->status),

From c0955bf957be4bead01fae1d791476260da7325d Mon Sep 17 00:00:00 2001
From: Duoming Zhou <duoming@zju.edu.cn>
Date: Sat, 27 Aug 2022 23:38:15 +0800
Subject: [PATCH 45/56] ethernet: rocker: fix sleep in atomic context bug in
 neigh_timer_handler

The function neigh_timer_handler() is a timer handler that runs in an
atomic context. When used by rocker, neigh_timer_handler() calls
"kzalloc(.., GFP_KERNEL)" that may sleep. As a result, the sleep in
atomic context bug will happen. One of the processes is shown below:

ofdpa_fib4_add()
 ...
 neigh_add_timer()

(wait a timer)

neigh_timer_handler()
 neigh_release()
  neigh_destroy()
   rocker_port_neigh_destroy()
    rocker_world_port_neigh_destroy()
     ofdpa_port_neigh_destroy()
      ofdpa_port_ipv4_neigh()
       kzalloc(sizeof(.., GFP_KERNEL) //may sleep

This patch changes the gfp_t parameter of kzalloc() from GFP_KERNEL to
GFP_ATOMIC in order to mitigate the bug.

Fixes: 00fc0c51e35b ("rocker: Change world_ops API and implementation to be switchdev independant")
Signed-off-by: Duoming Zhou <duoming@zju.edu.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/rocker/rocker_ofdpa.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c
index bc70c6abd6a5..58cf7cc54f40 100644
--- a/drivers/net/ethernet/rocker/rocker_ofdpa.c
+++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c
@@ -1273,7 +1273,7 @@ static int ofdpa_port_ipv4_neigh(struct ofdpa_port *ofdpa_port,
 	bool removing;
 	int err = 0;
 
-	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
 	if (!entry)
 		return -ENOMEM;
 

From 3a1a274e933fca73fdc960cb1f60636cd285a265 Mon Sep 17 00:00:00 2001
From: David Thompson <davthompson@nvidia.com>
Date: Fri, 26 Aug 2022 11:59:16 -0400
Subject: [PATCH 46/56] mlxbf_gige: compute MDIO period based on i1clk

This patch adds logic to compute the MDIO period based on
the i1clk, and thereafter write the MDIO period into the YU
MDIO config register. The i1clk resource from the ACPI table
is used to provide addressing to YU bootrecord PLL registers.
The values in these registers are used to compute MDIO period.
If the i1clk resource is not present in the ACPI table, then
the current default hardcorded value of 430Mhz is used.
The i1clk clock value of 430MHz is only accurate for boards
with BF2 mid bin and main bin SoCs. The BF2 high bin SoCs
have i1clk = 500MHz, but can support a slower MDIO period.

Fixes: f92e1869d74e ("Add Mellanox BlueField Gigabit Ethernet driver")
Reviewed-by: Asmaa Mnebhi <asmaa@nvidia.com>
Signed-off-by: David Thompson <davthompson@nvidia.com>
Link: https://lore.kernel.org/r/20220826155916.12491-1-davthompson@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../ethernet/mellanox/mlxbf_gige/mlxbf_gige.h |   4 +-
 .../mellanox/mlxbf_gige/mlxbf_gige_mdio.c     | 128 +++++++++++++++---
 .../mellanox/mlxbf_gige/mlxbf_gige_regs.h     |   2 +
 3 files changed, 113 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige.h b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige.h
index 5fdf9b7179f5..5a1027b07215 100644
--- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige.h
+++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige.h
@@ -75,6 +75,7 @@ struct mlxbf_gige {
 	struct net_device *netdev;
 	struct platform_device *pdev;
 	void __iomem *mdio_io;
+	void __iomem *clk_io;
 	struct mii_bus *mdiobus;
 	spinlock_t lock;      /* for packet processing indices */
 	u16 rx_q_entries;
@@ -137,7 +138,8 @@ enum mlxbf_gige_res {
 	MLXBF_GIGE_RES_MDIO9,
 	MLXBF_GIGE_RES_GPIO0,
 	MLXBF_GIGE_RES_LLU,
-	MLXBF_GIGE_RES_PLU
+	MLXBF_GIGE_RES_PLU,
+	MLXBF_GIGE_RES_CLK
 };
 
 /* Version of register data returned by mlxbf_gige_get_regs() */
diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_mdio.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_mdio.c
index 2e6c1b7af096..85155cd9405c 100644
--- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_mdio.c
+++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_mdio.c
@@ -22,10 +22,23 @@
 #include <linux/property.h>
 
 #include "mlxbf_gige.h"
+#include "mlxbf_gige_regs.h"
 
 #define MLXBF_GIGE_MDIO_GW_OFFSET	0x0
 #define MLXBF_GIGE_MDIO_CFG_OFFSET	0x4
 
+#define MLXBF_GIGE_MDIO_FREQ_REFERENCE 156250000ULL
+#define MLXBF_GIGE_MDIO_COREPLL_CONST  16384ULL
+#define MLXBF_GIGE_MDC_CLK_NS          400
+#define MLXBF_GIGE_MDIO_PLL_I1CLK_REG1 0x4
+#define MLXBF_GIGE_MDIO_PLL_I1CLK_REG2 0x8
+#define MLXBF_GIGE_MDIO_CORE_F_SHIFT   0
+#define MLXBF_GIGE_MDIO_CORE_F_MASK    GENMASK(25, 0)
+#define MLXBF_GIGE_MDIO_CORE_R_SHIFT   26
+#define MLXBF_GIGE_MDIO_CORE_R_MASK    GENMASK(31, 26)
+#define MLXBF_GIGE_MDIO_CORE_OD_SHIFT  0
+#define MLXBF_GIGE_MDIO_CORE_OD_MASK   GENMASK(3, 0)
+
 /* Support clause 22 */
 #define MLXBF_GIGE_MDIO_CL22_ST1	0x1
 #define MLXBF_GIGE_MDIO_CL22_WRITE	0x1
@@ -50,28 +63,77 @@
 #define MLXBF_GIGE_MDIO_CFG_MDIO_IN_SAMP_MASK		GENMASK(23, 16)
 #define MLXBF_GIGE_MDIO_CFG_MDIO_OUT_SAMP_MASK		GENMASK(31, 24)
 
-/* Formula for encoding the MDIO period. The encoded value is
- * passed to the MDIO config register.
- *
- * mdc_clk = 2*(val + 1)*i1clk
- *
- * 400 ns = 2*(val + 1)*(((1/430)*1000) ns)
- *
- * val = (((400 * 430 / 1000) / 2) - 1)
- */
-#define MLXBF_GIGE_I1CLK_MHZ		430
-#define MLXBF_GIGE_MDC_CLK_NS		400
-
-#define MLXBF_GIGE_MDIO_PERIOD	(((MLXBF_GIGE_MDC_CLK_NS * MLXBF_GIGE_I1CLK_MHZ / 1000) / 2) - 1)
-
 #define MLXBF_GIGE_MDIO_CFG_VAL (FIELD_PREP(MLXBF_GIGE_MDIO_CFG_MDIO_MODE_MASK, 1) | \
 				 FIELD_PREP(MLXBF_GIGE_MDIO_CFG_MDIO3_3_MASK, 1) | \
 				 FIELD_PREP(MLXBF_GIGE_MDIO_CFG_MDIO_FULL_DRIVE_MASK, 1) | \
-				 FIELD_PREP(MLXBF_GIGE_MDIO_CFG_MDC_PERIOD_MASK, \
-					    MLXBF_GIGE_MDIO_PERIOD) | \
 				 FIELD_PREP(MLXBF_GIGE_MDIO_CFG_MDIO_IN_SAMP_MASK, 6) | \
 				 FIELD_PREP(MLXBF_GIGE_MDIO_CFG_MDIO_OUT_SAMP_MASK, 13))
 
+#define MLXBF_GIGE_BF2_COREPLL_ADDR 0x02800c30
+#define MLXBF_GIGE_BF2_COREPLL_SIZE 0x0000000c
+
+static struct resource corepll_params[] = {
+	[MLXBF_GIGE_VERSION_BF2] = {
+		.start = MLXBF_GIGE_BF2_COREPLL_ADDR,
+		.end = MLXBF_GIGE_BF2_COREPLL_ADDR + MLXBF_GIGE_BF2_COREPLL_SIZE - 1,
+		.name = "COREPLL_RES"
+	},
+};
+
+/* Returns core clock i1clk in Hz */
+static u64 calculate_i1clk(struct mlxbf_gige *priv)
+{
+	u8 core_od, core_r;
+	u64 freq_output;
+	u32 reg1, reg2;
+	u32 core_f;
+
+	reg1 = readl(priv->clk_io + MLXBF_GIGE_MDIO_PLL_I1CLK_REG1);
+	reg2 = readl(priv->clk_io + MLXBF_GIGE_MDIO_PLL_I1CLK_REG2);
+
+	core_f = (reg1 & MLXBF_GIGE_MDIO_CORE_F_MASK) >>
+		MLXBF_GIGE_MDIO_CORE_F_SHIFT;
+	core_r = (reg1 & MLXBF_GIGE_MDIO_CORE_R_MASK) >>
+		MLXBF_GIGE_MDIO_CORE_R_SHIFT;
+	core_od = (reg2 & MLXBF_GIGE_MDIO_CORE_OD_MASK) >>
+		MLXBF_GIGE_MDIO_CORE_OD_SHIFT;
+
+	/* Compute PLL output frequency as follow:
+	 *
+	 *                                     CORE_F / 16384
+	 * freq_output = freq_reference * ----------------------------
+	 *                              (CORE_R + 1) * (CORE_OD + 1)
+	 */
+	freq_output = div_u64((MLXBF_GIGE_MDIO_FREQ_REFERENCE * core_f),
+			      MLXBF_GIGE_MDIO_COREPLL_CONST);
+	freq_output = div_u64(freq_output, (core_r + 1) * (core_od + 1));
+
+	return freq_output;
+}
+
+/* Formula for encoding the MDIO period. The encoded value is
+ * passed to the MDIO config register.
+ *
+ * mdc_clk = 2*(val + 1)*(core clock in sec)
+ *
+ * i1clk is in Hz:
+ * 400 ns = 2*(val + 1)*(1/i1clk)
+ *
+ * val = (((400/10^9) / (1/i1clk) / 2) - 1)
+ * val = (400/2 * i1clk)/10^9 - 1
+ */
+static u8 mdio_period_map(struct mlxbf_gige *priv)
+{
+	u8 mdio_period;
+	u64 i1clk;
+
+	i1clk = calculate_i1clk(priv);
+
+	mdio_period = div_u64((MLXBF_GIGE_MDC_CLK_NS >> 1) * i1clk, 1000000000) - 1;
+
+	return mdio_period;
+}
+
 static u32 mlxbf_gige_mdio_create_cmd(u16 data, int phy_add,
 				      int phy_reg, u32 opcode)
 {
@@ -124,9 +186,9 @@ static int mlxbf_gige_mdio_write(struct mii_bus *bus, int phy_add,
 				 int phy_reg, u16 val)
 {
 	struct mlxbf_gige *priv = bus->priv;
+	u32 temp;
 	u32 cmd;
 	int ret;
-	u32 temp;
 
 	if (phy_reg & MII_ADDR_C45)
 		return -EOPNOTSUPP;
@@ -144,18 +206,44 @@ static int mlxbf_gige_mdio_write(struct mii_bus *bus, int phy_add,
 	return ret;
 }
 
+static void mlxbf_gige_mdio_cfg(struct mlxbf_gige *priv)
+{
+	u8 mdio_period;
+	u32 val;
+
+	mdio_period = mdio_period_map(priv);
+
+	val = MLXBF_GIGE_MDIO_CFG_VAL;
+	val |= FIELD_PREP(MLXBF_GIGE_MDIO_CFG_MDC_PERIOD_MASK, mdio_period);
+	writel(val, priv->mdio_io + MLXBF_GIGE_MDIO_CFG_OFFSET);
+}
+
 int mlxbf_gige_mdio_probe(struct platform_device *pdev, struct mlxbf_gige *priv)
 {
 	struct device *dev = &pdev->dev;
+	struct resource *res;
 	int ret;
 
 	priv->mdio_io = devm_platform_ioremap_resource(pdev, MLXBF_GIGE_RES_MDIO9);
 	if (IS_ERR(priv->mdio_io))
 		return PTR_ERR(priv->mdio_io);
 
-	/* Configure mdio parameters */
-	writel(MLXBF_GIGE_MDIO_CFG_VAL,
-	       priv->mdio_io + MLXBF_GIGE_MDIO_CFG_OFFSET);
+	/* clk resource shared with other drivers so cannot use
+	 * devm_platform_ioremap_resource
+	 */
+	res = platform_get_resource(pdev, IORESOURCE_MEM, MLXBF_GIGE_RES_CLK);
+	if (!res) {
+		/* For backward compatibility with older ACPI tables, also keep
+		 * CLK resource internal to the driver.
+		 */
+		res = &corepll_params[MLXBF_GIGE_VERSION_BF2];
+	}
+
+	priv->clk_io = devm_ioremap(dev, res->start, resource_size(res));
+	if (IS_ERR(priv->clk_io))
+		return PTR_ERR(priv->clk_io);
+
+	mlxbf_gige_mdio_cfg(priv);
 
 	priv->mdiobus = devm_mdiobus_alloc(dev);
 	if (!priv->mdiobus) {
diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_regs.h b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_regs.h
index 5fb33c9294bf..7be3a793984d 100644
--- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_regs.h
+++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_regs.h
@@ -8,6 +8,8 @@
 #ifndef __MLXBF_GIGE_REGS_H__
 #define __MLXBF_GIGE_REGS_H__
 
+#define MLXBF_GIGE_VERSION                            0x0000
+#define MLXBF_GIGE_VERSION_BF2                        0x0
 #define MLXBF_GIGE_STATUS                             0x0010
 #define MLXBF_GIGE_STATUS_READY                       BIT(0)
 #define MLXBF_GIGE_INT_STATUS                         0x0028

From 8fc29ff3910f3af08a7c40a75d436b5720efe2bf Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Sat, 27 Aug 2022 11:13:14 -0700
Subject: [PATCH 47/56] kcm: fix strp_init() order and cleanup

strp_init() is called just a few lines above this csk->sk_user_data
check, it also initializes strp->work etc., therefore, it is
unnecessary to call strp_done() to cancel the freshly initialized
work.

And if sk_user_data is already used by KCM, psock->strp should not be
touched, particularly strp->work state, so we need to move strp_init()
after the csk->sk_user_data check.

This also makes a lockdep warning reported by syzbot go away.

Reported-and-tested-by: syzbot+9fc084a4348493ef65d2@syzkaller.appspotmail.com
Reported-by: syzbot+e696806ef96cdd2d87cd@syzkaller.appspotmail.com
Fixes: e5571240236c ("kcm: Check if sk_user_data already set in kcm_attach")
Fixes: dff8baa26117 ("kcm: Call strp_stop before strp_done in kcm_attach")
Cc: Tom Herbert <tom@herbertland.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Link: https://lore.kernel.org/r/20220827181314.193710-1-xiyou.wangcong@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/kcm/kcmsock.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 71899e5a5a11..1215c863e1c4 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1412,12 +1412,6 @@ static int kcm_attach(struct socket *sock, struct socket *csock,
 	psock->sk = csk;
 	psock->bpf_prog = prog;
 
-	err = strp_init(&psock->strp, csk, &cb);
-	if (err) {
-		kmem_cache_free(kcm_psockp, psock);
-		goto out;
-	}
-
 	write_lock_bh(&csk->sk_callback_lock);
 
 	/* Check if sk_user_data is already by KCM or someone else.
@@ -1425,13 +1419,18 @@ static int kcm_attach(struct socket *sock, struct socket *csock,
 	 */
 	if (csk->sk_user_data) {
 		write_unlock_bh(&csk->sk_callback_lock);
-		strp_stop(&psock->strp);
-		strp_done(&psock->strp);
 		kmem_cache_free(kcm_psockp, psock);
 		err = -EALREADY;
 		goto out;
 	}
 
+	err = strp_init(&psock->strp, csk, &cb);
+	if (err) {
+		write_unlock_bh(&csk->sk_callback_lock);
+		kmem_cache_free(kcm_psockp, psock);
+		goto out;
+	}
+
 	psock->save_data_ready = csk->sk_data_ready;
 	psock->save_write_space = csk->sk_write_space;
 	psock->save_state_change = csk->sk_state_change;

From 404a5ad72011f5bd2bb90f0a035be7635e2bd839 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 29 Aug 2022 16:54:14 -0700
Subject: [PATCH 48/56] Documentation: networking: correct possessive "its"

Change occurrences of "it's" that are possessive to "its"
so that they don't read as "it is".

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Jiri Pirko <jiri@nvidia.com>
Link: https://lore.kernel.org/r/20220829235414.17110-1-rdunlap@infradead.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/devlink/netdevsim.rst | 2 +-
 Documentation/networking/driver.rst            | 2 +-
 Documentation/networking/ipvlan.rst            | 2 +-
 Documentation/networking/l2tp.rst              | 2 +-
 Documentation/networking/switchdev.rst         | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/Documentation/networking/devlink/netdevsim.rst b/Documentation/networking/devlink/netdevsim.rst
index 8a292fb5aaea..ec5e6d79b2e2 100644
--- a/Documentation/networking/devlink/netdevsim.rst
+++ b/Documentation/networking/devlink/netdevsim.rst
@@ -67,7 +67,7 @@ The ``netdevsim`` driver supports rate objects management, which includes:
 - setting tx_share and tx_max rate values for any rate object type;
 - setting parent node for any rate object type.
 
-Rate nodes and it's parameters are exposed in ``netdevsim`` debugfs in RO mode.
+Rate nodes and their parameters are exposed in ``netdevsim`` debugfs in RO mode.
 For example created rate node with name ``some_group``:
 
 .. code:: shell
diff --git a/Documentation/networking/driver.rst b/Documentation/networking/driver.rst
index c8f59dbda46f..64f7236ff10b 100644
--- a/Documentation/networking/driver.rst
+++ b/Documentation/networking/driver.rst
@@ -8,7 +8,7 @@ Transmit path guidelines:
 
 1) The ndo_start_xmit method must not return NETDEV_TX_BUSY under
    any normal circumstances.  It is considered a hard error unless
-   there is no way your device can tell ahead of time when it's
+   there is no way your device can tell ahead of time when its
    transmit function will become busy.
 
    Instead it must maintain the queue properly.  For example,
diff --git a/Documentation/networking/ipvlan.rst b/Documentation/networking/ipvlan.rst
index 694adcba36b0..0000c1d383bc 100644
--- a/Documentation/networking/ipvlan.rst
+++ b/Documentation/networking/ipvlan.rst
@@ -11,7 +11,7 @@ Initial Release:
 ================
 This is conceptually very similar to the macvlan driver with one major
 exception of using L3 for mux-ing /demux-ing among slaves. This property makes
-the master device share the L2 with it's slave devices. I have developed this
+the master device share the L2 with its slave devices. I have developed this
 driver in conjunction with network namespaces and not sure if there is use case
 outside of it.
 
diff --git a/Documentation/networking/l2tp.rst b/Documentation/networking/l2tp.rst
index 498b382d25a0..7f383e99dbad 100644
--- a/Documentation/networking/l2tp.rst
+++ b/Documentation/networking/l2tp.rst
@@ -530,7 +530,7 @@ its tunnel close actions. For L2TPIP sockets, the socket's close
 handler initiates the same tunnel close actions. All sessions are
 first closed. Each session drops its tunnel ref. When the tunnel ref
 reaches zero, the tunnel puts its socket ref. When the socket is
-eventually destroyed, it's sk_destruct finally frees the L2TP tunnel
+eventually destroyed, its sk_destruct finally frees the L2TP tunnel
 context.
 
 Sessions
diff --git a/Documentation/networking/switchdev.rst b/Documentation/networking/switchdev.rst
index f1f4e6a85a29..bbf272e9d607 100644
--- a/Documentation/networking/switchdev.rst
+++ b/Documentation/networking/switchdev.rst
@@ -159,7 +159,7 @@ tools such as iproute2.
 
 The switchdev driver can know a particular port's position in the topology by
 monitoring NETDEV_CHANGEUPPER notifications.  For example, a port moved into a
-bond will see it's upper master change.  If that bond is moved into a bridge,
+bond will see its upper master change.  If that bond is moved into a bridge,
 the bond's upper master will change.  And so on.  The driver will track such
 movements to know what position a port is in in the overall topology by
 registering for netdevice events and acting on NETDEV_CHANGEUPPER.

From 5a3a59981027b53ec0f729ad76a43ce2b64ad968 Mon Sep 17 00:00:00 2001
From: Axel Rasmussen <axelrasmussen@google.com>
Date: Mon, 29 Aug 2022 11:47:48 -0700
Subject: [PATCH 49/56] selftests: net: sort .gitignore file

This is the result of `sort tools/testing/selftests/net/.gitignore`, but
preserving the comment at the top.

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Link: https://lore.kernel.org/r/20220829184748.1535580-1-axelrasmussen@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/.gitignore | 52 +++++++++++++-------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 0e5751af6247..de7d5cc15f85 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -1,42 +1,42 @@
 # SPDX-License-Identifier: GPL-2.0-only
+cmsg_sender
+fin_ack_lat
+gro
+hwtstamp_config
+ioam6_parser
+ip_defrag
 ipsec
+ipv6_flowlabel
+ipv6_flowlabel_mgr
 msg_zerocopy
-socket
+nettest
 psock_fanout
 psock_snd
 psock_tpacket
-stress_reuseport_listen
+reuseaddr_conflict
+reuseaddr_ports_exhausted
 reuseport_addr_any
 reuseport_bpf
 reuseport_bpf_cpu
 reuseport_bpf_numa
 reuseport_dualstack
-reuseaddr_conflict
+rxtimestamp
+socket
+so_netns_cookie
+so_txtime
+stress_reuseport_listen
+tap
+tcp_fastopen_backup_key
+tcp_inq
 tcp_mmap
+test_unix_oob
+timestamping
+tls
+toeplitz
+tun
+txring_overwrite
+txtimestamp
 udpgso
 udpgso_bench_rx
 udpgso_bench_tx
-tcp_inq
-tls
-txring_overwrite
-ip_defrag
-ipv6_flowlabel
-ipv6_flowlabel_mgr
-so_txtime
-tcp_fastopen_backup_key
-nettest
-fin_ack_lat
-reuseaddr_ports_exhausted
-hwtstamp_config
-rxtimestamp
-timestamping
-txtimestamp
-so_netns_cookie
-test_unix_oob
-gro
-ioam6_parser
-toeplitz
-tun
-cmsg_sender
 unix_connect
-tap
\ No newline at end of file

From 90fabae8a2c225c4e4936723c38857887edde5cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= <toke@toke.dk>
Date: Wed, 31 Aug 2022 11:21:03 +0200
Subject: [PATCH 50/56] sch_cake: Return __NET_XMIT_STOLEN when consuming
 enqueued skb
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the GSO splitting feature of sch_cake is enabled, GSO superpackets
will be broken up and the resulting segments enqueued in place of the
original skb. In this case, CAKE calls consume_skb() on the original skb,
but still returns NET_XMIT_SUCCESS. This can confuse parent qdiscs into
assuming the original skb still exists, when it really has been freed. Fix
this by adding the __NET_XMIT_STOLEN flag to the return value in this case.

Fixes: 0c850344d388 ("sch_cake: Conditionally split GSO segments")
Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-18231
Link: https://lore.kernel.org/r/20220831092103.442868-1-toke@toke.dk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/sched/sch_cake.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index a43a58a73d09..a04928082e4a 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -1713,6 +1713,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	}
 	idx--;
 	flow = &b->flows[idx];
+	ret = NET_XMIT_SUCCESS;
 
 	/* ensure shaper state isn't stale */
 	if (!b->tin_backlog) {
@@ -1771,6 +1772,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 
 		qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
 		consume_skb(skb);
+		ret |= __NET_XMIT_STOLEN;
 	} else {
 		/* not splitting */
 		cobalt_set_enqueue_time(skb, now);
@@ -1904,7 +1906,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		}
 		b->drop_overlimit += dropped;
 	}
-	return NET_XMIT_SUCCESS;
+	return ret;
 }
 
 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)

From eb55dc09b5dd040232d5de32812cc83001a23da6 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Mon, 29 Aug 2022 12:01:21 +0200
Subject: [PATCH 51/56] ip: fix triggering of 'icmp redirect'

__mkroute_input() uses fib_validate_source() to trigger an icmp redirect.
My understanding is that fib_validate_source() is used to know if the src
address and the gateway address are on the same link. For that,
fib_validate_source() returns 1 (same link) or 0 (not the same network).
__mkroute_input() is the only user of these positive values, all other
callers only look if the returned value is negative.

Since the below patch, fib_validate_source() didn't return anymore 1 when
both addresses are on the same network, because the route lookup returns
RT_SCOPE_LINK instead of RT_SCOPE_HOST. But this is, in fact, right.
Let's adapat the test to return 1 again when both addresses are on the same
link.

CC: stable@vger.kernel.org
Fixes: 747c14307214 ("ip: fix dflt addr selection for connected nexthop")
Reported-by: kernel test robot <yujie.liu@intel.com>
Reported-by: Heng Qi <hengqi@linux.alibaba.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20220829100121.3821-1-nicolas.dichtel@6wind.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/ipv4/fib_frontend.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index f361d3d56be2..943edf4ad4db 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -389,7 +389,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 	dev_match = dev_match || (res.type == RTN_LOCAL &&
 				  dev == net->loopback_dev);
 	if (dev_match) {
-		ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
+		ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_LINK;
 		return ret;
 	}
 	if (no_addr)
@@ -401,7 +401,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 	ret = 0;
 	if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
 		if (res.type == RTN_UNICAST)
-			ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
+			ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_LINK;
 	}
 	return ret;
 

From 52267ce25f60f37ae40ccbca0b21328ebae5ae75 Mon Sep 17 00:00:00 2001
From: Kurt Kanzenbach <kurt@linutronix.de>
Date: Tue, 30 Aug 2022 18:34:48 +0200
Subject: [PATCH 52/56] net: dsa: hellcreek: Print warning only once

In case the source port cannot be decoded, print the warning only once. This
still brings attention to the user and does not spam the logs at the same time.

Signed-off-by: Kurt Kanzenbach <kurt@linutronix.de>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Link: https://lore.kernel.org/r/20220830163448.8921-1-kurt@linutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/dsa/tag_hellcreek.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c
index eb204ad36eee..846588c0070a 100644
--- a/net/dsa/tag_hellcreek.c
+++ b/net/dsa/tag_hellcreek.c
@@ -45,7 +45,7 @@ static struct sk_buff *hellcreek_rcv(struct sk_buff *skb,
 
 	skb->dev = dsa_master_find_slave(dev, 0, port);
 	if (!skb->dev) {
-		netdev_warn(dev, "Failed to get source port: %d\n", port);
+		netdev_warn_once(dev, "Failed to get source port: %d\n", port);
 		return NULL;
 	}
 

From 8c70521238b7863c2af607e20bcba20f974c969b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 30 Aug 2022 11:56:55 -0700
Subject: [PATCH 53/56] tcp: annotate data-race around challenge_timestamp

challenge_timestamp can be read an written by concurrent threads.

This was expected, but we need to annotate the race to avoid potential issues.

Following patch moves challenge_timestamp and challenge_count
to per-netns storage to provide better isolation.

Fixes: 354e4aa391ed ("tcp: RFC 5961 5.2 Blind Data Injection Attack Mitigation")
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/ipv4/tcp_input.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ab5f0ea166f1..c184e15397a2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3629,11 +3629,11 @@ static void tcp_send_challenge_ack(struct sock *sk)
 
 	/* Then check host-wide RFC 5961 rate limit. */
 	now = jiffies / HZ;
-	if (now != challenge_timestamp) {
+	if (now != READ_ONCE(challenge_timestamp)) {
 		u32 ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit);
 		u32 half = (ack_limit + 1) >> 1;
 
-		challenge_timestamp = now;
+		WRITE_ONCE(challenge_timestamp, now);
 		WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit));
 	}
 	count = READ_ONCE(challenge_count);

From 79e3602caa6f9d59c4f66a268407080496dae408 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 30 Aug 2022 11:56:56 -0700
Subject: [PATCH 54/56] tcp: make global challenge ack rate limitation per
 net-ns and default disabled

Because per host rate limiting has been proven problematic (side channel
attacks can be based on it), per host rate limiting of challenge acks ideally
should be per netns and turned off by default.

This is a long due followup of following commits:

083ae308280d ("tcp: enable per-socket rate limiting of all 'challenge acks'")
f2b2c582e824 ("tcp: mitigate ACK loops for connections as tcp_sock")
75ff39ccc1bd ("tcp: make challenge acks less predictable")

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Jason Baron <jbaron@akamai.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/ip-sysctl.rst |  5 ++++-
 include/net/netns/ipv4.h               |  2 ++
 net/ipv4/tcp_input.c                   | 21 +++++++++++----------
 net/ipv4/tcp_ipv4.c                    |  6 ++++--
 4 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 56cd4ea059b2..a759872a2883 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -1035,7 +1035,10 @@ tcp_limit_output_bytes - INTEGER
 tcp_challenge_ack_limit - INTEGER
 	Limits number of Challenge ACK sent per second, as recommended
 	in RFC 5961 (Improving TCP's Robustness to Blind In-Window Attacks)
-	Default: 1000
+	Note that this per netns rate limit can allow some side channel
+	attacks and probably should not be enabled.
+	TCP stack implements per TCP socket limits anyway.
+	Default: INT_MAX (unlimited)
 
 UDP variables
 =============
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index c7320ef356d9..6320a76cefdc 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -179,6 +179,8 @@ struct netns_ipv4 {
 	unsigned int sysctl_tcp_fastopen_blackhole_timeout;
 	atomic_t tfo_active_disable_times;
 	unsigned long tfo_active_disable_stamp;
+	u32 tcp_challenge_timestamp;
+	u32 tcp_challenge_count;
 
 	int sysctl_udp_wmem_min;
 	int sysctl_udp_rmem_min;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c184e15397a2..b85a9f755da4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3614,12 +3614,9 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
 /* RFC 5961 7 [ACK Throttling] */
 static void tcp_send_challenge_ack(struct sock *sk)
 {
-	/* unprotected vars, we dont care of overwrites */
-	static u32 challenge_timestamp;
-	static unsigned int challenge_count;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct net *net = sock_net(sk);
-	u32 count, now;
+	u32 count, now, ack_limit;
 
 	/* First check our per-socket dupack rate limit. */
 	if (__tcp_oow_rate_limited(net,
@@ -3627,18 +3624,22 @@ static void tcp_send_challenge_ack(struct sock *sk)
 				   &tp->last_oow_ack_time))
 		return;
 
+	ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit);
+	if (ack_limit == INT_MAX)
+		goto send_ack;
+
 	/* Then check host-wide RFC 5961 rate limit. */
 	now = jiffies / HZ;
-	if (now != READ_ONCE(challenge_timestamp)) {
-		u32 ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit);
+	if (now != READ_ONCE(net->ipv4.tcp_challenge_timestamp)) {
 		u32 half = (ack_limit + 1) >> 1;
 
-		WRITE_ONCE(challenge_timestamp, now);
-		WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit));
+		WRITE_ONCE(net->ipv4.tcp_challenge_timestamp, now);
+		WRITE_ONCE(net->ipv4.tcp_challenge_count, half + prandom_u32_max(ack_limit));
 	}
-	count = READ_ONCE(challenge_count);
+	count = READ_ONCE(net->ipv4.tcp_challenge_count);
 	if (count > 0) {
-		WRITE_ONCE(challenge_count, count - 1);
+		WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1);
+send_ack:
 		NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
 		tcp_send_ack(sk);
 	}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0c83780dc9bf..5b019ba2b9d2 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3139,8 +3139,10 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
 	/* Default TSQ limit of 16 TSO segments */
 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
-	/* rfc5961 challenge ack rate limiting */
-	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
+
+	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
+	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
+
 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;

From 0b4f688d53fdc2a731b9d9cdf0c96255bc024ea6 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 31 Aug 2022 20:01:32 -0700
Subject: [PATCH 55/56] Revert "sch_cake: Return __NET_XMIT_STOLEN when
 consuming enqueued skb"

This reverts commit 90fabae8a2c225c4e4936723c38857887edde5cc.

Patch was applied hastily, revert and let the v2 be reviewed.

Fixes: 90fabae8a2c2 ("sch_cake: Return __NET_XMIT_STOLEN when consuming enqueued skb")
Link: https://lore.kernel.org/all/87wnao2ha3.fsf@toke.dk/
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/sched/sch_cake.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index a04928082e4a..a43a58a73d09 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -1713,7 +1713,6 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	}
 	idx--;
 	flow = &b->flows[idx];
-	ret = NET_XMIT_SUCCESS;
 
 	/* ensure shaper state isn't stale */
 	if (!b->tin_backlog) {
@@ -1772,7 +1771,6 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 
 		qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
 		consume_skb(skb);
-		ret |= __NET_XMIT_STOLEN;
 	} else {
 		/* not splitting */
 		cobalt_set_enqueue_time(skb, now);
@@ -1906,7 +1904,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		}
 		b->drop_overlimit += dropped;
 	}
-	return ret;
+	return NET_XMIT_SUCCESS;
 }
 
 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)

From a8424a9b4522a3ab9f32175ad6d848739079071f Mon Sep 17 00:00:00 2001
From: Yacan Liu <liuyacan@corp.netease.com>
Date: Tue, 30 Aug 2022 23:23:14 +0800
Subject: [PATCH 56/56] net/smc: Remove redundant refcount increase

For passive connections, the refcount increment has been done in
smc_clcsock_accept()-->smc_sock_alloc().

Fixes: 3b2dec2603d5 ("net/smc: restructure client and server code in af_smc")
Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Link: https://lore.kernel.org/r/20220830152314.838736-1-liuyacan@corp.netease.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/smc/af_smc.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 79c1318af1fe..0939cc3b915a 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1855,7 +1855,6 @@ static void smc_listen_out_connected(struct smc_sock *new_smc)
 {
 	struct sock *newsmcsk = &new_smc->sk;
 
-	sk_refcnt_debug_inc(newsmcsk);
 	if (newsmcsk->sk_state == SMC_INIT)
 		newsmcsk->sk_state = SMC_ACTIVE;