From d61615c366a489646a1bfe5b33455f916762d5f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Wed, 8 Feb 2023 10:16:37 +0100 Subject: [PATCH 01/35] net: bgmac: fix BCM5358 support by setting correct flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Code blocks handling BCMA_CHIP_ID_BCM5357 and BCMA_CHIP_ID_BCM53572 were incorrectly unified. Chip package values are not unique and cannot be checked independently. They are meaningful only in a context of a given chip. Packages BCM5358 and BCM47188 share the same value but then belong to different chips. Code unification resulted in treating BCM5358 as BCM47188 and broke its initialization. Link: https://github.com/openwrt/openwrt/issues/8278 Fixes: cb1b0f90acfe ("net: ethernet: bgmac: unify code of the same family") Cc: Jon Mason Signed-off-by: Rafał Miłecki Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20230208091637.16291-1-zajec5@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bgmac-bcma.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bgmac-bcma.c b/drivers/net/ethernet/broadcom/bgmac-bcma.c index 02bd3cf9a260..6e4f36aaf5db 100644 --- a/drivers/net/ethernet/broadcom/bgmac-bcma.c +++ b/drivers/net/ethernet/broadcom/bgmac-bcma.c @@ -240,12 +240,12 @@ static int bgmac_probe(struct bcma_device *core) bgmac->feature_flags |= BGMAC_FEAT_CLKCTLST; bgmac->feature_flags |= BGMAC_FEAT_FLW_CTRL1; bgmac->feature_flags |= BGMAC_FEAT_SW_TYPE_PHY; - if (ci->pkg == BCMA_PKG_ID_BCM47188 || - ci->pkg == BCMA_PKG_ID_BCM47186) { + if ((ci->id == BCMA_CHIP_ID_BCM5357 && ci->pkg == BCMA_PKG_ID_BCM47186) || + (ci->id == BCMA_CHIP_ID_BCM53572 && ci->pkg == BCMA_PKG_ID_BCM47188)) { bgmac->feature_flags |= BGMAC_FEAT_SW_TYPE_RGMII; bgmac->feature_flags |= BGMAC_FEAT_IOST_ATTACHED; } - if (ci->pkg == BCMA_PKG_ID_BCM5358) + if (ci->id == BCMA_CHIP_ID_BCM5357 && ci->pkg == BCMA_PKG_ID_BCM5358) bgmac->feature_flags |= BGMAC_FEAT_SW_TYPE_EPHYRMII; break; case BCMA_CHIP_ID_BCM53573: From 7a13a2eef645f2d2e3018d6ea518f121b35a87c8 Mon Sep 17 00:00:00 2001 From: Yinjun Zhang Date: Wed, 8 Feb 2023 11:22:57 +0100 Subject: [PATCH 02/35] nfp: fix incorrect use of mbox in IPsec code The mailbox configuration mechanism requires writing several registers, which shouldn't be interrupted, so need lock to avoid race condition. The base offset of mailbox configuration registers is not fixed, it depends on TLV caps read from application firmware. Fixes: 859a497fe80c ("nfp: implement xfrm callbacks and expose ipsec offload feature to upper layer") Signed-off-by: Yinjun Zhang Signed-off-by: Simon Horman Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/crypto/ipsec.c | 15 ++++++++++++--- drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h | 1 - 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c b/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c index 4632268695cb..6d9d1c89ae6a 100644 --- a/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c +++ b/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c @@ -132,23 +132,32 @@ struct nfp_ipsec_cfg_mssg { static int nfp_ipsec_cfg_cmd_issue(struct nfp_net *nn, int type, int saidx, struct nfp_ipsec_cfg_mssg *msg) { + unsigned int offset = nn->tlv_caps.mbox_off + NFP_NET_CFG_MBOX_SIMPLE_VAL; int i, msg_size, ret; + ret = nfp_net_mbox_lock(nn, sizeof(*msg)); + if (ret) + return ret; + msg->cmd = type; msg->sa_idx = saidx; msg->rsp = 0; msg_size = ARRAY_SIZE(msg->raw); for (i = 0; i < msg_size; i++) - nn_writel(nn, NFP_NET_CFG_MBOX_VAL + 4 * i, msg->raw[i]); + nn_writel(nn, offset + 4 * i, msg->raw[i]); ret = nfp_net_mbox_reconfig(nn, NFP_NET_CFG_MBOX_CMD_IPSEC); - if (ret < 0) + if (ret < 0) { + nn_ctrl_bar_unlock(nn); return ret; + } /* For now we always read the whole message response back */ for (i = 0; i < msg_size; i++) - msg->raw[i] = nn_readl(nn, NFP_NET_CFG_MBOX_VAL + 4 * i); + msg->raw[i] = nn_readl(nn, offset + 4 * i); + + nn_ctrl_bar_unlock(nn); switch (msg->rsp) { case NFP_IPSEC_CFG_MSSG_OK: diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h index 51124309ae1f..f03dcadff738 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h @@ -403,7 +403,6 @@ */ #define NFP_NET_CFG_MBOX_BASE 0x1800 #define NFP_NET_CFG_MBOX_VAL_MAX_SZ 0x1F8 -#define NFP_NET_CFG_MBOX_VAL 0x1808 #define NFP_NET_CFG_MBOX_SIMPLE_CMD 0x0 #define NFP_NET_CFG_MBOX_SIMPLE_RET 0x4 #define NFP_NET_CFG_MBOX_SIMPLE_VAL 0x8 From 71f814cda659dca3db575ed67dfcdc4b93e8d33f Mon Sep 17 00:00:00 2001 From: Yinjun Zhang Date: Wed, 8 Feb 2023 11:22:58 +0100 Subject: [PATCH 03/35] nfp: fix schedule in atomic context when offloading sa IPsec offloading callbacks may be called in atomic context, sleep is not allowed in the implementation. Now use workqueue mechanism to avoid this issue. Extend existing workqueue mechanism for multicast configuration only to universal use, so that all configuring through mailbox asynchronously can utilize it. Fixes: 859a497fe80c ("nfp: implement xfrm callbacks and expose ipsec offload feature to upper layer") Signed-off-by: Yinjun Zhang Signed-off-by: Simon Horman Signed-off-by: Jakub Kicinski --- .../net/ethernet/netronome/nfp/crypto/ipsec.c | 24 ++-- drivers/net/ethernet/netronome/nfp/nfp_net.h | 25 +++- .../ethernet/netronome/nfp/nfp_net_common.c | 110 +++++++++--------- 3 files changed, 88 insertions(+), 71 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c b/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c index 6d9d1c89ae6a..063cd371033a 100644 --- a/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c +++ b/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c @@ -129,25 +129,21 @@ struct nfp_ipsec_cfg_mssg { }; }; -static int nfp_ipsec_cfg_cmd_issue(struct nfp_net *nn, int type, int saidx, - struct nfp_ipsec_cfg_mssg *msg) +static int nfp_net_ipsec_cfg(struct nfp_net *nn, struct nfp_mbox_amsg_entry *entry) { unsigned int offset = nn->tlv_caps.mbox_off + NFP_NET_CFG_MBOX_SIMPLE_VAL; + struct nfp_ipsec_cfg_mssg *msg = (struct nfp_ipsec_cfg_mssg *)entry->msg; int i, msg_size, ret; ret = nfp_net_mbox_lock(nn, sizeof(*msg)); if (ret) return ret; - msg->cmd = type; - msg->sa_idx = saidx; - msg->rsp = 0; msg_size = ARRAY_SIZE(msg->raw); - for (i = 0; i < msg_size; i++) nn_writel(nn, offset + 4 * i, msg->raw[i]); - ret = nfp_net_mbox_reconfig(nn, NFP_NET_CFG_MBOX_CMD_IPSEC); + ret = nfp_net_mbox_reconfig(nn, entry->cmd); if (ret < 0) { nn_ctrl_bar_unlock(nn); return ret; @@ -486,7 +482,10 @@ static int nfp_net_xfrm_add_state(struct xfrm_state *x) } /* Allocate saidx and commit the SA */ - err = nfp_ipsec_cfg_cmd_issue(nn, NFP_IPSEC_CFG_MSSG_ADD_SA, saidx, &msg); + msg.cmd = NFP_IPSEC_CFG_MSSG_ADD_SA; + msg.sa_idx = saidx; + err = nfp_net_sched_mbox_amsg_work(nn, NFP_NET_CFG_MBOX_CMD_IPSEC, &msg, + sizeof(msg), nfp_net_ipsec_cfg); if (err) { xa_erase(&nn->xa_ipsec, saidx); nn_err(nn, "Failed to issue IPsec command err ret=%d\n", err); @@ -500,14 +499,17 @@ static int nfp_net_xfrm_add_state(struct xfrm_state *x) static void nfp_net_xfrm_del_state(struct xfrm_state *x) { + struct nfp_ipsec_cfg_mssg msg = { + .cmd = NFP_IPSEC_CFG_MSSG_INV_SA, + .sa_idx = x->xso.offload_handle - 1, + }; struct net_device *netdev = x->xso.dev; - struct nfp_ipsec_cfg_mssg msg; struct nfp_net *nn; int err; nn = netdev_priv(netdev); - err = nfp_ipsec_cfg_cmd_issue(nn, NFP_IPSEC_CFG_MSSG_INV_SA, - x->xso.offload_handle - 1, &msg); + err = nfp_net_sched_mbox_amsg_work(nn, NFP_NET_CFG_MBOX_CMD_IPSEC, &msg, + sizeof(msg), nfp_net_ipsec_cfg); if (err) nn_warn(nn, "Failed to invalidate SA in hardware\n"); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h index 432d79d691c2..939cfce15830 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h @@ -617,9 +617,10 @@ struct nfp_net_dp { * @vnic_no_name: For non-port PF vNIC make ndo_get_phys_port_name return * -EOPNOTSUPP to keep backwards compatibility (set by app) * @port: Pointer to nfp_port structure if vNIC is a port - * @mc_lock: Protect mc_addrs list - * @mc_addrs: List of mc addrs to add/del to HW - * @mc_work: Work to update mc addrs + * @mbox_amsg: Asynchronously processed message via mailbox + * @mbox_amsg.lock: Protect message list + * @mbox_amsg.list: List of message to process + * @mbox_amsg.work: Work to process message asynchronously * @app_priv: APP private data for this vNIC */ struct nfp_net { @@ -721,13 +722,25 @@ struct nfp_net { struct nfp_port *port; - spinlock_t mc_lock; - struct list_head mc_addrs; - struct work_struct mc_work; + struct { + spinlock_t lock; + struct list_head list; + struct work_struct work; + } mbox_amsg; void *app_priv; }; +struct nfp_mbox_amsg_entry { + struct list_head list; + int (*cfg)(struct nfp_net *nn, struct nfp_mbox_amsg_entry *entry); + u32 cmd; + char msg[]; +}; + +int nfp_net_sched_mbox_amsg_work(struct nfp_net *nn, u32 cmd, const void *data, size_t len, + int (*cb)(struct nfp_net *, struct nfp_mbox_amsg_entry *)); + /* Functions to read/write from/to a BAR * Performs any endian conversion necessary. */ diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 18fc9971f1c8..70d7484c82af 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1334,14 +1334,54 @@ err_unlock: return err; } -struct nfp_mc_addr_entry { - u8 addr[ETH_ALEN]; - u32 cmd; - struct list_head list; -}; - -static int nfp_net_mc_cfg(struct nfp_net *nn, const unsigned char *addr, const u32 cmd) +int nfp_net_sched_mbox_amsg_work(struct nfp_net *nn, u32 cmd, const void *data, size_t len, + int (*cb)(struct nfp_net *, struct nfp_mbox_amsg_entry *)) { + struct nfp_mbox_amsg_entry *entry; + + entry = kmalloc(sizeof(*entry) + len, GFP_ATOMIC); + if (!entry) + return -ENOMEM; + + memcpy(entry->msg, data, len); + entry->cmd = cmd; + entry->cfg = cb; + + spin_lock_bh(&nn->mbox_amsg.lock); + list_add_tail(&entry->list, &nn->mbox_amsg.list); + spin_unlock_bh(&nn->mbox_amsg.lock); + + schedule_work(&nn->mbox_amsg.work); + + return 0; +} + +static void nfp_net_mbox_amsg_work(struct work_struct *work) +{ + struct nfp_net *nn = container_of(work, struct nfp_net, mbox_amsg.work); + struct nfp_mbox_amsg_entry *entry, *tmp; + struct list_head tmp_list; + + INIT_LIST_HEAD(&tmp_list); + + spin_lock_bh(&nn->mbox_amsg.lock); + list_splice_init(&nn->mbox_amsg.list, &tmp_list); + spin_unlock_bh(&nn->mbox_amsg.lock); + + list_for_each_entry_safe(entry, tmp, &tmp_list, list) { + int err = entry->cfg(nn, entry); + + if (err) + nn_err(nn, "Config cmd %d to HW failed %d.\n", entry->cmd, err); + + list_del(&entry->list); + kfree(entry); + } +} + +static int nfp_net_mc_cfg(struct nfp_net *nn, struct nfp_mbox_amsg_entry *entry) +{ + unsigned char *addr = entry->msg; int ret; ret = nfp_net_mbox_lock(nn, NFP_NET_CFG_MULTICAST_SZ); @@ -1353,26 +1393,7 @@ static int nfp_net_mc_cfg(struct nfp_net *nn, const unsigned char *addr, const u nn_writew(nn, nn->tlv_caps.mbox_off + NFP_NET_CFG_MULTICAST_MAC_LO, get_unaligned_be16(addr + 4)); - return nfp_net_mbox_reconfig_and_unlock(nn, cmd); -} - -static int nfp_net_mc_prep(struct nfp_net *nn, const unsigned char *addr, const u32 cmd) -{ - struct nfp_mc_addr_entry *entry; - - entry = kmalloc(sizeof(*entry), GFP_ATOMIC); - if (!entry) - return -ENOMEM; - - ether_addr_copy(entry->addr, addr); - entry->cmd = cmd; - spin_lock_bh(&nn->mc_lock); - list_add_tail(&entry->list, &nn->mc_addrs); - spin_unlock_bh(&nn->mc_lock); - - schedule_work(&nn->mc_work); - - return 0; + return nfp_net_mbox_reconfig_and_unlock(nn, entry->cmd); } static int nfp_net_mc_sync(struct net_device *netdev, const unsigned char *addr) @@ -1385,35 +1406,16 @@ static int nfp_net_mc_sync(struct net_device *netdev, const unsigned char *addr) return -EINVAL; } - return nfp_net_mc_prep(nn, addr, NFP_NET_CFG_MBOX_CMD_MULTICAST_ADD); + return nfp_net_sched_mbox_amsg_work(nn, NFP_NET_CFG_MBOX_CMD_MULTICAST_ADD, addr, + NFP_NET_CFG_MULTICAST_SZ, nfp_net_mc_cfg); } static int nfp_net_mc_unsync(struct net_device *netdev, const unsigned char *addr) { struct nfp_net *nn = netdev_priv(netdev); - return nfp_net_mc_prep(nn, addr, NFP_NET_CFG_MBOX_CMD_MULTICAST_DEL); -} - -static void nfp_net_mc_addr_config(struct work_struct *work) -{ - struct nfp_net *nn = container_of(work, struct nfp_net, mc_work); - struct nfp_mc_addr_entry *entry, *tmp; - struct list_head tmp_list; - - INIT_LIST_HEAD(&tmp_list); - - spin_lock_bh(&nn->mc_lock); - list_splice_init(&nn->mc_addrs, &tmp_list); - spin_unlock_bh(&nn->mc_lock); - - list_for_each_entry_safe(entry, tmp, &tmp_list, list) { - if (nfp_net_mc_cfg(nn, entry->addr, entry->cmd)) - nn_err(nn, "Config mc address to HW failed.\n"); - - list_del(&entry->list); - kfree(entry); - } + return nfp_net_sched_mbox_amsg_work(nn, NFP_NET_CFG_MBOX_CMD_MULTICAST_DEL, addr, + NFP_NET_CFG_MULTICAST_SZ, nfp_net_mc_cfg); } static void nfp_net_set_rx_mode(struct net_device *netdev) @@ -2681,9 +2683,9 @@ int nfp_net_init(struct nfp_net *nn) if (!nn->dp.netdev) return 0; - spin_lock_init(&nn->mc_lock); - INIT_LIST_HEAD(&nn->mc_addrs); - INIT_WORK(&nn->mc_work, nfp_net_mc_addr_config); + spin_lock_init(&nn->mbox_amsg.lock); + INIT_LIST_HEAD(&nn->mbox_amsg.list); + INIT_WORK(&nn->mbox_amsg.work, nfp_net_mbox_amsg_work); return register_netdev(nn->dp.netdev); @@ -2704,6 +2706,6 @@ void nfp_net_clean(struct nfp_net *nn) unregister_netdev(nn->dp.netdev); nfp_net_ipsec_clean(nn); nfp_ccm_mbox_clean(nn); - flush_work(&nn->mc_work); + flush_work(&nn->mbox_amsg.work); nfp_net_reconfig_wait_posted(nn); } From e010ae08c71fda8be3d6bda256837795a0b3ea41 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 8 Feb 2023 18:13:59 +0100 Subject: [PATCH 04/35] ipv6: Fix datagram socket connection with DSCP. Take into account the IPV6_TCLASS socket option (DSCP) in ip6_datagram_flow_key_init(). Otherwise fib6_rule_match() can't properly match the DSCP value, resulting in invalid route lookup. For example: ip route add unreachable table main 2001:db8::10/124 ip route add table 100 2001:db8::10/124 dev eth0 ip -6 rule add dsfield 0x04 table 100 echo test | socat - UDP6:[2001:db8::11]:54321,ipv6-tclass=0x04 Without this patch, socat fails at connect() time ("No route to host") because the fib-rule doesn't jump to table 100 and the lookup ends up being done in the main table. Fixes: 2cc67cc731d9 ("[IPV6] ROUTE: Routing by Traffic Class.") Signed-off-by: Guillaume Nault Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- net/ipv6/datagram.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index e624497fa992..9b6818453afe 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -51,7 +51,7 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk) fl6->flowi6_mark = sk->sk_mark; fl6->fl6_dport = inet->inet_dport; fl6->fl6_sport = inet->inet_sport; - fl6->flowlabel = np->flow_label; + fl6->flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); fl6->flowi6_uid = sk->sk_uid; if (!oif) From 8230680f36fd1525303d1117768c8852314c488c Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 8 Feb 2023 18:14:03 +0100 Subject: [PATCH 05/35] ipv6: Fix tcp socket connection with DSCP. Take into account the IPV6_TCLASS socket option (DSCP) in tcp_v6_connect(). Otherwise fib6_rule_match() can't properly match the DSCP value, resulting in invalid route lookup. For example: ip route add unreachable table main 2001:db8::10/124 ip route add table 100 2001:db8::10/124 dev eth0 ip -6 rule add dsfield 0x04 table 100 echo test | socat - TCP6:[2001:db8::11]:54321,ipv6-tclass=0x04 Without this patch, socat fails at connect() time ("No route to host") because the fib-rule doesn't jump to table 100 and the lookup ends up being done in the main table. Fixes: 2cc67cc731d9 ("[IPV6] ROUTE: Routing by Traffic Class.") Signed-off-by: Guillaume Nault Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- net/ipv6/tcp_ipv6.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 11b736a76bd7..0d25e813288d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -272,6 +272,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.flowi6_proto = IPPROTO_TCP; fl6.daddr = sk->sk_v6_daddr; fl6.saddr = saddr ? *saddr : np->saddr; + fl6.flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = usin->sin6_port; From c21a20d9d102ab809a992a6b056abbec4cd4c1cd Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 8 Feb 2023 18:14:07 +0100 Subject: [PATCH 06/35] selftests: fib_rule_tests: Test UDP and TCP connections with DSCP rules. Add the fib_rule6_send and fib_rule4_send tests to verify that DSCP values are properly taken into account when UDP or TCP sockets try to connect(). Tests are done with nettest, which needs a new option to specify the DS Field value of the socket being tested. This new option is named '-Q', in reference to the similar option used by ping. Signed-off-by: Guillaume Nault Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_rule_tests.sh | 128 +++++++++++++++++- tools/testing/selftests/net/nettest.c | 51 ++++++- 2 files changed, 177 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index c245476fa29d..63c3eaec8d30 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -10,8 +10,10 @@ ret=0 PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no} IP="ip -netns testns" +IP_PEER="ip -netns peerns" RTABLE=100 +RTABLE_PEER=101 GW_IP4=192.51.100.2 SRC_IP=192.51.100.3 GW_IP6=2001:db8:1::2 @@ -20,7 +22,9 @@ SRC_IP6=2001:db8:1::3 DEV_ADDR=192.51.100.1 DEV_ADDR6=2001:db8:1::1 DEV=dummy0 -TESTS="fib_rule6 fib_rule4" +TESTS="fib_rule6 fib_rule4 fib_rule6_connect fib_rule4_connect" + +SELFTEST_PATH="" log_test() { @@ -52,6 +56,31 @@ log_section() echo "######################################################################" } +check_nettest() +{ + if which nettest > /dev/null 2>&1; then + return 0 + fi + + # Add the selftest directory to PATH if not already done + if [ "${SELFTEST_PATH}" = "" ]; then + SELFTEST_PATH="$(dirname $0)" + PATH="${PATH}:${SELFTEST_PATH}" + + # Now retry with the new path + if which nettest > /dev/null 2>&1; then + return 0 + fi + + if [ "${ret}" -eq 0 ]; then + ret="${ksft_skip}" + fi + echo "nettest not found (try 'make -C ${SELFTEST_PATH} nettest')" + fi + + return 1 +} + setup() { set -e @@ -72,6 +101,39 @@ cleanup() ip netns del testns } +setup_peer() +{ + set -e + + ip netns add peerns + $IP_PEER link set dev lo up + + ip link add name veth0 netns testns type veth \ + peer name veth1 netns peerns + $IP link set dev veth0 up + $IP_PEER link set dev veth1 up + + $IP address add 192.0.2.10 peer 192.0.2.11/32 dev veth0 + $IP_PEER address add 192.0.2.11 peer 192.0.2.10/32 dev veth1 + + $IP address add 2001:db8::10 peer 2001:db8::11/128 dev veth0 nodad + $IP_PEER address add 2001:db8::11 peer 2001:db8::10/128 dev veth1 nodad + + $IP_PEER address add 198.51.100.11/32 dev lo + $IP route add table $RTABLE_PEER 198.51.100.11/32 via 192.0.2.11 + + $IP_PEER address add 2001:db8::1:11/128 dev lo + $IP route add table $RTABLE_PEER 2001:db8::1:11/128 via 2001:db8::11 + + set +e +} + +cleanup_peer() +{ + $IP link del dev veth0 + ip netns del peerns +} + fib_check_iproute_support() { ip rule help 2>&1 | grep -q $1 @@ -190,6 +252,37 @@ fib_rule6_test() fi } +# Verify that the IPV6_TCLASS option of UDPv6 and TCPv6 sockets is properly +# taken into account when connecting the socket and when sending packets. +fib_rule6_connect_test() +{ + local dsfield + + if ! check_nettest; then + echo "SKIP: Could not run test without nettest tool" + return + fi + + setup_peer + $IP -6 rule add dsfield 0x04 table $RTABLE_PEER + + # Combine the base DS Field value (0x04) with all possible ECN values + # (Not-ECT: 0, ECT(1): 1, ECT(0): 2, CE: 3). + # The ECN bits shouldn't influence the result of the test. + for dsfield in 0x04 0x05 0x06 0x07; do + nettest -q -6 -B -t 5 -N testns -O peerns -U -D \ + -Q "${dsfield}" -l 2001:db8::1:11 -r 2001:db8::1:11 + log_test $? 0 "rule6 dsfield udp connect (dsfield ${dsfield})" + + nettest -q -6 -B -t 5 -N testns -O peerns -Q "${dsfield}" \ + -l 2001:db8::1:11 -r 2001:db8::1:11 + log_test $? 0 "rule6 dsfield tcp connect (dsfield ${dsfield})" + done + + $IP -6 rule del dsfield 0x04 table $RTABLE_PEER + cleanup_peer +} + fib_rule4_del() { $IP rule del $1 @@ -296,6 +389,37 @@ fib_rule4_test() fi } +# Verify that the IP_TOS option of UDPv4 and TCPv4 sockets is properly taken +# into account when connecting the socket and when sending packets. +fib_rule4_connect_test() +{ + local dsfield + + if ! check_nettest; then + echo "SKIP: Could not run test without nettest tool" + return + fi + + setup_peer + $IP -4 rule add dsfield 0x04 table $RTABLE_PEER + + # Combine the base DS Field value (0x04) with all possible ECN values + # (Not-ECT: 0, ECT(1): 1, ECT(0): 2, CE: 3). + # The ECN bits shouldn't influence the result of the test. + for dsfield in 0x04 0x05 0x06 0x07; do + nettest -q -B -t 5 -N testns -O peerns -D -U -Q "${dsfield}" \ + -l 198.51.100.11 -r 198.51.100.11 + log_test $? 0 "rule4 dsfield udp connect (dsfield ${dsfield})" + + nettest -q -B -t 5 -N testns -O peerns -Q "${dsfield}" \ + -l 198.51.100.11 -r 198.51.100.11 + log_test $? 0 "rule4 dsfield tcp connect (dsfield ${dsfield})" + done + + $IP -4 rule del dsfield 0x04 table $RTABLE_PEER + cleanup_peer +} + run_fibrule_tests() { log_section "IPv4 fib rule" @@ -345,6 +469,8 @@ do case $t in fib_rule6_test|fib_rule6) fib_rule6_test;; fib_rule4_test|fib_rule4) fib_rule4_test;; + fib_rule6_connect_test|fib_rule6_connect) fib_rule6_connect_test;; + fib_rule4_connect_test|fib_rule4_connect) fib_rule4_connect_test;; help) echo "Test names: $TESTS"; exit 0;; diff --git a/tools/testing/selftests/net/nettest.c b/tools/testing/selftests/net/nettest.c index 7900fa98eccb..ee9a72982705 100644 --- a/tools/testing/selftests/net/nettest.c +++ b/tools/testing/selftests/net/nettest.c @@ -87,6 +87,7 @@ struct sock_args { int use_setsockopt; int use_freebind; int use_cmsg; + uint8_t dsfield; const char *dev; const char *server_dev; int ifindex; @@ -580,6 +581,36 @@ static int set_reuseaddr(int sd) return rc; } +static int set_dsfield(int sd, int version, int dsfield) +{ + if (!dsfield) + return 0; + + switch (version) { + case AF_INET: + if (setsockopt(sd, SOL_IP, IP_TOS, &dsfield, + sizeof(dsfield)) < 0) { + log_err_errno("setsockopt(IP_TOS)"); + return -1; + } + break; + + case AF_INET6: + if (setsockopt(sd, SOL_IPV6, IPV6_TCLASS, &dsfield, + sizeof(dsfield)) < 0) { + log_err_errno("setsockopt(IPV6_TCLASS)"); + return -1; + } + break; + + default: + log_error("Invalid address family\n"); + return -1; + } + + return 0; +} + static int str_to_uint(const char *str, int min, int max, unsigned int *value) { int number; @@ -1317,6 +1348,9 @@ static int msock_init(struct sock_args *args, int server) (char *)&one, sizeof(one)) < 0) log_err_errno("Setting SO_BROADCAST error"); + if (set_dsfield(sd, AF_INET, args->dsfield) != 0) + goto out_err; + if (args->dev && bind_to_device(sd, args->dev) != 0) goto out_err; else if (args->use_setsockopt && @@ -1445,6 +1479,9 @@ static int lsock_init(struct sock_args *args) if (set_reuseport(sd) != 0) goto err; + if (set_dsfield(sd, args->version, args->dsfield) != 0) + goto err; + if (args->dev && bind_to_device(sd, args->dev) != 0) goto err; else if (args->use_setsockopt && @@ -1658,6 +1695,9 @@ static int connectsock(void *addr, socklen_t alen, struct sock_args *args) if (set_reuseport(sd) != 0) goto err; + if (set_dsfield(sd, args->version, args->dsfield) != 0) + goto err; + if (args->dev && bind_to_device(sd, args->dev) != 0) goto err; else if (args->use_setsockopt && @@ -1862,7 +1902,7 @@ static int ipc_parent(int cpid, int fd, struct sock_args *args) return client_status; } -#define GETOPT_STR "sr:l:c:p:t:g:P:DRn:M:X:m:d:I:BN:O:SUCi6xL:0:1:2:3:Fbqf" +#define GETOPT_STR "sr:l:c:Q:p:t:g:P:DRn:M:X:m:d:I:BN:O:SUCi6xL:0:1:2:3:Fbqf" #define OPT_FORCE_BIND_KEY_IFINDEX 1001 #define OPT_NO_BIND_KEY_IFINDEX 1002 @@ -1893,6 +1933,8 @@ static void print_usage(char *prog) " -D|R datagram (D) / raw (R) socket (default stream)\n" " -l addr local address to bind to in server mode\n" " -c addr local address to bind to in client mode\n" + " -Q dsfield DS Field value of the socket (the IP_TOS or\n" + " IPV6_TCLASS socket option)\n" " -x configure XFRM policy on socket\n" "\n" " -d dev bind socket to given device name\n" @@ -1971,6 +2013,13 @@ int main(int argc, char *argv[]) args.has_local_ip = 1; args.client_local_addr_str = optarg; break; + case 'Q': + if (str_to_uint(optarg, 0, 255, &tmp) != 0) { + fprintf(stderr, "Invalid DS Field\n"); + return 1; + } + args.dsfield = tmp; + break; case 'p': if (str_to_uint(optarg, 1, 65535, &tmp) != 0) { fprintf(stderr, "Invalid port\n"); From 6e77a5a4af05d5e7391c841a4a4f3e4cadf72c25 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Feb 2023 18:21:23 +0000 Subject: [PATCH 07/35] net: initialize net->notrefcnt_tracker earlier syzbot was able to trigger a warning [1] from net_free() calling ref_tracker_dir_exit(&net->notrefcnt_tracker) while the corresponding ref_tracker_dir_init() has not been done yet. copy_net_ns() can indeed bypass the call to setup_net() in some error conditions. Note: We might factorize/move more code in preinit_net() in the future. [1] INFO: trying to register non-static key. The code is fine but needs lockdep annotation, or maybe you didn't initialize this object before use? turning off the locking correctness validator. CPU: 0 PID: 5817 Comm: syz-executor.3 Not tainted 6.2.0-rc7-next-20230208-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/12/2023 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xd9/0x150 lib/dump_stack.c:106 assign_lock_key kernel/locking/lockdep.c:982 [inline] register_lock_class+0xdb6/0x1120 kernel/locking/lockdep.c:1295 __lock_acquire+0x10a/0x5df0 kernel/locking/lockdep.c:4951 lock_acquire.part.0+0x11c/0x370 kernel/locking/lockdep.c:5691 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0x3d/0x60 kernel/locking/spinlock.c:162 ref_tracker_dir_exit+0x52/0x600 lib/ref_tracker.c:24 net_free net/core/net_namespace.c:442 [inline] net_free+0x98/0xd0 net/core/net_namespace.c:436 copy_net_ns+0x4f3/0x6b0 net/core/net_namespace.c:493 create_new_namespaces+0x3f6/0xb20 kernel/nsproxy.c:110 unshare_nsproxy_namespaces+0xc1/0x1f0 kernel/nsproxy.c:228 ksys_unshare+0x449/0x920 kernel/fork.c:3205 __do_sys_unshare kernel/fork.c:3276 [inline] __se_sys_unshare kernel/fork.c:3274 [inline] __x64_sys_unshare+0x31/0x40 kernel/fork.c:3274 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80 Fixes: 0cafd77dcd03 ("net: add a refcount tracker for kernel sockets") Reported-by: syzbot Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230208182123.3821604-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/net_namespace.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 078a0a420c8a..7b69cf882b8e 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -304,6 +304,12 @@ struct net *get_net_ns_by_id(const struct net *net, int id) } EXPORT_SYMBOL_GPL(get_net_ns_by_id); +/* init code that must occur even if setup_net() is not called. */ +static __net_init void preinit_net(struct net *net) +{ + ref_tracker_dir_init(&net->notrefcnt_tracker, 128); +} + /* * setup_net runs the initializers for the network namespace object. */ @@ -316,7 +322,6 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) refcount_set(&net->ns.count, 1); ref_tracker_dir_init(&net->refcnt_tracker, 128); - ref_tracker_dir_init(&net->notrefcnt_tracker, 128); refcount_set(&net->passive, 1); get_random_bytes(&net->hash_mix, sizeof(u32)); @@ -472,6 +477,8 @@ struct net *copy_net_ns(unsigned long flags, rv = -ENOMEM; goto dec_ucounts; } + + preinit_net(net); refcount_set(&net->passive, 1); net->ucounts = ucounts; get_user_ns(user_ns); @@ -1118,6 +1125,7 @@ void __init net_ns_init(void) init_net.key_domain = &init_net_key_domain; #endif down_write(&pernet_ops_rwsem); + preinit_net(&init_net); if (setup_net(&init_net, &init_user_ns)) panic("Could not setup the initial network namespace"); From ec76d0c2da5c6dfb6a33f1545cc15997013923da Mon Sep 17 00:00:00 2001 From: Ronak Doshi Date: Wed, 8 Feb 2023 14:38:59 -0800 Subject: [PATCH 08/35] vmxnet3: move rss code block under eop descriptor Commit b3973bb40041 ("vmxnet3: set correct hash type based on rss information") added hashType information into skb. However, rssType field is populated for eop descriptor. This can lead to incorrectly reporting of hashType for packets which use multiple rx descriptors. Multiple rx descriptors are used for Jumbo frame or LRO packets, which can hit this issue. This patch moves the RSS codeblock under eop descritor. Cc: stable@vger.kernel.org Fixes: b3973bb40041 ("vmxnet3: set correct hash type based on rss information") Signed-off-by: Ronak Doshi Acked-by: Peng Li Acked-by: Guolin Yang Link: https://lore.kernel.org/r/20230208223900.5794-1-doshir@vmware.com Signed-off-by: Jakub Kicinski --- drivers/net/vmxnet3/vmxnet3_drv.c | 50 +++++++++++++++---------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 56267c327f0b..682987040ea8 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -1546,31 +1546,6 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq, rxd->len = rbi->len; } -#ifdef VMXNET3_RSS - if (rcd->rssType != VMXNET3_RCD_RSS_TYPE_NONE && - (adapter->netdev->features & NETIF_F_RXHASH)) { - enum pkt_hash_types hash_type; - - switch (rcd->rssType) { - case VMXNET3_RCD_RSS_TYPE_IPV4: - case VMXNET3_RCD_RSS_TYPE_IPV6: - hash_type = PKT_HASH_TYPE_L3; - break; - case VMXNET3_RCD_RSS_TYPE_TCPIPV4: - case VMXNET3_RCD_RSS_TYPE_TCPIPV6: - case VMXNET3_RCD_RSS_TYPE_UDPIPV4: - case VMXNET3_RCD_RSS_TYPE_UDPIPV6: - hash_type = PKT_HASH_TYPE_L4; - break; - default: - hash_type = PKT_HASH_TYPE_L3; - break; - } - skb_set_hash(ctx->skb, - le32_to_cpu(rcd->rssHash), - hash_type); - } -#endif skb_record_rx_queue(ctx->skb, rq->qid); skb_put(ctx->skb, rcd->len); @@ -1653,6 +1628,31 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq, u32 mtu = adapter->netdev->mtu; skb->len += skb->data_len; +#ifdef VMXNET3_RSS + if (rcd->rssType != VMXNET3_RCD_RSS_TYPE_NONE && + (adapter->netdev->features & NETIF_F_RXHASH)) { + enum pkt_hash_types hash_type; + + switch (rcd->rssType) { + case VMXNET3_RCD_RSS_TYPE_IPV4: + case VMXNET3_RCD_RSS_TYPE_IPV6: + hash_type = PKT_HASH_TYPE_L3; + break; + case VMXNET3_RCD_RSS_TYPE_TCPIPV4: + case VMXNET3_RCD_RSS_TYPE_TCPIPV6: + case VMXNET3_RCD_RSS_TYPE_UDPIPV4: + case VMXNET3_RCD_RSS_TYPE_UDPIPV6: + hash_type = PKT_HASH_TYPE_L4; + break; + default: + hash_type = PKT_HASH_TYPE_L3; + break; + } + skb_set_hash(skb, + le32_to_cpu(rcd->rssHash), + hash_type); + } +#endif vmxnet3_rx_csum(adapter, skb, (union Vmxnet3_GenericDesc *)rcd); skb->protocol = eth_type_trans(skb, adapter->netdev); From 0ed577e7e8e508c24e22ba07713ecc4903e147c3 Mon Sep 17 00:00:00 2001 From: Siddharth Vadapalli Date: Thu, 9 Feb 2023 14:14:32 +0530 Subject: [PATCH 09/35] net: ethernet: ti: am65-cpsw: Add RX DMA Channel Teardown Quirk In TI's AM62x/AM64x SoCs, successful teardown of RX DMA Channel raises an interrupt. The process of servicing this interrupt involves flushing all pending RX DMA descriptors and clearing the teardown completion marker (TDCM). The am65_cpsw_nuss_rx_packets() function invoked from the RX NAPI callback services the interrupt. Thus, it is necessary to wait for this handler to run, drain all packets and clear TDCM, before calling napi_disable() in am65_cpsw_nuss_common_stop() function post channel teardown. If napi_disable() executes before ensuring that TDCM is cleared, the TDCM remains set when the interfaces are down, resulting in an interrupt storm when the interfaces are brought up again. Since the interrupt raised to indicate the RX DMA Channel teardown is specific to the AM62x and AM64x SoCs, add a quirk for it. Fixes: 4f7cce272403 ("net: ethernet: ti: am65-cpsw: add support for am64x cpsw3g") Co-developed-by: Vignesh Raghavendra Signed-off-by: Vignesh Raghavendra Signed-off-by: Siddharth Vadapalli Reviewed-by: Roger Quadros Link: https://lore.kernel.org/r/20230209084432.189222-1-s-vadapalli@ti.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 12 +++++++++++- drivers/net/ethernet/ti/am65-cpsw-nuss.h | 1 + 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index ecbde83b5243..6cda4b7c10cb 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -501,7 +501,15 @@ static int am65_cpsw_nuss_common_stop(struct am65_cpsw_common *common) k3_udma_glue_disable_tx_chn(common->tx_chns[i].tx_chn); } + reinit_completion(&common->tdown_complete); k3_udma_glue_tdown_rx_chn(common->rx_chns.rx_chn, true); + + if (common->pdata.quirks & AM64_CPSW_QUIRK_DMA_RX_TDOWN_IRQ) { + i = wait_for_completion_timeout(&common->tdown_complete, msecs_to_jiffies(1000)); + if (!i) + dev_err(common->dev, "rx teardown timeout\n"); + } + napi_disable(&common->napi_rx); for (i = 0; i < AM65_CPSW_MAX_RX_FLOWS; i++) @@ -721,6 +729,8 @@ static int am65_cpsw_nuss_rx_packets(struct am65_cpsw_common *common, if (cppi5_desc_is_tdcm(desc_dma)) { dev_dbg(dev, "%s RX tdown flow: %u\n", __func__, flow_idx); + if (common->pdata.quirks & AM64_CPSW_QUIRK_DMA_RX_TDOWN_IRQ) + complete(&common->tdown_complete); return 0; } @@ -2672,7 +2682,7 @@ static const struct am65_cpsw_pdata j721e_pdata = { }; static const struct am65_cpsw_pdata am64x_cpswxg_pdata = { - .quirks = 0, + .quirks = AM64_CPSW_QUIRK_DMA_RX_TDOWN_IRQ, .ale_dev_id = "am64-cpswxg", .fdqring_mode = K3_RINGACC_RING_MODE_RING, }; diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.h b/drivers/net/ethernet/ti/am65-cpsw-nuss.h index 4b75620f8d28..e5f1c44788c1 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.h +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.h @@ -90,6 +90,7 @@ struct am65_cpsw_rx_chn { }; #define AM65_CPSW_QUIRK_I2027_NO_TX_CSUM BIT(0) +#define AM64_CPSW_QUIRK_DMA_RX_TDOWN_IRQ BIT(1) struct am65_cpsw_pdata { u32 quirks; From a1221703a0f75a9d81748c516457e0fc76951496 Mon Sep 17 00:00:00 2001 From: Pietro Borrello Date: Thu, 9 Feb 2023 12:13:05 +0000 Subject: [PATCH 10/35] sctp: sctp_sock_filter(): avoid list_entry() on possibly empty list Use list_is_first() to check whether tsp->asoc matches the first element of ep->asocs, as the list is not guaranteed to have an entry. Fixes: 8f840e47f190 ("sctp: add the sctp_diag.c file") Signed-off-by: Pietro Borrello Acked-by: Xin Long Link: https://lore.kernel.org/r/20230208-sctp-filter-v2-1-6e1f4017f326@diag.uniroma1.it Signed-off-by: Jakub Kicinski --- net/sctp/diag.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/sctp/diag.c b/net/sctp/diag.c index a557009e9832..c3d6b92dd386 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -343,11 +343,9 @@ static int sctp_sock_filter(struct sctp_endpoint *ep, struct sctp_transport *tsp struct sctp_comm_param *commp = p; struct sock *sk = ep->base.sk; const struct inet_diag_req_v2 *r = commp->r; - struct sctp_association *assoc = - list_entry(ep->asocs.next, struct sctp_association, asocs); /* find the ep only once through the transports by this condition */ - if (tsp->asoc != assoc) + if (!list_is_first(&tsp->asoc->asocs, &ep->asocs)) return 0; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) From ee059170b1f7e94e55fa6cadee544e176a6e59c2 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Thu, 9 Feb 2023 11:37:39 -0300 Subject: [PATCH 11/35] net/sched: tcindex: update imperfect hash filters respecting rcu The imperfect hash area can be updated while packets are traversing, which will cause a use-after-free when 'tcf_exts_exec()' is called with the destroyed tcf_ext. CPU 0: CPU 1: tcindex_set_parms tcindex_classify tcindex_lookup tcindex_lookup tcf_exts_change tcf_exts_exec [UAF] Stop operating on the shared area directly, by using a local copy, and update the filter with 'rcu_replace_pointer()'. Delete the old filter version only after a rcu grace period elapsed. Fixes: 9b0d4446b569 ("net: sched: avoid atomic swap in tcf_exts_change") Reported-by: valis Suggested-by: valis Signed-off-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Link: https://lore.kernel.org/r/20230209143739.279867-1-pctammela@mojatatu.com Signed-off-by: Jakub Kicinski --- net/sched/cls_tcindex.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index ee2a050c887b..ba7f22a49397 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -339,6 +340,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct tcf_result cr = {}; int err, balloc = 0; struct tcf_exts e; + bool update_h = false; err = tcf_exts_init(&e, net, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); if (err < 0) @@ -456,10 +458,13 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, } } - if (cp->perfect) + if (cp->perfect) { r = cp->perfect + handle; - else - r = tcindex_lookup(cp, handle) ? : &new_filter_result; + } else { + /* imperfect area is updated in-place using rcu */ + update_h = !!tcindex_lookup(cp, handle); + r = &new_filter_result; + } if (r == &new_filter_result) { f = kzalloc(sizeof(*f), GFP_KERNEL); @@ -485,7 +490,28 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, rcu_assign_pointer(tp->root, cp); - if (r == &new_filter_result) { + if (update_h) { + struct tcindex_filter __rcu **fp; + struct tcindex_filter *cf; + + f->result.res = r->res; + tcf_exts_change(&f->result.exts, &r->exts); + + /* imperfect area bucket */ + fp = cp->h + (handle % cp->hash); + + /* lookup the filter, guaranteed to exist */ + for (cf = rcu_dereference_bh_rtnl(*fp); cf; + fp = &cf->next, cf = rcu_dereference_bh_rtnl(*fp)) + if (cf->key == handle) + break; + + f->next = cf->next; + + cf = rcu_replace_pointer(*fp, f, 1); + tcf_exts_get_net(&cf->result.exts); + tcf_queue_work(&cf->rwork, tcindex_destroy_fexts_work); + } else if (r == &new_filter_result) { struct tcindex_filter *nfp; struct tcindex_filter __rcu **fp; From 1f090494170ea298530cf1285fb8d078e355b4c0 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Thu, 9 Feb 2023 17:01:30 +0100 Subject: [PATCH 12/35] ice: xsk: Fix cleaning of XDP_TX frames Incrementation of xsk_frames inside the for-loop produces infinite loop, if we have both normal AF_XDP-TX and XDP_TXed buffers to complete. Split xsk_frames into 2 variables (xsk_frames and completed_frames) to eliminate this bug. Fixes: 29322791bc8b ("ice: xsk: change batched Tx descriptor cleaning") Acked-by: Maciej Fijalkowski Signed-off-by: Larysa Zaremba Reviewed-by: Alexander Duyck Acked-by: Tony Nguyen Link: https://lore.kernel.org/r/20230209160130.1779890-1-larysa.zaremba@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_xsk.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 7105de6fb344..374b7f10b549 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -800,6 +800,7 @@ static void ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring) struct ice_tx_desc *tx_desc; u16 cnt = xdp_ring->count; struct ice_tx_buf *tx_buf; + u16 completed_frames = 0; u16 xsk_frames = 0; u16 last_rs; int i; @@ -809,19 +810,21 @@ static void ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring) if ((tx_desc->cmd_type_offset_bsz & cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE))) { if (last_rs >= ntc) - xsk_frames = last_rs - ntc + 1; + completed_frames = last_rs - ntc + 1; else - xsk_frames = last_rs + cnt - ntc + 1; + completed_frames = last_rs + cnt - ntc + 1; } - if (!xsk_frames) + if (!completed_frames) return; - if (likely(!xdp_ring->xdp_tx_active)) + if (likely(!xdp_ring->xdp_tx_active)) { + xsk_frames = completed_frames; goto skip; + } ntc = xdp_ring->next_to_clean; - for (i = 0; i < xsk_frames; i++) { + for (i = 0; i < completed_frames; i++) { tx_buf = &xdp_ring->tx_buf[ntc]; if (tx_buf->raw_buf) { @@ -837,7 +840,7 @@ static void ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring) } skip: tx_desc->cmd_type_offset_bsz = 0; - xdp_ring->next_to_clean += xsk_frames; + xdp_ring->next_to_clean += completed_frames; if (xdp_ring->next_to_clean >= cnt) xdp_ring->next_to_clean -= cnt; if (xsk_frames) From 7fa0b526f865cb42aa33917fd02a92cb03746f4d Mon Sep 17 00:00:00 2001 From: Natalia Petrova Date: Thu, 9 Feb 2023 09:28:33 -0800 Subject: [PATCH 13/35] i40e: Add checking for null for nlmsg_find_attr() The result of nlmsg_find_attr() 'br_spec' is dereferenced in nla_for_each_nested(), but it can take NULL value in nla_find() function, which will result in an error. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 51616018dd1b ("i40e: Add support for getlink, setlink ndo ops") Signed-off-by: Natalia Petrova Reviewed-by: Jesse Brandeburg Tested-by: Gurucharan G (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20230209172833.3596034-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 53d0083e35da..4626d2a1af91 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -13167,6 +13167,8 @@ static int i40e_ndo_bridge_setlink(struct net_device *dev, } br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); + if (!br_spec) + return -EINVAL; nla_for_each_nested(attr, br_spec, rem) { __u16 mode; From ca43ccf41224b023fc290073d5603a755fd12eed Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 9 Feb 2023 16:22:01 -0800 Subject: [PATCH 14/35] dccp/tcp: Avoid negative sk_forward_alloc by ipv6_pinfo.pktoptions. Eric Dumazet pointed out [0] that when we call skb_set_owner_r() for ipv6_pinfo.pktoptions, sk_rmem_schedule() has not been called, resulting in a negative sk_forward_alloc. We add a new helper which clones a skb and sets its owner only when sk_rmem_schedule() succeeds. Note that we move skb_set_owner_r() forward in (dccp|tcp)_v6_do_rcv() because tcp_send_synack() can make sk_forward_alloc negative before ipv6_opt_accepted() in the crossed SYN-ACK or self-connect() cases. [0]: https://lore.kernel.org/netdev/CANn89iK9oc20Jdi_41jb9URdF210r7d1Y-+uypbMSbOfY6jqrg@mail.gmail.com/ Fixes: 323fbd0edf3f ("net: dccp: Add handling of IPV6_PKTOPTIONS to dccp_v6_do_rcv()") Fixes: 3df80d9320bc ("[DCCP]: Introduce DCCPv6") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/sock.h | 13 +++++++++++++ net/dccp/ipv6.c | 7 ++----- net/ipv6/tcp_ipv6.c | 10 +++------- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index dcd72e6285b2..556209727633 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2434,6 +2434,19 @@ static inline __must_check bool skb_set_owner_sk_safe(struct sk_buff *skb, struc return false; } +static inline struct sk_buff *skb_clone_and_charge_r(struct sk_buff *skb, struct sock *sk) +{ + skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC)); + if (skb) { + if (sk_rmem_schedule(sk, skb, skb->truesize)) { + skb_set_owner_r(skb, sk); + return skb; + } + __kfree_skb(skb); + } + return NULL; +} + static inline void skb_prepare_for_gro(struct sk_buff *skb) { if (skb->destructor != sock_wfree) { diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 4260fe466993..b9d7c3dd1cb3 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -551,11 +551,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL); /* Clone pktoptions received with SYN, if we own the req */ if (*own_req && ireq->pktopts) { - newnp->pktoptions = skb_clone(ireq->pktopts, GFP_ATOMIC); + newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk); consume_skb(ireq->pktopts); ireq->pktopts = NULL; - if (newnp->pktoptions) - skb_set_owner_r(newnp->pktoptions, newsk); } return newsk; @@ -615,7 +613,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) --ANK (980728) */ if (np->rxopt.all) - opt_skb = skb_clone(skb, GFP_ATOMIC); + opt_skb = skb_clone_and_charge_r(skb, sk); if (sk->sk_state == DCCP_OPEN) { /* Fast path */ if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len)) @@ -679,7 +677,6 @@ ipv6_pktoptions: np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb, &DCCP_SKB_CB(opt_skb)->header.h6)) { - skb_set_owner_r(opt_skb, sk); memmove(IP6CB(opt_skb), &DCCP_SKB_CB(opt_skb)->header.h6, sizeof(struct inet6_skb_parm)); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0d25e813288d..a52a4f12f146 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1388,14 +1388,11 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * /* Clone pktoptions received with SYN, if we own the req */ if (ireq->pktopts) { - newnp->pktoptions = skb_clone(ireq->pktopts, - sk_gfp_mask(sk, GFP_ATOMIC)); + newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk); consume_skb(ireq->pktopts); ireq->pktopts = NULL; - if (newnp->pktoptions) { + if (newnp->pktoptions) tcp_v6_restore_cb(newnp->pktoptions); - skb_set_owner_r(newnp->pktoptions, newsk); - } } } else { if (!req_unhash && found_dup_sk) { @@ -1467,7 +1464,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) --ANK (980728) */ if (np->rxopt.all) - opt_skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC)); + opt_skb = skb_clone_and_charge_r(skb, sk); reason = SKB_DROP_REASON_NOT_SPECIFIED; if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ @@ -1553,7 +1550,6 @@ ipv6_pktoptions: if (np->repflow) np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) { - skb_set_owner_r(opt_skb, sk); tcp_v6_restore_cb(opt_skb); opt_skb = xchg(&np->pktoptions, opt_skb); } else { From 62ec33b44e0f7168ff2886520fec6fb62d03b5a3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 9 Feb 2023 16:22:02 -0800 Subject: [PATCH 15/35] net: Remove WARN_ON_ONCE(sk->sk_forward_alloc) from sk_stream_kill_queues(). Christoph Paasch reported that commit b5fc29233d28 ("inet6: Remove inet6_destroy_sock() in sk->sk_prot->destroy().") started triggering WARN_ON_ONCE(sk->sk_forward_alloc) in sk_stream_kill_queues(). [0 - 2] Also, we can reproduce it by a program in [3]. In the commit, we delay freeing ipv6_pinfo.pktoptions from sk->destroy() to sk->sk_destruct(), so sk->sk_forward_alloc is no longer zero in inet_csk_destroy_sock(). The same check has been in inet_sock_destruct() from at least v2.6, we can just remove the WARN_ON_ONCE(). However, among the users of sk_stream_kill_queues(), only CAIF is not calling inet_sock_destruct(). Thus, we add the same WARN_ON_ONCE() to caif_sock_destructor(). [0]: https://lore.kernel.org/netdev/39725AB4-88F1-41B3-B07F-949C5CAEFF4F@icloud.com/ [1]: https://github.com/multipath-tcp/mptcp_net-next/issues/341 [2]: WARNING: CPU: 0 PID: 3232 at net/core/stream.c:212 sk_stream_kill_queues+0x2f9/0x3e0 Modules linked in: CPU: 0 PID: 3232 Comm: syz-executor.0 Not tainted 6.2.0-rc5ab24eb4698afbe147b424149c529e2a43ec24eb5 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:sk_stream_kill_queues+0x2f9/0x3e0 Code: 03 0f b6 04 02 84 c0 74 08 3c 03 0f 8e ec 00 00 00 8b ab 08 01 00 00 e9 60 ff ff ff e8 d0 5f b6 fe 0f 0b eb 97 e8 c7 5f b6 fe <0f> 0b eb a0 e8 be 5f b6 fe 0f 0b e9 6a fe ff ff e8 02 07 e3 fe e9 RSP: 0018:ffff88810570fc68 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff888101f38f40 RSI: ffffffff8285e529 RDI: 0000000000000005 RBP: 0000000000000ce0 R08: 0000000000000005 R09: 0000000000000000 R10: 0000000000000ce0 R11: 0000000000000001 R12: ffff8881009e9488 R13: ffffffff84af2cc0 R14: 0000000000000000 R15: ffff8881009e9458 FS: 00007f7fdfbd5800(0000) GS:ffff88811b600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b32923000 CR3: 00000001062fc006 CR4: 0000000000170ef0 Call Trace: inet_csk_destroy_sock+0x1a1/0x320 __tcp_close+0xab6/0xe90 tcp_close+0x30/0xc0 inet_release+0xe9/0x1f0 inet6_release+0x4c/0x70 __sock_release+0xd2/0x280 sock_close+0x15/0x20 __fput+0x252/0xa20 task_work_run+0x169/0x250 exit_to_user_mode_prepare+0x113/0x120 syscall_exit_to_user_mode+0x1d/0x40 do_syscall_64+0x48/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc RIP: 0033:0x7f7fdf7ae28d Code: c1 20 00 00 75 10 b8 03 00 00 00 0f 05 48 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 ee fb ff ff 48 89 04 24 b8 03 00 00 00 0f 05 <48> 8b 3c 24 48 89 c2 e8 37 fc ff ff 48 89 d0 48 83 c4 08 48 3d 01 RSP: 002b:00000000007dfbb0 EFLAGS: 00000293 ORIG_RAX: 0000000000000003 RAX: 0000000000000000 RBX: 0000000000000004 RCX: 00007f7fdf7ae28d RDX: 0000000000000000 RSI: ffffffffffffffff RDI: 0000000000000003 RBP: 0000000000000000 R08: 000000007f338e0f R09: 0000000000000e0f R10: 000000007f338e13 R11: 0000000000000293 R12: 00007f7fdefff000 R13: 00007f7fdefffcd8 R14: 00007f7fdefffce0 R15: 00007f7fdefffcd8 [3]: https://lore.kernel.org/netdev/20230208004245.83497-1-kuniyu@amazon.com/ Fixes: b5fc29233d28 ("inet6: Remove inet6_destroy_sock() in sk->sk_prot->destroy().") Reported-by: syzbot Reported-by: Christoph Paasch Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/caif/caif_socket.c | 1 + net/core/stream.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 748be7253248..78c9729a6057 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -1015,6 +1015,7 @@ static void caif_sock_destructor(struct sock *sk) return; } sk_stream_kill_queues(&cf_sk->sk); + WARN_ON_ONCE(sk->sk_forward_alloc); caif_free_client(&cf_sk->layer); } diff --git a/net/core/stream.c b/net/core/stream.c index cd06750dd329..434446ab14c5 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -209,7 +209,6 @@ void sk_stream_kill_queues(struct sock *sk) sk_mem_reclaim_final(sk); WARN_ON_ONCE(sk->sk_wmem_queued); - WARN_ON_ONCE(sk->sk_forward_alloc); /* It is _impossible_ for the backlog to contain anything * when we get here. All user references to this socket From 2f4796518315ab246638db8feebfcb494212e7ee Mon Sep 17 00:00:00 2001 From: Hyunwoo Kim Date: Thu, 9 Feb 2023 01:16:48 -0800 Subject: [PATCH 16/35] af_key: Fix heap information leak Since x->encap of pfkey_msg2xfrm_state() is not initialized to 0, kernel heap data can be leaked. Fix with kzalloc() to prevent this. Signed-off-by: Hyunwoo Kim Acked-by: Herbert Xu Reviewed-by: Sabrina Dubroca Signed-off-by: David S. Miller --- net/key/af_key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/key/af_key.c b/net/key/af_key.c index 2bdbcec781cd..a815f5ab4c49 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1261,7 +1261,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, const struct sadb_x_nat_t_type* n_type; struct xfrm_encap_tmpl *natt; - x->encap = kmalloc(sizeof(*x->encap), GFP_KERNEL); + x->encap = kzalloc(sizeof(*x->encap), GFP_KERNEL); if (!x->encap) { err = -ENOMEM; goto out; From 2fa28f5c6fcbfc794340684f36d2581b4f2d20b5 Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Fri, 10 Feb 2023 10:05:51 +0800 Subject: [PATCH 17/35] net: openvswitch: fix possible memory leak in ovs_meter_cmd_set() old_meter needs to be free after it is detached regardless of whether the new meter is successfully attached. Fixes: c7c4c44c9a95 ("net: openvswitch: expand the meters supported number") Signed-off-by: Hangyu Hua Acked-by: Eelco Chaudron Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 6e38f68f88c2..f2698d2316df 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -449,7 +449,7 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) err = attach_meter(meter_tbl, meter); if (err) - goto exit_unlock; + goto exit_free_old_meter; ovs_unlock(); @@ -472,6 +472,8 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) genlmsg_end(reply, ovs_reply_header); return genlmsg_reply(reply, info); +exit_free_old_meter: + ovs_meter_free(old_meter); exit_unlock: ovs_unlock(); nlmsg_free(reply); From c68f345b7c425b38656e1791a0486769a8797016 Mon Sep 17 00:00:00 2001 From: Miko Larsson Date: Fri, 10 Feb 2023 09:13:44 +0100 Subject: [PATCH 18/35] net/usb: kalmia: Don't pass act_len in usb_bulk_msg error path syzbot reported that act_len in kalmia_send_init_packet() is uninitialized when passing it to the first usb_bulk_msg error path. Jiri Pirko noted that it's pointless to pass it in the error path, and that the value that would be printed in the second error path would be the value of act_len from the first call to usb_bulk_msg.[1] With this in mind, let's just not pass act_len to the usb_bulk_msg error paths. 1: https://lore.kernel.org/lkml/Y9pY61y1nwTuzMOa@nanopsycho/ Fixes: d40261236e8e ("net/usb: Add Samsung Kalmia driver for Samsung GT-B3730") Reported-and-tested-by: syzbot+cd80c5ef5121bfe85b55@syzkaller.appspotmail.com Signed-off-by: Miko Larsson Reviewed-by: Alexander Duyck Signed-off-by: David S. Miller --- drivers/net/usb/kalmia.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/usb/kalmia.c b/drivers/net/usb/kalmia.c index 9f2b70ef39aa..613fc6910f14 100644 --- a/drivers/net/usb/kalmia.c +++ b/drivers/net/usb/kalmia.c @@ -65,8 +65,8 @@ kalmia_send_init_packet(struct usbnet *dev, u8 *init_msg, u8 init_msg_len, init_msg, init_msg_len, &act_len, KALMIA_USB_TIMEOUT); if (status != 0) { netdev_err(dev->net, - "Error sending init packet. Status %i, length %i\n", - status, act_len); + "Error sending init packet. Status %i\n", + status); return status; } else if (act_len != init_msg_len) { @@ -83,8 +83,8 @@ kalmia_send_init_packet(struct usbnet *dev, u8 *init_msg, u8 init_msg_len, if (status != 0) netdev_err(dev->net, - "Error receiving init result. Status %i, length %i\n", - status, act_len); + "Error receiving init result. Status %i\n", + status); else if (act_len != expected_len) netdev_err(dev->net, "Unexpected init result length: %i\n", act_len); From 9b55d3f0a69af649c62cbc2633e6d695bb3cc583 Mon Sep 17 00:00:00 2001 From: Felix Riemann Date: Fri, 10 Feb 2023 13:36:44 +0100 Subject: [PATCH 19/35] net: Fix unwanted sign extension in netdev_stats_to_stats64() When converting net_device_stats to rtnl_link_stats64 sign extension is triggered on ILP32 machines as 6c1c509778 changed the previous "ulong -> u64" conversion to "long -> u64" by accessing the net_device_stats fields through a (signed) atomic_long_t. This causes for example the received bytes counter to jump to 16EiB after having received 2^31 bytes. Casting the atomic value to "unsigned long" beforehand converting it into u64 avoids this. Fixes: 6c1c5097781f ("net: add atomic_long_t to net_device_stats fields") Signed-off-by: Felix Riemann Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index b76fb37b381e..ea0a7bac1e5c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10375,7 +10375,7 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); for (i = 0; i < n; i++) - dst[i] = atomic_long_read(&src[i]); + dst[i] = (unsigned long)atomic_long_read(&src[i]); /* zero out counters that only exist in rtnl_link_stats64 */ memset((char *)stats64 + n * sizeof(u64), 0, sizeof(*stats64) - n * sizeof(u64)); From 2038cc592811209de20c4e094ca08bfb1e6fbc6c Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Fri, 10 Feb 2023 12:31:55 -0500 Subject: [PATCH 20/35] bnxt_en: Fix mqprio and XDP ring checking logic In bnxt_reserve_rings(), there is logic to check that the number of TX rings reserved is enough to cover all the mqprio TCs, but it fails to account for the TX XDP rings. So the check will always fail if there are mqprio TCs and TX XDP rings. As a result, the driver always fails to initialize after the XDP program is attached and the device will be brought down. A subsequent ifconfig up will also fail because the number of TX rings is set to an inconsistent number. Fix the check to properly account for TX XDP rings. If the check fails, set the number of TX rings back to a consistent number after calling netdev_reset_tc(). Fixes: 674f50a5b026 ("bnxt_en: Implement new method to reserve rings.") Reviewed-by: Hongguang Gao Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 240a7e8a7652..6c32f5c427b5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -9274,10 +9274,14 @@ int bnxt_reserve_rings(struct bnxt *bp, bool irq_re_init) netdev_err(bp->dev, "ring reservation/IRQ init failure rc: %d\n", rc); return rc; } - if (tcs && (bp->tx_nr_rings_per_tc * tcs != bp->tx_nr_rings)) { + if (tcs && (bp->tx_nr_rings_per_tc * tcs != + bp->tx_nr_rings - bp->tx_nr_rings_xdp)) { netdev_err(bp->dev, "tx ring reservation failure\n"); netdev_reset_tc(bp->dev); - bp->tx_nr_rings_per_tc = bp->tx_nr_rings; + if (bp->tx_nr_rings_xdp) + bp->tx_nr_rings_per_tc = bp->tx_nr_rings_xdp; + else + bp->tx_nr_rings_per_tc = bp->tx_nr_rings; return -ENOMEM; } return 0; From 3e6dc119a37bceb06e1d595b1d41b52f3e99132d Mon Sep 17 00:00:00 2001 From: Michal Wilczynski Date: Wed, 25 Jan 2023 12:37:40 +0100 Subject: [PATCH 21/35] ice: Fix check for weight and priority of a scheduling node Currently checks for weight and priority ranges don't check incoming value from the devlink. Instead it checks node current weight or priority. This makes those checks useless. Change range checks in ice_set_object_tx_priority() and ice_set_object_tx_weight() to check against incoming priority an weight. Fixes: 42c2eb6b1f43 ("ice: Implement devlink-rate API") Signed-off-by: Michal Wilczynski Acked-by: Jesse Brandeburg Reviewed-by: Paul Menzel Tested-by: Gurucharan G (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_devlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_devlink.c b/drivers/net/ethernet/intel/ice/ice_devlink.c index 8286e47b4bae..0fae0186bd85 100644 --- a/drivers/net/ethernet/intel/ice/ice_devlink.c +++ b/drivers/net/ethernet/intel/ice/ice_devlink.c @@ -899,7 +899,7 @@ static int ice_set_object_tx_priority(struct ice_port_info *pi, struct ice_sched { int status; - if (node->tx_priority >= 8) { + if (priority >= 8) { NL_SET_ERR_MSG_MOD(extack, "Priority should be less than 8"); return -EINVAL; } @@ -929,7 +929,7 @@ static int ice_set_object_tx_weight(struct ice_port_info *pi, struct ice_sched_n { int status; - if (node->tx_weight > 200 || node->tx_weight < 1) { + if (weight > 200 || weight < 1) { NL_SET_ERR_MSG_MOD(extack, "Weight must be between 1 and 200"); return -EINVAL; } From 43fbca02c2ddc39ff5879b6f3a4a097b1ba02098 Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Mon, 6 Feb 2023 15:54:36 -0800 Subject: [PATCH 22/35] ice: fix lost multicast packets in promisc mode There was a problem reported to us where the addition of a VF with an IPv6 address ending with a particular sequence would cause the parent device on the PF to no longer be able to respond to neighbor discovery packets. In this case, we had an ovs-bridge device living on top of a VLAN, which was on top of a PF, and it would not be able to talk anymore (the neighbor entry would expire and couldn't be restored). The root cause of the issue is that if the PF is asked to be in IFF_PROMISC mode (promiscuous mode) and it had an ipv6 address that needed the 33:33:ff:00:00:04 multicast address to work, then when the VF was added with the need for the same multicast address, the VF would steal all the traffic destined for that address. The ice driver didn't auto-subscribe a request of IFF_PROMISC to the "multicast replication from other port's traffic" meaning that it won't get for instance, packets with an exact destination in the VF, as above. The VF's IPv6 address, which adds a "perfect filter" for 33:33:ff:00:00:04, results in no packets for that multicast address making it to the PF (which is in promisc but NOT "multicast replication"). The fix is to enable "multicast promiscuous" whenever the driver is asked to enable IFF_PROMISC, and make sure to disable it when appropriate. Fixes: e94d44786693 ("ice: Implement filter sync, NDO operations and bump version") Signed-off-by: Jesse Brandeburg Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_main.c | 26 +++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index b288a01a321a..8ec24f6cf6be 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -275,6 +275,8 @@ static int ice_set_promisc(struct ice_vsi *vsi, u8 promisc_m) if (status && status != -EEXIST) return status; + netdev_dbg(vsi->netdev, "set promisc filter bits for VSI %i: 0x%x\n", + vsi->vsi_num, promisc_m); return 0; } @@ -300,6 +302,8 @@ static int ice_clear_promisc(struct ice_vsi *vsi, u8 promisc_m) promisc_m, 0); } + netdev_dbg(vsi->netdev, "clear promisc filter bits for VSI %i: 0x%x\n", + vsi->vsi_num, promisc_m); return status; } @@ -414,6 +418,16 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) } err = 0; vlan_ops->dis_rx_filtering(vsi); + + /* promiscuous mode implies allmulticast so + * that VSIs that are in promiscuous mode are + * subscribed to multicast packets coming to + * the port + */ + err = ice_set_promisc(vsi, + ICE_MCAST_PROMISC_BITS); + if (err) + goto out_promisc; } } else { /* Clear Rx filter to remove traffic from wire */ @@ -430,6 +444,18 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) NETIF_F_HW_VLAN_CTAG_FILTER) vlan_ops->ena_rx_filtering(vsi); } + + /* disable allmulti here, but only if allmulti is not + * still enabled for the netdev + */ + if (!(vsi->current_netdev_flags & IFF_ALLMULTI)) { + err = ice_clear_promisc(vsi, + ICE_MCAST_PROMISC_BITS); + if (err) { + netdev_err(netdev, "Error %d clearing multicast promiscuous on VSI %i\n", + err, vsi->vsi_num); + } + } } } goto exit; From 4562c65ec852067c6196abdcf2d925f08841dcbc Mon Sep 17 00:00:00 2001 From: Johannes Zink Date: Fri, 10 Feb 2023 15:39:37 +0100 Subject: [PATCH 23/35] net: stmmac: fix order of dwmac5 FlexPPS parametrization sequence So far changing the period by just setting new period values while running did not work. The order as indicated by the publicly available reference manual of the i.MX8MP [1] indicates a sequence: * initiate the programming sequence * set the values for PPS period and start time * start the pulse train generation. This is currently not used in dwmac5_flex_pps_config(), which instead does: * initiate the programming sequence and immediately start the pulse train generation * set the values for PPS period and start time This caused the period values written not to take effect until the FlexPPS output was disabled and re-enabled again. This patch fix the order and allows the period to be set immediately. [1] https://www.nxp.com/webapp/Download?colCode=IMX8MPRM Fixes: 9a8a02c9d46d ("net: stmmac: Add Flexible PPS support") Signed-off-by: Johannes Zink Link: https://lore.kernel.org/r/20230210143937.3427483-1-j.zink@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac5.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c index 413f66017219..e95d35f1e5a0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c @@ -541,9 +541,9 @@ int dwmac5_flex_pps_config(void __iomem *ioaddr, int index, return 0; } - val |= PPSCMDx(index, 0x2); val |= TRGTMODSELx(index, 0x2); val |= PPSEN0; + writel(val, ioaddr + MAC_PPS_CONTROL); writel(cfg->start.tv_sec, ioaddr + MAC_PPSx_TARGET_TIME_SEC(index)); @@ -568,6 +568,7 @@ int dwmac5_flex_pps_config(void __iomem *ioaddr, int index, writel(period - 1, ioaddr + MAC_PPSx_WIDTH(index)); /* Finally, activate it */ + val |= PPSCMDx(index, 0x2); writel(val, ioaddr + MAC_PPS_CONTROL); return 0; } From 21c167aa0ba943a7cac2f6969814f83bb701666b Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Fri, 10 Feb 2023 17:08:25 -0300 Subject: [PATCH 24/35] net/sched: act_ctinfo: use percpu stats The tc action act_ctinfo was using shared stats, fix it to use percpu stats since bstats_update() must be called with locks or with a percpu pointer argument. tdc results: 1..12 ok 1 c826 - Add ctinfo action with default setting ok 2 0286 - Add ctinfo action with dscp ok 3 4938 - Add ctinfo action with valid cpmark and zone ok 4 7593 - Add ctinfo action with drop control ok 5 2961 - Replace ctinfo action zone and action control ok 6 e567 - Delete ctinfo action with valid index ok 7 6a91 - Delete ctinfo action with invalid index ok 8 5232 - List ctinfo actions ok 9 7702 - Flush ctinfo actions ok 10 3201 - Add ctinfo action with duplicate index ok 11 8295 - Add ctinfo action with invalid index ok 12 3964 - Replace ctinfo action with invalid goto_chain control Fixes: 24ec483cec98 ("net: sched: Introduce act_ctinfo action") Reviewed-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Reviewed-by: Larysa Zaremba Link: https://lore.kernel.org/r/20230210200824.444856-1-pctammela@mojatatu.com Signed-off-by: Jakub Kicinski --- net/sched/act_ctinfo.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 4b1b59da5c0b..4d15b6a6169c 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -93,7 +93,7 @@ TC_INDIRECT_SCOPE int tcf_ctinfo_act(struct sk_buff *skb, cp = rcu_dereference_bh(ca->params); tcf_lastuse_update(&ca->tcf_tm); - bstats_update(&ca->tcf_bstats, skb); + tcf_action_update_bstats(&ca->common, skb); action = READ_ONCE(ca->tcf_action); wlen = skb_network_offset(skb); @@ -212,8 +212,8 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, index = actparm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { - ret = tcf_idr_create(tn, index, est, a, - &act_ctinfo_ops, bind, false, flags); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_ctinfo_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; From 05d7623a892a9da62da0e714428e38f09e4a64d8 Mon Sep 17 00:00:00 2001 From: Cristian Ciocaltea Date: Fri, 10 Feb 2023 22:21:26 +0200 Subject: [PATCH 25/35] net: stmmac: Restrict warning on disabling DMA store and fwd mode When setting 'snps,force_thresh_dma_mode' DT property, the following warning is always emitted, regardless the status of force_sf_dma_mode: dwmac-starfive 10020000.ethernet: force_sf_dma_mode is ignored if force_thresh_dma_mode is set. Do not print the rather misleading message when DMA store and forward mode is already disabled. Fixes: e2a240c7d3bc ("driver:net:stmmac: Disable DMA store and forward mode if platform data force_thresh_dma_mode is set.") Signed-off-by: Cristian Ciocaltea Link: https://lore.kernel.org/r/20230210202126.877548-1-cristian.ciocaltea@collabora.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index eb6d9cd8e93f..0046a4ee6e64 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -559,7 +559,7 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) dma_cfg->mixed_burst = of_property_read_bool(np, "snps,mixed-burst"); plat->force_thresh_dma_mode = of_property_read_bool(np, "snps,force_thresh_dma_mode"); - if (plat->force_thresh_dma_mode) { + if (plat->force_thresh_dma_mode && plat->force_sf_dma_mode) { plat->force_sf_dma_mode = 0; dev_warn(&pdev->dev, "force_sf_dma_mode is ignored if force_thresh_dma_mode is set.\n"); From f9cd6a4418bac6a046ee78382423b1ae7565fb24 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Wed, 8 Feb 2023 10:43:32 +0800 Subject: [PATCH 26/35] ixgbe: allow to increase MTU to 3K with XDP enabled Recently I encountered one case where I cannot increase the MTU size directly from 1500 to a much bigger value with XDP enabled if the server is equipped with IXGBE card, which happened on thousands of servers in production environment. After applying the current patch, we can set the maximum MTU size to 3K. This patch follows the behavior of changing MTU as i40e/ice does. References: [1] commit 23b44513c3e6 ("ice: allow 3k MTU for XDP") [2] commit 0c8493d90b6b ("i40e: add XDP support for pass and drop actions") Fixes: fabf1bce103a ("ixgbe: Prevent unsupported configurations with XDP") Signed-off-by: Jason Xing Reviewed-by: Alexander Duyck Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index ab8370c413f3..25ca329f7d3c 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6777,6 +6777,18 @@ static void ixgbe_free_all_rx_resources(struct ixgbe_adapter *adapter) ixgbe_free_rx_resources(adapter->rx_ring[i]); } +/** + * ixgbe_max_xdp_frame_size - returns the maximum allowed frame size for XDP + * @adapter: device handle, pointer to adapter + */ +static int ixgbe_max_xdp_frame_size(struct ixgbe_adapter *adapter) +{ + if (PAGE_SIZE >= 8192 || adapter->flags2 & IXGBE_FLAG2_RX_LEGACY) + return IXGBE_RXBUFFER_2K; + else + return IXGBE_RXBUFFER_3K; +} + /** * ixgbe_change_mtu - Change the Maximum Transfer Unit * @netdev: network interface device structure @@ -6788,18 +6800,13 @@ static int ixgbe_change_mtu(struct net_device *netdev, int new_mtu) { struct ixgbe_adapter *adapter = netdev_priv(netdev); - if (adapter->xdp_prog) { + if (ixgbe_enabled_xdp_adapter(adapter)) { int new_frame_size = new_mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN; - int i; - for (i = 0; i < adapter->num_rx_queues; i++) { - struct ixgbe_ring *ring = adapter->rx_ring[i]; - - if (new_frame_size > ixgbe_rx_bufsz(ring)) { - e_warn(probe, "Requested MTU size is not supported with XDP\n"); - return -EINVAL; - } + if (new_frame_size > ixgbe_max_xdp_frame_size(adapter)) { + e_warn(probe, "Requested MTU size is not supported with XDP\n"); + return -EINVAL; } } From ce45ffb815e8e238f05de1630be3969b6bb15e4e Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Wed, 8 Feb 2023 10:43:33 +0800 Subject: [PATCH 27/35] i40e: add double of VLAN header when computing the max MTU Include the second VLAN HLEN into account when computing the maximum MTU size as other drivers do. Fixes: 0c8493d90b6b ("i40e: add XDP support for pass and drop actions") Signed-off-by: Jason Xing Reviewed-by: Alexander Duyck Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 4626d2a1af91..52eec0a50492 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -2921,7 +2921,7 @@ static int i40e_change_mtu(struct net_device *netdev, int new_mtu) struct i40e_pf *pf = vsi->back; if (i40e_enabled_xdp_vsi(vsi)) { - int frame_size = new_mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN; + int frame_size = new_mtu + I40E_PACKET_HDR_PAD; if (frame_size > i40e_max_xdp_frame_size(vsi)) return -EINVAL; From 0967bf837784a11c65d66060623a74e65211af0b Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Thu, 9 Feb 2023 10:41:28 +0800 Subject: [PATCH 28/35] ixgbe: add double of VLAN header when computing the max MTU Include the second VLAN HLEN into account when computing the maximum MTU size as other drivers do. Fixes: fabf1bce103a ("ixgbe: Prevent unsupported configurations with XDP") Signed-off-by: Jason Xing Reviewed-by: Alexander Duyck Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe.h | 2 ++ drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index bc68b8f2176d..8736ca4b2628 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -73,6 +73,8 @@ #define IXGBE_RXBUFFER_4K 4096 #define IXGBE_MAX_RXBUFFER 16384 /* largest size for a single descriptor */ +#define IXGBE_PKT_HDR_PAD (ETH_HLEN + ETH_FCS_LEN + (VLAN_HLEN * 2)) + /* Attempt to maximize the headroom available for incoming frames. We * use a 2K buffer for receives and need 1536/1534 to store the data for * the frame. This leaves us with 512 bytes of room. From that we need diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 25ca329f7d3c..4507fba8747a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6801,8 +6801,7 @@ static int ixgbe_change_mtu(struct net_device *netdev, int new_mtu) struct ixgbe_adapter *adapter = netdev_priv(netdev); if (ixgbe_enabled_xdp_adapter(adapter)) { - int new_frame_size = new_mtu + ETH_HLEN + ETH_FCS_LEN + - VLAN_HLEN; + int new_frame_size = new_mtu + IXGBE_PKT_HDR_PAD; if (new_frame_size > ixgbe_max_xdp_frame_size(adapter)) { e_warn(probe, "Requested MTU size is not supported with XDP\n"); From 2558b8039d059342197610498c8749ad294adee5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Feb 2023 16:00:59 +0000 Subject: [PATCH 29/35] net: use a bounce buffer for copying skb->mark syzbot found arm64 builds would crash in sock_recv_mark() when CONFIG_HARDENED_USERCOPY=y x86 and powerpc are not detecting the issue because they define user_access_begin. This will be handled in a different patch, because a check_object_size() is missing. Only data from skb->cb[] can be copied directly to/from user space, as explained in commit 79a8a642bf05 ("net: Whitelist the skbuff_head_cache "cb" field") syzbot report was: usercopy: Kernel memory exposure attempt detected from SLUB object 'skbuff_head_cache' (offset 168, size 4)! ------------[ cut here ]------------ kernel BUG at mm/usercopy.c:102 ! Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 4410 Comm: syz-executor533 Not tainted 6.2.0-rc7-syzkaller-17907-g2d3827b3f393 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/21/2023 pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : usercopy_abort+0x90/0x94 mm/usercopy.c:90 lr : usercopy_abort+0x90/0x94 mm/usercopy.c:90 sp : ffff80000fb9b9a0 x29: ffff80000fb9b9b0 x28: ffff0000c6073400 x27: 0000000020001a00 x26: 0000000000000014 x25: ffff80000cf52000 x24: fffffc0000000000 x23: 05ffc00000000200 x22: fffffc000324bf80 x21: ffff0000c92fe1a8 x20: 0000000000000001 x19: 0000000000000004 x18: 0000000000000000 x17: 656a626f2042554c x16: ffff0000c6073dd0 x15: ffff80000dbd2118 x14: ffff0000c6073400 x13: 00000000ffffffff x12: ffff0000c6073400 x11: ff808000081bbb4c x10: 0000000000000000 x9 : 7b0572d7cc0ccf00 x8 : 7b0572d7cc0ccf00 x7 : ffff80000bf650d4 x6 : 0000000000000000 x5 : 0000000000000001 x4 : 0000000000000001 x3 : 0000000000000000 x2 : ffff0001fefbff08 x1 : 0000000100000000 x0 : 000000000000006c Call trace: usercopy_abort+0x90/0x94 mm/usercopy.c:90 __check_heap_object+0xa8/0x100 mm/slub.c:4761 check_heap_object mm/usercopy.c:196 [inline] __check_object_size+0x208/0x6b8 mm/usercopy.c:251 check_object_size include/linux/thread_info.h:199 [inline] __copy_to_user include/linux/uaccess.h:115 [inline] put_cmsg+0x408/0x464 net/core/scm.c:238 sock_recv_mark net/socket.c:975 [inline] __sock_recv_cmsgs+0x1fc/0x248 net/socket.c:984 sock_recv_cmsgs include/net/sock.h:2728 [inline] packet_recvmsg+0x2d8/0x678 net/packet/af_packet.c:3482 ____sys_recvmsg+0x110/0x3a0 ___sys_recvmsg net/socket.c:2737 [inline] __sys_recvmsg+0x194/0x210 net/socket.c:2767 __do_sys_recvmsg net/socket.c:2777 [inline] __se_sys_recvmsg net/socket.c:2774 [inline] __arm64_sys_recvmsg+0x2c/0x3c net/socket.c:2774 __invoke_syscall arch/arm64/kernel/syscall.c:38 [inline] invoke_syscall+0x64/0x178 arch/arm64/kernel/syscall.c:52 el0_svc_common+0xbc/0x180 arch/arm64/kernel/syscall.c:142 do_el0_svc+0x48/0x110 arch/arm64/kernel/syscall.c:193 el0_svc+0x58/0x14c arch/arm64/kernel/entry-common.c:637 el0t_64_sync_handler+0x84/0xf0 arch/arm64/kernel/entry-common.c:655 el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:591 Code: 91388800 aa0903e1 f90003e8 94e6d752 (d4210000) Fixes: 6fd1d51cfa25 ("net: SO_RCVMARK socket option for SO_MARK with recvmsg()") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Erin MacNeil Reviewed-by: Alexander Lobakin Link: https://lore.kernel.org/r/20230213160059.3829741-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/socket.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/socket.c b/net/socket.c index 888cd618a968..c12af3c84d3a 100644 --- a/net/socket.c +++ b/net/socket.c @@ -971,9 +971,12 @@ static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk, static void sock_recv_mark(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { - if (sock_flag(sk, SOCK_RCVMARK) && skb) - put_cmsg(msg, SOL_SOCKET, SO_MARK, sizeof(__u32), - &skb->mark); + if (sock_flag(sk, SOCK_RCVMARK) && skb) { + /* We must use a bounce buffer for CONFIG_HARDENED_USERCOPY=y */ + __u32 mark = skb->mark; + + put_cmsg(msg, SOL_SOCKET, SO_MARK, sizeof(__u32), &mark); + } } void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, From 207ce626add80ddd941f62fc2fe5d77586e0801b Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Mon, 13 Feb 2023 10:58:22 -0800 Subject: [PATCH 30/35] igb: Fix PPS input and output using 3rd and 4th SDP Fix handling of the tsync interrupt to compare the pin number with IGB_N_SDP instead of IGB_N_EXTTS/IGB_N_PEROUT and fix the indexing to the perout array. Fixes: cf99c1dd7b77 ("igb: move PEROUT and EXTTS isr logic to separate functions") Reported-by: Matt Corallo Signed-off-by: Miroslav Lichvar Reviewed-by: Jacob Keller Tested-by: Gurucharan G (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20230213185822.3960072-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/igb/igb_main.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 3c0c35ecea10..d8e3048b93dd 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -6794,7 +6794,7 @@ static void igb_perout(struct igb_adapter *adapter, int tsintr_tt) struct timespec64 ts; u32 tsauxc; - if (pin < 0 || pin >= IGB_N_PEROUT) + if (pin < 0 || pin >= IGB_N_SDP) return; spin_lock(&adapter->tmreg_lock); @@ -6802,7 +6802,7 @@ static void igb_perout(struct igb_adapter *adapter, int tsintr_tt) if (hw->mac.type == e1000_82580 || hw->mac.type == e1000_i354 || hw->mac.type == e1000_i350) { - s64 ns = timespec64_to_ns(&adapter->perout[pin].period); + s64 ns = timespec64_to_ns(&adapter->perout[tsintr_tt].period); u32 systiml, systimh, level_mask, level, rem; u64 systim, now; @@ -6850,8 +6850,8 @@ static void igb_perout(struct igb_adapter *adapter, int tsintr_tt) ts.tv_nsec = (u32)systim; ts.tv_sec = ((u32)(systim >> 32)) & 0xFF; } else { - ts = timespec64_add(adapter->perout[pin].start, - adapter->perout[pin].period); + ts = timespec64_add(adapter->perout[tsintr_tt].start, + adapter->perout[tsintr_tt].period); } /* u32 conversion of tv_sec is safe until y2106 */ @@ -6860,7 +6860,7 @@ static void igb_perout(struct igb_adapter *adapter, int tsintr_tt) tsauxc = rd32(E1000_TSAUXC); tsauxc |= TSAUXC_EN_TT0; wr32(E1000_TSAUXC, tsauxc); - adapter->perout[pin].start = ts; + adapter->perout[tsintr_tt].start = ts; spin_unlock(&adapter->tmreg_lock); } @@ -6874,7 +6874,7 @@ static void igb_extts(struct igb_adapter *adapter, int tsintr_tt) struct ptp_clock_event event; struct timespec64 ts; - if (pin < 0 || pin >= IGB_N_EXTTS) + if (pin < 0 || pin >= IGB_N_SDP) return; if (hw->mac.type == e1000_82580 || From 11a4d6f67cf55883dc78e31c247d1903ed7feccc Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Tue, 14 Feb 2023 01:26:06 +0000 Subject: [PATCH 31/35] tipc: fix kernel warning when sending SYN message When sending a SYN message, this kernel stack trace is observed: ... [ 13.396352] RIP: 0010:_copy_from_iter+0xb4/0x550 ... [ 13.398494] Call Trace: [ 13.398630] [ 13.398630] ? __alloc_skb+0xed/0x1a0 [ 13.398630] tipc_msg_build+0x12c/0x670 [tipc] [ 13.398630] ? shmem_add_to_page_cache.isra.71+0x151/0x290 [ 13.398630] __tipc_sendmsg+0x2d1/0x710 [tipc] [ 13.398630] ? tipc_connect+0x1d9/0x230 [tipc] [ 13.398630] ? __local_bh_enable_ip+0x37/0x80 [ 13.398630] tipc_connect+0x1d9/0x230 [tipc] [ 13.398630] ? __sys_connect+0x9f/0xd0 [ 13.398630] __sys_connect+0x9f/0xd0 [ 13.398630] ? preempt_count_add+0x4d/0xa0 [ 13.398630] ? fpregs_assert_state_consistent+0x22/0x50 [ 13.398630] __x64_sys_connect+0x16/0x20 [ 13.398630] do_syscall_64+0x42/0x90 [ 13.398630] entry_SYSCALL_64_after_hwframe+0x63/0xcd It is because commit a41dad905e5a ("iov_iter: saner checks for attempt to copy to/from iterator") has introduced sanity check for copying from/to iov iterator. Lacking of copy direction from the iterator viewpoint would lead to kernel stack trace like above. This commit fixes this issue by initializing the iov iterator with the correct copy direction when sending SYN or ACK without data. Fixes: f25dcc7687d4 ("tipc: tipc ->sendmsg() conversion") Reported-by: syzbot+d43608d061e8847ec9f3@syzkaller.appspotmail.com Acked-by: Jon Maloy Signed-off-by: Tung Nguyen Link: https://lore.kernel.org/r/20230214012606.5804-1-tung.q.nguyen@dektech.com.au Signed-off-by: Jakub Kicinski --- net/tipc/socket.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index b35c8701876a..a38733f2197a 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2614,6 +2614,7 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest, /* Send a 'SYN-' to destination */ m.msg_name = dest; m.msg_namelen = destlen; + iov_iter_kvec(&m.msg_iter, ITER_SOURCE, NULL, 0, 0); /* If connect is in non-blocking case, set MSG_DONTWAIT to * indicate send_msg() is never blocked. @@ -2776,6 +2777,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, __skb_queue_head(&new_sk->sk_receive_queue, buf); skb_set_owner_r(buf, new_sk); } + iov_iter_kvec(&m.msg_iter, ITER_SOURCE, NULL, 0, 0); __tipc_sendstream(new_sock, &m, 0); release_sock(new_sk); exit: From 42018a322bd453e38b3ffee294982243e50a484f Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Mon, 13 Feb 2023 22:47:29 -0300 Subject: [PATCH 32/35] net/sched: tcindex: search key must be 16 bits Syzkaller found an issue where a handle greater than 16 bits would trigger a null-ptr-deref in the imperfect hash area update. general protection fault, probably for non-canonical address 0xdffffc0000000015: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x00000000000000a8-0x00000000000000af] CPU: 0 PID: 5070 Comm: syz-executor456 Not tainted 6.2.0-rc7-syzkaller-00112-gc68f345b7c42 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/21/2023 RIP: 0010:tcindex_set_parms+0x1a6a/0x2990 net/sched/cls_tcindex.c:509 Code: 01 e9 e9 fe ff ff 4c 8b bd 28 fe ff ff e8 0e 57 7d f9 48 8d bb a8 00 00 00 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 94 0c 00 00 48 8b 85 f8 fd ff ff 48 8b 9b a8 00 RSP: 0018:ffffc90003d3ef88 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000015 RSI: ffffffff8803a102 RDI: 00000000000000a8 RBP: ffffc90003d3f1d8 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: ffff88801e2b10a8 R13: dffffc0000000000 R14: 0000000000030000 R15: ffff888017b3be00 FS: 00005555569af300(0000) GS:ffff8880b9800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000056041c6d2000 CR3: 000000002bfca000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tcindex_change+0x1ea/0x320 net/sched/cls_tcindex.c:572 tc_new_tfilter+0x96e/0x2220 net/sched/cls_api.c:2155 rtnetlink_rcv_msg+0x959/0xca0 net/core/rtnetlink.c:6132 netlink_rcv_skb+0x165/0x440 net/netlink/af_netlink.c:2574 netlink_unicast_kernel net/netlink/af_netlink.c:1339 [inline] netlink_unicast+0x547/0x7f0 net/netlink/af_netlink.c:1365 netlink_sendmsg+0x91b/0xe10 net/netlink/af_netlink.c:1942 sock_sendmsg_nosec net/socket.c:714 [inline] sock_sendmsg+0xd3/0x120 net/socket.c:734 ____sys_sendmsg+0x334/0x8c0 net/socket.c:2476 ___sys_sendmsg+0x110/0x1b0 net/socket.c:2530 __sys_sendmmsg+0x18f/0x460 net/socket.c:2616 __do_sys_sendmmsg net/socket.c:2645 [inline] __se_sys_sendmmsg net/socket.c:2642 [inline] __x64_sys_sendmmsg+0x9d/0x100 net/socket.c:2642 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80 Fixes: ee059170b1f7 ("net/sched: tcindex: update imperfect hash filters respecting rcu") Signed-off-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Reported-by: syzbot Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index ba7f22a49397..6640e75eaa02 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -503,7 +503,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, /* lookup the filter, guaranteed to exist */ for (cf = rcu_dereference_bh_rtnl(*fp); cf; fp = &cf->next, cf = rcu_dereference_bh_rtnl(*fp)) - if (cf->key == handle) + if (cf->key == (u16)handle) break; f->next = cf->next; From fda6c89fe3d9aca073495a664e1d5aea28cd4377 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 13 Feb 2023 22:53:55 -0800 Subject: [PATCH 33/35] net: mpls: fix stale pointer if allocation fails during device rename lianhui reports that when MPLS fails to register the sysctl table under new location (during device rename) the old pointers won't get overwritten and may be freed again (double free). Handle this gracefully. The best option would be unregistering the MPLS from the device completely on failure, but unfortunately mpls_ifdown() can fail. So failing fully is also unreliable. Another option is to register the new table first then only remove old one if the new one succeeds. That requires more code, changes order of notifications and two tables may be visible at the same time. sysctl point is not used in the rest of the code - set to NULL on failures and skip unregister if already NULL. Reported-by: lianhui tang Fixes: 0fae3bf018d9 ("mpls: handle device renames for per-device sysctls") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 35b5f806fdda..dc5165d3eec4 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1428,6 +1428,7 @@ static int mpls_dev_sysctl_register(struct net_device *dev, free: kfree(table); out: + mdev->sysctl = NULL; return -ENOBUFS; } @@ -1437,6 +1438,9 @@ static void mpls_dev_sysctl_unregister(struct net_device *dev, struct net *net = dev_net(dev); struct ctl_table *table; + if (!mdev->sysctl) + return; + table = mdev->sysctl->ctl_table_arg; unregister_net_sysctl_table(mdev->sysctl); kfree(table); From 5d54cb1767e06025819daa6769e0f18dcbc60936 Mon Sep 17 00:00:00 2001 From: Corinna Vinschen Date: Tue, 14 Feb 2023 10:55:48 -0800 Subject: [PATCH 34/35] igb: conditionalize I2C bit banging on external thermal sensor support Commit a97f8783a937 ("igb: unbreak I2C bit-banging on i350") introduced code to change I2C settings to bit banging unconditionally. However, this patch introduced a regression: On an Intel S2600CWR Server Board with three NICs: - 1x dual-port copper Intel I350 Gigabit Network Connection [8086:1521] (rev 01) fw 1.63, 0x80000dda - 2x quad-port SFP+ with copper SFP Avago ABCU-5700RZ Intel I350 Gigabit Fiber Network Connection [8086:1522] (rev 01) fw 1.52.0 the SFP NICs no longer get link at all. Reverting commit a97f8783a937 or switching to the Intel out-of-tree driver both fix the problem. Per the igb out-of-tree driver, I2C bit banging on i350 depends on support for an external thermal sensor (ETS). However, commit a97f8783a937 added bit banging unconditionally. Additionally, the out-of-tree driver always calls init_thermal_sensor_thresh on probe, while our driver only calls init_thermal_sensor_thresh only in igb_reset(), and only if an ETS is present, ignoring the internal thermal sensor. The affected SFPs don't provide an ETS. Per Intel, the behaviour is a result of i350 firmware requirements. This patch fixes the problem by aligning the behaviour to the out-of-tree driver: - split igb_init_i2c() into two functions: - igb_init_i2c() only performs the basic I2C initialization. - igb_set_i2c_bb() makes sure that E1000_CTRL_I2C_ENA is set and enables bit-banging. - igb_probe() only calls igb_set_i2c_bb() if an ETS is present. - igb_probe() calls init_thermal_sensor_thresh() unconditionally. - igb_reset() aligns its behaviour to igb_probe(), i. e., call igb_set_i2c_bb() if an ETS is present and call init_thermal_sensor_thresh() unconditionally. Fixes: a97f8783a937 ("igb: unbreak I2C bit-banging on i350") Tested-by: Mateusz Palczewski Co-developed-by: Jamie Bainbridge Signed-off-by: Jamie Bainbridge Signed-off-by: Corinna Vinschen Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20230214185549.1306522-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/igb/igb_main.c | 42 +++++++++++++++++------ 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index d8e3048b93dd..b5b443883da9 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2256,6 +2256,30 @@ static void igb_enable_mas(struct igb_adapter *adapter) } } +#ifdef CONFIG_IGB_HWMON +/** + * igb_set_i2c_bb - Init I2C interface + * @hw: pointer to hardware structure + **/ +static void igb_set_i2c_bb(struct e1000_hw *hw) +{ + u32 ctrl_ext; + s32 i2cctl; + + ctrl_ext = rd32(E1000_CTRL_EXT); + ctrl_ext |= E1000_CTRL_I2C_ENA; + wr32(E1000_CTRL_EXT, ctrl_ext); + wrfl(); + + i2cctl = rd32(E1000_I2CPARAMS); + i2cctl |= E1000_I2CBB_EN + | E1000_I2C_CLK_OE_N + | E1000_I2C_DATA_OE_N; + wr32(E1000_I2CPARAMS, i2cctl); + wrfl(); +} +#endif + void igb_reset(struct igb_adapter *adapter) { struct pci_dev *pdev = adapter->pdev; @@ -2400,7 +2424,8 @@ void igb_reset(struct igb_adapter *adapter) * interface. */ if (adapter->ets) - mac->ops.init_thermal_sensor_thresh(hw); + igb_set_i2c_bb(hw); + mac->ops.init_thermal_sensor_thresh(hw); } } #endif @@ -3117,21 +3142,12 @@ static void igb_init_mas(struct igb_adapter *adapter) **/ static s32 igb_init_i2c(struct igb_adapter *adapter) { - struct e1000_hw *hw = &adapter->hw; s32 status = 0; - s32 i2cctl; /* I2C interface supported on i350 devices */ if (adapter->hw.mac.type != e1000_i350) return 0; - i2cctl = rd32(E1000_I2CPARAMS); - i2cctl |= E1000_I2CBB_EN - | E1000_I2C_CLK_OUT | E1000_I2C_CLK_OE_N - | E1000_I2C_DATA_OUT | E1000_I2C_DATA_OE_N; - wr32(E1000_I2CPARAMS, i2cctl); - wrfl(); - /* Initialize the i2c bus which is controlled by the registers. * This bus will use the i2c_algo_bit structure that implements * the protocol through toggling of the 4 bits in the register. @@ -3521,6 +3537,12 @@ static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent) adapter->ets = true; else adapter->ets = false; + /* Only enable I2C bit banging if an external thermal + * sensor is supported. + */ + if (adapter->ets) + igb_set_i2c_bb(hw); + hw->mac.ops.init_thermal_sensor_thresh(hw); if (igb_sysfs_init(adapter)) dev_err(&pdev->dev, "failed to allocate sysfs resources\n"); From b20b8aec6ffc07bb547966b356780cd344f20f5b Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 15 Feb 2023 09:31:39 +0200 Subject: [PATCH 35/35] devlink: Fix netdev notifier chain corruption Cited commit changed devlink to register its netdev notifier block on the global netdev notifier chain instead of on the per network namespace one. However, when changing the network namespace of the devlink instance, devlink still tries to unregister its notifier block from the chain of the old namespace and register it on the chain of the new namespace. This results in corruption of the notifier chains, as the same notifier block is registered on two different chains: The global one and the per network namespace one. In turn, this causes other problems such as the inability to dismantle namespaces due to netdev reference count issues. Fix by preventing devlink from moving its notifier block between namespaces. Reproducer: # echo "10 1" > /sys/bus/netdevsim/new_device # ip netns add test123 # devlink dev reload netdevsim/netdevsim10 netns test123 # ip netns del test123 [ 71.935619] unregister_netdevice: waiting for lo to become free. Usage count = 2 [ 71.938348] leaked reference. Fixes: 565b4824c39f ("devlink: change port event netdev notifier from per-net to global") Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Reviewed-by: Jacob Keller Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230215073139.1360108-1-idosch@nvidia.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 2 -- net/core/dev.c | 8 -------- net/core/devlink.c | 5 +---- 3 files changed, 1 insertion(+), 14 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index aad12a179e54..e6e02184c25a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2839,8 +2839,6 @@ int unregister_netdevice_notifier(struct notifier_block *nb); int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb); int unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb); -void move_netdevice_notifier_net(struct net *src_net, struct net *dst_net, - struct notifier_block *nb); int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn); diff --git a/net/core/dev.c b/net/core/dev.c index ea0a7bac1e5c..f23e287602b7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1869,14 +1869,6 @@ static void __move_netdevice_notifier_net(struct net *src_net, __register_netdevice_notifier_net(dst_net, nb, true); } -void move_netdevice_notifier_net(struct net *src_net, struct net *dst_net, - struct notifier_block *nb) -{ - rtnl_lock(); - __move_netdevice_notifier_net(src_net, dst_net, nb); - rtnl_unlock(); -} - int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) diff --git a/net/core/devlink.c b/net/core/devlink.c index 909a10e4b0dd..0bfc144df8b9 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4742,11 +4742,8 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net, if (err) return err; - if (dest_net && !net_eq(dest_net, curr_net)) { - move_netdevice_notifier_net(curr_net, dest_net, - &devlink->netdevice_nb); + if (dest_net && !net_eq(dest_net, curr_net)) write_pnet(&devlink->_net, dest_net); - } err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack); devlink_reload_failed_set(devlink, !!err);