From 5fb45f95eec682621748b7cb012c6a8f0f981e6a Mon Sep 17 00:00:00 2001 From: Qingfang DENG Date: Thu, 8 Dec 2022 20:35:29 +0800 Subject: [PATCH 01/64] netfilter: flowtable: really fix NAT IPv6 offload The for-loop was broken from the start. It translates to: for (i = 0; i < 4; i += 4) which means the loop statement is run only once, so only the highest 32-bit of the IPv6 address gets mangled. Fix the loop increment. Fixes: 0e07e25b481a ("netfilter: flowtable: fix NAT IPv6 offload mangling") Fixes: 5c27d8d76ce8 ("netfilter: nf_flow_table_offload: add IPv6 support") Signed-off-by: Qingfang DENG Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_offload.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 0fdcdb2c9ae4..4d9b99abe37d 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -383,12 +383,12 @@ static void flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule, const __be32 *addr, const __be32 *mask) { struct flow_action_entry *entry; - int i, j; + int i; - for (i = 0, j = 0; i < sizeof(struct in6_addr) / sizeof(u32); i += sizeof(u32), j++) { + for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i++) { entry = flow_action_entry_next(flow_rule); flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP6, - offset + i, &addr[j], mask); + offset + i * sizeof(u32), &addr[i], mask); } } From ba57ee0944ff0085652cf8df91f9c571883debe6 Mon Sep 17 00:00:00 2001 From: Li Qiong Date: Mon, 12 Dec 2022 15:43:51 +0800 Subject: [PATCH 02/64] ipvs: add a 'default' case in do_ip_vs_set_ctl() It is better to return the default switch case with '-EINVAL', in case new commands are added. otherwise, return a uninitialized value of ret. Signed-off-by: Li Qiong Reviewed-by: Simon Horman Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_ctl.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 988222fff9f0..97f6a1c8933a 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2590,6 +2590,11 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len) break; case IP_VS_SO_SET_DELDEST: ret = ip_vs_del_dest(svc, &udest); + break; + default: + WARN_ON_ONCE(1); + ret = -EINVAL; + break; } out_unlock: From f9645abe4255bd79e4c63799634c996dd53db321 Mon Sep 17 00:00:00 2001 From: Sriram Yagnaraman Date: Mon, 12 Dec 2022 11:07:05 +0100 Subject: [PATCH 03/64] netfilter: conntrack: document sctp timeouts Exposed through sysctl, update documentation to describe sctp states and their default timeouts. Signed-off-by: Sriram Yagnaraman Signed-off-by: Pablo Neira Ayuso --- .../networking/nf_conntrack-sysctl.rst | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst index 1120d71f28d7..49db1d11d7c4 100644 --- a/Documentation/networking/nf_conntrack-sysctl.rst +++ b/Documentation/networking/nf_conntrack-sysctl.rst @@ -163,6 +163,39 @@ nf_conntrack_timestamp - BOOLEAN Enable connection tracking flow timestamping. +nf_conntrack_sctp_timeout_closed - INTEGER (seconds) + default 10 + +nf_conntrack_sctp_timeout_cookie_wait - INTEGER (seconds) + default 3 + +nf_conntrack_sctp_timeout_cookie_echoed - INTEGER (seconds) + default 3 + +nf_conntrack_sctp_timeout_established - INTEGER (seconds) + default 432000 (5 days) + +nf_conntrack_sctp_timeout_shutdown_sent - INTEGER (seconds) + default 0.3 + +nf_conntrack_sctp_timeout_shutdown_recd - INTEGER (seconds) + default 0.3 + +nf_conntrack_sctp_timeout_shutdown_ack_sent - INTEGER (seconds) + default 3 + +nf_conntrack_sctp_timeout_heartbeat_sent - INTEGER (seconds) + default 30 + + This timeout is used to setup conntrack entry on secondary paths. + Default is set to hb_interval. + +nf_conntrack_sctp_timeout_heartbeat_acked - INTEGER (seconds) + default 210 + + This timeout is used to setup conntrack entry on secondary paths. + Default is set to (hb_interval * path_max_retrans + rto_max) + nf_conntrack_udp_timeout - INTEGER (seconds) default 30 From f3b4a00f0f62da252c598310698dfc82ef2f2e2e Mon Sep 17 00:00:00 2001 From: Emeel Hakim Date: Sun, 11 Dec 2022 09:55:32 +0200 Subject: [PATCH 04/64] net: macsec: fix net device access prior to holding a lock Currently macsec offload selection update routine accesses the net device prior to holding the relevant lock. Fix by holding the lock prior to the device access. Fixes: dcb780fb2795 ("net: macsec: add nla support for changing the offloading selection") Reviewed-by: Raed Salem Signed-off-by: Emeel Hakim Link: https://lore.kernel.org/r/20221211075532.28099-1-ehakim@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/macsec.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 937f5b1f04ff..bf8ac7a3ded7 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -2593,7 +2593,7 @@ static int macsec_upd_offload(struct sk_buff *skb, struct genl_info *info) const struct macsec_ops *ops; struct macsec_context ctx; struct macsec_dev *macsec; - int ret; + int ret = 0; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; @@ -2606,28 +2606,36 @@ static int macsec_upd_offload(struct sk_buff *skb, struct genl_info *info) macsec_genl_offload_policy, NULL)) return -EINVAL; + rtnl_lock(); + dev = get_dev_from_nl(genl_info_net(info), attrs); - if (IS_ERR(dev)) - return PTR_ERR(dev); + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + goto out; + } macsec = macsec_priv(dev); - if (!tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]) - return -EINVAL; + if (!tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]) { + ret = -EINVAL; + goto out; + } offload = nla_get_u8(tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]); if (macsec->offload == offload) - return 0; + goto out; /* Check if the offloading mode is supported by the underlying layers */ if (offload != MACSEC_OFFLOAD_OFF && - !macsec_check_offload(offload, macsec)) - return -EOPNOTSUPP; + !macsec_check_offload(offload, macsec)) { + ret = -EOPNOTSUPP; + goto out; + } /* Check if the net device is busy. */ - if (netif_running(dev)) - return -EBUSY; - - rtnl_lock(); + if (netif_running(dev)) { + ret = -EBUSY; + goto out; + } prev_offload = macsec->offload; macsec->offload = offload; @@ -2662,7 +2670,7 @@ static int macsec_upd_offload(struct sk_buff *skb, struct genl_info *info) rollback: macsec->offload = prev_offload; - +out: rtnl_unlock(); return ret; } From 3d0b738fc5adf9f380702ac1424672e4b32c3781 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 12 Dec 2022 11:56:45 +0800 Subject: [PATCH 05/64] bonding: add missed __rcu annotation for curr_active_slave There is one direct accesses to bond->curr_active_slave in bond_miimon_commit(). Protected it by rcu_access_pointer() since the later of this function also use this one. Signed-off-by: Hangbin Liu Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index f7767afe116b..6a4bbd5aa3e0 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2699,7 +2699,7 @@ static void bond_miimon_commit(struct bonding *bond) bond_miimon_link_change(bond, slave, BOND_LINK_UP); - if (!bond->curr_active_slave || slave == primary) + if (!rcu_access_pointer(bond->curr_active_slave) || slave == primary) goto do_failover; continue; From e95cc44763a41d5c715ef16742bcb1d8e6524a62 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 12 Dec 2022 11:56:46 +0800 Subject: [PATCH 06/64] bonding: do failover when high prio link up Currently, when a high prio link enslaved, or when current link down, the high prio port could be selected. But when high prio link up, the new active slave reselection is not triggered. Fix it by checking link's prio when getting up. Making the do_failover after looping all slaves as there may be multi high prio slaves up. Reported-by: Liang Li Fixes: 0a2ff7cc8ad4 ("Bonding: add per-port priority for failover re-selection") Signed-off-by: Hangbin Liu Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 6a4bbd5aa3e0..b4c65783960a 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2654,8 +2654,9 @@ static void bond_miimon_link_change(struct bonding *bond, static void bond_miimon_commit(struct bonding *bond) { - struct list_head *iter; struct slave *slave, *primary; + bool do_failover = false; + struct list_head *iter; bond_for_each_slave(bond, slave, iter) { switch (slave->link_new_state) { @@ -2699,8 +2700,9 @@ static void bond_miimon_commit(struct bonding *bond) bond_miimon_link_change(bond, slave, BOND_LINK_UP); - if (!rcu_access_pointer(bond->curr_active_slave) || slave == primary) - goto do_failover; + if (!rcu_access_pointer(bond->curr_active_slave) || slave == primary || + slave->prio > rcu_dereference(bond->curr_active_slave)->prio) + do_failover = true; continue; @@ -2721,7 +2723,7 @@ static void bond_miimon_commit(struct bonding *bond) bond_miimon_link_change(bond, slave, BOND_LINK_DOWN); if (slave == rcu_access_pointer(bond->curr_active_slave)) - goto do_failover; + do_failover = true; continue; @@ -2732,8 +2734,9 @@ static void bond_miimon_commit(struct bonding *bond) continue; } + } -do_failover: + if (do_failover) { block_netpoll_tx(); bond_select_active_slave(bond); unblock_netpoll_tx(); @@ -3531,6 +3534,7 @@ static int bond_ab_arp_inspect(struct bonding *bond) */ static void bond_ab_arp_commit(struct bonding *bond) { + bool do_failover = false; struct list_head *iter; unsigned long last_tx; struct slave *slave; @@ -3560,8 +3564,9 @@ static void bond_ab_arp_commit(struct bonding *bond) slave_info(bond->dev, slave->dev, "link status definitely up\n"); if (!rtnl_dereference(bond->curr_active_slave) || - slave == rtnl_dereference(bond->primary_slave)) - goto do_failover; + slave == rtnl_dereference(bond->primary_slave) || + slave->prio > rtnl_dereference(bond->curr_active_slave)->prio) + do_failover = true; } @@ -3580,7 +3585,7 @@ static void bond_ab_arp_commit(struct bonding *bond) if (slave == rtnl_dereference(bond->curr_active_slave)) { RCU_INIT_POINTER(bond->current_arp_slave, NULL); - goto do_failover; + do_failover = true; } continue; @@ -3604,8 +3609,9 @@ static void bond_ab_arp_commit(struct bonding *bond) slave->link_new_state); continue; } + } -do_failover: + if (do_failover) { block_netpoll_tx(); bond_select_active_slave(bond); unblock_netpoll_tx(); From 42a8d4aaea8414f60eb2ed2d92df89a6e2db4615 Mon Sep 17 00:00:00 2001 From: Liang Li Date: Mon, 12 Dec 2022 11:56:47 +0800 Subject: [PATCH 07/64] selftests: bonding: add bonding prio option test Add a test for bonding prio option. Here is the test result: ]# ./option_prio.sh TEST: prio_test (Test bonding option 'prio' with mode=1 monitor=arp_ip_target and primary_reselect=0) [ OK ] TEST: prio_test (Test bonding option 'prio' with mode=1 monitor=arp_ip_target and primary_reselect=1) [ OK ] TEST: prio_test (Test bonding option 'prio' with mode=1 monitor=arp_ip_target and primary_reselect=2) [ OK ] TEST: prio_test (Test bonding option 'prio' with mode=1 monitor=miimon and primary_reselect=0) [ OK ] TEST: prio_test (Test bonding option 'prio' with mode=1 monitor=miimon and primary_reselect=1) [ OK ] TEST: prio_test (Test bonding option 'prio' with mode=1 monitor=miimon and primary_reselect=2) [ OK ] TEST: prio_test (Test bonding option 'prio' with mode=5 monitor=miimon and primary_reselect=0) [ OK ] TEST: prio_test (Test bonding option 'prio' with mode=5 monitor=miimon and primary_reselect=1) [ OK ] TEST: prio_test (Test bonding option 'prio' with mode=5 monitor=miimon and primary_reselect=2) [ OK ] TEST: prio_test (Test bonding option 'prio' with mode=6 monitor=miimon and primary_reselect=0) [ OK ] TEST: prio_test (Test bonding option 'prio' with mode=6 monitor=miimon and primary_reselect=1) [ OK ] TEST: prio_test (Test bonding option 'prio' with mode=6 monitor=miimon and primary_reselect=2) [ OK ] Signed-off-by: Liang Li Signed-off-by: Hangbin Liu Signed-off-by: Jakub Kicinski --- .../selftests/drivers/net/bonding/Makefile | 3 +- .../drivers/net/bonding/option_prio.sh | 245 ++++++++++++++++++ 2 files changed, 247 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/drivers/net/bonding/option_prio.sh diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile index 0f3921908b07..8e3b786a748f 100644 --- a/tools/testing/selftests/drivers/net/bonding/Makefile +++ b/tools/testing/selftests/drivers/net/bonding/Makefile @@ -7,7 +7,8 @@ TEST_PROGS := \ bond-lladdr-target.sh \ dev_addr_lists.sh \ mode-1-recovery-updelay.sh \ - mode-2-recovery-updelay.sh + mode-2-recovery-updelay.sh \ + option_prio.sh TEST_FILES := \ lag_lib.sh \ diff --git a/tools/testing/selftests/drivers/net/bonding/option_prio.sh b/tools/testing/selftests/drivers/net/bonding/option_prio.sh new file mode 100755 index 000000000000..c32eebff5005 --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/option_prio.sh @@ -0,0 +1,245 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test bonding option prio +# + +ALL_TESTS=" + prio_arp_ip_target_test + prio_miimon_test +" + +REQUIRE_MZ=no +REQUIRE_JQ=no +NUM_NETIFS=0 +lib_dir=$(dirname "$0") +source "$lib_dir"/net_forwarding_lib.sh + +destroy() +{ + ip link del bond0 &>/dev/null + ip link del br0 &>/dev/null + ip link del veth0 &>/dev/null + ip link del veth1 &>/dev/null + ip link del veth2 &>/dev/null + ip netns del ns1 &>/dev/null + ip link del veth3 &>/dev/null +} + +cleanup() +{ + pre_cleanup + + destroy +} + +skip() +{ + local skip=1 + ip link add name bond0 type bond mode 1 miimon 100 &>/dev/null + ip link add name veth0 type veth peer name veth0_p + ip link set veth0 master bond0 + + # check if iproute support prio option + ip link set dev veth0 type bond_slave prio 10 + [[ $? -ne 0 ]] && skip=0 + + # check if bonding support prio option + ip -d link show veth0 | grep -q "prio 10" + [[ $? -ne 0 ]] && skip=0 + + ip link del bond0 &>/dev/null + ip link del veth0 + + return $skip +} + +active_slave="" +check_active_slave() +{ + local target_active_slave=$1 + active_slave="$(cat /sys/class/net/bond0/bonding/active_slave)" + test "$active_slave" = "$target_active_slave" + check_err $? "Current active slave is $active_slave but not $target_active_slave" +} + + +# Test bonding prio option with mode=$mode monitor=$monitor +# and primary_reselect=$primary_reselect +prio_test() +{ + RET=0 + + local monitor=$1 + local mode=$2 + local primary_reselect=$3 + + local bond_ip4="192.169.1.2" + local peer_ip4="192.169.1.1" + local bond_ip6="2009:0a:0b::02" + local peer_ip6="2009:0a:0b::01" + + + # create veths + ip link add name veth0 type veth peer name veth0_p + ip link add name veth1 type veth peer name veth1_p + ip link add name veth2 type veth peer name veth2_p + + # create bond + if [[ "$monitor" == "miimon" ]];then + ip link add name bond0 type bond mode $mode miimon 100 primary veth1 primary_reselect $primary_reselect + elif [[ "$monitor" == "arp_ip_target" ]];then + ip link add name bond0 type bond mode $mode arp_interval 1000 arp_ip_target $peer_ip4 primary veth1 primary_reselect $primary_reselect + elif [[ "$monitor" == "ns_ip6_target" ]];then + ip link add name bond0 type bond mode $mode arp_interval 1000 ns_ip6_target $peer_ip6 primary veth1 primary_reselect $primary_reselect + fi + ip link set bond0 up + ip link set veth0 master bond0 + ip link set veth1 master bond0 + ip link set veth2 master bond0 + # check bonding member prio value + ip link set dev veth0 type bond_slave prio 0 + ip link set dev veth1 type bond_slave prio 10 + ip link set dev veth2 type bond_slave prio 11 + ip -d link show veth0 | grep -q 'prio 0' + check_err $? "veth0 prio is not 0" + ip -d link show veth1 | grep -q 'prio 10' + check_err $? "veth0 prio is not 10" + ip -d link show veth2 | grep -q 'prio 11' + check_err $? "veth0 prio is not 11" + + ip link set veth0 up + ip link set veth1 up + ip link set veth2 up + ip link set veth0_p up + ip link set veth1_p up + ip link set veth2_p up + + # prepare ping target + ip link add name br0 type bridge + ip link set br0 up + ip link set veth0_p master br0 + ip link set veth1_p master br0 + ip link set veth2_p master br0 + ip link add name veth3 type veth peer name veth3_p + ip netns add ns1 + ip link set veth3_p master br0 up + ip link set veth3 netns ns1 up + ip netns exec ns1 ip addr add $peer_ip4/24 dev veth3 + ip netns exec ns1 ip addr add $peer_ip6/64 dev veth3 + ip addr add $bond_ip4/24 dev bond0 + ip addr add $bond_ip6/64 dev bond0 + sleep 5 + + ping $peer_ip4 -c5 -I bond0 &>/dev/null + check_err $? "ping failed 1." + ping6 $peer_ip6 -c5 -I bond0 &>/dev/null + check_err $? "ping6 failed 1." + + # active salve should be the primary slave + check_active_slave veth1 + + # active slave should be the higher prio slave + ip link set $active_slave down + ping $peer_ip4 -c5 -I bond0 &>/dev/null + check_err $? "ping failed 2." + check_active_slave veth2 + + # when only 1 slave is up + ip link set $active_slave down + ping $peer_ip4 -c5 -I bond0 &>/dev/null + check_err $? "ping failed 3." + check_active_slave veth0 + + # when a higher prio slave change to up + ip link set veth2 up + ping $peer_ip4 -c5 -I bond0 &>/dev/null + check_err $? "ping failed 4." + case $primary_reselect in + "0") + check_active_slave "veth2" + ;; + "1") + check_active_slave "veth0" + ;; + "2") + check_active_slave "veth0" + ;; + esac + local pre_active_slave=$active_slave + + # when the primary slave change to up + ip link set veth1 up + ping $peer_ip4 -c5 -I bond0 &>/dev/null + check_err $? "ping failed 5." + case $primary_reselect in + "0") + check_active_slave "veth1" + ;; + "1") + check_active_slave "$pre_active_slave" + ;; + "2") + check_active_slave "$pre_active_slave" + ip link set $active_slave down + ping $peer_ip4 -c5 -I bond0 &>/dev/null + check_err $? "ping failed 6." + check_active_slave "veth1" + ;; + esac + + # Test changing bond salve prio + if [[ "$primary_reselect" == "0" ]];then + ip link set dev veth0 type bond_slave prio 1000000 + ip link set dev veth1 type bond_slave prio 0 + ip link set dev veth2 type bond_slave prio -50 + ip -d link show veth0 | grep -q 'prio 1000000' + check_err $? "veth0 prio is not 1000000" + ip -d link show veth1 | grep -q 'prio 0' + check_err $? "veth1 prio is not 0" + ip -d link show veth2 | grep -q 'prio -50' + check_err $? "veth3 prio is not -50" + check_active_slave "veth1" + + ip link set $active_slave down + ping $peer_ip4 -c5 -I bond0 &>/dev/null + check_err $? "ping failed 7." + check_active_slave "veth0" + fi + + cleanup + + log_test "prio_test" "Test bonding option 'prio' with mode=$mode monitor=$monitor and primary_reselect=$primary_reselect" +} + +prio_miimon_test() +{ + local mode + local primary_reselect + + for mode in 1 5 6; do + for primary_reselect in 0 1 2; do + prio_test "miimon" $mode $primary_reselect + done + done +} + +prio_arp_ip_target_test() +{ + local primary_reselect + + for primary_reselect in 0 1 2; do + prio_test "arp_ip_target" 1 $primary_reselect + done +} + +if skip;then + log_test_skip "option_prio.sh" "Current iproute doesn't support 'prio'." + exit 0 +fi + +trap cleanup EXIT + +tests_run + +exit "$EXIT_STATUS" From ddc9648db162eee556edd5222d2808fe33730203 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Mon, 12 Dec 2022 16:41:37 +0800 Subject: [PATCH 08/64] mISDN: hfcsusb: don't call dev_kfree_skb/kfree_skb() under spin_lock_irqsave() It is not allowed to call kfree_skb() or consume_skb() from hardware interrupt context or with hardware interrupts being disabled. It should use dev_kfree_skb_irq() or dev_consume_skb_irq() instead. The difference between them is free reason, dev_kfree_skb_irq() means the SKB is dropped in error and dev_consume_skb_irq() means the SKB is consumed in normal. skb_queue_purge() is called under spin_lock_irqsave() in hfcusb_l2l1D(), kfree_skb() is called in it, to fix this, use skb_queue_splice_init() to move the dch->squeue to a free queue, also enqueue the tx_skb and rx_skb, at last calling __skb_queue_purge() to free the SKBs afer unlock. In tx_iso_complete(), dev_kfree_skb() is called to consume the transmitted SKB, so replace it with dev_consume_skb_irq(). Fixes: 69f52adb2d53 ("mISDN: Add HFC USB driver") Signed-off-by: Yang Yingliang Reviewed-by: Alexander Duyck Signed-off-by: Jakub Kicinski --- drivers/isdn/hardware/mISDN/hfcsusb.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/isdn/hardware/mISDN/hfcsusb.c b/drivers/isdn/hardware/mISDN/hfcsusb.c index 651f2f8f685b..1efd17979f24 100644 --- a/drivers/isdn/hardware/mISDN/hfcsusb.c +++ b/drivers/isdn/hardware/mISDN/hfcsusb.c @@ -326,20 +326,24 @@ hfcusb_l2l1D(struct mISDNchannel *ch, struct sk_buff *skb) test_and_clear_bit(FLG_L2_ACTIVATED, &dch->Flags); if (hw->protocol == ISDN_P_NT_S0) { + struct sk_buff_head free_queue; + + __skb_queue_head_init(&free_queue); hfcsusb_ph_command(hw, HFC_L1_DEACTIVATE_NT); spin_lock_irqsave(&hw->lock, flags); - skb_queue_purge(&dch->squeue); + skb_queue_splice_init(&dch->squeue, &free_queue); if (dch->tx_skb) { - dev_kfree_skb(dch->tx_skb); + __skb_queue_tail(&free_queue, dch->tx_skb); dch->tx_skb = NULL; } dch->tx_idx = 0; if (dch->rx_skb) { - dev_kfree_skb(dch->rx_skb); + __skb_queue_tail(&free_queue, dch->rx_skb); dch->rx_skb = NULL; } test_and_clear_bit(FLG_TX_BUSY, &dch->Flags); spin_unlock_irqrestore(&hw->lock, flags); + __skb_queue_purge(&free_queue); #ifdef FIXME if (test_and_clear_bit(FLG_L1_BUSY, &dch->Flags)) dchannel_sched_event(&hc->dch, D_CLEARBUSY); @@ -1330,7 +1334,7 @@ tx_iso_complete(struct urb *urb) printk("\n"); } - dev_kfree_skb(tx_skb); + dev_consume_skb_irq(tx_skb); tx_skb = NULL; if (fifo->dch && get_next_dframe(fifo->dch)) tx_skb = fifo->dch->tx_skb; From f0f596bd75a9d573ca9b587abb39cee0b916bb82 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Mon, 12 Dec 2022 16:41:38 +0800 Subject: [PATCH 09/64] mISDN: hfcpci: don't call dev_kfree_skb/kfree_skb() under spin_lock_irqsave() It is not allowed to call kfree_skb() or consume_skb() from hardware interrupt context or with hardware interrupts being disabled. skb_queue_purge() is called under spin_lock_irqsave() in hfcpci_l2l1D(), kfree_skb() is called in it, to fix this, use skb_queue_splice_init() to move the dch->squeue to a free queue, also enqueue the tx_skb and rx_skb, at last calling __skb_queue_purge() to free the SKBs afer unlock. Fixes: 1700fe1a10dc ("Add mISDN HFC PCI driver") Signed-off-by: Yang Yingliang Reviewed-by: Alexander Duyck Signed-off-by: Jakub Kicinski --- drivers/isdn/hardware/mISDN/hfcpci.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/isdn/hardware/mISDN/hfcpci.c b/drivers/isdn/hardware/mISDN/hfcpci.c index e964a8dd8512..c0331b268010 100644 --- a/drivers/isdn/hardware/mISDN/hfcpci.c +++ b/drivers/isdn/hardware/mISDN/hfcpci.c @@ -1617,16 +1617,19 @@ hfcpci_l2l1D(struct mISDNchannel *ch, struct sk_buff *skb) test_and_clear_bit(FLG_L2_ACTIVATED, &dch->Flags); spin_lock_irqsave(&hc->lock, flags); if (hc->hw.protocol == ISDN_P_NT_S0) { + struct sk_buff_head free_queue; + + __skb_queue_head_init(&free_queue); /* prepare deactivation */ Write_hfc(hc, HFCPCI_STATES, 0x40); - skb_queue_purge(&dch->squeue); + skb_queue_splice_init(&dch->squeue, &free_queue); if (dch->tx_skb) { - dev_kfree_skb(dch->tx_skb); + __skb_queue_tail(&free_queue, dch->tx_skb); dch->tx_skb = NULL; } dch->tx_idx = 0; if (dch->rx_skb) { - dev_kfree_skb(dch->rx_skb); + __skb_queue_tail(&free_queue, dch->rx_skb); dch->rx_skb = NULL; } test_and_clear_bit(FLG_TX_BUSY, &dch->Flags); @@ -1639,10 +1642,12 @@ hfcpci_l2l1D(struct mISDNchannel *ch, struct sk_buff *skb) hc->hw.mst_m &= ~HFCPCI_MASTER; Write_hfc(hc, HFCPCI_MST_MODE, hc->hw.mst_m); ret = 0; + spin_unlock_irqrestore(&hc->lock, flags); + __skb_queue_purge(&free_queue); } else { ret = l1_event(dch->l1, hh->prim); + spin_unlock_irqrestore(&hc->lock, flags); } - spin_unlock_irqrestore(&hc->lock, flags); break; } if (!ret) From 1232946cf522b8de9e398828bde325d7c41f29dd Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Mon, 12 Dec 2022 16:41:39 +0800 Subject: [PATCH 10/64] mISDN: hfcmulti: don't call dev_kfree_skb/kfree_skb() under spin_lock_irqsave() It is not allowed to call kfree_skb() or consume_skb() from hardware interrupt context or with hardware interrupts being disabled. skb_queue_purge() is called under spin_lock_irqsave() in handle_dmsg() and hfcm_l1callback(), kfree_skb() is called in them, to fix this, use skb_queue_splice_init() to move the dch->squeue to a free queue, also enqueue the tx_skb and rx_skb, at last calling __skb_queue_purge() to free the SKBs afer unlock. Fixes: af69fb3a8ffa ("Add mISDN HFC multiport driver") Signed-off-by: Yang Yingliang Reviewed-by: Alexander Duyck Signed-off-by: Jakub Kicinski --- drivers/isdn/hardware/mISDN/hfcmulti.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/isdn/hardware/mISDN/hfcmulti.c b/drivers/isdn/hardware/mISDN/hfcmulti.c index 4f7eaa17fb27..e840609c50eb 100644 --- a/drivers/isdn/hardware/mISDN/hfcmulti.c +++ b/drivers/isdn/hardware/mISDN/hfcmulti.c @@ -3217,6 +3217,7 @@ static int hfcm_l1callback(struct dchannel *dch, u_int cmd) { struct hfc_multi *hc = dch->hw; + struct sk_buff_head free_queue; u_long flags; switch (cmd) { @@ -3245,6 +3246,7 @@ hfcm_l1callback(struct dchannel *dch, u_int cmd) l1_event(dch->l1, HW_POWERUP_IND); break; case HW_DEACT_REQ: + __skb_queue_head_init(&free_queue); /* start deactivation */ spin_lock_irqsave(&hc->lock, flags); if (hc->ctype == HFC_TYPE_E1) { @@ -3264,20 +3266,21 @@ hfcm_l1callback(struct dchannel *dch, u_int cmd) plxsd_checksync(hc, 0); } } - skb_queue_purge(&dch->squeue); + skb_queue_splice_init(&dch->squeue, &free_queue); if (dch->tx_skb) { - dev_kfree_skb(dch->tx_skb); + __skb_queue_tail(&free_queue, dch->tx_skb); dch->tx_skb = NULL; } dch->tx_idx = 0; if (dch->rx_skb) { - dev_kfree_skb(dch->rx_skb); + __skb_queue_tail(&free_queue, dch->rx_skb); dch->rx_skb = NULL; } test_and_clear_bit(FLG_TX_BUSY, &dch->Flags); if (test_and_clear_bit(FLG_BUSY_TIMER, &dch->Flags)) del_timer(&dch->timer); spin_unlock_irqrestore(&hc->lock, flags); + __skb_queue_purge(&free_queue); break; case HW_POWERUP_REQ: spin_lock_irqsave(&hc->lock, flags); @@ -3384,6 +3387,9 @@ handle_dmsg(struct mISDNchannel *ch, struct sk_buff *skb) case PH_DEACTIVATE_REQ: test_and_clear_bit(FLG_L2_ACTIVATED, &dch->Flags); if (dch->dev.D.protocol != ISDN_P_TE_S0) { + struct sk_buff_head free_queue; + + __skb_queue_head_init(&free_queue); spin_lock_irqsave(&hc->lock, flags); if (debug & DEBUG_HFCMULTI_MSG) printk(KERN_DEBUG @@ -3405,14 +3411,14 @@ handle_dmsg(struct mISDNchannel *ch, struct sk_buff *skb) /* deactivate */ dch->state = 1; } - skb_queue_purge(&dch->squeue); + skb_queue_splice_init(&dch->squeue, &free_queue); if (dch->tx_skb) { - dev_kfree_skb(dch->tx_skb); + __skb_queue_tail(&free_queue, dch->tx_skb); dch->tx_skb = NULL; } dch->tx_idx = 0; if (dch->rx_skb) { - dev_kfree_skb(dch->rx_skb); + __skb_queue_tail(&free_queue, dch->rx_skb); dch->rx_skb = NULL; } test_and_clear_bit(FLG_TX_BUSY, &dch->Flags); @@ -3424,6 +3430,7 @@ handle_dmsg(struct mISDNchannel *ch, struct sk_buff *skb) #endif ret = 0; spin_unlock_irqrestore(&hc->lock, flags); + __skb_queue_purge(&free_queue); } else ret = l1_event(dch->l1, hh->prim); break; From de5dc44370fbd6b46bd7f1a1e00369be54a041c8 Mon Sep 17 00:00:00 2001 From: Tony Nguyen Date: Mon, 12 Dec 2022 11:00:31 -0800 Subject: [PATCH 11/64] igb: Initialize mailbox message for VF reset When a MAC address is not assigned to the VF, that portion of the message sent to the VF is not set. The memory, however, is allocated from the stack meaning that information may be leaked to the VM. Initialize the message buffer to 0 so that no information is passed to the VM in this case. Fixes: 6ddbc4cf1f4d ("igb: Indicate failure on vf reset for empty mac address") Reported-by: Akihiko Odaki Signed-off-by: Tony Nguyen Reviewed-by: Akihiko Odaki Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20221212190031.3983342-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 97290fc0fddd..3c0c35ecea10 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -7525,7 +7525,7 @@ static void igb_vf_reset_msg(struct igb_adapter *adapter, u32 vf) { struct e1000_hw *hw = &adapter->hw; unsigned char *vf_mac = adapter->vf_data[vf].vf_mac_addresses; - u32 reg, msgbuf[3]; + u32 reg, msgbuf[3] = {}; u8 *addr = (u8 *)(&msgbuf[1]); /* process all the same items cleared in a function level reset */ From 2d4ee16d969c97996e80e4c9cb6de0acaff22c9f Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Tue, 13 Dec 2022 15:52:08 -0700 Subject: [PATCH 12/64] wireguard: timers: cast enum limits members to int in prints Since gcc13, each member of an enum has the same type as the enum. And that is inherited from its members. Provided "REKEY_AFTER_MESSAGES = 1ULL << 60", the named type is unsigned long. This generates warnings with gcc-13: error: format '%d' expects argument of type 'int', but argument 6 has type 'long unsigned int' Cast those particular enum members to int when printing them. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=36113 Cc: Martin Liska Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Jason A. Donenfeld Link: https://lore.kernel.org/all/20221213225208.3343692-2-Jason@zx2c4.com/ Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/timers.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireguard/timers.c b/drivers/net/wireguard/timers.c index b5706b6718b1..53d8a57a0dfa 100644 --- a/drivers/net/wireguard/timers.c +++ b/drivers/net/wireguard/timers.c @@ -46,7 +46,7 @@ static void wg_expired_retransmit_handshake(struct timer_list *timer) if (peer->timer_handshake_attempts > MAX_TIMER_HANDSHAKES) { pr_debug("%s: Handshake for peer %llu (%pISpfsc) did not complete after %d attempts, giving up\n", peer->device->dev->name, peer->internal_id, - &peer->endpoint.addr, MAX_TIMER_HANDSHAKES + 2); + &peer->endpoint.addr, (int)MAX_TIMER_HANDSHAKES + 2); del_timer(&peer->timer_send_keepalive); /* We drop all packets without a keypair and don't try again, @@ -64,7 +64,7 @@ static void wg_expired_retransmit_handshake(struct timer_list *timer) ++peer->timer_handshake_attempts; pr_debug("%s: Handshake for peer %llu (%pISpfsc) did not complete after %d seconds, retrying (try %d)\n", peer->device->dev->name, peer->internal_id, - &peer->endpoint.addr, REKEY_TIMEOUT, + &peer->endpoint.addr, (int)REKEY_TIMEOUT, peer->timer_handshake_attempts + 1); /* We clear the endpoint address src address, in case this is @@ -94,7 +94,7 @@ static void wg_expired_new_handshake(struct timer_list *timer) pr_debug("%s: Retrying handshake with peer %llu (%pISpfsc) because we stopped hearing back after %d seconds\n", peer->device->dev->name, peer->internal_id, - &peer->endpoint.addr, KEEPALIVE_TIMEOUT + REKEY_TIMEOUT); + &peer->endpoint.addr, (int)(KEEPALIVE_TIMEOUT + REKEY_TIMEOUT)); /* We clear the endpoint address src address, in case this is the cause * of trouble. */ @@ -126,7 +126,7 @@ static void wg_queued_expired_zero_key_material(struct work_struct *work) pr_debug("%s: Zeroing out all keys for peer %llu (%pISpfsc), since we haven't received a new one in %d seconds\n", peer->device->dev->name, peer->internal_id, - &peer->endpoint.addr, REJECT_AFTER_TIME * 3); + &peer->endpoint.addr, (int)REJECT_AFTER_TIME * 3); wg_noise_handshake_clear(&peer->handshake); wg_noise_keypairs_clear(&peer->keypairs); wg_peer_put(peer); From d2b497a973fcb76a6b7a552f081b83a1edd91c86 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Mon, 12 Dec 2022 10:16:00 +0000 Subject: [PATCH 13/64] docs/bpf: Reword docs for BPF_MAP_TYPE_SK_STORAGE Improve the grammar of the function descriptions and highlight that the key is a socket fd. Fixes: f3212ad5b7e9 ("docs/bpf: Add documentation for BPF_MAP_TYPE_SK_STORAGE") Reported-by: Martin KaFai Lau Signed-off-by: Donald Hunter Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20221212101600.56026-1-donald.hunter@gmail.com --- Documentation/bpf/map_sk_storage.rst | 56 +++++++++++++++------------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/Documentation/bpf/map_sk_storage.rst b/Documentation/bpf/map_sk_storage.rst index 047e16c8aaa8..4e9d23ab9ecd 100644 --- a/Documentation/bpf/map_sk_storage.rst +++ b/Documentation/bpf/map_sk_storage.rst @@ -34,13 +34,12 @@ bpf_sk_storage_get() void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags) -Socket-local storage can be retrieved using the ``bpf_sk_storage_get()`` -helper. The helper gets the storage from ``sk`` that is associated with ``map``. -If the ``BPF_LOCAL_STORAGE_GET_F_CREATE`` flag is used then -``bpf_sk_storage_get()`` will create the storage for ``sk`` if it does not -already exist. ``value`` can be used together with -``BPF_LOCAL_STORAGE_GET_F_CREATE`` to initialize the storage value, otherwise it -will be zero initialized. Returns a pointer to the storage on success, or +Socket-local storage for ``map`` can be retrieved from socket ``sk`` using the +``bpf_sk_storage_get()`` helper. If the ``BPF_LOCAL_STORAGE_GET_F_CREATE`` +flag is used then ``bpf_sk_storage_get()`` will create the storage for ``sk`` +if it does not already exist. ``value`` can be used together with +``BPF_LOCAL_STORAGE_GET_F_CREATE`` to initialize the storage value, otherwise +it will be zero initialized. Returns a pointer to the storage on success, or ``NULL`` in case of failure. .. note:: @@ -54,9 +53,9 @@ bpf_sk_storage_delete() long bpf_sk_storage_delete(struct bpf_map *map, void *sk) -Socket-local storage can be deleted using the ``bpf_sk_storage_delete()`` -helper. The helper deletes the storage from ``sk`` that is identified by -``map``. Returns ``0`` on success, or negative error in case of failure. +Socket-local storage for ``map`` can be deleted from socket ``sk`` using the +``bpf_sk_storage_delete()`` helper. Returns ``0`` on success, or negative +error in case of failure. User space ---------- @@ -68,16 +67,20 @@ bpf_map_update_elem() int bpf_map_update_elem(int map_fd, const void *key, const void *value, __u64 flags) -Socket-local storage for the socket identified by ``key`` belonging to -``map_fd`` can be added or updated using the ``bpf_map_update_elem()`` libbpf -function. ``key`` must be a pointer to a valid ``fd`` in the user space -program. The ``flags`` parameter can be used to control the update behaviour: +Socket-local storage for map ``map_fd`` can be added or updated locally to a +socket using the ``bpf_map_update_elem()`` libbpf function. The socket is +identified by a `socket` ``fd`` stored in the pointer ``key``. The pointer +``value`` has the data to be added or updated to the socket ``fd``. The type +and size of ``value`` should be the same as the value type of the map +definition. -- ``BPF_ANY`` will create storage for ``fd`` or update existing storage. -- ``BPF_NOEXIST`` will create storage for ``fd`` only if it did not already - exist, otherwise the call will fail with ``-EEXIST``. -- ``BPF_EXIST`` will update existing storage for ``fd`` if it already exists, - otherwise the call will fail with ``-ENOENT``. +The ``flags`` parameter can be used to control the update behaviour: + +- ``BPF_ANY`` will create storage for `socket` ``fd`` or update existing storage. +- ``BPF_NOEXIST`` will create storage for `socket` ``fd`` only if it did not + already exist, otherwise the call will fail with ``-EEXIST``. +- ``BPF_EXIST`` will update existing storage for `socket` ``fd`` if it already + exists, otherwise the call will fail with ``-ENOENT``. Returns ``0`` on success, or negative error in case of failure. @@ -88,10 +91,10 @@ bpf_map_lookup_elem() int bpf_map_lookup_elem(int map_fd, const void *key, void *value) -Socket-local storage for the socket identified by ``key`` belonging to -``map_fd`` can be retrieved using the ``bpf_map_lookup_elem()`` libbpf -function. ``key`` must be a pointer to a valid ``fd`` in the user space -program. Returns ``0`` on success, or negative error in case of failure. +Socket-local storage for map ``map_fd`` can be retrieved from a socket using +the ``bpf_map_lookup_elem()`` libbpf function. The storage is retrieved from +the socket identified by a `socket` ``fd`` stored in the pointer +``key``. Returns ``0`` on success, or negative error in case of failure. bpf_map_delete_elem() ~~~~~~~~~~~~~~~~~~~~~ @@ -100,9 +103,10 @@ bpf_map_delete_elem() int bpf_map_delete_elem(int map_fd, const void *key) -Socket-local storage for the socket identified by ``key`` belonging to -``map_fd`` can be deleted using the ``bpf_map_delete_elem()`` libbpf -function. Returns ``0`` on success, or negative error in case of failure. +Socket-local storage for map ``map_fd`` can be deleted from a socket using the +``bpf_map_delete_elem()`` libbpf function. The storage is deleted from the +socket identified by a `socket` ``fd`` stored in the pointer ``key``. Returns +``0`` on success, or negative error in case of failure. Examples ======== From ec9230b18b45853287298d70be23f8ec6bd44ff0 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 12 Dec 2022 17:22:24 -0800 Subject: [PATCH 14/64] selftests/bpf: Fix a selftest compilation error with CONFIG_SMP=n Kernel test robot reported bpf selftest build failure when CONFIG_SMP is not set. The error message looks below: >> progs/rcu_read_lock.c:256:34: error: no member named 'last_wakee' in 'struct task_struct' last_wakee = task->real_parent->last_wakee; ~~~~~~~~~~~~~~~~~ ^ 1 error generated. When CONFIG_SMP is not set, the field 'last_wakee' is not available in struct 'task_struct'. Hence the above compilation failure. To fix the issue, let us choose another field 'group_leader' which is available regardless of CONFIG_SMP set or not. Fixes: fe147956fca4 ("bpf/selftests: Add selftests for new task kfuncs") Fixes: 48671232fcb8 ("selftests/bpf: Add tests for bpf_rcu_read_lock()") Reported-by: kernel test robot Signed-off-by: David Vernet Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20221213012224.379581-1-yhs@fb.com --- tools/testing/selftests/bpf/progs/rcu_read_lock.c | 8 ++++---- tools/testing/selftests/bpf/progs/task_kfunc_failure.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/rcu_read_lock.c b/tools/testing/selftests/bpf/progs/rcu_read_lock.c index 125f908024d3..5cecbdbbb16e 100644 --- a/tools/testing/selftests/bpf/progs/rcu_read_lock.c +++ b/tools/testing/selftests/bpf/progs/rcu_read_lock.c @@ -288,13 +288,13 @@ out: SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") int task_untrusted_non_rcuptr(void *ctx) { - struct task_struct *task, *last_wakee; + struct task_struct *task, *group_leader; task = bpf_get_current_task_btf(); bpf_rcu_read_lock(); - /* the pointer last_wakee marked as untrusted */ - last_wakee = task->real_parent->last_wakee; - (void)bpf_task_storage_get(&map_a, last_wakee, 0, 0); + /* the pointer group_leader marked as untrusted */ + group_leader = task->real_parent->group_leader; + (void)bpf_task_storage_get(&map_a, group_leader, 0, 0); bpf_rcu_read_unlock(); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c index 87fa1db9d9b5..1b47b94dbca0 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c @@ -73,7 +73,7 @@ int BPF_PROG(task_kfunc_acquire_trusted_walked, struct task_struct *task, u64 cl struct task_struct *acquired; /* Can't invoke bpf_task_acquire() on a trusted pointer obtained from walking a struct. */ - acquired = bpf_task_acquire(task->last_wakee); + acquired = bpf_task_acquire(task->group_leader); bpf_task_release(acquired); return 0; From a8dfde09c90109e3a98af54847e91bde7dc2d5c2 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 13 Dec 2022 14:05:00 -0800 Subject: [PATCH 15/64] selftests/bpf: Select CONFIG_FUNCTION_ERROR_INJECTION MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BPF selftests require CONFIG_FUNCTION_ERROR_INJECTION to work. However, CONFIG_FUNCTION_ERROR_INJECTION is no longer 'y' by default after recent changes. As a result, we are seeing errors like the following from BPF CI: bpf_testmod_test_read() is not modifiable __x64_sys_setdomainname is not sleepable __x64_sys_getpgid is not sleepable Fix this by explicitly selecting CONFIG_FUNCTION_ERROR_INJECTION in the selftest config. Fixes: a4412fdd49dc ("error-injection: Add prompt for function error injection") Reported-by: Daniel Müller Signed-off-by: Song Liu Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Daniel Müller Link: https://lore.kernel.org/bpf/20221213220500.3427947-1-song@kernel.org --- tools/testing/selftests/bpf/config | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 612f699dc4f7..63cd4ab70171 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -16,6 +16,7 @@ CONFIG_CRYPTO_USER_API_HASH=y CONFIG_DYNAMIC_FTRACE=y CONFIG_FPROBE=y CONFIG_FTRACE_SYSCALLS=y +CONFIG_FUNCTION_ERROR_INJECTION=y CONFIG_FUNCTION_TRACER=y CONFIG_GENEVE=y CONFIG_IKCONFIG=y From e89f3edffb860a0f54a9ed16deadb7a4a1fa3862 Mon Sep 17 00:00:00 2001 From: Milan Landaverde Date: Tue, 13 Dec 2022 12:57:14 -0500 Subject: [PATCH 16/64] bpf: prevent leak of lsm program after failed attach In [0], we added the ability to bpf_prog_attach LSM programs to cgroups, but in our validation to make sure the prog is meant to be attached to BPF_LSM_CGROUP, we return too early if the check fails. This results in lack of decrementing prog's refcnt (through bpf_prog_put) leaving the LSM program alive past the point of the expected lifecycle. This fix allows for the decrement to take place. [0] https://lore.kernel.org/all/20220628174314.1216643-4-sdf@google.com/ Fixes: 69fd337a975c ("bpf: per-cgroup lsm flavor") Signed-off-by: Milan Landaverde Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20221213175714.31963-1-milan@mdaverde.com --- kernel/bpf/syscall.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 35972afb6850..64131f88c553 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3518,9 +3518,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_PROG_TYPE_LSM: if (ptype == BPF_PROG_TYPE_LSM && prog->expected_attach_type != BPF_LSM_CGROUP) - return -EINVAL; - - ret = cgroup_bpf_prog_attach(attr, ptype, prog); + ret = -EINVAL; + else + ret = cgroup_bpf_prog_attach(attr, ptype, prog); break; default: ret = -EINVAL; From 4121d4481b72501aa4d22680be4ea1096d69d133 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 14 Dec 2022 13:35:42 +0100 Subject: [PATCH 17/64] bpf: Synchronize dispatcher update with bpf_dispatcher_xdp_func Hao Sun reported crash in dispatcher image [1]. Currently we don't have any sync between bpf_dispatcher_update and bpf_dispatcher_xdp_func, so following race is possible: cpu 0: cpu 1: bpf_prog_run_xdp ... bpf_dispatcher_xdp_func in image at offset 0x0 bpf_dispatcher_update update image at offset 0x800 bpf_dispatcher_update update image at offset 0x0 in image at offset 0x0 -> crash Fixing this by synchronizing dispatcher image update (which is done in bpf_dispatcher_update function) with bpf_dispatcher_xdp_func that reads and execute the dispatcher image. Calling synchronize_rcu after updating and installing new image ensures that readers leave old image before it's changed in the next dispatcher update. The update itself is locked with dispatcher's mutex. The bpf_prog_run_xdp is called under local_bh_disable and synchronize_rcu will wait for it to leave [2]. [1] https://lore.kernel.org/bpf/Y5SFho7ZYXr9ifRn@krava/T/#m00c29ece654bc9f332a17df493bbca33e702896c [2] https://lore.kernel.org/bpf/0B62D35A-E695-4B7A-A0D4-774767544C1A@gmail.com/T/#mff43e2c003ae99f4a38f353c7969be4c7162e877 Reported-by: Hao Sun Signed-off-by: Jiri Olsa Acked-by: Yonghong Song Acked-by: Paul E. McKenney Link: https://lore.kernel.org/r/20221214123542.1389719-1-jolsa@kernel.org Signed-off-by: Martin KaFai Lau --- kernel/bpf/dispatcher.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index c19719f48ce0..fa3e9225aedc 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -125,6 +125,11 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) __BPF_DISPATCHER_UPDATE(d, new ?: (void *)&bpf_dispatcher_nop_func); + /* Make sure all the callers executing the previous/old half of the + * image leave it, so following update call can modify it safely. + */ + synchronize_rcu(); + if (new) d->image_off = noff; } From 628050ec952d2e2e46ec9fb6aa07e41139e030c8 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 13 Dec 2022 02:19:08 +0200 Subject: [PATCH 18/64] net: enetc: avoid buffer leaks on xdp_do_redirect() failure Before enetc_clean_rx_ring_xdp() calls xdp_do_redirect(), each software BD in the RX ring between index orig_i and i can have one of 2 refcount values on its page. We are the owner of the current buffer that is being processed, so the refcount will be at least 1. If the current owner of the buffer at the diametrically opposed index in the RX ring (i.o.w, the other half of this page) has not yet called kfree(), this page's refcount could even be 2. enetc_page_reusable() in enetc_flip_rx_buff() tests for the page refcount against 1, and [ if it's 2 ] does not attempt to reuse it. But if enetc_flip_rx_buff() is put after the xdp_do_redirect() call, the page refcount can have one of 3 values. It can also be 0, if there is no owner of the other page half, and xdp_do_redirect() for this buffer ran so far that it triggered a flush of the devmap/cpumap bulk queue, and the consumers of those bulk queues also freed the buffer, all by the time xdp_do_redirect() returns the execution back to enetc. This is the reason why enetc_flip_rx_buff() is called before xdp_do_redirect(), but there is a big flaw with that reasoning: enetc_flip_rx_buff() will set rx_swbd->page = NULL on both sides of the enetc_page_reusable() branch, and if xdp_do_redirect() returns an error, we call enetc_xdp_free(), which does not deal gracefully with that. In fact, what happens is quite special. The page refcounts start as 1. enetc_flip_rx_buff() figures they're reusable, transfers these rx_swbd->page pointers to a different rx_swbd in enetc_reuse_page(), and bumps the refcount to 2. When xdp_do_redirect() later returns an error, we call the no-op enetc_xdp_free(), but we still haven't lost the reference to that page. A copy of it is still at rx_ring->next_to_alloc, but that has refcount 2 (and there are no concurrent owners of it in flight, to drop the refcount). What really kills the system is when we'll flip the rx_swbd->page the second time around. With an updated refcount of 2, the page will not be reusable and we'll really leak it. Then enetc_new_page() will have to allocate more pages, which will then eventually leak again on further errors from xdp_do_redirect(). The problem, summarized, is that we zeroize rx_swbd->page before we're completely done with it, and this makes it impossible for the error path to do something with it. Since the packet is potentially multi-buffer and therefore the rx_swbd->page is potentially an array, manual passing of the old pointers between enetc_flip_rx_buff() and enetc_xdp_free() is a bit difficult. For the sake of going with a simple solution, we accept the possibility of racing with xdp_do_redirect(), and we move the flip procedure to execute only on the redirect success path. By racing, I mean that the page may be deemed as not reusable by enetc (having a refcount of 0), but there will be no leak in that case, either. Once we accept that, we have something better to do with buffers on XDP_REDIRECT failure. Since we haven't performed half-page flipping yet, we won't, either (and this way, we can avoid enetc_xdp_free() completely, which gives the entire page to the slab allocator). Instead, we'll call enetc_xdp_drop(), which will recycle this half of the buffer back to the RX ring. Fixes: 9d2b68cc108d ("net: enetc: add support for XDP_REDIRECT") Suggested-by: Lorenzo Bianconi Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20221213001908.2347046-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.c | 35 +++++--------------- 1 file changed, 8 insertions(+), 27 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 8671591cb750..3a79ead5219a 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -1489,23 +1489,6 @@ static void enetc_xdp_drop(struct enetc_bdr *rx_ring, int rx_ring_first, rx_ring->stats.xdp_drops++; } -static void enetc_xdp_free(struct enetc_bdr *rx_ring, int rx_ring_first, - int rx_ring_last) -{ - while (rx_ring_first != rx_ring_last) { - struct enetc_rx_swbd *rx_swbd = &rx_ring->rx_swbd[rx_ring_first]; - - if (rx_swbd->page) { - dma_unmap_page(rx_ring->dev, rx_swbd->dma, PAGE_SIZE, - rx_swbd->dir); - __free_page(rx_swbd->page); - rx_swbd->page = NULL; - } - enetc_bdr_idx_inc(rx_ring, &rx_ring_first); - } - rx_ring->stats.xdp_redirect_failures++; -} - static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, struct napi_struct *napi, int work_limit, struct bpf_prog *prog) @@ -1527,8 +1510,8 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, int orig_i, orig_cleaned_cnt; struct xdp_buff xdp_buff; struct sk_buff *skb; - int tmp_orig_i, err; u32 bd_status; + int err; rxbd = enetc_rxbd(rx_ring, i); bd_status = le32_to_cpu(rxbd->r.lstatus); @@ -1615,18 +1598,16 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, break; } - tmp_orig_i = orig_i; - - while (orig_i != i) { - enetc_flip_rx_buff(rx_ring, - &rx_ring->rx_swbd[orig_i]); - enetc_bdr_idx_inc(rx_ring, &orig_i); - } - err = xdp_do_redirect(rx_ring->ndev, &xdp_buff, prog); if (unlikely(err)) { - enetc_xdp_free(rx_ring, tmp_orig_i, i); + enetc_xdp_drop(rx_ring, orig_i, i); + rx_ring->stats.xdp_redirect_failures++; } else { + while (orig_i != i) { + enetc_flip_rx_buff(rx_ring, + &rx_ring->rx_swbd[orig_i]); + enetc_bdr_idx_inc(rx_ring, &orig_i); + } xdp_redirect_frm_cnt++; rx_ring->stats.xdp_redirect++; } From 9f28157778ede0d4f183f7ab3b46995bb400abbe Mon Sep 17 00:00:00 2001 From: Minsuk Kang Date: Wed, 14 Dec 2022 10:51:39 +0900 Subject: [PATCH 19/64] nfc: pn533: Clear nfc_target before being used Fix a slab-out-of-bounds read that occurs in nla_put() called from nfc_genl_send_target() when target->sensb_res_len, which is duplicated from an nfc_target in pn533, is too large as the nfc_target is not properly initialized and retains garbage values. Clear nfc_targets with memset() before they are used. Found by a modified version of syzkaller. BUG: KASAN: slab-out-of-bounds in nla_put Call Trace: memcpy nla_put nfc_genl_dump_targets genl_lock_dumpit netlink_dump __netlink_dump_start genl_family_rcv_msg_dumpit genl_rcv_msg netlink_rcv_skb genl_rcv netlink_unicast netlink_sendmsg sock_sendmsg ____sys_sendmsg ___sys_sendmsg __sys_sendmsg do_syscall_64 Fixes: 673088fb42d0 ("NFC: pn533: Send ATR_REQ directly for active device detection") Fixes: 361f3cb7f9cf ("NFC: DEP link hook implementation for pn533") Signed-off-by: Minsuk Kang Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20221214015139.119673-1-linuxlovemin@yonsei.ac.kr Signed-off-by: Jakub Kicinski --- drivers/nfc/pn533/pn533.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nfc/pn533/pn533.c b/drivers/nfc/pn533/pn533.c index d9f6367b9993..f0cac1900552 100644 --- a/drivers/nfc/pn533/pn533.c +++ b/drivers/nfc/pn533/pn533.c @@ -1295,6 +1295,8 @@ static int pn533_poll_dep_complete(struct pn533 *dev, void *arg, if (IS_ERR(resp)) return PTR_ERR(resp); + memset(&nfc_target, 0, sizeof(struct nfc_target)); + rsp = (struct pn533_cmd_jump_dep_response *)resp->data; rc = rsp->status & PN533_CMD_RET_MASK; @@ -1926,6 +1928,8 @@ static int pn533_in_dep_link_up_complete(struct pn533 *dev, void *arg, dev_dbg(dev->dev, "Creating new target\n"); + memset(&nfc_target, 0, sizeof(struct nfc_target)); + nfc_target.supported_protocols = NFC_PROTO_NFC_DEP_MASK; nfc_target.nfcid1_len = 10; memcpy(nfc_target.nfcid1, rsp->nfcid3t, nfc_target.nfcid1_len); From 1c123c567fb138ebd187480b7fc0610fcb0851f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 15 Dec 2022 00:02:53 +0100 Subject: [PATCH 20/64] bpf: Resolve fext program type when checking map compatibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bpf_prog_map_compatible() check makes sure that BPF program types are not mixed inside BPF map types that can contain programs (tail call maps, cpumaps and devmaps). It does this by setting the fields of the map->owner struct to the values of the first program being checked against, and rejecting any subsequent programs if the values don't match. One of the values being set in the map owner struct is the program type, and since the code did not resolve the prog type for fext programs, the map owner type would be set to PROG_TYPE_EXT and subsequent loading of programs of the target type into the map would fail. This bug is seen in particular for XDP programs that are loaded as PROG_TYPE_EXT using libxdp; these cannot insert programs into devmaps and cpumaps because the check fails as described above. Fix the bug by resolving the fext program type to its target program type as elsewhere in the verifier. v3: - Add Yonghong's ACK Fixes: f45d5b6ce2e8 ("bpf: generalise tail call map compatibility check") Acked-by: Yonghong Song Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20221214230254.790066-1-toke@redhat.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7f98dec6e90f..b334f4ddc4d5 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2092,6 +2092,7 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp) { + enum bpf_prog_type prog_type = resolve_prog_type(fp); bool ret; if (fp->kprobe_override) @@ -2102,12 +2103,12 @@ bool bpf_prog_map_compatible(struct bpf_map *map, /* There's no owner yet where we could check for * compatibility. */ - map->owner.type = fp->type; + map->owner.type = prog_type; map->owner.jited = fp->jited; map->owner.xdp_has_frags = fp->aux->xdp_has_frags; ret = true; } else { - ret = map->owner.type == fp->type && + ret = map->owner.type == prog_type && map->owner.jited == fp->jited && map->owner.xdp_has_frags == fp->aux->xdp_has_frags; } From f506439ec3dee11e0e77b0a1f3fb3eec22c97873 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 15 Dec 2022 00:02:54 +0100 Subject: [PATCH 21/64] selftests/bpf: Add a test for using a cpumap from an freplace-to-XDP program MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds a simple test for inserting an XDP program into a cpumap that is "owned" by an XDP program that was loaded as PROG_TYPE_EXT (as libxdp does). Prior to the kernel fix this would fail because the map type ownership would be set to PROG_TYPE_EXT instead of being resolved to PROG_TYPE_XDP. v5: - Fix a few nits from Andrii, add his ACK v4: - Use skeletons for selftest v3: - Update comment to better explain the cause - Add Yonghong's ACK Acked-by: Yonghong Song Acked-by: Andrii Nakryiko Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20221214230254.790066-2-toke@redhat.com Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/fexit_bpf2bpf.c | 48 +++++++++++++++++++ .../selftests/bpf/progs/freplace_progmap.c | 24 ++++++++++ 2 files changed, 72 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/freplace_progmap.c diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c index d1e32e792536..20f5fa0fcec9 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c @@ -4,6 +4,8 @@ #include #include #include "bind4_prog.skel.h" +#include "freplace_progmap.skel.h" +#include "xdp_dummy.skel.h" typedef int (*test_cb)(struct bpf_object *obj); @@ -500,6 +502,50 @@ cleanup: bind4_prog__destroy(skel); } +static void test_func_replace_progmap(void) +{ + struct bpf_cpumap_val value = { .qsize = 1 }; + struct freplace_progmap *skel = NULL; + struct xdp_dummy *tgt_skel = NULL; + __u32 key = 0; + int err; + + skel = freplace_progmap__open(); + if (!ASSERT_OK_PTR(skel, "prog_open")) + return; + + tgt_skel = xdp_dummy__open_and_load(); + if (!ASSERT_OK_PTR(tgt_skel, "tgt_prog_load")) + goto out; + + err = bpf_program__set_attach_target(skel->progs.xdp_cpumap_prog, + bpf_program__fd(tgt_skel->progs.xdp_dummy_prog), + "xdp_dummy_prog"); + if (!ASSERT_OK(err, "set_attach_target")) + goto out; + + err = freplace_progmap__load(skel); + if (!ASSERT_OK(err, "obj_load")) + goto out; + + /* Prior to fixing the kernel, loading the PROG_TYPE_EXT 'redirect' + * program above will cause the map owner type of 'cpumap' to be set to + * PROG_TYPE_EXT. This in turn will cause the bpf_map_update_elem() + * below to fail, because the program we are inserting into the map is + * of PROG_TYPE_XDP. After fixing the kernel, the initial ownership will + * be correctly resolved to the *target* of the PROG_TYPE_EXT program + * (i.e., PROG_TYPE_XDP) and the map update will succeed. + */ + value.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_drop_prog); + err = bpf_map_update_elem(bpf_map__fd(skel->maps.cpu_map), + &key, &value, 0); + ASSERT_OK(err, "map_update"); + +out: + xdp_dummy__destroy(tgt_skel); + freplace_progmap__destroy(skel); +} + /* NOTE: affect other tests, must run in serial mode */ void serial_test_fexit_bpf2bpf(void) { @@ -525,4 +571,6 @@ void serial_test_fexit_bpf2bpf(void) test_func_replace_global_func(); if (test__start_subtest("fentry_to_cgroup_bpf")) test_fentry_to_cgroup_bpf(); + if (test__start_subtest("func_replace_progmap")) + test_func_replace_progmap(); } diff --git a/tools/testing/selftests/bpf/progs/freplace_progmap.c b/tools/testing/selftests/bpf/progs/freplace_progmap.c new file mode 100644 index 000000000000..81b56b9aa7d6 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/freplace_progmap.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_CPUMAP); + __type(key, __u32); + __type(value, struct bpf_cpumap_val); + __uint(max_entries, 1); +} cpu_map SEC(".maps"); + +SEC("xdp/cpumap") +int xdp_drop_prog(struct xdp_md *ctx) +{ + return XDP_DROP; +} + +SEC("freplace") +int xdp_cpumap_prog(struct xdp_md *ctx) +{ + return bpf_redirect_map(&cpu_map, 0, XDP_PASS); +} + +char _license[] SEC("license") = "GPL"; From 3ff8bff704f4de125dca2262e5b5b963a3da1d87 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Dec 2022 00:05:53 +0300 Subject: [PATCH 22/64] unix: Fix race in SOCK_SEQPACKET's unix_dgram_sendmsg() There is a race resulting in alive SOCK_SEQPACKET socket may change its state from TCP_ESTABLISHED to TCP_CLOSE: unix_release_sock(peer) unix_dgram_sendmsg(sk) sock_orphan(peer) sock_set_flag(peer, SOCK_DEAD) sock_alloc_send_pskb() if !(sk->sk_shutdown & SEND_SHUTDOWN) OK if sock_flag(peer, SOCK_DEAD) sk->sk_state = TCP_CLOSE sk->sk_shutdown = SHUTDOWN_MASK After that socket sk remains almost normal: it is able to connect, listen, accept and recvmsg, while it can't sendmsg. Since this is the only possibility for alive SOCK_SEQPACKET to change the state in such way, we should better fix this strange and potentially danger corner case. Note, that we will return EPIPE here like this is normally done in sock_alloc_send_pskb(). Originally used ECONNREFUSED looks strange, since it's strange to return a specific retval in dependence of race in kernel, when user can't affect on this. Also, move TCP_CLOSE assignment for SOCK_DGRAM sockets under state lock to fix race with unix_dgram_connect(): unix_dgram_connect(other) unix_dgram_sendmsg(sk) unix_peer(sk) = NULL unix_state_unlock(sk) unix_state_double_lock(sk, other) sk->sk_state = TCP_ESTABLISHED unix_peer(sk) = other unix_state_double_unlock(sk, other) sk->sk_state = TCP_CLOSED This patch fixes both of these races. Fixes: 83301b5367a9 ("af_unix: Set TCP_ESTABLISHED for datagram sockets too") Signed-off-by: Kirill Tkhai Link: https://lore.kernel.org/r/135fda25-22d5-837a-782b-ceee50e19844@ya.ru Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ede2b2a140a4..f0c2293f1d3b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1999,13 +1999,20 @@ restart_locked: unix_state_lock(sk); err = 0; - if (unix_peer(sk) == other) { + if (sk->sk_type == SOCK_SEQPACKET) { + /* We are here only when racing with unix_release_sock() + * is clearing @other. Never change state to TCP_CLOSE + * unlike SOCK_DGRAM wants. + */ + unix_state_unlock(sk); + err = -EPIPE; + } else if (unix_peer(sk) == other) { unix_peer(sk) = NULL; unix_dgram_peer_wake_disconnect_wakeup(sk, other); + sk->sk_state = TCP_CLOSE; unix_state_unlock(sk); - sk->sk_state = TCP_CLOSE; unix_dgram_disconnected(sk, other); sock_put(other); err = -ECONNREFUSED; From 7e43039a49c2da45edc1d9d7c9ede4003ab45a5f Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Tue, 13 Dec 2022 20:56:14 +0800 Subject: [PATCH 23/64] r6040: Fix kmemleak in probe and remove There is a memory leaks reported by kmemleak: unreferenced object 0xffff888116111000 (size 2048): comm "modprobe", pid 817, jiffies 4294759745 (age 76.502s) hex dump (first 32 bytes): 00 c4 0a 04 81 88 ff ff 08 10 11 16 81 88 ff ff ................ 08 10 11 16 81 88 ff ff 00 00 00 00 00 00 00 00 ................ backtrace: [] kmalloc_trace+0x22/0x60 [] phy_device_create+0x4e/0x90 [] get_phy_device+0xd2/0x220 [] mdiobus_scan+0xa4/0x2e0 [] __mdiobus_register+0x482/0x8b0 [] r6040_init_one+0x714/0xd2c [r6040] ... The problem occurs in probe process as follows: r6040_init_one: mdiobus_register mdiobus_scan <- alloc and register phy_device, the reference count of phy_device is 3 r6040_mii_probe phy_connect <- connect to the first phy_device, so the reference count of the first phy_device is 4, others are 3 register_netdev <- fault inject succeeded, goto error handling path // error handling path err_out_mdio_unregister: mdiobus_unregister(lp->mii_bus); err_out_mdio: mdiobus_free(lp->mii_bus); <- the reference count of the first phy_device is 1, it is not released and other phy_devices are released // similarly, the remove process also has the same problem The root cause is traced to the phy_device is not disconnected when removes one r6040 device in r6040_remove_one() or on error handling path after r6040_mii probed successfully. In r6040_mii_probe(), a net ethernet device is connected to the first PHY device of mii_bus, in order to notify the connected driver when the link status changes, which is the default behavior of the PHY infrastructure to handle everything. Therefore the phy_device should be disconnected when removes one r6040 device or on error handling path. Fix it by adding phy_disconnect() when removes one r6040 device or on error handling path after r6040_mii probed successfully. Fixes: 3831861b4ad8 ("r6040: implement phylib") Signed-off-by: Li Zetao Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20221213125614.927754-1-lizetao1@huawei.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/rdc/r6040.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/rdc/r6040.c b/drivers/net/ethernet/rdc/r6040.c index eecd52ed1ed2..f4d434c379e7 100644 --- a/drivers/net/ethernet/rdc/r6040.c +++ b/drivers/net/ethernet/rdc/r6040.c @@ -1159,10 +1159,12 @@ static int r6040_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) err = register_netdev(dev); if (err) { dev_err(&pdev->dev, "Failed to register net device\n"); - goto err_out_mdio_unregister; + goto err_out_phy_disconnect; } return 0; +err_out_phy_disconnect: + phy_disconnect(dev->phydev); err_out_mdio_unregister: mdiobus_unregister(lp->mii_bus); err_out_mdio: @@ -1186,6 +1188,7 @@ static void r6040_remove_one(struct pci_dev *pdev) struct r6040_private *lp = netdev_priv(dev); unregister_netdev(dev); + phy_disconnect(dev->phydev); mdiobus_unregister(lp->mii_bus); mdiobus_free(lp->mii_bus); netif_napi_del(&lp->napi); From 2cb815cfc78b137ee38bcd65e7c955d6cc2cc250 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Wed, 14 Dec 2022 16:01:17 +0800 Subject: [PATCH 24/64] net: stmmac: fix errno when create_singlethread_workqueue() fails We should set the return value to -ENOMEM explicitly when create_singlethread_workqueue() fails in stmmac_dvr_probe(), otherwise we'll lose the error value. Fixes: a137f3f27f92 ("net: stmmac: fix possible memory leak in stmmac_dvr_probe()") Signed-off-by: Gaosheng Cui Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20221214080117.3514615-1-cuigaosheng1@huawei.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index ec64b65dee34..c6951c976f5d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7099,6 +7099,7 @@ int stmmac_dvr_probe(struct device *device, priv->wq = create_singlethread_workqueue("stmmac_wq"); if (!priv->wq) { dev_err(priv->device, "failed to create workqueue\n"); + ret = -ENOMEM; goto error_wq_init; } From c72a7e42592b2e18d862cf120876070947000d7a Mon Sep 17 00:00:00 2001 From: Biju Das Date: Wed, 14 Dec 2022 10:51:18 +0000 Subject: [PATCH 25/64] ravb: Fix "failed to switch device to config mode" message during unbind This patch fixes the error "ravb 11c20000.ethernet eth0: failed to switch device to config mode" during unbind. We are doing register access after pm_runtime_put_sync(). We usually do cleanup in reverse order of init. Currently in remove(), the "pm_runtime_put_sync" is not in reverse order. Probe reset_control_deassert(rstc); pm_runtime_enable(&pdev->dev); pm_runtime_get_sync(&pdev->dev); remove pm_runtime_put_sync(&pdev->dev); unregister_netdev(ndev); .. ravb_mdio_release(priv); pm_runtime_disable(&pdev->dev); Consider the call to unregister_netdev() unregister_netdev->unregister_netdevice_queue->rollback_registered_many that calls the below functions which access the registers after pm_runtime_put_sync() 1) ravb_get_stats 2) ravb_close Fixes: c156633f1353 ("Renesas Ethernet AVB driver proper") Cc: stable@vger.kernel.org Signed-off-by: Biju Das Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20221214105118.2495313-1-biju.das.jz@bp.renesas.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/ravb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 33f723a9f471..b4e0fc7f65bd 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -2903,12 +2903,12 @@ static int ravb_remove(struct platform_device *pdev) priv->desc_bat_dma); /* Set reset mode */ ravb_write(ndev, CCC_OPC_RESET, CCC); - pm_runtime_put_sync(&pdev->dev); unregister_netdev(ndev); if (info->nc_queues) netif_napi_del(&priv->napi[RAVB_NC]); netif_napi_del(&priv->napi[RAVB_BE]); ravb_mdio_release(priv); + pm_runtime_put_sync(&pdev->dev); pm_runtime_disable(&pdev->dev); reset_control_assert(priv->rstc); free_netdev(ndev); From a7d82367daa6baa5e8399e6327e7f2f463534505 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 14 Dec 2022 13:01:20 +0200 Subject: [PATCH 26/64] net: dsa: mv88e6xxx: avoid reg_lock deadlock in mv88e6xxx_setup_port() In the blamed commit, it was not noticed that one implementation of chip->info->ops->phylink_get_caps(), called by mv88e6xxx_get_caps(), may access hardware registers, and in doing so, it takes the mv88e6xxx_reg_lock(). Namely, this is mv88e6352_phylink_get_caps(). This is a problem because mv88e6xxx_get_caps(), apart from being a top-level function (method invoked by dsa_switch_ops), is now also directly called from mv88e6xxx_setup_port(), which runs under the mv88e6xxx_reg_lock() taken by mv88e6xxx_setup(). Therefore, when running on mv88e6352, the reg_lock would be acquired a second time and the system would deadlock on driver probe. The things that mv88e6xxx_setup() can compete with in terms of register access with are the IRQ handlers and MDIO bus operations registered by mv88e6xxx_probe(). So there is a real need to acquire the register lock. The register lock can, in principle, be dropped and re-acquired pretty much at will within the driver, as long as no operations that involve waiting for indirect access to complete (essentially, callers of mv88e6xxx_smi_direct_wait() and mv88e6xxx_wait_mask()) are interrupted with the lock released. However, I would guess that in mv88e6xxx_setup(), the critical section is kept open for such a long time just in order to optimize away multiple lock/unlock operations on the registers. We could, in principle, drop the reg_lock right before the mv88e6xxx_setup_port() -> mv88e6xxx_get_caps() call, and re-acquire it immediately afterwards. But this would look ugly, because mv88e6xxx_setup_port() would release a lock which it didn't acquire, but the caller did. A cleaner solution to this issue comes from the observation that struct mv88e6xxxx_ops methods generally assume they are called with the reg_lock already acquired. Whereas mv88e6352_phylink_get_caps() is more the exception rather than the norm, in that it acquires the lock itself. Let's enforce the same locking pattern/convention for chip->info->ops->phylink_get_caps() as well, and make mv88e6xxx_get_caps(), the top-level function, acquire the register lock explicitly, for this one implementation that will access registers for port 4 to work properly. This means that mv88e6xxx_setup_port() will no longer call the top-level function, but the low-level mv88e6xxx_ops method which expects the correct calling context (register lock held). Compared to chip->info->ops->phylink_get_caps(), mv88e6xxx_get_caps() also fixes up the supported_interfaces bitmap for internal ports, since that can be done generically and does not require per-switch knowledge. That's code which will no longer execute, however mv88e6xxx_setup_port() doesn't need that. It just needs to look at the mac_capabilities bitmap. Fixes: cc1049ccee20 ("net: dsa: mv88e6xxx: fix speed setting for CPU/DSA ports") Reported-by: Maksim Kiselev Signed-off-by: Vladimir Oltean Tested-by: Maksim Kiselev Link: https://lore.kernel.org/r/20221214110120.3368472-1-vladimir.oltean@nxp.com Signed-off-by: Paolo Abeni --- drivers/net/dsa/mv88e6xxx/chip.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index ba4fff8690aa..242b8b325504 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -689,13 +689,12 @@ static void mv88e6352_phylink_get_caps(struct mv88e6xxx_chip *chip, int port, /* Port 4 supports automedia if the serdes is associated with it. */ if (port == 4) { - mv88e6xxx_reg_lock(chip); err = mv88e6352_g2_scratch_port_has_serdes(chip, port); if (err < 0) dev_err(chip->dev, "p%d: failed to read scratch\n", port); if (err <= 0) - goto unlock; + return; cmode = mv88e6352_get_port4_serdes_cmode(chip); if (cmode < 0) @@ -703,8 +702,6 @@ static void mv88e6352_phylink_get_caps(struct mv88e6xxx_chip *chip, int port, port); else mv88e6xxx_translate_cmode(cmode, supported); -unlock: - mv88e6xxx_reg_unlock(chip); } } @@ -831,7 +828,9 @@ static void mv88e6xxx_get_caps(struct dsa_switch *ds, int port, { struct mv88e6xxx_chip *chip = ds->priv; + mv88e6xxx_reg_lock(chip); chip->info->ops->phylink_get_caps(chip, port, config); + mv88e6xxx_reg_unlock(chip); if (mv88e6xxx_phy_is_internal(ds, port)) { __set_bit(PHY_INTERFACE_MODE_INTERNAL, @@ -3307,7 +3306,7 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) struct phylink_config pl_config = {}; unsigned long caps; - mv88e6xxx_get_caps(ds, port, &pl_config); + chip->info->ops->phylink_get_caps(chip, port, &pl_config); caps = pl_config.mac_capabilities; From db0b124f02ba68de6517ac303d431af220ccfe9f Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Fri, 9 Dec 2022 12:15:17 +0800 Subject: [PATCH 27/64] igc: Enhance Qbv scheduling by using first flag bit The I225 hardware has a limitation that packets can only be scheduled in the [0, cycle-time] interval. So, scheduling a packet to the start of the next cycle doesn't usually work. To overcome this, we use the Transmit Descriptor first flag to indicates that a packet should be the first packet (from a queue) in a cycle according to the section 7.5.2.9.3.4 The First Packet on Each QBV Cycle in Intel Discrete I225/6 User Manual. But this only works if there was any packet from that queue during the current cycle, to avoid this issue, we issue an empty packet if that's not the case. Also require one more descriptor to be available, to take into account the empty packet that might be issued. Test Setup: Talker: Use l2_tai to generate the launchtime into packet load. Listener: Use timedump.c to compute the delta between packet arrival and LaunchTime packet payload. Test Result: Before: 1666000610127300000,1666000610127300096,96,621273 1666000610127400000,1666000610127400192,192,621274 1666000610127500000,1666000610127500032,32,621275 1666000610127600000,1666000610127600128,128,621276 1666000610127700000,1666000610127700224,224,621277 1666000610127800000,1666000610127800064,64,621278 1666000610127900000,1666000610127900160,160,621279 1666000610128000000,1666000610128000000,0,621280 1666000610128100000,1666000610128100096,96,621281 1666000610128200000,1666000610128200192,192,621282 1666000610128300000,1666000610128300032,32,621283 1666000610128400000,1666000610128301056,-98944,621284 1666000610128500000,1666000610128302080,-197920,621285 1666000610128600000,1666000610128302848,-297152,621286 1666000610128700000,1666000610128303872,-396128,621287 1666000610128800000,1666000610128304896,-495104,621288 1666000610128900000,1666000610128305664,-594336,621289 1666000610129000000,1666000610128306688,-693312,621290 1666000610129100000,1666000610128307712,-792288,621291 1666000610129200000,1666000610128308480,-891520,621292 1666000610129300000,1666000610128309504,-990496,621293 1666000610129400000,1666000610128310528,-1089472,621294 1666000610129500000,1666000610128311296,-1188704,621295 1666000610129600000,1666000610128312320,-1287680,621296 1666000610129700000,1666000610128313344,-1386656,621297 1666000610129800000,1666000610128314112,-1485888,621298 1666000610129900000,1666000610128315136,-1584864,621299 1666000610130000000,1666000610128316160,-1683840,621300 1666000610130100000,1666000610128316928,-1783072,621301 1666000610130200000,1666000610128317952,-1882048,621302 1666000610130300000,1666000610128318976,-1981024,621303 1666000610130400000,1666000610128319744,-2080256,621304 1666000610130500000,1666000610128320768,-2179232,621305 1666000610130600000,1666000610128321792,-2278208,621306 1666000610130700000,1666000610128322816,-2377184,621307 1666000610130800000,1666000610128323584,-2476416,621308 1666000610130900000,1666000610128324608,-2575392,621309 1666000610131000000,1666000610128325632,-2674368,621310 1666000610131100000,1666000610128326400,-2773600,621311 1666000610131200000,1666000610128327424,-2872576,621312 1666000610131300000,1666000610128328448,-2971552,621313 1666000610131400000,1666000610128329216,-3070784,621314 1666000610131500000,1666000610131500032,32,621315 1666000610131600000,1666000610131600128,128,621316 1666000610131700000,1666000610131700224,224,621317 After: 1666073510646200000,1666073510646200064,64,2676462 1666073510646300000,1666073510646300160,160,2676463 1666073510646400000,1666073510646400256,256,2676464 1666073510646500000,1666073510646500096,96,2676465 1666073510646600000,1666073510646600192,192,2676466 1666073510646700000,1666073510646700032,32,2676467 1666073510646800000,1666073510646800128,128,2676468 1666073510646900000,1666073510646900224,224,2676469 1666073510647000000,1666073510647000064,64,2676470 1666073510647100000,1666073510647100160,160,2676471 1666073510647200000,1666073510647200256,256,2676472 1666073510647300000,1666073510647300096,96,2676473 1666073510647400000,1666073510647400192,192,2676474 1666073510647500000,1666073510647500032,32,2676475 1666073510647600000,1666073510647600128,128,2676476 1666073510647700000,1666073510647700224,224,2676477 1666073510647800000,1666073510647800064,64,2676478 1666073510647900000,1666073510647900160,160,2676479 1666073510648000000,1666073510648000000,0,2676480 1666073510648100000,1666073510648100096,96,2676481 1666073510648200000,1666073510648200192,192,2676482 1666073510648300000,1666073510648300032,32,2676483 1666073510648400000,1666073510648400128,128,2676484 1666073510648500000,1666073510648500224,224,2676485 1666073510648600000,1666073510648600064,64,2676486 1666073510648700000,1666073510648700160,160,2676487 1666073510648800000,1666073510648800000,0,2676488 1666073510648900000,1666073510648900096,96,2676489 1666073510649000000,1666073510649000192,192,2676490 1666073510649100000,1666073510649100032,32,2676491 1666073510649200000,1666073510649200128,128,2676492 1666073510649300000,1666073510649300224,224,2676493 1666073510649400000,1666073510649400064,64,2676494 1666073510649500000,1666073510649500160,160,2676495 1666073510649600000,1666073510649600000,0,2676496 1666073510649700000,1666073510649700096,96,2676497 1666073510649800000,1666073510649800192,192,2676498 1666073510649900000,1666073510649900032,32,2676499 1666073510650000000,1666073510650000128,128,2676500 Fixes: 82faa9b79950 ("igc: Add support for ETF offloading") Signed-off-by: Vinicius Costa Gomes Co-developed-by: Aravindhan Gunasekaran Signed-off-by: Aravindhan Gunasekaran Co-developed-by: Muhammad Husaini Zulkifli Signed-off-by: Muhammad Husaini Zulkifli Signed-off-by: Malli C Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc.h | 2 + drivers/net/ethernet/intel/igc/igc_defines.h | 2 + drivers/net/ethernet/intel/igc/igc_main.c | 176 ++++++++++++++++--- 3 files changed, 151 insertions(+), 29 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index 1e7e7071f64d..66a57636d329 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -94,6 +94,8 @@ struct igc_ring { u8 queue_index; /* logical index of the ring*/ u8 reg_idx; /* physical index of the ring */ bool launchtime_enable; /* true if LaunchTime is enabled */ + ktime_t last_tx_cycle; /* end of the cycle with a launchtime transmission */ + ktime_t last_ff_cycle; /* Last cycle with an active first flag */ u32 start_time; u32 end_time; diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index f7311aeb293b..a7b22639cfcd 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -321,6 +321,8 @@ #define IGC_ADVTXD_L4LEN_SHIFT 8 /* Adv ctxt L4LEN shift */ #define IGC_ADVTXD_MSS_SHIFT 16 /* Adv ctxt MSS shift */ +#define IGC_ADVTXD_TSN_CNTX_FIRST 0x00000080 + /* Transmit Control */ #define IGC_TCTL_EN 0x00000002 /* enable Tx */ #define IGC_TCTL_PSP 0x00000008 /* pad short packets */ diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 1586e1e435c6..c67a8e8ea972 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -1000,25 +1000,118 @@ static int igc_write_mc_addr_list(struct net_device *netdev) return netdev_mc_count(netdev); } -static __le32 igc_tx_launchtime(struct igc_adapter *adapter, ktime_t txtime) +static __le32 igc_tx_launchtime(struct igc_ring *ring, ktime_t txtime, + bool *first_flag, bool *insert_empty) { + struct igc_adapter *adapter = netdev_priv(ring->netdev); ktime_t cycle_time = adapter->cycle_time; ktime_t base_time = adapter->base_time; + ktime_t now = ktime_get_clocktai(); + ktime_t baset_est, end_of_cycle; u32 launchtime; + s64 n; - /* FIXME: when using ETF together with taprio, we may have a - * case where 'delta' is larger than the cycle_time, this may - * cause problems if we don't read the current value of - * IGC_BASET, as the value writen into the launchtime - * descriptor field may be misinterpreted. + n = div64_s64(ktime_sub_ns(now, base_time), cycle_time); + + baset_est = ktime_add_ns(base_time, cycle_time * (n)); + end_of_cycle = ktime_add_ns(baset_est, cycle_time); + + if (ktime_compare(txtime, end_of_cycle) >= 0) { + if (baset_est != ring->last_ff_cycle) { + *first_flag = true; + ring->last_ff_cycle = baset_est; + + if (ktime_compare(txtime, ring->last_tx_cycle) > 0) + *insert_empty = true; + } + } + + /* Introducing a window at end of cycle on which packets + * potentially not honor launchtime. Window of 5us chosen + * considering software update the tail pointer and packets + * are dma'ed to packet buffer. */ - div_s64_rem(ktime_sub_ns(txtime, base_time), cycle_time, &launchtime); + if ((ktime_sub_ns(end_of_cycle, now) < 5 * NSEC_PER_USEC)) + netdev_warn(ring->netdev, "Packet with txtime=%llu may not be honoured\n", + txtime); + + ring->last_tx_cycle = end_of_cycle; + + launchtime = ktime_sub_ns(txtime, baset_est); + if (launchtime > 0) + div_s64_rem(launchtime, cycle_time, &launchtime); + else + launchtime = 0; return cpu_to_le32(launchtime); } +static int igc_init_empty_frame(struct igc_ring *ring, + struct igc_tx_buffer *buffer, + struct sk_buff *skb) +{ + unsigned int size; + dma_addr_t dma; + + size = skb_headlen(skb); + + dma = dma_map_single(ring->dev, skb->data, size, DMA_TO_DEVICE); + if (dma_mapping_error(ring->dev, dma)) { + netdev_err_once(ring->netdev, "Failed to map DMA for TX\n"); + return -ENOMEM; + } + + buffer->skb = skb; + buffer->protocol = 0; + buffer->bytecount = skb->len; + buffer->gso_segs = 1; + buffer->time_stamp = jiffies; + dma_unmap_len_set(buffer, len, skb->len); + dma_unmap_addr_set(buffer, dma, dma); + + return 0; +} + +static int igc_init_tx_empty_descriptor(struct igc_ring *ring, + struct sk_buff *skb, + struct igc_tx_buffer *first) +{ + union igc_adv_tx_desc *desc; + u32 cmd_type, olinfo_status; + int err; + + if (!igc_desc_unused(ring)) + return -EBUSY; + + err = igc_init_empty_frame(ring, first, skb); + if (err) + return err; + + cmd_type = IGC_ADVTXD_DTYP_DATA | IGC_ADVTXD_DCMD_DEXT | + IGC_ADVTXD_DCMD_IFCS | IGC_TXD_DCMD | + first->bytecount; + olinfo_status = first->bytecount << IGC_ADVTXD_PAYLEN_SHIFT; + + desc = IGC_TX_DESC(ring, ring->next_to_use); + desc->read.cmd_type_len = cpu_to_le32(cmd_type); + desc->read.olinfo_status = cpu_to_le32(olinfo_status); + desc->read.buffer_addr = cpu_to_le64(dma_unmap_addr(first, dma)); + + netdev_tx_sent_queue(txring_txq(ring), skb->len); + + first->next_to_watch = desc; + + ring->next_to_use++; + if (ring->next_to_use == ring->count) + ring->next_to_use = 0; + + return 0; +} + +#define IGC_EMPTY_FRAME_SIZE 60 + static void igc_tx_ctxtdesc(struct igc_ring *tx_ring, - struct igc_tx_buffer *first, + __le32 launch_time, bool first_flag, u32 vlan_macip_lens, u32 type_tucmd, u32 mss_l4len_idx) { @@ -1037,26 +1130,17 @@ static void igc_tx_ctxtdesc(struct igc_ring *tx_ring, if (test_bit(IGC_RING_FLAG_TX_CTX_IDX, &tx_ring->flags)) mss_l4len_idx |= tx_ring->reg_idx << 4; + if (first_flag) + mss_l4len_idx |= IGC_ADVTXD_TSN_CNTX_FIRST; + context_desc->vlan_macip_lens = cpu_to_le32(vlan_macip_lens); context_desc->type_tucmd_mlhl = cpu_to_le32(type_tucmd); context_desc->mss_l4len_idx = cpu_to_le32(mss_l4len_idx); - - /* We assume there is always a valid Tx time available. Invalid times - * should have been handled by the upper layers. - */ - if (tx_ring->launchtime_enable) { - struct igc_adapter *adapter = netdev_priv(tx_ring->netdev); - ktime_t txtime = first->skb->tstamp; - - skb_txtime_consumed(first->skb); - context_desc->launch_time = igc_tx_launchtime(adapter, - txtime); - } else { - context_desc->launch_time = 0; - } + context_desc->launch_time = launch_time; } -static void igc_tx_csum(struct igc_ring *tx_ring, struct igc_tx_buffer *first) +static void igc_tx_csum(struct igc_ring *tx_ring, struct igc_tx_buffer *first, + __le32 launch_time, bool first_flag) { struct sk_buff *skb = first->skb; u32 vlan_macip_lens = 0; @@ -1096,7 +1180,8 @@ no_csum: vlan_macip_lens |= skb_network_offset(skb) << IGC_ADVTXD_MACLEN_SHIFT; vlan_macip_lens |= first->tx_flags & IGC_TX_FLAGS_VLAN_MASK; - igc_tx_ctxtdesc(tx_ring, first, vlan_macip_lens, type_tucmd, 0); + igc_tx_ctxtdesc(tx_ring, launch_time, first_flag, + vlan_macip_lens, type_tucmd, 0); } static int __igc_maybe_stop_tx(struct igc_ring *tx_ring, const u16 size) @@ -1320,6 +1405,7 @@ dma_error: static int igc_tso(struct igc_ring *tx_ring, struct igc_tx_buffer *first, + __le32 launch_time, bool first_flag, u8 *hdr_len) { u32 vlan_macip_lens, type_tucmd, mss_l4len_idx; @@ -1406,8 +1492,8 @@ static int igc_tso(struct igc_ring *tx_ring, vlan_macip_lens |= (ip.hdr - skb->data) << IGC_ADVTXD_MACLEN_SHIFT; vlan_macip_lens |= first->tx_flags & IGC_TX_FLAGS_VLAN_MASK; - igc_tx_ctxtdesc(tx_ring, first, vlan_macip_lens, - type_tucmd, mss_l4len_idx); + igc_tx_ctxtdesc(tx_ring, launch_time, first_flag, + vlan_macip_lens, type_tucmd, mss_l4len_idx); return 1; } @@ -1415,11 +1501,14 @@ static int igc_tso(struct igc_ring *tx_ring, static netdev_tx_t igc_xmit_frame_ring(struct sk_buff *skb, struct igc_ring *tx_ring) { + bool first_flag = false, insert_empty = false; u16 count = TXD_USE_COUNT(skb_headlen(skb)); __be16 protocol = vlan_get_protocol(skb); struct igc_tx_buffer *first; + __le32 launch_time = 0; u32 tx_flags = 0; unsigned short f; + ktime_t txtime; u8 hdr_len = 0; int tso = 0; @@ -1433,11 +1522,40 @@ static netdev_tx_t igc_xmit_frame_ring(struct sk_buff *skb, count += TXD_USE_COUNT(skb_frag_size( &skb_shinfo(skb)->frags[f])); - if (igc_maybe_stop_tx(tx_ring, count + 3)) { + if (igc_maybe_stop_tx(tx_ring, count + 5)) { /* this is a hard error */ return NETDEV_TX_BUSY; } + if (!tx_ring->launchtime_enable) + goto done; + + txtime = skb->tstamp; + skb->tstamp = ktime_set(0, 0); + launch_time = igc_tx_launchtime(tx_ring, txtime, &first_flag, &insert_empty); + + if (insert_empty) { + struct igc_tx_buffer *empty_info; + struct sk_buff *empty; + void *data; + + empty_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use]; + empty = alloc_skb(IGC_EMPTY_FRAME_SIZE, GFP_ATOMIC); + if (!empty) + goto done; + + data = skb_put(empty, IGC_EMPTY_FRAME_SIZE); + memset(data, 0, IGC_EMPTY_FRAME_SIZE); + + igc_tx_ctxtdesc(tx_ring, 0, false, 0, 0, 0); + + if (igc_init_tx_empty_descriptor(tx_ring, + empty, + empty_info) < 0) + dev_kfree_skb_any(empty); + } + +done: /* record the location of the first descriptor for this packet */ first = &tx_ring->tx_buffer_info[tx_ring->next_to_use]; first->type = IGC_TX_BUFFER_TYPE_SKB; @@ -1474,11 +1592,11 @@ static netdev_tx_t igc_xmit_frame_ring(struct sk_buff *skb, first->tx_flags = tx_flags; first->protocol = protocol; - tso = igc_tso(tx_ring, first, &hdr_len); + tso = igc_tso(tx_ring, first, launch_time, first_flag, &hdr_len); if (tso < 0) goto out_drop; else if (!tso) - igc_tx_csum(tx_ring, first); + igc_tx_csum(tx_ring, first, launch_time, first_flag); igc_tx_map(tx_ring, first, hdr_len); From d8f45be01dd9381065a3778a579385249ed011dc Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Fri, 9 Dec 2022 12:15:18 +0800 Subject: [PATCH 28/64] igc: Use strict cycles for Qbv scheduling Configuring strict cycle mode in the controller forces more well behaved transmissions when taprio is offloaded. When set this strict_cycle and strict_end, transmission is not enabled if the whole packet cannot be completed before end of the Qbv cycle. Fixes: 82faa9b79950 ("igc: Add support for ETF offloading") Signed-off-by: Vinicius Costa Gomes Signed-off-by: Aravindhan Gunasekaran Signed-off-by: Muhammad Husaini Zulkifli Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_tsn.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index f975ed807da1..684aedd4d088 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -140,15 +140,8 @@ static int igc_tsn_enable_offload(struct igc_adapter *adapter) wr32(IGC_STQT(i), ring->start_time); wr32(IGC_ENDQT(i), ring->end_time); - if (adapter->base_time) { - /* If we have a base_time we are in "taprio" - * mode and we need to be strict about the - * cycles: only transmit a packet if it can be - * completed during that cycle. - */ - txqctl |= IGC_TXQCTL_STRICT_CYCLE | - IGC_TXQCTL_STRICT_END; - } + txqctl |= IGC_TXQCTL_STRICT_CYCLE | + IGC_TXQCTL_STRICT_END; if (ring->launchtime_enable) txqctl |= IGC_TXQCTL_QUEUE_MODE_LAUNCHT; From 3b61764fb49a6e147ac90d71dccdddc9d5508ba1 Mon Sep 17 00:00:00 2001 From: Muhammad Husaini Zulkifli Date: Fri, 9 Dec 2022 12:15:19 +0800 Subject: [PATCH 29/64] igc: Add checking for basetime less than zero Using the tc qdisc command, the user can set basetime to any value. Checking should be done on the driver's side to prevent registering basetime values that are less than zero. Fixes: ec50a9d437f0 ("igc: Add support for taprio offloading") Signed-off-by: Muhammad Husaini Zulkifli Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index c67a8e8ea972..6ed12329ae80 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -6047,6 +6047,9 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, if (!qopt->enable) return igc_tsn_clear_schedule(adapter); + if (qopt->base_time < 0) + return -ERANGE; + if (adapter->base_time) return -EALREADY; From e17090eb24944fbbe1f24d9f336d7bad4fbe47e8 Mon Sep 17 00:00:00 2001 From: Tan Tee Min Date: Fri, 9 Dec 2022 12:15:20 +0800 Subject: [PATCH 30/64] igc: allow BaseTime 0 enrollment for Qbv Introduce qbv_enable flag in igc_adapter struct to store the Qbv on/off. So this allow the BaseTime to enroll with zero value. Fixes: 61572d5f8f91 ("igc: Simplify TSN flags handling") Signed-off-by: Muhammad Husaini Zulkifli Signed-off-by: Tan Tee Min Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc.h | 1 + drivers/net/ethernet/intel/igc/igc_main.c | 2 ++ drivers/net/ethernet/intel/igc/igc_tsn.c | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index 66a57636d329..df3e26c0cf01 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -184,6 +184,7 @@ struct igc_adapter { ktime_t base_time; ktime_t cycle_time; + bool qbv_enable; /* OS defined structs */ struct pci_dev *pdev; diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 6ed12329ae80..6c98f3402f8c 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -6044,6 +6044,8 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, u32 start_time = 0, end_time = 0; size_t n; + adapter->qbv_enable = qopt->enable; + if (!qopt->enable) return igc_tsn_clear_schedule(adapter); diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index 684aedd4d088..bb10d7b65232 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -36,7 +36,7 @@ static unsigned int igc_tsn_new_flags(struct igc_adapter *adapter) { unsigned int new_flags = adapter->flags & ~IGC_FLAG_TSN_ANY_ENABLED; - if (adapter->base_time) + if (adapter->qbv_enable) new_flags |= IGC_FLAG_TSN_QBV_ENABLED; if (is_any_launchtime(adapter)) From 6d05251d537a4d3835959a8cdd8cbbbdcdc0c904 Mon Sep 17 00:00:00 2001 From: Tan Tee Min Date: Fri, 9 Dec 2022 12:15:21 +0800 Subject: [PATCH 31/64] igc: recalculate Qbv end_time by considering cycle time Qbv users can specify a cycle time that is not equal to the total GCL intervals. Hence, recalculation is necessary here to exclude the time interval that exceeds the cycle time. As those GCL which exceeds the cycle time will be truncated. According to IEEE Std. 802.1Q-2018 section 8.6.9.2, once the end of the list is reached, it will switch to the END_OF_CYCLE state and leave the gates in the same state until the next cycle is started. Fixes: ec50a9d437f0 ("igc: Add support for taprio offloading") Signed-off-by: Tan Tee Min Signed-off-by: Muhammad Husaini Zulkifli Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 6c98f3402f8c..86e46208d95c 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -6067,6 +6067,21 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, end_time += e->interval; + /* If any of the conditions below are true, we need to manually + * control the end time of the cycle. + * 1. Qbv users can specify a cycle time that is not equal + * to the total GCL intervals. Hence, recalculation is + * necessary here to exclude the time interval that + * exceeds the cycle time. + * 2. According to IEEE Std. 802.1Q-2018 section 8.6.9.2, + * once the end of the list is reached, it will switch + * to the END_OF_CYCLE state and leave the gates in the + * same state until the next cycle is started. + */ + if (end_time > adapter->cycle_time || + n + 1 == qopt->num_entries) + end_time = adapter->cycle_time; + for (i = 0; i < adapter->num_tx_queues; i++) { struct igc_ring *ring = adapter->tx_ring[i]; From 72abeedd83982c1bc6023f631e412db78374d9b4 Mon Sep 17 00:00:00 2001 From: Tan Tee Min Date: Fri, 9 Dec 2022 12:15:22 +0800 Subject: [PATCH 32/64] igc: Set Qbv start_time and end_time to end_time if not being configured in GCL The default setting of end_time minus start_time is whole 1 second. Thus, if it's not being configured in any GCL entry then it will be staying at original 1 second. This patch is changing the start_time and end_time to be end_time as if setting zero will be having weird HW behavior where the gate will not be fully closed. Fixes: ec50a9d437f0 ("igc: Add support for taprio offloading") Signed-off-by: Tan Tee Min Signed-off-by: Muhammad Husaini Zulkifli Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 86e46208d95c..44b1740dc098 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -6043,6 +6043,7 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, bool queue_configured[IGC_MAX_TX_QUEUES] = { }; u32 start_time = 0, end_time = 0; size_t n; + int i; adapter->qbv_enable = qopt->enable; @@ -6063,7 +6064,6 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, for (n = 0; n < qopt->num_entries; n++) { struct tc_taprio_sched_entry *e = &qopt->entries[n]; - int i; end_time += e->interval; @@ -6102,6 +6102,18 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, start_time += e->interval; } + /* Check whether a queue gets configured. + * If not, set the start and end time to be end time. + */ + for (i = 0; i < adapter->num_tx_queues; i++) { + if (!queue_configured[i]) { + struct igc_ring *ring = adapter->tx_ring[i]; + + ring->start_time = end_time; + ring->end_time = end_time; + } + } + return 0; } From 32f1002ed4851d9eb28ea1aba58757bd4b66e63b Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Thu, 15 Dec 2022 03:47:59 +0000 Subject: [PATCH 33/64] net: dsa: mt7530: remove redundant assignment Russell King correctly pointed out that the MAC_2500FD capability is already added for port 5 (if not in RGMII mode) and port 6 (which only supports SGMII) by mt7531_mac_port_get_caps. Remove the reduntant setting of this capability flag which was added by a previous commit. Fixes: e19de30d2080 ("net: dsa: mt7530: add support for in-band link status") Reported-by: Russell King (Oracle) Reviewed-by: Russell King (Oracle) Signed-off-by: Daniel Golle Link: https://lore.kernel.org/r/Y5qY7x6la5TxZxzX@makrotopia.org Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mt7530.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index e74c6b406172..908fa89444c9 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -2919,9 +2919,6 @@ static void mt753x_phylink_get_caps(struct dsa_switch *ds, int port, config->mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10 | MAC_100 | MAC_1000FD; - if ((priv->id == ID_MT7531) && mt753x_is_mac_port(port)) - config->mac_capabilities |= MAC_2500FD; - /* This driver does not make use of the speed, duplex, pause or the * advertisement in its mac_config, so it is safe to mark this driver * as non-legacy. From b4cafb3d2c740f8d1b1234b43ac4a60e5291c960 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 14 Dec 2022 18:01:00 -0800 Subject: [PATCH 34/64] devlink: hold region lock when flushing snapshots Netdevsim triggers a splat on reload, when it destroys regions with snapshots pending: WARNING: CPU: 1 PID: 787 at net/core/devlink.c:6291 devlink_region_snapshot_del+0x12e/0x140 CPU: 1 PID: 787 Comm: devlink Not tainted 6.1.0-07460-g7ae9888d6e1c #580 RIP: 0010:devlink_region_snapshot_del+0x12e/0x140 Call Trace: devl_region_destroy+0x70/0x140 nsim_dev_reload_down+0x2f/0x60 [netdevsim] devlink_reload+0x1f7/0x360 devlink_nl_cmd_reload+0x6ce/0x860 genl_family_rcv_msg_doit.isra.0+0x145/0x1c0 This is the locking assert in devlink_region_snapshot_del(), we're supposed to be holding the region->snapshot_lock here. Fixes: 2dec18ad826f ("net: devlink: remove region snapshots list dependency on devlink->lock") Signed-off-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/devlink.c b/net/core/devlink.c index 6004bd0ccee4..d2df30829083 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -11925,8 +11925,10 @@ void devl_region_destroy(struct devlink_region *region) devl_assert_locked(devlink); /* Free all snapshots of region */ + mutex_lock(®ion->snapshot_lock); list_for_each_entry_safe(snapshot, ts, ®ion->snapshot_list, list) devlink_region_snapshot_del(region, snapshot); + mutex_unlock(®ion->snapshot_lock); list_del(®ion->list); mutex_destroy(®ion->snapshot_lock); From 2fc60e2ff972d3dca836bff0b08cbe503c4ca1ce Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 14 Dec 2022 18:01:01 -0800 Subject: [PATCH 35/64] selftests: devlink: fix the fd redirect in dummy_reporter_test $number + > bash means redirect FD $number, e.g. commonly used 2> redirects stderr (fd 2). The test uses 8192> to write the number 8192 to a file, this results in: ./devlink.sh: line 499: 8192: Bad file descriptor Oddly the test also papers over this issue by checking for failure (expecting an error rather than success) so it passes, anyway. Fixes: ff18176ad806 ("selftests: Add a test of large binary to devlink health test") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- tools/testing/selftests/drivers/net/netdevsim/devlink.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh index 9de1d123f4f5..a08c02abde12 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh @@ -496,8 +496,8 @@ dummy_reporter_test() check_reporter_info dummy healthy 3 3 10 true - echo 8192> $DEBUGFS_DIR/health/binary_len - check_fail $? "Failed set dummy reporter binary len to 8192" + echo 8192 > $DEBUGFS_DIR/health/binary_len + check_err $? "Failed set dummy reporter binary len to 8192" local dump=$(devlink health dump show $DL_HANDLE reporter dummy -j) check_err $? "Failed show dump of dummy reporter" From d1c4a3469e73730f7cbbcec661c2a9081af1aa45 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 14 Dec 2022 18:01:02 -0800 Subject: [PATCH 36/64] selftests: devlink: add a warning for interfaces coming up NetworkManager (and other daemons) may bring the interface up and cause failures in quiescence checks. Print a helpful warning, and take the interface down again. I seem to forget about this every time I run these tests on a new VM. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- .../selftests/drivers/net/netdevsim/devlink_trap.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh index 109900c817be..b64d98ca0df7 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh @@ -47,6 +47,17 @@ if [ -d "${NETDEVSIM_PATH}/devices/netdevsim${DEV_ADDR}" ]; then exit 1 fi +check_netdev_down() +{ + state=$(cat /sys/class/net/${NETDEV}/flags) + + if [ $((state & 1)) -ne 0 ]; then + echo "WARNING: unexpected interface UP, disable NetworkManager?" + + ip link set dev $NETDEV down + fi +} + init_test() { RET=0 @@ -151,6 +162,7 @@ trap_stats_test() RET=0 + check_netdev_down for trap_name in $(devlink_traps_get); do devlink_trap_stats_idle_test $trap_name check_err $? "Stats of trap $trap_name not idle when netdev down" @@ -254,6 +266,7 @@ trap_group_stats_test() RET=0 + check_netdev_down for group_name in $(devlink_trap_groups_get); do devlink_trap_group_stats_idle_test $group_name check_err $? "Stats of trap group $group_name not idle when netdev down" From 68bb10101e6b0a6bb44e9c908ef795fc4af99eae Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Thu, 15 Dec 2022 15:46:33 +0100 Subject: [PATCH 37/64] openvswitch: Fix flow lookup to use unmasked key The commit mentioned below causes the ovs_flow_tbl_lookup() function to be called with the masked key. However, it's supposed to be called with the unmasked key. This due to the fact that the datapath supports installing wider flows, and OVS relies on this behavior. For example if ipv4(src=1.1.1.1/192.0.0.0, dst=1.1.1.2/192.0.0.0) exists, a wider flow (smaller mask) of ipv4(src=192.1.1.1/128.0.0.0,dst=192.1.1.2/ 128.0.0.0) is allowed to be added. However, if we try to add a wildcard rule, the installation fails: $ ovs-appctl dpctl/add-flow system@myDP "in_port(1),eth_type(0x0800), \ ipv4(src=1.1.1.1/192.0.0.0,dst=1.1.1.2/192.0.0.0,frag=no)" 2 $ ovs-appctl dpctl/add-flow system@myDP "in_port(1),eth_type(0x0800), \ ipv4(src=192.1.1.1/0.0.0.0,dst=49.1.1.2/0.0.0.0,frag=no)" 2 ovs-vswitchd: updating flow table (File exists) The reason is that the key used to determine if the flow is already present in the system uses the original key ANDed with the mask. This results in the IP address not being part of the (miniflow) key, i.e., being substituted with an all-zero value. When doing the actual lookup, this results in the key wrongfully matching the first flow, and therefore the flow does not get installed. This change reverses the commit below, but rather than having the key on the stack, it's allocated. Fixes: 190aa3e77880 ("openvswitch: Fix Frame-size larger than 1024 bytes warning.") Signed-off-by: Eelco Chaudron Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 932bcf766d63..9ca721c9fa71 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -973,6 +973,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) struct sw_flow_mask mask; struct sk_buff *reply; struct datapath *dp; + struct sw_flow_key *key; struct sw_flow_actions *acts; struct sw_flow_match match; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); @@ -1000,24 +1001,26 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } /* Extract key. */ - ovs_match_init(&match, &new_flow->key, false, &mask); + key = kzalloc(sizeof(*key), GFP_KERNEL); + if (!key) { + error = -ENOMEM; + goto err_kfree_key; + } + + ovs_match_init(&match, key, false, &mask); error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) goto err_kfree_flow; + ovs_flow_mask_key(&new_flow->key, key, true, &mask); + /* Extract flow identifier. */ error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], - &new_flow->key, log); + key, log); if (error) goto err_kfree_flow; - /* unmasked key is needed to match when ufid is not used. */ - if (ovs_identifier_is_key(&new_flow->id)) - match.key = new_flow->id.unmasked_key; - - ovs_flow_mask_key(&new_flow->key, &new_flow->key, true, &mask); - /* Validate actions. */ error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, &acts, log); @@ -1044,7 +1047,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) if (ovs_identifier_is_ufid(&new_flow->id)) flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id); if (!flow) - flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->key); + flow = ovs_flow_tbl_lookup(&dp->table, key); if (likely(!flow)) { rcu_assign_pointer(new_flow->sf_acts, acts); @@ -1114,6 +1117,8 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) if (reply) ovs_notify(&dp_flow_genl_family, reply, info); + + kfree(key); return 0; err_unlock_ovs: @@ -1123,6 +1128,8 @@ err_kfree_acts: ovs_nla_free_flow_actions(acts); err_kfree_flow: ovs_flow_free(new_flow, false); +err_kfree_key: + kfree(key); error: return error; } From 078838f5b9c9203e94d7724f997392ea8012ea6a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 15 Dec 2022 17:39:05 +0100 Subject: [PATCH 38/64] net: ethernet: ti: am65-cpsw: fix CONFIG_PM #ifdef The #ifdef check is incorrect and leads to a warning: drivers/net/ethernet/ti/am65-cpsw-nuss.c:1679:13: error: 'am65_cpsw_nuss_remove_rx_chns' defined but not used [-Werror=unused-function] 1679 | static void am65_cpsw_nuss_remove_rx_chns(void *data) It's better to remove the #ifdef here and use the modern SYSTEM_SLEEP_PM_OPS() macro instead. Fixes: 24bc19b05f1f ("net: ethernet: ti: am65-cpsw: Add suspend/resume support") Signed-off-by: Arnd Bergmann Reviewed-by: Alexander Duyck Link: https://lore.kernel.org/r/20221215163918.611609-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 9decb0c7961b..ecbde83b5243 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -2878,7 +2878,6 @@ static int am65_cpsw_nuss_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_PM_SLEEP static int am65_cpsw_nuss_suspend(struct device *dev) { struct am65_cpsw_common *common = dev_get_drvdata(dev); @@ -2964,10 +2963,9 @@ static int am65_cpsw_nuss_resume(struct device *dev) return 0; } -#endif /* CONFIG_PM_SLEEP */ static const struct dev_pm_ops am65_cpsw_nuss_dev_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(am65_cpsw_nuss_suspend, am65_cpsw_nuss_resume) + SYSTEM_SLEEP_PM_OPS(am65_cpsw_nuss_suspend, am65_cpsw_nuss_resume) }; static struct platform_driver am65_cpsw_nuss_driver = { From 214964a13ab56a9757d146b79b468a7ca190fbfb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 15 Dec 2022 20:41:22 -0800 Subject: [PATCH 39/64] devlink: protect devlink dump by the instance lock Take the instance lock around devlink_nl_fill() when dumping, doit takes it already. We are only dumping basic info so in the worst case we were risking data races around the reload statistics. Until the big devlink mutex was removed all relevant code was protected by it, so the missing instance lock was not exposed. Fixes: d3efc2a6a6d8 ("net: devlink: remove devlink_mutex") Reviewed-by: Jiri Pirko Reviewed-by: Jacob Keller Link: https://lore.kernel.org/r/20221216044122.1863550-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/devlink.c b/net/core/devlink.c index d2df30829083..032d6d0a5ce6 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1648,10 +1648,13 @@ static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg, continue; } + devl_lock(devlink); err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); + devl_unlock(devlink); devlink_put(devlink); + if (err) goto out; idx++; From 2d7afdcbc9d32423f177ee12b7c93783aea338fb Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Wed, 14 Dec 2022 23:11:58 -0700 Subject: [PATCH 40/64] skbuff: Account for tail adjustment during pull operations Extending the tail can have some unexpected side effects if a program uses a helper like BPF_FUNC_skb_pull_data to read partial content beyond the head skb headlen when all the skbs in the gso frag_list are linear with no head_frag - kernel BUG at net/core/skbuff.c:4219! pc : skb_segment+0xcf4/0xd2c lr : skb_segment+0x63c/0xd2c Call trace: skb_segment+0xcf4/0xd2c __udp_gso_segment+0xa4/0x544 udp4_ufo_fragment+0x184/0x1c0 inet_gso_segment+0x16c/0x3a4 skb_mac_gso_segment+0xd4/0x1b0 __skb_gso_segment+0xcc/0x12c udp_rcv_segment+0x54/0x16c udp_queue_rcv_skb+0x78/0x144 udp_unicast_rcv_skb+0x8c/0xa4 __udp4_lib_rcv+0x490/0x68c udp_rcv+0x20/0x30 ip_protocol_deliver_rcu+0x1b0/0x33c ip_local_deliver+0xd8/0x1f0 ip_rcv+0x98/0x1a4 deliver_ptype_list_skb+0x98/0x1ec __netif_receive_skb_core+0x978/0xc60 Fix this by marking these skbs as GSO_DODGY so segmentation can handle the tail updates accordingly. Fixes: 3dcbdb134f32 ("net: gso: Fix skb_segment splat when splitting gso_size mangled skb having linear-headed frag_list") Signed-off-by: Sean Tranchetti Signed-off-by: Subash Abhinov Kasiviswanathan Reviewed-by: Alexander Duyck Link: https://lore.kernel.org/r/1671084718-24796-1-git-send-email-quic_subashab@quicinc.com Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3cbba7099c0f..4a0eb5593275 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2482,6 +2482,9 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) insp = list; } else { /* Eaten partially. */ + if (skb_is_gso(skb) && !list->head_frag && + skb_headlen(list)) + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; if (skb_shared(list)) { /* Sucks! We need to fork list. :-( */ From 9cd3fd2054c3b3055163accbf2f31a4426f10317 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 17 Dec 2022 14:17:07 -0800 Subject: [PATCH 41/64] net_sched: reject TCF_EM_SIMPLE case for complex ematch module When TCF_EM_SIMPLE was introduced, it is supposed to be convenient for ematch implementation: https://lore.kernel.org/all/20050105110048.GO26856@postel.suug.ch/ "You don't have to, providing a 32bit data chunk without TCF_EM_SIMPLE set will simply result in allocating & copy. It's an optimization, nothing more." So if an ematch module provides ops->datalen that means it wants a complex data structure (saved in its em->data) instead of a simple u32 value. We should simply reject such a combination, otherwise this u32 could be misinterpreted as a pointer. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-and-tested-by: syzbot+4caeae4c7103813598ae@syzkaller.appspotmail.com Reported-by: Jun Nie Cc: Jamal Hadi Salim Cc: Paolo Abeni Signed-off-by: Cong Wang Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- net/sched/ematch.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 4ce681361851..5c1235e6076a 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -255,6 +255,8 @@ static int tcf_em_validate(struct tcf_proto *tp, * the value carried. */ if (em_hdr->flags & TCF_EM_SIMPLE) { + if (em->ops->datalen > 0) + goto errout; if (data_len < sizeof(u32)) goto errout; em->data = *(u32 *) data; From 4feb2c44629e6f9b459b41a5a60491069d346a95 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Dec 2022 16:19:47 +0000 Subject: [PATCH 42/64] rxrpc: Fix missing unlock in rxrpc_do_sendmsg() One of the error paths in rxrpc_do_sendmsg() doesn't unlock the call mutex before returning. Fix it to do this. Note that this still doesn't get rid of the checker warning: ../net/rxrpc/sendmsg.c:617:5: warning: context imbalance in 'rxrpc_do_sendmsg' - wrong count at exit I think the interplay between the socket lock and the call's user_mutex may be too complicated for checker to analyse, especially as rxrpc_new_client_call_for_sendmsg(), which it calls, returns with the call's user_mutex if successful but unconditionally drops the socket lock. Fixes: e754eba685aa ("rxrpc: Provide a cmsg to specify the amount of Tx data for a call") Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: David S. Miller --- net/rxrpc/sendmsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 9fa7e37f7155..cde1e65f16b4 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -625,7 +625,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) if (call->tx_total_len != -1 || call->tx_pending || call->tx_top != 0) - goto error_put; + goto out_put_unlock; call->tx_total_len = p.call.tx_total_len; } } From fdb99487b0189f0ef883e353ad7484c78a8bd425 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Dec 2022 16:19:56 +0000 Subject: [PATCH 43/64] rxrpc: Fix security setting propagation Fix the propagation of the security settings from sendmsg to the rxrpc_call struct. Fixes: f3441d4125fc ("rxrpc: Copy client call parameters into rxrpc_call earlier") Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: David S. Miller --- net/rxrpc/call_object.c | 1 + net/rxrpc/conn_client.c | 2 -- net/rxrpc/security.c | 6 +++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index be5eb8cdf549..89dcf60b1158 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -217,6 +217,7 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, call->tx_total_len = p->tx_total_len; call->key = key_get(cp->key); call->local = rxrpc_get_local(cp->local, rxrpc_local_get_call); + call->security_level = cp->security_level; if (p->kernel) __set_bit(RXRPC_CALL_KERNEL, &call->flags); if (cp->upgrade) diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index a08e33c9e54b..87efa0373aed 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -551,8 +551,6 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, call->conn = rxrpc_get_connection(conn, rxrpc_conn_get_activate_call); call->cid = conn->proto.cid | channel; call->call_id = call_id; - call->security = conn->security; - call->security_ix = conn->security_ix; call->dest_srx.srx_service = conn->service_id; trace_rxrpc_connect_call(call); diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index 209f2c25a0da..ab968f65a490 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -67,13 +67,13 @@ const struct rxrpc_security *rxrpc_security_lookup(u8 security_index) */ int rxrpc_init_client_call_security(struct rxrpc_call *call) { - const struct rxrpc_security *sec; + const struct rxrpc_security *sec = &rxrpc_no_security; struct rxrpc_key_token *token; struct key *key = call->key; int ret; if (!key) - return 0; + goto found; ret = key_validate(key); if (ret < 0) @@ -88,7 +88,7 @@ int rxrpc_init_client_call_security(struct rxrpc_call *call) found: call->security = sec; - _leave(" = 0"); + call->security_ix = sec->security_index; return 0; } From eaa02390adb03b82f04babebf0cdd233793aecf5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Dec 2022 16:20:04 +0000 Subject: [PATCH 44/64] rxrpc: Fix NULL deref in rxrpc_unuse_local() Fix rxrpc_unuse_local() to get the debug_id *after* checking to see if local is NULL. Fixes: a2cf3264f331 ("rxrpc: Fold __rxrpc_unuse_local() into rxrpc_unuse_local()") Reported-by: syzbot+3538a6a72efa8b059c38@syzkaller.appspotmail.com Signed-off-by: David Howells Tested-by: syzbot+3538a6a72efa8b059c38@syzkaller.appspotmail.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: David S. Miller --- net/rxrpc/local_object.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 44222923c0d1..24ee585d9aaf 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -357,10 +357,11 @@ struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *local, */ void rxrpc_unuse_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { - unsigned int debug_id = local->debug_id; + unsigned int debug_id; int r, u; if (local) { + debug_id = local->debug_id; r = refcount_read(&local->ref); u = atomic_dec_return(&local->active_users); trace_rxrpc_local(debug_id, why, r, u); From 8fbcc83334a7b5b42b6bc1fae2458bf25eb57768 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Dec 2022 16:20:13 +0000 Subject: [PATCH 45/64] rxrpc: Fix I/O thread startup getting skipped When starting a kthread, the __kthread_create_on_node() function, as called from kthread_run(), waits for a completion to indicate that the task_struct (or failure state) of the new kernel thread is available before continuing. This does not wait, however, for the thread function to be invoked and, indeed, will skip it if kthread_stop() gets called before it gets there. If this happens, though, kthread_run() will have returned successfully, indicating that the thread was started and returning the task_struct pointer. The actual error indication is returned by kthread_stop(). Note that this is ambiguous, as the caller cannot tell whether the -EINTR error code came from kthread() or from the thread function. This was encountered in the new rxrpc I/O thread, where if the system is being pounded hard by, say, syzbot, the check of KTHREAD_SHOULD_STOP can be delayed long enough for kthread_stop() to get called when rxrpc releases a socket - and this causes an oops because the I/O thread function doesn't get started and thus doesn't remove the rxrpc_local struct from the local_endpoints list. Fix this by using a completion to wait for the thread to actually enter rxrpc_io_thread(). This makes sure the thread can't be prematurely stopped and makes sure the relied-upon cleanup is done. Fixes: a275da62e8c1 ("rxrpc: Create a per-local endpoint receive queue and I/O thread") Reported-by: syzbot+3538a6a72efa8b059c38@syzkaller.appspotmail.com Signed-off-by: David Howells cc: Marc Dionne cc: Hillf Danton Link: https://lore.kernel.org/r/000000000000229f1505ef2b6159@google.com/ Signed-off-by: David S. Miller --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/io_thread.c | 2 ++ net/rxrpc/local_object.c | 2 ++ 3 files changed, 5 insertions(+) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e7dccab7b741..37f3aec784cc 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -287,6 +287,7 @@ struct rxrpc_local { struct hlist_node link; struct socket *socket; /* my UDP socket */ struct task_struct *io_thread; + struct completion io_thread_ready; /* Indication that the I/O thread started */ struct rxrpc_sock __rcu *service; /* Service(s) listening on this endpoint */ struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ struct sk_buff_head rx_queue; /* Received packets */ diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index d83ae3193032..e460e4151c16 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -426,6 +426,8 @@ int rxrpc_io_thread(void *data) struct rxrpc_call *call; struct sk_buff *skb; + complete(&local->io_thread_ready); + skb_queue_head_init(&rx_queue); set_user_nice(current, MIN_NICE); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 24ee585d9aaf..270b63d8f37a 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -97,6 +97,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, local->rxnet = rxnet; INIT_HLIST_NODE(&local->link); init_rwsem(&local->defrag_sem); + init_completion(&local->io_thread_ready); skb_queue_head_init(&local->rx_queue); INIT_LIST_HEAD(&local->call_attend_q); local->client_bundles = RB_ROOT; @@ -189,6 +190,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) goto error_sock; } + wait_for_completion(&local->io_thread_ready); local->io_thread = io_thread; _leave(" = 0"); return 0; From 608aecd16a31269485e2980898029dd01b03a73e Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Dec 2022 16:20:21 +0000 Subject: [PATCH 46/64] rxrpc: Fix locking issues in rxrpc_put_peer_locked() Now that rxrpc_put_local() may call kthread_stop(), it can't be called under spinlock as it might sleep. This can cause a problem in the peer keepalive code in rxrpc as it tries to avoid dropping the peer_hash_lock from the point it needs to re-add peer->keepalive_link to going round the loop again in rxrpc_peer_keepalive_dispatch(). Fix this by just dropping the lock when we don't need it and accepting that we'll have to take it again. This code is only called about every 20s for each peer, so not very often. This allows rxrpc_put_peer_unlocked() to be removed also. If triggered, this bug produces an oops like the following, as reproduced by a syzbot reproducer for a different oops[1]: BUG: sleeping function called from invalid context at kernel/sched/completion.c:101 ... RCU nest depth: 0, expected: 0 3 locks held by kworker/u9:0/50: #0: ffff88810e74a138 ((wq_completion)krxrpcd){+.+.}-{0:0}, at: process_one_work+0x294/0x636 #1: ffff8881013a7e20 ((work_completion)(&rxnet->peer_keepalive_work)){+.+.}-{0:0}, at: process_one_work+0x294/0x636 #2: ffff88817d366390 (&rxnet->peer_hash_lock){+.+.}-{2:2}, at: rxrpc_peer_keepalive_dispatch+0x2bd/0x35f ... Call Trace: dump_stack_lvl+0x4c/0x5f __might_resched+0x2cf/0x2f2 __wait_for_common+0x87/0x1e8 kthread_stop+0x14d/0x255 rxrpc_peer_keepalive_dispatch+0x333/0x35f rxrpc_peer_keepalive_worker+0x2e9/0x449 process_one_work+0x3c1/0x636 worker_thread+0x25f/0x359 kthread+0x1a6/0x1b5 ret_from_fork+0x1f/0x30 Fixes: a275da62e8c1 ("rxrpc: Create a per-local endpoint receive queue and I/O thread") Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/r/0000000000002b4a9f05ef2b616f@google.com/ [1] Signed-off-by: David S. Miller --- net/rxrpc/ar-internal.h | 1 - net/rxrpc/peer_event.c | 10 +++++++--- net/rxrpc/peer_object.c | 19 ------------------- 3 files changed, 7 insertions(+), 23 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 37f3aec784cc..5b732a4af009 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -1073,7 +1073,6 @@ void rxrpc_destroy_all_peers(struct rxrpc_net *); struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *, enum rxrpc_peer_trace); struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *, enum rxrpc_peer_trace); void rxrpc_put_peer(struct rxrpc_peer *, enum rxrpc_peer_trace); -void rxrpc_put_peer_locked(struct rxrpc_peer *, enum rxrpc_peer_trace); /* * proc.c diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 6685bf917aa6..552ba84a255c 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -235,6 +235,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, struct rxrpc_peer *peer; const u8 mask = ARRAY_SIZE(rxnet->peer_keepalive) - 1; time64_t keepalive_at; + bool use; int slot; spin_lock(&rxnet->peer_hash_lock); @@ -247,9 +248,10 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, if (!rxrpc_get_peer_maybe(peer, rxrpc_peer_get_keepalive)) continue; - if (__rxrpc_use_local(peer->local, rxrpc_local_use_peer_keepalive)) { - spin_unlock(&rxnet->peer_hash_lock); + use = __rxrpc_use_local(peer->local, rxrpc_local_use_peer_keepalive); + spin_unlock(&rxnet->peer_hash_lock); + if (use) { keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME; slot = keepalive_at - base; _debug("%02x peer %u t=%d {%pISp}", @@ -270,9 +272,11 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, spin_lock(&rxnet->peer_hash_lock); list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive[slot & mask]); + spin_unlock(&rxnet->peer_hash_lock); rxrpc_unuse_local(peer->local, rxrpc_local_unuse_peer_keepalive); } - rxrpc_put_peer_locked(peer, rxrpc_peer_put_keepalive); + rxrpc_put_peer(peer, rxrpc_peer_put_keepalive); + spin_lock(&rxnet->peer_hash_lock); } spin_unlock(&rxnet->peer_hash_lock); diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 608946dcc505..82de295393a0 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -438,25 +438,6 @@ void rxrpc_put_peer(struct rxrpc_peer *peer, enum rxrpc_peer_trace why) } } -/* - * Drop a ref on a peer record where the caller already holds the - * peer_hash_lock. - */ -void rxrpc_put_peer_locked(struct rxrpc_peer *peer, enum rxrpc_peer_trace why) -{ - unsigned int debug_id = peer->debug_id; - bool dead; - int r; - - dead = __refcount_dec_and_test(&peer->ref, &r); - trace_rxrpc_peer(debug_id, r - 1, why); - if (dead) { - hash_del_rcu(&peer->hash_link); - list_del_init(&peer->keepalive_link); - rxrpc_free_peer(peer); - } -} - /* * Make sure all peer records have been discarded. */ From c838f1a73d77abadb0810eff0e150ac88fef3da5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Dec 2022 16:20:30 +0000 Subject: [PATCH 47/64] rxrpc: Fix switched parameters in peer tracing Fix the switched parameters on rxrpc_alloc_peer() and rxrpc_get_peer(). The ref argument and the why argument got mixed. Fixes: 47c810a79844 ("rxrpc: trace: Don't use __builtin_return_address for rxrpc_peer tracing") Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: David S. Miller --- include/trace/events/rxrpc.h | 2 +- net/rxrpc/peer_object.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 049b52e7aa6a..c6cfed00d0c6 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -471,7 +471,7 @@ TRACE_EVENT(rxrpc_peer, TP_STRUCT__entry( __field(unsigned int, peer ) __field(int, ref ) - __field(int, why ) + __field(enum rxrpc_peer_trace, why ) ), TP_fast_assign( diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 82de295393a0..4eecea2be307 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -226,7 +226,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp, rxrpc_peer_init_rtt(peer); peer->cong_ssthresh = RXRPC_TX_MAX_WINDOW; - trace_rxrpc_peer(peer->debug_id, why, 1); + trace_rxrpc_peer(peer->debug_id, 1, why); } _leave(" = %p", peer); @@ -382,7 +382,7 @@ struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer, enum rxrpc_peer_trace int r; __refcount_inc(&peer->ref, &r); - trace_rxrpc_peer(peer->debug_id, why, r + 1); + trace_rxrpc_peer(peer->debug_id, r + 1, why); return peer; } From 743d1768a008c8eae56ead497c9ba8237b14ee81 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Dec 2022 16:20:38 +0000 Subject: [PATCH 48/64] rxrpc: Fix I/O thread stop The rxrpc I/O thread checks to see if there's any work it needs to do, and if not, checks kthread_should_stop() before scheduling, and if it should stop, breaks out of the loop and tries to clean up and exit. This can, however, race with socket destruction, wherein outstanding calls are aborted and released from the socket and then the socket unuses the local endpoint, causing kthread_stop() to be issued. The abort is deferred to the I/O thread and the event can by issued between the I/O thread checking if there's any work to be done (such as processing call aborts) and the stop being seen. This results in the I/O thread stopping processing of events whilst call cleanup events are still outstanding, leading to connections or other objects still being around and uncleaned up, which can result in assertions being triggered, e.g.: rxrpc: AF_RXRPC: Leaked client conn 00000000e8009865 {2} ------------[ cut here ]------------ kernel BUG at net/rxrpc/conn_client.c:64! Fix this by retrieving the kthread_should_stop() indication, then checking to see if there's more work to do, and going back round the loop if there is, and breaking out of the loop only if there wasn't. This was triggered by a syzbot test that produced some other symptom[1]. Fixes: a275da62e8c1 ("rxrpc: Create a per-local endpoint receive queue and I/O thread") Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/r/0000000000002b4a9f05ef2b616f@google.com/ [1] Signed-off-by: David S. Miller --- net/rxrpc/io_thread.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index e460e4151c16..e6b9f0ceae17 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -425,6 +425,7 @@ int rxrpc_io_thread(void *data) struct rxrpc_local *local = data; struct rxrpc_call *call; struct sk_buff *skb; + bool should_stop; complete(&local->io_thread_ready); @@ -478,13 +479,14 @@ int rxrpc_io_thread(void *data) } set_current_state(TASK_INTERRUPTIBLE); + should_stop = kthread_should_stop(); if (!skb_queue_empty(&local->rx_queue) || !list_empty(&local->call_attend_q)) { __set_current_state(TASK_RUNNING); continue; } - if (kthread_should_stop()) + if (should_stop) break; schedule(); } From 11e1706bc84f60040578056f8cef3d0139b92dda Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Dec 2022 16:20:46 +0000 Subject: [PATCH 49/64] rxrpc: rxperf: Fix uninitialised variable Dan Carpenter sayeth[1]: The patch 75bfdbf2fca3: "rxrpc: Implement an in-kernel rxperf server for testing purposes" from Nov 3, 2022, leads to the following Smatch static checker warning: net/rxrpc/rxperf.c:337 rxperf_deliver_to_call() error: uninitialized symbol 'ret'. Fix this by initialising ret to 0. The value is only used for tracing purposes in the rxperf server. Fixes: 75bfdbf2fca3 ("rxrpc: Implement an in-kernel rxperf server for testing purposes") Reported-by: Dan Carpenter Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: http://lists.infradead.org/pipermail/linux-afs/2022-December/006124.html [1] Signed-off-by: David S. Miller --- net/rxrpc/rxperf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index 66f5eea291ff..d33a109e846c 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -275,7 +275,7 @@ static void rxperf_deliver_to_call(struct work_struct *work) struct rxperf_call *call = container_of(work, struct rxperf_call, work); enum rxperf_call_state state; u32 abort_code, remote_abort = 0; - int ret; + int ret = 0; if (call->state == RXPERF_CALL_COMPLETE) return; From 31d35a02ad5b803354fe0727686fcbace7a343fe Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Dec 2022 16:20:55 +0000 Subject: [PATCH 50/64] rxrpc: Fix the return value of rxrpc_new_incoming_call() Dan Carpenter sayeth[1]: The patch 5e6ef4f1017c: "rxrpc: Make the I/O thread take over the call and local processor work" from Jan 23, 2020, leads to the following Smatch static checker warning: net/rxrpc/io_thread.c:283 rxrpc_input_packet() warn: bool is not less than zero. Fix this (for now) by changing rxrpc_new_incoming_call() to return an int with 0 or error code rather than bool. Note that the actual return value of rxrpc_input_packet() is currently ignored. I have a separate patch to clean that up. Fixes: 5e6ef4f1017c ("rxrpc: Make the I/O thread take over the call and local processor work") Reported-by: Dan Carpenter Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: http://lists.infradead.org/pipermail/linux-afs/2022-December/006123.html [1] Signed-off-by: David S. Miller --- net/rxrpc/ar-internal.h | 6 +++--- net/rxrpc/call_accept.c | 18 +++++++++--------- net/rxrpc/io_thread.c | 4 ++-- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 5b732a4af009..18092526d3c8 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -812,9 +812,9 @@ extern struct workqueue_struct *rxrpc_workqueue; */ int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t); void rxrpc_discard_prealloc(struct rxrpc_sock *); -bool rxrpc_new_incoming_call(struct rxrpc_local *, struct rxrpc_peer *, - struct rxrpc_connection *, struct sockaddr_rxrpc *, - struct sk_buff *); +int rxrpc_new_incoming_call(struct rxrpc_local *, struct rxrpc_peer *, + struct rxrpc_connection *, struct sockaddr_rxrpc *, + struct sk_buff *); void rxrpc_accept_incoming_calls(struct rxrpc_local *); int rxrpc_user_charge_accept(struct rxrpc_sock *, unsigned long); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index d1850863507f..c02401656fa9 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -326,11 +326,11 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, * If we want to report an error, we mark the skb with the packet type and * abort code and return false. */ -bool rxrpc_new_incoming_call(struct rxrpc_local *local, - struct rxrpc_peer *peer, - struct rxrpc_connection *conn, - struct sockaddr_rxrpc *peer_srx, - struct sk_buff *skb) +int rxrpc_new_incoming_call(struct rxrpc_local *local, + struct rxrpc_peer *peer, + struct rxrpc_connection *conn, + struct sockaddr_rxrpc *peer_srx, + struct sk_buff *skb) { const struct rxrpc_security *sec = NULL; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); @@ -342,7 +342,7 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, /* Don't set up a call for anything other than the first DATA packet. */ if (sp->hdr.seq != 1 || sp->hdr.type != RXRPC_PACKET_TYPE_DATA) - return true; /* Just discard */ + return 0; /* Just discard */ rcu_read_lock(); @@ -413,7 +413,7 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, _leave(" = %p{%d}", call, call->debug_id); rxrpc_input_call_event(call, skb); rxrpc_put_call(call, rxrpc_call_put_input); - return true; + return 0; unsupported_service: trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, @@ -425,10 +425,10 @@ no_call: reject: rcu_read_unlock(); _leave(" = f [%u]", skb->mark); - return false; + return -EPROTO; discard: rcu_read_unlock(); - return true; + return 0; } /* diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index e6b9f0ceae17..1ad067d66fb6 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -292,7 +292,7 @@ protocol_error: skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; reject_packet: rxrpc_reject_packet(local, skb); - return ret; + return 0; } /* @@ -384,7 +384,7 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, if (rxrpc_to_client(sp)) goto bad_message; if (rxrpc_new_incoming_call(conn->local, conn->peer, conn, - peer_srx, skb)) + peer_srx, skb) == 0) return 0; goto reject_packet; } From 10073399cb5e389ab275bd1c9df4b486a2f0c9d4 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Mon, 19 Dec 2022 09:22:15 +0100 Subject: [PATCH 51/64] net: microchip: vcap: Fix initialization of value and mask Fix the following smatch warning: smatch warnings: drivers/net/ethernet/microchip/vcap/vcap_api_debugfs.c:103 vcap_debugfs_show_rule_keyfield() error: uninitialized symbol 'value'. drivers/net/ethernet/microchip/vcap/vcap_api_debugfs.c:106 vcap_debugfs_show_rule_keyfield() error: uninitialized symbol 'mask'. In case the vcap field was VCAP_FIELD_U128 and the key was different than IP6_S/DIP then the value and mask were not initialized, therefore initialize them. Fixes: 610c32b2ce66 ("net: microchip: vcap: Add vcap_get_rule") Reported-by: kernel test robot Reported-by: Dan Carpenter Reviewed-by: Saeed Mahameed Signed-off-by: Horatiu Vultur Reviewed-by: Michal Swiatkowski Signed-off-by: David S. Miller --- drivers/net/ethernet/microchip/vcap/vcap_api_debugfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/microchip/vcap/vcap_api_debugfs.c b/drivers/net/ethernet/microchip/vcap/vcap_api_debugfs.c index 895bfff550d2..e0b206247f2e 100644 --- a/drivers/net/ethernet/microchip/vcap/vcap_api_debugfs.c +++ b/drivers/net/ethernet/microchip/vcap/vcap_api_debugfs.c @@ -83,6 +83,8 @@ static void vcap_debugfs_show_rule_keyfield(struct vcap_control *vctrl, hex = true; break; case VCAP_FIELD_U128: + value = data->u128.value; + mask = data->u128.mask; if (key == VCAP_KF_L3_IP6_SIP || key == VCAP_KF_L3_IP6_DIP) { u8 nvalue[16], nmask[16]; From d83b950d44d2982c0e62e3d81b0f35ab09431008 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 18 Dec 2022 19:08:40 +0100 Subject: [PATCH 52/64] myri10ge: Fix an error handling path in myri10ge_probe() Some memory allocated in myri10ge_probe_slices() is not released in the error handling path of myri10ge_probe(). Add the corresponding kfree(), as already done in the remove function. Fixes: 0dcffac1a329 ("myri10ge: add multislices support") Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/net/ethernet/myricom/myri10ge/myri10ge.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c index 8073d7a90a26..c5687d94ea88 100644 --- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c +++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c @@ -3912,6 +3912,7 @@ abort_with_slices: myri10ge_free_slices(mgp); abort_with_firmware: + kfree(mgp->msix_vectors); myri10ge_dummy_rdma(mgp, 0); abort_with_ioremap: From e0c8bccd40fc1c19e1d246c39bcf79e357e1ada3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 16 Dec 2022 16:29:17 +0000 Subject: [PATCH 53/64] net: stream: purge sk_error_queue in sk_stream_kill_queues() Changheon Lee reported TCP socket leaks, with a nice repro. It seems we leak TCP sockets with the following sequence: 1) SOF_TIMESTAMPING_TX_ACK is enabled on the socket. Each ACK will cook an skb put in error queue, from __skb_tstamp_tx(). __skb_tstamp_tx() is using skb_clone(), unless SOF_TIMESTAMPING_OPT_TSONLY was also requested. 2) If the application is also using MSG_ZEROCOPY, then we put in the error queue cloned skbs that had a struct ubuf_info attached to them. Whenever an struct ubuf_info is allocated, sock_zerocopy_alloc() does a sock_hold(). As long as the cloned skbs are still in sk_error_queue, socket refcount is kept elevated. 3) Application closes the socket, while error queue is not empty. Since tcp_close() no longer purges the socket error queue, we might end up with a TCP socket with at least one skb in error queue keeping the socket alive forever. This bug can be (ab)used to consume all kernel memory and freeze the host. We need to purge the error queue, with proper synchronization against concurrent writers. Fixes: 24bcbe1cc69f ("net: stream: don't purge sk_error_queue in sk_stream_kill_queues()") Reported-by: Changheon Lee Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/stream.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/core/stream.c b/net/core/stream.c index 5b1fe2b82eac..cd06750dd329 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -196,6 +196,12 @@ void sk_stream_kill_queues(struct sock *sk) /* First the read buffer. */ __skb_queue_purge(&sk->sk_receive_queue); + /* Next, the error queue. + * We need to use queue lock, because other threads might + * add packets to the queue without socket lock being held. + */ + skb_queue_purge(&sk->sk_error_queue); + /* Next, the write queue. */ WARN_ON_ONCE(!skb_queue_empty(&sk->sk_write_queue)); From 1b0c84a32e37cf85d552261005091eb695313f38 Mon Sep 17 00:00:00 2001 From: Huanhuan Wang Date: Fri, 16 Dec 2022 15:31:01 +0100 Subject: [PATCH 54/64] nfp: fix unaligned io read of capabilities word MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The address of 32-bit extend capability is not qword aligned, and may cause exception in some arch. Fixes: 484963ce9f1e ("nfp: extend capability and control words") Signed-off-by: Huanhuan Wang Reviewed-by: Niklas Söderlund Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 2314cf55e821..09053373288f 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2509,7 +2509,7 @@ static int nfp_net_read_caps(struct nfp_net *nn) { /* Get some of the read-only fields from the BAR */ nn->cap = nn_readl(nn, NFP_NET_CFG_CAP); - nn->cap_w1 = nn_readq(nn, NFP_NET_CFG_CAP_WORD1); + nn->cap_w1 = nn_readl(nn, NFP_NET_CFG_CAP_WORD1); nn->max_mtu = nn_readl(nn, NFP_NET_CFG_MAX_MTU); /* ABI 4.x and ctrl vNIC always use chained metadata, in other cases From 2856a62762c8409e360d4fd452194c8e57ba1058 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Fri, 16 Dec 2022 11:44:09 +0800 Subject: [PATCH 55/64] mctp: serial: Fix starting value for frame check sequence RFC1662 defines the start state for the crc16 FCS to be 0xffff, but we're currently starting at zero. This change uses the correct start state. We're only early in the adoption for the serial binding, so there aren't yet any other users to interface to. Fixes: a0c2ccd9b5ad ("mctp: Add MCTP-over-serial transport binding") Reported-by: Harsh Tyagi Tested-by: Harsh Tyagi Signed-off-by: Jeremy Kerr Reviewed-by: Alexander Duyck Signed-off-by: David S. Miller --- drivers/net/mctp/mctp-serial.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/mctp/mctp-serial.c b/drivers/net/mctp/mctp-serial.c index 7cd103fd34ef..9f9eaf896047 100644 --- a/drivers/net/mctp/mctp-serial.c +++ b/drivers/net/mctp/mctp-serial.c @@ -35,6 +35,8 @@ #define BYTE_FRAME 0x7e #define BYTE_ESC 0x7d +#define FCS_INIT 0xffff + static DEFINE_IDA(mctp_serial_ida); enum mctp_serial_state { @@ -123,7 +125,7 @@ static void mctp_serial_tx_work(struct work_struct *work) buf[2] = dev->txlen; if (!dev->txpos) - dev->txfcs = crc_ccitt(0, buf + 1, 2); + dev->txfcs = crc_ccitt(FCS_INIT, buf + 1, 2); txlen = write_chunk(dev, buf + dev->txpos, 3 - dev->txpos); if (txlen <= 0) { @@ -303,7 +305,7 @@ static void mctp_serial_push_header(struct mctp_serial *dev, unsigned char c) case 1: if (c == MCTP_SERIAL_VERSION) { dev->rxpos++; - dev->rxfcs = crc_ccitt_byte(0, c); + dev->rxfcs = crc_ccitt_byte(FCS_INIT, c); } else { dev->rxstate = STATE_ERR; } From 115dd5469019296040359060743de77071ccb6ec Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 13 Dec 2022 14:11:36 +0900 Subject: [PATCH 56/64] Documentation: devlink: add missing toc entry for etas_es58x devlink doc toc entry is missing for etas_es58x devlink doc and triggers this warning: Documentation/networking/devlink/etas_es58x.rst: WARNING: document isn't included in any toctree Add the missing toc entry. Fixes: 9f63f96aac92 ("Documentation: devlink: add devlink documentation for the etas_es58x driver") Signed-off-by: Vincent Mailhol Link: https://lore.kernel.org/all/20221213051136.721887-1-mailhol.vincent@wanadoo.fr Reported-by: Stephen Rothwell Reported-by: kernel test robot Signed-off-by: Marc Kleine-Budde --- Documentation/networking/devlink/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/networking/devlink/index.rst b/Documentation/networking/devlink/index.rst index 4b653d040627..fee4d3968309 100644 --- a/Documentation/networking/devlink/index.rst +++ b/Documentation/networking/devlink/index.rst @@ -50,6 +50,7 @@ parameters, info versions, and other features it supports. :maxdepth: 1 bnxt + etas_es58x hns3 ionic ice From 3bc2afcba81275306adfbfca83f38a52858c5940 Mon Sep 17 00:00:00 2001 From: Haibo Chen Date: Tue, 13 Dec 2022 17:43:51 +0800 Subject: [PATCH 57/64] can: flexcan: avoid unbalanced pm_runtime_enable warning When do suspend/resume, meet the following warning message: [ 30.028336] flexcan 425b0000.can: Unbalanced pm_runtime_enable! Balance the pm_runtime_force_suspend() and pm_runtime_force_resume(). Fixes: 8cb53b485f18 ("can: flexcan: add auto stop mode for IMX93 to support wakeup") Signed-off-by: Haibo Chen Link: https://lore.kernel.org/all/20221213094351.3023858-1-haibo.chen@nxp.com Signed-off-by: Marc Kleine-Budde --- drivers/net/can/flexcan/flexcan-core.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/net/can/flexcan/flexcan-core.c b/drivers/net/can/flexcan/flexcan-core.c index 0aeff34e5ae1..6d638c93977b 100644 --- a/drivers/net/can/flexcan/flexcan-core.c +++ b/drivers/net/can/flexcan/flexcan-core.c @@ -2349,9 +2349,15 @@ static int __maybe_unused flexcan_noirq_resume(struct device *device) if (netif_running(dev)) { int err; - err = pm_runtime_force_resume(device); - if (err) - return err; + /* For the wakeup in auto stop mode, no need to gate on the + * clock here, hardware will do this automatically. + */ + if (!(device_may_wakeup(device) && + priv->devtype_data.quirks & FLEXCAN_QUIRK_AUTO_STOP_MODE)) { + err = pm_runtime_force_resume(device); + if (err) + return err; + } if (device_may_wakeup(device)) flexcan_enable_wakeup_irq(priv, false); From f006229135b7debf4037adb1eb93e358559593db Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Mon, 19 Dec 2022 11:39:27 +0100 Subject: [PATCH 58/64] can: kvaser_usb: hydra: help gcc-13 to figure out cmd_len MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Debian's gcc-13 [1] throws the following error in kvaser_usb_hydra_cmd_size(): [1] gcc version 13.0.0 20221214 (experimental) [master r13-4693-g512098a3316] (Debian 13-20221214-1) | drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c:502:65: error: | array subscript ‘struct kvaser_cmd_ext[0]’ is partly outside array | bounds of ‘unsigned char[32]’ [-Werror=array-bounds=] | 502 | ret = le16_to_cpu(((struct kvaser_cmd_ext *)cmd)->len); kvaser_usb_hydra_cmd_size() returns the size of given command. It depends on the command number (cmd->header.cmd_no). For extended commands (cmd->header.cmd_no == CMD_EXTENDED) the above shown code is executed. Help gcc to recognize that this code path is not taken in all cases, by calling kvaser_usb_hydra_cmd_size() directly after assigning the command number. Fixes: aec5fb2268b7 ("can: kvaser_usb: Add support for Kvaser USB hydra family") Cc: Jimmy Assarsson Cc: Anssi Hannula Link: https://lore.kernel.org/all/20221219110104.1073881-1-mkl@pengutronix.de Tested-by: Jimmy Assarsson Signed-off-by: Marc Kleine-Budde --- .../net/can/usb/kvaser_usb/kvaser_usb_hydra.c | 33 ++++++++++++++----- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c index f688124d6d66..ef341c4254fc 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c @@ -545,6 +545,7 @@ static int kvaser_usb_hydra_send_simple_cmd(struct kvaser_usb *dev, u8 cmd_no, int channel) { struct kvaser_cmd *cmd; + size_t cmd_len; int err; cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); @@ -552,6 +553,7 @@ static int kvaser_usb_hydra_send_simple_cmd(struct kvaser_usb *dev, return -ENOMEM; cmd->header.cmd_no = cmd_no; + cmd_len = kvaser_usb_hydra_cmd_size(cmd); if (channel < 0) { kvaser_usb_hydra_set_cmd_dest_he (cmd, KVASER_USB_HYDRA_HE_ADDRESS_ILLEGAL); @@ -568,7 +570,7 @@ static int kvaser_usb_hydra_send_simple_cmd(struct kvaser_usb *dev, kvaser_usb_hydra_set_cmd_transid (cmd, kvaser_usb_hydra_get_next_transid(dev)); - err = kvaser_usb_send_cmd(dev, cmd, kvaser_usb_hydra_cmd_size(cmd)); + err = kvaser_usb_send_cmd(dev, cmd, cmd_len); if (err) goto end; @@ -584,6 +586,7 @@ kvaser_usb_hydra_send_simple_cmd_async(struct kvaser_usb_net_priv *priv, { struct kvaser_cmd *cmd; struct kvaser_usb *dev = priv->dev; + size_t cmd_len; int err; cmd = kzalloc(sizeof(*cmd), GFP_ATOMIC); @@ -591,14 +594,14 @@ kvaser_usb_hydra_send_simple_cmd_async(struct kvaser_usb_net_priv *priv, return -ENOMEM; cmd->header.cmd_no = cmd_no; + cmd_len = kvaser_usb_hydra_cmd_size(cmd); kvaser_usb_hydra_set_cmd_dest_he (cmd, dev->card_data.hydra.channel_to_he[priv->channel]); kvaser_usb_hydra_set_cmd_transid (cmd, kvaser_usb_hydra_get_next_transid(dev)); - err = kvaser_usb_send_cmd_async(priv, cmd, - kvaser_usb_hydra_cmd_size(cmd)); + err = kvaser_usb_send_cmd_async(priv, cmd, cmd_len); if (err) kfree(cmd); @@ -742,6 +745,7 @@ static int kvaser_usb_hydra_get_single_capability(struct kvaser_usb *dev, { struct kvaser_usb_dev_card_data *card_data = &dev->card_data; struct kvaser_cmd *cmd; + size_t cmd_len; u32 value = 0; u32 mask = 0; u16 cap_cmd_res; @@ -753,13 +757,14 @@ static int kvaser_usb_hydra_get_single_capability(struct kvaser_usb *dev, return -ENOMEM; cmd->header.cmd_no = CMD_GET_CAPABILITIES_REQ; + cmd_len = kvaser_usb_hydra_cmd_size(cmd); cmd->cap_req.cap_cmd = cpu_to_le16(cap_cmd_req); kvaser_usb_hydra_set_cmd_dest_he(cmd, card_data->hydra.sysdbg_he); kvaser_usb_hydra_set_cmd_transid (cmd, kvaser_usb_hydra_get_next_transid(dev)); - err = kvaser_usb_send_cmd(dev, cmd, kvaser_usb_hydra_cmd_size(cmd)); + err = kvaser_usb_send_cmd(dev, cmd, cmd_len); if (err) goto end; @@ -1578,6 +1583,7 @@ static int kvaser_usb_hydra_get_busparams(struct kvaser_usb_net_priv *priv, struct kvaser_usb *dev = priv->dev; struct kvaser_usb_net_hydra_priv *hydra = priv->sub_priv; struct kvaser_cmd *cmd; + size_t cmd_len; int err; if (!hydra) @@ -1588,6 +1594,7 @@ static int kvaser_usb_hydra_get_busparams(struct kvaser_usb_net_priv *priv, return -ENOMEM; cmd->header.cmd_no = CMD_GET_BUSPARAMS_REQ; + cmd_len = kvaser_usb_hydra_cmd_size(cmd); kvaser_usb_hydra_set_cmd_dest_he (cmd, dev->card_data.hydra.channel_to_he[priv->channel]); kvaser_usb_hydra_set_cmd_transid @@ -1597,7 +1604,7 @@ static int kvaser_usb_hydra_get_busparams(struct kvaser_usb_net_priv *priv, reinit_completion(&priv->get_busparams_comp); - err = kvaser_usb_send_cmd(dev, cmd, kvaser_usb_hydra_cmd_size(cmd)); + err = kvaser_usb_send_cmd(dev, cmd, cmd_len); if (err) return err; @@ -1624,6 +1631,7 @@ static int kvaser_usb_hydra_set_bittiming(const struct net_device *netdev, struct kvaser_cmd *cmd; struct kvaser_usb_net_priv *priv = netdev_priv(netdev); struct kvaser_usb *dev = priv->dev; + size_t cmd_len; int err; cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); @@ -1631,6 +1639,7 @@ static int kvaser_usb_hydra_set_bittiming(const struct net_device *netdev, return -ENOMEM; cmd->header.cmd_no = CMD_SET_BUSPARAMS_REQ; + cmd_len = kvaser_usb_hydra_cmd_size(cmd); memcpy(&cmd->set_busparams_req.busparams_nominal, busparams, sizeof(cmd->set_busparams_req.busparams_nominal)); @@ -1639,7 +1648,7 @@ static int kvaser_usb_hydra_set_bittiming(const struct net_device *netdev, kvaser_usb_hydra_set_cmd_transid (cmd, kvaser_usb_hydra_get_next_transid(dev)); - err = kvaser_usb_send_cmd(dev, cmd, kvaser_usb_hydra_cmd_size(cmd)); + err = kvaser_usb_send_cmd(dev, cmd, cmd_len); kfree(cmd); @@ -1652,6 +1661,7 @@ static int kvaser_usb_hydra_set_data_bittiming(const struct net_device *netdev, struct kvaser_cmd *cmd; struct kvaser_usb_net_priv *priv = netdev_priv(netdev); struct kvaser_usb *dev = priv->dev; + size_t cmd_len; int err; cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); @@ -1659,6 +1669,7 @@ static int kvaser_usb_hydra_set_data_bittiming(const struct net_device *netdev, return -ENOMEM; cmd->header.cmd_no = CMD_SET_BUSPARAMS_FD_REQ; + cmd_len = kvaser_usb_hydra_cmd_size(cmd); memcpy(&cmd->set_busparams_req.busparams_data, busparams, sizeof(cmd->set_busparams_req.busparams_data)); @@ -1676,7 +1687,7 @@ static int kvaser_usb_hydra_set_data_bittiming(const struct net_device *netdev, kvaser_usb_hydra_set_cmd_transid (cmd, kvaser_usb_hydra_get_next_transid(dev)); - err = kvaser_usb_send_cmd(dev, cmd, kvaser_usb_hydra_cmd_size(cmd)); + err = kvaser_usb_send_cmd(dev, cmd, cmd_len); kfree(cmd); @@ -1804,6 +1815,7 @@ static int kvaser_usb_hydra_get_software_info(struct kvaser_usb *dev) static int kvaser_usb_hydra_get_software_details(struct kvaser_usb *dev) { struct kvaser_cmd *cmd; + size_t cmd_len; int err; u32 flags; struct kvaser_usb_dev_card_data *card_data = &dev->card_data; @@ -1813,6 +1825,7 @@ static int kvaser_usb_hydra_get_software_details(struct kvaser_usb *dev) return -ENOMEM; cmd->header.cmd_no = CMD_GET_SOFTWARE_DETAILS_REQ; + cmd_len = kvaser_usb_hydra_cmd_size(cmd); cmd->sw_detail_req.use_ext_cmd = 1; kvaser_usb_hydra_set_cmd_dest_he (cmd, KVASER_USB_HYDRA_HE_ADDRESS_ILLEGAL); @@ -1820,7 +1833,7 @@ static int kvaser_usb_hydra_get_software_details(struct kvaser_usb *dev) kvaser_usb_hydra_set_cmd_transid (cmd, kvaser_usb_hydra_get_next_transid(dev)); - err = kvaser_usb_send_cmd(dev, cmd, kvaser_usb_hydra_cmd_size(cmd)); + err = kvaser_usb_send_cmd(dev, cmd, cmd_len); if (err) goto end; @@ -1938,6 +1951,7 @@ static int kvaser_usb_hydra_set_opt_mode(const struct kvaser_usb_net_priv *priv) { struct kvaser_usb *dev = priv->dev; struct kvaser_cmd *cmd; + size_t cmd_len; int err; if ((priv->can.ctrlmode & @@ -1953,6 +1967,7 @@ static int kvaser_usb_hydra_set_opt_mode(const struct kvaser_usb_net_priv *priv) return -ENOMEM; cmd->header.cmd_no = CMD_SET_DRIVERMODE_REQ; + cmd_len = kvaser_usb_hydra_cmd_size(cmd); kvaser_usb_hydra_set_cmd_dest_he (cmd, dev->card_data.hydra.channel_to_he[priv->channel]); kvaser_usb_hydra_set_cmd_transid @@ -1962,7 +1977,7 @@ static int kvaser_usb_hydra_set_opt_mode(const struct kvaser_usb_net_priv *priv) else cmd->set_ctrlmode.mode = KVASER_USB_HYDRA_CTRLMODE_NORMAL; - err = kvaser_usb_send_cmd(dev, cmd, kvaser_usb_hydra_cmd_size(cmd)); + err = kvaser_usb_send_cmd(dev, cmd, cmd_len); kfree(cmd); return err; From 62e027fb0e5293d95e8d36655757ef4687c8795d Mon Sep 17 00:00:00 2001 From: Arun Ramadoss Date: Tue, 13 Dec 2022 15:44:40 +0530 Subject: [PATCH 59/64] net: dsa: microchip: remove IRQF_TRIGGER_FALLING in request_threaded_irq KSZ swithes used interrupts for detecting the phy link up and down. During registering the interrupt handler, it used IRQF_TRIGGER_FALLING flag. But this flag has to be retrieved from device tree instead of hard coding in the driver, so removing the flag. Fixes: ff319a644829 ("net: dsa: microchip: move interrupt handling logic from lan937x to ksz_common") Reported-by: Christian Eggers Signed-off-by: Arun Ramadoss Link: https://lore.kernel.org/r/20221213101440.24667-1-arun.ramadoss@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz_common.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 423f944cc34c..9b20c2ee6d62 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -1996,8 +1996,7 @@ static int ksz_irq_common_setup(struct ksz_device *dev, struct ksz_irq *kirq) irq_create_mapping(kirq->domain, n); ret = request_threaded_irq(kirq->irq_num, NULL, ksz_irq_thread_fn, - IRQF_ONESHOT | IRQF_TRIGGER_FALLING, - kirq->name, kirq); + IRQF_ONESHOT, kirq->name, kirq); if (ret) goto out; From b389a902dd5be4ece505a2e0463b9b034de04bf5 Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Thu, 15 Dec 2022 13:49:33 +0800 Subject: [PATCH 60/64] mctp: Remove device type check at unregister The unregister check could be incorrectly triggered if a netdev changes its type after register. That is possible for a tun device using TUNSETLINK ioctl, resulting in mctp unregister failing and the netdev unregister waiting forever. This was encountered by https://github.com/openthread/openthread/issues/8523 Neither check at register or unregister is required. They were added in an attempt to track down mctp_ptr being set unexpectedly, which should not happen in normal operation. Fixes: 7b1871af75f3 ("mctp: Warn if pointer is set for a wrong dev type") Signed-off-by: Matt Johnston Link: https://lore.kernel.org/r/20221215054933.2403401-1-matt@codeconstruct.com.au Signed-off-by: Jakub Kicinski --- net/mctp/device.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/net/mctp/device.c b/net/mctp/device.c index 99a3bda8852f..acb97b257428 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -429,12 +429,6 @@ static void mctp_unregister(struct net_device *dev) struct mctp_dev *mdev; mdev = mctp_dev_get_rtnl(dev); - if (mdev && !mctp_known(dev)) { - // Sanity check, should match what was set in mctp_register - netdev_warn(dev, "%s: BUG mctp_ptr set for unknown type %d", - __func__, dev->type); - return; - } if (!mdev) return; @@ -451,14 +445,8 @@ static int mctp_register(struct net_device *dev) struct mctp_dev *mdev; /* Already registered? */ - mdev = rtnl_dereference(dev->mctp_ptr); - - if (mdev) { - if (!mctp_known(dev)) - netdev_warn(dev, "%s: BUG mctp_ptr set for unknown type %d", - __func__, dev->type); + if (rtnl_dereference(dev->mctp_ptr)) return 0; - } /* only register specific types */ if (!mctp_known(dev)) From fb87bd47516d9a26b6d549231aa743b20fd4a569 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 16 Dec 2022 07:45:26 -0500 Subject: [PATCH 61/64] net: Introduce sk_use_task_frag in struct sock. Sockets that can be used while recursing into memory reclaim, like those used by network block devices and file systems, mustn't use current->task_frag: if the current process is already using it, then the inner memory reclaim call would corrupt the task_frag structure. To avoid this, sk_page_frag() uses ->sk_allocation to detect sockets that mustn't use current->task_frag, assuming that those used during memory reclaim had their allocation constraints reflected in ->sk_allocation. This unfortunately doesn't cover all cases: in an attempt to remove all usage of GFP_NOFS and GFP_NOIO, sunrpc stopped setting these flags in ->sk_allocation, and used memalloc_nofs critical sections instead. This breaks the sk_page_frag() heuristic since the allocation constraints are now stored in current->flags, which sk_page_frag() can't read without risking triggering a cache miss and slowing down TCP's fast path. This patch creates a new field in struct sock, named sk_use_task_frag, which sockets with memory reclaim constraints can set to false if they can't safely use current->task_frag. In such cases, sk_page_frag() now always returns the socket's page_frag (->sk_frag). The first user is sunrpc, which needs to avoid using current->task_frag but can keep ->sk_allocation set to GFP_KERNEL otherwise. Eventually, it might be possible to simplify sk_page_frag() by only testing ->sk_use_task_frag and avoid relying on the ->sk_allocation heuristic entirely (assuming other sockets will set ->sk_use_task_frag according to their constraints in the future). The new ->sk_use_task_frag field is placed in a hole in struct sock and belongs to a cache line shared with ->sk_shutdown. Therefore it should be hot and shouldn't have negative performance impacts on TCP's fast path (sk_shutdown is tested just before the while() loop in tcp_sendmsg_locked()). Link: https://lore.kernel.org/netdev/b4d8cb09c913d3e34f853736f3f5628abfd7f4b6.1656699567.git.gnault@redhat.com/ Signed-off-by: Guillaume Nault Reviewed-by: Benjamin Coddington Signed-off-by: Jakub Kicinski --- include/net/sock.h | 11 +++++++++-- net/core/sock.c | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index ecea3dcc2217..fefe1f4abf19 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -318,6 +318,9 @@ struct sk_filter; * @sk_stamp: time stamp of last packet received * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only * @sk_tsflags: SO_TIMESTAMPING flags + * @sk_use_task_frag: allow sk_page_frag() to use current->task_frag. + * Sockets that can be used under memory reclaim should + * set this to false. * @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock * for timestamping * @sk_tskey: counter to disambiguate concurrent tstamp requests @@ -512,6 +515,7 @@ struct sock { u8 sk_txtime_deadline_mode : 1, sk_txtime_report_errors : 1, sk_txtime_unused : 6; + bool sk_use_task_frag; struct socket *sk_socket; void *sk_user_data; @@ -2561,14 +2565,17 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk) * socket operations and end up recursing into sk_page_frag() * while it's already in use: explicitly avoid task page_frag * usage if the caller is potentially doing any of them. - * This assumes that page fault handlers use the GFP_NOFS flags. + * This assumes that page fault handlers use the GFP_NOFS flags or + * explicitly disable sk_use_task_frag. * * Return: a per task page_frag if context allows that, * otherwise a per socket one. */ static inline struct page_frag *sk_page_frag(struct sock *sk) { - if ((sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | __GFP_FS)) == + if (sk->sk_use_task_frag && + (sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | + __GFP_FS)) == (__GFP_DIRECT_RECLAIM | __GFP_FS)) return ¤t->task_frag; diff --git a/net/core/sock.c b/net/core/sock.c index d2587d8712db..f954d5893e79 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3390,6 +3390,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); sk->sk_state = TCP_CLOSE; + sk->sk_use_task_frag = true; sk_set_socket(sk, sock); sock_set_flag(sk, SOCK_ZAPPED); From 98123866fcf3fe95a0c1b198ef122dfdbd351916 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 16 Dec 2022 07:45:27 -0500 Subject: [PATCH 62/64] Treewide: Stop corrupting socket's task_frag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since moving to memalloc_nofs_save/restore, SUNRPC has stopped setting the GFP_NOIO flag on sk_allocation which the networking system uses to decide when it is safe to use current->task_frag. The results of this are unexpected corruption in task_frag when SUNRPC is involved in memory reclaim. The corruption can be seen in crashes, but the root cause is often difficult to ascertain as a crashing machine's stack trace will have no evidence of being near NFS or SUNRPC code. I believe this problem to be much more pervasive than reports to the community may indicate. Fix this by having kernel users of sockets that may corrupt task_frag due to reclaim set sk_use_task_frag = false. Preemptively correcting this situation for users that still set sk_allocation allows them to convert to memalloc_nofs_save/restore without the same unexpected corruptions that are sure to follow, unlikely to show up in testing, and difficult to bisect. CC: Philipp Reisner CC: Lars Ellenberg CC: "Christoph Böhmwalder" CC: Jens Axboe CC: Josef Bacik CC: Keith Busch CC: Christoph Hellwig CC: Sagi Grimberg CC: Lee Duncan CC: Chris Leech CC: Mike Christie CC: "James E.J. Bottomley" CC: "Martin K. Petersen" CC: Valentina Manea CC: Shuah Khan CC: Greg Kroah-Hartman CC: David Howells CC: Marc Dionne CC: Steve French CC: Christine Caulfield CC: David Teigland CC: Mark Fasheh CC: Joel Becker CC: Joseph Qi CC: Eric Van Hensbergen CC: Latchesar Ionkov CC: Dominique Martinet CC: Ilya Dryomov CC: Xiubo Li CC: Chuck Lever CC: Jeff Layton CC: Trond Myklebust CC: Anna Schumaker CC: Steffen Klassert CC: Herbert Xu Suggested-by: Guillaume Nault Signed-off-by: Benjamin Coddington Reviewed-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- drivers/block/drbd/drbd_receiver.c | 3 +++ drivers/block/nbd.c | 1 + drivers/nvme/host/tcp.c | 1 + drivers/scsi/iscsi_tcp.c | 1 + drivers/usb/usbip/usbip_common.c | 1 + fs/cifs/connect.c | 1 + fs/dlm/lowcomms.c | 2 ++ fs/ocfs2/cluster/tcp.c | 1 + net/9p/trans_fd.c | 1 + net/ceph/messenger.c | 1 + net/sunrpc/xprtsock.c | 3 +++ net/xfrm/espintcp.c | 1 + 12 files changed, 17 insertions(+) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 0e58a3187345..757f4692b5bd 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1030,6 +1030,9 @@ randomize: sock.socket->sk->sk_allocation = GFP_NOIO; msock.socket->sk->sk_allocation = GFP_NOIO; + sock.socket->sk->sk_use_task_frag = false; + msock.socket->sk->sk_use_task_frag = false; + sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE; diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index e379ccc63c52..592cfa8b765a 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -512,6 +512,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send, noreclaim_flag = memalloc_noreclaim_save(); do { sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC; + sock->sk->sk_use_task_frag = false; msg.msg_name = NULL; msg.msg_namelen = 0; msg.msg_control = NULL; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index b69b89166b6b..8cedc1ef496c 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1537,6 +1537,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid) queue->sock->sk->sk_rcvtimeo = 10 * HZ; queue->sock->sk->sk_allocation = GFP_ATOMIC; + queue->sock->sk->sk_use_task_frag = false; nvme_tcp_set_queue_io_cpu(queue); queue->request = NULL; queue->data_remaining = 0; diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index 5fb1f364e815..1d1cf641937c 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c @@ -738,6 +738,7 @@ iscsi_sw_tcp_conn_bind(struct iscsi_cls_session *cls_session, sk->sk_reuse = SK_CAN_REUSE; sk->sk_sndtimeo = 15 * HZ; /* FIXME: make it configurable */ sk->sk_allocation = GFP_ATOMIC; + sk->sk_use_task_frag = false; sk_set_memalloc(sk); sock_no_linger(sk); diff --git a/drivers/usb/usbip/usbip_common.c b/drivers/usb/usbip/usbip_common.c index f8b326eed54d..a2b2da1255dd 100644 --- a/drivers/usb/usbip/usbip_common.c +++ b/drivers/usb/usbip/usbip_common.c @@ -315,6 +315,7 @@ int usbip_recv(struct socket *sock, void *buf, int size) do { sock->sk->sk_allocation = GFP_NOIO; + sock->sk->sk_use_task_frag = false; result = sock_recvmsg(sock, &msg, MSG_WAITALL); if (result <= 0) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index e80252a83225..7bc7b5e03c51 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2944,6 +2944,7 @@ generic_ip_connect(struct TCP_Server_Info *server) cifs_dbg(FYI, "Socket created\n"); server->ssocket = socket; socket->sk->sk_allocation = GFP_NOFS; + socket->sk->sk_use_task_frag = false; if (sfamily == AF_INET6) cifs_reclassify_socket6(socket); else diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 8b80ca0cd65f..4450721ec83c 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -645,6 +645,7 @@ static void add_sock(struct socket *sock, struct connection *con) if (dlm_config.ci_protocol == DLM_PROTO_SCTP) sk->sk_state_change = lowcomms_state_change; sk->sk_allocation = GFP_NOFS; + sk->sk_use_task_frag = false; sk->sk_error_report = lowcomms_error_report; release_sock(sk); } @@ -1769,6 +1770,7 @@ static int dlm_listen_for_all(void) listen_con.sock = sock; sock->sk->sk_allocation = GFP_NOFS; + sock->sk->sk_use_task_frag = false; sock->sk->sk_data_ready = lowcomms_listen_data_ready; release_sock(sock->sk); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 37d222bdfc8c..a07b24d170f2 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1602,6 +1602,7 @@ static void o2net_start_connect(struct work_struct *work) sc->sc_sock = sock; /* freed by sc_kref_release */ sock->sk->sk_allocation = GFP_ATOMIC; + sock->sk->sk_use_task_frag = false; myaddr.sin_family = AF_INET; myaddr.sin_addr.s_addr = mynode->nd_ipv4_address; diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 07db2f436d44..d9120f14684b 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -868,6 +868,7 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket) } csocket->sk->sk_allocation = GFP_NOIO; + csocket->sk->sk_use_task_frag = false; file = sock_alloc_file(csocket, 0, NULL); if (IS_ERR(file)) { pr_err("%s (%d): failed to map fd\n", diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index dfa237fbd5a3..1d06e114ba3f 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -446,6 +446,7 @@ int ceph_tcp_connect(struct ceph_connection *con) if (ret) return ret; sock->sk->sk_allocation = GFP_NOFS; + sock->sk->sk_use_task_frag = false; #ifdef CONFIG_LOCKDEP lockdep_set_class(&sock->sk->sk_lock, &socket_class); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c0506d0d7478..aaa5b2741b79 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1882,6 +1882,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, sk->sk_write_space = xs_udp_write_space; sk->sk_state_change = xs_local_state_change; sk->sk_error_report = xs_error_report; + sk->sk_use_task_frag = false; xprt_clear_connected(xprt); @@ -2082,6 +2083,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_user_data = xprt; sk->sk_data_ready = xs_data_ready; sk->sk_write_space = xs_udp_write_space; + sk->sk_use_task_frag = false; xprt_set_connected(xprt); @@ -2249,6 +2251,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; sk->sk_error_report = xs_error_report; + sk->sk_use_task_frag = false; /* socket options */ sock_reset_flag(sk, SOCK_LINGER); diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index d6fece1ed982..74a54295c164 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -489,6 +489,7 @@ static int espintcp_init_sk(struct sock *sk) /* avoid using task_frag */ sk->sk_allocation = GFP_ATOMIC; + sk->sk_use_task_frag = false; return 0; From 08f65892c5ee15806dce7259e06c384b8cd768d7 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 16 Dec 2022 07:45:28 -0500 Subject: [PATCH 63/64] net: simplify sk_page_frag Now that in-kernel socket users that may recurse during reclaim have benn converted to sk_use_task_frag = false, we can have sk_page_frag() simply check that value. Signed-off-by: Benjamin Coddington Reviewed-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- include/net/sock.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index fefe1f4abf19..dcd72e6285b2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2564,19 +2564,14 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk) * Both direct reclaim and page faults can nest inside other * socket operations and end up recursing into sk_page_frag() * while it's already in use: explicitly avoid task page_frag - * usage if the caller is potentially doing any of them. - * This assumes that page fault handlers use the GFP_NOFS flags or - * explicitly disable sk_use_task_frag. + * when users disable sk_use_task_frag. * * Return: a per task page_frag if context allows that, * otherwise a per socket one. */ static inline struct page_frag *sk_page_frag(struct sock *sk) { - if (sk->sk_use_task_frag && - (sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | - __GFP_FS)) == - (__GFP_DIRECT_RECLAIM | __GFP_FS)) + if (sk->sk_use_task_frag) return ¤t->task_frag; return &sk->sk_frag; From 19e72b064fc32cd58f6fc0b1eb64ac2e4f770e76 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Mon, 19 Dec 2022 10:27:55 +0800 Subject: [PATCH 64/64] net: fec: check the return value of build_skb() The build_skb might return a null pointer but there is no check on the return value in the fec_enet_rx_queue(). So a null pointer dereference might occur. To avoid this, we check the return value of build_skb. If the return value is a null pointer, the driver will recycle the page and update the statistic of ndev. Then jump to rx_processing_done to clear the status flags of the BD so that the hardware can recycle the BD. Fixes: 95698ff6177b ("net: fec: using page pool to manage RX buffers") Signed-off-by: Wei Fang Reviewed-by: Shenwei Wang Reviewed-by: Alexander Duyck Link: https://lore.kernel.org/r/20221219022755.1047573-1-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/fec_main.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 5528b0af82ae..644f3c963730 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -1674,6 +1674,14 @@ fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id) * bridging applications. */ skb = build_skb(page_address(page), PAGE_SIZE); + if (unlikely(!skb)) { + page_pool_recycle_direct(rxq->page_pool, page); + ndev->stats.rx_dropped++; + + netdev_err_once(ndev, "build_skb failed!\n"); + goto rx_processing_done; + } + skb_reserve(skb, data_start); skb_put(skb, pkt_len - sub_len); skb_mark_for_recycle(skb);