From 2cdaa3eefed83082923cf219c8b6a314e622da74 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 18 Apr 2023 23:31:26 +0200 Subject: [PATCH 01/33] netfilter: conntrack: restore IPS_CONFIRMED out of nf_conntrack_hash_check_insert() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit e6d57e9ff0ae ("netfilter: conntrack: fix rmmod double-free race") consolidates IPS_CONFIRMED bit set in nf_conntrack_hash_check_insert(). However, this breaks ctnetlink: # conntrack -I -p tcp --timeout 123 --src 1.2.3.4 --dst 5.6.7.8 --state ESTABLISHED --sport 1 --dport 4 -u SEEN_REPLY conntrack v1.4.6 (conntrack-tools): Operation failed: Device or resource busy This is a partial revert of the aforementioned commit to restore IPS_CONFIRMED. Fixes: e6d57e9ff0ae ("netfilter: conntrack: fix rmmod double-free race") Reported-by: Stéphane Graber Tested-by: Stéphane Graber Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_bpf.c | 1 + net/netfilter/nf_conntrack_core.c | 1 - net/netfilter/nf_conntrack_netlink.c | 3 +++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index cd99e6dc1f35..34913521c385 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -381,6 +381,7 @@ __bpf_kfunc struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i) struct nf_conn *nfct = (struct nf_conn *)nfct_i; int err; + nfct->status |= IPS_CONFIRMED; err = nf_conntrack_hash_check_insert(nfct); if (err < 0) { nf_conntrack_free(nfct); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index c6a6a6099b4e..7ba6ab9b54b5 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -932,7 +932,6 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) goto out; } - ct->status |= IPS_CONFIRMED; smp_wmb(); /* The caller holds a reference to this object */ refcount_set(&ct->ct_general.use, 2); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index bfc3aaa2c872..d3ee18854698 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2316,6 +2316,9 @@ ctnetlink_create_conntrack(struct net *net, nfct_seqadj_ext_add(ct); nfct_synproxy_ext_add(ct); + /* we must add conntrack extensions before confirmation. */ + ct->status |= IPS_CONFIRMED; + if (cda[CTA_STATUS]) { err = ctnetlink_change_status(ct, cda); if (err < 0) From 73db1b8f2bb6725b7391e85aab41fdf592b3c0c1 Mon Sep 17 00:00:00 2001 From: Tzung-Bi Shih Date: Wed, 19 Apr 2023 13:15:26 +0800 Subject: [PATCH 02/33] netfilter: conntrack: fix wrong ct->timeout value (struct nf_conn)->timeout is an interval before the conntrack confirmed. After confirmed, it becomes a timestamp. It is observed that timeout of an unconfirmed conntrack: - Set by calling ctnetlink_change_timeout(). As a result, `nfct_time_stamp` was wrongly added to `ct->timeout` twice. - Get by calling ctnetlink_dump_timeout(). As a result, `nfct_time_stamp` was wrongly subtracted. Call Trace: dump_stack_lvl ctnetlink_dump_timeout __ctnetlink_glue_build ctnetlink_glue_build __nfqnl_enqueue_packet nf_queue nf_hook_slow ip_mc_output ? __pfx_ip_finish_output ip_send_skb ? __pfx_dst_output udp_send_skb udp_sendmsg ? __pfx_ip_generic_getfrag sock_sendmsg Separate the 2 cases in: - Setting `ct->timeout` in __nf_ct_set_timeout(). - Getting `ct->timeout` in ctnetlink_dump_timeout(). Pablo appends: Update ctnetlink to set up the timeout _after_ the IPS_CONFIRMED flag is set on, otherwise conntrack creation via ctnetlink breaks. Note that the problem described in this patch occurs since the introduction of the nfnetlink_queue conntrack support, select a sufficiently old Fixes: tag for -stable kernel to pick up this fix. Fixes: a4b4766c3ceb ("netfilter: nfnetlink_queue: rename related to nfqueue attaching conntrack info") Signed-off-by: Tzung-Bi Shih Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_core.h | 6 +++++- net/netfilter/nf_conntrack_netlink.c | 13 +++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 71d1269fe4d4..3384859a8921 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -89,7 +89,11 @@ static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout) { if (timeout > INT_MAX) timeout = INT_MAX; - WRITE_ONCE(ct->timeout, nfct_time_stamp + (u32)timeout); + + if (nf_ct_is_confirmed(ct)) + WRITE_ONCE(ct->timeout, nfct_time_stamp + (u32)timeout); + else + ct->timeout = (u32)timeout; } int __nf_ct_change_timeout(struct nf_conn *ct, u64 cta_timeout); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index d3ee18854698..6f3b23a6653c 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -176,7 +176,12 @@ nla_put_failure: static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct, bool skip_zero) { - long timeout = nf_ct_expires(ct) / HZ; + long timeout; + + if (nf_ct_is_confirmed(ct)) + timeout = nf_ct_expires(ct) / HZ; + else + timeout = ct->timeout / HZ; if (skip_zero && timeout == 0) return 0; @@ -2253,9 +2258,6 @@ ctnetlink_create_conntrack(struct net *net, if (!cda[CTA_TIMEOUT]) goto err1; - timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; - __nf_ct_set_timeout(ct, timeout); - rcu_read_lock(); if (cda[CTA_HELP]) { char *helpname = NULL; @@ -2319,6 +2321,9 @@ ctnetlink_create_conntrack(struct net *net, /* we must add conntrack extensions before confirmation. */ ct->status |= IPS_CONFIRMED; + timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; + __nf_ct_set_timeout(ct, timeout); + if (cda[CTA_STATUS]) { err = ctnetlink_change_status(ct, cda); if (err < 0) From 4f3ed1293feb9502dc254b05802faf1ad3317ac6 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Sun, 16 Apr 2023 19:12:22 +0000 Subject: [PATCH 03/33] ixgbe: Allow flow hash to be set via ethtool ixgbe currently returns `EINVAL` whenever the flowhash it set by ethtool because the ethtool code in the kernel passes a non-zero value for hfunc that ixgbe should allow. When ethtool is called with `ETHTOOL_SRXFHINDIR`, `ethtool_set_rxfh_indir` will call ixgbe's set_rxfh function with `ETH_RSS_HASH_NO_CHANGE`. This value should be accepted. When ethtool is called with `ETHTOOL_SRSSH`, `ethtool_set_rxfh` will call ixgbe's set_rxfh function with `rxfh.hfunc`, which appears to be hardcoded in ixgbe to always be `ETH_RSS_HASH_TOP`. This value should also be accepted. Before this patch: $ sudo ethtool -L eth1 combined 10 $ sudo ethtool -X eth1 default Cannot set RX flow hash configuration: Invalid argument After this patch: $ sudo ethtool -L eth1 combined 10 $ sudo ethtool -X eth1 default $ sudo ethtool -x eth1 RX flow hash indirection table for eth1 with 10 RX ring(s): 0: 0 1 2 3 4 5 6 7 8: 8 9 0 1 2 3 4 5 16: 6 7 8 9 0 1 2 3 24: 4 5 6 7 8 9 0 1 ... Fixes: 1c7cf0784e4d ("ixgbe: support for ethtool set_rxfh") Signed-off-by: Joe Damato Reviewed-by: Sridhar Samudrala Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index 6cfc9dc16537..821dfd323fa9 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -3131,8 +3131,8 @@ static int ixgbe_set_rxfh(struct net_device *netdev, const u32 *indir, int i; u32 reta_entries = ixgbe_rss_indir_tbl_entries(adapter); - if (hfunc) - return -EINVAL; + if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP) + return -EOPNOTSUPP; /* Fill out the redirection table */ if (indir) { From e85d3d55875f7a1079edfbc4e4e98d6f8aea9ac7 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Sun, 16 Apr 2023 19:12:23 +0000 Subject: [PATCH 04/33] ixgbe: Enable setting RSS table to default values ethtool uses `ETHTOOL_GRXRINGS` to compute how many queues are supported by RSS. The driver should return the smaller of either: - The maximum number of RSS queues the device supports, OR - The number of RX queues configured Prior to this change, running `ethtool -X $iface default` fails if the number of queues configured is larger than the number supported by RSS, even though changing the queue count correctly resets the flowhash to use all supported queues. Other drivers (for example, i40e) will succeed but the flow hash will reset to support the maximum number of queues supported by RSS, even if that amount is smaller than the configured amount. Prior to this change: $ sudo ethtool -L eth1 combined 20 $ sudo ethtool -x eth1 RX flow hash indirection table for eth1 with 20 RX ring(s): 0: 0 1 2 3 4 5 6 7 8: 8 9 10 11 12 13 14 15 16: 0 1 2 3 4 5 6 7 24: 8 9 10 11 12 13 14 15 32: 0 1 2 3 4 5 6 7 ... You can see that the flowhash was correctly set to use the maximum number of queues supported by the driver (16). However, asking the NIC to reset to "default" fails: $ sudo ethtool -X eth1 default Cannot set RX flow hash configuration: Invalid argument After this change, the flowhash can be reset to default which will use all of the available RSS queues (16) or the configured queue count, whichever is smaller. Starting with eth1 which has 10 queues and a flowhash distributing to all 10 queues: $ sudo ethtool -x eth1 RX flow hash indirection table for eth1 with 10 RX ring(s): 0: 0 1 2 3 4 5 6 7 8: 8 9 0 1 2 3 4 5 16: 6 7 8 9 0 1 2 3 ... Increasing the queue count to 48 resets the flowhash to distribute to 16 queues, as it did before this patch: $ sudo ethtool -L eth1 combined 48 $ sudo ethtool -x eth1 RX flow hash indirection table for eth1 with 16 RX ring(s): 0: 0 1 2 3 4 5 6 7 8: 8 9 10 11 12 13 14 15 16: 0 1 2 3 4 5 6 7 ... Due to the other bugfix in this series, the flowhash can be set to use queues 0-5: $ sudo ethtool -X eth1 equal 5 $ sudo ethtool -x eth1 RX flow hash indirection table for eth1 with 16 RX ring(s): 0: 0 1 2 3 4 0 1 2 8: 3 4 0 1 2 3 4 0 16: 1 2 3 4 0 1 2 3 ... Due to this bugfix, the flowhash can be reset to default and use 16 queues: $ sudo ethtool -X eth1 default $ sudo ethtool -x eth1 RX flow hash indirection table for eth1 with 16 RX ring(s): 0: 0 1 2 3 4 5 6 7 8: 8 9 10 11 12 13 14 15 16: 0 1 2 3 4 5 6 7 ... Fixes: 91cd94bfe4f0 ("ixgbe: add basic support for setting and getting nfc controls") Signed-off-by: Joe Damato Reviewed-by: Sridhar Samudrala Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- .../net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index 821dfd323fa9..0bbad4a5cc2f 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -2665,6 +2665,14 @@ static int ixgbe_get_rss_hash_opts(struct ixgbe_adapter *adapter, return 0; } +static int ixgbe_rss_indir_tbl_max(struct ixgbe_adapter *adapter) +{ + if (adapter->hw.mac.type < ixgbe_mac_X550) + return 16; + else + return 64; +} + static int ixgbe_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, u32 *rule_locs) { @@ -2673,7 +2681,8 @@ static int ixgbe_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, switch (cmd->cmd) { case ETHTOOL_GRXRINGS: - cmd->data = adapter->num_rx_queues; + cmd->data = min_t(int, adapter->num_rx_queues, + ixgbe_rss_indir_tbl_max(adapter)); ret = 0; break; case ETHTOOL_GRXCLSRLCNT: @@ -3075,14 +3084,6 @@ static int ixgbe_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd) return ret; } -static int ixgbe_rss_indir_tbl_max(struct ixgbe_adapter *adapter) -{ - if (adapter->hw.mac.type < ixgbe_mac_X550) - return 16; - else - return 64; -} - static u32 ixgbe_get_rxfh_key_size(struct net_device *netdev) { return IXGBE_RSS_KEY_SIZE; From e9fce818fe003b6c527f25517b9ac08eb4661b5d Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 22 Mar 2023 09:52:26 +0100 Subject: [PATCH 05/33] net/mlx5e: Don't clone flow post action attributes second time The code already clones post action attributes in mlx5e_clone_flow_attr_for_post_act(). Creating another copy in mlx5e_tc_post_act_add() is a erroneous leftover from original implementation. Instead, assign handle->attribute to post_attr provided by the caller. Note that cloning the attribute second time is not just wasteful but also causes issues like second copy not being properly updated in neigh update code which leads to following use-after-free: Feb 21 09:02:00 c-237-177-40-045 kernel: BUG: KASAN: use-after-free in mlx5_cmd_set_fte+0x200d/0x24c0 [mlx5_core] Feb 21 09:02:00 c-237-177-40-045 kernel: kasan_report+0xbb/0x1a0 Feb 21 09:02:00 c-237-177-40-045 kernel: kasan_save_stack+0x1e/0x40 Feb 21 09:02:00 c-237-177-40-045 kernel: kasan_set_track+0x21/0x30 Feb 21 09:02:00 c-237-177-40-045 kernel: __kasan_kmalloc+0x7a/0x90 Feb 21 09:02:00 c-237-177-40-045 kernel: kasan_save_stack+0x1e/0x40 Feb 21 09:02:00 c-237-177-40-045 kernel: kasan_set_track+0x21/0x30 Feb 21 09:02:00 c-237-177-40-045 kernel: kasan_save_free_info+0x2a/0x40 Feb 21 09:02:00 c-237-177-40-045 kernel: ____kasan_slab_free+0x11a/0x1b0 Feb 21 09:02:00 c-237-177-40-045 kernel: page dumped because: kasan: bad access detected Feb 21 09:02:00 c-237-177-40-045 kernel: mlx5_core 0000:08:00.0: mlx5_cmd_out_err:803:(pid 8833): SET_FLOW_TABLE_ENTRY(0x936) op_mod(0x0) failed, status bad resource state(0x9), syndrome (0xf2ff71), err(-22) Feb 21 09:02:00 c-237-177-40-045 kernel: mlx5_core 0000:08:00.0 enp8s0f0: Failed to add post action rule Feb 21 09:02:00 c-237-177-40-045 kernel: mlx5_core 0000:08:00.0: mlx5e_tc_encap_flows_add:190:(pid 8833): Failed to update flow post acts, -22 Feb 21 09:02:00 c-237-177-40-045 kernel: Call Trace: Feb 21 09:02:00 c-237-177-40-045 kernel: Feb 21 09:02:00 c-237-177-40-045 kernel: dump_stack_lvl+0x57/0x7d Feb 21 09:02:00 c-237-177-40-045 kernel: print_report+0x170/0x471 Feb 21 09:02:00 c-237-177-40-045 kernel: ? mlx5_cmd_set_fte+0x200d/0x24c0 [mlx5_core] Feb 21 09:02:00 c-237-177-40-045 kernel: kasan_report+0xbb/0x1a0 Feb 21 09:02:00 c-237-177-40-045 kernel: ? mlx5_cmd_set_fte+0x200d/0x24c0 [mlx5_core] Feb 21 09:02:00 c-237-177-40-045 kernel: mlx5_cmd_set_fte+0x200d/0x24c0 [mlx5_core] Feb 21 09:02:00 c-237-177-40-045 kernel: ? __module_address.part.0+0x62/0x200 Feb 21 09:02:00 c-237-177-40-045 kernel: ? mlx5_cmd_stub_create_flow_table+0xd0/0xd0 [mlx5_core] Feb 21 09:02:00 c-237-177-40-045 kernel: ? __raw_spin_lock_init+0x3b/0x110 Feb 21 09:02:00 c-237-177-40-045 kernel: mlx5_cmd_create_fte+0x80/0xb0 [mlx5_core] Feb 21 09:02:00 c-237-177-40-045 kernel: add_rule_fg+0xe80/0x19c0 [mlx5_core] -- Feb 21 09:02:00 c-237-177-40-045 kernel: Allocated by task 13476: Feb 21 09:02:00 c-237-177-40-045 kernel: kasan_save_stack+0x1e/0x40 Feb 21 09:02:00 c-237-177-40-045 kernel: kasan_set_track+0x21/0x30 Feb 21 09:02:00 c-237-177-40-045 kernel: __kasan_kmalloc+0x7a/0x90 Feb 21 09:02:00 c-237-177-40-045 kernel: mlx5_packet_reformat_alloc+0x7b/0x230 [mlx5_core] Feb 21 09:02:00 c-237-177-40-045 kernel: mlx5e_tc_tun_create_header_ipv4+0x977/0xf10 [mlx5_core] Feb 21 09:02:00 c-237-177-40-045 kernel: mlx5e_attach_encap+0x15b4/0x1e10 [mlx5_core] Feb 21 09:02:00 c-237-177-40-045 kernel: post_process_attr+0x305/0xa30 [mlx5_core] Feb 21 09:02:00 c-237-177-40-045 kernel: mlx5e_tc_add_fdb_flow+0x4c0/0xcf0 [mlx5_core] Feb 21 09:02:00 c-237-177-40-045 kernel: __mlx5e_add_fdb_flow+0x7cf/0xe90 [mlx5_core] Feb 21 09:02:00 c-237-177-40-045 kernel: mlx5e_configure_flower+0xcaa/0x4b90 [mlx5_core] Feb 21 09:02:00 c-237-177-40-045 kernel: mlx5e_rep_setup_tc_cls_flower+0x99/0x1b0 [mlx5_core] Feb 21 09:02:00 c-237-177-40-045 kernel: mlx5e_rep_setup_tc_cb+0x133/0x1e0 [mlx5_core] -- Feb 21 09:02:00 c-237-177-40-045 kernel: Freed by task 8833: Feb 21 09:02:00 c-237-177-40-045 kernel: kasan_save_stack+0x1e/0x40 Feb 21 09:02:00 c-237-177-40-045 kernel: kasan_set_track+0x21/0x30 Feb 21 09:02:00 c-237-177-40-045 kernel: kasan_save_free_info+0x2a/0x40 Feb 21 09:02:00 c-237-177-40-045 kernel: ____kasan_slab_free+0x11a/0x1b0 Feb 21 09:02:00 c-237-177-40-045 kernel: __kmem_cache_free+0x1de/0x400 Feb 21 09:02:00 c-237-177-40-045 kernel: mlx5_packet_reformat_dealloc+0xad/0x100 [mlx5_core] Feb 21 09:02:00 c-237-177-40-045 kernel: mlx5e_tc_encap_flows_del+0x3c0/0x500 [mlx5_core] Feb 21 09:02:00 c-237-177-40-045 kernel: mlx5e_rep_update_flows+0x40c/0xa80 [mlx5_core] Feb 21 09:02:00 c-237-177-40-045 kernel: mlx5e_rep_neigh_update+0x473/0x7a0 [mlx5_core] Feb 21 09:02:00 c-237-177-40-045 kernel: process_one_work+0x7c2/0x1310 Feb 21 09:02:00 c-237-177-40-045 kernel: worker_thread+0x59d/0xec0 Feb 21 09:02:00 c-237-177-40-045 kernel: kthread+0x28f/0x330 Fixes: 8300f225268b ("net/mlx5e: Create new flow attr for multi table actions") Signed-off-by: Vlad Buslov Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en/tc/post_act.c | 11 ++--------- .../net/ethernet/mellanox/mlx5/core/en/tc/post_act.h | 2 +- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_act.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_act.c index 4e48946c4c2a..0290e0dea539 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_act.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_act.c @@ -106,22 +106,17 @@ err_rule: } struct mlx5e_post_act_handle * -mlx5e_tc_post_act_add(struct mlx5e_post_act *post_act, struct mlx5_flow_attr *attr) +mlx5e_tc_post_act_add(struct mlx5e_post_act *post_act, struct mlx5_flow_attr *post_attr) { - u32 attr_sz = ns_to_attr_sz(post_act->ns_type); struct mlx5e_post_act_handle *handle; - struct mlx5_flow_attr *post_attr; int err; handle = kzalloc(sizeof(*handle), GFP_KERNEL); - post_attr = mlx5_alloc_flow_attr(post_act->ns_type); - if (!handle || !post_attr) { - kfree(post_attr); + if (!handle) { kfree(handle); return ERR_PTR(-ENOMEM); } - memcpy(post_attr, attr, attr_sz); post_attr->chain = 0; post_attr->prio = 0; post_attr->ft = post_act->ft; @@ -145,7 +140,6 @@ mlx5e_tc_post_act_add(struct mlx5e_post_act *post_act, struct mlx5_flow_attr *at return handle; err_xarray: - kfree(post_attr); kfree(handle); return ERR_PTR(err); } @@ -164,7 +158,6 @@ mlx5e_tc_post_act_del(struct mlx5e_post_act *post_act, struct mlx5e_post_act_han if (!IS_ERR_OR_NULL(handle->rule)) mlx5e_tc_post_act_unoffload(post_act, handle); xa_erase(&post_act->ids, handle->id); - kfree(handle->attr); kfree(handle); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_act.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_act.h index f476774c0b75..40b8df184af5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_act.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_act.h @@ -19,7 +19,7 @@ void mlx5e_tc_post_act_destroy(struct mlx5e_post_act *post_act); struct mlx5e_post_act_handle * -mlx5e_tc_post_act_add(struct mlx5e_post_act *post_act, struct mlx5_flow_attr *attr); +mlx5e_tc_post_act_add(struct mlx5e_post_act *post_act, struct mlx5_flow_attr *post_attr); void mlx5e_tc_post_act_del(struct mlx5e_post_act *post_act, struct mlx5e_post_act_handle *handle); From 8ac04a28144cfa89b61be518268233742c23d88d Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 24 Mar 2023 17:52:17 +0100 Subject: [PATCH 06/33] net/mlx5e: Release the label when replacing existing ct entry Cited commit doesn't release the label mapping when replacing existing ct entry which leads to following memleak report: unreferenced object 0xffff8881854cf280 (size 96): comm "kworker/u48:74", pid 23093, jiffies 4296664564 (age 175.944s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<000000002722d368>] __kmalloc+0x4b/0x1c0 [<00000000cc44e18f>] mapping_add+0x6e8/0xc90 [mlx5_core] [<000000003ad942a7>] mlx5_get_label_mapping+0x66/0xe0 [mlx5_core] [<00000000266308ac>] mlx5_tc_ct_entry_create_mod_hdr+0x1c4/0xf50 [mlx5_core] [<000000009a768b4f>] mlx5_tc_ct_entry_add_rule+0x16f/0xaf0 [mlx5_core] [<00000000a178f3e5>] mlx5_tc_ct_block_flow_offload_add+0x10cb/0x1f90 [mlx5_core] [<000000007b46c496>] mlx5_tc_ct_block_flow_offload+0x14a/0x630 [mlx5_core] [<00000000a9a18ac5>] nf_flow_offload_tuple+0x1a3/0x390 [nf_flow_table] [<00000000d0881951>] flow_offload_work_handler+0x257/0xd30 [nf_flow_table] [<000000009e4935a4>] process_one_work+0x7c2/0x13e0 [<00000000f5cd36a7>] worker_thread+0x59d/0xec0 [<00000000baed1daf>] kthread+0x28f/0x330 [<0000000063d282a4>] ret_from_fork+0x1f/0x30 Fix the issue by correctly releasing the label mapping. Fixes: 94ceffb48eac ("net/mlx5e: Implement CT entry update") Signed-off-by: Vlad Buslov Reviewed-by: Roi Dayan Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index 314983bc6f08..ee49bd2461e4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -920,6 +920,7 @@ mlx5_tc_ct_entry_replace_rule(struct mlx5_tc_ct_priv *ct_priv, zone_rule->rule = rule; mlx5_tc_ct_entry_destroy_mod_hdr(ct_priv, old_attr, zone_rule->mh); zone_rule->mh = mh; + mlx5_put_label_mapping(ct_priv, old_attr->ct_attr.ct_labels_id); kfree(old_attr); kvfree(spec); From fd745f4c0abe41ebb09d11bf622b054a0f3e7b49 Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Fri, 10 Mar 2023 11:06:48 +0200 Subject: [PATCH 07/33] net/mlx5: E-switch, Create per vport table based on devlink encap mode Currently when creating per vport table, create flags are hardcoded. Devlink encap mode is set based on user input and HW capability. Create per vport table based on devlink encap mode. Fixes: c796bb7cd230 ("net/mlx5: E-switch, Generalize per vport table API") Signed-off-by: Chris Mi Reviewed-by: Roi Dayan Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en/tc/sample.c | 4 ++-- .../net/ethernet/mellanox/mlx5/core/esw/vporttbl.c | 12 +++++++++++- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 2 +- .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 2 +- 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/sample.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/sample.c index 558a776359af..5db239cae814 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/sample.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/sample.c @@ -14,10 +14,10 @@ #define MLX5_ESW_VPORT_TBL_SIZE_SAMPLE (64 * 1024) -static const struct esw_vport_tbl_namespace mlx5_esw_vport_tbl_sample_ns = { +static struct esw_vport_tbl_namespace mlx5_esw_vport_tbl_sample_ns = { .max_fte = MLX5_ESW_VPORT_TBL_SIZE_SAMPLE, .max_num_groups = 0, /* default num of groups */ - .flags = MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT | MLX5_FLOW_TABLE_TUNNEL_EN_DECAP, + .flags = 0, }; struct mlx5e_tc_psample { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/vporttbl.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/vporttbl.c index 9e72118f2e4c..749c3957a128 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/vporttbl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/vporttbl.c @@ -11,7 +11,7 @@ struct mlx5_vport_key { u16 prio; u16 vport; u16 vhca_id; - const struct esw_vport_tbl_namespace *vport_ns; + struct esw_vport_tbl_namespace *vport_ns; } __packed; struct mlx5_vport_table { @@ -21,6 +21,14 @@ struct mlx5_vport_table { struct mlx5_vport_key key; }; +static void +esw_vport_tbl_init(struct mlx5_eswitch *esw, struct esw_vport_tbl_namespace *ns) +{ + if (esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE) + ns->flags |= (MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT | + MLX5_FLOW_TABLE_TUNNEL_EN_DECAP); +} + static struct mlx5_flow_table * esw_vport_tbl_create(struct mlx5_eswitch *esw, struct mlx5_flow_namespace *ns, const struct esw_vport_tbl_namespace *vport_ns) @@ -80,6 +88,7 @@ mlx5_esw_vporttbl_get(struct mlx5_eswitch *esw, struct mlx5_vport_tbl_attr *attr u32 hkey; mutex_lock(&esw->fdb_table.offloads.vports.lock); + esw_vport_tbl_init(esw, attr->vport_ns); hkey = flow_attr_to_vport_key(esw, attr, &skey); e = esw_vport_tbl_lookup(esw, &skey, hkey); if (e) { @@ -127,6 +136,7 @@ mlx5_esw_vporttbl_put(struct mlx5_eswitch *esw, struct mlx5_vport_tbl_attr *attr u32 hkey; mutex_lock(&esw->fdb_table.offloads.vports.lock); + esw_vport_tbl_init(esw, attr->vport_ns); hkey = flow_attr_to_vport_key(esw, attr, &key); e = esw_vport_tbl_lookup(esw, &key, hkey); if (!e || --e->num_rules) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 19e9a77c4633..9d5a5756a15a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -674,7 +674,7 @@ struct mlx5_vport_tbl_attr { u32 chain; u16 prio; u16 vport; - const struct esw_vport_tbl_namespace *vport_ns; + struct esw_vport_tbl_namespace *vport_ns; }; struct mlx5_flow_table * diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 25a8076a77bf..706746cd10af 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -73,7 +73,7 @@ #define MLX5_ESW_FT_OFFLOADS_DROP_RULE (1) -static const struct esw_vport_tbl_namespace mlx5_esw_vport_tbl_mirror_ns = { +static struct esw_vport_tbl_namespace mlx5_esw_vport_tbl_mirror_ns = { .max_fte = MLX5_ESW_VPORT_TBL_SIZE, .max_num_groups = MLX5_ESW_VPORT_TBL_NUM_GROUPS, .flags = 0, From 4c8189302567f75099a336b0efcff8291ec86ff4 Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Fri, 10 Mar 2023 09:56:08 +0200 Subject: [PATCH 08/33] net/mlx5: E-switch, Don't destroy indirect table in split rule Source port rewrite (forward to ovs internal port or statck device) isn't supported in the rule of split action. So there is no indirect table in split rule. The cited commit destroyes indirect table in split rule. The indirect table for other rules will be destroyed wrongly. It will cause traffic loss. Fix it by removing the destroy function in split rule. And also remove the destroy function in error flow. Fixes: 10742efc20a4 ("net/mlx5e: VF tunnel TX traffic offloading") Signed-off-by: Chris Mi Reviewed-by: Roi Dayan Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 706746cd10af..c99d208722f5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -760,7 +760,6 @@ mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw, kfree(dest); return rule; err_chain_src_rewrite: - esw_put_dest_tables_loop(esw, attr, 0, i); mlx5_esw_vporttbl_put(esw, &fwd_attr); err_get_fwd: mlx5_chains_put_table(chains, attr->chain, attr->prio, 0); @@ -803,7 +802,6 @@ __mlx5_eswitch_del_rule(struct mlx5_eswitch *esw, if (fwd_rule) { mlx5_esw_vporttbl_put(esw, &fwd_attr); mlx5_chains_put_table(chains, attr->chain, attr->prio, 0); - esw_put_dest_tables_loop(esw, attr, 0, esw_attr->split_count); } else { if (split) mlx5_esw_vporttbl_put(esw, &fwd_attr); From 4fbef0f8ea6350eaea89b1e3175f9325252913ac Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Tue, 4 Apr 2023 03:45:04 +0300 Subject: [PATCH 09/33] net/mlx5: Release tunnel device after tc update skb The cited commit causes a regression. Tunnel device is not released after tc update skb if skb needs to be freed. The following error message will be printed: unregister_netdevice: waiting for vxlan1 to become free. Usage count = 11 Fix it by releasing tunnel device if skb needs to be freed. Fixes: 93a1ab2c545b ("net/mlx5: Refactor tc miss handling to a single function") Signed-off-by: Chris Mi Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c index 8f7452dc00ee..668fdee9cf05 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c @@ -715,5 +715,6 @@ forward: return; free_skb: + dev_put(tc_priv.fwd_dev); dev_kfree_skb_any(skb); } From 0a6b069cc60d68d33b4f6e7dd7f1adc3ec749766 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Wed, 29 Mar 2023 15:24:32 +0300 Subject: [PATCH 10/33] net/mlx5e: Fix error flow in representor failing to add vport rx rule On representor init rx error flow the flow steering pointer is being released so mlx5e_attach_netdev() doesn't have a valid fs pointer in its error flow. Make sure the pointer is nullified when released and add a check in mlx5e_fs_cleanup() to verify fs is not null as representor cleanup callback would be called anyway. Fixes: af8bbf730068 ("net/mlx5e: Convert mlx5e_flow_steering member of mlx5e_priv to pointer") Signed-off-by: Roi Dayan Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_fs.c | 2 ++ drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 2 ++ 3 files changed, 5 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c index 05796f8b1d7c..f1dac0244958 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c @@ -1490,6 +1490,8 @@ err: void mlx5e_fs_cleanup(struct mlx5e_flow_steering *fs) { + if (!fs) + return; debugfs_remove_recursive(fs->dfs_root); mlx5e_fs_ethtool_free(fs); mlx5e_fs_tc_free(fs); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 7ca7e9b57607..579c2d217fdc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5270,6 +5270,7 @@ static void mlx5e_nic_cleanup(struct mlx5e_priv *priv) mlx5e_health_destroy_reporters(priv); mlx5e_ktls_cleanup(priv); mlx5e_fs_cleanup(priv->fs); + priv->fs = NULL; } static int mlx5e_init_nic_rx(struct mlx5e_priv *priv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 8ff654b4e9e1..6e18d91c3d76 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -828,6 +828,7 @@ static int mlx5e_init_ul_rep(struct mlx5_core_dev *mdev, static void mlx5e_cleanup_rep(struct mlx5e_priv *priv) { mlx5e_fs_cleanup(priv->fs); + priv->fs = NULL; } static int mlx5e_create_rep_ttc_table(struct mlx5e_priv *priv) @@ -994,6 +995,7 @@ err_close_drop_rq: priv->rx_res = NULL; err_free_fs: mlx5e_fs_cleanup(priv->fs); + priv->fs = NULL; return err; } From 21608a2cf38e9f743636b5f86507ffbca18ecee7 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sun, 9 Apr 2023 10:44:31 +0300 Subject: [PATCH 11/33] Revert "net/mlx5: Remove "recovery" arg from mlx5_load_one() function" This reverts commit 5977ac3910f1cbaf44dca48179118b25c206ac29. Revert this patch as we need the "recovery" arg back in mlx5_load_one() function. This arg will be used in the next patch for using recovery timeout during sync reset flow. Signed-off-by: Moshe Shemesh Reviewed-by: Maher Sanalla Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/main.c | 9 +++++---- drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c index 4c2dad9d7cfb..289e915def98 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c @@ -167,7 +167,7 @@ static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev) if (mlx5_health_wait_pci_up(dev)) mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n"); else - mlx5_load_one(dev); + mlx5_load_one(dev, false); devlink_remote_reload_actions_performed(priv_to_devlink(dev), 0, BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) | BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index f1de152a6113..ad90bf125e94 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1509,13 +1509,13 @@ out: return err; } -int mlx5_load_one(struct mlx5_core_dev *dev) +int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery) { struct devlink *devlink = priv_to_devlink(dev); int ret; devl_lock(devlink); - ret = mlx5_load_one_devl_locked(dev, false); + ret = mlx5_load_one_devl_locked(dev, recovery); devl_unlock(devlink); return ret; } @@ -1912,7 +1912,8 @@ static void mlx5_pci_resume(struct pci_dev *pdev) mlx5_pci_trace(dev, "Enter, loading driver..\n"); - err = mlx5_load_one(dev); + err = mlx5_load_one(dev, false); + if (!err) devlink_health_reporter_state_update(dev->priv.health.fw_fatal_reporter, DEVLINK_HEALTH_REPORTER_STATE_HEALTHY); @@ -2003,7 +2004,7 @@ static int mlx5_resume(struct pci_dev *pdev) { struct mlx5_core_dev *dev = pci_get_drvdata(pdev); - return mlx5_load_one(dev); + return mlx5_load_one(dev, false); } static const struct pci_device_id mlx5_core_pci_table[] = { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index be0785f83083..a3c5c2dab5fd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -321,7 +321,7 @@ int mlx5_init_one(struct mlx5_core_dev *dev); void mlx5_uninit_one(struct mlx5_core_dev *dev); void mlx5_unload_one(struct mlx5_core_dev *dev, bool suspend); void mlx5_unload_one_devl_locked(struct mlx5_core_dev *dev, bool suspend); -int mlx5_load_one(struct mlx5_core_dev *dev); +int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery); int mlx5_load_one_devl_locked(struct mlx5_core_dev *dev, bool recovery); int mlx5_vport_set_other_func_cap(struct mlx5_core_dev *dev, const void *hca_cap, u16 function_id, From dfad99750c0f83b0242572a573afa2c055f85b36 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sun, 2 Apr 2023 06:49:53 +0300 Subject: [PATCH 12/33] net/mlx5: Use recovery timeout on sync reset flow Use the same timeout for sync reset flow and health recovery flow, since the former involves driver's recovery from firmware reset, which is similar to health recovery. Otherwise, in some cases, such as a firmware upgrade on the DPU, the firmware pre-init bit may not be ready within current timeout and the driver will abort loading back after reset. Signed-off-by: Moshe Shemesh Fixes: 37ca95e62ee2 ("net/mlx5: Increase FW pre-init timeout for health recovery") Reviewed-by: Maher Sanalla Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index c5d2fdcabd56..e5f03d071a37 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -202,7 +202,7 @@ static int mlx5_devlink_reload_up(struct devlink *devlink, enum devlink_reload_a break; /* On fw_activate action, also driver is reloaded and reinit performed */ *actions_performed |= BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT); - ret = mlx5_load_one_devl_locked(dev, false); + ret = mlx5_load_one_devl_locked(dev, true); break; default: /* Unsupported action should not get to this function */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c index 289e915def98..50022e7565f1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c @@ -167,7 +167,7 @@ static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev) if (mlx5_health_wait_pci_up(dev)) mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n"); else - mlx5_load_one(dev, false); + mlx5_load_one(dev, true); devlink_remote_reload_actions_performed(priv_to_devlink(dev), 0, BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) | BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE)); @@ -499,7 +499,7 @@ int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev) err = fw_reset->ret; if (test_and_clear_bit(MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED, &fw_reset->reset_flags)) { mlx5_unload_one_devl_locked(dev, false); - mlx5_load_one_devl_locked(dev, false); + mlx5_load_one_devl_locked(dev, true); } out: clear_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags); From 1b540decd03acd736693aabb6cb46c71fca38ae6 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Sun, 12 Mar 2023 16:37:36 +0200 Subject: [PATCH 13/33] net/mlx5e: Nullify table pointer when failing to create On failing to create promisc flow steering table, the pointer is returned with an error. Nullify it so unloading the driver won't try to destroy a non existing table. Failing to create promisc table may happen over BF devices when the ARM side is going through a firmware tear down. The host side start a reload flow. While the driver unloads, it tries to remove the promisc table. Remove WARN in this state as it is a valid error flow. Fixes: 1c46d7409f30 ("net/mlx5e: Optimize promiscuous mode") Signed-off-by: Aya Levin Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_fs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c index f1dac0244958..33bfe4d7338b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c @@ -783,6 +783,7 @@ static int mlx5e_create_promisc_table(struct mlx5e_flow_steering *fs) ft->t = mlx5_create_auto_grouped_flow_table(fs->ns, &ft_attr); if (IS_ERR(ft->t)) { err = PTR_ERR(ft->t); + ft->t = NULL; fs_err(fs, "fail to create promisc table err=%d\n", err); return err; } @@ -810,7 +811,7 @@ static void mlx5e_del_promisc_rule(struct mlx5e_flow_steering *fs) static void mlx5e_destroy_promisc_table(struct mlx5e_flow_steering *fs) { - if (WARN(!fs->promisc.ft.t, "Trying to remove non-existing promiscuous table")) + if (!fs->promisc.ft.t) return; mlx5e_del_promisc_rule(fs); mlx5_destroy_flow_table(fs->promisc.ft.t); From 081abcacaf0a9505c0d4cc9780b12e6ce5d33630 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 13 Apr 2023 15:06:59 +0200 Subject: [PATCH 14/33] Revert "net/mlx5e: Don't use termination table when redundant" This reverts commit 14624d7247fcd0f3114a6f5f17b3c8d1020fbbb7. The termination table usage is requires for DMFS steering mode as firmware doesn't support mixed table destinations list which causes following syndrome with hairpin rules: [81922.283225] mlx5_core 0000:08:00.0: mlx5_cmd_out_err:803:(pid 25977): SET_FLOW_TABLE_ENTRY(0x936) op_mod(0x0) failed, status bad parameter(0x3), syndrome (0xaca205), err(-22) Fixes: 14624d7247fc ("net/mlx5e: Don't use termination table when redundant") Signed-off-by: Vlad Buslov Reviewed-by: Roi Dayan Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- .../mlx5/core/eswitch_offloads_termtbl.c | 32 +++---------------- 1 file changed, 4 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c index 3a9a6bb9158d..edd910258314 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c @@ -210,18 +210,6 @@ static bool mlx5_eswitch_offload_is_uplink_port(const struct mlx5_eswitch *esw, return (port_mask & port_value) == MLX5_VPORT_UPLINK; } -static bool -mlx5_eswitch_is_push_vlan_no_cap(struct mlx5_eswitch *esw, - struct mlx5_flow_act *flow_act) -{ - if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH && - !(mlx5_fs_get_capabilities(esw->dev, MLX5_FLOW_NAMESPACE_FDB) & - MLX5_FLOW_STEERING_CAP_VLAN_PUSH_ON_RX)) - return true; - - return false; -} - bool mlx5_eswitch_termtbl_required(struct mlx5_eswitch *esw, struct mlx5_flow_attr *attr, @@ -237,7 +225,10 @@ mlx5_eswitch_termtbl_required(struct mlx5_eswitch *esw, (!mlx5_eswitch_offload_is_uplink_port(esw, spec) && !esw_attr->int_port)) return false; - if (mlx5_eswitch_is_push_vlan_no_cap(esw, flow_act)) + /* push vlan on RX */ + if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH && + !(mlx5_fs_get_capabilities(esw->dev, MLX5_FLOW_NAMESPACE_FDB) & + MLX5_FLOW_STEERING_CAP_VLAN_PUSH_ON_RX)) return true; /* hairpin */ @@ -261,31 +252,19 @@ mlx5_eswitch_add_termtbl_rule(struct mlx5_eswitch *esw, struct mlx5_flow_act term_tbl_act = {}; struct mlx5_flow_handle *rule = NULL; bool term_table_created = false; - bool is_push_vlan_on_rx; int num_vport_dests = 0; int i, curr_dest; - is_push_vlan_on_rx = mlx5_eswitch_is_push_vlan_no_cap(esw, flow_act); mlx5_eswitch_termtbl_actions_move(flow_act, &term_tbl_act); term_tbl_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; for (i = 0; i < num_dest; i++) { struct mlx5_termtbl_handle *tt; - bool hairpin = false; /* only vport destinations can be terminated */ if (dest[i].type != MLX5_FLOW_DESTINATION_TYPE_VPORT) continue; - if (attr->dests[num_vport_dests].rep && - attr->dests[num_vport_dests].rep->vport == MLX5_VPORT_UPLINK) - hairpin = true; - - if (!is_push_vlan_on_rx && !hairpin) { - num_vport_dests++; - continue; - } - if (attr->dests[num_vport_dests].flags & MLX5_ESW_DEST_ENCAP) { term_tbl_act.action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; term_tbl_act.pkt_reformat = attr->dests[num_vport_dests].pkt_reformat; @@ -333,9 +312,6 @@ revert_changes: for (curr_dest = 0; curr_dest < num_vport_dests; curr_dest++) { struct mlx5_termtbl_handle *tt = attr->dests[curr_dest].termtbl; - if (!tt) - continue; - attr->dests[curr_dest].termtbl = NULL; /* search for the destination associated with the From e375b503943f512cdd3a7dd12849972b7a006076 Mon Sep 17 00:00:00 2001 From: Jeroen de Borst Date: Wed, 19 Apr 2023 14:05:58 -0700 Subject: [PATCH 15/33] gve: update MAINTAINERS This reflects role changes in our team. Signed-off-by: Jeroen de Borst Signed-off-by: Praveen Kaligineedi Link: https://lore.kernel.org/r/20230419210558.1893400-1-jeroendb@google.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index c6545eb54104..eefcea6f8757 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8690,7 +8690,7 @@ F: drivers/input/touchscreen/goodix* GOOGLE ETHERNET DRIVERS M: Jeroen de Borst -M: Catherine Sullivan +M: Praveen Kaligineedi R: Shailend Chand L: netdev@vger.kernel.org S: Supported From 461bb5b97049a149278f2c27a3aa12af16da6a2e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 20 Apr 2023 15:36:07 +0300 Subject: [PATCH 16/33] net: dpaa: Fix uninitialized variable in dpaa_stop() The return value is not initialized on the success path. Fixes: 901bdff2f529 ("net: fman: Change return type of disable to void") Signed-off-by: Dan Carpenter Acked-by: Madalin Bucur Reviewed-by: Sean Anderson Link: https://lore.kernel.org/r/8c9dc377-8495-495f-a4e5-4d2d0ee12f0c@kili.mountain Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 9318a2554056..f96196617121 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -299,7 +299,8 @@ static int dpaa_stop(struct net_device *net_dev) { struct mac_device *mac_dev; struct dpaa_priv *priv; - int i, err, error; + int i, error; + int err = 0; priv = netdev_priv(net_dev); mac_dev = priv->mac_dev; From 63cfd210034c772fad047afa13dd5a4664b0a72e Mon Sep 17 00:00:00 2001 From: Huanhuan Wang Date: Thu, 20 Apr 2023 16:01:25 +0200 Subject: [PATCH 17/33] nfp: fix incorrect pointer deference when offloading IPsec with bonding There are two pointers in struct xfrm_dev_offload, *dev, *real_dev. The *dev points whether bonding interface or real interface, if bonding IPsec offload is used, it points bonding interface; if not, it points real interface. And *real_dev always points real interface. So nfp should always use real_dev instead of dev. Prior to this change the system becomes unresponsive when offloading IPsec for a device which is a lower device to a bonding device. Fixes: 859a497fe80c ("nfp: implement xfrm callbacks and expose ipsec offload feature to upper layer") CC: stable@vger.kernel.org Signed-off-by: Huanhuan Wang Acked-by: Simon Horman Signed-off-by: Louis Peens Link: https://lore.kernel.org/r/20230420140125.38521-1-louis.peens@corigine.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/crypto/ipsec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c b/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c index c0dcce8ae437..b1f026b81dea 100644 --- a/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c +++ b/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c @@ -269,7 +269,7 @@ static void set_sha2_512hmac(struct nfp_ipsec_cfg_add_sa *cfg, int *trunc_len) static int nfp_net_xfrm_add_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { - struct net_device *netdev = x->xso.dev; + struct net_device *netdev = x->xso.real_dev; struct nfp_ipsec_cfg_mssg msg = {}; int i, key_len, trunc_len, err = 0; struct nfp_ipsec_cfg_add_sa *cfg; @@ -513,7 +513,7 @@ static void nfp_net_xfrm_del_state(struct xfrm_state *x) .cmd = NFP_IPSEC_CFG_MSSG_INV_SA, .sa_idx = x->xso.offload_handle - 1, }; - struct net_device *netdev = x->xso.dev; + struct net_device *netdev = x->xso.real_dev; struct nfp_net *nn; int err; From 7041101ff6c3073fd8f2e99920f535b111c929cb Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 20 Apr 2023 16:59:46 +0200 Subject: [PATCH 18/33] net/sched: sch_fq: fix integer overflow of "credit" if sch_fq is configured with "initial quantum" having values greater than INT_MAX, the first assignment of "credit" does signed integer overflow to a very negative value. In this situation, the syzkaller script provided by Cristoph triggers the CPU soft-lockup warning even with few sockets. It's not an infinite loop, but "credit" wasn't probably meant to be minus 2Gb for each new flow. Capping "initial quantum" to INT_MAX proved to fix the issue. v2: validation of "initial quantum" is done in fq_policy, instead of open coding in fq_change() _ suggested by Jakub Kicinski Reported-by: Christoph Paasch Link: https://github.com/multipath-tcp/mptcp_net-next/issues/377 Fixes: afe4fd062416 ("pkt_sched: fq: Fair Queue packet scheduler") Reviewed-by: Eric Dumazet Signed-off-by: Davide Caratti Link: https://lore.kernel.org/r/7b3a3c7e36d03068707a021760a194a8eb5ad41a.1682002300.git.dcaratti@redhat.com Signed-off-by: Jakub Kicinski --- net/sched/sch_fq.c | 6 ++++- .../tc-testing/tc-tests/qdiscs/fq.json | 22 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 48d14fb90ba0..f59a2cb2c803 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -779,13 +779,17 @@ static int fq_resize(struct Qdisc *sch, u32 log) return 0; } +static struct netlink_range_validation iq_range = { + .max = INT_MAX, +}; + static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_UNSPEC] = { .strict_start_type = TCA_FQ_TIMER_SLACK }, [TCA_FQ_PLIMIT] = { .type = NLA_U32 }, [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 }, [TCA_FQ_QUANTUM] = { .type = NLA_U32 }, - [TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 }, + [TCA_FQ_INITIAL_QUANTUM] = NLA_POLICY_FULL_RANGE(NLA_U32, &iq_range), [TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 }, [TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 }, [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq.json index 8acb904d1419..3593fb8f79ad 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq.json @@ -114,6 +114,28 @@ "$IP link del dev $DUMMY type dummy" ] }, + { + "id": "10f7", + "name": "Create FQ with invalid initial_quantum setting", + "category": [ + "qdisc", + "fq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root fq initial_quantum 0x80000000", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $DUMMY", + "matchPattern": "qdisc fq 1: root.*initial_quantum 2048Mb", + "matchCount": "0", + "teardown": [ + "$IP link del dev $DUMMY type dummy" + ] + }, { "id": "9398", "name": "Create FQ with maxrate setting", From 2cc8a008d62f3c04eeb7ec6fe59e542802bb8df3 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Thu, 20 Apr 2023 20:36:33 +0200 Subject: [PATCH 19/33] net/sched: cls_api: Initialize miss_cookie_node when action miss is not used Function tcf_exts_init_ex() sets exts->miss_cookie_node ptr only when use_action_miss is true so it assumes in other case that the field is set to NULL by the caller. If not then the field contains garbage and subsequent tcf_exts_destroy() call results in a crash. Ensure that the field .miss_cookie_node pointer is NULL when use_action_miss parameter is false to avoid this potential scenario. Fixes: 80cd22c35c90 ("net/sched: cls_api: Support hardware miss to tc action") Signed-off-by: Ivan Vecera Reviewed-by: Pedro Tammela Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230420183634.1139391-1-ivecera@redhat.com Signed-off-by: Jakub Kicinski --- net/sched/cls_api.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 35785a36c802..3c3629c9e7b6 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -3211,6 +3211,7 @@ int tcf_exts_init_ex(struct tcf_exts *exts, struct net *net, int action, #ifdef CONFIG_NET_CLS_ACT exts->type = 0; exts->nr_actions = 0; + exts->miss_cookie_node = NULL; /* Note: we do not own yet a reference on net. * This reference might be taken later from tcf_exts_get_net(). */ From 99e5acae193e369b71217efe6f1dad42f3f18815 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Thu, 20 Apr 2023 20:40:35 +0800 Subject: [PATCH 20/33] ipv4: Fix potential uninit variable access bug in __ip_make_skb() Like commit ea30388baebc ("ipv6: Fix an uninit variable access bug in __ip6_make_skb()"). icmphdr does not in skb linear region under the scenario of SOCK_RAW socket. Access icmp_hdr(skb)->type directly will trigger the uninit variable access bug. Use a local variable icmp_type to carry the correct value in different scenarios. Fixes: 96793b482540 ("[IPV4]: Add ICMPMsgStats MIB (RFC 4293)") Reviewed-by: Willem de Bruijn Signed-off-by: Ziyang Xuan Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 4e4e308c3230..89bd7872c629 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1570,9 +1570,19 @@ struct sk_buff *__ip_make_skb(struct sock *sk, cork->dst = NULL; skb_dst_set(skb, &rt->dst); - if (iph->protocol == IPPROTO_ICMP) - icmp_out_count(net, ((struct icmphdr *) - skb_transport_header(skb))->type); + if (iph->protocol == IPPROTO_ICMP) { + u8 icmp_type; + + /* For such sockets, transhdrlen is zero when do ip_append_data(), + * so icmphdr does not in skb linear region and can not get icmp_type + * by icmp_hdr(skb)->type. + */ + if (sk->sk_type == SOCK_RAW && !inet_sk(sk)->hdrincl) + icmp_type = fl4->fl4_icmp_type; + else + icmp_type = icmp_hdr(skb)->type; + icmp_out_count(net, icmp_type); + } ip_cork_release(cork); out: From cf88231d973909dc8d578138509973e062aba3d7 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Thu, 20 Apr 2023 17:04:23 +0100 Subject: [PATCH 21/33] dt-bindings: net: mediatek: add WED RX binding for MT7981 eth driver Add compatible string for mediatek,mt7981-wed as MT7981 also supports RX WED just like MT7986, but needs a different firmware file. Signed-off-by: Daniel Golle Acked-by: Krzysztof Kozlowski Signed-off-by: David S. Miller --- .../devicetree/bindings/arm/mediatek/mediatek,mt7622-wed.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt7622-wed.yaml b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt7622-wed.yaml index 5c223cb063d4..f7d578a171a4 100644 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt7622-wed.yaml +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt7622-wed.yaml @@ -20,6 +20,7 @@ properties: items: - enum: - mediatek,mt7622-wed + - mediatek,mt7981-wed - mediatek,mt7986-wed - const: syscon From 86ce0d09e42463816f13766ac286012ec7b71fdc Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Thu, 20 Apr 2023 17:05:05 +0100 Subject: [PATCH 22/33] net: ethernet: mtk_eth_soc: use WO firmware for MT7981 In order to support wireless offloading on MT7981 we need to load the appropriate firmware. Recognize MT7981 and load mt7981_wo.bin. Signed-off-by: Daniel Golle Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_wed_mcu.c | 7 ++++++- drivers/net/ethernet/mediatek/mtk_wed_wo.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mediatek/mtk_wed_mcu.c b/drivers/net/ethernet/mediatek/mtk_wed_mcu.c index 6bad0d262f28..071ed3dea860 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed_mcu.c +++ b/drivers/net/ethernet/mediatek/mtk_wed_mcu.c @@ -326,7 +326,11 @@ mtk_wed_mcu_load_firmware(struct mtk_wed_wo *wo) wo->hw->index + 1); /* load firmware */ - fw_name = wo->hw->index ? MT7986_FIRMWARE_WO1 : MT7986_FIRMWARE_WO0; + if (of_device_is_compatible(wo->hw->node, "mediatek,mt7981-wed")) + fw_name = MT7981_FIRMWARE_WO; + else + fw_name = wo->hw->index ? MT7986_FIRMWARE_WO1 : MT7986_FIRMWARE_WO0; + ret = request_firmware(&fw, fw_name, wo->hw->dev); if (ret) return ret; @@ -386,5 +390,6 @@ int mtk_wed_mcu_init(struct mtk_wed_wo *wo) 100, MTK_FW_DL_TIMEOUT); } +MODULE_FIRMWARE(MT7981_FIRMWARE_WO); MODULE_FIRMWARE(MT7986_FIRMWARE_WO0); MODULE_FIRMWARE(MT7986_FIRMWARE_WO1); diff --git a/drivers/net/ethernet/mediatek/mtk_wed_wo.h b/drivers/net/ethernet/mediatek/mtk_wed_wo.h index dbcf42ce9173..7a1a2a28f1ac 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed_wo.h +++ b/drivers/net/ethernet/mediatek/mtk_wed_wo.h @@ -88,6 +88,7 @@ enum mtk_wed_dummy_cr_idx { MTK_WED_DUMMY_CR_WO_STATUS, }; +#define MT7981_FIRMWARE_WO "mediatek/mt7981_wo.bin" #define MT7986_FIRMWARE_WO0 "mediatek/mt7986_wo_0.bin" #define MT7986_FIRMWARE_WO1 "mediatek/mt7986_wo_1.bin" From e0416e7d33361d2ad0bf9f007428346579ac854a Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Apr 2023 23:03:46 +0100 Subject: [PATCH 23/33] rxrpc: Fix potential race in error handling in afs_make_call() If the rxrpc call set up by afs_make_call() receives an error whilst it is transmitting the request, there's the possibility that it may get to the point the rxrpc call is ended (after the error_kill_call label) just as the call is queued for async processing. This could manifest itself as call->rxcall being seen as NULL in afs_deliver_to_call() when it tries to lock the call. Fix this by splitting rxrpc_kernel_end_call() into a function to shut down an rxrpc call and a function to release the caller's reference and calling the latter only when we get to afs_put_call(). Reported-by: Jeffrey Altman Signed-off-by: David Howells Tested-by: kafs-testing+fedora36_64checkkafs-build-306@auristor.com cc: Marc Dionne cc: "David S. Miller" cc: Eric Dumazet cc: Jakub Kicinski cc: Paolo Abeni cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- Documentation/networking/rxrpc.rst | 17 ++++++++++----- fs/afs/rxrpc.c | 9 ++++---- include/net/af_rxrpc.h | 3 ++- net/rxrpc/af_rxrpc.c | 35 ++++++++++++++++++++---------- net/rxrpc/rxperf.c | 3 ++- 5 files changed, 44 insertions(+), 23 deletions(-) diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst index ec1323d92c96..e807e18ba32a 100644 --- a/Documentation/networking/rxrpc.rst +++ b/Documentation/networking/rxrpc.rst @@ -848,14 +848,21 @@ The kernel interface functions are as follows: returned. The caller now holds a reference on this and it must be properly ended. - (#) End a client call:: + (#) Shut down a client call:: - void rxrpc_kernel_end_call(struct socket *sock, + void rxrpc_kernel_shutdown_call(struct socket *sock, + struct rxrpc_call *call); + + This is used to shut down a previously begun call. The user_call_ID is + expunged from AF_RXRPC's knowledge and will not be seen again in + association with the specified call. + + (#) Release the ref on a client call:: + + void rxrpc_kernel_put_call(struct socket *sock, struct rxrpc_call *call); - This is used to end a previously begun call. The user_call_ID is expunged - from AF_RXRPC's knowledge and will not be seen again in association with - the specified call. + This is used to release the caller's ref on an rxrpc call. (#) Send data through a call:: diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 7817e2b860e5..e08b850c3e6d 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -179,7 +179,8 @@ void afs_put_call(struct afs_call *call) ASSERT(call->type->name != NULL); if (call->rxcall) { - rxrpc_kernel_end_call(net->socket, call->rxcall); + rxrpc_kernel_shutdown_call(net->socket, call->rxcall); + rxrpc_kernel_put_call(net->socket, call->rxcall); call->rxcall = NULL; } if (call->type->destructor) @@ -420,10 +421,8 @@ error_kill_call: * The call, however, might be queued on afs_async_calls and we need to * make sure we don't get any more notifications that might requeue it. */ - if (call->rxcall) { - rxrpc_kernel_end_call(call->net->socket, call->rxcall); - call->rxcall = NULL; - } + if (call->rxcall) + rxrpc_kernel_shutdown_call(call->net->socket, call->rxcall); if (call->async) { if (cancel_work_sync(&call->async_work)) afs_put_call(call); diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index ba717eac0229..01a35e113ab9 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -57,7 +57,8 @@ int rxrpc_kernel_recv_data(struct socket *, struct rxrpc_call *, struct iov_iter *, size_t *, bool, u32 *, u16 *); bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, u32, int, enum rxrpc_abort_reason); -void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *); +void rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call); +void rxrpc_kernel_put_call(struct socket *sock, struct rxrpc_call *call); void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *, struct sockaddr_rxrpc *); bool rxrpc_kernel_get_srtt(struct socket *, struct rxrpc_call *, u32 *); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 102f5cbff91a..c32b164206f9 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -342,31 +342,44 @@ static void rxrpc_dummy_notify_rx(struct sock *sk, struct rxrpc_call *rxcall, } /** - * rxrpc_kernel_end_call - Allow a kernel service to end a call it was using + * rxrpc_kernel_shutdown_call - Allow a kernel service to shut down a call it was using * @sock: The socket the call is on * @call: The call to end * - * Allow a kernel service to end a call it was using. The call must be + * Allow a kernel service to shut down a call it was using. The call must be * complete before this is called (the call should be aborted if necessary). */ -void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call) +void rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call) { _enter("%d{%d}", call->debug_id, refcount_read(&call->ref)); mutex_lock(&call->user_mutex); - rxrpc_release_call(rxrpc_sk(sock->sk), call); + if (!test_bit(RXRPC_CALL_RELEASED, &call->flags)) { + rxrpc_release_call(rxrpc_sk(sock->sk), call); - /* Make sure we're not going to call back into a kernel service */ - if (call->notify_rx) { - spin_lock(&call->notify_lock); - call->notify_rx = rxrpc_dummy_notify_rx; - spin_unlock(&call->notify_lock); + /* Make sure we're not going to call back into a kernel service */ + if (call->notify_rx) { + spin_lock(&call->notify_lock); + call->notify_rx = rxrpc_dummy_notify_rx; + spin_unlock(&call->notify_lock); + } } - mutex_unlock(&call->user_mutex); +} +EXPORT_SYMBOL(rxrpc_kernel_shutdown_call); + +/** + * rxrpc_kernel_put_call - Release a reference to a call + * @sock: The socket the call is on + * @call: The call to put + * + * Drop the application's ref on an rxrpc call. + */ +void rxrpc_kernel_put_call(struct socket *sock, struct rxrpc_call *call) +{ rxrpc_put_call(call, rxrpc_call_put_kernel); } -EXPORT_SYMBOL(rxrpc_kernel_end_call); +EXPORT_SYMBOL(rxrpc_kernel_put_call); /** * rxrpc_kernel_check_life - Check to see whether a call is still alive diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index 4a2e90015ca7..085e7892d310 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -342,7 +342,8 @@ static void rxperf_deliver_to_call(struct work_struct *work) call_complete: rxperf_set_call_complete(call, ret, remote_abort); /* The call may have been requeued */ - rxrpc_kernel_end_call(rxperf_socket, call->rxcall); + rxrpc_kernel_shutdown_call(rxperf_socket, call->rxcall); + rxrpc_kernel_put_call(rxperf_socket, call->rxcall); cancel_work(&call->work); kfree(call); } From b148b9abc8444c9e0579a6bbe47b2da6adf5a150 Mon Sep 17 00:00:00 2001 From: Wang Zhang Date: Fri, 21 Apr 2023 23:10:09 +0800 Subject: [PATCH 24/33] net: ethernet: mediatek: remove return value check of `mtk_wed_hw_add_debugfs` Smatch complains that: mtk_wed_hw_add_debugfs() warn: 'dir' is an error pointer or valid Debugfs checks are generally not supposed to be checked for errors and it is not necessary here. fix it by just deleting the dead code. Signed-off-by: Wang Zhang Reviewed-by: Dongliang Mu Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_wed_debugfs.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_wed_debugfs.c b/drivers/net/ethernet/mediatek/mtk_wed_debugfs.c index 56f663439721..b244c02c5b51 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed_debugfs.c +++ b/drivers/net/ethernet/mediatek/mtk_wed_debugfs.c @@ -252,8 +252,6 @@ void mtk_wed_hw_add_debugfs(struct mtk_wed_hw *hw) snprintf(hw->dirname, sizeof(hw->dirname), "wed%d", hw->index); dir = debugfs_create_dir(hw->dirname, NULL); - if (!dir) - return; hw->debugfs_dir = dir; debugfs_create_u32("regidx", 0600, dir, &hw->debugfs_reg); From fadfc57cc8047080a123b16f288b7138524fb1e2 Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Fri, 21 Apr 2023 17:16:17 +0100 Subject: [PATCH 25/33] rxrpc: Fix error when reading rxrpc tokens When converting from ASSERTCMP to WARN_ON, the tested condition must be inverted, which was missed for this case. This would cause an EIO error when trying to read an rxrpc token, for instance when trying to display tokens with AuriStor's "tokens" command. Fixes: 84924aac08a4 ("rxrpc: Fix checker warning") Signed-off-by: Marc Dionne Signed-off-by: David Howells cc: "David S. Miller" cc: Eric Dumazet cc: Jakub Kicinski cc: Paolo Abeni cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- net/rxrpc/key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 8d53aded09c4..33e8302a79e3 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -680,7 +680,7 @@ static long rxrpc_read(const struct key *key, return -ENOPKG; } - if (WARN_ON((unsigned long)xdr - (unsigned long)oldxdr == + if (WARN_ON((unsigned long)xdr - (unsigned long)oldxdr != toksize)) return -EIO; } From 807cfded92b0a2e8be9c078c693075790c7c4131 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Sat, 22 Apr 2023 12:56:09 -0300 Subject: [PATCH 26/33] net/sched: sch_htb: use extack on errors messages Some error messages are still being printed to dmesg. Since extack is available, provide error messages there. Reviewed-by: Simon Horman Acked-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 92f2975b6a82..8aef7dd9fb88 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1786,7 +1786,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, goto failure; err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy, - NULL); + extack); if (err < 0) goto failure; @@ -1858,7 +1858,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, /* check maximal depth */ if (parent && parent->parent && parent->parent->level < 2) { - pr_err("htb: tree is too deep\n"); + NL_SET_ERR_MSG_MOD(extack, "tree is too deep"); goto failure; } err = -ENOBUFS; @@ -1917,8 +1917,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, }; err = htb_offload(dev, &offload_opt); if (err) { - pr_err("htb: TC_HTB_LEAF_ALLOC_QUEUE failed with err = %d\n", - err); + NL_SET_ERR_MSG_WEAK(extack, + "Failed to offload TC_HTB_LEAF_ALLOC_QUEUE"); goto err_kill_estimator; } dev_queue = netdev_get_tx_queue(dev, offload_opt.qid); @@ -1937,8 +1937,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, }; err = htb_offload(dev, &offload_opt); if (err) { - pr_err("htb: TC_HTB_LEAF_TO_INNER failed with err = %d\n", - err); + NL_SET_ERR_MSG_WEAK(extack, + "Failed to offload TC_HTB_LEAF_TO_INNER"); htb_graft_helper(dev_queue, old_q); goto err_kill_estimator; } @@ -2067,8 +2067,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, qdisc_put(parent_qdisc); if (warn) - pr_warn("HTB: quantum of class %X is %s. Consider r2q change.\n", - cl->common.classid, (warn == -1 ? "small" : "big")); + NL_SET_ERR_MSG_FMT_MOD(extack, + "quantum of class %X is %s. Consider r2q change.", + cl->common.classid, (warn == -1 ? "small" : "big")); qdisc_class_hash_grow(sch, &q->clhash); From c69a9b023f65b73068a17f2f4eb6a1b6e257f5bf Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Sat, 22 Apr 2023 12:56:10 -0300 Subject: [PATCH 27/33] net/sched: sch_qfq: use extack on errors messages Some error messages are still being printed to dmesg. Since extack is available, provide error messages there. Reviewed-by: Simon Horman Acked-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Signed-off-by: David S. Miller --- net/sched/sch_qfq.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 02098a02943e..abcc48087831 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -402,8 +402,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, int err; int delta_w; - if (tca[TCA_OPTIONS] == NULL) { - pr_notice("qfq: no options\n"); + if (NL_REQ_ATTR_CHECK(extack, NULL, tca, TCA_OPTIONS)) { + NL_SET_ERR_MSG_MOD(extack, "missing options"); return -EINVAL; } @@ -442,8 +442,9 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, delta_w = weight - (cl ? cl->agg->class_weight : 0); if (q->wsum + delta_w > QFQ_MAX_WSUM) { - pr_notice("qfq: total weight out of range (%d + %u)\n", - delta_w, q->wsum); + NL_SET_ERR_MSG_FMT_MOD(extack, + "total weight out of range (%d + %u)\n", + delta_w, q->wsum); return -EINVAL; } From 25369891fcef373540f8b4e0b3bccf77a04490d5 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Sat, 22 Apr 2023 12:56:11 -0300 Subject: [PATCH 28/33] net/sched: sch_qfq: refactor parsing of netlink parameters Two parameters can be transformed into netlink policies and validated while parsing the netlink message. Reviewed-by: Simon Horman Acked-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Signed-off-by: David S. Miller --- net/sched/sch_qfq.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index abcc48087831..dfd9a99e6257 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -113,6 +113,7 @@ #define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */ #define QFQ_MIN_LMAX 512 /* see qfq_slot_insert */ +#define QFQ_MAX_LMAX (1UL << QFQ_MTU_SHIFT) #define QFQ_MAX_AGG_CLASSES 8 /* max num classes per aggregate allowed */ @@ -214,9 +215,14 @@ static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid) return container_of(clc, struct qfq_class, common); } +static struct netlink_range_validation lmax_range = { + .min = QFQ_MIN_LMAX, + .max = QFQ_MAX_LMAX, +}; + static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = { - [TCA_QFQ_WEIGHT] = { .type = NLA_U32 }, - [TCA_QFQ_LMAX] = { .type = NLA_U32 }, + [TCA_QFQ_WEIGHT] = NLA_POLICY_RANGE(NLA_U32, 1, QFQ_MAX_WEIGHT), + [TCA_QFQ_LMAX] = NLA_POLICY_FULL_RANGE(NLA_U32, &lmax_range), }; /* @@ -408,17 +414,13 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } err = nla_parse_nested_deprecated(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], - qfq_policy, NULL); + qfq_policy, extack); if (err < 0) return err; - if (tb[TCA_QFQ_WEIGHT]) { + if (tb[TCA_QFQ_WEIGHT]) weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]); - if (!weight || weight > (1UL << QFQ_MAX_WSHIFT)) { - pr_notice("qfq: invalid weight %u\n", weight); - return -EINVAL; - } - } else + else weight = 1; if (tb[TCA_QFQ_LMAX]) @@ -426,11 +428,6 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, else lmax = psched_mtu(qdisc_dev(sch)); - if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) { - pr_notice("qfq: invalid max length %u\n", lmax); - return -EINVAL; - } - inv_w = ONE_FP / weight; weight = ONE_FP / inv_w; From 7eb060a51a3ba44acefafefd242dcbf12dd91fb9 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Sat, 22 Apr 2023 12:56:12 -0300 Subject: [PATCH 29/33] selftests: tc-testing: add more tests for sch_qfq The QFQ qdisc class has parameter bounds that are not being checked for correctness. Acked-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Signed-off-by: David S. Miller --- .../tc-testing/tc-tests/qdiscs/qfq.json | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/qfq.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/qfq.json index 330f1a25e0ab..147899a868d3 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/qfq.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/qfq.json @@ -46,6 +46,30 @@ "$IP link del dev $DUMMY type dummy" ] }, + { + "id": "d364", + "name": "Test QFQ with max class weight setting", + "category": [ + "qdisc", + "qfq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true", + "$TC qdisc add dev $DUMMY handle 1: root qfq" + ], + "cmdUnderTest": "$TC class add dev $DUMMY parent 1: classid 1:1 qfq weight 9999", + "expExitCode": "2", + "verifyCmd": "$TC class show dev $DUMMY", + "matchPattern": "class qfq 1:1 root weight 9999 maxpkt", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP link del dev $DUMMY type dummy" + ] + }, { "id": "8452", "name": "Create QFQ with class maxpkt setting", @@ -70,6 +94,54 @@ "$IP link del dev $DUMMY type dummy" ] }, + { + "id": "22df", + "name": "Test QFQ class maxpkt setting lower bound", + "category": [ + "qdisc", + "qfq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true", + "$TC qdisc add dev $DUMMY handle 1: root qfq" + ], + "cmdUnderTest": "$TC class add dev $DUMMY parent 1: classid 1:1 qfq maxpkt 128", + "expExitCode": "2", + "verifyCmd": "$TC class show dev $DUMMY", + "matchPattern": "class qfq 1:1 root weight 1 maxpkt 128", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP link del dev $DUMMY type dummy" + ] + }, + { + "id": "92ee", + "name": "Test QFQ class maxpkt setting upper bound", + "category": [ + "qdisc", + "qfq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true", + "$TC qdisc add dev $DUMMY handle 1: root qfq" + ], + "cmdUnderTest": "$TC class add dev $DUMMY parent 1: classid 1:1 qfq maxpkt 99999", + "expExitCode": "2", + "verifyCmd": "$TC class show dev $DUMMY", + "matchPattern": "class qfq 1:1 root weight 1 maxpkt 99999", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP link del dev $DUMMY type dummy" + ] + }, { "id": "d920", "name": "Create QFQ with multiple class setting", From 60fd497c99b336e466db0aaeff44c7d12a1fc10a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 24 Apr 2023 09:02:32 +1000 Subject: [PATCH 30/33] MAINTAINERS: Remove PPP maintainer I am not currently maintaining the kernel PPP code, so remove my address from the MAINTAINERS entry for it. Signed-off-by: Paul Mackerras Signed-off-by: David S. Miller --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index eefcea6f8757..623a4e76fd09 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16771,9 +16771,8 @@ F: include/uapi/linux/if_pppol2tp.h F: net/l2tp/l2tp_ppp.c PPP PROTOCOL DRIVERS AND COMPRESSORS -M: Paul Mackerras L: linux-ppp@vger.kernel.org -S: Maintained +S: Orphan F: drivers/net/ppp/ppp_* PPS SUPPORT From d913d32cc2707e9cd24fe6fa6d7d470e9c728980 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 21 Apr 2023 11:52:55 -0700 Subject: [PATCH 31/33] netlink: Use copy_to_user() for optval in netlink_getsockopt(). Brad Spencer provided a detailed report [0] that when calling getsockopt() for AF_NETLINK, some SOL_NETLINK options set only 1 byte even though such options require at least sizeof(int) as length. The options return a flag value that fits into 1 byte, but such behaviour confuses users who do not initialise the variable before calling getsockopt() and do not strictly check the returned value as char. Currently, netlink_getsockopt() uses put_user() to copy data to optlen and optval, but put_user() casts the data based on the pointer, char *optval. As a result, only 1 byte is set to optval. To avoid this behaviour, we need to use copy_to_user() or cast optval for put_user(). Note that this changes the behaviour on big-endian systems, but we document that the size of optval is int in the man page. $ man 7 netlink ... Socket options To set or get a netlink socket option, call getsockopt(2) to read or setsockopt(2) to write the option with the option level argument set to SOL_NETLINK. Unless otherwise noted, optval is a pointer to an int. Fixes: 9a4595bc7e67 ("[NETLINK]: Add set/getsockopt options to support more than 32 groups") Fixes: be0c22a46cfb ("netlink: add NETLINK_BROADCAST_ERROR socket option") Fixes: 38938bfe3489 ("netlink: add NETLINK_NO_ENOBUFS socket flag") Fixes: 0a6a3a23ea6e ("netlink: add NETLINK_CAP_ACK socket option") Fixes: 2d4bc93368f5 ("netlink: extended ACK reporting") Fixes: 89d35528d17d ("netlink: Add new socket option to enable strict checking on dumps") Reported-by: Brad Spencer Link: https://lore.kernel.org/netdev/ZD7VkNWFfp22kTDt@datsun.rim.net/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Johannes Berg Link: https://lore.kernel.org/r/20230421185255.94606-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/netlink/af_netlink.c | 75 ++++++++++++---------------------------- 1 file changed, 23 insertions(+), 52 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f365dfdd672d..9b6eb28e6e94 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1742,7 +1742,8 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); - int len, val, err; + unsigned int flag; + int len, val; if (level != SOL_NETLINK) return -ENOPROTOOPT; @@ -1754,39 +1755,17 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, switch (optname) { case NETLINK_PKTINFO: - if (len < sizeof(int)) - return -EINVAL; - len = sizeof(int); - val = nlk->flags & NETLINK_F_RECV_PKTINFO ? 1 : 0; - if (put_user(len, optlen) || - put_user(val, optval)) - return -EFAULT; - err = 0; + flag = NETLINK_F_RECV_PKTINFO; break; case NETLINK_BROADCAST_ERROR: - if (len < sizeof(int)) - return -EINVAL; - len = sizeof(int); - val = nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR ? 1 : 0; - if (put_user(len, optlen) || - put_user(val, optval)) - return -EFAULT; - err = 0; + flag = NETLINK_F_BROADCAST_SEND_ERROR; break; case NETLINK_NO_ENOBUFS: - if (len < sizeof(int)) - return -EINVAL; - len = sizeof(int); - val = nlk->flags & NETLINK_F_RECV_NO_ENOBUFS ? 1 : 0; - if (put_user(len, optlen) || - put_user(val, optval)) - return -EFAULT; - err = 0; + flag = NETLINK_F_RECV_NO_ENOBUFS; break; case NETLINK_LIST_MEMBERSHIPS: { - int pos, idx, shift; + int pos, idx, shift, err = 0; - err = 0; netlink_lock_table(); for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) { if (len - pos < sizeof(u32)) @@ -1803,40 +1782,32 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, if (put_user(ALIGN(nlk->ngroups / 8, sizeof(u32)), optlen)) err = -EFAULT; netlink_unlock_table(); - break; + return err; } case NETLINK_CAP_ACK: - if (len < sizeof(int)) - return -EINVAL; - len = sizeof(int); - val = nlk->flags & NETLINK_F_CAP_ACK ? 1 : 0; - if (put_user(len, optlen) || - put_user(val, optval)) - return -EFAULT; - err = 0; + flag = NETLINK_F_CAP_ACK; break; case NETLINK_EXT_ACK: - if (len < sizeof(int)) - return -EINVAL; - len = sizeof(int); - val = nlk->flags & NETLINK_F_EXT_ACK ? 1 : 0; - if (put_user(len, optlen) || put_user(val, optval)) - return -EFAULT; - err = 0; + flag = NETLINK_F_EXT_ACK; break; case NETLINK_GET_STRICT_CHK: - if (len < sizeof(int)) - return -EINVAL; - len = sizeof(int); - val = nlk->flags & NETLINK_F_STRICT_CHK ? 1 : 0; - if (put_user(len, optlen) || put_user(val, optval)) - return -EFAULT; - err = 0; + flag = NETLINK_F_STRICT_CHK; break; default: - err = -ENOPROTOOPT; + return -ENOPROTOOPT; } - return err; + + if (len < sizeof(int)) + return -EINVAL; + + len = sizeof(int); + val = nlk->flags & flag ? 1 : 0; + + if (put_user(len, optlen) || + copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; } static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) From d325c34d9e7e38d371c0a299d415e9b07f66a1fb Mon Sep 17 00:00:00 2001 From: Gencen Gan Date: Mon, 24 Apr 2023 23:28:01 +0800 Subject: [PATCH 32/33] net: amd: Fix link leak when verifying config failed After failing to verify configuration, it returns directly without releasing link, which may cause memory leak. Paolo Abeni thinks that the whole code of this driver is quite "suboptimal" and looks unmainatained since at least ~15y, so he suggests that we could simply remove the whole driver, please take it into consideration. Simon Horman suggests that the fix label should be set to "Linux-2.6.12-rc2" considering that the problem has existed since the driver was introduced and the commit above doesn't seem to exist in net/net-next. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Gan Gecen Reviewed-by: Dongliang Mu Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/nmclan_cs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amd/nmclan_cs.c b/drivers/net/ethernet/amd/nmclan_cs.c index 823a329a921f..0dd391c84c13 100644 --- a/drivers/net/ethernet/amd/nmclan_cs.c +++ b/drivers/net/ethernet/amd/nmclan_cs.c @@ -651,7 +651,7 @@ static int nmclan_config(struct pcmcia_device *link) } else { pr_notice("mace id not found: %x %x should be 0x40 0x?9\n", sig[0], sig[1]); - return -ENODEV; + goto failed; } } From 50749f2dd6854a41830996ad302aef2ffaf011d8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 24 Apr 2023 15:20:22 -0700 Subject: [PATCH 33/33] tcp/udp: Fix memleaks of sk and zerocopy skbs with TX timestamp. syzkaller reported [0] memory leaks of an UDP socket and ZEROCOPY skbs. We can reproduce the problem with these sequences: sk = socket(AF_INET, SOCK_DGRAM, 0) sk.setsockopt(SOL_SOCKET, SO_TIMESTAMPING, SOF_TIMESTAMPING_TX_SOFTWARE) sk.setsockopt(SOL_SOCKET, SO_ZEROCOPY, 1) sk.sendto(b'', MSG_ZEROCOPY, ('127.0.0.1', 53)) sk.close() sendmsg() calls msg_zerocopy_alloc(), which allocates a skb, sets skb->cb->ubuf.refcnt to 1, and calls sock_hold(). Here, struct ubuf_info_msgzc indirectly holds a refcnt of the socket. When the skb is sent, __skb_tstamp_tx() clones it and puts the clone into the socket's error queue with the TX timestamp. When the original skb is received locally, skb_copy_ubufs() calls skb_unclone(), and pskb_expand_head() increments skb->cb->ubuf.refcnt. This additional count is decremented while freeing the skb, but struct ubuf_info_msgzc still has a refcnt, so __msg_zerocopy_callback() is not called. The last refcnt is not released unless we retrieve the TX timestamped skb by recvmsg(). Since we clear the error queue in inet_sock_destruct() after the socket's refcnt reaches 0, there is a circular dependency. If we close() the socket holding such skbs, we never call sock_put() and leak the count, sk, and skb. TCP has the same problem, and commit e0c8bccd40fc ("net: stream: purge sk_error_queue in sk_stream_kill_queues()") tried to fix it by calling skb_queue_purge() during close(). However, there is a small chance that skb queued in a qdisc or device could be put into the error queue after the skb_queue_purge() call. In __skb_tstamp_tx(), the cloned skb should not have a reference to the ubuf to remove the circular dependency, but skb_clone() does not call skb_copy_ubufs() for zerocopy skb. So, we need to call skb_orphan_frags_rx() for the cloned skb to call skb_copy_ubufs(). [0]: BUG: memory leak unreferenced object 0xffff88800c6d2d00 (size 1152): comm "syz-executor392", pid 264, jiffies 4294785440 (age 13.044s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 cd af e8 81 00 00 00 00 ................ 02 00 07 40 00 00 00 00 00 00 00 00 00 00 00 00 ...@............ backtrace: [<0000000055636812>] sk_prot_alloc+0x64/0x2a0 net/core/sock.c:2024 [<0000000054d77b7a>] sk_alloc+0x3b/0x800 net/core/sock.c:2083 [<0000000066f3c7e0>] inet_create net/ipv4/af_inet.c:319 [inline] [<0000000066f3c7e0>] inet_create+0x31e/0xe40 net/ipv4/af_inet.c:245 [<000000009b83af97>] __sock_create+0x2ab/0x550 net/socket.c:1515 [<00000000b9b11231>] sock_create net/socket.c:1566 [inline] [<00000000b9b11231>] __sys_socket_create net/socket.c:1603 [inline] [<00000000b9b11231>] __sys_socket_create net/socket.c:1588 [inline] [<00000000b9b11231>] __sys_socket+0x138/0x250 net/socket.c:1636 [<000000004fb45142>] __do_sys_socket net/socket.c:1649 [inline] [<000000004fb45142>] __se_sys_socket net/socket.c:1647 [inline] [<000000004fb45142>] __x64_sys_socket+0x73/0xb0 net/socket.c:1647 [<0000000066999e0e>] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [<0000000066999e0e>] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80 [<0000000017f238c1>] entry_SYSCALL_64_after_hwframe+0x63/0xcd BUG: memory leak unreferenced object 0xffff888017633a00 (size 240): comm "syz-executor392", pid 264, jiffies 4294785440 (age 13.044s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 2d 6d 0c 80 88 ff ff .........-m..... backtrace: [<000000002b1c4368>] __alloc_skb+0x229/0x320 net/core/skbuff.c:497 [<00000000143579a6>] alloc_skb include/linux/skbuff.h:1265 [inline] [<00000000143579a6>] sock_omalloc+0xaa/0x190 net/core/sock.c:2596 [<00000000be626478>] msg_zerocopy_alloc net/core/skbuff.c:1294 [inline] [<00000000be626478>] msg_zerocopy_realloc+0x1ce/0x7f0 net/core/skbuff.c:1370 [<00000000cbfc9870>] __ip_append_data+0x2adf/0x3b30 net/ipv4/ip_output.c:1037 [<0000000089869146>] ip_make_skb+0x26c/0x2e0 net/ipv4/ip_output.c:1652 [<00000000098015c2>] udp_sendmsg+0x1bac/0x2390 net/ipv4/udp.c:1253 [<0000000045e0e95e>] inet_sendmsg+0x10a/0x150 net/ipv4/af_inet.c:819 [<000000008d31bfde>] sock_sendmsg_nosec net/socket.c:714 [inline] [<000000008d31bfde>] sock_sendmsg+0x141/0x190 net/socket.c:734 [<0000000021e21aa4>] __sys_sendto+0x243/0x360 net/socket.c:2117 [<00000000ac0af00c>] __do_sys_sendto net/socket.c:2129 [inline] [<00000000ac0af00c>] __se_sys_sendto net/socket.c:2125 [inline] [<00000000ac0af00c>] __x64_sys_sendto+0xe1/0x1c0 net/socket.c:2125 [<0000000066999e0e>] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [<0000000066999e0e>] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80 [<0000000017f238c1>] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: f214f915e7db ("tcp: enable MSG_ZEROCOPY") Fixes: b5947e5d1e71 ("udp: msg_zerocopy") Reported-by: syzbot Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4c0879798eb8..2f9bb98170ab 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5162,6 +5162,9 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, skb = alloc_skb(0, GFP_ATOMIC); } else { skb = skb_clone(orig_skb, GFP_ATOMIC); + + if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) + return; } if (!skb) return;