From cbc3a153222805d65f821e10f4f78b6afce06f86 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 22 Sep 2023 22:03:53 +0000
Subject: [PATCH 1/4] tcp_metrics: add missing barriers on delete

When removing an item from an RCU-protected list, we must prevent
store-tearing by using rcu_assign_pointer() or WRITE_ONCE().

Fixes: 04f721c671656 ("tcp_metrics: Rewrite tcp_metrics_flush_all")
Signed-off-by: Eric Dumazet
Reviewed-by: David Ahern
Acked-by: Neal Cardwell
Signed-off-by: Paolo Abeni
---
 net/ipv4/tcp_metrics.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index c196759f1d3b..4bfa2fb27de5 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -908,7 +908,7 @@ static void tcp_metrics_flush_all(struct net *net)
                         match = net ? net_eq(tm_net(tm), net) :
                                 !refcount_read(&tm_net(tm)->ns.count);
                         if (match) {
-                                *pp = tm->tcpm_next;
+                                rcu_assign_pointer(*pp, tm->tcpm_next);
                                 kfree_rcu(tm, rcu_head);
                         } else {
                                 pp = &tm->tcpm_next;
@@ -949,7 +949,7 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
                 if (addr_same(&tm->tcpm_daddr, &daddr) &&
                     (!src || addr_same(&tm->tcpm_saddr, &saddr)) &&
                     net_eq(tm_net(tm), net)) {
-                        *pp = tm->tcpm_next;
+                        rcu_assign_pointer(*pp, tm->tcpm_next);
                         kfree_rcu(tm, rcu_head);
                         found = true;
                 } else {
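
The pattern patch 1 enforces is generic: any unlink from an RCU-protected
singly linked list must publish the new link with a single, non-torn store,
because readers may be walking the chain without the writer-side lock. The
sketch below is not part of the series; it is a minimal, hypothetical
illustration of that pattern (struct item, item_del, head and list_lock are
made-up names), mirroring only the rcu_assign_pointer() + kfree_rcu()
pairing that the patch uses.

/* Hypothetical sketch, not code from the series. */
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct item {
        struct item __rcu *next;
        struct rcu_head rcu_head;
        int key;
};

static struct item __rcu *head;
static DEFINE_SPINLOCK(list_lock);

/* Writer-side removal; readers may traverse concurrently under rcu_read_lock(). */
static void item_del(int key)
{
        struct item __rcu **pp = &head;
        struct item *it;

        spin_lock(&list_lock);
        while ((it = rcu_dereference_protected(*pp, lockdep_is_held(&list_lock)))) {
                if (it->key == key) {
                        /* Single, non-torn store of the new link; a plain
                         * "*pp = it->next" could be torn by the compiler
                         * while readers walk the list.
                         */
                        rcu_assign_pointer(*pp, it->next);
                        /* Free only after pre-existing readers are done. */
                        kfree_rcu(it, rcu_head);
                } else {
                        pp = &it->next;
                }
        }
        spin_unlock(&list_lock);
}

As the commit message notes, WRITE_ONCE() would also do here: unlinking does
not publish newly initialized memory, so the release ordering of
rcu_assign_pointer() is not strictly required, only the single-store guarantee.
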
From 081480014a64a69d901f8ef1ffdd56d6085cf87e Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 22 Sep 2023 22:03:54 +0000
Subject: [PATCH 2/4] tcp_metrics: properly set tp->snd_ssthresh in tcp_init_metrics()

We need to set tp->snd_ssthresh to TCP_INFINITE_SSTHRESH in case
tcp_get_metrics() fails for some reason.

Fixes: 9ad7c049f0f7 ("tcp: RFC2988bis + taking RTT sample from 3WHS for the passive open side")
Signed-off-by: Eric Dumazet
Reviewed-by: David Ahern
Acked-by: Neal Cardwell
Signed-off-by: Paolo Abeni
---
 net/ipv4/tcp_metrics.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 4bfa2fb27de5..0c03f564878f 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -470,6 +470,10 @@ void tcp_init_metrics(struct sock *sk)
         u32 val, crtt = 0; /* cached RTT scaled by 8 */
 
         sk_dst_confirm(sk);
+        /* ssthresh may have been reduced unnecessarily during.
+         * 3WHS. Restore it back to its initial default.
+         */
+        tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
         if (!dst)
                 goto reset;
 
@@ -489,11 +493,6 @@ void tcp_init_metrics(struct sock *sk)
                 tp->snd_ssthresh = val;
                 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
                         tp->snd_ssthresh = tp->snd_cwnd_clamp;
-        } else {
-                /* ssthresh may have been reduced unnecessarily during.
-                 * 3WHS. Restore it back to its initial default.
-                 */
-                tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
         }
         val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
         if (val && tp->reordering != val)

From a135798e6e200ecb2f864cecca6d257ba278370c Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 22 Sep 2023 22:03:55 +0000
Subject: [PATCH 3/4] tcp_metrics: do not create an entry from tcp_init_metrics()

tcp_init_metrics() only wants to get metrics if they were previously
stored in the cache. Creating an entry adds useless cost, especially
when tcp_no_metrics_save is set.

Fixes: 51c5d0c4b169 ("tcp: Maintain dynamic metrics in local cache.")
Signed-off-by: Eric Dumazet
Reviewed-by: David Ahern
Acked-by: Neal Cardwell
Signed-off-by: Paolo Abeni
---
 net/ipv4/tcp_metrics.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 0c03f564878f..7aca12c59c18 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -478,7 +478,7 @@ void tcp_init_metrics(struct sock *sk)
                 goto reset;
 
         rcu_read_lock();
-        tm = tcp_get_metrics(sk, dst, true);
+        tm = tcp_get_metrics(sk, dst, false);
         if (!tm) {
                 rcu_read_unlock();
                 goto reset;

From 6532e257aa73645e28dee5b2232cc3c88be62083 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 22 Sep 2023 22:03:56 +0000
Subject: [PATCH 4/4] tcp_metrics: optimize tcp_metrics_flush_all()

This is inspired by several syzbot reports where
tcp_metrics_flush_all() was seen in the traces.

We can avoid acquiring tcp_metrics_lock for empty buckets,
and we should add one cond_resched() to break potential long loops.

Signed-off-by: Eric Dumazet
Reviewed-by: David Ahern
Acked-by: Neal Cardwell
Signed-off-by: Paolo Abeni
---
 net/ipv4/tcp_metrics.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 7aca12c59c18..c2a925538542 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -898,11 +898,13 @@ static void tcp_metrics_flush_all(struct net *net)
         unsigned int row;
 
         for (row = 0; row < max_rows; row++, hb++) {
-                struct tcp_metrics_block __rcu **pp;
+                struct tcp_metrics_block __rcu **pp = &hb->chain;
                 bool match;
 
+                if (!rcu_access_pointer(*pp))
+                        continue;
+
                 spin_lock_bh(&tcp_metrics_lock);
-                pp = &hb->chain;
                 for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
                         match = net ? net_eq(tm_net(tm), net) :
                                 !refcount_read(&tm_net(tm)->ns.count);
@@ -914,6 +916,7 @@ static void tcp_metrics_flush_all(struct net *net)
                         }
                 }
                 spin_unlock_bh(&tcp_metrics_lock);
+                cond_resched();
         }
 }
 
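
To make the shape of patch 4 easier to see outside the tcp_metrics context,
here is a hypothetical, self-contained sketch of the same flush loop
(struct bucket, struct item, table, table_lock, table_flush and NBUCKETS are
invented names): an unlocked rcu_access_pointer() peek skips empty buckets
entirely, and a cond_resched() between buckets keeps a full-table flush from
monopolizing the CPU.

/* Hypothetical sketch, not code from the series. */
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

#define NBUCKETS 1024U

struct item {
        struct item __rcu *next;
        struct rcu_head rcu_head;
};

struct bucket {
        struct item __rcu *chain;
};

static struct bucket table[NBUCKETS];
static DEFINE_SPINLOCK(table_lock);

static void table_flush(void)
{
        unsigned int row;

        for (row = 0; row < NBUCKETS; row++) {
                struct item __rcu **pp = &table[row].chain;
                struct item *it;

                /* Peek at the head without the lock: empty buckets are the
                 * common case, and taking the spinlock for them is overhead.
                 */
                if (!rcu_access_pointer(*pp))
                        continue;

                spin_lock_bh(&table_lock);
                while ((it = rcu_dereference_protected(*pp,
                                                       lockdep_is_held(&table_lock)))) {
                        rcu_assign_pointer(*pp, it->next);
                        kfree_rcu(it, rcu_head);
                }
                spin_unlock_bh(&table_lock);

                /* A large table can make this loop long; let other tasks
                 * run between buckets.
                 */
                cond_resched();
        }
}

rcu_access_pointer() is the right primitive for the empty check because the
value is only compared against NULL and never dereferenced; if the bucket
becomes non-empty right after the check, that entry was added after the
flush started and may legitimately survive it.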