From 41db7626b73210cb99eea9cf672dcfce58c68ab2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Dec 2023 19:29:38 +0000 Subject: [PATCH 1/2] inet: returns a bool from inet_sk_get_local_port_range() Change inet_sk_get_local_port_range() to return a boolean, telling the callers if the port range was provided by IP_LOCAL_PORT_RANGE socket option. Adds documentation while we are at it. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20231214192939.1962891-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 2 +- net/ipv4/inet_connection_sock.c | 21 ++++++++++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index b31be912489a..de0c69c57e3c 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -356,7 +356,7 @@ static inline void inet_get_local_port_range(const struct net *net, int *low, in *low = range & 0xffff; *high = range >> 16; } -void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high); +bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high); #ifdef CONFIG_SYSCTL static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 70be0f6fe879..bd325b029dd1 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -117,16 +117,25 @@ bool inet_rcv_saddr_any(const struct sock *sk) return !sk->sk_rcv_saddr; } -void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high) +/** + * inet_sk_get_local_port_range - fetch ephemeral ports range + * @sk: socket + * @low: pointer to low port + * @high: pointer to high port + * + * Fetch netns port range (/proc/sys/net/ipv4/ip_local_port_range) + * Range can be overridden if socket got IP_LOCAL_PORT_RANGE option. + * Returns true if IP_LOCAL_PORT_RANGE was set on this socket. + */ +bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high) { - const struct inet_sock *inet = inet_sk(sk); - const struct net *net = sock_net(sk); int lo, hi, sk_lo, sk_hi; + bool local_range = false; u32 sk_range; - inet_get_local_port_range(net, &lo, &hi); + inet_get_local_port_range(sock_net(sk), &lo, &hi); - sk_range = READ_ONCE(inet->local_port_range); + sk_range = READ_ONCE(inet_sk(sk)->local_port_range); if (unlikely(sk_range)) { sk_lo = sk_range & 0xffff; sk_hi = sk_range >> 16; @@ -135,10 +144,12 @@ void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high) lo = sk_lo; if (lo <= sk_hi && sk_hi <= hi) hi = sk_hi; + local_range = true; } *low = lo; *high = hi; + return local_range; } EXPORT_SYMBOL(inet_sk_get_local_port_range); From 207184853dbdb62d8b02c7a141d3297e94e33451 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Dec 2023 19:29:39 +0000 Subject: [PATCH 2/2] tcp/dccp: change source port selection at connect() time In commit 1580ab63fc9a ("tcp/dccp: better use of ephemeral ports in connect()") we added an heuristic to select even ports for connect() and odd ports for bind(). This was nice because no applications changes were needed. But it added more costs when all even ports are in use, when there are few listeners and many active connections. Since then, IP_LOCAL_PORT_RANGE has been added to permit an application to partition ephemeral port range at will. This patch extends the idea so that if IP_LOCAL_PORT_RANGE is set on a socket before accept(), port selection no longer favors even ports. This means that connect() can find a suitable source port faster, and applications can use a different split between connect() and bind() users. This should give more entropy to Toeplitz hash used in RSS: Using even ports was wasting one bit from the 16bit sport. A similar change can be done in inet_csk_find_open_port() if needed. Signed-off-by: Eric Dumazet Cc: Jakub Sitnicki Reviewed-by: Kuniyuki Iwashima Reviewed-by: Jason Xing Link: https://lore.kernel.org/r/20231214192939.1962891-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_hashtables.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index a532f749e477..9ff201bc4e6d 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -1012,7 +1012,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, bool tb_created = false; u32 remaining, offset; int ret, i, low, high; - int l3mdev; + bool local_ports; + int step, l3mdev; u32 index; if (port) { @@ -1024,10 +1025,12 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, l3mdev = inet_sk_bound_l3mdev(sk); - inet_sk_get_local_port_range(sk, &low, &high); + local_ports = inet_sk_get_local_port_range(sk, &low, &high); + step = local_ports ? 1 : 2; + high++; /* [32768, 60999] -> [32768, 61000[ */ remaining = high - low; - if (likely(remaining > 1)) + if (!local_ports && remaining > 1) remaining &= ~1U; get_random_sleepable_once(table_perturb, @@ -1040,10 +1043,11 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, /* In first pass we try ports of @low parity. * inet_csk_get_port() does the opposite choice. */ - offset &= ~1U; + if (!local_ports) + offset &= ~1U; other_parity_scan: port = low + offset; - for (i = 0; i < remaining; i += 2, port += 2) { + for (i = 0; i < remaining; i += step, port += step) { if (unlikely(port >= high)) port -= remaining; if (inet_is_local_reserved_port(net, port)) @@ -1083,10 +1087,11 @@ next_port: cond_resched(); } - offset++; - if ((offset & 1) && remaining > 1) - goto other_parity_scan; - + if (!local_ports) { + offset++; + if ((offset & 1) && remaining > 1) + goto other_parity_scan; + } return -EADDRNOTAVAIL; ok: @@ -1109,8 +1114,8 @@ ok: * on low contention the randomness is maximal and on high contention * it may be inexistent. */ - i = max_t(int, i, get_random_u32_below(8) * 2); - WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); + i = max_t(int, i, get_random_u32_below(8) * step); + WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step); /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, tb2, port);