From e187013abeb4c2a7ec8a4bb978844c7e92a1a6ec Mon Sep 17 00:00:00 2001 From: Akhmat Karakotov Date: Mon, 31 Jan 2022 16:31:21 +0300 Subject: [PATCH 1/5] txhash: Make rethinking txhash behavior configurable via sysctl Add a per ns sysctl that controls the txhash rethink behavior: net.core.txrehash. When enabled, the same behavior is retained, when disabled, rethink is not performed. Sysctl is enabled by default. Signed-off-by: Akhmat Karakotov Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/core.h | 1 + include/net/sock.h | 34 +++++++++++++++++++++------------- include/uapi/linux/socket.h | 3 +++ net/core/net_namespace.c | 2 ++ net/core/sysctl_net_core.c | 14 ++++++++++++-- 5 files changed, 39 insertions(+), 15 deletions(-) diff --git a/include/net/netns/core.h b/include/net/netns/core.h index 552bc25b1933..388244e315e7 100644 --- a/include/net/netns/core.h +++ b/include/net/netns/core.h @@ -10,6 +10,7 @@ struct netns_core { struct ctl_table_header *sysctl_hdr; int sysctl_somaxconn; + u8 sysctl_txrehash; #ifdef CONFIG_PROC_FS struct prot_inuse __percpu *prot_inuse; diff --git a/include/net/sock.h b/include/net/sock.h index ff9b508d9c5f..0540e1b2aeb0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -587,6 +587,18 @@ static inline bool sk_user_data_is_nocopy(const struct sock *sk) __tmp | SK_USER_DATA_NOCOPY); \ }) +static inline +struct net *sock_net(const struct sock *sk) +{ + return read_pnet(&sk->sk_net); +} + +static inline +void sock_net_set(struct sock *sk, struct net *net) +{ + write_pnet(&sk->sk_net, net); +} + /* * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK * or not whether his port will be reused by someone else. 
SK_FORCE_REUSE @@ -2054,10 +2066,18 @@ static inline void sk_set_txhash(struct sock *sk) static inline bool sk_rethink_txhash(struct sock *sk) { - if (sk->sk_txhash) { + u8 rehash; + + if (!sk->sk_txhash) + return false; + + rehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); + + if (rehash) { sk_set_txhash(sk); return true; } + return false; } @@ -2704,18 +2724,6 @@ static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) __kfree_skb(skb); } -static inline -struct net *sock_net(const struct sock *sk) -{ - return read_pnet(&sk->sk_net); -} - -static inline -void sock_net_set(struct sock *sk, struct net *net) -{ - write_pnet(&sk->sk_net, net); -} - static inline bool skb_sk_is_prefetched(struct sk_buff *skb) { diff --git a/include/uapi/linux/socket.h b/include/uapi/linux/socket.h index eb0a9a5b6e71..0accd6102ece 100644 --- a/include/uapi/linux/socket.h +++ b/include/uapi/linux/socket.h @@ -31,4 +31,7 @@ struct __kernel_sockaddr_storage { #define SOCK_BUF_LOCK_MASK (SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK) +#define SOCK_TXREHASH_DISABLED 0 +#define SOCK_TXREHASH_ENABLED 1 + #endif /* _UAPI_LINUX_SOCKET_H */ diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index c53d9aab38ab..8711350085d6 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -364,6 +364,8 @@ out_undo: static int __net_init net_defaults_init_net(struct net *net) { net->core.sysctl_somaxconn = SOMAXCONN; + net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; + return 0; } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 7b4d485aac7a..dbeb8ecbcd98 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -593,6 +593,15 @@ static struct ctl_table netns_core_table[] = { .extra1 = SYSCTL_ZERO, .proc_handler = proc_dointvec_minmax }, + { + .procname = "txrehash", + .data = &init_net.core.sysctl_txrehash, + .maxlen = sizeof(u8), + .mode = 0644, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + .proc_handler = 
proc_dou8vec_minmax, + }, { } }; @@ -611,7 +620,7 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup); static __net_init int sysctl_core_net_init(struct net *net) { - struct ctl_table *tbl; + struct ctl_table *tbl, *tmp; tbl = netns_core_table; if (!net_eq(net, &init_net)) { @@ -619,7 +628,8 @@ static __net_init int sysctl_core_net_init(struct net *net) if (tbl == NULL) goto err_dup; - tbl[0].data = &net->core.sysctl_somaxconn; + for (tmp = tbl; tmp->procname; tmp++) + tmp->data += (char *)net - (char *)&init_net; /* Don't export any sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) { From 26859240e4ee701e0379f08634957adaff67e43a Mon Sep 17 00:00:00 2001 From: Akhmat Karakotov Date: Mon, 31 Jan 2022 16:31:22 +0300 Subject: [PATCH 2/5] txhash: Add socket option to control TX hash rethink behavior Add the SO_TXREHASH socket option to control hash rethink behavior per socket. When default mode is set, sockets disable rehash at initialization and use sysctl option when entering listen state. setsockopt() overrides default behavior. Signed-off-by: Akhmat Karakotov Reviewed-by: Eric Dumazet Signed-off-by: David S. 
Miller --- arch/alpha/include/uapi/asm/socket.h | 2 ++ arch/mips/include/uapi/asm/socket.h | 2 ++ arch/parisc/include/uapi/asm/socket.h | 2 ++ arch/sparc/include/uapi/asm/socket.h | 2 ++ include/net/sock.h | 12 +++--------- include/uapi/asm-generic/socket.h | 2 ++ include/uapi/linux/socket.h | 1 + net/core/sock.c | 13 +++++++++++++ net/ipv4/inet_connection_sock.c | 3 +++ 9 files changed, 30 insertions(+), 9 deletions(-) diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 284d28755b8d..7d81535893af 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -133,6 +133,8 @@ #define SO_RESERVE_MEM 73 +#define SO_TXREHASH 74 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 24e0efb360f6..1d55e57b8466 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -144,6 +144,8 @@ #define SO_RESERVE_MEM 73 +#define SO_TXREHASH 74 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 845ddc63c882..654061e0964e 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -125,6 +125,8 @@ #define SO_RESERVE_MEM 0x4047 +#define SO_TXREHASH 0x4048 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 2672dd03faf3..666f81e617ea 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -126,6 +126,8 @@ #define SO_RESERVE_MEM 0x0052 +#define SO_TXREHASH 0x0053 + #if !defined(__KERNEL__) diff --git a/include/net/sock.h b/include/net/sock.h index 0540e1b2aeb0..d6c13f0fba40 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -316,6 +316,7 @@ struct sk_filter; * @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_sndtimeo: 
%SO_SNDTIMEO setting * @sk_txhash: computed flow hash for use on transmit + * @sk_txrehash: enable TX hash rethink * @sk_filter: socket filtering instructions * @sk_timer: sock cleanup timer * @sk_stamp: time stamp of last packet received @@ -491,6 +492,7 @@ struct sock { u32 sk_ack_backlog; u32 sk_max_ack_backlog; kuid_t sk_uid; + u8 sk_txrehash; #ifdef CONFIG_NET_RX_BUSY_POLL u8 sk_prefer_busy_poll; u16 sk_busy_poll_budget; @@ -2066,18 +2068,10 @@ static inline void sk_set_txhash(struct sock *sk) static inline bool sk_rethink_txhash(struct sock *sk) { - u8 rehash; - - if (!sk->sk_txhash) - return false; - - rehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); - - if (rehash) { + if (sk->sk_txhash && sk->sk_txrehash == SOCK_TXREHASH_ENABLED) { sk_set_txhash(sk); return true; } - return false; } diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index c77a1313b3b0..467ca2f28760 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -128,6 +128,8 @@ #define SO_RESERVE_MEM 73 +#define SO_TXREHASH 74 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) diff --git a/include/uapi/linux/socket.h b/include/uapi/linux/socket.h index 0accd6102ece..51d6bb2f6765 100644 --- a/include/uapi/linux/socket.h +++ b/include/uapi/linux/socket.h @@ -31,6 +31,7 @@ struct __kernel_sockaddr_storage { #define SOCK_BUF_LOCK_MASK (SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK) +#define SOCK_TXREHASH_DEFAULT ((u8)-1) #define SOCK_TXREHASH_DISABLED 0 #define SOCK_TXREHASH_ENABLED 1 diff --git a/net/core/sock.c b/net/core/sock.c index cccf21f3618d..5e711b42898f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1447,6 +1447,14 @@ set_sndbuf: break; } + case SO_TXREHASH: + if (val < -1 || val > 1) { + ret = -EINVAL; + break; + } + sk->sk_txrehash = (u8)val; + break; + default: ret = -ENOPROTOOPT; break; @@ -1834,6 +1842,10 @@ int sock_getsockopt(struct socket *sock, int level, int 
optname, v.val = sk->sk_reserved_mem; break; + case SO_TXREHASH: + v.val = sk->sk_txrehash; + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). @@ -3279,6 +3291,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_pacing_rate = ~0UL; WRITE_ONCE(sk->sk_pacing_shift, 10); sk->sk_incoming_cpu = -1; + sk->sk_txrehash = SOCK_TXREHASH_DEFAULT; sk_rx_queue_clear(sk); /* diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index fc2a985f6064..b81fb13fc5f4 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1046,6 +1046,9 @@ int inet_csk_listen_start(struct sock *sk) sk->sk_ack_backlog = 0; inet_csk_delack_init(sk); + if (sk->sk_txrehash == SOCK_TXREHASH_DEFAULT) + sk->sk_txrehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); + /* There is race window here: we announce ourselves listening, * but this transition is still not validated by get_port(). * It is OK, because this socket enters to hash table only From 2127324a7d4a205ceb0452512a3b8c0999b4e86e Mon Sep 17 00:00:00 2001 From: Akhmat Karakotov Date: Mon, 31 Jan 2022 16:31:23 +0300 Subject: [PATCH 3/5] txhash: Add txrehash sysctl description Update Documentation/admin-guide/sysctl/net.rst with txrehash usage description. Signed-off-by: Akhmat Karakotov Signed-off-by: David S. Miller --- Documentation/admin-guide/sysctl/net.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 4150f74c521a..f86b5e1623c6 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -365,6 +365,15 @@ new netns has been created. Default : 0 (for compatibility reasons) +txrehash +-------- + +Controls default hash rethink behaviour on listening socket when SO_TXREHASH +option is set to SOCK_TXREHASH_DEFAULT (i. e. not overridden by setsockopt). 
+ +If set to 1 (default), hash rethink is performed on listening socket. +If set to 0, hash rethink is not performed. + 2. /proc/sys/net/unix - Parameters for Unix domain sockets ---------------------------------------------------------- From e7b9bfd18476cd3de9a3819235f9221e59abc80a Mon Sep 17 00:00:00 2001 From: Akhmat Karakotov Date: Mon, 31 Jan 2022 16:31:24 +0300 Subject: [PATCH 4/5] bpf: Add SO_TXREHASH setsockopt Add bpf socket option to override rehash behaviour from userspace or from bpf. Signed-off-by: Akhmat Karakotov Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/filter.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index a06931c27eeb..9615ae1ab530 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5091,6 +5091,13 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, case SO_REUSEPORT: sk->sk_reuseport = valbool; break; + case SO_TXREHASH: + if (val < -1 || val > 1) { + ret = -EINVAL; + break; + } + sk->sk_txrehash = (u8)val; + break; default: ret = -EINVAL; } @@ -5269,6 +5276,9 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, case SO_REUSEPORT: *((int *)optval) = sk->sk_reuseport; break; + case SO_TXREHASH: + *((int *)optval) = sk->sk_txrehash; + break; default: goto err_clear; } From cb6cd2cec799356e5e2f75a8591894599a6ad49d Mon Sep 17 00:00:00 2001 From: Akhmat Karakotov Date: Mon, 31 Jan 2022 16:31:25 +0300 Subject: [PATCH 5/5] tcp: Change SYN ACK retransmit behaviour to account for rehash Disabling rehash behavior did not affect SYN ACK retransmits because the hash was forcefully changed, bypassing the sk_rethink_txhash() function. This patch adds a condition which checks for rehash mode before resetting the hash. Signed-off-by: Akhmat Karakotov Reviewed-by: Eric Dumazet Signed-off-by: David S.
Miller --- net/core/sock.c | 3 ++- net/ipv4/tcp_output.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index 5e711b42898f..d6804685f17f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1452,7 +1452,8 @@ set_sndbuf: ret = -EINVAL; break; } - sk->sk_txrehash = (u8)val; + /* Paired with READ_ONCE() in tcp_rtx_synack() */ + WRITE_ONCE(sk->sk_txrehash, (u8)val); break; default: diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 11c06b9db801..e76bf1e9251e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -4092,7 +4092,9 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) struct flowi fl; int res; - tcp_rsk(req)->txhash = net_tx_rndhash(); + /* Paired with WRITE_ONCE() in sock_setsockopt() */ + if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED) + tcp_rsk(req)->txhash = net_tx_rndhash(); res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL, NULL); if (!res) {