diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 4150f74c521a..f86b5e1623c6 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -365,6 +365,15 @@ new netns has been created. Default : 0 (for compatibility reasons) +txrehash +-------- + +Controls default hash rethink behaviour on listening socket when SO_TXREHASH +option is set to SOCK_TXREHASH_DEFAULT (i. e. not overridden by setsockopt). + +If set to 1 (default), hash rethink is performed on listening socket. +If set to 0, hash rethink is not performed. + 2. /proc/sys/net/unix - Parameters for Unix domain sockets ---------------------------------------------------------- diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 284d28755b8d..7d81535893af 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -133,6 +133,8 @@ #define SO_RESERVE_MEM 73 +#define SO_TXREHASH 74 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 24e0efb360f6..1d55e57b8466 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -144,6 +144,8 @@ #define SO_RESERVE_MEM 73 +#define SO_TXREHASH 74 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 845ddc63c882..654061e0964e 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -125,6 +125,8 @@ #define SO_RESERVE_MEM 0x4047 +#define SO_TXREHASH 0x4048 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 2672dd03faf3..666f81e617ea 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -126,6 +126,8 @@ #define SO_RESERVE_MEM 0x0052 +#define SO_TXREHASH 0x0053 + #if !defined(__KERNEL__) diff --git a/include/net/netns/core.h b/include/net/netns/core.h index 552bc25b1933..388244e315e7 100644 --- a/include/net/netns/core.h +++ b/include/net/netns/core.h @@ -10,6 +10,7 @@ struct netns_core { struct ctl_table_header *sysctl_hdr; int sysctl_somaxconn; + u8 sysctl_txrehash; #ifdef CONFIG_PROC_FS struct prot_inuse __percpu *prot_inuse; diff --git a/include/net/sock.h b/include/net/sock.h index ff9b508d9c5f..d6c13f0fba40 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -316,6 +316,7 @@ struct sk_filter; * @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_sndtimeo: %SO_SNDTIMEO setting * @sk_txhash: computed flow hash for use on transmit + * @sk_txrehash: enable TX hash rethink * @sk_filter: socket filtering instructions * @sk_timer: sock cleanup timer * @sk_stamp: time stamp of last packet received @@ -491,6 +492,7 @@ struct sock { u32 sk_ack_backlog; u32 sk_max_ack_backlog; kuid_t sk_uid; + u8 sk_txrehash; #ifdef CONFIG_NET_RX_BUSY_POLL u8 sk_prefer_busy_poll; u16 sk_busy_poll_budget; @@ -587,6 +589,18 @@ static inline bool sk_user_data_is_nocopy(const struct sock *sk) __tmp | SK_USER_DATA_NOCOPY); \ }) +static inline +struct net *sock_net(const struct sock *sk) +{ + return read_pnet(&sk->sk_net); +} + +static inline +void sock_net_set(struct sock *sk, struct net *net) +{ + write_pnet(&sk->sk_net, net); +} + /* * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK * or not whether his port will be reused by someone else. SK_FORCE_REUSE @@ -2054,7 +2068,7 @@ static inline void sk_set_txhash(struct sock *sk) static inline bool sk_rethink_txhash(struct sock *sk) { - if (sk->sk_txhash) { + if (sk->sk_txhash && sk->sk_txrehash == SOCK_TXREHASH_ENABLED) { sk_set_txhash(sk); return true; } @@ -2704,18 +2718,6 @@ static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) __kfree_skb(skb); } -static inline -struct net *sock_net(const struct sock *sk) -{ - return read_pnet(&sk->sk_net); -} - -static inline -void sock_net_set(struct sock *sk, struct net *net) -{ - write_pnet(&sk->sk_net, net); -} - static inline bool skb_sk_is_prefetched(struct sk_buff *skb) { diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index c77a1313b3b0..467ca2f28760 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -128,6 +128,8 @@ #define SO_RESERVE_MEM 73 +#define SO_TXREHASH 74 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) diff --git a/include/uapi/linux/socket.h b/include/uapi/linux/socket.h index eb0a9a5b6e71..51d6bb2f6765 100644 --- a/include/uapi/linux/socket.h +++ b/include/uapi/linux/socket.h @@ -31,4 +31,8 @@ struct __kernel_sockaddr_storage { #define SOCK_BUF_LOCK_MASK (SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK) +#define SOCK_TXREHASH_DEFAULT ((u8)-1) +#define SOCK_TXREHASH_DISABLED 0 +#define SOCK_TXREHASH_ENABLED 1 + #endif /* _UAPI_LINUX_SOCKET_H */ diff --git a/net/core/filter.c b/net/core/filter.c index a06931c27eeb..9615ae1ab530 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5091,6 +5091,13 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, case SO_REUSEPORT: sk->sk_reuseport = valbool; break; + case SO_TXREHASH: + if (val < -1 || val > 1) { + ret = -EINVAL; + break; + } + sk->sk_txrehash = (u8)val; + break; default: ret = -EINVAL; } @@ -5269,6 +5276,9 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, case SO_REUSEPORT: *((int *)optval) = sk->sk_reuseport; break; + case SO_TXREHASH: + *((int *)optval) = sk->sk_txrehash; + break; default: goto err_clear; } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index c53d9aab38ab..8711350085d6 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -364,6 +364,8 @@ out_undo: static int __net_init net_defaults_init_net(struct net *net) { net->core.sysctl_somaxconn = SOMAXCONN; + net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; + return 0; } diff --git a/net/core/sock.c b/net/core/sock.c index cccf21f3618d..d6804685f17f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1447,6 +1447,15 @@ set_sndbuf: break; } + case SO_TXREHASH: + if (val < -1 || val > 1) { + ret = -EINVAL; + break; + } + /* Paired with READ_ONCE() in tcp_rtx_synack() */ + WRITE_ONCE(sk->sk_txrehash, (u8)val); + break; + default: ret = -ENOPROTOOPT; break; @@ -1834,6 +1843,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_reserved_mem; break; + case SO_TXREHASH: + v.val = sk->sk_txrehash; + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). @@ -3279,6 +3292,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_pacing_rate = ~0UL; WRITE_ONCE(sk->sk_pacing_shift, 10); sk->sk_incoming_cpu = -1; + sk->sk_txrehash = SOCK_TXREHASH_DEFAULT; sk_rx_queue_clear(sk); /* diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 7b4d485aac7a..dbeb8ecbcd98 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -593,6 +593,15 @@ static struct ctl_table netns_core_table[] = { .extra1 = SYSCTL_ZERO, .proc_handler = proc_dointvec_minmax }, + { + .procname = "txrehash", + .data = &init_net.core.sysctl_txrehash, + .maxlen = sizeof(u8), + .mode = 0644, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + .proc_handler = proc_dou8vec_minmax, + }, { } }; @@ -611,7 +620,7 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup); static __net_init int sysctl_core_net_init(struct net *net) { - struct ctl_table *tbl; + struct ctl_table *tbl, *tmp; tbl = netns_core_table; if (!net_eq(net, &init_net)) { @@ -619,7 +628,8 @@ static __net_init int sysctl_core_net_init(struct net *net) if (tbl == NULL) goto err_dup; - tbl[0].data = &net->core.sysctl_somaxconn; + for (tmp = tbl; tmp->procname; tmp++) + tmp->data += (char *)net - (char *)&init_net; /* Don't export any sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index fc2a985f6064..b81fb13fc5f4 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1046,6 +1046,9 @@ int inet_csk_listen_start(struct sock *sk) sk->sk_ack_backlog = 0; inet_csk_delack_init(sk); + if (sk->sk_txrehash == SOCK_TXREHASH_DEFAULT) + sk->sk_txrehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); + /* There is race window here: we announce ourselves listening, * but this transition is still not validated by get_port(). * It is OK, because this socket enters to hash table only diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 11c06b9db801..e76bf1e9251e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -4092,7 +4092,9 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) struct flowi fl; int res; - tcp_rsk(req)->txhash = net_tx_rndhash(); + /* Paired with WRITE_ONCE() in sock_setsockopt() */ + if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED) + tcp_rsk(req)->txhash = net_tx_rndhash(); res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL, NULL); if (!res) {