b261eda84e
Kazuho Oku reported that setsockopt(SO_INCOMING_CPU) does not work with setsockopt(SO_REUSEPORT) since v4.6. With the combination of SO_REUSEPORT and SO_INCOMING_CPU, we could build a highly efficient server application. setsockopt(SO_INCOMING_CPU) associates a CPU with a TCP listener or UDP socket, and then incoming packets processed on the CPU will likely be distributed to the socket. Technically, a socket could even receive packets handled on another CPU if no sockets in the reuseport group have the same CPU receiving the flow. The logic exists in compute_score() so that a socket will get a higher score if it has the same CPU with the flow. However, the score gets ignored after the blamed two commits, which introduced a faster socket selection algorithm for SO_REUSEPORT. This patch introduces a counter of sockets with SO_INCOMING_CPU in a reuseport group to check if we should iterate all sockets to find a proper one. We increment the counter when * calling listen() if the socket has SO_INCOMING_CPU and SO_REUSEPORT * enabling SO_INCOMING_CPU if the socket is in a reuseport group Also, we decrement it when * detaching a socket out of the group to apply SO_INCOMING_CPU to migrated TCP requests * disabling SO_INCOMING_CPU if the socket is in a reuseport group When the counter reaches 0, we can get back to the O(1) selection algorithm. The overall changes are negligible for the non-SO_INCOMING_CPU case, and the only notable thing is that we have to update sk_incomnig_cpu under reuseport_lock. Otherwise, the race prevents transitioning to the O(n) algorithm and results in the wrong socket selection. cpu1 (setsockopt) cpu2 (listen) +-----------------+ +-------------+ lock_sock(sk1) lock_sock(sk2) reuseport_update_incoming_cpu(sk1, val) . | /* set CPU as 0 */ |- WRITE_ONCE(sk1->incoming_cpu, val) | | spin_lock_bh(&reuseport_lock) | reuseport_grow(sk2, reuse) | . | |- more_socks_size = reuse->max_socks * 2U; | |- if (more_socks_size > U16_MAX && | | reuse->num_closed_socks) | | . | | |- RCU_INIT_POINTER(sk1->sk_reuseport_cb, NULL); | | `- __reuseport_detach_closed_sock(sk1, reuse) | | . | | `- reuseport_put_incoming_cpu(sk1, reuse) | | . | | | /* Read shutdown()ed sk1's sk_incoming_cpu | | | * without lock_sock(). | | | */ | | `- if (sk1->sk_incoming_cpu >= 0) | | . | | | /* decrement not-yet-incremented | | | * count, which is never incremented. | | | */ | | `- __reuseport_put_incoming_cpu(reuse); | | | `- spin_lock_bh(&reuseport_lock) | |- spin_lock_bh(&reuseport_lock) | |- reuse = rcu_dereference_protected(sk1->sk_reuseport_cb, ...) |- if (!reuse) | . | | /* Cannot increment reuse->incoming_cpu. */ | `- goto out; | `- spin_unlock_bh(&reuseport_lock) Fixes: e32ea7e74727 ("soreuseport: fast reuseport UDP socket selection") Fixes: c125e80b8868 ("soreuseport: fast reuseport TCP socket selection") Reported-by: Kazuho Oku <kazuhooku@gmail.com> Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
65 lines
1.8 KiB
C
65 lines
1.8 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _SOCK_REUSEPORT_H
|
|
#define _SOCK_REUSEPORT_H
|
|
|
|
#include <linux/filter.h>
|
|
#include <linux/skbuff.h>
|
|
#include <linux/types.h>
|
|
#include <linux/spinlock.h>
|
|
#include <net/sock.h>
|
|
|
|
extern spinlock_t reuseport_lock;
|
|
|
|
struct sock_reuseport {
|
|
struct rcu_head rcu;
|
|
|
|
u16 max_socks; /* length of socks */
|
|
u16 num_socks; /* elements in socks */
|
|
u16 num_closed_socks; /* closed elements in socks */
|
|
u16 incoming_cpu;
|
|
/* The last synq overflow event timestamp of this
|
|
* reuse->socks[] group.
|
|
*/
|
|
unsigned int synq_overflow_ts;
|
|
/* ID stays the same even after the size of socks[] grows. */
|
|
unsigned int reuseport_id;
|
|
unsigned int bind_inany:1;
|
|
unsigned int has_conns:1;
|
|
struct bpf_prog __rcu *prog; /* optional BPF sock selector */
|
|
struct sock *socks[]; /* array of sock pointers */
|
|
};
|
|
|
|
extern int reuseport_alloc(struct sock *sk, bool bind_inany);
|
|
extern int reuseport_add_sock(struct sock *sk, struct sock *sk2,
|
|
bool bind_inany);
|
|
extern void reuseport_detach_sock(struct sock *sk);
|
|
void reuseport_stop_listen_sock(struct sock *sk);
|
|
extern struct sock *reuseport_select_sock(struct sock *sk,
|
|
u32 hash,
|
|
struct sk_buff *skb,
|
|
int hdr_len);
|
|
struct sock *reuseport_migrate_sock(struct sock *sk,
|
|
struct sock *migrating_sk,
|
|
struct sk_buff *skb);
|
|
extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog);
|
|
extern int reuseport_detach_prog(struct sock *sk);
|
|
|
|
static inline bool reuseport_has_conns(struct sock *sk)
|
|
{
|
|
struct sock_reuseport *reuse;
|
|
bool ret = false;
|
|
|
|
rcu_read_lock();
|
|
reuse = rcu_dereference(sk->sk_reuseport_cb);
|
|
if (reuse && reuse->has_conns)
|
|
ret = true;
|
|
rcu_read_unlock();
|
|
|
|
return ret;
|
|
}
|
|
|
|
void reuseport_has_conns_set(struct sock *sk);
|
|
void reuseport_update_incoming_cpu(struct sock *sk, int val);
|
|
|
|
#endif /* _SOCK_REUSEPORT_H */
|