soreuseport: TCP/IPv4 implementation
Allow multiple listener sockets to bind to the same port. Motivation for soreuseport would be something like a web server binding to port 80 running with multiple threads, where each thread might have its own listener socket. This could be done as an alternative to other models: 1) have one listener thread which dispatches completed connections to workers. 2) accept on a single listener socket from multiple threads. In case #1 the listener thread can easily become the bottleneck with a high connection turn-over rate. In case #2, the proportion of connections accepted per thread tends to be uneven under high connection load (assuming a simple event loop: while (1) { accept(); process() }), because wakeup does not promote fairness among the sockets. We have seen the disproportion to be as high as a 3:1 ratio between the thread accepting the most connections and the one accepting the fewest. With so_reuseport the distribution is uniform. Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
055dc21a1d
commit
da5e36308d
@ -81,7 +81,9 @@ struct inet_bind_bucket {
|
|||||||
struct net *ib_net;
|
struct net *ib_net;
|
||||||
#endif
|
#endif
|
||||||
unsigned short port;
|
unsigned short port;
|
||||||
signed short fastreuse;
|
signed char fastreuse;
|
||||||
|
signed char fastreuseport;
|
||||||
|
kuid_t fastuid;
|
||||||
int num_owners;
|
int num_owners;
|
||||||
struct hlist_node node;
|
struct hlist_node node;
|
||||||
struct hlist_head owners;
|
struct hlist_head owners;
|
||||||
@ -257,15 +259,19 @@ extern void inet_unhash(struct sock *sk);
|
|||||||
|
|
||||||
extern struct sock *__inet_lookup_listener(struct net *net,
|
extern struct sock *__inet_lookup_listener(struct net *net,
|
||||||
struct inet_hashinfo *hashinfo,
|
struct inet_hashinfo *hashinfo,
|
||||||
|
const __be32 saddr,
|
||||||
|
const __be16 sport,
|
||||||
const __be32 daddr,
|
const __be32 daddr,
|
||||||
const unsigned short hnum,
|
const unsigned short hnum,
|
||||||
const int dif);
|
const int dif);
|
||||||
|
|
||||||
static inline struct sock *inet_lookup_listener(struct net *net,
|
static inline struct sock *inet_lookup_listener(struct net *net,
|
||||||
struct inet_hashinfo *hashinfo,
|
struct inet_hashinfo *hashinfo,
|
||||||
|
__be32 saddr, __be16 sport,
|
||||||
__be32 daddr, __be16 dport, int dif)
|
__be32 daddr, __be16 dport, int dif)
|
||||||
{
|
{
|
||||||
return __inet_lookup_listener(net, hashinfo, daddr, ntohs(dport), dif);
|
return __inet_lookup_listener(net, hashinfo, saddr, sport,
|
||||||
|
daddr, ntohs(dport), dif);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Socket demux engine toys. */
|
/* Socket demux engine toys. */
|
||||||
@ -358,7 +364,8 @@ static inline struct sock *__inet_lookup(struct net *net,
|
|||||||
struct sock *sk = __inet_lookup_established(net, hashinfo,
|
struct sock *sk = __inet_lookup_established(net, hashinfo,
|
||||||
saddr, sport, daddr, hnum, dif);
|
saddr, sport, daddr, hnum, dif);
|
||||||
|
|
||||||
return sk ? : __inet_lookup_listener(net, hashinfo, daddr, hnum, dif);
|
return sk ? : __inet_lookup_listener(net, hashinfo, saddr, sport,
|
||||||
|
daddr, hnum, dif);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline struct sock *inet_lookup(struct net *net,
|
static inline struct sock *inet_lookup(struct net *net,
|
||||||
|
@ -82,6 +82,7 @@ nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
|
|||||||
break;
|
break;
|
||||||
case NFT_LOOKUP_LISTENER:
|
case NFT_LOOKUP_LISTENER:
|
||||||
sk = inet_lookup_listener(net, &tcp_hashinfo,
|
sk = inet_lookup_listener(net, &tcp_hashinfo,
|
||||||
|
saddr, sport,
|
||||||
daddr, dport,
|
daddr, dport,
|
||||||
in->ifindex);
|
in->ifindex);
|
||||||
|
|
||||||
|
@ -59,6 +59,8 @@ int inet_csk_bind_conflict(const struct sock *sk,
|
|||||||
struct sock *sk2;
|
struct sock *sk2;
|
||||||
struct hlist_node *node;
|
struct hlist_node *node;
|
||||||
int reuse = sk->sk_reuse;
|
int reuse = sk->sk_reuse;
|
||||||
|
int reuseport = sk->sk_reuseport;
|
||||||
|
kuid_t uid = sock_i_uid((struct sock *)sk);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Unlike other sk lookup places we do not check
|
* Unlike other sk lookup places we do not check
|
||||||
@ -73,8 +75,11 @@ int inet_csk_bind_conflict(const struct sock *sk,
|
|||||||
(!sk->sk_bound_dev_if ||
|
(!sk->sk_bound_dev_if ||
|
||||||
!sk2->sk_bound_dev_if ||
|
!sk2->sk_bound_dev_if ||
|
||||||
sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
|
sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
|
||||||
if (!reuse || !sk2->sk_reuse ||
|
if ((!reuse || !sk2->sk_reuse ||
|
||||||
sk2->sk_state == TCP_LISTEN) {
|
sk2->sk_state == TCP_LISTEN) &&
|
||||||
|
(!reuseport || !sk2->sk_reuseport ||
|
||||||
|
(sk2->sk_state != TCP_TIME_WAIT &&
|
||||||
|
!uid_eq(uid, sock_i_uid(sk2))))) {
|
||||||
const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
|
const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
|
||||||
if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
|
if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
|
||||||
sk2_rcv_saddr == sk_rcv_saddr(sk))
|
sk2_rcv_saddr == sk_rcv_saddr(sk))
|
||||||
@ -106,6 +111,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
|
|||||||
int ret, attempts = 5;
|
int ret, attempts = 5;
|
||||||
struct net *net = sock_net(sk);
|
struct net *net = sock_net(sk);
|
||||||
int smallest_size = -1, smallest_rover;
|
int smallest_size = -1, smallest_rover;
|
||||||
|
kuid_t uid = sock_i_uid(sk);
|
||||||
|
|
||||||
local_bh_disable();
|
local_bh_disable();
|
||||||
if (!snum) {
|
if (!snum) {
|
||||||
@ -125,9 +131,12 @@ again:
|
|||||||
spin_lock(&head->lock);
|
spin_lock(&head->lock);
|
||||||
inet_bind_bucket_for_each(tb, node, &head->chain)
|
inet_bind_bucket_for_each(tb, node, &head->chain)
|
||||||
if (net_eq(ib_net(tb), net) && tb->port == rover) {
|
if (net_eq(ib_net(tb), net) && tb->port == rover) {
|
||||||
if (tb->fastreuse > 0 &&
|
if (((tb->fastreuse > 0 &&
|
||||||
sk->sk_reuse &&
|
sk->sk_reuse &&
|
||||||
sk->sk_state != TCP_LISTEN &&
|
sk->sk_state != TCP_LISTEN) ||
|
||||||
|
(tb->fastreuseport > 0 &&
|
||||||
|
sk->sk_reuseport &&
|
||||||
|
uid_eq(tb->fastuid, uid))) &&
|
||||||
(tb->num_owners < smallest_size || smallest_size == -1)) {
|
(tb->num_owners < smallest_size || smallest_size == -1)) {
|
||||||
smallest_size = tb->num_owners;
|
smallest_size = tb->num_owners;
|
||||||
smallest_rover = rover;
|
smallest_rover = rover;
|
||||||
@ -185,14 +194,17 @@ tb_found:
|
|||||||
if (sk->sk_reuse == SK_FORCE_REUSE)
|
if (sk->sk_reuse == SK_FORCE_REUSE)
|
||||||
goto success;
|
goto success;
|
||||||
|
|
||||||
if (tb->fastreuse > 0 &&
|
if (((tb->fastreuse > 0 &&
|
||||||
sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
|
sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
|
||||||
|
(tb->fastreuseport > 0 &&
|
||||||
|
sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
|
||||||
smallest_size == -1) {
|
smallest_size == -1) {
|
||||||
goto success;
|
goto success;
|
||||||
} else {
|
} else {
|
||||||
ret = 1;
|
ret = 1;
|
||||||
if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
|
if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
|
||||||
if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
|
if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
|
||||||
|
(sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
|
||||||
smallest_size != -1 && --attempts >= 0) {
|
smallest_size != -1 && --attempts >= 0) {
|
||||||
spin_unlock(&head->lock);
|
spin_unlock(&head->lock);
|
||||||
goto again;
|
goto again;
|
||||||
@ -212,9 +224,23 @@ tb_not_found:
|
|||||||
tb->fastreuse = 1;
|
tb->fastreuse = 1;
|
||||||
else
|
else
|
||||||
tb->fastreuse = 0;
|
tb->fastreuse = 0;
|
||||||
} else if (tb->fastreuse &&
|
if (sk->sk_reuseport) {
|
||||||
(!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
|
tb->fastreuseport = 1;
|
||||||
tb->fastreuse = 0;
|
tb->fastuid = uid;
|
||||||
|
} else {
|
||||||
|
tb->fastreuseport = 0;
|
||||||
|
tb->fastuid = 0;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (tb->fastreuse &&
|
||||||
|
(!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
|
||||||
|
tb->fastreuse = 0;
|
||||||
|
if (tb->fastreuseport &&
|
||||||
|
(!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) {
|
||||||
|
tb->fastreuseport = 0;
|
||||||
|
tb->fastuid = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
success:
|
success:
|
||||||
if (!inet_csk(sk)->icsk_bind_hash)
|
if (!inet_csk(sk)->icsk_bind_hash)
|
||||||
inet_bind_hash(sk, tb, snum);
|
inet_bind_hash(sk, tb, snum);
|
||||||
|
@ -39,6 +39,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
|
|||||||
write_pnet(&tb->ib_net, hold_net(net));
|
write_pnet(&tb->ib_net, hold_net(net));
|
||||||
tb->port = snum;
|
tb->port = snum;
|
||||||
tb->fastreuse = 0;
|
tb->fastreuse = 0;
|
||||||
|
tb->fastreuseport = 0;
|
||||||
tb->num_owners = 0;
|
tb->num_owners = 0;
|
||||||
INIT_HLIST_HEAD(&tb->owners);
|
INIT_HLIST_HEAD(&tb->owners);
|
||||||
hlist_add_head(&tb->node, &head->chain);
|
hlist_add_head(&tb->node, &head->chain);
|
||||||
@ -151,16 +152,16 @@ static inline int compute_score(struct sock *sk, struct net *net,
|
|||||||
if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
|
if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
|
||||||
!ipv6_only_sock(sk)) {
|
!ipv6_only_sock(sk)) {
|
||||||
__be32 rcv_saddr = inet->inet_rcv_saddr;
|
__be32 rcv_saddr = inet->inet_rcv_saddr;
|
||||||
score = sk->sk_family == PF_INET ? 1 : 0;
|
score = sk->sk_family == PF_INET ? 2 : 1;
|
||||||
if (rcv_saddr) {
|
if (rcv_saddr) {
|
||||||
if (rcv_saddr != daddr)
|
if (rcv_saddr != daddr)
|
||||||
return -1;
|
return -1;
|
||||||
score += 2;
|
score += 4;
|
||||||
}
|
}
|
||||||
if (sk->sk_bound_dev_if) {
|
if (sk->sk_bound_dev_if) {
|
||||||
if (sk->sk_bound_dev_if != dif)
|
if (sk->sk_bound_dev_if != dif)
|
||||||
return -1;
|
return -1;
|
||||||
score += 2;
|
score += 4;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return score;
|
return score;
|
||||||
@ -176,6 +177,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
|
|||||||
|
|
||||||
struct sock *__inet_lookup_listener(struct net *net,
|
struct sock *__inet_lookup_listener(struct net *net,
|
||||||
struct inet_hashinfo *hashinfo,
|
struct inet_hashinfo *hashinfo,
|
||||||
|
const __be32 saddr, __be16 sport,
|
||||||
const __be32 daddr, const unsigned short hnum,
|
const __be32 daddr, const unsigned short hnum,
|
||||||
const int dif)
|
const int dif)
|
||||||
{
|
{
|
||||||
@ -183,17 +185,29 @@ struct sock *__inet_lookup_listener(struct net *net,
|
|||||||
struct hlist_nulls_node *node;
|
struct hlist_nulls_node *node;
|
||||||
unsigned int hash = inet_lhashfn(net, hnum);
|
unsigned int hash = inet_lhashfn(net, hnum);
|
||||||
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
|
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
|
||||||
int score, hiscore;
|
int score, hiscore, matches = 0, reuseport = 0;
|
||||||
|
u32 phash = 0;
|
||||||
|
|
||||||
rcu_read_lock();
|
rcu_read_lock();
|
||||||
begin:
|
begin:
|
||||||
result = NULL;
|
result = NULL;
|
||||||
hiscore = -1;
|
hiscore = 0;
|
||||||
sk_nulls_for_each_rcu(sk, node, &ilb->head) {
|
sk_nulls_for_each_rcu(sk, node, &ilb->head) {
|
||||||
score = compute_score(sk, net, hnum, daddr, dif);
|
score = compute_score(sk, net, hnum, daddr, dif);
|
||||||
if (score > hiscore) {
|
if (score > hiscore) {
|
||||||
result = sk;
|
result = sk;
|
||||||
hiscore = score;
|
hiscore = score;
|
||||||
|
reuseport = sk->sk_reuseport;
|
||||||
|
if (reuseport) {
|
||||||
|
phash = inet_ehashfn(net, daddr, hnum,
|
||||||
|
saddr, sport);
|
||||||
|
matches = 1;
|
||||||
|
}
|
||||||
|
} else if (score == hiscore && reuseport) {
|
||||||
|
matches++;
|
||||||
|
if (((u64)phash * matches) >> 32 == 0)
|
||||||
|
result = sk;
|
||||||
|
phash = next_pseudo_random32(phash);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/*
|
/*
|
||||||
@ -501,7 +515,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
|
|||||||
inet_bind_bucket_for_each(tb, node, &head->chain) {
|
inet_bind_bucket_for_each(tb, node, &head->chain) {
|
||||||
if (net_eq(ib_net(tb), net) &&
|
if (net_eq(ib_net(tb), net) &&
|
||||||
tb->port == port) {
|
tb->port == port) {
|
||||||
if (tb->fastreuse >= 0)
|
if (tb->fastreuse >= 0 ||
|
||||||
|
tb->fastreuseport >= 0)
|
||||||
goto next_port;
|
goto next_port;
|
||||||
WARN_ON(hlist_empty(&tb->owners));
|
WARN_ON(hlist_empty(&tb->owners));
|
||||||
if (!check_established(death_row, sk,
|
if (!check_established(death_row, sk,
|
||||||
@ -518,6 +533,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
tb->fastreuse = -1;
|
tb->fastreuse = -1;
|
||||||
|
tb->fastreuseport = -1;
|
||||||
goto ok;
|
goto ok;
|
||||||
|
|
||||||
next_port:
|
next_port:
|
||||||
|
@ -657,7 +657,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
|
|||||||
* no RST generated if md5 hash doesn't match.
|
* no RST generated if md5 hash doesn't match.
|
||||||
*/
|
*/
|
||||||
sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
|
sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
|
||||||
&tcp_hashinfo, ip_hdr(skb)->daddr,
|
&tcp_hashinfo, ip_hdr(skb)->saddr,
|
||||||
|
th->source, ip_hdr(skb)->daddr,
|
||||||
ntohs(th->source), inet_iif(skb));
|
ntohs(th->source), inet_iif(skb));
|
||||||
/* don't send rst if it can't find key */
|
/* don't send rst if it can't find key */
|
||||||
if (!sk1)
|
if (!sk1)
|
||||||
@ -2074,6 +2075,7 @@ do_time_wait:
|
|||||||
case TCP_TW_SYN: {
|
case TCP_TW_SYN: {
|
||||||
struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
|
struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
|
||||||
&tcp_hashinfo,
|
&tcp_hashinfo,
|
||||||
|
iph->saddr, th->source,
|
||||||
iph->daddr, th->dest,
|
iph->daddr, th->dest,
|
||||||
inet_iif(skb));
|
inet_iif(skb));
|
||||||
if (sk2) {
|
if (sk2) {
|
||||||
|
Loading…
Reference in New Issue
Block a user