soreuseport: TCP/IPv4 implementation

Allow multiple listener sockets to bind to the same port.

Motivation for soreuseport would be something like a web server
binding to port 80 running with multiple threads, where each thread
might have its own listener socket.  This could be done as an
alternative to other models: 1) have one listener thread which
dispatches completed connections to workers. 2) accept on a single
listener socket from multiple threads.  In case #1 the listener thread
can easily become the bottleneck with high connection turn-over rate.
In case #2, the proportion of connections accepted per thread tends
to be uneven under high connection load (assuming a simple event loop:
while (1) { accept(); process() }), because wakeup does not promote
fairness among the sockets.  We have seen the disproportion to be as
high as a 3:1 ratio between the thread accepting the most connections
and the one accepting the fewest.  With so_reuseport the distribution
is uniform.

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Tom Herbert 2013-01-22 09:50:24 +00:00 committed by David S. Miller
parent 055dc21a1d
commit da5e36308d
5 changed files with 73 additions and 21 deletions

View File

@ -81,7 +81,9 @@ struct inet_bind_bucket {
struct net *ib_net; struct net *ib_net;
#endif #endif
unsigned short port; unsigned short port;
signed short fastreuse; signed char fastreuse;
signed char fastreuseport;
kuid_t fastuid;
int num_owners; int num_owners;
struct hlist_node node; struct hlist_node node;
struct hlist_head owners; struct hlist_head owners;
@ -257,15 +259,19 @@ extern void inet_unhash(struct sock *sk);
extern struct sock *__inet_lookup_listener(struct net *net, extern struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo, struct inet_hashinfo *hashinfo,
const __be32 saddr,
const __be16 sport,
const __be32 daddr, const __be32 daddr,
const unsigned short hnum, const unsigned short hnum,
const int dif); const int dif);
static inline struct sock *inet_lookup_listener(struct net *net, static inline struct sock *inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo, struct inet_hashinfo *hashinfo,
__be32 saddr, __be16 sport,
__be32 daddr, __be16 dport, int dif) __be32 daddr, __be16 dport, int dif)
{ {
return __inet_lookup_listener(net, hashinfo, daddr, ntohs(dport), dif); return __inet_lookup_listener(net, hashinfo, saddr, sport,
daddr, ntohs(dport), dif);
} }
/* Socket demux engine toys. */ /* Socket demux engine toys. */
@ -358,7 +364,8 @@ static inline struct sock *__inet_lookup(struct net *net,
struct sock *sk = __inet_lookup_established(net, hashinfo, struct sock *sk = __inet_lookup_established(net, hashinfo,
saddr, sport, daddr, hnum, dif); saddr, sport, daddr, hnum, dif);
return sk ? : __inet_lookup_listener(net, hashinfo, daddr, hnum, dif); return sk ? : __inet_lookup_listener(net, hashinfo, saddr, sport,
daddr, hnum, dif);
} }
static inline struct sock *inet_lookup(struct net *net, static inline struct sock *inet_lookup(struct net *net,

View File

@ -82,6 +82,7 @@ nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
break; break;
case NFT_LOOKUP_LISTENER: case NFT_LOOKUP_LISTENER:
sk = inet_lookup_listener(net, &tcp_hashinfo, sk = inet_lookup_listener(net, &tcp_hashinfo,
saddr, sport,
daddr, dport, daddr, dport,
in->ifindex); in->ifindex);

View File

@ -59,6 +59,8 @@ int inet_csk_bind_conflict(const struct sock *sk,
struct sock *sk2; struct sock *sk2;
struct hlist_node *node; struct hlist_node *node;
int reuse = sk->sk_reuse; int reuse = sk->sk_reuse;
int reuseport = sk->sk_reuseport;
kuid_t uid = sock_i_uid((struct sock *)sk);
/* /*
* Unlike other sk lookup places we do not check * Unlike other sk lookup places we do not check
@ -73,8 +75,11 @@ int inet_csk_bind_conflict(const struct sock *sk,
(!sk->sk_bound_dev_if || (!sk->sk_bound_dev_if ||
!sk2->sk_bound_dev_if || !sk2->sk_bound_dev_if ||
sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
if (!reuse || !sk2->sk_reuse || if ((!reuse || !sk2->sk_reuse ||
sk2->sk_state == TCP_LISTEN) { sk2->sk_state == TCP_LISTEN) &&
(!reuseport || !sk2->sk_reuseport ||
(sk2->sk_state != TCP_TIME_WAIT &&
!uid_eq(uid, sock_i_uid(sk2))))) {
const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) || if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
sk2_rcv_saddr == sk_rcv_saddr(sk)) sk2_rcv_saddr == sk_rcv_saddr(sk))
@ -106,6 +111,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
int ret, attempts = 5; int ret, attempts = 5;
struct net *net = sock_net(sk); struct net *net = sock_net(sk);
int smallest_size = -1, smallest_rover; int smallest_size = -1, smallest_rover;
kuid_t uid = sock_i_uid(sk);
local_bh_disable(); local_bh_disable();
if (!snum) { if (!snum) {
@ -125,9 +131,12 @@ again:
spin_lock(&head->lock); spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, node, &head->chain) inet_bind_bucket_for_each(tb, node, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == rover) { if (net_eq(ib_net(tb), net) && tb->port == rover) {
if (tb->fastreuse > 0 && if (((tb->fastreuse > 0 &&
sk->sk_reuse && sk->sk_reuse &&
sk->sk_state != TCP_LISTEN && sk->sk_state != TCP_LISTEN) ||
(tb->fastreuseport > 0 &&
sk->sk_reuseport &&
uid_eq(tb->fastuid, uid))) &&
(tb->num_owners < smallest_size || smallest_size == -1)) { (tb->num_owners < smallest_size || smallest_size == -1)) {
smallest_size = tb->num_owners; smallest_size = tb->num_owners;
smallest_rover = rover; smallest_rover = rover;
@ -185,14 +194,17 @@ tb_found:
if (sk->sk_reuse == SK_FORCE_REUSE) if (sk->sk_reuse == SK_FORCE_REUSE)
goto success; goto success;
if (tb->fastreuse > 0 && if (((tb->fastreuse > 0 &&
sk->sk_reuse && sk->sk_state != TCP_LISTEN && sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
(tb->fastreuseport > 0 &&
sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
smallest_size == -1) { smallest_size == -1) {
goto success; goto success;
} else { } else {
ret = 1; ret = 1;
if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
if (sk->sk_reuse && sk->sk_state != TCP_LISTEN && if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
(sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
smallest_size != -1 && --attempts >= 0) { smallest_size != -1 && --attempts >= 0) {
spin_unlock(&head->lock); spin_unlock(&head->lock);
goto again; goto again;
@ -212,9 +224,23 @@ tb_not_found:
tb->fastreuse = 1; tb->fastreuse = 1;
else else
tb->fastreuse = 0; tb->fastreuse = 0;
} else if (tb->fastreuse && if (sk->sk_reuseport) {
(!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) tb->fastreuseport = 1;
tb->fastreuse = 0; tb->fastuid = uid;
} else {
tb->fastreuseport = 0;
tb->fastuid = 0;
}
} else {
if (tb->fastreuse &&
(!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
tb->fastreuse = 0;
if (tb->fastreuseport &&
(!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) {
tb->fastreuseport = 0;
tb->fastuid = 0;
}
}
success: success:
if (!inet_csk(sk)->icsk_bind_hash) if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, snum); inet_bind_hash(sk, tb, snum);

View File

@ -39,6 +39,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
write_pnet(&tb->ib_net, hold_net(net)); write_pnet(&tb->ib_net, hold_net(net));
tb->port = snum; tb->port = snum;
tb->fastreuse = 0; tb->fastreuse = 0;
tb->fastreuseport = 0;
tb->num_owners = 0; tb->num_owners = 0;
INIT_HLIST_HEAD(&tb->owners); INIT_HLIST_HEAD(&tb->owners);
hlist_add_head(&tb->node, &head->chain); hlist_add_head(&tb->node, &head->chain);
@ -151,16 +152,16 @@ static inline int compute_score(struct sock *sk, struct net *net,
if (net_eq(sock_net(sk), net) && inet->inet_num == hnum && if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
!ipv6_only_sock(sk)) { !ipv6_only_sock(sk)) {
__be32 rcv_saddr = inet->inet_rcv_saddr; __be32 rcv_saddr = inet->inet_rcv_saddr;
score = sk->sk_family == PF_INET ? 1 : 0; score = sk->sk_family == PF_INET ? 2 : 1;
if (rcv_saddr) { if (rcv_saddr) {
if (rcv_saddr != daddr) if (rcv_saddr != daddr)
return -1; return -1;
score += 2; score += 4;
} }
if (sk->sk_bound_dev_if) { if (sk->sk_bound_dev_if) {
if (sk->sk_bound_dev_if != dif) if (sk->sk_bound_dev_if != dif)
return -1; return -1;
score += 2; score += 4;
} }
} }
return score; return score;
@ -176,6 +177,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
struct sock *__inet_lookup_listener(struct net *net, struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo, struct inet_hashinfo *hashinfo,
const __be32 saddr, __be16 sport,
const __be32 daddr, const unsigned short hnum, const __be32 daddr, const unsigned short hnum,
const int dif) const int dif)
{ {
@ -183,17 +185,29 @@ struct sock *__inet_lookup_listener(struct net *net,
struct hlist_nulls_node *node; struct hlist_nulls_node *node;
unsigned int hash = inet_lhashfn(net, hnum); unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
int score, hiscore; int score, hiscore, matches = 0, reuseport = 0;
u32 phash = 0;
rcu_read_lock(); rcu_read_lock();
begin: begin:
result = NULL; result = NULL;
hiscore = -1; hiscore = 0;
sk_nulls_for_each_rcu(sk, node, &ilb->head) { sk_nulls_for_each_rcu(sk, node, &ilb->head) {
score = compute_score(sk, net, hnum, daddr, dif); score = compute_score(sk, net, hnum, daddr, dif);
if (score > hiscore) { if (score > hiscore) {
result = sk; result = sk;
hiscore = score; hiscore = score;
reuseport = sk->sk_reuseport;
if (reuseport) {
phash = inet_ehashfn(net, daddr, hnum,
saddr, sport);
matches = 1;
}
} else if (score == hiscore && reuseport) {
matches++;
if (((u64)phash * matches) >> 32 == 0)
result = sk;
phash = next_pseudo_random32(phash);
} }
} }
/* /*
@ -501,7 +515,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
inet_bind_bucket_for_each(tb, node, &head->chain) { inet_bind_bucket_for_each(tb, node, &head->chain) {
if (net_eq(ib_net(tb), net) && if (net_eq(ib_net(tb), net) &&
tb->port == port) { tb->port == port) {
if (tb->fastreuse >= 0) if (tb->fastreuse >= 0 ||
tb->fastreuseport >= 0)
goto next_port; goto next_port;
WARN_ON(hlist_empty(&tb->owners)); WARN_ON(hlist_empty(&tb->owners));
if (!check_established(death_row, sk, if (!check_established(death_row, sk,
@ -518,6 +533,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
break; break;
} }
tb->fastreuse = -1; tb->fastreuse = -1;
tb->fastreuseport = -1;
goto ok; goto ok;
next_port: next_port:

View File

@ -657,7 +657,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
* no RST generated if md5 hash doesn't match. * no RST generated if md5 hash doesn't match.
*/ */
sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev), sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
&tcp_hashinfo, ip_hdr(skb)->daddr, &tcp_hashinfo, ip_hdr(skb)->saddr,
th->source, ip_hdr(skb)->daddr,
ntohs(th->source), inet_iif(skb)); ntohs(th->source), inet_iif(skb));
/* don't send rst if it can't find key */ /* don't send rst if it can't find key */
if (!sk1) if (!sk1)
@ -2074,6 +2075,7 @@ do_time_wait:
case TCP_TW_SYN: { case TCP_TW_SYN: {
struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
&tcp_hashinfo, &tcp_hashinfo,
iph->saddr, th->source,
iph->daddr, th->dest, iph->daddr, th->dest,
inet_iif(skb)); inet_iif(skb));
if (sk2) { if (sk2) {