inet: Add a 2nd listener hashtable (port+addr)
The current listener hashtable is hashed by port only. When a process is listening at many IP addresses with the same port (e.g. [IP1]:443, [IP2]:443... [IPN]:443), the inet[6]_lookup_listener() performance degrades to that of a linked list, which makes it prone to SYN attacks. UDP had a similar issue and a second hashtable was added to resolve it. This patch adds a second hashtable for the listener's sockets. The second hashtable is hashed by port and address. It cannot reuse the existing skc_portaddr_node, which is shared with skc_bind_node, because a TCP listener needs to use skc_bind_node. Instead, this patch adds a hlist_node 'icsk_listen_portaddr_node' to the inet_connection_sock which the listener (like TCP) also belongs to. The new portaddr hashtable may need two lookups (first by IP:PORT, then by INADDR_ANY:PORT if IP:PORT is not found). Hence, it implements a cut-off similar to UDP's, such that it will only consult the new portaddr hashtable if the current port-only hashtable has >10 sk in the linked list. lhash2 and lhash2_mask are added to 'struct inet_hashinfo'. I take this chance to plug a 4-byte hole. It is done by first moving the existing bind_bucket_cachep up and then adding the new (int lhash2_mask, *lhash2) after the existing bhash_size. Signed-off-by: Martin KaFai Lau <kafai@fb.com> Reviewed-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
f0b1e64c13
commit
61b7c691c7
@ -77,6 +77,7 @@ struct inet_connection_sock_af_ops {
|
||||
* @icsk_af_ops Operations which are AF_INET{4,6} specific
|
||||
* @icsk_ulp_ops Pluggable ULP control hook
|
||||
* @icsk_ulp_data ULP private data
|
||||
* @icsk_listen_portaddr_node hash to the portaddr listener hashtable
|
||||
* @icsk_ca_state: Congestion control state
|
||||
* @icsk_retransmits: Number of unrecovered [RTO] timeouts
|
||||
* @icsk_pending: Scheduled timer event
|
||||
@ -101,6 +102,7 @@ struct inet_connection_sock {
|
||||
const struct inet_connection_sock_af_ops *icsk_af_ops;
|
||||
const struct tcp_ulp_ops *icsk_ulp_ops;
|
||||
void *icsk_ulp_data;
|
||||
struct hlist_node icsk_listen_portaddr_node;
|
||||
unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
|
||||
__u8 icsk_ca_state:6,
|
||||
icsk_ca_setsockopt:1,
|
||||
|
@ -133,12 +133,13 @@ struct inet_hashinfo {
|
||||
/* Ok, let's try this, I give up, we do need a local binding
|
||||
* TCP hash as well as the others for fast bind/connect.
|
||||
*/
|
||||
struct inet_bind_hashbucket *bhash;
|
||||
|
||||
unsigned int bhash_size;
|
||||
/* 4 bytes hole on 64 bit */
|
||||
|
||||
struct kmem_cache *bind_bucket_cachep;
|
||||
struct inet_bind_hashbucket *bhash;
|
||||
unsigned int bhash_size;
|
||||
|
||||
/* The 2nd listener table hashed by local port and address */
|
||||
unsigned int lhash2_mask;
|
||||
struct inet_listen_hashbucket *lhash2;
|
||||
|
||||
/* All the above members are written once at bootup and
|
||||
* never written again _or_ are predominantly read-access.
|
||||
@ -146,14 +147,25 @@ struct inet_hashinfo {
|
||||
* Now align to a new cache line as all the following members
|
||||
* might be often dirty.
|
||||
*/
|
||||
/* All sockets in TCP_LISTEN state will be in here. This is the only
|
||||
* table where wildcard'd TCP sockets can exist. Hash function here
|
||||
* is just local port number.
|
||||
/* All sockets in TCP_LISTEN state will be in listening_hash.
|
||||
* This is the only table where wildcard'd TCP sockets can
|
||||
* exist. listening_hash is only hashed by local port number.
|
||||
* If lhash2 is initialized, the same socket will also be hashed
|
||||
* to lhash2 by port and address.
|
||||
*/
|
||||
struct inet_listen_hashbucket listening_hash[INET_LHTABLE_SIZE]
|
||||
____cacheline_aligned_in_smp;
|
||||
};
|
||||
|
||||
/* Iterate over every inet_connection_sock chained on an lhash2 bucket
 * list. Entries are linked through icsk_listen_portaddr_node and the walk
 * uses the RCU hlist iterator.
 */
#define inet_lhash2_for_each_icsk_rcu(icsk_pos, bucket_head) \
	hlist_for_each_entry_rcu(icsk_pos, bucket_head, icsk_listen_portaddr_node)
|
||||
|
||||
static inline struct inet_listen_hashbucket *
|
||||
inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash)
|
||||
{
|
||||
return &h->lhash2[hash & h->lhash2_mask];
|
||||
}
|
||||
|
||||
static inline struct inet_ehash_bucket *inet_ehash_bucket(
|
||||
struct inet_hashinfo *hashinfo,
|
||||
unsigned int hash)
|
||||
@ -209,6 +221,10 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child);
|
||||
void inet_put_port(struct sock *sk);
|
||||
|
||||
void inet_hashinfo_init(struct inet_hashinfo *h);
|
||||
void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
|
||||
unsigned long numentries, int scale,
|
||||
unsigned long low_limit,
|
||||
unsigned long high_limit);
|
||||
|
||||
bool inet_ehash_insert(struct sock *sk, struct sock *osk);
|
||||
bool inet_ehash_nolisten(struct sock *sk, struct sock *osk);
|
||||
|
@ -19,6 +19,7 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/wait.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/bootmem.h>
|
||||
|
||||
#include <net/addrconf.h>
|
||||
#include <net/inet_connection_sock.h>
|
||||
@ -168,6 +169,60 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__inet_inherit_port);
|
||||
|
||||
static struct inet_listen_hashbucket *
|
||||
inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
|
||||
{
|
||||
u32 hash;
|
||||
|
||||
#if IS_ENABLED(CONFIG_IPV6)
|
||||
if (sk->sk_family == AF_INET6)
|
||||
hash = ipv6_portaddr_hash(sock_net(sk),
|
||||
&sk->sk_v6_rcv_saddr,
|
||||
inet_sk(sk)->inet_num);
|
||||
else
|
||||
#endif
|
||||
hash = ipv4_portaddr_hash(sock_net(sk),
|
||||
inet_sk(sk)->inet_rcv_saddr,
|
||||
inet_sk(sk)->inet_num);
|
||||
return inet_lhash2_bucket(h, hash);
|
||||
}
|
||||
|
||||
static void inet_hash2(struct inet_hashinfo *h, struct sock *sk)
|
||||
{
|
||||
struct inet_listen_hashbucket *ilb2;
|
||||
|
||||
if (!h->lhash2)
|
||||
return;
|
||||
|
||||
ilb2 = inet_lhash2_bucket_sk(h, sk);
|
||||
|
||||
spin_lock(&ilb2->lock);
|
||||
if (sk->sk_reuseport && sk->sk_family == AF_INET6)
|
||||
hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
|
||||
&ilb2->head);
|
||||
else
|
||||
hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
|
||||
&ilb2->head);
|
||||
ilb2->count++;
|
||||
spin_unlock(&ilb2->lock);
|
||||
}
|
||||
|
||||
static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk)
|
||||
{
|
||||
struct inet_listen_hashbucket *ilb2;
|
||||
|
||||
if (!h->lhash2 ||
|
||||
WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node)))
|
||||
return;
|
||||
|
||||
ilb2 = inet_lhash2_bucket_sk(h, sk);
|
||||
|
||||
spin_lock(&ilb2->lock);
|
||||
hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node);
|
||||
ilb2->count--;
|
||||
spin_unlock(&ilb2->lock);
|
||||
}
|
||||
|
||||
static inline int compute_score(struct sock *sk, struct net *net,
|
||||
const unsigned short hnum, const __be32 daddr,
|
||||
const int dif, const int sdif, bool exact_dif)
|
||||
@ -207,6 +262,40 @@ static inline int compute_score(struct sock *sk, struct net *net,
|
||||
*/
|
||||
|
||||
/* called with rcu_read_lock() : No refcount taken on the socket */
|
||||
static struct sock *inet_lhash2_lookup(struct net *net,
|
||||
struct inet_listen_hashbucket *ilb2,
|
||||
struct sk_buff *skb, int doff,
|
||||
const __be32 saddr, __be16 sport,
|
||||
const __be32 daddr, const unsigned short hnum,
|
||||
const int dif, const int sdif)
|
||||
{
|
||||
bool exact_dif = inet_exact_dif_match(net, skb);
|
||||
struct inet_connection_sock *icsk;
|
||||
struct sock *sk, *result = NULL;
|
||||
int score, hiscore = 0;
|
||||
u32 phash = 0;
|
||||
|
||||
inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
|
||||
sk = (struct sock *)icsk;
|
||||
score = compute_score(sk, net, hnum, daddr,
|
||||
dif, sdif, exact_dif);
|
||||
if (score > hiscore) {
|
||||
if (sk->sk_reuseport) {
|
||||
phash = inet_ehashfn(net, daddr, hnum,
|
||||
saddr, sport);
|
||||
result = reuseport_select_sock(sk, phash,
|
||||
skb, doff);
|
||||
if (result)
|
||||
return result;
|
||||
}
|
||||
result = sk;
|
||||
hiscore = score;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
struct sock *__inet_lookup_listener(struct net *net,
|
||||
struct inet_hashinfo *hashinfo,
|
||||
struct sk_buff *skb, int doff,
|
||||
@ -217,10 +306,42 @@ struct sock *__inet_lookup_listener(struct net *net,
|
||||
unsigned int hash = inet_lhashfn(net, hnum);
|
||||
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
|
||||
bool exact_dif = inet_exact_dif_match(net, skb);
|
||||
struct inet_listen_hashbucket *ilb2;
|
||||
struct sock *sk, *result = NULL;
|
||||
int score, hiscore = 0;
|
||||
unsigned int hash2;
|
||||
u32 phash = 0;
|
||||
|
||||
if (ilb->count <= 10 || !hashinfo->lhash2)
|
||||
goto port_lookup;
|
||||
|
||||
/* Too many sk in the ilb bucket (which is hashed by port alone).
|
||||
* Try lhash2 (which is hashed by port and addr) instead.
|
||||
*/
|
||||
|
||||
hash2 = ipv4_portaddr_hash(net, daddr, hnum);
|
||||
ilb2 = inet_lhash2_bucket(hashinfo, hash2);
|
||||
if (ilb2->count > ilb->count)
|
||||
goto port_lookup;
|
||||
|
||||
result = inet_lhash2_lookup(net, ilb2, skb, doff,
|
||||
saddr, sport, daddr, hnum,
|
||||
dif, sdif);
|
||||
if (result)
|
||||
return result;
|
||||
|
||||
/* Lookup lhash2 with INADDR_ANY */
|
||||
|
||||
hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
|
||||
ilb2 = inet_lhash2_bucket(hashinfo, hash2);
|
||||
if (ilb2->count > ilb->count)
|
||||
goto port_lookup;
|
||||
|
||||
return inet_lhash2_lookup(net, ilb2, skb, doff,
|
||||
saddr, sport, daddr, hnum,
|
||||
dif, sdif);
|
||||
|
||||
port_lookup:
|
||||
sk_for_each_rcu(sk, &ilb->head) {
|
||||
score = compute_score(sk, net, hnum, daddr,
|
||||
dif, sdif, exact_dif);
|
||||
@ -476,6 +597,7 @@ int __inet_hash(struct sock *sk, struct sock *osk)
|
||||
hlist_add_tail_rcu(&sk->sk_node, &ilb->head);
|
||||
else
|
||||
hlist_add_head_rcu(&sk->sk_node, &ilb->head);
|
||||
inet_hash2(hashinfo, sk);
|
||||
ilb->count++;
|
||||
sock_set_flag(sk, SOCK_RCU_FREE);
|
||||
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
|
||||
@ -506,7 +628,6 @@ void inet_unhash(struct sock *sk)
|
||||
struct inet_listen_hashbucket *ilb;
|
||||
spinlock_t *lock;
|
||||
bool listener = false;
|
||||
int done;
|
||||
|
||||
if (sk_unhashed(sk))
|
||||
return;
|
||||
@ -519,17 +640,20 @@ void inet_unhash(struct sock *sk)
|
||||
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
|
||||
}
|
||||
spin_lock_bh(lock);
|
||||
if (sk_unhashed(sk))
|
||||
goto unlock;
|
||||
|
||||
if (rcu_access_pointer(sk->sk_reuseport_cb))
|
||||
reuseport_detach_sock(sk);
|
||||
if (listener)
|
||||
done = __sk_del_node_init(sk);
|
||||
else
|
||||
done = __sk_nulls_del_node_init_rcu(sk);
|
||||
if (done) {
|
||||
if (listener)
|
||||
ilb->count--;
|
||||
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
|
||||
if (listener) {
|
||||
inet_unhash2(hashinfo, sk);
|
||||
__sk_del_node_init(sk);
|
||||
ilb->count--;
|
||||
} else {
|
||||
__sk_nulls_del_node_init_rcu(sk);
|
||||
}
|
||||
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
|
||||
unlock:
|
||||
spin_unlock_bh(lock);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(inet_unhash);
|
||||
@ -666,9 +790,35 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
|
||||
INIT_HLIST_HEAD(&h->listening_hash[i].head);
|
||||
h->listening_hash[i].count = 0;
|
||||
}
|
||||
|
||||
h->lhash2 = NULL;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
|
||||
|
||||
/* Allocate and initialise the port+address listener table (lhash2) of @h.
 *
 * @name, @numentries, @scale, @low_limit and @high_limit are handed to
 * alloc_large_system_hash() unchanged; the flags argument is 0 and the
 * shift output argument is NULL. The resulting mask is stored in
 * h->lhash2_mask. Every bucket then gets its lock initialised, an empty
 * list head and a zero count.
 */
void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
				unsigned long numentries, int scale,
				unsigned long low_limit,
				unsigned long high_limit)
{
	struct inet_listen_hashbucket *bucket;
	unsigned int i;

	h->lhash2 = alloc_large_system_hash(name, sizeof(*h->lhash2),
					    numentries, scale, 0, NULL,
					    &h->lhash2_mask,
					    low_limit, high_limit);

	/* lhash2_mask is the last valid index, hence the inclusive bound. */
	for (i = 0; i <= h->lhash2_mask; i++) {
		bucket = &h->lhash2[i];

		spin_lock_init(&bucket->lock);
		INIT_HLIST_HEAD(&bucket->head);
		bucket->count = 0;
	}
}
|
||||
|
||||
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
|
||||
{
|
||||
unsigned int locksz = sizeof(spinlock_t);
|
||||
|
@ -125,6 +125,40 @@ static inline int compute_score(struct sock *sk, struct net *net,
|
||||
}
|
||||
|
||||
/* called with rcu_read_lock() */
|
||||
static struct sock *inet6_lhash2_lookup(struct net *net,
|
||||
struct inet_listen_hashbucket *ilb2,
|
||||
struct sk_buff *skb, int doff,
|
||||
const struct in6_addr *saddr,
|
||||
const __be16 sport, const struct in6_addr *daddr,
|
||||
const unsigned short hnum, const int dif, const int sdif)
|
||||
{
|
||||
bool exact_dif = inet6_exact_dif_match(net, skb);
|
||||
struct inet_connection_sock *icsk;
|
||||
struct sock *sk, *result = NULL;
|
||||
int score, hiscore = 0;
|
||||
u32 phash = 0;
|
||||
|
||||
inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
|
||||
sk = (struct sock *)icsk;
|
||||
score = compute_score(sk, net, hnum, daddr, dif, sdif,
|
||||
exact_dif);
|
||||
if (score > hiscore) {
|
||||
if (sk->sk_reuseport) {
|
||||
phash = inet6_ehashfn(net, daddr, hnum,
|
||||
saddr, sport);
|
||||
result = reuseport_select_sock(sk, phash,
|
||||
skb, doff);
|
||||
if (result)
|
||||
return result;
|
||||
}
|
||||
result = sk;
|
||||
hiscore = score;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
struct sock *inet6_lookup_listener(struct net *net,
|
||||
struct inet_hashinfo *hashinfo,
|
||||
struct sk_buff *skb, int doff,
|
||||
@ -135,10 +169,42 @@ struct sock *inet6_lookup_listener(struct net *net,
|
||||
unsigned int hash = inet_lhashfn(net, hnum);
|
||||
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
|
||||
bool exact_dif = inet6_exact_dif_match(net, skb);
|
||||
struct inet_listen_hashbucket *ilb2;
|
||||
struct sock *sk, *result = NULL;
|
||||
int score, hiscore = 0;
|
||||
unsigned int hash2;
|
||||
u32 phash = 0;
|
||||
|
||||
if (ilb->count <= 10 || !hashinfo->lhash2)
|
||||
goto port_lookup;
|
||||
|
||||
/* Too many sk in the ilb bucket (which is hashed by port alone).
|
||||
* Try lhash2 (which is hashed by port and addr) instead.
|
||||
*/
|
||||
|
||||
hash2 = ipv6_portaddr_hash(net, daddr, hnum);
|
||||
ilb2 = inet_lhash2_bucket(hashinfo, hash2);
|
||||
if (ilb2->count > ilb->count)
|
||||
goto port_lookup;
|
||||
|
||||
result = inet6_lhash2_lookup(net, ilb2, skb, doff,
|
||||
saddr, sport, daddr, hnum,
|
||||
dif, sdif);
|
||||
if (result)
|
||||
return result;
|
||||
|
||||
/* Lookup lhash2 with in6addr_any */
|
||||
|
||||
hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
|
||||
ilb2 = inet_lhash2_bucket(hashinfo, hash2);
|
||||
if (ilb2->count > ilb->count)
|
||||
goto port_lookup;
|
||||
|
||||
return inet6_lhash2_lookup(net, ilb2, skb, doff,
|
||||
saddr, sport, daddr, hnum,
|
||||
dif, sdif);
|
||||
|
||||
port_lookup:
|
||||
sk_for_each(sk, &ilb->head) {
|
||||
score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif);
|
||||
if (score > hiscore) {
|
||||
|
Loading…
Reference in New Issue
Block a user