tcp: Add TIME_WAIT sockets in bhash2.
[ Upstream commit 936a192f974018b4f6040f6f77b1cc1e75bd8666 ] Jiri Slaby reported regression of bind() with a simple repro. [0] The repro creates a TIME_WAIT socket and tries to bind() a new socket with the same local address and port. Before commit 28044fc1d495 ("net: Add a bhash2 table hashed by port and address"), the bind() failed with -EADDRINUSE, but now it succeeds. The cited commit should have put TIME_WAIT sockets into bhash2; otherwise, inet_bhash2_conflict() misses TIME_WAIT sockets when validating bind() requests if the address is not a wildcard one. The straight option is to move sk_bind2_node from struct sock to struct sock_common to add twsk to bhash2 as implemented as RFC. [1] However, the binary layout change in the struct sock could affect performances moving hot fields on different cachelines. To avoid that, we add another TIME_WAIT list in inet_bind2_bucket and check it while validating bind(). [0]: https://lore.kernel.org/netdev/6b971a4e-c7d8-411e-1f92-fda29b5b2fb9@kernel.org/ [1]: https://lore.kernel.org/netdev/20221221151258.25748-2-kuniyu@amazon.com/ Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address") Reported-by: Jiri Slaby <jirislaby@kernel.org> Suggested-by: Paolo Abeni <pabeni@redhat.com> Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com> Acked-by: Joanne Koong <joannelkoong@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Sasha Levin <sashal@kernel.org>
This commit is contained in:
parent
a1f2920a05
commit
01ca3695f1
@ -108,6 +108,10 @@ struct inet_bind2_bucket {
|
|||||||
struct hlist_node node;
|
struct hlist_node node;
|
||||||
/* List of sockets hashed to this bucket */
|
/* List of sockets hashed to this bucket */
|
||||||
struct hlist_head owners;
|
struct hlist_head owners;
|
||||||
|
/* bhash has twsk in owners, but bhash2 has twsk in
|
||||||
|
* deathrow not to add a member in struct sock_common.
|
||||||
|
*/
|
||||||
|
struct hlist_head deathrow;
|
||||||
};
|
};
|
||||||
|
|
||||||
static inline struct net *ib_net(const struct inet_bind_bucket *ib)
|
static inline struct net *ib_net(const struct inet_bind_bucket *ib)
|
||||||
|
@ -73,9 +73,14 @@ struct inet_timewait_sock {
|
|||||||
u32 tw_priority;
|
u32 tw_priority;
|
||||||
struct timer_list tw_timer;
|
struct timer_list tw_timer;
|
||||||
struct inet_bind_bucket *tw_tb;
|
struct inet_bind_bucket *tw_tb;
|
||||||
|
struct inet_bind2_bucket *tw_tb2;
|
||||||
|
struct hlist_node tw_bind2_node;
|
||||||
};
|
};
|
||||||
#define tw_tclass tw_tos
|
#define tw_tclass tw_tos
|
||||||
|
|
||||||
|
#define twsk_for_each_bound_bhash2(__tw, list) \
|
||||||
|
hlist_for_each_entry(__tw, list, tw_bind2_node)
|
||||||
|
|
||||||
static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
|
static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
|
||||||
{
|
{
|
||||||
return (struct inet_timewait_sock *)sk;
|
return (struct inet_timewait_sock *)sk;
|
||||||
|
@ -173,22 +173,40 @@ static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2,
|
||||||
|
kuid_t sk_uid, bool relax,
|
||||||
|
bool reuseport_cb_ok, bool reuseport_ok)
|
||||||
|
{
|
||||||
|
if (sk->sk_family == AF_INET && ipv6_only_sock(sk2))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return inet_bind_conflict(sk, sk2, sk_uid, relax,
|
||||||
|
reuseport_cb_ok, reuseport_ok);
|
||||||
|
}
|
||||||
|
|
||||||
static bool inet_bhash2_conflict(const struct sock *sk,
|
static bool inet_bhash2_conflict(const struct sock *sk,
|
||||||
const struct inet_bind2_bucket *tb2,
|
const struct inet_bind2_bucket *tb2,
|
||||||
kuid_t sk_uid,
|
kuid_t sk_uid,
|
||||||
bool relax, bool reuseport_cb_ok,
|
bool relax, bool reuseport_cb_ok,
|
||||||
bool reuseport_ok)
|
bool reuseport_ok)
|
||||||
{
|
{
|
||||||
|
struct inet_timewait_sock *tw2;
|
||||||
struct sock *sk2;
|
struct sock *sk2;
|
||||||
|
|
||||||
sk_for_each_bound_bhash2(sk2, &tb2->owners) {
|
sk_for_each_bound_bhash2(sk2, &tb2->owners) {
|
||||||
if (sk->sk_family == AF_INET && ipv6_only_sock(sk2))
|
if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax,
|
||||||
continue;
|
|
||||||
|
|
||||||
if (inet_bind_conflict(sk, sk2, sk_uid, relax,
|
|
||||||
reuseport_cb_ok, reuseport_ok))
|
reuseport_cb_ok, reuseport_ok))
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
twsk_for_each_bound_bhash2(tw2, &tb2->deathrow) {
|
||||||
|
sk2 = (struct sock *)tw2;
|
||||||
|
|
||||||
|
if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax,
|
||||||
|
reuseport_cb_ok, reuseport_ok))
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -116,6 +116,7 @@ static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb,
|
|||||||
#endif
|
#endif
|
||||||
tb->rcv_saddr = sk->sk_rcv_saddr;
|
tb->rcv_saddr = sk->sk_rcv_saddr;
|
||||||
INIT_HLIST_HEAD(&tb->owners);
|
INIT_HLIST_HEAD(&tb->owners);
|
||||||
|
INIT_HLIST_HEAD(&tb->deathrow);
|
||||||
hlist_add_head(&tb->node, &head->chain);
|
hlist_add_head(&tb->node, &head->chain);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -137,7 +138,7 @@ struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep,
|
|||||||
/* Caller must hold hashbucket lock for this tb with local BH disabled */
|
/* Caller must hold hashbucket lock for this tb with local BH disabled */
|
||||||
void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
|
void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
|
||||||
{
|
{
|
||||||
if (hlist_empty(&tb->owners)) {
|
if (hlist_empty(&tb->owners) && hlist_empty(&tb->deathrow)) {
|
||||||
__hlist_del(&tb->node);
|
__hlist_del(&tb->node);
|
||||||
kmem_cache_free(cachep, tb);
|
kmem_cache_free(cachep, tb);
|
||||||
}
|
}
|
||||||
@ -1103,15 +1104,16 @@ ok:
|
|||||||
/* Head lock still held and bh's disabled */
|
/* Head lock still held and bh's disabled */
|
||||||
inet_bind_hash(sk, tb, tb2, port);
|
inet_bind_hash(sk, tb, tb2, port);
|
||||||
|
|
||||||
spin_unlock(&head2->lock);
|
|
||||||
|
|
||||||
if (sk_unhashed(sk)) {
|
if (sk_unhashed(sk)) {
|
||||||
inet_sk(sk)->inet_sport = htons(port);
|
inet_sk(sk)->inet_sport = htons(port);
|
||||||
inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
|
inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
|
||||||
}
|
}
|
||||||
if (tw)
|
if (tw)
|
||||||
inet_twsk_bind_unhash(tw, hinfo);
|
inet_twsk_bind_unhash(tw, hinfo);
|
||||||
|
|
||||||
|
spin_unlock(&head2->lock);
|
||||||
spin_unlock(&head->lock);
|
spin_unlock(&head->lock);
|
||||||
|
|
||||||
if (tw)
|
if (tw)
|
||||||
inet_twsk_deschedule_put(tw);
|
inet_twsk_deschedule_put(tw);
|
||||||
local_bh_enable();
|
local_bh_enable();
|
||||||
|
@ -29,6 +29,7 @@
|
|||||||
void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
|
void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
|
||||||
struct inet_hashinfo *hashinfo)
|
struct inet_hashinfo *hashinfo)
|
||||||
{
|
{
|
||||||
|
struct inet_bind2_bucket *tb2 = tw->tw_tb2;
|
||||||
struct inet_bind_bucket *tb = tw->tw_tb;
|
struct inet_bind_bucket *tb = tw->tw_tb;
|
||||||
|
|
||||||
if (!tb)
|
if (!tb)
|
||||||
@ -37,6 +38,11 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
|
|||||||
__hlist_del(&tw->tw_bind_node);
|
__hlist_del(&tw->tw_bind_node);
|
||||||
tw->tw_tb = NULL;
|
tw->tw_tb = NULL;
|
||||||
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
|
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
|
||||||
|
|
||||||
|
__hlist_del(&tw->tw_bind2_node);
|
||||||
|
tw->tw_tb2 = NULL;
|
||||||
|
inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
|
||||||
|
|
||||||
__sock_put((struct sock *)tw);
|
__sock_put((struct sock *)tw);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -45,7 +51,7 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
|
|||||||
{
|
{
|
||||||
struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
|
struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
|
||||||
spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
|
spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
|
||||||
struct inet_bind_hashbucket *bhead;
|
struct inet_bind_hashbucket *bhead, *bhead2;
|
||||||
|
|
||||||
spin_lock(lock);
|
spin_lock(lock);
|
||||||
sk_nulls_del_node_init_rcu((struct sock *)tw);
|
sk_nulls_del_node_init_rcu((struct sock *)tw);
|
||||||
@ -54,9 +60,13 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
|
|||||||
/* Disassociate with bind bucket. */
|
/* Disassociate with bind bucket. */
|
||||||
bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
|
bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
|
||||||
hashinfo->bhash_size)];
|
hashinfo->bhash_size)];
|
||||||
|
bhead2 = inet_bhashfn_portaddr(hashinfo, (struct sock *)tw,
|
||||||
|
twsk_net(tw), tw->tw_num);
|
||||||
|
|
||||||
spin_lock(&bhead->lock);
|
spin_lock(&bhead->lock);
|
||||||
|
spin_lock(&bhead2->lock);
|
||||||
inet_twsk_bind_unhash(tw, hashinfo);
|
inet_twsk_bind_unhash(tw, hashinfo);
|
||||||
|
spin_unlock(&bhead2->lock);
|
||||||
spin_unlock(&bhead->lock);
|
spin_unlock(&bhead->lock);
|
||||||
|
|
||||||
refcount_dec(&tw->tw_dr->tw_refcount);
|
refcount_dec(&tw->tw_dr->tw_refcount);
|
||||||
@ -93,6 +103,12 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
|
|||||||
hlist_add_head(&tw->tw_bind_node, list);
|
hlist_add_head(&tw->tw_bind_node, list);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void inet_twsk_add_bind2_node(struct inet_timewait_sock *tw,
|
||||||
|
struct hlist_head *list)
|
||||||
|
{
|
||||||
|
hlist_add_head(&tw->tw_bind2_node, list);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Enter the time wait state. This is called with locally disabled BH.
|
* Enter the time wait state. This is called with locally disabled BH.
|
||||||
* Essentially we whip up a timewait bucket, copy the relevant info into it
|
* Essentially we whip up a timewait bucket, copy the relevant info into it
|
||||||
@ -105,17 +121,28 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
|
|||||||
const struct inet_connection_sock *icsk = inet_csk(sk);
|
const struct inet_connection_sock *icsk = inet_csk(sk);
|
||||||
struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
|
struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
|
||||||
spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
|
spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
|
||||||
struct inet_bind_hashbucket *bhead;
|
struct inet_bind_hashbucket *bhead, *bhead2;
|
||||||
|
|
||||||
/* Step 1: Put TW into bind hash. Original socket stays there too.
|
/* Step 1: Put TW into bind hash. Original socket stays there too.
|
||||||
Note, that any socket with inet->num != 0 MUST be bound in
|
Note, that any socket with inet->num != 0 MUST be bound in
|
||||||
binding cache, even if it is closed.
|
binding cache, even if it is closed.
|
||||||
*/
|
*/
|
||||||
bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
|
bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
|
||||||
hashinfo->bhash_size)];
|
hashinfo->bhash_size)];
|
||||||
|
bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num);
|
||||||
|
|
||||||
spin_lock(&bhead->lock);
|
spin_lock(&bhead->lock);
|
||||||
|
spin_lock(&bhead2->lock);
|
||||||
|
|
||||||
tw->tw_tb = icsk->icsk_bind_hash;
|
tw->tw_tb = icsk->icsk_bind_hash;
|
||||||
WARN_ON(!icsk->icsk_bind_hash);
|
WARN_ON(!icsk->icsk_bind_hash);
|
||||||
inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
|
inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
|
||||||
|
|
||||||
|
tw->tw_tb2 = icsk->icsk_bind2_hash;
|
||||||
|
WARN_ON(!icsk->icsk_bind2_hash);
|
||||||
|
inet_twsk_add_bind2_node(tw, &tw->tw_tb2->deathrow);
|
||||||
|
|
||||||
|
spin_unlock(&bhead2->lock);
|
||||||
spin_unlock(&bhead->lock);
|
spin_unlock(&bhead->lock);
|
||||||
|
|
||||||
spin_lock(lock);
|
spin_lock(lock);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user