2019-05-27 09:55:01 +03:00
// SPDX-License-Identifier: GPL-2.0-or-later
2005-08-10 07:00:51 +04:00
/*
* INET An implementation of the TCP / IP protocol suite for the LINUX
* operating system . INET is implemented using the BSD Socket
* interface as the means of communication with the user level .
*
* Generic INET transport hashtables
*
* Authors : Lotsa people , from code originally in tcp
*/
2005-08-10 07:07:13 +04:00
# include <linux/module.h>
2005-12-14 10:25:31 +03:00
# include <linux/random.h>
2005-08-10 07:08:09 +04:00
# include <linux/sched.h>
2005-08-10 07:00:51 +04:00
# include <linux/slab.h>
2005-08-10 07:08:09 +04:00
# include <linux/wait.h>
2015-05-26 17:55:34 +03:00
# include <linux/vmalloc.h>
2018-10-31 01:09:49 +03:00
# include <linux/memblock.h>
2005-08-10 07:00:51 +04:00
2016-02-10 19:50:40 +03:00
# include <net/addrconf.h>
2005-08-10 07:10:42 +04:00
# include <net/inet_connection_sock.h>
2005-08-10 07:00:51 +04:00
# include <net/inet_hashtables.h>
tcp: fix race condition when creating child sockets from syncookies
When the TCP stack is in SYN flood mode, the server child socket is
created from the SYN cookie received in a TCP packet with the ACK flag
set.
The child socket is created when the server receives the first TCP
packet with a valid SYN cookie from the client. Usually, this packet
corresponds to the final step of the TCP 3-way handshake, the ACK
packet. But is also possible to receive a valid SYN cookie from the
first TCP data packet sent by the client, and thus create a child socket
from that SYN cookie.
Since a client socket is ready to send data as soon as it receives the
SYN+ACK packet from the server, the client can send the ACK packet (sent
by the TCP stack code), and the first data packet (sent by the userspace
program) almost at the same time, and thus the server will equally
receive the two TCP packets with valid SYN cookies almost at the same
instant.
When such event happens, the TCP stack code has a race condition that
occurs between the momement a lookup is done to the established
connections hashtable to check for the existence of a connection for the
same client, and the moment that the child socket is added to the
established connections hashtable. As a consequence, this race condition
can lead to a situation where we add two child sockets to the
established connections hashtable and deliver two sockets to the
userspace program to the same client.
This patch fixes the race condition by checking if an existing child
socket exists for the same client when we are adding the second child
socket to the established connections socket. If an existing child
socket exists, we drop the packet and discard the second child socket
to the same client.
Signed-off-by: Ricardo Dias <rdias@singlestore.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20201120111133.GA67501@rdias-suse-pc.lan
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-11-20 14:11:33 +03:00
# if IS_ENABLED(CONFIG_IPV6)
# include <net/inet6_hashtables.h>
# endif
2011-08-04 07:50:44 +04:00
# include <net/secure_seq.h>
2005-12-14 10:25:31 +03:00
# include <net/ip.h>
2016-10-17 06:02:52 +03:00
# include <net/tcp.h>
2016-02-10 19:50:40 +03:00
# include <net/sock_reuseport.h>
2005-08-10 07:00:51 +04:00
2015-03-19 00:05:33 +03:00
static u32 inet_ehashfn ( const struct net * net , const __be32 laddr ,
const __u16 lport , const __be32 faddr ,
const __be16 fport )
2013-10-19 23:48:51 +04:00
{
2013-10-19 23:48:57 +04:00
static u32 inet_ehash_secret __read_mostly ;
net_get_random_once ( & inet_ehash_secret , sizeof ( inet_ehash_secret ) ) ;
2013-10-19 23:48:51 +04:00
return __inet_ehashfn ( laddr , lport , faddr , fport ,
inet_ehash_secret + net_hash_mix ( net ) ) ;
}
2015-03-19 00:05:35 +03:00
/* This function handles inet_sock, but also timewait and request sockets
* for IPv4 / IPv6 .
*/
2017-07-03 12:57:54 +03:00
static u32 sk_ehashfn ( const struct sock * sk )
2013-10-19 23:48:51 +04:00
{
2015-03-19 00:05:35 +03:00
# if IS_ENABLED(CONFIG_IPV6)
if ( sk - > sk_family = = AF_INET6 & &
! ipv6_addr_v4mapped ( & sk - > sk_v6_daddr ) )
return inet6_ehashfn ( sock_net ( sk ) ,
& sk - > sk_v6_rcv_saddr , sk - > sk_num ,
& sk - > sk_v6_daddr , sk - > sk_dport ) ;
# endif
2015-03-19 00:05:34 +03:00
return inet_ehashfn ( sock_net ( sk ) ,
sk - > sk_rcv_saddr , sk - > sk_num ,
sk - > sk_daddr , sk - > sk_dport ) ;
2013-10-19 23:48:51 +04:00
}
2005-08-10 07:00:51 +04:00
/*
* Allocate and initialize a new local port bind bucket .
* The bindhash mutex for snum ' s hash chain must be held here .
*/
2006-12-07 07:33:20 +03:00
struct inet_bind_bucket * inet_bind_bucket_create ( struct kmem_cache * cachep ,
2008-01-31 16:05:50 +03:00
struct net * net ,
2005-08-10 07:00:51 +04:00
struct inet_bind_hashbucket * head ,
2018-11-07 18:36:02 +03:00
const unsigned short snum ,
int l3mdev )
2005-08-10 07:00:51 +04:00
{
2006-12-07 07:33:16 +03:00
struct inet_bind_bucket * tb = kmem_cache_alloc ( cachep , GFP_ATOMIC ) ;
2005-08-10 07:00:51 +04:00
2015-04-03 11:17:27 +03:00
if ( tb ) {
2015-03-12 07:04:08 +03:00
write_pnet ( & tb - > ib_net , net ) ;
2018-11-07 18:36:02 +03:00
tb - > l3mdev = l3mdev ;
2005-08-10 07:00:51 +04:00
tb - > port = snum ;
tb - > fastreuse = 0 ;
2013-01-22 13:50:24 +04:00
tb - > fastreuseport = 0 ;
2005-08-10 07:00:51 +04:00
INIT_HLIST_HEAD ( & tb - > owners ) ;
hlist_add_head ( & tb - > node , & head - > chain ) ;
}
return tb ;
}
/*
* Caller must hold hashbucket lock for this tb with local BH disabled
*/
2006-12-07 07:33:20 +03:00
void inet_bind_bucket_destroy ( struct kmem_cache * cachep , struct inet_bind_bucket * tb )
2005-08-10 07:00:51 +04:00
{
if ( hlist_empty ( & tb - > owners ) ) {
__hlist_del ( & tb - > node ) ;
kmem_cache_free ( cachep , tb ) ;
}
}
2005-08-10 07:07:13 +04:00
void inet_bind_hash ( struct sock * sk , struct inet_bind_bucket * tb ,
const unsigned short snum )
{
2009-10-15 10:30:45 +04:00
inet_sk ( sk ) - > inet_num = snum ;
2005-08-10 07:07:13 +04:00
sk_add_bind_node ( sk , & tb - > owners ) ;
2005-08-10 07:10:42 +04:00
inet_csk ( sk ) - > icsk_bind_hash = tb ;
2005-08-10 07:07:13 +04:00
}
/*
* Get rid of any references to a local port held by the given sock .
*/
[SOCK] proto: Add hashinfo member to struct proto
This way we can remove TCP and DCCP specific versions of
sk->sk_prot->get_port: both v4 and v6 use inet_csk_get_port
sk->sk_prot->hash: inet_hash is directly used, only v6 need
a specific version to deal with mapped sockets
sk->sk_prot->unhash: both v4 and v6 use inet_hash directly
struct inet_connection_sock_af_ops also gets a new member, bind_conflict, so
that inet_csk_get_port can find the per family routine.
Now only the lookup routines receive as a parameter a struct inet_hashtable.
With this we further reuse code, reducing the difference among INET transport
protocols.
Eventually work has to be done on UDP and SCTP to make them share this
infrastructure and get as a bonus inet_diag interfaces so that iproute can be
used with these protocols.
net-2.6/net/ipv4/inet_hashtables.c:
struct proto | +8
struct inet_connection_sock_af_ops | +8
2 structs changed
__inet_hash_nolisten | +18
__inet_hash | -210
inet_put_port | +8
inet_bind_bucket_create | +1
__inet_hash_connect | -8
5 functions changed, 27 bytes added, 218 bytes removed, diff: -191
net-2.6/net/core/sock.c:
proto_seq_show | +3
1 function changed, 3 bytes added, diff: +3
net-2.6/net/ipv4/inet_connection_sock.c:
inet_csk_get_port | +15
1 function changed, 15 bytes added, diff: +15
net-2.6/net/ipv4/tcp.c:
tcp_set_state | -7
1 function changed, 7 bytes removed, diff: -7
net-2.6/net/ipv4/tcp_ipv4.c:
tcp_v4_get_port | -31
tcp_v4_hash | -48
tcp_v4_destroy_sock | -7
tcp_v4_syn_recv_sock | -2
tcp_unhash | -179
5 functions changed, 267 bytes removed, diff: -267
net-2.6/net/ipv6/inet6_hashtables.c:
__inet6_hash | +8
1 function changed, 8 bytes added, diff: +8
net-2.6/net/ipv4/inet_hashtables.c:
inet_unhash | +190
inet_hash | +242
2 functions changed, 432 bytes added, diff: +432
vmlinux:
16 functions changed, 485 bytes added, 492 bytes removed, diff: -7
/home/acme/git/net-2.6/net/ipv6/tcp_ipv6.c:
tcp_v6_get_port | -31
tcp_v6_hash | -7
tcp_v6_syn_recv_sock | -9
3 functions changed, 47 bytes removed, diff: -47
/home/acme/git/net-2.6/net/dccp/proto.c:
dccp_destroy_sock | -7
dccp_unhash | -179
dccp_hash | -49
dccp_set_state | -7
dccp_done | +1
5 functions changed, 1 bytes added, 242 bytes removed, diff: -241
/home/acme/git/net-2.6/net/dccp/ipv4.c:
dccp_v4_get_port | -31
dccp_v4_request_recv_sock | -2
2 functions changed, 33 bytes removed, diff: -33
/home/acme/git/net-2.6/net/dccp/ipv6.c:
dccp_v6_get_port | -31
dccp_v6_hash | -7
dccp_v6_request_recv_sock | +5
3 functions changed, 5 bytes added, 38 bytes removed, diff: -33
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-02-03 15:06:04 +03:00
static void __inet_put_port ( struct sock * sk )
2005-08-10 07:07:13 +04:00
{
2008-03-23 02:50:58 +03:00
struct inet_hashinfo * hashinfo = sk - > sk_prot - > h . hashinfo ;
2009-10-15 10:30:45 +04:00
const int bhash = inet_bhashfn ( sock_net ( sk ) , inet_sk ( sk ) - > inet_num ,
2008-06-17 04:12:49 +04:00
hashinfo - > bhash_size ) ;
2005-08-10 07:07:13 +04:00
struct inet_bind_hashbucket * head = & hashinfo - > bhash [ bhash ] ;
struct inet_bind_bucket * tb ;
spin_lock ( & head - > lock ) ;
2005-08-10 07:10:42 +04:00
tb = inet_csk ( sk ) - > icsk_bind_hash ;
2005-08-10 07:07:13 +04:00
__sk_del_bind_node ( sk ) ;
2005-08-10 07:10:42 +04:00
inet_csk ( sk ) - > icsk_bind_hash = NULL ;
2009-10-15 10:30:45 +04:00
inet_sk ( sk ) - > inet_num = 0 ;
2005-08-10 07:07:13 +04:00
inet_bind_bucket_destroy ( hashinfo - > bind_bucket_cachep , tb ) ;
spin_unlock ( & head - > lock ) ;
}
[SOCK] proto: Add hashinfo member to struct proto
This way we can remove TCP and DCCP specific versions of
sk->sk_prot->get_port: both v4 and v6 use inet_csk_get_port
sk->sk_prot->hash: inet_hash is directly used, only v6 need
a specific version to deal with mapped sockets
sk->sk_prot->unhash: both v4 and v6 use inet_hash directly
struct inet_connection_sock_af_ops also gets a new member, bind_conflict, so
that inet_csk_get_port can find the per family routine.
Now only the lookup routines receive as a parameter a struct inet_hashtable.
With this we further reuse code, reducing the difference among INET transport
protocols.
Eventually work has to be done on UDP and SCTP to make them share this
infrastructure and get as a bonus inet_diag interfaces so that iproute can be
used with these protocols.
net-2.6/net/ipv4/inet_hashtables.c:
struct proto | +8
struct inet_connection_sock_af_ops | +8
2 structs changed
__inet_hash_nolisten | +18
__inet_hash | -210
inet_put_port | +8
inet_bind_bucket_create | +1
__inet_hash_connect | -8
5 functions changed, 27 bytes added, 218 bytes removed, diff: -191
net-2.6/net/core/sock.c:
proto_seq_show | +3
1 function changed, 3 bytes added, diff: +3
net-2.6/net/ipv4/inet_connection_sock.c:
inet_csk_get_port | +15
1 function changed, 15 bytes added, diff: +15
net-2.6/net/ipv4/tcp.c:
tcp_set_state | -7
1 function changed, 7 bytes removed, diff: -7
net-2.6/net/ipv4/tcp_ipv4.c:
tcp_v4_get_port | -31
tcp_v4_hash | -48
tcp_v4_destroy_sock | -7
tcp_v4_syn_recv_sock | -2
tcp_unhash | -179
5 functions changed, 267 bytes removed, diff: -267
net-2.6/net/ipv6/inet6_hashtables.c:
__inet6_hash | +8
1 function changed, 8 bytes added, diff: +8
net-2.6/net/ipv4/inet_hashtables.c:
inet_unhash | +190
inet_hash | +242
2 functions changed, 432 bytes added, diff: +432
vmlinux:
16 functions changed, 485 bytes added, 492 bytes removed, diff: -7
/home/acme/git/net-2.6/net/ipv6/tcp_ipv6.c:
tcp_v6_get_port | -31
tcp_v6_hash | -7
tcp_v6_syn_recv_sock | -9
3 functions changed, 47 bytes removed, diff: -47
/home/acme/git/net-2.6/net/dccp/proto.c:
dccp_destroy_sock | -7
dccp_unhash | -179
dccp_hash | -49
dccp_set_state | -7
dccp_done | +1
5 functions changed, 1 bytes added, 242 bytes removed, diff: -241
/home/acme/git/net-2.6/net/dccp/ipv4.c:
dccp_v4_get_port | -31
dccp_v4_request_recv_sock | -2
2 functions changed, 33 bytes removed, diff: -33
/home/acme/git/net-2.6/net/dccp/ipv6.c:
dccp_v6_get_port | -31
dccp_v6_hash | -7
dccp_v6_request_recv_sock | +5
3 functions changed, 5 bytes added, 38 bytes removed, diff: -33
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-02-03 15:06:04 +03:00
void inet_put_port ( struct sock * sk )
2005-08-10 07:07:13 +04:00
{
local_bh_disable ( ) ;
[SOCK] proto: Add hashinfo member to struct proto
This way we can remove TCP and DCCP specific versions of
sk->sk_prot->get_port: both v4 and v6 use inet_csk_get_port
sk->sk_prot->hash: inet_hash is directly used, only v6 need
a specific version to deal with mapped sockets
sk->sk_prot->unhash: both v4 and v6 use inet_hash directly
struct inet_connection_sock_af_ops also gets a new member, bind_conflict, so
that inet_csk_get_port can find the per family routine.
Now only the lookup routines receive as a parameter a struct inet_hashtable.
With this we further reuse code, reducing the difference among INET transport
protocols.
Eventually work has to be done on UDP and SCTP to make them share this
infrastructure and get as a bonus inet_diag interfaces so that iproute can be
used with these protocols.
net-2.6/net/ipv4/inet_hashtables.c:
struct proto | +8
struct inet_connection_sock_af_ops | +8
2 structs changed
__inet_hash_nolisten | +18
__inet_hash | -210
inet_put_port | +8
inet_bind_bucket_create | +1
__inet_hash_connect | -8
5 functions changed, 27 bytes added, 218 bytes removed, diff: -191
net-2.6/net/core/sock.c:
proto_seq_show | +3
1 function changed, 3 bytes added, diff: +3
net-2.6/net/ipv4/inet_connection_sock.c:
inet_csk_get_port | +15
1 function changed, 15 bytes added, diff: +15
net-2.6/net/ipv4/tcp.c:
tcp_set_state | -7
1 function changed, 7 bytes removed, diff: -7
net-2.6/net/ipv4/tcp_ipv4.c:
tcp_v4_get_port | -31
tcp_v4_hash | -48
tcp_v4_destroy_sock | -7
tcp_v4_syn_recv_sock | -2
tcp_unhash | -179
5 functions changed, 267 bytes removed, diff: -267
net-2.6/net/ipv6/inet6_hashtables.c:
__inet6_hash | +8
1 function changed, 8 bytes added, diff: +8
net-2.6/net/ipv4/inet_hashtables.c:
inet_unhash | +190
inet_hash | +242
2 functions changed, 432 bytes added, diff: +432
vmlinux:
16 functions changed, 485 bytes added, 492 bytes removed, diff: -7
/home/acme/git/net-2.6/net/ipv6/tcp_ipv6.c:
tcp_v6_get_port | -31
tcp_v6_hash | -7
tcp_v6_syn_recv_sock | -9
3 functions changed, 47 bytes removed, diff: -47
/home/acme/git/net-2.6/net/dccp/proto.c:
dccp_destroy_sock | -7
dccp_unhash | -179
dccp_hash | -49
dccp_set_state | -7
dccp_done | +1
5 functions changed, 1 bytes added, 242 bytes removed, diff: -241
/home/acme/git/net-2.6/net/dccp/ipv4.c:
dccp_v4_get_port | -31
dccp_v4_request_recv_sock | -2
2 functions changed, 33 bytes removed, diff: -33
/home/acme/git/net-2.6/net/dccp/ipv6.c:
dccp_v6_get_port | -31
dccp_v6_hash | -7
dccp_v6_request_recv_sock | +5
3 functions changed, 5 bytes added, 38 bytes removed, diff: -33
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-02-03 15:06:04 +03:00
__inet_put_port ( sk ) ;
2005-08-10 07:07:13 +04:00
local_bh_enable ( ) ;
}
EXPORT_SYMBOL ( inet_put_port ) ;
2005-08-10 07:08:09 +04:00
2015-09-29 17:42:44 +03:00
int __inet_inherit_port ( const struct sock * sk , struct sock * child )
2008-04-18 10:18:15 +04:00
{
struct inet_hashinfo * table = sk - > sk_prot - > h . hashinfo ;
2010-10-21 15:06:43 +04:00
unsigned short port = inet_sk ( child ) - > inet_num ;
const int bhash = inet_bhashfn ( sock_net ( sk ) , port ,
2008-06-17 04:12:49 +04:00
table - > bhash_size ) ;
2008-04-18 10:18:15 +04:00
struct inet_bind_hashbucket * head = & table - > bhash [ bhash ] ;
struct inet_bind_bucket * tb ;
2018-11-07 18:36:02 +03:00
int l3mdev ;
2008-04-18 10:18:15 +04:00
spin_lock ( & head - > lock ) ;
tb = inet_csk ( sk ) - > icsk_bind_hash ;
2015-10-14 15:58:38 +03:00
if ( unlikely ( ! tb ) ) {
spin_unlock ( & head - > lock ) ;
return - ENOENT ;
}
2010-10-21 15:06:43 +04:00
if ( tb - > port ! = port ) {
2018-11-07 18:36:02 +03:00
l3mdev = inet_sk_bound_l3mdev ( sk ) ;
2010-10-21 15:06:43 +04:00
/* NOTE: using tproxy and redirecting skbs to a proxy
* on a different listener port breaks the assumption
* that the listener socket ' s icsk_bind_hash is the same
* as that of the child socket . We have to look up or
* create a new bind bucket for the child here . */
hlist: drop the node parameter from iterators
I'm not sure why, but the hlist for each entry iterators were conceived
list_for_each_entry(pos, head, member)
The hlist ones were greedy and wanted an extra parameter:
hlist_for_each_entry(tpos, pos, head, member)
Why did they need an extra pos parameter? I'm not quite sure. Not only
they don't really need it, it also prevents the iterator from looking
exactly like the list iterator, which is unfortunate.
Besides the semantic patch, there was some manual work required:
- Fix up the actual hlist iterators in linux/list.h
- Fix up the declaration of other iterators based on the hlist ones.
- A very small amount of places were using the 'node' parameter, this
was modified to use 'obj->member' instead.
- Coccinelle didn't handle the hlist_for_each_entry_safe iterator
properly, so those had to be fixed up manually.
The semantic patch which is mostly the work of Peter Senna Tschudin is here:
@@
iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host;
type T;
expression a,c,d,e;
identifier b;
statement S;
@@
-T b;
<+... when != b
(
hlist_for_each_entry(a,
- b,
c, d) S
|
hlist_for_each_entry_continue(a,
- b,
c) S
|
hlist_for_each_entry_from(a,
- b,
c) S
|
hlist_for_each_entry_rcu(a,
- b,
c, d) S
|
hlist_for_each_entry_rcu_bh(a,
- b,
c, d) S
|
hlist_for_each_entry_continue_rcu_bh(a,
- b,
c) S
|
for_each_busy_worker(a, c,
- b,
d) S
|
ax25_uid_for_each(a,
- b,
c) S
|
ax25_for_each(a,
- b,
c) S
|
inet_bind_bucket_for_each(a,
- b,
c) S
|
sctp_for_each_hentry(a,
- b,
c) S
|
sk_for_each(a,
- b,
c) S
|
sk_for_each_rcu(a,
- b,
c) S
|
sk_for_each_from
-(a, b)
+(a)
S
+ sk_for_each_from(a) S
|
sk_for_each_safe(a,
- b,
c, d) S
|
sk_for_each_bound(a,
- b,
c) S
|
hlist_for_each_entry_safe(a,
- b,
c, d, e) S
|
hlist_for_each_entry_continue_rcu(a,
- b,
c) S
|
nr_neigh_for_each(a,
- b,
c) S
|
nr_neigh_for_each_safe(a,
- b,
c, d) S
|
nr_node_for_each(a,
- b,
c) S
|
nr_node_for_each_safe(a,
- b,
c, d) S
|
- for_each_gfn_sp(a, c, d, b) S
+ for_each_gfn_sp(a, c, d) S
|
- for_each_gfn_indirect_valid_sp(a, c, d, b) S
+ for_each_gfn_indirect_valid_sp(a, c, d) S
|
for_each_host(a,
- b,
c) S
|
for_each_host_safe(a,
- b,
c, d) S
|
for_each_mesh_entry(a,
- b,
c, d) S
)
...+>
[akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c]
[akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c]
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix warnings]
[akpm@linux-foudnation.org: redo intrusive kvm changes]
Tested-by: Peter Senna Tschudin <peter.senna@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 05:06:00 +04:00
inet_bind_bucket_for_each ( tb , & head - > chain ) {
2010-10-21 15:06:43 +04:00
if ( net_eq ( ib_net ( tb ) , sock_net ( sk ) ) & &
2018-11-07 18:36:02 +03:00
tb - > l3mdev = = l3mdev & & tb - > port = = port )
2010-10-21 15:06:43 +04:00
break ;
}
hlist: drop the node parameter from iterators
I'm not sure why, but the hlist for each entry iterators were conceived
list_for_each_entry(pos, head, member)
The hlist ones were greedy and wanted an extra parameter:
hlist_for_each_entry(tpos, pos, head, member)
Why did they need an extra pos parameter? I'm not quite sure. Not only
they don't really need it, it also prevents the iterator from looking
exactly like the list iterator, which is unfortunate.
Besides the semantic patch, there was some manual work required:
- Fix up the actual hlist iterators in linux/list.h
- Fix up the declaration of other iterators based on the hlist ones.
- A very small amount of places were using the 'node' parameter, this
was modified to use 'obj->member' instead.
- Coccinelle didn't handle the hlist_for_each_entry_safe iterator
properly, so those had to be fixed up manually.
The semantic patch which is mostly the work of Peter Senna Tschudin is here:
@@
iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host;
type T;
expression a,c,d,e;
identifier b;
statement S;
@@
-T b;
<+... when != b
(
hlist_for_each_entry(a,
- b,
c, d) S
|
hlist_for_each_entry_continue(a,
- b,
c) S
|
hlist_for_each_entry_from(a,
- b,
c) S
|
hlist_for_each_entry_rcu(a,
- b,
c, d) S
|
hlist_for_each_entry_rcu_bh(a,
- b,
c, d) S
|
hlist_for_each_entry_continue_rcu_bh(a,
- b,
c) S
|
for_each_busy_worker(a, c,
- b,
d) S
|
ax25_uid_for_each(a,
- b,
c) S
|
ax25_for_each(a,
- b,
c) S
|
inet_bind_bucket_for_each(a,
- b,
c) S
|
sctp_for_each_hentry(a,
- b,
c) S
|
sk_for_each(a,
- b,
c) S
|
sk_for_each_rcu(a,
- b,
c) S
|
sk_for_each_from
-(a, b)
+(a)
S
+ sk_for_each_from(a) S
|
sk_for_each_safe(a,
- b,
c, d) S
|
sk_for_each_bound(a,
- b,
c) S
|
hlist_for_each_entry_safe(a,
- b,
c, d, e) S
|
hlist_for_each_entry_continue_rcu(a,
- b,
c) S
|
nr_neigh_for_each(a,
- b,
c) S
|
nr_neigh_for_each_safe(a,
- b,
c, d) S
|
nr_node_for_each(a,
- b,
c) S
|
nr_node_for_each_safe(a,
- b,
c, d) S
|
- for_each_gfn_sp(a, c, d, b) S
+ for_each_gfn_sp(a, c, d) S
|
- for_each_gfn_indirect_valid_sp(a, c, d, b) S
+ for_each_gfn_indirect_valid_sp(a, c, d) S
|
for_each_host(a,
- b,
c) S
|
for_each_host_safe(a,
- b,
c, d) S
|
for_each_mesh_entry(a,
- b,
c, d) S
)
...+>
[akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c]
[akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c]
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix warnings]
[akpm@linux-foudnation.org: redo intrusive kvm changes]
Tested-by: Peter Senna Tschudin <peter.senna@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 05:06:00 +04:00
if ( ! tb ) {
2010-10-21 15:06:43 +04:00
tb = inet_bind_bucket_create ( table - > bind_bucket_cachep ,
2018-11-07 18:36:02 +03:00
sock_net ( sk ) , head , port ,
l3mdev ) ;
2010-10-21 15:06:43 +04:00
if ( ! tb ) {
spin_unlock ( & head - > lock ) ;
return - ENOMEM ;
}
}
net: initialize fastreuse on inet_inherit_port
In the case of TPROXY, bind_conflict optimizations for SO_REUSEADDR or
SO_REUSEPORT are broken, possibly resulting in O(n) instead of O(1) bind
behaviour or in the incorrect reuse of a bind.
the kernel keeps track for each bind_bucket if all sockets in the
bind_bucket support SO_REUSEADDR or SO_REUSEPORT in two fastreuse flags.
These flags allow skipping the costly bind_conflict check when possible
(meaning when all sockets have the proper SO_REUSE option).
For every socket added to a bind_bucket, these flags need to be updated.
As soon as a socket that does not support reuse is added, the flag is
set to false and will never go back to true, unless the bind_bucket is
deleted.
Note that there is no mechanism to re-evaluate these flags when a socket
is removed (this might make sense when removing a socket that would not
allow reuse; this leaves room for a future patch).
For this optimization to work, it is mandatory that these flags are
properly initialized and updated.
When a child socket is created from a listen socket in
__inet_inherit_port, the TPROXY case could create a new bind bucket
without properly initializing these flags, thus preventing the
optimization to work. Alternatively, a socket not allowing reuse could
be added to an existing bind bucket without updating the flags, causing
bind_conflict to never be called as it should.
Call inet_csk_update_fastreuse when __inet_inherit_port decides to create
a new bind_bucket or use a different bind_bucket than the one of the
listen socket.
Fixes: 093d282321da ("tproxy: fix hash locking issue when using port redirection in __inet_inherit_port()")
Acked-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Tim Froidcoeur <tim.froidcoeur@tessares.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-08-11 21:33:24 +03:00
inet_csk_update_fastreuse ( tb , child ) ;
2010-10-21 15:06:43 +04:00
}
2010-11-26 17:26:27 +03:00
inet_bind_hash ( child , tb , port ) ;
2008-04-18 10:18:15 +04:00
spin_unlock ( & head - > lock ) ;
2010-10-21 15:06:43 +04:00
return 0 ;
2008-04-18 10:18:15 +04:00
}
EXPORT_SYMBOL_GPL ( __inet_inherit_port ) ;
2017-12-01 23:52:31 +03:00
static struct inet_listen_hashbucket *
inet_lhash2_bucket_sk ( struct inet_hashinfo * h , struct sock * sk )
{
u32 hash ;
# if IS_ENABLED(CONFIG_IPV6)
if ( sk - > sk_family = = AF_INET6 )
hash = ipv6_portaddr_hash ( sock_net ( sk ) ,
& sk - > sk_v6_rcv_saddr ,
inet_sk ( sk ) - > inet_num ) ;
else
# endif
hash = ipv4_portaddr_hash ( sock_net ( sk ) ,
inet_sk ( sk ) - > inet_rcv_saddr ,
inet_sk ( sk ) - > inet_num ) ;
return inet_lhash2_bucket ( h , hash ) ;
}
static void inet_hash2 ( struct inet_hashinfo * h , struct sock * sk )
{
struct inet_listen_hashbucket * ilb2 ;
if ( ! h - > lhash2 )
return ;
ilb2 = inet_lhash2_bucket_sk ( h , sk ) ;
spin_lock ( & ilb2 - > lock ) ;
if ( sk - > sk_reuseport & & sk - > sk_family = = AF_INET6 )
hlist_add_tail_rcu ( & inet_csk ( sk ) - > icsk_listen_portaddr_node ,
& ilb2 - > head ) ;
else
hlist_add_head_rcu ( & inet_csk ( sk ) - > icsk_listen_portaddr_node ,
& ilb2 - > head ) ;
ilb2 - > count + + ;
spin_unlock ( & ilb2 - > lock ) ;
}
static void inet_unhash2 ( struct inet_hashinfo * h , struct sock * sk )
{
struct inet_listen_hashbucket * ilb2 ;
if ( ! h - > lhash2 | |
WARN_ON_ONCE ( hlist_unhashed ( & inet_csk ( sk ) - > icsk_listen_portaddr_node ) ) )
return ;
ilb2 = inet_lhash2_bucket_sk ( h , sk ) ;
spin_lock ( & ilb2 - > lock ) ;
hlist_del_init_rcu ( & inet_csk ( sk ) - > icsk_listen_portaddr_node ) ;
ilb2 - > count - - ;
spin_unlock ( & ilb2 - > lock ) ;
}
2008-11-24 04:22:55 +03:00
static inline int compute_score ( struct sock * sk , struct net * net ,
const unsigned short hnum , const __be32 daddr ,
2020-08-31 09:26:34 +03:00
const int dif , const int sdif )
2008-11-24 04:22:55 +03:00
{
int score = - 1 ;
2018-12-13 00:15:35 +03:00
if ( net_eq ( sock_net ( sk ) , net ) & & sk - > sk_num = = hnum & &
2008-11-24 04:22:55 +03:00
! ipv6_only_sock ( sk ) ) {
2018-12-13 00:15:35 +03:00
if ( sk - > sk_rcv_saddr ! = daddr )
return - 1 ;
if ( ! inet_sk_bound_dev_eq ( net , sk - > sk_bound_dev_if , dif , sdif ) )
2018-11-07 18:36:03 +03:00
return - 1 ;
2021-10-05 16:03:42 +03:00
score = sk - > sk_bound_dev_if ? 2 : 1 ;
2017-08-07 18:44:17 +03:00
2021-10-05 16:03:42 +03:00
if ( sk - > sk_family = = PF_INET )
score + + ;
2019-10-30 23:00:04 +03:00
if ( READ_ONCE ( sk - > sk_incoming_cpu ) = = raw_smp_processor_id ( ) )
2015-10-09 05:33:21 +03:00
score + + ;
2008-11-24 04:22:55 +03:00
}
return score ;
}
2020-07-17 13:35:24 +03:00
static inline struct sock * lookup_reuseport ( struct net * net , struct sock * sk ,
struct sk_buff * skb , int doff ,
__be32 saddr , __be16 sport ,
__be32 daddr , unsigned short hnum )
{
struct sock * reuse_sk = NULL ;
u32 phash ;
if ( sk - > sk_reuseport ) {
phash = inet_ehashfn ( net , daddr , hnum , saddr , sport ) ;
reuse_sk = reuseport_select_sock ( sk , phash , skb , doff ) ;
}
return reuse_sk ;
}
2005-08-10 07:09:06 +04:00
/*
2016-04-01 18:52:17 +03:00
* Here are some nice properties to exploit here . The BSD API
* does not allow a listening sock to specify the remote port nor the
2005-08-10 07:09:06 +04:00
* remote address for the connection . So always assume those are both
* wildcarded during the search since they can never be otherwise .
*/
2005-08-10 07:09:46 +04:00
2016-04-01 18:52:17 +03:00
/* called with rcu_read_lock() : No refcount taken on the socket */
2017-12-01 23:52:31 +03:00
static struct sock * inet_lhash2_lookup ( struct net * net ,
struct inet_listen_hashbucket * ilb2 ,
struct sk_buff * skb , int doff ,
const __be32 saddr , __be16 sport ,
const __be32 daddr , const unsigned short hnum ,
const int dif , const int sdif )
{
struct inet_connection_sock * icsk ;
struct sock * sk , * result = NULL ;
int score , hiscore = 0 ;
inet_lhash2_for_each_icsk_rcu ( icsk , & ilb2 - > head ) {
sk = ( struct sock * ) icsk ;
2020-08-31 09:26:34 +03:00
score = compute_score ( sk , net , hnum , daddr , dif , sdif ) ;
2017-12-01 23:52:31 +03:00
if ( score > hiscore ) {
2020-07-17 13:35:24 +03:00
result = lookup_reuseport ( net , sk , skb , doff ,
saddr , sport , daddr , hnum ) ;
if ( result )
return result ;
2017-12-01 23:52:31 +03:00
result = sk ;
hiscore = score ;
}
}
return result ;
}
inet: Run SK_LOOKUP BPF program on socket lookup
Run a BPF program before looking up a listening socket on the receive path.
Program selects a listening socket to yield as result of socket lookup by
calling bpf_sk_assign() helper and returning SK_PASS code. Program can
revert its decision by assigning a NULL socket with bpf_sk_assign().
Alternatively, BPF program can also fail the lookup by returning with
SK_DROP, or let the lookup continue as usual with SK_PASS on return, when
no socket has been selected with bpf_sk_assign().
This lets the user match packets with listening sockets freely at the last
possible point on the receive path, where we know that packets are destined
for local delivery after undergoing policing, filtering, and routing.
With BPF code selecting the socket, directing packets destined to an IP
range or to a port range to a single socket becomes possible.
In case multiple programs are attached, they are run in series in the order
in which they were attached. The end result is determined from return codes
of all the programs according to following rules:
1. If any program returned SK_PASS and selected a valid socket, the socket
is used as result of socket lookup.
2. If more than one program returned SK_PASS and selected a socket,
last selection takes effect.
3. If any program returned SK_DROP, and no program returned SK_PASS and
selected a socket, socket lookup fails with -ECONNREFUSED.
4. If all programs returned SK_PASS and none of them selected a socket,
socket lookup continues to htable-based lookup.
Suggested-by: Marek Majkowski <marek@cloudflare.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200717103536.397595-5-jakub@cloudflare.com
2020-07-17 13:35:25 +03:00
static inline struct sock * inet_lookup_run_bpf ( struct net * net ,
struct inet_hashinfo * hashinfo ,
struct sk_buff * skb , int doff ,
__be32 saddr , __be16 sport ,
2021-11-10 14:10:15 +03:00
__be32 daddr , u16 hnum , const int dif )
inet: Run SK_LOOKUP BPF program on socket lookup
Run a BPF program before looking up a listening socket on the receive path.
Program selects a listening socket to yield as result of socket lookup by
calling bpf_sk_assign() helper and returning SK_PASS code. Program can
revert its decision by assigning a NULL socket with bpf_sk_assign().
Alternatively, BPF program can also fail the lookup by returning with
SK_DROP, or let the lookup continue as usual with SK_PASS on return, when
no socket has been selected with bpf_sk_assign().
This lets the user match packets with listening sockets freely at the last
possible point on the receive path, where we know that packets are destined
for local delivery after undergoing policing, filtering, and routing.
With BPF code selecting the socket, directing packets destined to an IP
range or to a port range to a single socket becomes possible.
In case multiple programs are attached, they are run in series in the order
in which they were attached. The end result is determined from return codes
of all the programs according to following rules:
1. If any program returned SK_PASS and selected a valid socket, the socket
is used as result of socket lookup.
2. If more than one program returned SK_PASS and selected a socket,
last selection takes effect.
3. If any program returned SK_DROP, and no program returned SK_PASS and
selected a socket, socket lookup fails with -ECONNREFUSED.
4. If all programs returned SK_PASS and none of them selected a socket,
socket lookup continues to htable-based lookup.
Suggested-by: Marek Majkowski <marek@cloudflare.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200717103536.397595-5-jakub@cloudflare.com
2020-07-17 13:35:25 +03:00
{
struct sock * sk , * reuse_sk ;
bool no_reuseport ;
if ( hashinfo ! = & tcp_hashinfo )
return NULL ; /* only TCP is supported */
2021-11-10 14:10:15 +03:00
no_reuseport = bpf_sk_lookup_run_v4 ( net , IPPROTO_TCP , saddr , sport ,
daddr , hnum , dif , & sk ) ;
inet: Run SK_LOOKUP BPF program on socket lookup
Run a BPF program before looking up a listening socket on the receive path.
Program selects a listening socket to yield as result of socket lookup by
calling bpf_sk_assign() helper and returning SK_PASS code. Program can
revert its decision by assigning a NULL socket with bpf_sk_assign().
Alternatively, BPF program can also fail the lookup by returning with
SK_DROP, or let the lookup continue as usual with SK_PASS on return, when
no socket has been selected with bpf_sk_assign().
This lets the user match packets with listening sockets freely at the last
possible point on the receive path, where we know that packets are destined
for local delivery after undergoing policing, filtering, and routing.
With BPF code selecting the socket, directing packets destined to an IP
range or to a port range to a single socket becomes possible.
In case multiple programs are attached, they are run in series in the order
in which they were attached. The end result is determined from return codes
of all the programs according to following rules:
1. If any program returned SK_PASS and selected a valid socket, the socket
is used as result of socket lookup.
2. If more than one program returned SK_PASS and selected a socket,
last selection takes effect.
3. If any program returned SK_DROP, and no program returned SK_PASS and
selected a socket, socket lookup fails with -ECONNREFUSED.
4. If all programs returned SK_PASS and none of them selected a socket,
socket lookup continues to htable-based lookup.
Suggested-by: Marek Majkowski <marek@cloudflare.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200717103536.397595-5-jakub@cloudflare.com
2020-07-17 13:35:25 +03:00
if ( no_reuseport | | IS_ERR_OR_NULL ( sk ) )
return sk ;
reuse_sk = lookup_reuseport ( net , sk , skb , doff , saddr , sport , daddr , hnum ) ;
if ( reuse_sk )
sk = reuse_sk ;
return sk ;
}
2008-01-31 16:06:40 +03:00
struct sock * __inet_lookup_listener ( struct net * net ,
struct inet_hashinfo * hashinfo ,
2016-02-10 19:50:38 +03:00
struct sk_buff * skb , int doff ,
2013-01-22 13:50:24 +04:00
const __be32 saddr , __be16 sport ,
2006-09-28 05:43:33 +04:00
const __be32 daddr , const unsigned short hnum ,
2017-08-07 18:44:17 +03:00
const int dif , const int sdif )
2006-08-08 13:18:10 +04:00
{
2017-12-01 23:52:31 +03:00
struct inet_listen_hashbucket * ilb2 ;
2018-12-13 00:15:35 +03:00
struct sock * result = NULL ;
2017-12-01 23:52:31 +03:00
unsigned int hash2 ;
inet: Run SK_LOOKUP BPF program on socket lookup
Run a BPF program before looking up a listening socket on the receive path.
Program selects a listening socket to yield as result of socket lookup by
calling bpf_sk_assign() helper and returning SK_PASS code. Program can
revert its decision by assigning a NULL socket with bpf_sk_assign().
Alternatively, BPF program can also fail the lookup by returning with
SK_DROP, or let the lookup continue as usual with SK_PASS on return, when
no socket has been selected with bpf_sk_assign().
This lets the user match packets with listening sockets freely at the last
possible point on the receive path, where we know that packets are destined
for local delivery after undergoing policing, filtering, and routing.
With BPF code selecting the socket, directing packets destined to an IP
range or to a port range to a single socket becomes possible.
In case multiple programs are attached, they are run in series in the order
in which they were attached. The end result is determined from return codes
of all the programs according to following rules:
1. If any program returned SK_PASS and selected a valid socket, the socket
is used as result of socket lookup.
2. If more than one program returned SK_PASS and selected a socket,
last selection takes effect.
3. If any program returned SK_DROP, and no program returned SK_PASS and
selected a socket, socket lookup fails with -ECONNREFUSED.
4. If all programs returned SK_PASS and none of them selected a socket,
socket lookup continues to htable-based lookup.
Suggested-by: Marek Majkowski <marek@cloudflare.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200717103536.397595-5-jakub@cloudflare.com
2020-07-17 13:35:25 +03:00
/* Lookup redirect from BPF */
if ( static_branch_unlikely ( & bpf_sk_lookup_enabled ) ) {
result = inet_lookup_run_bpf ( net , hashinfo , skb , doff ,
2021-11-10 14:10:15 +03:00
saddr , sport , daddr , hnum , dif ) ;
inet: Run SK_LOOKUP BPF program on socket lookup
Run a BPF program before looking up a listening socket on the receive path.
Program selects a listening socket to yield as result of socket lookup by
calling bpf_sk_assign() helper and returning SK_PASS code. Program can
revert its decision by assigning a NULL socket with bpf_sk_assign().
Alternatively, BPF program can also fail the lookup by returning with
SK_DROP, or let the lookup continue as usual with SK_PASS on return, when
no socket has been selected with bpf_sk_assign().
This lets the user match packets with listening sockets freely at the last
possible point on the receive path, where we know that packets are destined
for local delivery after undergoing policing, filtering, and routing.
With BPF code selecting the socket, directing packets destined to an IP
range or to a port range to a single socket becomes possible.
In case multiple programs are attached, they are run in series in the order
in which they were attached. The end result is determined from return codes
of all the programs according to following rules:
1. If any program returned SK_PASS and selected a valid socket, the socket
is used as result of socket lookup.
2. If more than one program returned SK_PASS and selected a socket,
last selection takes effect.
3. If any program returned SK_DROP, and no program returned SK_PASS and
selected a socket, socket lookup fails with -ECONNREFUSED.
4. If all programs returned SK_PASS and none of them selected a socket,
socket lookup continues to htable-based lookup.
Suggested-by: Marek Majkowski <marek@cloudflare.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200717103536.397595-5-jakub@cloudflare.com
2020-07-17 13:35:25 +03:00
if ( result )
goto done ;
}
2017-12-01 23:52:31 +03:00
hash2 = ipv4_portaddr_hash ( net , daddr , hnum ) ;
ilb2 = inet_lhash2_bucket ( hashinfo , hash2 ) ;
result = inet_lhash2_lookup ( net , ilb2 , skb , doff ,
saddr , sport , daddr , hnum ,
dif , sdif ) ;
if ( result )
2018-08-08 11:01:26 +03:00
goto done ;
2017-12-01 23:52:31 +03:00
/* Lookup lhash2 with INADDR_ANY */
hash2 = ipv4_portaddr_hash ( net , htonl ( INADDR_ANY ) , hnum ) ;
ilb2 = inet_lhash2_bucket ( hashinfo , hash2 ) ;
2018-08-08 11:01:26 +03:00
result = inet_lhash2_lookup ( net , ilb2 , skb , doff ,
2018-12-13 00:15:35 +03:00
saddr , sport , htonl ( INADDR_ANY ) , hnum ,
2018-08-08 11:01:26 +03:00
dif , sdif ) ;
done :
2019-06-06 00:09:05 +03:00
if ( IS_ERR ( result ) )
2018-08-08 11:01:26 +03:00
return NULL ;
2008-11-24 04:22:55 +03:00
return result ;
2006-08-08 13:18:10 +04:00
}
2006-08-10 02:47:12 +04:00
EXPORT_SYMBOL_GPL ( __inet_lookup_listener ) ;
2005-12-14 10:25:31 +03:00
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
/* All sockets share common refcount, but have different destructors */
void sock_gen_put ( struct sock * sk )
{
2017-06-30 13:08:01 +03:00
if ( ! refcount_dec_and_test ( & sk - > sk_refcnt ) )
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
return ;
if ( sk - > sk_state = = TCP_TIME_WAIT )
inet_twsk_free ( inet_twsk ( sk ) ) ;
2015-03-13 02:44:08 +03:00
else if ( sk - > sk_state = = TCP_NEW_SYN_RECV )
reqsk_free ( inet_reqsk ( sk ) ) ;
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
else
sk_free ( sk ) ;
}
EXPORT_SYMBOL_GPL ( sock_gen_put ) ;
2015-03-16 07:12:15 +03:00
void sock_edemux ( struct sk_buff * skb )
{
sock_gen_put ( skb - > sk ) ;
}
EXPORT_SYMBOL ( sock_edemux ) ;
2012-04-15 05:34:41 +04:00
struct sock * __inet_lookup_established ( struct net * net ,
2008-01-31 16:06:40 +03:00
struct inet_hashinfo * hashinfo ,
2007-12-21 02:32:17 +03:00
const __be32 saddr , const __be16 sport ,
const __be32 daddr , const u16 hnum ,
2017-08-07 18:44:17 +03:00
const int dif , const int sdif )
2007-12-21 02:32:17 +03:00
{
2014-05-14 07:30:07 +04:00
INET_ADDR_COOKIE ( acookie , saddr , daddr ) ;
2007-12-21 02:32:17 +03:00
const __portpair ports = INET_COMBINED_PORTS ( sport , hnum ) ;
struct sock * sk ;
2008-11-17 06:40:17 +03:00
const struct hlist_nulls_node * node ;
2007-12-21 02:32:17 +03:00
/* Optimize here for direct hit, only listening connections can
* have wildcards anyways .
*/
2008-06-17 04:13:27 +04:00
unsigned int hash = inet_ehashfn ( net , daddr , hnum , saddr , sport ) ;
2009-10-09 04:16:19 +04:00
unsigned int slot = hash & hashinfo - > ehash_mask ;
2008-11-17 06:40:17 +03:00
struct inet_ehash_bucket * head = & hashinfo - > ehash [ slot ] ;
2007-12-21 02:32:17 +03:00
2008-11-17 06:40:17 +03:00
begin :
sk_nulls_for_each_rcu ( sk , node , & head - > chain ) {
2012-11-30 13:49:27 +04:00
if ( sk - > sk_hash ! = hash )
continue ;
if ( likely ( INET_MATCH ( sk , net , acookie ,
2017-08-07 18:44:17 +03:00
saddr , daddr , ports , dif , sdif ) ) ) {
2017-06-30 13:08:01 +03:00
if ( unlikely ( ! refcount_inc_not_zero ( & sk - > sk_refcnt ) ) )
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
goto out ;
2012-11-30 13:49:27 +04:00
if ( unlikely ( ! INET_MATCH ( sk , net , acookie ,
2017-08-07 18:44:17 +03:00
saddr , daddr , ports ,
dif , sdif ) ) ) {
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
sock_gen_put ( sk ) ;
2008-11-17 06:40:17 +03:00
goto begin ;
}
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
goto found ;
2008-11-17 06:40:17 +03:00
}
2007-12-21 02:32:17 +03:00
}
2008-11-17 06:40:17 +03:00
/*
* if the nulls value we got at the end of this lookup is
* not the expected one , we must restart lookup .
* We probably met an item that was moved to another chain .
*/
if ( get_nulls_value ( node ) ! = slot )
goto begin ;
2007-12-21 02:32:17 +03:00
out :
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
sk = NULL ;
found :
2007-12-21 02:32:17 +03:00
return sk ;
}
EXPORT_SYMBOL_GPL ( __inet_lookup_established ) ;
2005-12-14 10:25:31 +03:00
/* called with local bh disabled */
static int __inet_check_established ( struct inet_timewait_death_row * death_row ,
struct sock * sk , __u16 lport ,
struct inet_timewait_sock * * twp )
{
struct inet_hashinfo * hinfo = death_row - > hashinfo ;
struct inet_sock * inet = inet_sk ( sk ) ;
2009-10-15 10:30:45 +04:00
__be32 daddr = inet - > inet_rcv_saddr ;
__be32 saddr = inet - > inet_daddr ;
2005-12-14 10:25:31 +03:00
int dif = sk - > sk_bound_dev_if ;
2017-08-07 18:44:17 +03:00
struct net * net = sock_net ( sk ) ;
int sdif = l3mdev_master_ifindex_by_index ( net , dif ) ;
2014-05-14 07:30:07 +04:00
INET_ADDR_COOKIE ( acookie , saddr , daddr ) ;
2009-10-15 10:30:45 +04:00
const __portpair ports = INET_COMBINED_PORTS ( inet - > inet_dport , lport ) ;
unsigned int hash = inet_ehashfn ( net , daddr , lport ,
saddr , inet - > inet_dport ) ;
2005-12-14 10:25:31 +03:00
struct inet_ehash_bucket * head = inet_ehash_bucket ( hinfo , hash ) ;
2008-11-21 07:39:09 +03:00
spinlock_t * lock = inet_ehash_lockp ( hinfo , hash ) ;
2005-12-14 10:25:31 +03:00
struct sock * sk2 ;
2008-11-17 06:40:17 +03:00
const struct hlist_nulls_node * node ;
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
struct inet_timewait_sock * tw = NULL ;
2005-12-14 10:25:31 +03:00
2008-11-21 07:39:09 +03:00
spin_lock ( lock ) ;
2005-12-14 10:25:31 +03:00
2008-11-17 06:40:17 +03:00
sk_nulls_for_each ( sk2 , node , & head - > chain ) {
2012-11-30 13:49:27 +04:00
if ( sk2 - > sk_hash ! = hash )
continue ;
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
2012-11-30 13:49:27 +04:00
if ( likely ( INET_MATCH ( sk2 , net , acookie ,
2017-08-07 18:44:17 +03:00
saddr , daddr , ports , dif , sdif ) ) ) {
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
if ( sk2 - > sk_state = = TCP_TIME_WAIT ) {
tw = inet_twsk ( sk2 ) ;
if ( twsk_unique ( sk , sk2 , twp ) )
break ;
}
2005-12-14 10:25:31 +03:00
goto not_unique ;
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
}
2005-12-14 10:25:31 +03:00
}
/* Must record num and sport now. Otherwise we will see
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
* in hash table socket with a funny identity .
*/
2009-10-15 10:30:45 +04:00
inet - > inet_num = lport ;
inet - > inet_sport = htons ( lport ) ;
2005-12-14 10:25:31 +03:00
sk - > sk_hash = hash ;
2008-07-26 08:43:18 +04:00
WARN_ON ( ! sk_unhashed ( sk ) ) ;
2008-11-17 06:40:17 +03:00
__sk_nulls_add_node_rcu ( sk , & head - > chain ) ;
2009-12-03 01:31:19 +03:00
if ( tw ) {
2015-07-09 00:28:29 +03:00
sk_nulls_del_node_init_rcu ( ( struct sock * ) tw ) ;
2016-04-28 02:44:39 +03:00
__NET_INC_STATS ( net , LINUX_MIB_TIMEWAITRECYCLED ) ;
2009-12-03 01:31:19 +03:00
}
2008-11-21 07:39:09 +03:00
spin_unlock ( lock ) ;
2008-04-01 06:41:46 +04:00
sock_prot_inuse_add ( sock_net ( sk ) , sk - > sk_prot , 1 ) ;
2005-12-14 10:25:31 +03:00
if ( twp ) {
* twp = tw ;
} else if ( tw ) {
/* Silly. Should hash-dance instead... */
2015-07-09 00:28:30 +03:00
inet_twsk_deschedule_put ( tw ) ;
2005-12-14 10:25:31 +03:00
}
return 0 ;
not_unique :
2008-11-21 07:39:09 +03:00
spin_unlock ( lock ) ;
2005-12-14 10:25:31 +03:00
return - EADDRNOTAVAIL ;
}
2015-05-27 20:46:02 +03:00
static u32 inet_sk_port_offset ( const struct sock * sk )
2005-12-14 10:25:31 +03:00
{
const struct inet_sock * inet = inet_sk ( sk ) ;
2015-05-27 20:46:02 +03:00
2009-10-15 10:30:45 +04:00
return secure_ipv4_port_ephemeral ( inet - > inet_rcv_saddr ,
inet - > inet_daddr ,
inet - > inet_dport ) ;
2005-12-14 10:25:31 +03:00
}
tcp: fix race condition when creating child sockets from syncookies
When the TCP stack is in SYN flood mode, the server child socket is
created from the SYN cookie received in a TCP packet with the ACK flag
set.
The child socket is created when the server receives the first TCP
packet with a valid SYN cookie from the client. Usually, this packet
corresponds to the final step of the TCP 3-way handshake, the ACK
packet. But is also possible to receive a valid SYN cookie from the
first TCP data packet sent by the client, and thus create a child socket
from that SYN cookie.
Since a client socket is ready to send data as soon as it receives the
SYN+ACK packet from the server, the client can send the ACK packet (sent
by the TCP stack code), and the first data packet (sent by the userspace
program) almost at the same time, and thus the server will equally
receive the two TCP packets with valid SYN cookies almost at the same
instant.
When such event happens, the TCP stack code has a race condition that
occurs between the momement a lookup is done to the established
connections hashtable to check for the existence of a connection for the
same client, and the moment that the child socket is added to the
established connections hashtable. As a consequence, this race condition
can lead to a situation where we add two child sockets to the
established connections hashtable and deliver two sockets to the
userspace program to the same client.
This patch fixes the race condition by checking if an existing child
socket exists for the same client when we are adding the second child
socket to the established connections socket. If an existing child
socket exists, we drop the packet and discard the second child socket
to the same client.
Signed-off-by: Ricardo Dias <rdias@singlestore.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20201120111133.GA67501@rdias-suse-pc.lan
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-11-20 14:11:33 +03:00
/* Searches for an exsiting socket in the ehash bucket list.
* Returns true if found , false otherwise .
2015-10-02 21:43:32 +03:00
*/
tcp: fix race condition when creating child sockets from syncookies
When the TCP stack is in SYN flood mode, the server child socket is
created from the SYN cookie received in a TCP packet with the ACK flag
set.
The child socket is created when the server receives the first TCP
packet with a valid SYN cookie from the client. Usually, this packet
corresponds to the final step of the TCP 3-way handshake, the ACK
packet. But is also possible to receive a valid SYN cookie from the
first TCP data packet sent by the client, and thus create a child socket
from that SYN cookie.
Since a client socket is ready to send data as soon as it receives the
SYN+ACK packet from the server, the client can send the ACK packet (sent
by the TCP stack code), and the first data packet (sent by the userspace
program) almost at the same time, and thus the server will equally
receive the two TCP packets with valid SYN cookies almost at the same
instant.
When such event happens, the TCP stack code has a race condition that
occurs between the momement a lookup is done to the established
connections hashtable to check for the existence of a connection for the
same client, and the moment that the child socket is added to the
established connections hashtable. As a consequence, this race condition
can lead to a situation where we add two child sockets to the
established connections hashtable and deliver two sockets to the
userspace program to the same client.
This patch fixes the race condition by checking if an existing child
socket exists for the same client when we are adding the second child
socket to the established connections socket. If an existing child
socket exists, we drop the packet and discard the second child socket
to the same client.
Signed-off-by: Ricardo Dias <rdias@singlestore.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20201120111133.GA67501@rdias-suse-pc.lan
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-11-20 14:11:33 +03:00
static bool inet_ehash_lookup_by_sk ( struct sock * sk ,
struct hlist_nulls_head * list )
{
const __portpair ports = INET_COMBINED_PORTS ( sk - > sk_dport , sk - > sk_num ) ;
const int sdif = sk - > sk_bound_dev_if ;
const int dif = sk - > sk_bound_dev_if ;
const struct hlist_nulls_node * node ;
struct net * net = sock_net ( sk ) ;
struct sock * esk ;
INET_ADDR_COOKIE ( acookie , sk - > sk_daddr , sk - > sk_rcv_saddr ) ;
sk_nulls_for_each_rcu ( esk , node , list ) {
if ( esk - > sk_hash ! = sk - > sk_hash )
continue ;
if ( sk - > sk_family = = AF_INET ) {
if ( unlikely ( INET_MATCH ( esk , net , acookie ,
sk - > sk_daddr ,
sk - > sk_rcv_saddr ,
ports , dif , sdif ) ) ) {
return true ;
}
}
# if IS_ENABLED(CONFIG_IPV6)
else if ( sk - > sk_family = = AF_INET6 ) {
if ( unlikely ( INET6_MATCH ( esk , net ,
& sk - > sk_v6_daddr ,
& sk - > sk_v6_rcv_saddr ,
ports , dif , sdif ) ) ) {
return true ;
}
}
# endif
}
return false ;
}
/* Insert a socket into ehash, and eventually remove another one
* ( The another one can be a SYN_RECV or TIMEWAIT )
* If an existing socket already exists , socket sk is not inserted ,
* and sets found_dup_sk parameter to true .
*/
bool inet_ehash_insert ( struct sock * sk , struct sock * osk , bool * found_dup_sk )
2007-12-21 02:31:33 +03:00
{
2008-03-23 02:50:58 +03:00
struct inet_hashinfo * hashinfo = sk - > sk_prot - > h . hashinfo ;
2008-11-17 06:40:17 +03:00
struct hlist_nulls_head * list ;
2007-12-21 02:31:33 +03:00
struct inet_ehash_bucket * head ;
2015-03-19 00:05:34 +03:00
spinlock_t * lock ;
2015-10-22 18:20:46 +03:00
bool ret = true ;
2007-12-21 02:31:33 +03:00
2015-10-02 21:43:32 +03:00
WARN_ON_ONCE ( ! sk_unhashed ( sk ) ) ;
2007-12-21 02:31:33 +03:00
2015-03-19 00:05:34 +03:00
sk - > sk_hash = sk_ehashfn ( sk ) ;
2007-12-21 02:31:33 +03:00
head = inet_ehash_bucket ( hashinfo , sk - > sk_hash ) ;
list = & head - > chain ;
lock = inet_ehash_lockp ( hashinfo , sk - > sk_hash ) ;
2008-11-21 07:39:09 +03:00
spin_lock ( lock ) ;
2015-07-09 00:28:29 +03:00
if ( osk ) {
2015-10-22 18:20:46 +03:00
WARN_ON_ONCE ( sk - > sk_hash ! = osk - > sk_hash ) ;
ret = sk_nulls_del_node_init_rcu ( osk ) ;
tcp: fix race condition when creating child sockets from syncookies
When the TCP stack is in SYN flood mode, the server child socket is
created from the SYN cookie received in a TCP packet with the ACK flag
set.
The child socket is created when the server receives the first TCP
packet with a valid SYN cookie from the client. Usually, this packet
corresponds to the final step of the TCP 3-way handshake, the ACK
packet. But is also possible to receive a valid SYN cookie from the
first TCP data packet sent by the client, and thus create a child socket
from that SYN cookie.
Since a client socket is ready to send data as soon as it receives the
SYN+ACK packet from the server, the client can send the ACK packet (sent
by the TCP stack code), and the first data packet (sent by the userspace
program) almost at the same time, and thus the server will equally
receive the two TCP packets with valid SYN cookies almost at the same
instant.
When such event happens, the TCP stack code has a race condition that
occurs between the momement a lookup is done to the established
connections hashtable to check for the existence of a connection for the
same client, and the moment that the child socket is added to the
established connections hashtable. As a consequence, this race condition
can lead to a situation where we add two child sockets to the
established connections hashtable and deliver two sockets to the
userspace program to the same client.
This patch fixes the race condition by checking if an existing child
socket exists for the same client when we are adding the second child
socket to the established connections socket. If an existing child
socket exists, we drop the packet and discard the second child socket
to the same client.
Signed-off-by: Ricardo Dias <rdias@singlestore.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20201120111133.GA67501@rdias-suse-pc.lan
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-11-20 14:11:33 +03:00
} else if ( found_dup_sk ) {
* found_dup_sk = inet_ehash_lookup_by_sk ( sk , list ) ;
if ( * found_dup_sk )
ret = false ;
2009-12-04 06:46:54 +03:00
}
tcp: fix race condition when creating child sockets from syncookies
When the TCP stack is in SYN flood mode, the server child socket is
created from the SYN cookie received in a TCP packet with the ACK flag
set.
The child socket is created when the server receives the first TCP
packet with a valid SYN cookie from the client. Usually, this packet
corresponds to the final step of the TCP 3-way handshake, the ACK
packet. But is also possible to receive a valid SYN cookie from the
first TCP data packet sent by the client, and thus create a child socket
from that SYN cookie.
Since a client socket is ready to send data as soon as it receives the
SYN+ACK packet from the server, the client can send the ACK packet (sent
by the TCP stack code), and the first data packet (sent by the userspace
program) almost at the same time, and thus the server will equally
receive the two TCP packets with valid SYN cookies almost at the same
instant.
When such event happens, the TCP stack code has a race condition that
occurs between the momement a lookup is done to the established
connections hashtable to check for the existence of a connection for the
same client, and the moment that the child socket is added to the
established connections hashtable. As a consequence, this race condition
can lead to a situation where we add two child sockets to the
established connections hashtable and deliver two sockets to the
userspace program to the same client.
This patch fixes the race condition by checking if an existing child
socket exists for the same client when we are adding the second child
socket to the established connections socket. If an existing child
socket exists, we drop the packet and discard the second child socket
to the same client.
Signed-off-by: Ricardo Dias <rdias@singlestore.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20201120111133.GA67501@rdias-suse-pc.lan
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-11-20 14:11:33 +03:00
2015-10-22 18:20:46 +03:00
if ( ret )
__sk_nulls_add_node_rcu ( sk , list ) ;
tcp: fix race condition when creating child sockets from syncookies
When the TCP stack is in SYN flood mode, the server child socket is
created from the SYN cookie received in a TCP packet with the ACK flag
set.
The child socket is created when the server receives the first TCP
packet with a valid SYN cookie from the client. Usually, this packet
corresponds to the final step of the TCP 3-way handshake, the ACK
packet. But is also possible to receive a valid SYN cookie from the
first TCP data packet sent by the client, and thus create a child socket
from that SYN cookie.
Since a client socket is ready to send data as soon as it receives the
SYN+ACK packet from the server, the client can send the ACK packet (sent
by the TCP stack code), and the first data packet (sent by the userspace
program) almost at the same time, and thus the server will equally
receive the two TCP packets with valid SYN cookies almost at the same
instant.
When such event happens, the TCP stack code has a race condition that
occurs between the momement a lookup is done to the established
connections hashtable to check for the existence of a connection for the
same client, and the moment that the child socket is added to the
established connections hashtable. As a consequence, this race condition
can lead to a situation where we add two child sockets to the
established connections hashtable and deliver two sockets to the
userspace program to the same client.
This patch fixes the race condition by checking if an existing child
socket exists for the same client when we are adding the second child
socket to the established connections socket. If an existing child
socket exists, we drop the packet and discard the second child socket
to the same client.
Signed-off-by: Ricardo Dias <rdias@singlestore.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20201120111133.GA67501@rdias-suse-pc.lan
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-11-20 14:11:33 +03:00
2008-11-21 07:39:09 +03:00
spin_unlock ( lock ) ;
tcp: fix race condition when creating child sockets from syncookies
When the TCP stack is in SYN flood mode, the server child socket is
created from the SYN cookie received in a TCP packet with the ACK flag
set.
The child socket is created when the server receives the first TCP
packet with a valid SYN cookie from the client. Usually, this packet
corresponds to the final step of the TCP 3-way handshake, the ACK
packet. But is also possible to receive a valid SYN cookie from the
first TCP data packet sent by the client, and thus create a child socket
from that SYN cookie.
Since a client socket is ready to send data as soon as it receives the
SYN+ACK packet from the server, the client can send the ACK packet (sent
by the TCP stack code), and the first data packet (sent by the userspace
program) almost at the same time, and thus the server will equally
receive the two TCP packets with valid SYN cookies almost at the same
instant.
When such event happens, the TCP stack code has a race condition that
occurs between the momement a lookup is done to the established
connections hashtable to check for the existence of a connection for the
same client, and the moment that the child socket is added to the
established connections hashtable. As a consequence, this race condition
can lead to a situation where we add two child sockets to the
established connections hashtable and deliver two sockets to the
userspace program to the same client.
This patch fixes the race condition by checking if an existing child
socket exists for the same client when we are adding the second child
socket to the established connections socket. If an existing child
socket exists, we drop the packet and discard the second child socket
to the same client.
Signed-off-by: Ricardo Dias <rdias@singlestore.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20201120111133.GA67501@rdias-suse-pc.lan
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-11-20 14:11:33 +03:00
2015-10-02 21:43:32 +03:00
return ret ;
}
tcp: fix race condition when creating child sockets from syncookies
When the TCP stack is in SYN flood mode, the server child socket is
created from the SYN cookie received in a TCP packet with the ACK flag
set.
The child socket is created when the server receives the first TCP
packet with a valid SYN cookie from the client. Usually, this packet
corresponds to the final step of the TCP 3-way handshake, the ACK
packet. But is also possible to receive a valid SYN cookie from the
first TCP data packet sent by the client, and thus create a child socket
from that SYN cookie.
Since a client socket is ready to send data as soon as it receives the
SYN+ACK packet from the server, the client can send the ACK packet (sent
by the TCP stack code), and the first data packet (sent by the userspace
program) almost at the same time, and thus the server will equally
receive the two TCP packets with valid SYN cookies almost at the same
instant.
When such event happens, the TCP stack code has a race condition that
occurs between the momement a lookup is done to the established
connections hashtable to check for the existence of a connection for the
same client, and the moment that the child socket is added to the
established connections hashtable. As a consequence, this race condition
can lead to a situation where we add two child sockets to the
established connections hashtable and deliver two sockets to the
userspace program to the same client.
This patch fixes the race condition by checking if an existing child
socket exists for the same client when we are adding the second child
socket to the established connections socket. If an existing child
socket exists, we drop the packet and discard the second child socket
to the same client.
Signed-off-by: Ricardo Dias <rdias@singlestore.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20201120111133.GA67501@rdias-suse-pc.lan
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-11-20 14:11:33 +03:00
bool inet_ehash_nolisten ( struct sock * sk , struct sock * osk , bool * found_dup_sk )
2015-10-02 21:43:32 +03:00
{
tcp: fix race condition when creating child sockets from syncookies
When the TCP stack is in SYN flood mode, the server child socket is
created from the SYN cookie received in a TCP packet with the ACK flag
set.
The child socket is created when the server receives the first TCP
packet with a valid SYN cookie from the client. Usually, this packet
corresponds to the final step of the TCP 3-way handshake, the ACK
packet. But is also possible to receive a valid SYN cookie from the
first TCP data packet sent by the client, and thus create a child socket
from that SYN cookie.
Since a client socket is ready to send data as soon as it receives the
SYN+ACK packet from the server, the client can send the ACK packet (sent
by the TCP stack code), and the first data packet (sent by the userspace
program) almost at the same time, and thus the server will equally
receive the two TCP packets with valid SYN cookies almost at the same
instant.
When such event happens, the TCP stack code has a race condition that
occurs between the momement a lookup is done to the established
connections hashtable to check for the existence of a connection for the
same client, and the moment that the child socket is added to the
established connections hashtable. As a consequence, this race condition
can lead to a situation where we add two child sockets to the
established connections hashtable and deliver two sockets to the
userspace program to the same client.
This patch fixes the race condition by checking if an existing child
socket exists for the same client when we are adding the second child
socket to the established connections socket. If an existing child
socket exists, we drop the packet and discard the second child socket
to the same client.
Signed-off-by: Ricardo Dias <rdias@singlestore.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20201120111133.GA67501@rdias-suse-pc.lan
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-11-20 14:11:33 +03:00
bool ok = inet_ehash_insert ( sk , osk , found_dup_sk ) ;
2015-10-22 18:20:46 +03:00
if ( ok ) {
sock_prot_inuse_add ( sock_net ( sk ) , sk - > sk_prot , 1 ) ;
} else {
2021-10-14 16:41:26 +03:00
this_cpu_inc ( * sk - > sk_prot - > orphan_count ) ;
2017-12-20 06:12:51 +03:00
inet_sk_set_state ( sk , TCP_CLOSE ) ;
2015-10-22 18:20:46 +03:00
sock_set_flag ( sk , SOCK_DEAD ) ;
inet_csk_destroy_sock ( sk ) ;
}
return ok ;
2007-12-21 02:31:33 +03:00
}
2015-10-22 18:20:46 +03:00
EXPORT_SYMBOL_GPL ( inet_ehash_nolisten ) ;
2007-12-21 02:31:33 +03:00
2016-02-10 19:50:40 +03:00
static int inet_reuseport_add_sock ( struct sock * sk ,
2017-01-17 18:51:01 +03:00
struct inet_listen_hashbucket * ilb )
2016-02-10 19:50:40 +03:00
{
2016-04-29 02:24:32 +03:00
struct inet_bind_bucket * tb = inet_csk ( sk ) - > icsk_bind_hash ;
2019-12-14 05:20:41 +03:00
const struct hlist_nulls_node * node ;
2016-02-10 19:50:40 +03:00
struct sock * sk2 ;
kuid_t uid = sock_i_uid ( sk ) ;
2019-12-14 05:20:41 +03:00
sk_nulls_for_each_rcu ( sk2 , node , & ilb - > nulls_head ) {
2016-02-10 19:50:40 +03:00
if ( sk2 ! = sk & &
sk2 - > sk_family = = sk - > sk_family & &
ipv6_only_sock ( sk2 ) = = ipv6_only_sock ( sk ) & &
sk2 - > sk_bound_dev_if = = sk - > sk_bound_dev_if & &
2016-04-29 02:24:32 +03:00
inet_csk ( sk2 ) - > icsk_bind_hash = = tb & &
2016-02-10 19:50:40 +03:00
sk2 - > sk_reuseport & & uid_eq ( uid , sock_i_uid ( sk2 ) ) & &
2017-01-17 18:51:01 +03:00
inet_rcv_saddr_equal ( sk , sk2 , false ) )
bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT
This patch adds a BPF_PROG_TYPE_SK_REUSEPORT which can select
a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY. Like other
non SK_FILTER/CGROUP_SKB program, it requires CAP_SYS_ADMIN.
BPF_PROG_TYPE_SK_REUSEPORT introduces "struct sk_reuseport_kern"
to store the bpf context instead of using the skb->cb[48].
At the SO_REUSEPORT sk lookup time, it is in the middle of transiting
from a lower layer (ipv4/ipv6) to a upper layer (udp/tcp). At this
point, it is not always clear where the bpf context can be appended
in the skb->cb[48] to avoid saving-and-restoring cb[]. Even putting
aside the difference between ipv4-vs-ipv6 and udp-vs-tcp. It is not
clear if the lower layer is only ipv4 and ipv6 in the future and
will it not touch the cb[] again before transiting to the upper
layer.
For example, in udp_gro_receive(), it uses the 48 byte NAPI_GRO_CB
instead of IP[6]CB and it may still modify the cb[] after calling
the udp[46]_lib_lookup_skb(). Because of the above reason, if
sk->cb is used for the bpf ctx, saving-and-restoring is needed
and likely the whole 48 bytes cb[] has to be saved and restored.
Instead of saving, setting and restoring the cb[], this patch opts
to create a new "struct sk_reuseport_kern" and setting the needed
values in there.
The new BPF_PROG_TYPE_SK_REUSEPORT and "struct sk_reuseport_(kern|md)"
will serve all ipv4/ipv6 + udp/tcp combinations. There is no protocol
specific usage at this point and it is also inline with the current
sock_reuseport.c implementation (i.e. no protocol specific requirement).
In "struct sk_reuseport_md", this patch exposes data/data_end/len
with semantic similar to other existing usages. Together
with "bpf_skb_load_bytes()" and "bpf_skb_load_bytes_relative()",
the bpf prog can peek anywhere in the skb. The "bind_inany" tells
the bpf prog that the reuseport group is bind-ed to a local
INANY address which cannot be learned from skb.
The new "bind_inany" is added to "struct sock_reuseport" which will be
used when running the new "BPF_PROG_TYPE_SK_REUSEPORT" bpf prog in order
to avoid repeating the "bind INANY" test on
"sk_v6_rcv_saddr/sk->sk_rcv_saddr" every time a bpf prog is run. It can
only be properly initialized when a "sk->sk_reuseport" enabled sk is
adding to a hashtable (i.e. during "reuseport_alloc()" and
"reuseport_add_sock()").
The new "sk_select_reuseport()" is the main helper that the
bpf prog will use to select a SO_REUSEPORT sk. It is the only function
that can use the new BPF_MAP_TYPE_REUSEPORT_ARRAY. As mentioned in
the earlier patch, the validity of a selected sk is checked in
run time in "sk_select_reuseport()". Doing the check in
verification time is difficult and inflexible (consider the map-in-map
use case). The runtime check is to compare the selected sk's reuseport_id
with the reuseport_id that we want. This helper will return -EXXX if the
selected sk cannot serve the incoming request (e.g. reuseport_id
not match). The bpf prog can decide if it wants to do SK_DROP as its
discretion.
When the bpf prog returns SK_PASS, the kernel will check if a
valid sk has been selected (i.e. "reuse_kern->selected_sk != NULL").
If it does , it will use the selected sk. If not, the kernel
will select one from "reuse->socks[]" (as before this patch).
The SK_DROP and SK_PASS handling logic will be in the next patch.
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-08-08 11:01:25 +03:00
return reuseport_add_sock ( sk , sk2 ,
inet_rcv_saddr_any ( sk ) ) ;
2016-02-10 19:50:40 +03:00
}
bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT
This patch adds a BPF_PROG_TYPE_SK_REUSEPORT which can select
a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY. Like other
non SK_FILTER/CGROUP_SKB program, it requires CAP_SYS_ADMIN.
BPF_PROG_TYPE_SK_REUSEPORT introduces "struct sk_reuseport_kern"
to store the bpf context instead of using the skb->cb[48].
At the SO_REUSEPORT sk lookup time, it is in the middle of transiting
from a lower layer (ipv4/ipv6) to a upper layer (udp/tcp). At this
point, it is not always clear where the bpf context can be appended
in the skb->cb[48] to avoid saving-and-restoring cb[]. Even putting
aside the difference between ipv4-vs-ipv6 and udp-vs-tcp. It is not
clear if the lower layer is only ipv4 and ipv6 in the future and
will it not touch the cb[] again before transiting to the upper
layer.
For example, in udp_gro_receive(), it uses the 48 byte NAPI_GRO_CB
instead of IP[6]CB and it may still modify the cb[] after calling
the udp[46]_lib_lookup_skb(). Because of the above reason, if
sk->cb is used for the bpf ctx, saving-and-restoring is needed
and likely the whole 48 bytes cb[] has to be saved and restored.
Instead of saving, setting and restoring the cb[], this patch opts
to create a new "struct sk_reuseport_kern" and setting the needed
values in there.
The new BPF_PROG_TYPE_SK_REUSEPORT and "struct sk_reuseport_(kern|md)"
will serve all ipv4/ipv6 + udp/tcp combinations. There is no protocol
specific usage at this point and it is also inline with the current
sock_reuseport.c implementation (i.e. no protocol specific requirement).
In "struct sk_reuseport_md", this patch exposes data/data_end/len
with semantic similar to other existing usages. Together
with "bpf_skb_load_bytes()" and "bpf_skb_load_bytes_relative()",
the bpf prog can peek anywhere in the skb. The "bind_inany" tells
the bpf prog that the reuseport group is bind-ed to a local
INANY address which cannot be learned from skb.
The new "bind_inany" is added to "struct sock_reuseport" which will be
used when running the new "BPF_PROG_TYPE_SK_REUSEPORT" bpf prog in order
to avoid repeating the "bind INANY" test on
"sk_v6_rcv_saddr/sk->sk_rcv_saddr" every time a bpf prog is run. It can
only be properly initialized when a "sk->sk_reuseport" enabled sk is
adding to a hashtable (i.e. during "reuseport_alloc()" and
"reuseport_add_sock()").
The new "sk_select_reuseport()" is the main helper that the
bpf prog will use to select a SO_REUSEPORT sk. It is the only function
that can use the new BPF_MAP_TYPE_REUSEPORT_ARRAY. As mentioned in
the earlier patch, the validity of a selected sk is checked in
run time in "sk_select_reuseport()". Doing the check in
verification time is difficult and inflexible (consider the map-in-map
use case). The runtime check is to compare the selected sk's reuseport_id
with the reuseport_id that we want. This helper will return -EXXX if the
selected sk cannot serve the incoming request (e.g. reuseport_id
not match). The bpf prog can decide if it wants to do SK_DROP as its
discretion.
When the bpf prog returns SK_PASS, the kernel will check if a
valid sk has been selected (i.e. "reuse_kern->selected_sk != NULL").
If it does , it will use the selected sk. If not, the kernel
will select one from "reuse->socks[]" (as before this patch).
The SK_DROP and SK_PASS handling logic will be in the next patch.
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-08-08 11:01:25 +03:00
return reuseport_alloc ( sk , inet_rcv_saddr_any ( sk ) ) ;
2016-02-10 19:50:40 +03:00
}
2017-01-17 18:51:01 +03:00
int __inet_hash ( struct sock * sk , struct sock * osk )
2007-12-21 02:31:33 +03:00
{
2008-03-23 02:50:58 +03:00
struct inet_hashinfo * hashinfo = sk - > sk_prot - > h . hashinfo ;
2008-11-20 11:40:07 +03:00
struct inet_listen_hashbucket * ilb ;
2016-02-10 19:50:40 +03:00
int err = 0 ;
2007-12-21 02:31:33 +03:00
2015-10-22 18:20:46 +03:00
if ( sk - > sk_state ! = TCP_LISTEN ) {
tcp: fix race condition when creating child sockets from syncookies
When the TCP stack is in SYN flood mode, the server child socket is
created from the SYN cookie received in a TCP packet with the ACK flag
set.
The child socket is created when the server receives the first TCP
packet with a valid SYN cookie from the client. Usually, this packet
corresponds to the final step of the TCP 3-way handshake, the ACK
packet. But is also possible to receive a valid SYN cookie from the
first TCP data packet sent by the client, and thus create a child socket
from that SYN cookie.
Since a client socket is ready to send data as soon as it receives the
SYN+ACK packet from the server, the client can send the ACK packet (sent
by the TCP stack code), and the first data packet (sent by the userspace
program) almost at the same time, and thus the server will equally
receive the two TCP packets with valid SYN cookies almost at the same
instant.
When such event happens, the TCP stack code has a race condition that
occurs between the momement a lookup is done to the established
connections hashtable to check for the existence of a connection for the
same client, and the moment that the child socket is added to the
established connections hashtable. As a consequence, this race condition
can lead to a situation where we add two child sockets to the
established connections hashtable and deliver two sockets to the
userspace program to the same client.
This patch fixes the race condition by checking if an existing child
socket exists for the same client when we are adding the second child
socket to the established connections socket. If an existing child
socket exists, we drop the packet and discard the second child socket
to the same client.
Signed-off-by: Ricardo Dias <rdias@singlestore.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20201120111133.GA67501@rdias-suse-pc.lan
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-11-20 14:11:33 +03:00
inet_ehash_nolisten ( sk , osk , NULL ) ;
2016-02-10 19:50:40 +03:00
return 0 ;
2015-10-22 18:20:46 +03:00
}
2008-07-26 08:43:18 +04:00
WARN_ON ( ! sk_unhashed ( sk ) ) ;
2008-11-20 11:40:07 +03:00
ilb = & hashinfo - > listening_hash [ inet_sk_listen_hashfn ( sk ) ] ;
2007-12-21 02:31:33 +03:00
2008-11-20 11:40:07 +03:00
spin_lock ( & ilb - > lock ) ;
2016-02-10 19:50:40 +03:00
if ( sk - > sk_reuseport ) {
2017-01-17 18:51:01 +03:00
err = inet_reuseport_add_sock ( sk , ilb ) ;
2016-02-10 19:50:40 +03:00
if ( err )
goto unlock ;
}
2016-04-25 17:42:12 +03:00
if ( IS_ENABLED ( CONFIG_IPV6 ) & & sk - > sk_reuseport & &
sk - > sk_family = = AF_INET6 )
2019-12-14 05:20:41 +03:00
__sk_nulls_add_node_tail_rcu ( sk , & ilb - > nulls_head ) ;
2016-04-25 17:42:12 +03:00
else
2019-12-14 05:20:41 +03:00
__sk_nulls_add_node_rcu ( sk , & ilb - > nulls_head ) ;
2017-12-01 23:52:31 +03:00
inet_hash2 ( hashinfo , sk ) ;
2017-12-01 23:52:29 +03:00
ilb - > count + + ;
2016-04-01 18:52:17 +03:00
sock_set_flag ( sk , SOCK_RCU_FREE ) ;
2008-04-01 06:41:46 +04:00
sock_prot_inuse_add ( sock_net ( sk ) , sk - > sk_prot , 1 ) ;
2016-02-10 19:50:40 +03:00
unlock :
2008-11-20 11:40:07 +03:00
spin_unlock ( & ilb - > lock ) ;
2016-02-10 19:50:40 +03:00
return err ;
2007-12-21 02:31:33 +03:00
}
2015-03-19 00:05:36 +03:00
EXPORT_SYMBOL ( __inet_hash ) ;
[SOCK] proto: Add hashinfo member to struct proto
This way we can remove TCP and DCCP specific versions of
sk->sk_prot->get_port: both v4 and v6 use inet_csk_get_port
sk->sk_prot->hash: inet_hash is directly used, only v6 need
a specific version to deal with mapped sockets
sk->sk_prot->unhash: both v4 and v6 use inet_hash directly
struct inet_connection_sock_af_ops also gets a new member, bind_conflict, so
that inet_csk_get_port can find the per family routine.
Now only the lookup routines receive as a parameter a struct inet_hashtable.
With this we further reuse code, reducing the difference among INET transport
protocols.
Eventually work has to be done on UDP and SCTP to make them share this
infrastructure and get as a bonus inet_diag interfaces so that iproute can be
used with these protocols.
net-2.6/net/ipv4/inet_hashtables.c:
struct proto | +8
struct inet_connection_sock_af_ops | +8
2 structs changed
__inet_hash_nolisten | +18
__inet_hash | -210
inet_put_port | +8
inet_bind_bucket_create | +1
__inet_hash_connect | -8
5 functions changed, 27 bytes added, 218 bytes removed, diff: -191
net-2.6/net/core/sock.c:
proto_seq_show | +3
1 function changed, 3 bytes added, diff: +3
net-2.6/net/ipv4/inet_connection_sock.c:
inet_csk_get_port | +15
1 function changed, 15 bytes added, diff: +15
net-2.6/net/ipv4/tcp.c:
tcp_set_state | -7
1 function changed, 7 bytes removed, diff: -7
net-2.6/net/ipv4/tcp_ipv4.c:
tcp_v4_get_port | -31
tcp_v4_hash | -48
tcp_v4_destroy_sock | -7
tcp_v4_syn_recv_sock | -2
tcp_unhash | -179
5 functions changed, 267 bytes removed, diff: -267
net-2.6/net/ipv6/inet6_hashtables.c:
__inet6_hash | +8
1 function changed, 8 bytes added, diff: +8
net-2.6/net/ipv4/inet_hashtables.c:
inet_unhash | +190
inet_hash | +242
2 functions changed, 432 bytes added, diff: +432
vmlinux:
16 functions changed, 485 bytes added, 492 bytes removed, diff: -7
/home/acme/git/net-2.6/net/ipv6/tcp_ipv6.c:
tcp_v6_get_port | -31
tcp_v6_hash | -7
tcp_v6_syn_recv_sock | -9
3 functions changed, 47 bytes removed, diff: -47
/home/acme/git/net-2.6/net/dccp/proto.c:
dccp_destroy_sock | -7
dccp_unhash | -179
dccp_hash | -49
dccp_set_state | -7
dccp_done | +1
5 functions changed, 1 bytes added, 242 bytes removed, diff: -241
/home/acme/git/net-2.6/net/dccp/ipv4.c:
dccp_v4_get_port | -31
dccp_v4_request_recv_sock | -2
2 functions changed, 33 bytes removed, diff: -33
/home/acme/git/net-2.6/net/dccp/ipv6.c:
dccp_v6_get_port | -31
dccp_v6_hash | -7
dccp_v6_request_recv_sock | +5
3 functions changed, 5 bytes added, 38 bytes removed, diff: -33
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-02-03 15:06:04 +03:00
2016-02-10 19:50:35 +03:00
int inet_hash ( struct sock * sk )
[SOCK] proto: Add hashinfo member to struct proto
This way we can remove TCP and DCCP specific versions of
sk->sk_prot->get_port: both v4 and v6 use inet_csk_get_port
sk->sk_prot->hash: inet_hash is directly used, only v6 need
a specific version to deal with mapped sockets
sk->sk_prot->unhash: both v4 and v6 use inet_hash directly
struct inet_connection_sock_af_ops also gets a new member, bind_conflict, so
that inet_csk_get_port can find the per family routine.
Now only the lookup routines receive as a parameter a struct inet_hashtable.
With this we further reuse code, reducing the difference among INET transport
protocols.
Eventually work has to be done on UDP and SCTP to make them share this
infrastructure and get as a bonus inet_diag interfaces so that iproute can be
used with these protocols.
net-2.6/net/ipv4/inet_hashtables.c:
struct proto | +8
struct inet_connection_sock_af_ops | +8
2 structs changed
__inet_hash_nolisten | +18
__inet_hash | -210
inet_put_port | +8
inet_bind_bucket_create | +1
__inet_hash_connect | -8
5 functions changed, 27 bytes added, 218 bytes removed, diff: -191
net-2.6/net/core/sock.c:
proto_seq_show | +3
1 function changed, 3 bytes added, diff: +3
net-2.6/net/ipv4/inet_connection_sock.c:
inet_csk_get_port | +15
1 function changed, 15 bytes added, diff: +15
net-2.6/net/ipv4/tcp.c:
tcp_set_state | -7
1 function changed, 7 bytes removed, diff: -7
net-2.6/net/ipv4/tcp_ipv4.c:
tcp_v4_get_port | -31
tcp_v4_hash | -48
tcp_v4_destroy_sock | -7
tcp_v4_syn_recv_sock | -2
tcp_unhash | -179
5 functions changed, 267 bytes removed, diff: -267
net-2.6/net/ipv6/inet6_hashtables.c:
__inet6_hash | +8
1 function changed, 8 bytes added, diff: +8
net-2.6/net/ipv4/inet_hashtables.c:
inet_unhash | +190
inet_hash | +242
2 functions changed, 432 bytes added, diff: +432
vmlinux:
16 functions changed, 485 bytes added, 492 bytes removed, diff: -7
/home/acme/git/net-2.6/net/ipv6/tcp_ipv6.c:
tcp_v6_get_port | -31
tcp_v6_hash | -7
tcp_v6_syn_recv_sock | -9
3 functions changed, 47 bytes removed, diff: -47
/home/acme/git/net-2.6/net/dccp/proto.c:
dccp_destroy_sock | -7
dccp_unhash | -179
dccp_hash | -49
dccp_set_state | -7
dccp_done | +1
5 functions changed, 1 bytes added, 242 bytes removed, diff: -241
/home/acme/git/net-2.6/net/dccp/ipv4.c:
dccp_v4_get_port | -31
dccp_v4_request_recv_sock | -2
2 functions changed, 33 bytes removed, diff: -33
/home/acme/git/net-2.6/net/dccp/ipv6.c:
dccp_v6_get_port | -31
dccp_v6_hash | -7
dccp_v6_request_recv_sock | +5
3 functions changed, 5 bytes added, 38 bytes removed, diff: -33
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-02-03 15:06:04 +03:00
{
2016-02-10 19:50:40 +03:00
int err = 0 ;
[SOCK] proto: Add hashinfo member to struct proto
This way we can remove TCP and DCCP specific versions of
sk->sk_prot->get_port: both v4 and v6 use inet_csk_get_port
sk->sk_prot->hash: inet_hash is directly used, only v6 need
a specific version to deal with mapped sockets
sk->sk_prot->unhash: both v4 and v6 use inet_hash directly
struct inet_connection_sock_af_ops also gets a new member, bind_conflict, so
that inet_csk_get_port can find the per family routine.
Now only the lookup routines receive as a parameter a struct inet_hashtable.
With this we further reuse code, reducing the difference among INET transport
protocols.
Eventually work has to be done on UDP and SCTP to make them share this
infrastructure and get as a bonus inet_diag interfaces so that iproute can be
used with these protocols.
net-2.6/net/ipv4/inet_hashtables.c:
struct proto | +8
struct inet_connection_sock_af_ops | +8
2 structs changed
__inet_hash_nolisten | +18
__inet_hash | -210
inet_put_port | +8
inet_bind_bucket_create | +1
__inet_hash_connect | -8
5 functions changed, 27 bytes added, 218 bytes removed, diff: -191
net-2.6/net/core/sock.c:
proto_seq_show | +3
1 function changed, 3 bytes added, diff: +3
net-2.6/net/ipv4/inet_connection_sock.c:
inet_csk_get_port | +15
1 function changed, 15 bytes added, diff: +15
net-2.6/net/ipv4/tcp.c:
tcp_set_state | -7
1 function changed, 7 bytes removed, diff: -7
net-2.6/net/ipv4/tcp_ipv4.c:
tcp_v4_get_port | -31
tcp_v4_hash | -48
tcp_v4_destroy_sock | -7
tcp_v4_syn_recv_sock | -2
tcp_unhash | -179
5 functions changed, 267 bytes removed, diff: -267
net-2.6/net/ipv6/inet6_hashtables.c:
__inet6_hash | +8
1 function changed, 8 bytes added, diff: +8
net-2.6/net/ipv4/inet_hashtables.c:
inet_unhash | +190
inet_hash | +242
2 functions changed, 432 bytes added, diff: +432
vmlinux:
16 functions changed, 485 bytes added, 492 bytes removed, diff: -7
/home/acme/git/net-2.6/net/ipv6/tcp_ipv6.c:
tcp_v6_get_port | -31
tcp_v6_hash | -7
tcp_v6_syn_recv_sock | -9
3 functions changed, 47 bytes removed, diff: -47
/home/acme/git/net-2.6/net/dccp/proto.c:
dccp_destroy_sock | -7
dccp_unhash | -179
dccp_hash | -49
dccp_set_state | -7
dccp_done | +1
5 functions changed, 1 bytes added, 242 bytes removed, diff: -241
/home/acme/git/net-2.6/net/dccp/ipv4.c:
dccp_v4_get_port | -31
dccp_v4_request_recv_sock | -2
2 functions changed, 33 bytes removed, diff: -33
/home/acme/git/net-2.6/net/dccp/ipv6.c:
dccp_v6_get_port | -31
dccp_v6_hash | -7
dccp_v6_request_recv_sock | +5
3 functions changed, 5 bytes added, 38 bytes removed, diff: -33
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-02-03 15:06:04 +03:00
if ( sk - > sk_state ! = TCP_CLOSE ) {
local_bh_disable ( ) ;
2017-01-17 18:51:01 +03:00
err = __inet_hash ( sk , NULL ) ;
[SOCK] proto: Add hashinfo member to struct proto
This way we can remove TCP and DCCP specific versions of
sk->sk_prot->get_port: both v4 and v6 use inet_csk_get_port
sk->sk_prot->hash: inet_hash is directly used, only v6 need
a specific version to deal with mapped sockets
sk->sk_prot->unhash: both v4 and v6 use inet_hash directly
struct inet_connection_sock_af_ops also gets a new member, bind_conflict, so
that inet_csk_get_port can find the per family routine.
Now only the lookup routines receive as a parameter a struct inet_hashtable.
With this we further reuse code, reducing the difference among INET transport
protocols.
Eventually work has to be done on UDP and SCTP to make them share this
infrastructure and get as a bonus inet_diag interfaces so that iproute can be
used with these protocols.
net-2.6/net/ipv4/inet_hashtables.c:
struct proto | +8
struct inet_connection_sock_af_ops | +8
2 structs changed
__inet_hash_nolisten | +18
__inet_hash | -210
inet_put_port | +8
inet_bind_bucket_create | +1
__inet_hash_connect | -8
5 functions changed, 27 bytes added, 218 bytes removed, diff: -191
net-2.6/net/core/sock.c:
proto_seq_show | +3
1 function changed, 3 bytes added, diff: +3
net-2.6/net/ipv4/inet_connection_sock.c:
inet_csk_get_port | +15
1 function changed, 15 bytes added, diff: +15
net-2.6/net/ipv4/tcp.c:
tcp_set_state | -7
1 function changed, 7 bytes removed, diff: -7
net-2.6/net/ipv4/tcp_ipv4.c:
tcp_v4_get_port | -31
tcp_v4_hash | -48
tcp_v4_destroy_sock | -7
tcp_v4_syn_recv_sock | -2
tcp_unhash | -179
5 functions changed, 267 bytes removed, diff: -267
net-2.6/net/ipv6/inet6_hashtables.c:
__inet6_hash | +8
1 function changed, 8 bytes added, diff: +8
net-2.6/net/ipv4/inet_hashtables.c:
inet_unhash | +190
inet_hash | +242
2 functions changed, 432 bytes added, diff: +432
vmlinux:
16 functions changed, 485 bytes added, 492 bytes removed, diff: -7
/home/acme/git/net-2.6/net/ipv6/tcp_ipv6.c:
tcp_v6_get_port | -31
tcp_v6_hash | -7
tcp_v6_syn_recv_sock | -9
3 functions changed, 47 bytes removed, diff: -47
/home/acme/git/net-2.6/net/dccp/proto.c:
dccp_destroy_sock | -7
dccp_unhash | -179
dccp_hash | -49
dccp_set_state | -7
dccp_done | +1
5 functions changed, 1 bytes added, 242 bytes removed, diff: -241
/home/acme/git/net-2.6/net/dccp/ipv4.c:
dccp_v4_get_port | -31
dccp_v4_request_recv_sock | -2
2 functions changed, 33 bytes removed, diff: -33
/home/acme/git/net-2.6/net/dccp/ipv6.c:
dccp_v6_get_port | -31
dccp_v6_hash | -7
dccp_v6_request_recv_sock | +5
3 functions changed, 5 bytes added, 38 bytes removed, diff: -33
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-02-03 15:06:04 +03:00
local_bh_enable ( ) ;
}
2016-02-10 19:50:35 +03:00
2016-02-10 19:50:40 +03:00
return err ;
[SOCK] proto: Add hashinfo member to struct proto
This way we can remove TCP and DCCP specific versions of
sk->sk_prot->get_port: both v4 and v6 use inet_csk_get_port
sk->sk_prot->hash: inet_hash is directly used, only v6 need
a specific version to deal with mapped sockets
sk->sk_prot->unhash: both v4 and v6 use inet_hash directly
struct inet_connection_sock_af_ops also gets a new member, bind_conflict, so
that inet_csk_get_port can find the per family routine.
Now only the lookup routines receive as a parameter a struct inet_hashtable.
With this we further reuse code, reducing the difference among INET transport
protocols.
Eventually work has to be done on UDP and SCTP to make them share this
infrastructure and get as a bonus inet_diag interfaces so that iproute can be
used with these protocols.
net-2.6/net/ipv4/inet_hashtables.c:
struct proto | +8
struct inet_connection_sock_af_ops | +8
2 structs changed
__inet_hash_nolisten | +18
__inet_hash | -210
inet_put_port | +8
inet_bind_bucket_create | +1
__inet_hash_connect | -8
5 functions changed, 27 bytes added, 218 bytes removed, diff: -191
net-2.6/net/core/sock.c:
proto_seq_show | +3
1 function changed, 3 bytes added, diff: +3
net-2.6/net/ipv4/inet_connection_sock.c:
inet_csk_get_port | +15
1 function changed, 15 bytes added, diff: +15
net-2.6/net/ipv4/tcp.c:
tcp_set_state | -7
1 function changed, 7 bytes removed, diff: -7
net-2.6/net/ipv4/tcp_ipv4.c:
tcp_v4_get_port | -31
tcp_v4_hash | -48
tcp_v4_destroy_sock | -7
tcp_v4_syn_recv_sock | -2
tcp_unhash | -179
5 functions changed, 267 bytes removed, diff: -267
net-2.6/net/ipv6/inet6_hashtables.c:
__inet6_hash | +8
1 function changed, 8 bytes added, diff: +8
net-2.6/net/ipv4/inet_hashtables.c:
inet_unhash | +190
inet_hash | +242
2 functions changed, 432 bytes added, diff: +432
vmlinux:
16 functions changed, 485 bytes added, 492 bytes removed, diff: -7
/home/acme/git/net-2.6/net/ipv6/tcp_ipv6.c:
tcp_v6_get_port | -31
tcp_v6_hash | -7
tcp_v6_syn_recv_sock | -9
3 functions changed, 47 bytes removed, diff: -47
/home/acme/git/net-2.6/net/dccp/proto.c:
dccp_destroy_sock | -7
dccp_unhash | -179
dccp_hash | -49
dccp_set_state | -7
dccp_done | +1
5 functions changed, 1 bytes added, 242 bytes removed, diff: -241
/home/acme/git/net-2.6/net/dccp/ipv4.c:
dccp_v4_get_port | -31
dccp_v4_request_recv_sock | -2
2 functions changed, 33 bytes removed, diff: -33
/home/acme/git/net-2.6/net/dccp/ipv6.c:
dccp_v6_get_port | -31
dccp_v6_hash | -7
dccp_v6_request_recv_sock | +5
3 functions changed, 5 bytes added, 38 bytes removed, diff: -33
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-02-03 15:06:04 +03:00
}
EXPORT_SYMBOL_GPL ( inet_hash ) ;
void inet_unhash ( struct sock * sk )
{
2008-03-23 02:50:58 +03:00
struct inet_hashinfo * hashinfo = sk - > sk_prot - > h . hashinfo ;
2018-02-01 13:26:23 +03:00
struct inet_listen_hashbucket * ilb = NULL ;
2008-11-24 04:22:55 +03:00
spinlock_t * lock ;
[SOCK] proto: Add hashinfo member to struct proto
This way we can remove TCP and DCCP specific versions of
sk->sk_prot->get_port: both v4 and v6 use inet_csk_get_port
sk->sk_prot->hash: inet_hash is directly used, only v6 need
a specific version to deal with mapped sockets
sk->sk_prot->unhash: both v4 and v6 use inet_hash directly
struct inet_connection_sock_af_ops also gets a new member, bind_conflict, so
that inet_csk_get_port can find the per family routine.
Now only the lookup routines receive as a parameter a struct inet_hashtable.
With this we further reuse code, reducing the difference among INET transport
protocols.
Eventually work has to be done on UDP and SCTP to make them share this
infrastructure and get as a bonus inet_diag interfaces so that iproute can be
used with these protocols.
net-2.6/net/ipv4/inet_hashtables.c:
struct proto | +8
struct inet_connection_sock_af_ops | +8
2 structs changed
__inet_hash_nolisten | +18
__inet_hash | -210
inet_put_port | +8
inet_bind_bucket_create | +1
__inet_hash_connect | -8
5 functions changed, 27 bytes added, 218 bytes removed, diff: -191
net-2.6/net/core/sock.c:
proto_seq_show | +3
1 function changed, 3 bytes added, diff: +3
net-2.6/net/ipv4/inet_connection_sock.c:
inet_csk_get_port | +15
1 function changed, 15 bytes added, diff: +15
net-2.6/net/ipv4/tcp.c:
tcp_set_state | -7
1 function changed, 7 bytes removed, diff: -7
net-2.6/net/ipv4/tcp_ipv4.c:
tcp_v4_get_port | -31
tcp_v4_hash | -48
tcp_v4_destroy_sock | -7
tcp_v4_syn_recv_sock | -2
tcp_unhash | -179
5 functions changed, 267 bytes removed, diff: -267
net-2.6/net/ipv6/inet6_hashtables.c:
__inet6_hash | +8
1 function changed, 8 bytes added, diff: +8
net-2.6/net/ipv4/inet_hashtables.c:
inet_unhash | +190
inet_hash | +242
2 functions changed, 432 bytes added, diff: +432
vmlinux:
16 functions changed, 485 bytes added, 492 bytes removed, diff: -7
/home/acme/git/net-2.6/net/ipv6/tcp_ipv6.c:
tcp_v6_get_port | -31
tcp_v6_hash | -7
tcp_v6_syn_recv_sock | -9
3 functions changed, 47 bytes removed, diff: -47
/home/acme/git/net-2.6/net/dccp/proto.c:
dccp_destroy_sock | -7
dccp_unhash | -179
dccp_hash | -49
dccp_set_state | -7
dccp_done | +1
5 functions changed, 1 bytes added, 242 bytes removed, diff: -241
/home/acme/git/net-2.6/net/dccp/ipv4.c:
dccp_v4_get_port | -31
dccp_v4_request_recv_sock | -2
2 functions changed, 33 bytes removed, diff: -33
/home/acme/git/net-2.6/net/dccp/ipv6.c:
dccp_v6_get_port | -31
dccp_v6_hash | -7
dccp_v6_request_recv_sock | +5
3 functions changed, 5 bytes added, 38 bytes removed, diff: -33
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-02-03 15:06:04 +03:00
if ( sk_unhashed ( sk ) )
2008-11-20 11:40:07 +03:00
return ;
[SOCK] proto: Add hashinfo member to struct proto
This way we can remove TCP and DCCP specific versions of
sk->sk_prot->get_port: both v4 and v6 use inet_csk_get_port
sk->sk_prot->hash: inet_hash is directly used, only v6 need
a specific version to deal with mapped sockets
sk->sk_prot->unhash: both v4 and v6 use inet_hash directly
struct inet_connection_sock_af_ops also gets a new member, bind_conflict, so
that inet_csk_get_port can find the per family routine.
Now only the lookup routines receive as a parameter a struct inet_hashtable.
With this we further reuse code, reducing the difference among INET transport
protocols.
Eventually work has to be done on UDP and SCTP to make them share this
infrastructure and get as a bonus inet_diag interfaces so that iproute can be
used with these protocols.
net-2.6/net/ipv4/inet_hashtables.c:
struct proto | +8
struct inet_connection_sock_af_ops | +8
2 structs changed
__inet_hash_nolisten | +18
__inet_hash | -210
inet_put_port | +8
inet_bind_bucket_create | +1
__inet_hash_connect | -8
5 functions changed, 27 bytes added, 218 bytes removed, diff: -191
net-2.6/net/core/sock.c:
proto_seq_show | +3
1 function changed, 3 bytes added, diff: +3
net-2.6/net/ipv4/inet_connection_sock.c:
inet_csk_get_port | +15
1 function changed, 15 bytes added, diff: +15
net-2.6/net/ipv4/tcp.c:
tcp_set_state | -7
1 function changed, 7 bytes removed, diff: -7
net-2.6/net/ipv4/tcp_ipv4.c:
tcp_v4_get_port | -31
tcp_v4_hash | -48
tcp_v4_destroy_sock | -7
tcp_v4_syn_recv_sock | -2
tcp_unhash | -179
5 functions changed, 267 bytes removed, diff: -267
net-2.6/net/ipv6/inet6_hashtables.c:
__inet6_hash | +8
1 function changed, 8 bytes added, diff: +8
net-2.6/net/ipv4/inet_hashtables.c:
inet_unhash | +190
inet_hash | +242
2 functions changed, 432 bytes added, diff: +432
vmlinux:
16 functions changed, 485 bytes added, 492 bytes removed, diff: -7
/home/acme/git/net-2.6/net/ipv6/tcp_ipv6.c:
tcp_v6_get_port | -31
tcp_v6_hash | -7
tcp_v6_syn_recv_sock | -9
3 functions changed, 47 bytes removed, diff: -47
/home/acme/git/net-2.6/net/dccp/proto.c:
dccp_destroy_sock | -7
dccp_unhash | -179
dccp_hash | -49
dccp_set_state | -7
dccp_done | +1
5 functions changed, 1 bytes added, 242 bytes removed, diff: -241
/home/acme/git/net-2.6/net/dccp/ipv4.c:
dccp_v4_get_port | -31
dccp_v4_request_recv_sock | -2
2 functions changed, 33 bytes removed, diff: -33
/home/acme/git/net-2.6/net/dccp/ipv6.c:
dccp_v6_get_port | -31
dccp_v6_hash | -7
dccp_v6_request_recv_sock | +5
3 functions changed, 5 bytes added, 38 bytes removed, diff: -33
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-02-03 15:06:04 +03:00
2016-04-01 18:52:17 +03:00
if ( sk - > sk_state = = TCP_LISTEN ) {
2017-12-01 23:52:29 +03:00
ilb = & hashinfo - > listening_hash [ inet_sk_listen_hashfn ( sk ) ] ;
lock = & ilb - > lock ;
2016-04-01 18:52:17 +03:00
} else {
2008-11-24 04:22:55 +03:00
lock = inet_ehash_lockp ( hashinfo , sk - > sk_hash ) ;
2016-04-01 18:52:17 +03:00
}
2008-11-24 04:22:55 +03:00
spin_lock_bh ( lock ) ;
2017-12-01 23:52:31 +03:00
if ( sk_unhashed ( sk ) )
goto unlock ;
2016-02-10 19:50:40 +03:00
if ( rcu_access_pointer ( sk - > sk_reuseport_cb ) )
tcp: Keep TCP_CLOSE sockets in the reuseport group.
When we close a listening socket, to migrate its connections to another
listener in the same reuseport group, we have to handle two kinds of child
sockets. One is that a listening socket has a reference to, and the other
is not.
The former is the TCP_ESTABLISHED/TCP_SYN_RECV sockets, and they are in the
accept queue of their listening socket. So we can pop them out and push
them into another listener's queue at close() or shutdown() syscalls. On
the other hand, the latter, the TCP_NEW_SYN_RECV socket is during the
three-way handshake and not in the accept queue. Thus, we cannot access
such sockets at close() or shutdown() syscalls. Accordingly, we have to
migrate immature sockets after their listening socket has been closed.
Currently, if their listening socket has been closed, TCP_NEW_SYN_RECV
sockets are freed at receiving the final ACK or retransmitting SYN+ACKs. At
that time, if we could select a new listener from the same reuseport group,
no connection would be aborted. However, we cannot do that because
reuseport_detach_sock() sets NULL to sk_reuseport_cb and forbids access to
the reuseport group from closed sockets.
This patch allows TCP_CLOSE sockets to remain in the reuseport group and
access it while any child socket references them. The point is that
reuseport_detach_sock() was called twice from inet_unhash() and
sk_destruct(). This patch replaces the first reuseport_detach_sock() with
reuseport_stop_listen_sock(), which checks if the reuseport group is
capable of migration. If capable, it decrements num_socks, moves the socket
backwards in socks[] and increments num_closed_socks. When all connections
are migrated, sk_destruct() calls reuseport_detach_sock() to remove the
socket from socks[], decrement num_closed_socks, and set NULL to
sk_reuseport_cb.
By this change, closed or shutdowned sockets can keep sk_reuseport_cb.
Consequently, calling listen() after shutdown() can cause EADDRINUSE or
EBUSY in inet_csk_bind_conflict() or reuseport_add_sock() which expects
such sockets not to have the reuseport group. Therefore, this patch also
loosens such validation rules so that a socket can listen again if it has a
reuseport group with num_closed_socks more than 0.
When such sockets listen again, we handle them in reuseport_resurrect(). If
there is an existing reuseport group (reuseport_add_sock() path), we move
the socket from the old group to the new one and free the old one if
necessary. If there is no existing group (reuseport_alloc() path), we
allocate a new reuseport group, detach sk from the old one, and free it if
necessary, not to break the current shutdown behaviour:
- we cannot carry over the eBPF prog of shutdowned sockets
- we cannot attach/detach an eBPF prog to/from listening sockets via
shutdowned sockets
Note that when the number of sockets gets over U16_MAX, we try to detach a
closed socket randomly to make room for the new listening socket in
reuseport_grow().
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/bpf/20210612123224.12525-4-kuniyu@amazon.co.jp
2021-06-12 15:32:16 +03:00
reuseport_stop_listen_sock ( sk ) ;
2018-02-01 13:26:23 +03:00
if ( ilb ) {
2017-12-01 23:52:31 +03:00
inet_unhash2 ( hashinfo , sk ) ;
2019-12-14 05:20:41 +03:00
ilb - > count - - ;
2017-12-01 23:52:29 +03:00
}
2019-12-14 05:20:41 +03:00
__sk_nulls_del_node_init_rcu ( sk ) ;
2017-12-01 23:52:31 +03:00
sock_prot_inuse_add ( sock_net ( sk ) , sk - > sk_prot , - 1 ) ;
unlock :
2008-11-24 11:09:29 +03:00
spin_unlock_bh ( lock ) ;
[SOCK] proto: Add hashinfo member to struct proto
This way we can remove TCP and DCCP specific versions of
sk->sk_prot->get_port: both v4 and v6 use inet_csk_get_port
sk->sk_prot->hash: inet_hash is directly used, only v6 need
a specific version to deal with mapped sockets
sk->sk_prot->unhash: both v4 and v6 use inet_hash directly
struct inet_connection_sock_af_ops also gets a new member, bind_conflict, so
that inet_csk_get_port can find the per family routine.
Now only the lookup routines receive as a parameter a struct inet_hashtable.
With this we further reuse code, reducing the difference among INET transport
protocols.
Eventually work has to be done on UDP and SCTP to make them share this
infrastructure and get as a bonus inet_diag interfaces so that iproute can be
used with these protocols.
net-2.6/net/ipv4/inet_hashtables.c:
struct proto | +8
struct inet_connection_sock_af_ops | +8
2 structs changed
__inet_hash_nolisten | +18
__inet_hash | -210
inet_put_port | +8
inet_bind_bucket_create | +1
__inet_hash_connect | -8
5 functions changed, 27 bytes added, 218 bytes removed, diff: -191
net-2.6/net/core/sock.c:
proto_seq_show | +3
1 function changed, 3 bytes added, diff: +3
net-2.6/net/ipv4/inet_connection_sock.c:
inet_csk_get_port | +15
1 function changed, 15 bytes added, diff: +15
net-2.6/net/ipv4/tcp.c:
tcp_set_state | -7
1 function changed, 7 bytes removed, diff: -7
net-2.6/net/ipv4/tcp_ipv4.c:
tcp_v4_get_port | -31
tcp_v4_hash | -48
tcp_v4_destroy_sock | -7
tcp_v4_syn_recv_sock | -2
tcp_unhash | -179
5 functions changed, 267 bytes removed, diff: -267
net-2.6/net/ipv6/inet6_hashtables.c:
__inet6_hash | +8
1 function changed, 8 bytes added, diff: +8
net-2.6/net/ipv4/inet_hashtables.c:
inet_unhash | +190
inet_hash | +242
2 functions changed, 432 bytes added, diff: +432
vmlinux:
16 functions changed, 485 bytes added, 492 bytes removed, diff: -7
/home/acme/git/net-2.6/net/ipv6/tcp_ipv6.c:
tcp_v6_get_port | -31
tcp_v6_hash | -7
tcp_v6_syn_recv_sock | -9
3 functions changed, 47 bytes removed, diff: -47
/home/acme/git/net-2.6/net/dccp/proto.c:
dccp_destroy_sock | -7
dccp_unhash | -179
dccp_hash | -49
dccp_set_state | -7
dccp_done | +1
5 functions changed, 1 bytes added, 242 bytes removed, diff: -241
/home/acme/git/net-2.6/net/dccp/ipv4.c:
dccp_v4_get_port | -31
dccp_v4_request_recv_sock | -2
2 functions changed, 33 bytes removed, diff: -33
/home/acme/git/net-2.6/net/dccp/ipv6.c:
dccp_v6_get_port | -31
dccp_v6_hash | -7
dccp_v6_request_recv_sock | +5
3 functions changed, 5 bytes added, 38 bytes removed, diff: -33
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-02-03 15:06:04 +03:00
}
EXPORT_SYMBOL_GPL ( inet_unhash ) ;
2007-12-21 02:31:33 +03:00
2021-02-09 22:20:27 +03:00
/* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm
* Note that we use 32 bit integers ( vs RFC ' short integers ' )
* because 2 ^ 16 is not a multiple of num_ephemeral and this
* property might be used by clever attacker .
* RFC claims using TABLE_LENGTH = 10 buckets gives an improvement ,
* we use 256 instead to really give more isolation and
* privacy , this only consumes 1 KB of kernel memory .
*/
# define INET_TABLE_PERTURB_SHIFT 8
static u32 table_perturb [ 1 < < INET_TABLE_PERTURB_SHIFT ] ;
2008-01-31 16:04:45 +03:00
int __inet_hash_connect ( struct inet_timewait_death_row * death_row ,
2008-02-05 14:14:44 +03:00
struct sock * sk , u32 port_offset ,
2008-01-31 16:04:45 +03:00
int ( * check_established ) ( struct inet_timewait_death_row * ,
2015-03-19 00:05:37 +03:00
struct sock * , __u16 , struct inet_timewait_sock * * ) )
2005-12-14 10:25:31 +03:00
{
struct inet_hashinfo * hinfo = death_row - > hashinfo ;
tcp/dccp: better use of ephemeral ports in connect()
In commit 07f4c90062f8 ("tcp/dccp: try to not exhaust ip_local_port_range
in connect()"), I added a very simple heuristic, so that we got better
chances to use even ports, and allow bind() users to have more available
slots.
It gave nice results, but with more than 200,000 TCP sessions on a typical
server, the ~30,000 ephemeral ports are still a rare resource.
I chose to go a step further, by looking at all even ports, and if none
was available, fallback to odd ports.
The companion patch does the same in bind(), but in opposite way.
I've seen exec times of up to 30ms on busy servers, so I no longer
disable BH for the whole traversal, but only for each hash bucket.
I also call cond_resched() to be gentle to other tasks.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-12 03:28:49 +03:00
struct inet_timewait_sock * tw = NULL ;
2007-02-09 17:24:47 +03:00
struct inet_bind_hashbucket * head ;
tcp/dccp: better use of ephemeral ports in connect()
In commit 07f4c90062f8 ("tcp/dccp: try to not exhaust ip_local_port_range
in connect()"), I added a very simple heuristic, so that we got better
chances to use even ports, and allow bind() users to have more available
slots.
It gave nice results, but with more than 200,000 TCP sessions on a typical
server, the ~30,000 ephemeral ports are still a rare resource.
I chose to go a step further, by looking at all even ports, and if none
was available, fallback to odd ports.
The companion patch does the same in bind(), but in opposite way.
I've seen exec times of up to 30ms on busy servers, so I no longer
disable BH for the whole traversal, but only for each hash bucket.
I also call cond_resched() to be gentle to other tasks.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-12 03:28:49 +03:00
int port = inet_sk ( sk ) - > inet_num ;
2008-03-25 20:26:21 +03:00
struct net * net = sock_net ( sk ) ;
tcp/dccp: better use of ephemeral ports in connect()
In commit 07f4c90062f8 ("tcp/dccp: try to not exhaust ip_local_port_range
in connect()"), I added a very simple heuristic, so that we got better
chances to use even ports, and allow bind() users to have more available
slots.
It gave nice results, but with more than 200,000 TCP sessions on a typical
server, the ~30,000 ephemeral ports are still a rare resource.
I chose to go a step further, by looking at all even ports, and if none
was available, fallback to odd ports.
The companion patch does the same in bind(), but in opposite way.
I've seen exec times of up to 30ms on busy servers, so I no longer
disable BH for the whole traversal, but only for each hash bucket.
I also call cond_resched() to be gentle to other tasks.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-12 03:28:49 +03:00
struct inet_bind_bucket * tb ;
u32 remaining , offset ;
int ret , i , low , high ;
2018-11-07 18:36:02 +03:00
int l3mdev ;
2021-02-09 22:20:27 +03:00
u32 index ;
tcp/dccp: better use of ephemeral ports in connect()
In commit 07f4c90062f8 ("tcp/dccp: try to not exhaust ip_local_port_range
in connect()"), I added a very simple heuristic, so that we got better
chances to use even ports, and allow bind() users to have more available
slots.
It gave nice results, but with more than 200,000 TCP sessions on a typical
server, the ~30,000 ephemeral ports are still a rare resource.
I chose to go a step further, by looking at all even ports, and if none
was available, fallback to odd ports.
The companion patch does the same in bind(), but in opposite way.
I've seen exec times of up to 30ms on busy servers, so I no longer
disable BH for the whole traversal, but only for each hash bucket.
I also call cond_resched() to be gentle to other tasks.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-12 03:28:49 +03:00
if ( port ) {
head = & hinfo - > bhash [ inet_bhashfn ( net , port ,
hinfo - > bhash_size ) ] ;
tb = inet_csk ( sk ) - > icsk_bind_hash ;
spin_lock_bh ( & head - > lock ) ;
if ( sk_head ( & tb - > owners ) = = sk & & ! sk - > sk_bind_node . next ) {
tcp: fix race condition when creating child sockets from syncookies
When the TCP stack is in SYN flood mode, the server child socket is
created from the SYN cookie received in a TCP packet with the ACK flag
set.
The child socket is created when the server receives the first TCP
packet with a valid SYN cookie from the client. Usually, this packet
corresponds to the final step of the TCP 3-way handshake, the ACK
packet. But is also possible to receive a valid SYN cookie from the
first TCP data packet sent by the client, and thus create a child socket
from that SYN cookie.
Since a client socket is ready to send data as soon as it receives the
SYN+ACK packet from the server, the client can send the ACK packet (sent
by the TCP stack code), and the first data packet (sent by the userspace
program) almost at the same time, and thus the server will equally
receive the two TCP packets with valid SYN cookies almost at the same
instant.
When such event happens, the TCP stack code has a race condition that
occurs between the momement a lookup is done to the established
connections hashtable to check for the existence of a connection for the
same client, and the moment that the child socket is added to the
established connections hashtable. As a consequence, this race condition
can lead to a situation where we add two child sockets to the
established connections hashtable and deliver two sockets to the
userspace program to the same client.
This patch fixes the race condition by checking if an existing child
socket exists for the same client when we are adding the second child
socket to the established connections socket. If an existing child
socket exists, we drop the packet and discard the second child socket
to the same client.
Signed-off-by: Ricardo Dias <rdias@singlestore.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20201120111133.GA67501@rdias-suse-pc.lan
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-11-20 14:11:33 +03:00
inet_ehash_nolisten ( sk , NULL , NULL ) ;
tcp/dccp: better use of ephemeral ports in connect()
In commit 07f4c90062f8 ("tcp/dccp: try to not exhaust ip_local_port_range
in connect()"), I added a very simple heuristic, so that we got better
chances to use even ports, and allow bind() users to have more available
slots.
It gave nice results, but with more than 200,000 TCP sessions on a typical
server, the ~30,000 ephemeral ports are still a rare resource.
I chose to go a step further, by looking at all even ports, and if none
was available, fallback to odd ports.
The companion patch does the same in bind(), but in opposite way.
I've seen exec times of up to 30ms on busy servers, so I no longer
disable BH for the whole traversal, but only for each hash bucket.
I also call cond_resched() to be gentle to other tasks.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-12 03:28:49 +03:00
spin_unlock_bh ( & head - > lock ) ;
return 0 ;
}
spin_unlock ( & head - > lock ) ;
/* No definite answer... Walk to established hash table */
ret = check_established ( death_row , sk , port , NULL ) ;
local_bh_enable ( ) ;
return ret ;
}
2005-12-14 10:25:31 +03:00
2018-11-07 18:36:02 +03:00
l3mdev = inet_sk_bound_l3mdev ( sk ) ;
tcp/dccp: better use of ephemeral ports in connect()
In commit 07f4c90062f8 ("tcp/dccp: try to not exhaust ip_local_port_range
in connect()"), I added a very simple heuristic, so that we got better
chances to use even ports, and allow bind() users to have more available
slots.
It gave nice results, but with more than 200,000 TCP sessions on a typical
server, the ~30,000 ephemeral ports are still a rare resource.
I chose to go a step further, by looking at all even ports, and if none
was available, fallback to odd ports.
The companion patch does the same in bind(), but in opposite way.
I've seen exec times of up to 30ms on busy servers, so I no longer
disable BH for the whole traversal, but only for each hash bucket.
I also call cond_resched() to be gentle to other tasks.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-12 03:28:49 +03:00
inet_get_local_port_range ( net , & low , & high ) ;
high + + ; /* [32768, 60999] -> [32768, 61000[ */
remaining = high - low ;
if ( likely ( remaining > 1 ) )
remaining & = ~ 1U ;
2005-12-14 10:25:31 +03:00
2021-02-09 22:20:27 +03:00
net_get_random_once ( table_perturb , sizeof ( table_perturb ) ) ;
index = hash_32 ( port_offset , INET_TABLE_PERTURB_SHIFT ) ;
offset = ( READ_ONCE ( table_perturb [ index ] ) + port_offset ) % remaining ;
tcp/dccp: better use of ephemeral ports in connect()
In commit 07f4c90062f8 ("tcp/dccp: try to not exhaust ip_local_port_range
in connect()"), I added a very simple heuristic, so that we got better
chances to use even ports, and allow bind() users to have more available
slots.
It gave nice results, but with more than 200,000 TCP sessions on a typical
server, the ~30,000 ephemeral ports are still a rare resource.
I chose to go a step further, by looking at all even ports, and if none
was available, fallback to odd ports.
The companion patch does the same in bind(), but in opposite way.
I've seen exec times of up to 30ms on busy servers, so I no longer
disable BH for the whole traversal, but only for each hash bucket.
I also call cond_resched() to be gentle to other tasks.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-12 03:28:49 +03:00
/* In first pass we try ports of @low parity.
* inet_csk_get_port ( ) does the opposite choice .
*/
offset & = ~ 1U ;
other_parity_scan :
port = low + offset ;
for ( i = 0 ; i < remaining ; i + = 2 , port + = 2 ) {
if ( unlikely ( port > = high ) )
port - = remaining ;
if ( inet_is_local_reserved_port ( net , port ) )
continue ;
head = & hinfo - > bhash [ inet_bhashfn ( net , port ,
hinfo - > bhash_size ) ] ;
spin_lock_bh ( & head - > lock ) ;
2007-10-11 04:30:46 +04:00
tcp/dccp: better use of ephemeral ports in connect()
In commit 07f4c90062f8 ("tcp/dccp: try to not exhaust ip_local_port_range
in connect()"), I added a very simple heuristic, so that we got better
chances to use even ports, and allow bind() users to have more available
slots.
It gave nice results, but with more than 200,000 TCP sessions on a typical
server, the ~30,000 ephemeral ports are still a rare resource.
I chose to go a step further, by looking at all even ports, and if none
was available, fallback to odd ports.
The companion patch does the same in bind(), but in opposite way.
I've seen exec times of up to 30ms on busy servers, so I no longer
disable BH for the whole traversal, but only for each hash bucket.
I also call cond_resched() to be gentle to other tasks.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-12 03:28:49 +03:00
/* Does not bother with rcv_saddr checks, because
* the established check is already unique enough .
tcp/dccp: try to not exhaust ip_local_port_range in connect()
A long standing problem on busy servers is the tiny available TCP port
range (/proc/sys/net/ipv4/ip_local_port_range) and the default
sequential allocation of source ports in connect() system call.
If a host is having a lot of active TCP sessions, chances are
very high that all ports are in use by at least one flow,
and subsequent bind(0) attempts fail, or have to scan a big portion of
space to find a slot.
In this patch, I changed the starting point in __inet_hash_connect()
so that we try to favor even [1] ports, leaving odd ports for bind()
users.
We still perform a sequential search, so there is no guarantee, but
if connect() targets are very different, end result is we leave
more ports available to bind(), and we spread them all over the range,
lowering time for both connect() and bind() to find a slot.
This strategy only works well if /proc/sys/net/ipv4/ip_local_port_range
is even, ie if start/end values have different parity.
Therefore, default /proc/sys/net/ipv4/ip_local_port_range was changed to
32768 - 60999 (instead of 32768 - 61000)
There is no change on security aspects here, only some poor hashing
schemes could be eventually impacted by this change.
[1] : The odd/even property depends on ip_local_port_range values parity
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-25 00:49:35 +03:00
*/
tcp/dccp: better use of ephemeral ports in connect()
In commit 07f4c90062f8 ("tcp/dccp: try to not exhaust ip_local_port_range
in connect()"), I added a very simple heuristic, so that we got better
chances to use even ports, and allow bind() users to have more available
slots.
It gave nice results, but with more than 200,000 TCP sessions on a typical
server, the ~30,000 ephemeral ports are still a rare resource.
I chose to go a step further, by looking at all even ports, and if none
was available, fallback to odd ports.
The companion patch does the same in bind(), but in opposite way.
I've seen exec times of up to 30ms on busy servers, so I no longer
disable BH for the whole traversal, but only for each hash bucket.
I also call cond_resched() to be gentle to other tasks.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-12 03:28:49 +03:00
inet_bind_bucket_for_each ( tb , & head - > chain ) {
2018-11-07 18:36:02 +03:00
if ( net_eq ( ib_net ( tb ) , net ) & & tb - > l3mdev = = l3mdev & &
tb - > port = = port ) {
tcp/dccp: better use of ephemeral ports in connect()
In commit 07f4c90062f8 ("tcp/dccp: try to not exhaust ip_local_port_range
in connect()"), I added a very simple heuristic, so that we got better
chances to use even ports, and allow bind() users to have more available
slots.
It gave nice results, but with more than 200,000 TCP sessions on a typical
server, the ~30,000 ephemeral ports are still a rare resource.
I chose to go a step further, by looking at all even ports, and if none
was available, fallback to odd ports.
The companion patch does the same in bind(), but in opposite way.
I've seen exec times of up to 30ms on busy servers, so I no longer
disable BH for the whole traversal, but only for each hash bucket.
I also call cond_resched() to be gentle to other tasks.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-12 03:28:49 +03:00
if ( tb - > fastreuse > = 0 | |
tb - > fastreuseport > = 0 )
2007-02-09 17:24:47 +03:00
goto next_port ;
tcp/dccp: better use of ephemeral ports in connect()
In commit 07f4c90062f8 ("tcp/dccp: try to not exhaust ip_local_port_range
in connect()"), I added a very simple heuristic, so that we got better
chances to use even ports, and allow bind() users to have more available
slots.
It gave nice results, but with more than 200,000 TCP sessions on a typical
server, the ~30,000 ephemeral ports are still a rare resource.
I chose to go a step further, by looking at all even ports, and if none
was available, fallback to odd ports.
The companion patch does the same in bind(), but in opposite way.
I've seen exec times of up to 30ms on busy servers, so I no longer
disable BH for the whole traversal, but only for each hash bucket.
I also call cond_resched() to be gentle to other tasks.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-12 03:28:49 +03:00
WARN_ON ( hlist_empty ( & tb - > owners ) ) ;
if ( ! check_established ( death_row , sk ,
port , & tw ) )
goto ok ;
goto next_port ;
2007-02-09 17:24:47 +03:00
}
}
2005-12-14 10:25:31 +03:00
tcp/dccp: better use of ephemeral ports in connect()
In commit 07f4c90062f8 ("tcp/dccp: try to not exhaust ip_local_port_range
in connect()"), I added a very simple heuristic, so that we got better
chances to use even ports, and allow bind() users to have more available
slots.
It gave nice results, but with more than 200,000 TCP sessions on a typical
server, the ~30,000 ephemeral ports are still a rare resource.
I chose to go a step further, by looking at all even ports, and if none
was available, fallback to odd ports.
The companion patch does the same in bind(), but in opposite way.
I've seen exec times of up to 30ms on busy servers, so I no longer
disable BH for the whole traversal, but only for each hash bucket.
I also call cond_resched() to be gentle to other tasks.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-12 03:28:49 +03:00
tb = inet_bind_bucket_create ( hinfo - > bind_bucket_cachep ,
2018-11-07 18:36:02 +03:00
net , head , port , l3mdev ) ;
tcp/dccp: better use of ephemeral ports in connect()
In commit 07f4c90062f8 ("tcp/dccp: try to not exhaust ip_local_port_range
in connect()"), I added a very simple heuristic, so that we got better
chances to use even ports, and allow bind() users to have more available
slots.
It gave nice results, but with more than 200,000 TCP sessions on a typical
server, the ~30,000 ephemeral ports are still a rare resource.
I chose to go a step further, by looking at all even ports, and if none
was available, fallback to odd ports.
The companion patch does the same in bind(), but in opposite way.
I've seen exec times of up to 30ms on busy servers, so I no longer
disable BH for the whole traversal, but only for each hash bucket.
I also call cond_resched() to be gentle to other tasks.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-12 03:28:49 +03:00
if ( ! tb ) {
spin_unlock_bh ( & head - > lock ) ;
return - ENOMEM ;
2007-02-09 17:24:47 +03:00
}
tcp/dccp: better use of ephemeral ports in connect()
In commit 07f4c90062f8 ("tcp/dccp: try to not exhaust ip_local_port_range
in connect()"), I added a very simple heuristic, so that we got better
chances to use even ports, and allow bind() users to have more available
slots.
It gave nice results, but with more than 200,000 TCP sessions on a typical
server, the ~30,000 ephemeral ports are still a rare resource.
I chose to go a step further, by looking at all even ports, and if none
was available, fallback to odd ports.
The companion patch does the same in bind(), but in opposite way.
I've seen exec times of up to 30ms on busy servers, so I no longer
disable BH for the whole traversal, but only for each hash bucket.
I also call cond_resched() to be gentle to other tasks.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-12 03:28:49 +03:00
tb - > fastreuse = - 1 ;
tb - > fastreuseport = - 1 ;
goto ok ;
next_port :
spin_unlock_bh ( & head - > lock ) ;
cond_resched ( ) ;
}
2005-12-14 10:25:31 +03:00
tcp/dccp: better use of ephemeral ports in connect()
In commit 07f4c90062f8 ("tcp/dccp: try to not exhaust ip_local_port_range
in connect()"), I added a very simple heuristic, so that we got better
chances to use even ports, and allow bind() users to have more available
slots.
It gave nice results, but with more than 200,000 TCP sessions on a typical
server, the ~30,000 ephemeral ports are still a rare resource.
I chose to go a step further, by looking at all even ports, and if none
was available, fallback to odd ports.
The companion patch does the same in bind(), but in opposite way.
I've seen exec times of up to 30ms on busy servers, so I no longer
disable BH for the whole traversal, but only for each hash bucket.
I also call cond_resched() to be gentle to other tasks.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-12 03:28:49 +03:00
offset + + ;
if ( ( offset & 1 ) & & remaining > 1 )
goto other_parity_scan ;
2005-12-14 10:25:31 +03:00
tcp/dccp: better use of ephemeral ports in connect()
In commit 07f4c90062f8 ("tcp/dccp: try to not exhaust ip_local_port_range
in connect()"), I added a very simple heuristic, so that we got better
chances to use even ports, and allow bind() users to have more available
slots.
It gave nice results, but with more than 200,000 TCP sessions on a typical
server, the ~30,000 ephemeral ports are still a rare resource.
I chose to go a step further, by looking at all even ports, and if none
was available, fallback to odd ports.
The companion patch does the same in bind(), but in opposite way.
I've seen exec times of up to 30ms on busy servers, so I no longer
disable BH for the whole traversal, but only for each hash bucket.
I also call cond_resched() to be gentle to other tasks.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-12 03:28:49 +03:00
return - EADDRNOTAVAIL ;
2005-12-14 10:25:31 +03:00
tcp/dccp: better use of ephemeral ports in connect()
In commit 07f4c90062f8 ("tcp/dccp: try to not exhaust ip_local_port_range
in connect()"), I added a very simple heuristic, so that we got better
chances to use even ports, and allow bind() users to have more available
slots.
It gave nice results, but with more than 200,000 TCP sessions on a typical
server, the ~30,000 ephemeral ports are still a rare resource.
I chose to go a step further, by looking at all even ports, and if none
was available, fallback to odd ports.
The companion patch does the same in bind(), but in opposite way.
I've seen exec times of up to 30ms on busy servers, so I no longer
disable BH for the whole traversal, but only for each hash bucket.
I also call cond_resched() to be gentle to other tasks.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-12 03:28:49 +03:00
ok :
2021-02-09 22:20:28 +03:00
/* If our first attempt found a candidate, skip next candidate
* in 1 / 16 of cases to add some noise .
*/
if ( ! i & & ! ( prandom_u32 ( ) % 16 ) )
i = 2 ;
2021-02-09 22:20:27 +03:00
WRITE_ONCE ( table_perturb [ index ] , READ_ONCE ( table_perturb [ index ] ) + i + 2 ) ;
tcp/dccp: better use of ephemeral ports in connect()
In commit 07f4c90062f8 ("tcp/dccp: try to not exhaust ip_local_port_range
in connect()"), I added a very simple heuristic, so that we got better
chances to use even ports, and allow bind() users to have more available
slots.
It gave nice results, but with more than 200,000 TCP sessions on a typical
server, the ~30,000 ephemeral ports are still a rare resource.
I chose to go a step further, by looking at all even ports, and if none
was available, fallback to odd ports.
The companion patch does the same in bind(), but in opposite way.
I've seen exec times of up to 30ms on busy servers, so I no longer
disable BH for the whole traversal, but only for each hash bucket.
I also call cond_resched() to be gentle to other tasks.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-12 03:28:49 +03:00
/* Head lock still held and bh's disabled */
inet_bind_hash ( sk , tb , port ) ;
if ( sk_unhashed ( sk ) ) {
inet_sk ( sk ) - > inet_sport = htons ( port ) ;
tcp: fix race condition when creating child sockets from syncookies
When the TCP stack is in SYN flood mode, the server child socket is
created from the SYN cookie received in a TCP packet with the ACK flag
set.
The child socket is created when the server receives the first TCP
packet with a valid SYN cookie from the client. Usually, this packet
corresponds to the final step of the TCP 3-way handshake, the ACK
packet. But is also possible to receive a valid SYN cookie from the
first TCP data packet sent by the client, and thus create a child socket
from that SYN cookie.
Since a client socket is ready to send data as soon as it receives the
SYN+ACK packet from the server, the client can send the ACK packet (sent
by the TCP stack code), and the first data packet (sent by the userspace
program) almost at the same time, and thus the server will equally
receive the two TCP packets with valid SYN cookies almost at the same
instant.
When such event happens, the TCP stack code has a race condition that
occurs between the momement a lookup is done to the established
connections hashtable to check for the existence of a connection for the
same client, and the moment that the child socket is added to the
established connections hashtable. As a consequence, this race condition
can lead to a situation where we add two child sockets to the
established connections hashtable and deliver two sockets to the
userspace program to the same client.
This patch fixes the race condition by checking if an existing child
socket exists for the same client when we are adding the second child
socket to the established connections socket. If an existing child
socket exists, we drop the packet and discard the second child socket
to the same client.
Signed-off-by: Ricardo Dias <rdias@singlestore.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20201120111133.GA67501@rdias-suse-pc.lan
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-11-20 14:11:33 +03:00
inet_ehash_nolisten ( sk , ( struct sock * ) tw , NULL ) ;
2005-12-14 10:25:31 +03:00
}
tcp/dccp: better use of ephemeral ports in connect()
In commit 07f4c90062f8 ("tcp/dccp: try to not exhaust ip_local_port_range
in connect()"), I added a very simple heuristic, so that we got better
chances to use even ports, and allow bind() users to have more available
slots.
It gave nice results, but with more than 200,000 TCP sessions on a typical
server, the ~30,000 ephemeral ports are still a rare resource.
I chose to go a step further, by looking at all even ports, and if none
was available, fallback to odd ports.
The companion patch does the same in bind(), but in opposite way.
I've seen exec times of up to 30ms on busy servers, so I no longer
disable BH for the whole traversal, but only for each hash bucket.
I also call cond_resched() to be gentle to other tasks.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-12 03:28:49 +03:00
if ( tw )
inet_twsk_bind_unhash ( tw , hinfo ) ;
spin_unlock ( & head - > lock ) ;
if ( tw )
inet_twsk_deschedule_put ( tw ) ;
local_bh_enable ( ) ;
return 0 ;
2005-12-14 10:25:31 +03:00
}
2008-01-31 16:04:45 +03:00
/*
* Bind a port for a connect operation and hash it .
*/
int inet_hash_connect ( struct inet_timewait_death_row * death_row ,
struct sock * sk )
{
2015-05-27 20:46:02 +03:00
u32 port_offset = 0 ;
if ( ! inet_sk ( sk ) - > inet_num )
port_offset = inet_sk_port_offset ( sk ) ;
return __inet_hash_connect ( death_row , sk , port_offset ,
2015-03-19 00:05:37 +03:00
__inet_check_established ) ;
2008-01-31 16:04:45 +03:00
}
2005-12-14 10:25:31 +03:00
EXPORT_SYMBOL_GPL ( inet_hash_connect ) ;
2008-11-20 11:40:07 +03:00
void inet_hashinfo_init ( struct inet_hashinfo * h )
{
int i ;
2008-11-24 04:22:55 +03:00
for ( i = 0 ; i < INET_LHTABLE_SIZE ; i + + ) {
2008-11-20 11:40:07 +03:00
spin_lock_init ( & h - > listening_hash [ i ] . lock ) ;
2019-12-14 05:20:41 +03:00
INIT_HLIST_NULLS_HEAD ( & h - > listening_hash [ i ] . nulls_head ,
i + LISTENING_NULLS_BASE ) ;
2017-12-01 23:52:29 +03:00
h - > listening_hash [ i ] . count = 0 ;
2016-04-01 18:52:17 +03:00
}
2017-12-01 23:52:31 +03:00
h - > lhash2 = NULL ;
2008-11-20 11:40:07 +03:00
}
EXPORT_SYMBOL_GPL ( inet_hashinfo_init ) ;
2015-05-26 17:55:34 +03:00
2018-12-24 23:57:17 +03:00
static void init_hashinfo_lhash2 ( struct inet_hashinfo * h )
{
int i ;
for ( i = 0 ; i < = h - > lhash2_mask ; i + + ) {
spin_lock_init ( & h - > lhash2 [ i ] . lock ) ;
INIT_HLIST_HEAD ( & h - > lhash2 [ i ] . head ) ;
h - > lhash2 [ i ] . count = 0 ;
}
}
2017-12-01 23:52:31 +03:00
void __init inet_hashinfo2_init ( struct inet_hashinfo * h , const char * name ,
unsigned long numentries , int scale ,
unsigned long low_limit ,
unsigned long high_limit )
{
h - > lhash2 = alloc_large_system_hash ( name ,
sizeof ( * h - > lhash2 ) ,
numentries ,
scale ,
0 ,
NULL ,
& h - > lhash2_mask ,
low_limit ,
high_limit ) ;
2018-12-24 23:57:17 +03:00
init_hashinfo_lhash2 ( h ) ;
}
2017-12-01 23:52:31 +03:00
2018-12-24 23:57:17 +03:00
int inet_hashinfo2_init_mod ( struct inet_hashinfo * h )
{
h - > lhash2 = kmalloc_array ( INET_LHTABLE_SIZE , sizeof ( * h - > lhash2 ) , GFP_KERNEL ) ;
if ( ! h - > lhash2 )
return - ENOMEM ;
h - > lhash2_mask = INET_LHTABLE_SIZE - 1 ;
/* INET_LHTABLE_SIZE must be a power of 2 */
BUG_ON ( INET_LHTABLE_SIZE & h - > lhash2_mask ) ;
init_hashinfo_lhash2 ( h ) ;
return 0 ;
2017-12-01 23:52:31 +03:00
}
2018-12-24 23:57:17 +03:00
EXPORT_SYMBOL_GPL ( inet_hashinfo2_init_mod ) ;
2017-12-01 23:52:31 +03:00
2015-05-26 17:55:34 +03:00
int inet_ehash_locks_alloc ( struct inet_hashinfo * hashinfo )
{
2015-07-22 08:02:00 +03:00
unsigned int locksz = sizeof ( spinlock_t ) ;
2015-05-26 17:55:34 +03:00
unsigned int i , nblocks = 1 ;
2015-07-22 08:02:00 +03:00
if ( locksz ! = 0 ) {
2015-05-26 17:55:34 +03:00
/* allocate 2 cache lines or at least one spinlock per cpu */
2015-07-22 08:02:00 +03:00
nblocks = max ( 2U * L1_CACHE_BYTES / locksz , 1U ) ;
2015-05-26 17:55:34 +03:00
nblocks = roundup_pow_of_two ( nblocks * num_possible_cpus ( ) ) ;
/* no more locks than number of hash buckets */
nblocks = min ( nblocks , hashinfo - > ehash_mask + 1 ) ;
2017-05-09 01:57:27 +03:00
hashinfo - > ehash_locks = kvmalloc_array ( nblocks , locksz , GFP_KERNEL ) ;
2015-05-26 17:55:34 +03:00
if ( ! hashinfo - > ehash_locks )
return - ENOMEM ;
for ( i = 0 ; i < nblocks ; i + + )
spin_lock_init ( & hashinfo - > ehash_locks [ i ] ) ;
}
hashinfo - > ehash_locks_mask = nblocks - 1 ;
return 0 ;
}
EXPORT_SYMBOL_GPL ( inet_ehash_locks_alloc ) ;