2019-05-27 09:55:01 +03:00
/* SPDX-License-Identifier: GPL-2.0-or-later */
2005-08-10 07:09:30 +04:00
/*
* INET An implementation of the TCP / IP protocol suite for the LINUX
* operating system . INET is implemented using the BSD Socket
* interface as the means of communication with the user level .
*
* Definitions for a generic INET TIMEWAIT sock
*
* From code originally in net / tcp . h
*/
# ifndef _INET_TIMEWAIT_SOCK_
# define _INET_TIMEWAIT_SOCK_
# include <linux/list.h>
2005-08-10 07:44:40 +04:00
# include <linux/timer.h>
2005-08-10 07:09:30 +04:00
# include <linux/types.h>
2005-08-10 07:44:40 +04:00
# include <linux/workqueue.h>
2005-08-10 07:09:30 +04:00
2005-12-27 07:43:12 +03:00
# include <net/inet_sock.h>
2005-08-10 07:09:30 +04:00
# include <net/sock.h>
# include <net/tcp_states.h>
2005-12-14 10:25:19 +03:00
# include <net/timewait_sock.h>
2005-08-10 07:09:30 +04:00
2011-07-27 03:09:06 +04:00
# include <linux/atomic.h>
2005-08-10 07:09:30 +04:00
struct inet_bind_bucket ;
/*
* This is a TIME_WAIT sock . It works around the memory consumption
* problems of sockets in such a state on heavily loaded servers , but
* without violating the protocol specification .
*/
struct inet_timewait_sock {
/*
* Now struct sock also uses sock_common , so please just
* don ' t add nothing before this first member ( __tw_common ) - - acme
*/
struct sock_common __tw_common ;
# define tw_family __tw_common.skc_family
# define tw_state __tw_common.skc_state
# define tw_reuse __tw_common.skc_reuse
2018-04-07 23:42:43 +03:00
# define tw_reuseport __tw_common.skc_reuseport
2014-06-27 19:36:16 +04:00
# define tw_ipv6only __tw_common.skc_ipv6only
2005-08-10 07:09:30 +04:00
# define tw_bound_dev_if __tw_common.skc_bound_dev_if
2008-11-17 06:40:17 +03:00
# define tw_node __tw_common.skc_nulls_node
2005-08-10 07:09:30 +04:00
# define tw_bind_node __tw_common.skc_bind_node
# define tw_refcnt __tw_common.skc_refcnt
[INET]: speedup inet (tcp/dccp) lookups
Arnaldo and I agreed it could be applied now, because I have other
pending patches depending on this one (Thank you Arnaldo)
(The other important patch moves skc_refcnt in a separate cache line,
so that the SMP/NUMA performance doesnt suffer from cache line ping pongs)
1) First some performance data :
--------------------------------
tcp_v4_rcv() wastes a *lot* of time in __inet_lookup_established()
The most time critical code is :
sk_for_each(sk, node, &head->chain) {
if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif))
goto hit; /* You sunk my battleship! */
}
The sk_for_each() does use prefetch() hints but only the begining of
"struct sock" is prefetched.
As INET_MATCH first comparison uses inet_sk(__sk)->daddr, wich is far
away from the begining of "struct sock", it has to bring into CPU
cache cold cache line. Each iteration has to use at least 2 cache
lines.
This can be problematic if some chains are very long.
2) The goal
-----------
The idea I had is to change things so that INET_MATCH() may return
FALSE in 99% of cases only using the data already in the CPU cache,
using one cache line per iteration.
3) Description of the patch
---------------------------
Adds a new 'unsigned int skc_hash' field in 'struct sock_common',
filling a 32 bits hole on 64 bits platform.
struct sock_common {
unsigned short skc_family;
volatile unsigned char skc_state;
unsigned char skc_reuse;
int skc_bound_dev_if;
struct hlist_node skc_node;
struct hlist_node skc_bind_node;
atomic_t skc_refcnt;
+ unsigned int skc_hash;
struct proto *skc_prot;
};
Store in this 32 bits field the full hash, not masked by (ehash_size -
1) Using this full hash as the first comparison done in INET_MATCH
permits us immediatly skip the element without touching a second cache
line in case of a miss.
Suppress the sk_hashent/tw_hashent fields since skc_hash (aliased to
sk_hash and tw_hash) already contains the slot number if we mask with
(ehash_size - 1)
File include/net/inet_hashtables.h
64 bits platforms :
#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
(((__sk)->sk_hash == (__hash))
((*((__u64 *)&(inet_sk(__sk)->daddr)))== (__cookie)) && \
((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
32bits platforms:
#define TCP_IPV4_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
(((__sk)->sk_hash == (__hash)) && \
(inet_sk(__sk)->daddr == (__saddr)) && \
(inet_sk(__sk)->rcv_saddr == (__daddr)) && \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
- Adds a prefetch(head->chain.first) in
__inet_lookup_established()/__tcp_v4_check_established() and
__inet6_lookup_established()/__tcp_v6_check_established() and
__dccp_v4_check_established() to bring into cache the first element of the
list, before the {read|write}_lock(&head->lock);
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-10-04 01:13:38 +04:00
# define tw_hash __tw_common.skc_hash
2005-08-10 07:09:30 +04:00
# define tw_prot __tw_common.skc_prot
2007-09-12 13:58:02 +04:00
# define tw_net __tw_common.skc_net
net: optimize INET input path further
Followup of commit b178bb3dfc30 (net: reorder struct sock fields)
Optimize INET input path a bit further, by :
1) moving sk_refcnt close to sk_lock.
This reduces number of dirtied cache lines by one on 64bit arches (and
64 bytes cache line size).
2) moving inet_daddr & inet_rcv_saddr at the beginning of sk
(same cache line than hash / family / bound_dev_if / nulls_node)
This reduces number of accessed cache lines in lookups by one, and dont
increase size of inet and timewait socks.
inet and tw sockets now share same place-holder for these fields.
Before patch :
offsetof(struct sock, sk_refcnt) = 0x10
offsetof(struct sock, sk_lock) = 0x40
offsetof(struct sock, sk_receive_queue) = 0x60
offsetof(struct inet_sock, inet_daddr) = 0x270
offsetof(struct inet_sock, inet_rcv_saddr) = 0x274
After patch :
offsetof(struct sock, sk_refcnt) = 0x44
offsetof(struct sock, sk_lock) = 0x48
offsetof(struct sock, sk_receive_queue) = 0x68
offsetof(struct inet_sock, inet_daddr) = 0x0
offsetof(struct inet_sock, inet_rcv_saddr) = 0x4
compute_score() (udp or tcp) now use a single cache line per ignored
item, instead of two.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-11-30 22:04:07 +03:00
# define tw_daddr __tw_common.skc_daddr
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 02:42:29 +04:00
# define tw_v6_daddr __tw_common.skc_v6_daddr
net: optimize INET input path further
Followup of commit b178bb3dfc30 (net: reorder struct sock fields)
Optimize INET input path a bit further, by :
1) moving sk_refcnt close to sk_lock.
This reduces number of dirtied cache lines by one on 64bit arches (and
64 bytes cache line size).
2) moving inet_daddr & inet_rcv_saddr at the beginning of sk
(same cache line than hash / family / bound_dev_if / nulls_node)
This reduces number of accessed cache lines in lookups by one, and dont
increase size of inet and timewait socks.
inet and tw sockets now share same place-holder for these fields.
Before patch :
offsetof(struct sock, sk_refcnt) = 0x10
offsetof(struct sock, sk_lock) = 0x40
offsetof(struct sock, sk_receive_queue) = 0x60
offsetof(struct inet_sock, inet_daddr) = 0x270
offsetof(struct inet_sock, inet_rcv_saddr) = 0x274
After patch :
offsetof(struct sock, sk_refcnt) = 0x44
offsetof(struct sock, sk_lock) = 0x48
offsetof(struct sock, sk_receive_queue) = 0x68
offsetof(struct inet_sock, inet_daddr) = 0x0
offsetof(struct inet_sock, inet_rcv_saddr) = 0x4
compute_score() (udp or tcp) now use a single cache line per ignored
item, instead of two.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-11-30 22:04:07 +03:00
# define tw_rcv_saddr __tw_common.skc_rcv_saddr
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 02:42:29 +04:00
# define tw_v6_rcv_saddr __tw_common.skc_v6_rcv_saddr
2012-11-30 13:49:27 +04:00
# define tw_dport __tw_common.skc_dport
# define tw_num __tw_common.skc_num
2015-03-12 04:53:14 +03:00
# define tw_cookie __tw_common.skc_cookie
2015-10-09 05:33:22 +03:00
# define tw_dr __tw_common.skc_tw_dr
2012-11-30 13:49:27 +04:00
2018-05-10 09:53:51 +03:00
__u32 tw_mark ;
2005-08-10 07:09:30 +04:00
volatile unsigned char tw_substate ;
unsigned char tw_rcv_wscale ;
net: optimize INET input path further
Followup of commit b178bb3dfc30 (net: reorder struct sock fields)
Optimize INET input path a bit further, by :
1) moving sk_refcnt close to sk_lock.
This reduces number of dirtied cache lines by one on 64bit arches (and
64 bytes cache line size).
2) moving inet_daddr & inet_rcv_saddr at the beginning of sk
(same cache line than hash / family / bound_dev_if / nulls_node)
This reduces number of accessed cache lines in lookups by one, and dont
increase size of inet and timewait socks.
inet and tw sockets now share same place-holder for these fields.
Before patch :
offsetof(struct sock, sk_refcnt) = 0x10
offsetof(struct sock, sk_lock) = 0x40
offsetof(struct sock, sk_receive_queue) = 0x60
offsetof(struct inet_sock, inet_daddr) = 0x270
offsetof(struct inet_sock, inet_rcv_saddr) = 0x274
After patch :
offsetof(struct sock, sk_refcnt) = 0x44
offsetof(struct sock, sk_lock) = 0x48
offsetof(struct sock, sk_receive_queue) = 0x68
offsetof(struct inet_sock, inet_daddr) = 0x0
offsetof(struct inet_sock, inet_rcv_saddr) = 0x4
compute_score() (udp or tcp) now use a single cache line per ignored
item, instead of two.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-11-30 22:04:07 +03:00
2005-08-10 07:09:30 +04:00
/* Socket demultiplex comparisons on incoming packets. */
net: optimize INET input path further
Followup of commit b178bb3dfc30 (net: reorder struct sock fields)
Optimize INET input path a bit further, by :
1) moving sk_refcnt close to sk_lock.
This reduces number of dirtied cache lines by one on 64bit arches (and
64 bytes cache line size).
2) moving inet_daddr & inet_rcv_saddr at the beginning of sk
(same cache line than hash / family / bound_dev_if / nulls_node)
This reduces number of accessed cache lines in lookups by one, and dont
increase size of inet and timewait socks.
inet and tw sockets now share same place-holder for these fields.
Before patch :
offsetof(struct sock, sk_refcnt) = 0x10
offsetof(struct sock, sk_lock) = 0x40
offsetof(struct sock, sk_receive_queue) = 0x60
offsetof(struct inet_sock, inet_daddr) = 0x270
offsetof(struct inet_sock, inet_rcv_saddr) = 0x274
After patch :
offsetof(struct sock, sk_refcnt) = 0x44
offsetof(struct sock, sk_lock) = 0x48
offsetof(struct sock, sk_receive_queue) = 0x68
offsetof(struct inet_sock, inet_daddr) = 0x0
offsetof(struct inet_sock, inet_rcv_saddr) = 0x4
compute_score() (udp or tcp) now use a single cache line per ignored
item, instead of two.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-11-30 22:04:07 +03:00
/* these three are in inet_sock */
2006-09-28 05:43:50 +04:00
__be16 tw_sport ;
2005-08-10 07:09:30 +04:00
/* And these are ours. */
2022-01-24 23:24:53 +03:00
unsigned int tw_transparent : 1 ,
2014-01-16 20:21:22 +04:00
tw_flowlabel : 20 ,
2023-10-20 15:57:47 +03:00
tw_usec_ts : 1 ,
tw_pad : 2 , /* 2 bits hole */
2014-01-16 20:21:22 +04:00
tw_tos : 8 ;
2019-06-09 03:58:51 +03:00
u32 tw_txhash ;
2019-09-24 18:01:16 +03:00
u32 tw_priority ;
tcp/dccp: get rid of central timewait timer
Using a timer wheel for timewait sockets was nice ~15 years ago when
memory was expensive and machines had a single processor.
This does not scale, code is ugly and source of huge latencies
(Typically 30 ms have been seen, cpus spinning on death_lock spinlock.)
We can afford to use an extra 64 bytes per timewait sock and spread
timewait load to all cpus to have better behavior.
Tested:
On following test, /proc/sys/net/ipv4/tcp_tw_recycle is set to 1
on the target (lpaa24)
Before patch :
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
419594
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
437171
While test is running, we can observe 25 or even 33 ms latencies.
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20601ms
rtt min/avg/max/mdev = 0.020/0.217/25.771/1.535 ms, pipe 2
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20702ms
rtt min/avg/max/mdev = 0.019/0.183/33.761/1.441 ms, pipe 2
After patch :
About 90% increase of throughput :
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
810442
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
800992
And latencies are kept to minimal values during this load, even
if network utilization is 90% higher :
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 19991ms
rtt min/avg/max/mdev = 0.023/0.064/0.360/0.042 ms
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-04-13 04:51:09 +03:00
struct timer_list tw_timer ;
2005-08-10 07:09:30 +04:00
struct inet_bind_bucket * tw_tb ;
2022-12-26 16:27:52 +03:00
struct inet_bind2_bucket * tw_tb2 ;
2005-08-10 07:09:30 +04:00
} ;
2011-10-27 08:44:35 +04:00
# define tw_tclass tw_tos
2005-08-10 07:09:30 +04:00
static inline struct inet_timewait_sock * inet_twsk ( const struct sock * sk )
{
return ( struct inet_timewait_sock * ) sk ;
}
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 11:22:02 +04:00
void inet_twsk_free ( struct inet_timewait_sock * tw ) ;
2013-09-21 21:22:41 +04:00
void inet_twsk_put ( struct inet_timewait_sock * tw ) ;
2005-08-10 07:09:46 +04:00
2015-07-09 00:28:29 +03:00
void inet_twsk_bind_unhash ( struct inet_timewait_sock * tw ,
struct inet_hashinfo * hashinfo ) ;
2009-12-04 06:47:42 +03:00
2013-09-21 21:22:41 +04:00
struct inet_timewait_sock * inet_twsk_alloc ( const struct sock * sk ,
tcp/dccp: get rid of central timewait timer
Using a timer wheel for timewait sockets was nice ~15 years ago when
memory was expensive and machines had a single processor.
This does not scale, code is ugly and source of huge latencies
(Typically 30 ms have been seen, cpus spinning on death_lock spinlock.)
We can afford to use an extra 64 bytes per timewait sock and spread
timewait load to all cpus to have better behavior.
Tested:
On following test, /proc/sys/net/ipv4/tcp_tw_recycle is set to 1
on the target (lpaa24)
Before patch :
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
419594
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
437171
While test is running, we can observe 25 or even 33 ms latencies.
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20601ms
rtt min/avg/max/mdev = 0.020/0.217/25.771/1.535 ms, pipe 2
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20702ms
rtt min/avg/max/mdev = 0.019/0.183/33.761/1.441 ms, pipe 2
After patch :
About 90% increase of throughput :
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
810442
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
800992
And latencies are kept to minimal values during this load, even
if network utilization is 90% higher :
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 19991ms
rtt min/avg/max/mdev = 0.023/0.064/0.360/0.042 ms
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-04-13 04:51:09 +03:00
struct inet_timewait_death_row * dr ,
2013-09-21 21:22:41 +04:00
const int state ) ;
2005-08-10 07:09:59 +04:00
2017-12-12 08:25:12 +03:00
void inet_twsk_hashdance ( struct inet_timewait_sock * tw , struct sock * sk ,
struct inet_hashinfo * hashinfo ) ;
2005-08-10 07:45:03 +04:00
2015-09-19 19:08:34 +03:00
void __inet_twsk_schedule ( struct inet_timewait_sock * tw , int timeo ,
bool rearm ) ;
2015-10-01 14:48:53 +03:00
static inline void inet_twsk_schedule ( struct inet_timewait_sock * tw , int timeo )
2015-09-19 19:08:34 +03:00
{
__inet_twsk_schedule ( tw , timeo , false ) ;
}
2015-10-01 14:48:53 +03:00
static inline void inet_twsk_reschedule ( struct inet_timewait_sock * tw , int timeo )
2015-09-19 19:08:34 +03:00
{
__inet_twsk_schedule ( tw , timeo , true ) ;
}
2015-07-09 00:28:30 +03:00
void inet_twsk_deschedule_put ( struct inet_timewait_sock * tw ) ;
2008-03-25 20:26:21 +03:00
2022-05-13 00:14:56 +03:00
void inet_twsk_purge ( struct inet_hashinfo * hashinfo , int family ) ;
2008-03-25 20:26:21 +03:00
static inline
struct net * twsk_net ( const struct inet_timewait_sock * twsk )
{
2011-12-14 08:58:33 +04:00
return read_pnet ( & twsk - > tw_net ) ;
2008-03-25 20:26:21 +03:00
}
static inline
2008-03-26 10:48:17 +03:00
void twsk_net_set ( struct inet_timewait_sock * twsk , struct net * net )
2008-03-25 20:26:21 +03:00
{
2011-12-14 08:58:33 +04:00
write_pnet ( & twsk - > tw_net , net ) ;
2008-03-25 20:26:21 +03:00
}
2005-08-10 07:09:30 +04:00
# endif /* _INET_TIMEWAIT_SOCK_ */