2019-05-19 15:08:55 +03:00
// SPDX-License-Identifier: GPL-2.0-only
2005-04-17 02:20:36 +04:00
/*
* INET An implementation of the TCP / IP protocol suite for the LINUX
* operating system . INET is implemented using the BSD Socket
* interface as the means of communication with the user level .
*
* Implementation of the Transmission Control Protocol ( TCP ) .
*
2005-05-06 03:16:16 +04:00
* Authors : Ross Biro
2005-04-17 02:20:36 +04:00
* Fred N . van Kempen , < waltje @ uWalt . NL . Mugnet . ORG >
* Mark Evans , < evansmp @ uhura . aston . ac . uk >
* Corey Minyard < wf - rch ! minyard @ relay . EU . net >
* Florian La Roche , < flla @ stud . uni - sb . de >
* Charles Hedrick , < hedrick @ klinzhai . rutgers . edu >
* Linus Torvalds , < torvalds @ cs . helsinki . fi >
* Alan Cox , < gw4pts @ gw4pts . ampr . org >
* Matthew Dillon , < dillon @ apollo . west . oic . com >
* Arnt Gulbrandsen , < agulbra @ nvg . unit . no >
* Jorge Cwik , < jorge @ laser . satlink . net >
*/
# include <net/tcp.h>
# include <net/xfrm.h>
2017-03-24 20:08:00 +03:00
# include <net/busy_poll.h>
2005-04-17 02:20:36 +04:00
2012-05-17 03:15:34 +04:00
/*
 * Check whether the segment [seq, end_seq] is acceptable for the window
 * delimited by s_win (left edge) and e_win (right edge).
 *
 * The segment is accepted when any of the following holds:
 *  - it starts exactly at the left edge;
 *  - it ends after the left edge and starts before the right edge
 *    (compared with the after()/before() sequence helpers);
 *  - it is empty (seq == end_seq) and sits exactly on the right edge.
 *
 * Returns true if the segment is acceptable, false otherwise.
 */
static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
	return seq == s_win ||
	       (after(end_seq, s_win) && before(seq, e_win)) ||
	       (seq == e_win && seq == end_seq);
}
2015-02-07 00:04:41 +03:00
static enum tcp_tw_status
tcp_timewait_check_oow_rate_limit ( struct inet_timewait_sock * tw ,
const struct sk_buff * skb , int mib_idx )
{
struct tcp_timewait_sock * tcptw = tcp_twsk ( ( struct sock * ) tw ) ;
if ( ! tcp_oow_rate_limited ( twsk_net ( tw ) , skb , mib_idx ,
& tcptw - > tw_last_oow_ack_time ) ) {
/* Send ACK. Note, we do not put the bucket,
* it will be released by caller .
*/
return TCP_TW_ACK ;
}
/* We are rate-limiting, so just release the tw sock and drop skb. */
inet_twsk_put ( tw ) ;
return TCP_TW_SUCCESS ;
}
2007-02-09 17:24:47 +03:00
/*
 * tcp_timewait_state_process - process a segment matched to a timewait bucket
 * @tw:  timewait socket (substate TCP_FIN_WAIT2 or real TIME-WAIT)
 * @skb: the received segment
 * @th:  TCP header of @skb
 *
 * Return values, as produced by the code below:
 *   TCP_TW_SUCCESS - nothing more to do; inet_twsk_put() or
 *                    inet_twsk_deschedule_put() has already been called
 *                    on @tw by this function (or by the rate-limit helper).
 *   TCP_TW_ACK     - an ACK should be sent; @tw has not been put here.
 *   TCP_TW_RST     - a reset should be sent; @tw has not been put here.
 *   TCP_TW_SYN     - an acceptable new SYN arrived; the initial sequence
 *                    number chosen for it (tw_snd_nxt + 65535 + 2, never 0)
 *                    is stored in TCP_SKB_CB(skb)->tcp_tw_isn.
 *
 * Background:
 *
 * * Main purpose of TIME-WAIT state is to close connection gracefully,
 *   when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
 *   (and, probably, tail of data) and one or more our ACKs are lost.
 * * What is TIME-WAIT timeout? It is associated with maximal packet
 *   lifetime in the internet, which results in wrong conclusion, that
 *   it is set to catch "old duplicate segments" wandering out of their path.
 *   It is not quite correct. This timeout is calculated so that it exceeds
 *   maximal retransmission timeout enough to allow to lose one (or more)
 *   segments sent by peer and our ACKs. This time may be calculated from RTO.
 * * When TIME-WAIT socket receives RST, it means that another end
 *   finally closed and we are allowed to kill TIME-WAIT too.
 * * Second purpose of TIME-WAIT is catching old duplicate segments.
 *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
 *   with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
 * * If we invented some more clever way to catch duplicates
 *   (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
 *
 * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
 * When you compare it to RFCs, please, read section SEGMENT ARRIVES
 * from the very beginning.
 *
 * NOTE. With recycling (and later with fin-wait-2) TW bucket
 * is _not_ stateless. It means, that strictly speaking we must
 * spinlock it. I do not want! Well, probability of misbehaviour
 * is ridiculously low and, seems, we could use some mb() tricks
 * to avoid misread sequence numbers, states etc. --ANK
 *
 * We do not need to initialize tmp_opt.sack_ok as we do not use the results.
 */
enum tcp_tw_status
2005-08-10 07:09:30 +04:00
tcp_timewait_state_process ( struct inet_timewait_sock * tw , struct sk_buff * skb ,
const struct tcphdr * th )
2005-04-17 02:20:36 +04:00
{
struct tcp_options_received tmp_opt ;
2009-12-02 21:25:27 +03:00
struct tcp_timewait_sock * tcptw = tcp_twsk ( ( struct sock * ) tw ) ;
2012-05-17 03:15:34 +04:00
/* Result of the PAWS check; stays false when no timestamp option is parsed. */
bool paws_reject = false ;
2005-04-17 02:20:36 +04:00
tcp: Revert per-route SACK/DSACK/TIMESTAMP changes.
It creates a regression, triggering badness for SYN_RECV
sockets, for example:
[19148.022102] Badness at net/ipv4/inet_connection_sock.c:293
[19148.022570] NIP: c02a0914 LR: c02a0904 CTR: 00000000
[19148.023035] REGS: eeecbd30 TRAP: 0700 Not tainted (2.6.32)
[19148.023496] MSR: 00029032 <EE,ME,CE,IR,DR> CR: 24002442 XER: 00000000
[19148.024012] TASK = eee9a820[1756] 'privoxy' THREAD: eeeca000
This is likely caused by the change in the 'estab' parameter
passed to tcp_parse_options() when invoked by the functions
in net/ipv4/tcp_minisocks.c
But even if that is fixed, the ->conn_request() changes made in
this patch series is fundamentally wrong. They try to use the
listening socket's 'dst' to probe the route settings. The
listening socket doesn't even have a route, and you can't
get the right route (the child request one) until much later
after we setup all of the state, and it must be done by hand.
This stuff really isn't ready, so the best thing to do is a
full revert. This reverts the following commits:
f55017a93f1a74d50244b1254b9a2bd7ac9bbf7d
022c3f7d82f0f1c68018696f2f027b87b9bb45c2
1aba721eba1d84a2defce45b950272cee1e6c72a
cda42ebd67ee5fdf09d7057b5a4584d36fe8a335
345cda2fd695534be5a4494f1b59da9daed33663
dc343475ed062e13fc260acccaab91d7d80fd5b2
05eaade2782fb0c90d3034fd7a7d5a16266182bb
6a2a2d6bf8581216e08be15fcb563cfd6c430e1e
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-12-16 07:56:42 +03:00
tmp_opt . saw_tstamp = 0 ;
2005-08-10 07:09:30 +04:00
if ( th - > doff > ( sizeof ( * th ) > > 2 ) & & tcptw - > tw_ts_recent_stamp ) {
2017-06-07 20:34:36 +03:00
tcp_parse_options ( twsk_net ( tw ) , skb , & tmp_opt , 0 , NULL ) ;
2005-04-17 02:20:36 +04:00
if ( tmp_opt . saw_tstamp ) {
2017-02-22 13:23:56 +03:00
if ( tmp_opt . rcv_tsecr )
tmp_opt . rcv_tsecr - = tcptw - > tw_ts_offset ;
2005-08-10 07:09:30 +04:00
tmp_opt . ts_recent = tcptw - > tw_ts_recent ;
tmp_opt . ts_recent_stamp = tcptw - > tw_ts_recent_stamp ;
2009-03-14 17:23:03 +03:00
paws_reject = tcp_paws_reject ( & tmp_opt , th - > rst ) ;
2005-04-17 02:20:36 +04:00
}
}
if ( tw - > tw_substate = = TCP_FIN_WAIT2 ) {
/* Just repeat all the checks of tcp_rcv_state_process() */
/* Out of window, send ACK */
if ( paws_reject | |
! tcp_in_window ( TCP_SKB_CB ( skb ) - > seq , TCP_SKB_CB ( skb ) - > end_seq ,
2005-08-10 07:09:30 +04:00
tcptw - > tw_rcv_nxt ,
tcptw - > tw_rcv_nxt + tcptw - > tw_rcv_wnd ) )
2015-02-07 00:04:41 +03:00
return tcp_timewait_check_oow_rate_limit (
tw , skb , LINUX_MIB_TCPACKSKIPPEDFINWAIT2 ) ;
2005-04-17 02:20:36 +04:00
if ( th - > rst )
goto kill ;
2005-08-10 07:09:30 +04:00
if ( th - > syn & & ! before ( TCP_SKB_CB ( skb ) - > seq , tcptw - > tw_rcv_nxt ) )
tcp: honour SO_BINDTODEVICE for TW_RST case too
Hannes points out that when we generate tcp reset for timewait sockets we
pretend we found no socket and pass NULL sk to tcp_vX_send_reset().
Make it cope with inet tw sockets and then provide tw sk.
This makes RSTs appear on correct interface when SO_BINDTODEVICE is used.
Packetdrill test case:
// want default route to be used, we rely on BINDTODEVICE
`ip route del 192.0.2.0/24 via 192.168.0.2 dev tun0`
0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
// test case still works due to BINDTODEVICE
0.001 setsockopt(3, SOL_SOCKET, SO_BINDTODEVICE, "tun0", 4) = 0
0.100...0.200 connect(3, ..., ...) = 0
0.100 > S 0:0(0) <mss 1460,sackOK,nop,nop>
0.200 < S. 0:0(0) ack 1 win 32792 <mss 1460,sackOK,nop,nop>
0.200 > . 1:1(0) ack 1
0.210 close(3) = 0
0.210 > F. 1:1(0) ack 1 win 29200
0.300 < . 1:1(0) ack 2 win 46
// more data while in FIN_WAIT2, expect RST
1.300 < P. 1:1001(1000) ack 1 win 46
// fails without this change -- default route is used
1.301 > R 1:1(0) win 0
Reported-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-12-21 23:29:26 +03:00
return TCP_TW_RST ;
2005-04-17 02:20:36 +04:00
/* Dup ACK? */
2009-06-25 02:29:31 +04:00
if ( ! th - > ack | |
! after ( TCP_SKB_CB ( skb ) - > end_seq , tcptw - > tw_rcv_nxt ) | |
2005-04-17 02:20:36 +04:00
TCP_SKB_CB ( skb ) - > end_seq = = TCP_SKB_CB ( skb ) - > seq ) {
2005-08-10 07:09:30 +04:00
inet_twsk_put ( tw ) ;
2005-04-17 02:20:36 +04:00
return TCP_TW_SUCCESS ;
}
/* New data or FIN. If new data arrive after half-duplex close,
* reset .
*/
if ( ! th - > fin | |
tcp: honour SO_BINDTODEVICE for TW_RST case too
Hannes points out that when we generate tcp reset for timewait sockets we
pretend we found no socket and pass NULL sk to tcp_vX_send_reset().
Make it cope with inet tw sockets and then provide tw sk.
This makes RSTs appear on correct interface when SO_BINDTODEVICE is used.
Packetdrill test case:
// want default route to be used, we rely on BINDTODEVICE
`ip route del 192.0.2.0/24 via 192.168.0.2 dev tun0`
0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
// test case still works due to BINDTODEVICE
0.001 setsockopt(3, SOL_SOCKET, SO_BINDTODEVICE, "tun0", 4) = 0
0.100...0.200 connect(3, ..., ...) = 0
0.100 > S 0:0(0) <mss 1460,sackOK,nop,nop>
0.200 < S. 0:0(0) ack 1 win 32792 <mss 1460,sackOK,nop,nop>
0.200 > . 1:1(0) ack 1
0.210 close(3) = 0
0.210 > F. 1:1(0) ack 1 win 29200
0.300 < . 1:1(0) ack 2 win 46
// more data while in FIN_WAIT2, expect RST
1.300 < P. 1:1001(1000) ack 1 win 46
// fails without this change -- default route is used
1.301 > R 1:1(0) win 0
Reported-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-12-21 23:29:26 +03:00
TCP_SKB_CB ( skb ) - > end_seq ! = tcptw - > tw_rcv_nxt + 1 )
2005-04-17 02:20:36 +04:00
return TCP_TW_RST ;
/* FIN arrived, enter true time-wait state. */
2005-08-10 07:09:30 +04:00
tw - > tw_substate = TCP_TIME_WAIT ;
tcptw - > tw_rcv_nxt = TCP_SKB_CB ( skb ) - > end_seq ;
2005-04-17 02:20:36 +04:00
if ( tmp_opt . saw_tstamp ) {
2018-07-11 13:16:12 +03:00
tcptw - > tw_ts_recent_stamp = ktime_get_seconds ( ) ;
2005-08-10 07:09:30 +04:00
tcptw - > tw_ts_recent = tmp_opt . rcv_tsval ;
2005-04-17 02:20:36 +04:00
}
2017-03-15 23:30:45 +03:00
inet_twsk_reschedule ( tw , TCP_TIMEWAIT_LEN ) ;
2005-04-17 02:20:36 +04:00
return TCP_TW_ACK ;
}
/*
* Now real TIME - WAIT state .
*
* RFC 1122 :
* " When a connection is [...] on TIME-WAIT state [...]
* [ a TCP ] MAY accept a new SYN from the remote TCP to
* reopen the connection directly , if it :
2007-02-09 17:24:47 +03:00
*
2005-04-17 02:20:36 +04:00
* ( 1 ) assigns its initial sequence number for the new
* connection to be larger than the largest sequence
* number it used on the previous connection incarnation ,
* and
*
2007-02-09 17:24:47 +03:00
* ( 2 ) returns to TIME - WAIT state if the SYN turns out
2005-04-17 02:20:36 +04:00
* to be an old duplicate " .
*/
if ( ! paws_reject & &
2005-08-10 07:09:30 +04:00
( TCP_SKB_CB ( skb ) - > seq = = tcptw - > tw_rcv_nxt & &
2005-04-17 02:20:36 +04:00
( TCP_SKB_CB ( skb ) - > seq = = TCP_SKB_CB ( skb ) - > end_seq | | th - > rst ) ) ) {
/* In window segment, it may be only reset or bare ack. */
if ( th - > rst ) {
2005-11-11 04:13:47 +03:00
/* This is TIME_WAIT assassination, in two flavors.
2005-04-17 02:20:36 +04:00
* Oh well . . . nobody has a sufficient solution to this
* protocol bug yet .
*/
2022-07-18 20:26:51 +03:00
if ( ! READ_ONCE ( twsk_net ( tw ) - > ipv4 . sysctl_tcp_rfc1337 ) ) {
2005-04-17 02:20:36 +04:00
kill :
2015-07-09 00:28:30 +03:00
inet_twsk_deschedule_put ( tw ) ;
2005-04-17 02:20:36 +04:00
return TCP_TW_SUCCESS ;
}
tcp: do not restart timewait timer on rst reception
RFC 1337 says:
''Ignore RST segments in TIME-WAIT state.
If the 2 minute MSL is enforced, this fix avoids all three hazards.''
So with net.ipv4.tcp_rfc1337=1, expected behaviour is to have TIME-WAIT sk
expire rather than removing it instantly when a reset is received.
However, Linux will also re-start the TIME-WAIT timer.
This causes connect to fail when tying to re-use ports or very long
delays (until syn retry interval exceeds MSL).
packetdrill test case:
// Demonstrate bogus rearming of TIME-WAIT timer in rfc1337 mode.
`sysctl net.ipv4.tcp_rfc1337=1`
0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
0.000 bind(3, ..., ...) = 0
0.000 listen(3, 1) = 0
0.100 < S 0:0(0) win 29200 <mss 1460,nop,nop,sackOK,nop,wscale 7>
0.100 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 7>
0.200 < . 1:1(0) ack 1 win 257
0.200 accept(3, ..., ...) = 4
// Receive first segment
0.310 < P. 1:1001(1000) ack 1 win 46
// Send one ACK
0.310 > . 1:1(0) ack 1001
// read 1000 byte
0.310 read(4, ..., 1000) = 1000
// Application writes 100 bytes
0.350 write(4, ..., 100) = 100
0.350 > P. 1:101(100) ack 1001
// ACK
0.500 < . 1001:1001(0) ack 101 win 257
// close the connection
0.600 close(4) = 0
0.600 > F. 101:101(0) ack 1001 win 244
// Our side is in FIN_WAIT_1 & waits for ack to fin
0.7 < . 1001:1001(0) ack 102 win 244
// Our side is in FIN_WAIT_2 with no outstanding data.
0.8 < F. 1001:1001(0) ack 102 win 244
0.8 > . 102:102(0) ack 1002 win 244
// Our side is now in TIME_WAIT state, send ack for fin.
0.9 < F. 1002:1002(0) ack 102 win 244
0.9 > . 102:102(0) ack 1002 win 244
// Peer reopens with in-window SYN:
1.000 < S 1000:1000(0) win 9200 <mss 1460,nop,nop,sackOK,nop,wscale 7>
// Therefore, reply with ACK.
1.000 > . 102:102(0) ack 1002 win 244
// Peer sends RST for this ACK. Normally this RST results
// in tw socket removal, but rfc1337=1 setting prevents this.
1.100 < R 1002:1002(0) win 244
// second syn. Due to rfc1337=1 expect another pure ACK.
31.0 < S 1000:1000(0) win 9200 <mss 1460,nop,nop,sackOK,nop,wscale 7>
31.0 > . 102:102(0) ack 1002 win 244
// .. and another RST from peer.
31.1 < R 1002:1002(0) win 244
31.2 `echo no timer restart;ss -m -e -a -i -n -t -o state TIME-WAIT`
// third syn after one minute. Time-Wait socket should have expired by now.
63.0 < S 1000:1000(0) win 9200 <mss 1460,nop,nop,sackOK,nop,wscale 7>
// so we expect a syn-ack & 3whs to proceed from here on.
63.0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 7>
Without this patch, 'ss' shows restarts of tw timer and last packet is
thus just another pure ack, more than one minute later.
This restores the original code from commit 283fd6cf0be690a83
("Merge in ANK networking jumbo patch") in netdev-vger-cvs.git .
For some reason the else branch was removed/lost in 1f28b683339f7
("Merge in TCP/UDP optimizations and [..]") and timer restart became
unconditional.
Reported-by: Michal Tesar <mtesar@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-08-30 15:24:29 +03:00
} else {
inet_twsk_reschedule ( tw , TCP_TIMEWAIT_LEN ) ;
2005-04-17 02:20:36 +04:00
}
if ( tmp_opt . saw_tstamp ) {
2005-08-10 07:09:30 +04:00
tcptw - > tw_ts_recent = tmp_opt . rcv_tsval ;
2018-07-11 13:16:12 +03:00
tcptw - > tw_ts_recent_stamp = ktime_get_seconds ( ) ;
2005-04-17 02:20:36 +04:00
}
2005-08-10 07:09:30 +04:00
inet_twsk_put ( tw ) ;
2005-04-17 02:20:36 +04:00
return TCP_TW_SUCCESS ;
}
/* Out of window segment.
All the segments are ACKed immediately .
The only exception is new SYN . We accept it , if it is
not old duplicate and we are not in danger to be killed
by delayed old duplicates . RFC check is that it has
newer sequence number works at rates < 40 Mbit / sec .
However , if paws works , it is reliable AND even more ,
we even may relax silly seq space cutoff .
RED - PEN : we violate main RFC requirement , if this SYN will appear
old duplicate ( i . e . we receive RST in reply to SYN - ACK ) ,
we must return socket to time - wait state . It is not good ,
but not fatal yet .
*/
if ( th - > syn & & ! th - > rst & & ! th - > ack & & ! paws_reject & &
2005-08-10 07:09:30 +04:00
( after ( TCP_SKB_CB ( skb ) - > seq , tcptw - > tw_rcv_nxt ) | |
( tmp_opt . saw_tstamp & &
( s32 ) ( tcptw - > tw_ts_recent - tmp_opt . rcv_tsval ) < 0 ) ) ) {
u32 isn = tcptw - > tw_snd_nxt + 65535 + 2 ;
2005-04-17 02:20:36 +04:00
if ( isn = = 0 )
isn + + ;
2014-09-06 02:33:32 +04:00
TCP_SKB_CB ( skb ) - > tcp_tw_isn = isn ;
2005-04-17 02:20:36 +04:00
return TCP_TW_SYN ;
}
if ( paws_reject )
2016-04-28 02:44:39 +03:00
__NET_INC_STATS ( twsk_net ( tw ) , LINUX_MIB_PAWSESTABREJECTED ) ;
2005-04-17 02:20:36 +04:00
2007-03-09 07:45:19 +03:00
if ( ! th - > rst ) {
2005-04-17 02:20:36 +04:00
/* In this case we must reset the TIMEWAIT timer.
*
* If it is ACKless SYN it may be both old duplicate
* and new good SYN with random sequence number < rcv_nxt .
* Do not reschedule in the last case .
*/
if ( paws_reject | | th - > ack )
2015-09-19 19:08:34 +03:00
inet_twsk_reschedule ( tw , TCP_TIMEWAIT_LEN ) ;
2005-04-17 02:20:36 +04:00
2015-02-07 00:04:41 +03:00
return tcp_timewait_check_oow_rate_limit (
tw , skb , LINUX_MIB_TCPACKSKIPPEDTIMEWAIT ) ;
2005-04-17 02:20:36 +04:00
}
2005-08-10 07:09:30 +04:00
inet_twsk_put ( tw ) ;
2005-04-17 02:20:36 +04:00
return TCP_TW_SUCCESS ;
}
2010-07-10 01:22:10 +04:00
EXPORT_SYMBOL ( tcp_timewait_state_process ) ;
2005-04-17 02:20:36 +04:00
2022-11-23 20:38:59 +03:00
/*
 * Protocol-specific initialisation of a freshly allocated timewait socket.
 *
 * @sk:    the full socket that is being replaced by the timewait bucket
 * @tcptw: the TCP timewait socket being set up
 *
 * With CONFIG_TCP_MD5SIG the timewait bucket does not have the key DB from
 * the sock structure, so a copy of the MD5 key in use (if any) is stored in
 * tcptw->tw_md5_key for the timewait ACK generating code.  tw_md5_key is
 * left NULL when MD5 is not in use, when no key is found, or when any step
 * of taking the copy fails.  Without CONFIG_TCP_MD5SIG this does nothing.
 */
static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw)
{
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *src_key;
	struct tcp_md5sig_key *copy;

	tcptw->tw_md5_key = NULL;
	if (!static_branch_unlikely(&tcp_md5_needed.key))
		return;

	src_key = tcp_sk(sk)->af_specific->md5_lookup(sk, sk);
	if (!src_key)
		return;

	/* Quick private copy of the key for the timewait bucket. */
	copy = kmemdup(src_key, sizeof(*src_key), GFP_ATOMIC);
	tcptw->tw_md5_key = copy;
	if (!copy)
		return;

	/* Both steps must succeed, in this order, for the copy to be kept. */
	if (tcp_alloc_md5sig_pool() &&
	    static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key))
		return;

	/* Not expected to happen: warn once and discard the copy. */
	WARN_ON_ONCE(1);
	kfree(tcptw->tw_md5_key);
	tcptw->tw_md5_key = NULL;
#endif
}
2007-02-09 17:24:47 +03:00
/*
 * tcp_time_wait - move a socket to time-wait or dead fin-wait-2 state
 * @sk:    socket being replaced by a timewait bucket
 * @state: substate handed to inet_twsk_alloc(); when it is TCP_TIME_WAIT
 *         the timeout is forced to TCP_TIMEWAIT_LEN
 * @timeo: requested timeout; raised to at least 3.5 * icsk_rto
 *
 * Allocates a timewait socket, copies the connection state that later
 * segment processing needs (sequence numbers, receive window, timestamp
 * state, mark, priority, tx delay and, for PF_INET6, the IPv6 specific
 * fields), then with BH disabled arms the timer and calls
 * inet_twsk_hashdance().  If the allocation fails, only the
 * LINUX_MIB_TCPTIMEWAITOVERFLOW counter is incremented.  In both cases
 * tcp_update_metrics() and tcp_done() are called on @sk before returning.
 */
2005-04-17 02:20:36 +04:00
void tcp_time_wait ( struct sock * sk , int state , int timeo )
{
2005-12-14 10:15:52 +03:00
const struct inet_connection_sock * icsk = inet_csk ( sk ) ;
2005-08-10 07:09:30 +04:00
const struct tcp_sock * tp = tcp_sk ( sk ) ;
2022-09-08 04:10:17 +03:00
struct net * net = sock_net ( sk ) ;
tcp/dccp: get rid of central timewait timer
Using a timer wheel for timewait sockets was nice ~15 years ago when
memory was expensive and machines had a single processor.
This does not scale, code is ugly and source of huge latencies
(Typically 30 ms have been seen, cpus spinning on death_lock spinlock.)
We can afford to use an extra 64 bytes per timewait sock and spread
timewait load to all cpus to have better behavior.
Tested:
On following test, /proc/sys/net/ipv4/tcp_tw_recycle is set to 1
on the target (lpaa24)
Before patch :
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
419594
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
437171
While test is running, we can observe 25 or even 33 ms latencies.
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20601ms
rtt min/avg/max/mdev = 0.020/0.217/25.771/1.535 ms, pipe 2
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20702ms
rtt min/avg/max/mdev = 0.019/0.183/33.761/1.441 ms, pipe 2
After patch :
About 90% increase of throughput :
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
810442
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
800992
And latencies are kept to minimal values during this load, even
if network utilization is 90% higher :
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 19991ms
rtt min/avg/max/mdev = 0.023/0.064/0.360/0.042 ms
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-04-13 04:51:09 +03:00
struct inet_timewait_sock * tw ;
2005-04-17 02:20:36 +04:00
2022-09-08 04:10:18 +03:00
tw = inet_twsk_alloc ( sk , & net - > ipv4 . tcp_death_row , state ) ;
2005-04-17 02:20:36 +04:00
2015-04-03 11:17:27 +03:00
if ( tw ) {
2005-08-10 07:09:30 +04:00
struct tcp_timewait_sock * tcptw = tcp_twsk ( ( struct sock * ) tw ) ;
2005-08-10 07:10:42 +04:00
/* 4 * rto - rto / 2, i.e. 3.5 times the current icsk_rto. */
const int rto = ( icsk - > icsk_rto < < 2 ) - ( icsk - > icsk_rto > > 1 ) ;
2012-06-10 01:56:12 +04:00
struct inet_sock * inet = inet_sk ( sk ) ;
2005-08-10 07:09:30 +04:00
2012-06-10 01:56:12 +04:00
tw - > tw_transparent = inet - > transparent ;
2018-05-10 09:53:51 +03:00
tw - > tw_mark = sk - > sk_mark ;
2019-09-24 18:01:16 +03:00
tw - > tw_priority = sk - > sk_priority ;
2005-04-17 02:20:36 +04:00
tw - > tw_rcv_wscale = tp - > rx_opt . rcv_wscale ;
2005-08-10 07:09:30 +04:00
tcptw - > tw_rcv_nxt = tp - > rcv_nxt ;
tcptw - > tw_snd_nxt = tp - > snd_nxt ;
tcptw - > tw_rcv_wnd = tcp_receive_window ( tp ) ;
tcptw - > tw_ts_recent = tp - > rx_opt . ts_recent ;
tcptw - > tw_ts_recent_stamp = tp - > rx_opt . ts_recent_stamp ;
2013-02-11 09:50:17 +04:00
tcptw - > tw_ts_offset = tp - > tsoffset ;
2015-02-07 00:04:41 +03:00
tcptw - > tw_last_oow_ack_time = 0 ;
tcp: add optional per socket transmit delay
Adding delays to TCP flows is crucial for studying behavior
of TCP stacks, including congestion control modules.
Linux offers netem module, but it has unpractical constraints :
- Need root access to change qdisc
- Hard to setup on egress if combined with non trivial qdisc like FQ
- Single delay for all flows.
EDT (Earliest Departure Time) adoption in TCP stack allows us
to enable a per socket delay at a very small cost.
Networking tools can now establish thousands of flows, each of them
with a different delay, simulating real world conditions.
This requires FQ packet scheduler or a EDT-enabled NIC.
This patchs adds TCP_TX_DELAY socket option, to set a delay in
usec units.
unsigned int tx_delay = 10000; /* 10 msec */
setsockopt(fd, SOL_TCP, TCP_TX_DELAY, &tx_delay, sizeof(tx_delay));
Note that FQ packet scheduler limits might need some tweaking :
man tc-fq
PARAMETERS
limit
Hard limit on the real queue size. When this limit is
reached, new packets are dropped. If the value is lowered,
packets are dropped so that the new limit is met. Default
is 10000 packets.
flow_limit
Hard limit on the maximum number of packets queued per
flow. Default value is 100.
Use of TCP_TX_DELAY option will increase number of skbs in FQ qdisc,
so packets would be dropped if any of the previous limit is hit.
Use of a jump label makes this support runtime-free, for hosts
never using the option.
Also note that TSQ (TCP Small Queues) limits are slightly changed
with this patch : we need to account that skbs artificially delayed
wont stop us providind more skbs to feed the pipe (netem uses
skb_orphan_partial() for this purpose, but FQ can not use this trick)
Because of that, using big delays might very well trigger
old bugs in TSO auto defer logic and/or sndbuf limited detection.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-12 21:57:25 +03:00
tcptw - > tw_tx_delay = tp - > tcp_tx_delay ;
2011-12-10 13:48:31 +04:00
# if IS_ENABLED(CONFIG_IPV6)
2005-04-17 02:20:36 +04:00
if ( tw - > tw_family = = PF_INET6 ) {
struct ipv6_pinfo * np = inet6_sk ( sk ) ;
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 02:42:29 +04:00
tw - > tw_v6_daddr = sk - > sk_v6_daddr ;
tw - > tw_v6_rcv_saddr = sk - > sk_v6_rcv_saddr ;
2011-10-27 08:44:35 +04:00
tw - > tw_tclass = np - > tclass ;
2015-05-16 01:24:59 +03:00
tw - > tw_flowlabel = be32_to_cpu ( np - > flow_label & IPV6_FLOWLABEL_MASK ) ;
2019-06-09 03:58:51 +03:00
tw - > tw_txhash = sk - > sk_txhash ;
2014-06-27 19:36:16 +04:00
tw - > tw_ipv6only = sk - > sk_ipv6only ;
2005-08-10 07:09:59 +04:00
}
2005-04-17 02:20:36 +04:00
# endif
2006-11-15 06:07:45 +03:00
2022-11-23 20:38:59 +03:00
tcp_time_wait_init ( sk , tcptw ) ;
2006-11-15 06:07:45 +03:00
2005-04-17 02:20:36 +04:00
/* Get the TIME_WAIT timeout firing. */
if ( timeo < rto )
timeo = rto ;
2017-03-15 23:30:45 +03:00
if ( state = = TCP_TIME_WAIT )
timeo = TCP_TIMEWAIT_LEN ;
2005-04-17 02:20:36 +04:00
2017-12-01 21:06:56 +03:00
/* tw_timer is pinned, so we need to make sure BH are disabled
* in following section , otherwise timer handler could run before
* we complete the initialization .
*/
local_bh_disable ( ) ;
tcp/dccp: get rid of central timewait timer
Using a timer wheel for timewait sockets was nice ~15 years ago when
memory was expensive and machines had a single processor.
This does not scale, code is ugly and source of huge latencies
(Typically 30 ms have been seen, cpus spinning on death_lock spinlock.)
We can afford to use an extra 64 bytes per timewait sock and spread
timewait load to all cpus to have better behavior.
Tested:
On following test, /proc/sys/net/ipv4/tcp_tw_recycle is set to 1
on the target (lpaa24)
Before patch :
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
419594
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
437171
While test is running, we can observe 25 or even 33 ms latencies.
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20601ms
rtt min/avg/max/mdev = 0.020/0.217/25.771/1.535 ms, pipe 2
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20702ms
rtt min/avg/max/mdev = 0.019/0.183/33.761/1.441 ms, pipe 2
After patch :
About 90% increase of throughput :
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
810442
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
800992
And latencies are kept to minimal values during this load, even
if network utilization is 90% higher :
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 19991ms
rtt min/avg/max/mdev = 0.023/0.064/0.360/0.042 ms
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-04-13 04:51:09 +03:00
inet_twsk_schedule ( tw , timeo ) ;
2017-12-12 08:25:12 +03:00
/* Linkage updates.
* Note that access to tw after this point is illegal .
*/
2022-09-08 04:10:20 +03:00
inet_twsk_hashdance ( tw , sk , net - > ipv4 . tcp_death_row . hashinfo ) ;
2017-12-01 21:06:56 +03:00
local_bh_enable ( ) ;
2005-04-17 02:20:36 +04:00
} else {
/* Sorry, if we're out of memory, just CLOSE this
* socket up . We ' ve got bigger problems than
* non - graceful socket closings .
*/
2022-09-08 04:10:17 +03:00
NET_INC_STATS ( net , LINUX_MIB_TCPTIMEWAITOVERFLOW ) ;
2005-04-17 02:20:36 +04:00
}
tcp_update_metrics ( sk ) ;
tcp_done ( sk ) ;
}
2018-03-31 19:11:59 +03:00
EXPORT_SYMBOL ( tcp_time_wait ) ;
2005-04-17 02:20:36 +04:00
2006-11-15 06:07:45 +03:00
/*
 * Destructor hook for TCP timewait sockets.
 *
 * @sk: the timewait socket being destroyed
 *
 * With CONFIG_TCP_MD5SIG, and only while the tcp_md5_needed static branch
 * is enabled, a key stored in tw_md5_key is freed with kfree_rcu() and the
 * tcp_md5_needed static key is decremented (deferred variant).  Otherwise
 * this is a no-op.
 */
void tcp_twsk_destructor(struct sock *sk)
{
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_timewait_sock *twsk;

	if (!static_branch_unlikely(&tcp_md5_needed.key))
		return;

	twsk = tcp_twsk(sk);
	if (!twsk->tw_md5_key)
		return;

	kfree_rcu(twsk->tw_md5_key, rcu);
	static_branch_slow_dec_deferred(&tcp_md5_needed);
#endif
}
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
2022-09-08 04:10:21 +03:00
void tcp_twsk_purge ( struct list_head * net_exit_list , int family )
{
tcp: Introduce optional per-netns ehash.
The more sockets we have in the hash table, the longer we spend looking
up the socket. While running a number of small workloads on the same
host, they penalise each other and cause performance degradation.
The root cause might be a single workload that consumes much more
resources than the others. It often happens on a cloud service where
different workloads share the same computing resource.
On EC2 c5.24xlarge instance (196 GiB memory and 524288 (1Mi / 2) ehash
entries), after running iperf3 in different netns, creating 24Mi sockets
without data transfer in the root netns causes about 10% performance
regression for the iperf3's connection.
thash_entries sockets length Gbps
524288 1 1 50.7
24Mi 48 45.1
It is basically related to the length of the list of each hash bucket.
For testing purposes to see how performance drops along the length,
I set 131072 (1Mi / 8) to thash_entries, and here's the result.
thash_entries sockets length Gbps
131072 1 1 50.7
1Mi 8 49.9
2Mi 16 48.9
4Mi 32 47.3
8Mi 64 44.6
16Mi 128 40.6
24Mi 192 36.3
32Mi 256 32.5
40Mi 320 27.0
48Mi 384 25.0
To resolve the socket lookup degradation, we introduce an optional
per-netns hash table for TCP, but it's just ehash, and we still share
the global bhash, bhash2 and lhash2.
With a smaller ehash, we can look up non-listener sockets faster and
isolate such noisy neighbours. In addition, we can reduce lock contention.
We can control the ehash size by a new sysctl knob. However, depending
on workloads, it will require very sensitive tuning, so we disable the
feature by default (net.ipv4.tcp_child_ehash_entries == 0). Moreover,
we can fall back to using the global ehash in case we fail to allocate
enough memory for a new ehash. The maximum size is 16Mi, which is large
enough that even if we have 48Mi sockets, the average list length is 3,
and regression would be less than 1%.
We can check the current ehash size by another read-only sysctl knob,
net.ipv4.tcp_ehash_entries. A negative value means the netns shares
the global ehash (per-netns ehash is disabled or failed to allocate
memory).
# dmesg | cut -d ' ' -f 5- | grep "established hash"
TCP established hash table entries: 524288 (order: 10, 4194304 bytes, vmalloc hugepage)
# sysctl net.ipv4.tcp_ehash_entries
net.ipv4.tcp_ehash_entries = 524288 # can be changed by thash_entries
# sysctl net.ipv4.tcp_child_ehash_entries
net.ipv4.tcp_child_ehash_entries = 0 # disabled by default
# ip netns add test1
# ip netns exec test1 sysctl net.ipv4.tcp_ehash_entries
net.ipv4.tcp_ehash_entries = -524288 # share the global ehash
# sysctl -w net.ipv4.tcp_child_ehash_entries=100
net.ipv4.tcp_child_ehash_entries = 100
# ip netns add test2
# ip netns exec test2 sysctl net.ipv4.tcp_ehash_entries
net.ipv4.tcp_ehash_entries = 128 # own a per-netns ehash with 2^n buckets
When more than two processes in the same netns create per-netns ehash
concurrently with different sizes, we need to guarantee the size in
one of the following ways:
1) Share the global ehash and create per-netns ehash
First, unshare() with tcp_child_ehash_entries==0. It creates dedicated
netns sysctl knobs where we can safely change tcp_child_ehash_entries
and clone()/unshare() to create a per-netns ehash.
2) Control write on sysctl by BPF
We can use BPF_PROG_TYPE_CGROUP_SYSCTL to allow/deny read/write on
sysctl knobs.
Note that the global ehash allocated at the boot time is spread over
available NUMA nodes, but inet_pernet_hashinfo_alloc() will allocate
pages for each per-netns ehash depending on the current process's NUMA
policy. By default, the allocation is done in the local node only, so
the per-netns hash table could fully reside on a random node. Thus,
depending on the NUMA policy the netns is created with and the CPU the
current thread is running on, we could see some performance differences
for highly optimised networking applications.
Note also that the default values of two sysctl knobs depend on the ehash
size and should be tuned carefully:
tcp_max_tw_buckets : tcp_child_ehash_entries / 2
tcp_max_syn_backlog : max(128, tcp_child_ehash_entries / 128)
As a bonus, we can dismantle netns faster. Currently, while destroying
netns, we call inet_twsk_purge(), which walks through the global ehash.
It can be potentially big because it can have many sockets other than
TIME_WAIT in all netns. Splitting ehash changes that situation, where
it's only necessary for inet_twsk_purge() to clean up TIME_WAIT sockets
in each netns.
With regard to this, we do not free the per-netns ehash in inet_twsk_kill()
to avoid UAF while iterating the per-netns ehash in inet_twsk_purge().
Instead, we do it in tcp_sk_exit_batch() after calling tcp_twsk_purge() to
keep it protocol-family-independent.
In the future, we could optimise ehash lookup/iteration further by removing
netns comparison for the per-netns ehash.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-09-08 04:10:22 +03:00
bool purged_once = false ;
2022-09-08 04:10:21 +03:00
struct net * net ;
list_for_each_entry ( net , net_exit_list , exit_list ) {
tcp: Introduce optional per-netns ehash.
The more sockets we have in the hash table, the longer we spend looking
up the socket. While running a number of small workloads on the same
host, they penalise each other and cause performance degradation.
The root cause might be a single workload that consumes much more
resources than the others. It often happens on a cloud service where
different workloads share the same computing resource.
On EC2 c5.24xlarge instance (196 GiB memory and 524288 (1Mi / 2) ehash
entries), after running iperf3 in different netns, creating 24Mi sockets
without data transfer in the root netns causes about 10% performance
regression for the iperf3's connection.
thash_entries sockets length Gbps
524288 1 1 50.7
24Mi 48 45.1
It is basically related to the length of the list of each hash bucket.
For testing purposes to see how performance drops along the length,
I set 131072 (1Mi / 8) to thash_entries, and here's the result.
thash_entries sockets length Gbps
131072 1 1 50.7
1Mi 8 49.9
2Mi 16 48.9
4Mi 32 47.3
8Mi 64 44.6
16Mi 128 40.6
24Mi 192 36.3
32Mi 256 32.5
40Mi 320 27.0
48Mi 384 25.0
To resolve the socket lookup degradation, we introduce an optional
per-netns hash table for TCP, but it's just ehash, and we still share
the global bhash, bhash2 and lhash2.
With a smaller ehash, we can look up non-listener sockets faster and
isolate such noisy neighbours. In addition, we can reduce lock contention.
We can control the ehash size by a new sysctl knob. However, depending
on workloads, it will require very sensitive tuning, so we disable the
feature by default (net.ipv4.tcp_child_ehash_entries == 0). Moreover,
we can fall back to using the global ehash in case we fail to allocate
enough memory for a new ehash. The maximum size is 16Mi, which is large
enough that even if we have 48Mi sockets, the average list length is 3,
and regression would be less than 1%.
We can check the current ehash size by another read-only sysctl knob,
net.ipv4.tcp_ehash_entries. A negative value means the netns shares
the global ehash (per-netns ehash is disabled or failed to allocate
memory).
# dmesg | cut -d ' ' -f 5- | grep "established hash"
TCP established hash table entries: 524288 (order: 10, 4194304 bytes, vmalloc hugepage)
# sysctl net.ipv4.tcp_ehash_entries
net.ipv4.tcp_ehash_entries = 524288 # can be changed by thash_entries
# sysctl net.ipv4.tcp_child_ehash_entries
net.ipv4.tcp_child_ehash_entries = 0 # disabled by default
# ip netns add test1
# ip netns exec test1 sysctl net.ipv4.tcp_ehash_entries
net.ipv4.tcp_ehash_entries = -524288 # share the global ehash
# sysctl -w net.ipv4.tcp_child_ehash_entries=100
net.ipv4.tcp_child_ehash_entries = 100
# ip netns add test2
# ip netns exec test2 sysctl net.ipv4.tcp_ehash_entries
net.ipv4.tcp_ehash_entries = 128 # own a per-netns ehash with 2^n buckets
When more than two processes in the same netns create per-netns ehash
concurrently with different sizes, we need to guarantee the size in
one of the following ways:
1) Share the global ehash and create per-netns ehash
First, unshare() with tcp_child_ehash_entries==0. It creates dedicated
netns sysctl knobs where we can safely change tcp_child_ehash_entries
and clone()/unshare() to create a per-netns ehash.
2) Control write on sysctl by BPF
We can use BPF_PROG_TYPE_CGROUP_SYSCTL to allow/deny read/write on
sysctl knobs.
Note that the global ehash allocated at the boot time is spread over
available NUMA nodes, but inet_pernet_hashinfo_alloc() will allocate
pages for each per-netns ehash depending on the current process's NUMA
policy. By default, the allocation is done in the local node only, so
the per-netns hash table could fully reside on a random node. Thus,
depending on the NUMA policy the netns is created with and the CPU the
current thread is running on, we could see some performance differences
for highly optimised networking applications.
Note also that the default values of two sysctl knobs depend on the ehash
size and should be tuned carefully:
tcp_max_tw_buckets : tcp_child_ehash_entries / 2
tcp_max_syn_backlog : max(128, tcp_child_ehash_entries / 128)
As a bonus, we can dismantle netns faster. Currently, while destroying
netns, we call inet_twsk_purge(), which walks through the global ehash.
It can be potentially big because it can have many sockets other than
TIME_WAIT in all netns. Splitting ehash changes that situation, where
it's only necessary for inet_twsk_purge() to clean up TIME_WAIT sockets
in each netns.
With regard to this, we do not free the per-netns ehash in inet_twsk_kill()
to avoid UAF while iterating the per-netns ehash in inet_twsk_purge().
Instead, we do it in tcp_sk_exit_batch() after calling tcp_twsk_purge() to
keep it protocol-family-independent.
In the future, we could optimise ehash lookup/iteration further by removing
netns comparison for the per-netns ehash.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-09-08 04:10:22 +03:00
if ( net - > ipv4 . tcp_death_row . hashinfo - > pernet ) {
2022-10-12 17:50:36 +03:00
/* Even if tw_refcount == 1, we must clean up kernel reqsk */
tcp: Introduce optional per-netns ehash.
The more sockets we have in the hash table, the longer we spend looking
up the socket. While running a number of small workloads on the same
host, they penalise each other and cause performance degradation.
The root cause might be a single workload that consumes much more
resources than the others. It often happens on a cloud service where
different workloads share the same computing resource.
On EC2 c5.24xlarge instance (196 GiB memory and 524288 (1Mi / 2) ehash
entries), after running iperf3 in different netns, creating 24Mi sockets
without data transfer in the root netns causes about 10% performance
regression for the iperf3's connection.
thash_entries sockets length Gbps
524288 1 1 50.7
24Mi 48 45.1
It is basically related to the length of the list of each hash bucket.
For testing purposes to see how performance drops along the length,
I set 131072 (1Mi / 8) to thash_entries, and here's the result.
thash_entries sockets length Gbps
131072 1 1 50.7
1Mi 8 49.9
2Mi 16 48.9
4Mi 32 47.3
8Mi 64 44.6
16Mi 128 40.6
24Mi 192 36.3
32Mi 256 32.5
40Mi 320 27.0
48Mi 384 25.0
To resolve the socket lookup degradation, we introduce an optional
per-netns hash table for TCP, but it's just ehash, and we still share
the global bhash, bhash2 and lhash2.
With a smaller ehash, we can look up non-listener sockets faster and
isolate such noisy neighbours. In addition, we can reduce lock contention.
We can control the ehash size by a new sysctl knob. However, depending
on workloads, it will require very sensitive tuning, so we disable the
feature by default (net.ipv4.tcp_child_ehash_entries == 0). Moreover,
we can fall back to using the global ehash in case we fail to allocate
enough memory for a new ehash. The maximum size is 16Mi, which is large
enough that even if we have 48Mi sockets, the average list length is 3,
and regression would be less than 1%.
We can check the current ehash size by another read-only sysctl knob,
net.ipv4.tcp_ehash_entries. A negative value means the netns shares
the global ehash (per-netns ehash is disabled or failed to allocate
memory).
# dmesg | cut -d ' ' -f 5- | grep "established hash"
TCP established hash table entries: 524288 (order: 10, 4194304 bytes, vmalloc hugepage)
# sysctl net.ipv4.tcp_ehash_entries
net.ipv4.tcp_ehash_entries = 524288 # can be changed by thash_entries
# sysctl net.ipv4.tcp_child_ehash_entries
net.ipv4.tcp_child_ehash_entries = 0 # disabled by default
# ip netns add test1
# ip netns exec test1 sysctl net.ipv4.tcp_ehash_entries
net.ipv4.tcp_ehash_entries = -524288 # share the global ehash
# sysctl -w net.ipv4.tcp_child_ehash_entries=100
net.ipv4.tcp_child_ehash_entries = 100
# ip netns add test2
# ip netns exec test2 sysctl net.ipv4.tcp_ehash_entries
net.ipv4.tcp_ehash_entries = 128 # own a per-netns ehash with 2^n buckets
When more than two processes in the same netns create per-netns ehash
concurrently with different sizes, we need to guarantee the size in
one of the following ways:
1) Share the global ehash and create per-netns ehash
First, unshare() with tcp_child_ehash_entries==0. It creates dedicated
netns sysctl knobs where we can safely change tcp_child_ehash_entries
and clone()/unshare() to create a per-netns ehash.
2) Control write on sysctl by BPF
We can use BPF_PROG_TYPE_CGROUP_SYSCTL to allow/deny read/write on
sysctl knobs.
Note that the global ehash allocated at the boot time is spread over
available NUMA nodes, but inet_pernet_hashinfo_alloc() will allocate
pages for each per-netns ehash depending on the current process's NUMA
policy. By default, the allocation is done in the local node only, so
the per-netns hash table could fully reside on a random node. Thus,
depending on the NUMA policy the netns is created with and the CPU the
current thread is running on, we could see some performance differences
for highly optimised networking applications.
Note also that the default values of two sysctl knobs depend on the ehash
size and should be tuned carefully:
tcp_max_tw_buckets : tcp_child_ehash_entries / 2
tcp_max_syn_backlog : max(128, tcp_child_ehash_entries / 128)
As a bonus, we can dismantle netns faster. Currently, while destroying
netns, we call inet_twsk_purge(), which walks through the global ehash.
It can be potentially big because it can have many sockets other than
TIME_WAIT in all netns. Splitting ehash changes that situation, where
it's only necessary for inet_twsk_purge() to clean up TIME_WAIT sockets
in each netns.
With regard to this, we do not free the per-netns ehash in inet_twsk_kill()
to avoid UAF while iterating the per-netns ehash in inet_twsk_purge().
Instead, we do it in tcp_sk_exit_batch() after calling tcp_twsk_purge() to
keep it protocol-family-independent.
In the future, we could optimise ehash lookup/iteration further by removing
netns comparison for the per-netns ehash.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-09-08 04:10:22 +03:00
inet_twsk_purge ( net - > ipv4 . tcp_death_row . hashinfo , family ) ;
} else if ( ! purged_once ) {
2022-10-12 17:50:36 +03:00
/* The last refcount is decremented in tcp_sk_exit_batch() */
if ( refcount_read ( & net - > ipv4 . tcp_death_row . tw_refcount ) = = 1 )
continue ;
tcp: Introduce optional per-netns ehash.
The more sockets we have in the hash table, the longer we spend looking
up the socket. While running a number of small workloads on the same
host, they penalise each other and cause performance degradation.
The root cause might be a single workload that consumes much more
resources than the others. It often happens on a cloud service where
different workloads share the same computing resource.
On EC2 c5.24xlarge instance (196 GiB memory and 524288 (1Mi / 2) ehash
entries), after running iperf3 in different netns, creating 24Mi sockets
without data transfer in the root netns causes about 10% performance
regression for the iperf3's connection.
thash_entries sockets length Gbps
524288 1 1 50.7
24Mi 48 45.1
It is basically related to the length of the list of each hash bucket.
For testing purposes to see how performance drops along the length,
I set 131072 (1Mi / 8) to thash_entries, and here's the result.
thash_entries sockets length Gbps
131072 1 1 50.7
1Mi 8 49.9
2Mi 16 48.9
4Mi 32 47.3
8Mi 64 44.6
16Mi 128 40.6
24Mi 192 36.3
32Mi 256 32.5
40Mi 320 27.0
48Mi 384 25.0
To resolve the socket lookup degradation, we introduce an optional
per-netns hash table for TCP, but it's just ehash, and we still share
the global bhash, bhash2 and lhash2.
With a smaller ehash, we can look up non-listener sockets faster and
isolate such noisy neighbours. In addition, we can reduce lock contention.
We can control the ehash size by a new sysctl knob. However, depending
on workloads, it will require very sensitive tuning, so we disable the
feature by default (net.ipv4.tcp_child_ehash_entries == 0). Moreover,
we can fall back to using the global ehash in case we fail to allocate
enough memory for a new ehash. The maximum size is 16Mi, which is large
enough that even if we have 48Mi sockets, the average list length is 3,
and regression would be less than 1%.
We can check the current ehash size by another read-only sysctl knob,
net.ipv4.tcp_ehash_entries. A negative value means the netns shares
the global ehash (per-netns ehash is disabled or failed to allocate
memory).
# dmesg | cut -d ' ' -f 5- | grep "established hash"
TCP established hash table entries: 524288 (order: 10, 4194304 bytes, vmalloc hugepage)
# sysctl net.ipv4.tcp_ehash_entries
net.ipv4.tcp_ehash_entries = 524288 # can be changed by thash_entries
# sysctl net.ipv4.tcp_child_ehash_entries
net.ipv4.tcp_child_ehash_entries = 0 # disabled by default
# ip netns add test1
# ip netns exec test1 sysctl net.ipv4.tcp_ehash_entries
net.ipv4.tcp_ehash_entries = -524288 # share the global ehash
# sysctl -w net.ipv4.tcp_child_ehash_entries=100
net.ipv4.tcp_child_ehash_entries = 100
# ip netns add test2
# ip netns exec test2 sysctl net.ipv4.tcp_ehash_entries
net.ipv4.tcp_ehash_entries = 128 # own a per-netns ehash with 2^n buckets
When more than two processes in the same netns create per-netns ehash
concurrently with different sizes, we need to guarantee the size in
one of the following ways:
1) Share the global ehash and create per-netns ehash
First, unshare() with tcp_child_ehash_entries==0. It creates dedicated
netns sysctl knobs where we can safely change tcp_child_ehash_entries
and clone()/unshare() to create a per-netns ehash.
2) Control write on sysctl by BPF
We can use BPF_PROG_TYPE_CGROUP_SYSCTL to allow/deny read/write on
sysctl knobs.
Note that the global ehash allocated at the boot time is spread over
available NUMA nodes, but inet_pernet_hashinfo_alloc() will allocate
pages for each per-netns ehash depending on the current process's NUMA
policy. By default, the allocation is done in the local node only, so
the per-netns hash table could fully reside on a random node. Thus,
depending on the NUMA policy the netns is created with and the CPU the
current thread is running on, we could see some performance differences
for highly optimised networking applications.
Note also that the default values of two sysctl knobs depend on the ehash
size and should be tuned carefully:
tcp_max_tw_buckets : tcp_child_ehash_entries / 2
tcp_max_syn_backlog : max(128, tcp_child_ehash_entries / 128)
As a bonus, we can dismantle netns faster. Currently, while destroying
netns, we call inet_twsk_purge(), which walks through the global ehash.
It can be potentially big because it can have many sockets other than
TIME_WAIT in all netns. Splitting ehash changes that situation, where
it's only necessary for inet_twsk_purge() to clean up TIME_WAIT sockets
in each netns.
With regard to this, we do not free the per-netns ehash in inet_twsk_kill()
to avoid UAF while iterating the per-netns ehash in inet_twsk_purge().
Instead, we do it in tcp_sk_exit_batch() after calling tcp_twsk_purge() to
keep it protocol-family-independent.
In the future, we could optimise ehash lookup/iteration further by removing
netns comparison for the per-netns ehash.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-09-08 04:10:22 +03:00
inet_twsk_purge ( & tcp_hashinfo , family ) ;
purged_once = true ;
}
2022-09-08 04:10:21 +03:00
}
}
EXPORT_SYMBOL_GPL ( tcp_twsk_purge ) ;
2015-09-25 17:39:09 +03:00
/* Warning : This function is called without sk_listener being locked.
* Be sure to read socket fields once , as their value could change under us .
*/
2014-05-12 07:22:11 +04:00
void tcp_openreq_init_rwin ( struct request_sock * req ,
2015-09-25 17:39:09 +03:00
const struct sock * sk_listener ,
const struct dst_entry * dst )
2014-05-12 07:22:11 +04:00
{
struct inet_request_sock * ireq = inet_rsk ( req ) ;
2015-09-25 17:39:09 +03:00
const struct tcp_sock * tp = tcp_sk ( sk_listener ) ;
int full_space = tcp_full_space ( sk_listener ) ;
u32 window_clamp ;
__u8 rcv_wscale ;
2017-07-01 06:02:44 +03:00
u32 rcv_wnd ;
2017-02-02 19:04:56 +03:00
int mss ;
2014-05-12 07:22:11 +04:00
2017-02-02 19:04:56 +03:00
mss = tcp_mss_clamp ( tp , dst_metric_advmss ( dst ) ) ;
2015-09-25 17:39:09 +03:00
window_clamp = READ_ONCE ( tp - > window_clamp ) ;
2014-05-12 07:22:11 +04:00
/* Set this up on the first call only */
2015-10-09 05:33:23 +03:00
req - > rsk_window_clamp = window_clamp ? : dst_metric ( dst , RTAX_WINDOW ) ;
2014-05-12 07:22:11 +04:00
/* limit the window selection if the user enforce a smaller rx buffer */
2015-09-25 17:39:09 +03:00
if ( sk_listener - > sk_userlocks & SOCK_RCVBUF_LOCK & &
2015-10-09 05:33:23 +03:00
( req - > rsk_window_clamp > full_space | | req - > rsk_window_clamp = = 0 ) )
req - > rsk_window_clamp = full_space ;
2014-05-12 07:22:11 +04:00
2017-07-01 06:02:44 +03:00
rcv_wnd = tcp_rwnd_init_bpf ( ( struct sock * ) req ) ;
if ( rcv_wnd = = 0 )
rcv_wnd = dst_metric ( dst , RTAX_INITRWND ) ;
else if ( full_space < rcv_wnd * mss )
full_space = rcv_wnd * mss ;
2014-05-12 07:22:11 +04:00
/* tcp_full_space because it is guaranteed to be the first packet */
2017-10-27 17:47:24 +03:00
tcp_select_initial_window ( sk_listener , full_space ,
2014-05-12 07:22:11 +04:00
mss - ( ireq - > tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0 ) ,
2015-10-09 05:33:23 +03:00
& req - > rsk_rcv_wnd ,
& req - > rsk_window_clamp ,
2014-05-12 07:22:11 +04:00
ireq - > wscale_ok ,
& rcv_wscale ,
2017-07-01 06:02:44 +03:00
rcv_wnd ) ;
2014-05-12 07:22:11 +04:00
ireq - > rcv_wscale = rcv_wscale ;
}
EXPORT_SYMBOL ( tcp_openreq_init_rwin ) ;
2014-09-29 15:08:30 +04:00
static void tcp_ecn_openreq_child ( struct tcp_sock * tp ,
const struct request_sock * req )
2007-05-27 13:04:16 +04:00
{
tp - > ecn_flags = inet_rsk ( req ) - > ecn_ok ? TCP_ECN_OK : 0 ;
}
net: tcp: add per route congestion control
This work adds the possibility to define a per route/destination
congestion control algorithm. Generally, this opens up the possibility
for a machine with different links to enforce specific congestion
control algorithms with optimal strategies for each of them based
on their network characteristics, even transparently for a single
application listening on all links.
For our specific use case, this additionally facilitates deployment
of DCTCP, for example, applications can easily serve internal
traffic/dsts in DCTCP and external one with CUBIC. Other scenarios
would also allow for utilizing e.g. long living, low priority
background flows for certain destinations/routes while still being
able for normal traffic to utilize the default congestion control
algorithm. We also thought about a per netns setting (where different
defaults are possible), but given it's actually a link specific
property, we argue that a per route/destination setting is the most
natural and flexible.
The administrator can utilize this through ip-route(8) by appending
"congctl [lock] <name>", where <name> denotes the name of a
congestion control algorithm and the optional lock parameter allows
to enforce the given algorithm so that applications in user space
would not be allowed to overwrite that algorithm for that destination.
The dst metric lookups are being done when a dst entry is already
available in order to avoid a costly lookup and still before the
algorithms are being initialized, thus overhead is very low when the
feature is not being used. While the client side would need to drop
the current reference on the module, on server side this can actually
even be avoided as we just got a flat-copied socket clone.
Joint work with Florian Westphal.
Suggested-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-06 01:57:48 +03:00
void tcp_ca_openreq_child ( struct sock * sk , const struct dst_entry * dst )
{
struct inet_connection_sock * icsk = inet_csk ( sk ) ;
u32 ca_key = dst_metric ( dst , RTAX_CC_ALGO ) ;
bool ca_got_dst = false ;
if ( ca_key ! = TCP_CA_UNSPEC ) {
const struct tcp_congestion_ops * ca ;
rcu_read_lock ( ) ;
ca = tcp_ca_find_key ( ca_key ) ;
2020-01-09 03:35:08 +03:00
if ( likely ( ca & & bpf_try_module_get ( ca , ca - > owner ) ) ) {
net: tcp: add per route congestion control
This work adds the possibility to define a per route/destination
congestion control algorithm. Generally, this opens up the possibility
for a machine with different links to enforce specific congestion
control algorithms with optimal strategies for each of them based
on their network characteristics, even transparently for a single
application listening on all links.
For our specific use case, this additionally facilitates deployment
of DCTCP, for example, applications can easily serve internal
traffic/dsts in DCTCP and external one with CUBIC. Other scenarios
would also allow for utilizing e.g. long living, low priority
background flows for certain destinations/routes while still being
able for normal traffic to utilize the default congestion control
algorithm. We also thought about a per netns setting (where different
defaults are possible), but given its actually a link specific
property, we argue that a per route/destination setting is the most
natural and flexible.
The administrator can utilize this through ip-route(8) by appending
"congctl [lock] <name>", where <name> denotes the name of a
congestion control algorithm and the optional lock parameter allows
to enforce the given algorithm so that applications in user space
would not be allowed to overwrite that algorithm for that destination.
The dst metric lookups are being done when a dst entry is already
available in order to avoid a costly lookup and still before the
algorithms are being initialized, thus overhead is very low when the
feature is not being used. While the client side would need to drop
the current reference on the module, on server side this can actually
even be avoided as we just got a flat-copied socket clone.
Joint work with Florian Westphal.
Suggested-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-06 01:57:48 +03:00
icsk - > icsk_ca_dst_locked = tcp_ca_dst_locked ( dst ) ;
icsk - > icsk_ca_ops = ca ;
ca_got_dst = true ;
}
rcu_read_unlock ( ) ;
}
2015-05-29 20:47:07 +03:00
/* If no valid choice made yet, assign current system default ca. */
if ( ! ca_got_dst & &
( ! icsk - > icsk_ca_setsockopt | |
2020-01-09 03:35:08 +03:00
! bpf_try_module_get ( icsk - > icsk_ca_ops , icsk - > icsk_ca_ops - > owner ) ) )
net: tcp: add per route congestion control
This work adds the possibility to define a per route/destination
congestion control algorithm. Generally, this opens up the possibility
for a machine with different links to enforce specific congestion
control algorithms with optimal strategies for each of them based
on their network characteristics, even transparently for a single
application listening on all links.
For our specific use case, this additionally facilitates deployment
of DCTCP, for example, applications can easily serve internal
traffic/dsts in DCTCP and external one with CUBIC. Other scenarios
would also allow for utilizing e.g. long living, low priority
background flows for certain destinations/routes while still being
able for normal traffic to utilize the default congestion control
algorithm. We also thought about a per netns setting (where different
defaults are possible), but given its actually a link specific
property, we argue that a per route/destination setting is the most
natural and flexible.
The administrator can utilize this through ip-route(8) by appending
"congctl [lock] <name>", where <name> denotes the name of a
congestion control algorithm and the optional lock parameter allows
to enforce the given algorithm so that applications in user space
would not be allowed to overwrite that algorithm for that destination.
The dst metric lookups are being done when a dst entry is already
available in order to avoid a costly lookup and still before the
algorithms are being initialized, thus overhead is very low when the
feature is not being used. While the client side would need to drop
the current reference on the module, on server side this can actually
even be avoided as we just got a flat-copied socket clone.
Joint work with Florian Westphal.
Suggested-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-06 01:57:48 +03:00
tcp_assign_congestion_control ( sk ) ;
tcp_set_ca_state ( sk , TCP_CA_Open ) ;
}
EXPORT_SYMBOL_GPL ( tcp_ca_openreq_child ) ;
2017-10-25 12:01:45 +03:00
/* Clear syn_smc on the new socket when the old socket had it set but the
 * request does not carry smc_ok. Compiles to an empty function unless
 * CONFIG_SMC is enabled, and does nothing at runtime unless the
 * tcp_have_smc static branch is on.
 */
static void smc_check_reset_syn_req(struct tcp_sock *oldtp,
				    struct request_sock *req,
				    struct tcp_sock *newtp)
{
#if IS_ENABLED(CONFIG_SMC)
	if (static_branch_unlikely(&tcp_have_smc)) {
		const struct inet_request_sock *ireq = inet_rsk(req);
		bool smc_not_confirmed = oldtp->syn_smc && !ireq->smc_ok;

		if (smc_not_confirmed)
			newtp->syn_smc = 0;
	}
#endif
}
2005-04-17 02:20:36 +04:00
/* This is not only more efficient than what we used to do, it eliminates
* a lot of code duplication between IPv4 / IPv6 SYN recv processing . - DaveM
*
* Actually , we could lots of memory writes here . tp of listening
* socket contains all necessary default parameters .
*/
2015-09-29 17:42:47 +03:00
struct sock * tcp_create_openreq_child ( const struct sock * sk ,
struct request_sock * req ,
struct sk_buff * skb )
2005-04-17 02:20:36 +04:00
{
2011-11-09 02:07:07 +04:00
struct sock * newsk = inet_csk_clone_lock ( sk , req , GFP_ATOMIC ) ;
2018-06-26 18:45:49 +03:00
const struct inet_request_sock * ireq = inet_rsk ( req ) ;
struct tcp_request_sock * treq = tcp_rsk ( req ) ;
struct inet_connection_sock * newicsk ;
struct tcp_sock * oldtp , * newtp ;
2019-10-11 06:17:39 +03:00
u32 seq ;
2018-06-26 18:45:49 +03:00
if ( ! newsk )
return NULL ;
newicsk = inet_csk ( newsk ) ;
newtp = tcp_sk ( newsk ) ;
oldtp = tcp_sk ( sk ) ;
smc_check_reset_syn_req ( oldtp , req , newtp ) ;
/* Now setup tcp_sock */
newtp - > pred_flags = 0 ;
2019-10-11 06:17:39 +03:00
seq = treq - > rcv_isn + 1 ;
newtp - > rcv_wup = seq ;
2019-10-11 06:17:40 +03:00
WRITE_ONCE ( newtp - > copied_seq , seq ) ;
2019-10-11 06:17:39 +03:00
WRITE_ONCE ( newtp - > rcv_nxt , seq ) ;
2018-06-26 18:45:49 +03:00
newtp - > segs_in = 1 ;
2019-10-11 06:17:42 +03:00
seq = treq - > snt_isn + 1 ;
newtp - > snd_sml = newtp - > snd_una = seq ;
WRITE_ONCE ( newtp - > snd_nxt , seq ) ;
newtp - > snd_up = seq ;
2018-06-26 18:45:49 +03:00
INIT_LIST_HEAD ( & newtp - > tsq_node ) ;
INIT_LIST_HEAD ( & newtp - > tsorted_sent_queue ) ;
tcp_init_wl ( newtp , treq - > rcv_isn ) ;
minmax_reset ( & newtp - > rtt_min , tcp_jiffies32 , ~ 0U ) ;
newicsk - > icsk_ack . lrcvtime = tcp_jiffies32 ;
newtp - > lsndtime = tcp_jiffies32 ;
newsk - > sk_txhash = treq - > txhash ;
newtp - > total_retrans = req - > num_retrans ;
tcp_init_xmit_timers ( newsk ) ;
2019-10-11 06:17:41 +03:00
WRITE_ONCE ( newtp - > write_seq , newtp - > pushed_seq = treq - > snt_isn + 1 ) ;
2018-06-26 18:45:49 +03:00
if ( sock_flag ( newsk , SOCK_KEEPOPEN ) )
inet_csk_reset_keepalive_timer ( newsk ,
keepalive_time_when ( newtp ) ) ;
newtp - > rx_opt . tstamp_ok = ireq - > tstamp_ok ;
newtp - > rx_opt . sack_ok = ireq - > sack_ok ;
newtp - > window_clamp = req - > rsk_window_clamp ;
newtp - > rcv_ssthresh = req - > rsk_rcv_wnd ;
newtp - > rcv_wnd = req - > rsk_rcv_wnd ;
newtp - > rx_opt . wscale_ok = ireq - > wscale_ok ;
if ( newtp - > rx_opt . wscale_ok ) {
newtp - > rx_opt . snd_wscale = ireq - > snd_wscale ;
newtp - > rx_opt . rcv_wscale = ireq - > rcv_wscale ;
} else {
newtp - > rx_opt . snd_wscale = newtp - > rx_opt . rcv_wscale = 0 ;
newtp - > window_clamp = min ( newtp - > window_clamp , 65535U ) ;
}
newtp - > snd_wnd = ntohs ( tcp_hdr ( skb ) - > window ) < < newtp - > rx_opt . snd_wscale ;
newtp - > max_window = newtp - > snd_wnd ;
if ( newtp - > rx_opt . tstamp_ok ) {
newtp - > rx_opt . ts_recent = req - > ts_recent ;
2018-07-11 13:16:12 +03:00
newtp - > rx_opt . ts_recent_stamp = ktime_get_seconds ( ) ;
2018-06-26 18:45:49 +03:00
newtp - > tcp_header_len = sizeof ( struct tcphdr ) + TCPOLEN_TSTAMP_ALIGNED ;
} else {
newtp - > rx_opt . ts_recent_stamp = 0 ;
newtp - > tcp_header_len = sizeof ( struct tcphdr ) ;
}
2019-04-30 01:46:16 +03:00
if ( req - > num_timeout ) {
newtp - > undo_marker = treq - > snt_isn ;
newtp - > retrans_stamp = div_u64 ( treq - > snt_synack ,
USEC_PER_SEC / TCP_TS_HZ ) ;
}
2018-06-26 18:45:49 +03:00
newtp - > tsoffset = treq - > ts_off ;
2006-11-15 06:07:45 +03:00
# ifdef CONFIG_TCP_MD5SIG
2018-06-26 18:45:49 +03:00
newtp - > md5sig_info = NULL ; /*XXX*/
2022-04-21 03:50:26 +03:00
if ( treq - > af_specific - > req_md5_lookup ( sk , req_to_sk ( req ) ) )
2018-06-26 18:45:49 +03:00
newtp - > tcp_header_len + = TCPOLEN_MD5SIG_ALIGNED ;
2006-11-15 06:07:45 +03:00
# endif
2018-06-26 18:45:49 +03:00
if ( skb - > len > = TCP_MSS_DEFAULT + newtp - > tcp_header_len )
newicsk - > icsk_ack . last_seg_size = skb - > len - newtp - > tcp_header_len ;
newtp - > rx_opt . mss_clamp = req - > mss ;
tcp_ecn_openreq_child ( newtp , req ) ;
newtp - > fastopen_req = NULL ;
2019-10-11 06:17:38 +03:00
RCU_INIT_POINTER ( newtp - > fastopen_rsk , NULL ) ;
2018-06-26 18:45:49 +03:00
2022-09-29 10:04:06 +03:00
newtp - > bpf_chg_cc_inprogress = 0 ;
2020-02-18 20:10:15 +03:00
tcp_bpf_clone ( sk , newsk ) ;
2018-06-26 18:45:49 +03:00
__TCP_INC_STATS ( sock_net ( sk ) , TCP_MIB_PASSIVEOPENS ) ;
2005-04-17 02:20:36 +04:00
return newsk ;
}
2010-07-10 01:22:10 +04:00
EXPORT_SYMBOL ( tcp_create_openreq_child ) ;
2005-04-17 02:20:36 +04:00
2007-02-09 17:24:47 +03:00
/*
2012-08-31 16:29:12 +04:00
* Process an incoming packet for SYN_RECV sockets represented as a
* request_sock . Normally sk is the listener socket but for TFO it
* points to the child socket .
*
* XXX ( TFO ) - The current impl contains a special check for ack
* validation and inside tcp_v4_reqsk_send_ack ( ) . Can we do better ?
2012-09-19 18:46:06 +04:00
*
* We don ' t need to initialize tmp_opt . sack_ok as we don ' t use the results
2023-02-27 11:33:36 +03:00
*
* Note : If @ fastopen is true , this can be called from process context .
* Otherwise , this is from BH context .
2005-04-17 02:20:36 +04:00
*/
2008-11-03 11:24:34 +03:00
struct sock * tcp_check_req ( struct sock * sk , struct sk_buff * skb ,
2005-06-19 09:47:21 +04:00
struct request_sock * req ,
2018-02-13 17:14:12 +03:00
bool fastopen , bool * req_stolen )
2005-04-17 02:20:36 +04:00
{
2009-12-02 21:25:27 +03:00
struct tcp_options_received tmp_opt ;
struct sock * child ;
2007-04-11 08:04:22 +04:00
const struct tcphdr * th = tcp_hdr ( skb ) ;
2006-11-15 07:51:49 +03:00
__be32 flg = tcp_flag_word ( th ) & ( TCP_FLAG_RST | TCP_FLAG_SYN | TCP_FLAG_ACK ) ;
2012-05-17 03:15:34 +04:00
bool paws_reject = false ;
2015-10-22 18:20:46 +03:00
bool own_req ;
2005-04-17 02:20:36 +04:00
tcp: Revert per-route SACK/DSACK/TIMESTAMP changes.
It creates a regression, triggering badness for SYN_RECV
sockets, for example:
[19148.022102] Badness at net/ipv4/inet_connection_sock.c:293
[19148.022570] NIP: c02a0914 LR: c02a0904 CTR: 00000000
[19148.023035] REGS: eeecbd30 TRAP: 0700 Not tainted (2.6.32)
[19148.023496] MSR: 00029032 <EE,ME,CE,IR,DR> CR: 24002442 XER: 00000000
[19148.024012] TASK = eee9a820[1756] 'privoxy' THREAD: eeeca000
This is likely caused by the change in the 'estab' parameter
passed to tcp_parse_options() when invoked by the functions
in net/ipv4/tcp_minisocks.c
But even if that is fixed, the ->conn_request() changes made in
this patch series is fundamentally wrong. They try to use the
listening socket's 'dst' to probe the route settings. The
listening socket doesn't even have a route, and you can't
get the right route (the child request one) until much later
after we setup all of the state, and it must be done by hand.
This stuff really isn't ready, so the best thing to do is a
full revert. This reverts the following commits:
f55017a93f1a74d50244b1254b9a2bd7ac9bbf7d
022c3f7d82f0f1c68018696f2f027b87b9bb45c2
1aba721eba1d84a2defce45b950272cee1e6c72a
cda42ebd67ee5fdf09d7057b5a4584d36fe8a335
345cda2fd695534be5a4494f1b59da9daed33663
dc343475ed062e13fc260acccaab91d7d80fd5b2
05eaade2782fb0c90d3034fd7a7d5a16266182bb
6a2a2d6bf8581216e08be15fcb563cfd6c430e1e
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-12-16 07:56:42 +03:00
tmp_opt . saw_tstamp = 0 ;
if ( th - > doff > ( sizeof ( struct tcphdr ) > > 2 ) ) {
2017-06-07 20:34:36 +03:00
tcp_parse_options ( sock_net ( sk ) , skb , & tmp_opt , 0 , NULL ) ;
2005-04-17 02:20:36 +04:00
if ( tmp_opt . saw_tstamp ) {
tmp_opt . ts_recent = req - > ts_recent ;
2016-12-01 13:32:06 +03:00
if ( tmp_opt . rcv_tsecr )
tmp_opt . rcv_tsecr - = tcp_rsk ( req ) - > ts_off ;
2005-04-17 02:20:36 +04:00
/* We do not store true stamp, but it is not required,
* it can be estimated ( approximately )
* from another data .
*/
2022-01-28 22:26:21 +03:00
tmp_opt . ts_recent_stamp = ktime_get_seconds ( ) - reqsk_timeout ( req , TCP_RTO_MAX ) / HZ ;
2009-03-14 17:23:03 +03:00
paws_reject = tcp_paws_reject ( & tmp_opt , th - > rst ) ;
2005-04-17 02:20:36 +04:00
}
}
/* Check for pure retransmitted SYN. */
[NET] Generalise TCP's struct open_request minisock infrastructure
Kept this first changeset minimal, without changing existing names to
ease peer review.
Basicaly tcp_openreq_alloc now receives the or_calltable, that in turn
has two new members:
->slab, that replaces tcp_openreq_cachep
->obj_size, to inform the size of the openreq descendant for
a specific protocol
The protocol specific fields in struct open_request were moved to a
class hierarchy, with the things that are common to all connection
oriented PF_INET protocols in struct inet_request_sock, the TCP ones
in tcp_request_sock, that is an inet_request_sock, that is an
open_request.
I.e. this uses the same approach used for the struct sock class
hierarchy, with sk_prot indicating if the protocol wants to use the
open_request infrastructure by filling in sk_prot->rsk_prot with an
or_calltable.
Results? Performance is improved and TCP v4 now uses only 64 bytes per
open request minisock, down from 96 without this patch :-)
Next changeset will rename some of the structs, fields and functions
mentioned above, struct or_calltable is way unclear, better name it
struct request_sock_ops, s/struct open_request/struct request_sock/g,
etc.
Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-06-19 09:46:52 +04:00
if ( TCP_SKB_CB ( skb ) - > seq = = tcp_rsk ( req ) - > rcv_isn & &
2005-04-17 02:20:36 +04:00
flg = = TCP_FLAG_SYN & &
! paws_reject ) {
/*
* RFC793 draws ( Incorrectly ! It was fixed in RFC1122 )
* this case on figure 6 and figure 8 , but formal
* protocol description says NOTHING .
* To be more exact , it says that we should send ACK ,
* because this segment ( at least , if it has no data )
* is out of window .
*
* CONCLUSION : RFC793 ( even with RFC1122 ) DOES NOT
* describe SYN - RECV state . All the description
* is wrong , we cannot believe to it and should
* rely only on common sense and implementation
* experience .
*
* Enforce " SYN-ACK " according to figure 8 , figure 6
* of RFC793 , fixed by RFC1122 .
2012-08-31 16:29:12 +04:00
*
* Note that even if there is new data in the SYN packet
* they will be thrown away too .
2013-04-29 12:44:51 +04:00
*
* Reset timer after retransmitting SYNACK , similar to
* the idea of fast retransmit in recovery .
2005-04-17 02:20:36 +04:00
*/
tcp: mitigate ACK loops for connections as tcp_request_sock
In the SYN_RECV state, where the TCP connection is represented by
tcp_request_sock, we now rate-limit SYNACKs in response to a client's
retransmitted SYNs: we do not send a SYNACK in response to client SYN
if it has been less than sysctl_tcp_invalid_ratelimit (default 500ms)
since we last sent a SYNACK in response to a client's retransmitted
SYN.
This allows the vast majority of legitimate client connections to
proceed unimpeded, even for the most aggressive platforms, iOS and
MacOS, which actually retransmit SYNs 1-second intervals for several
times in a row. They use SYN RTO timeouts following the progression:
1,1,1,1,1,2,4,8,16,32.
Reported-by: Avery Fay <avery@mixpanel.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-07 00:04:39 +03:00
if ( ! tcp_oow_rate_limited ( sock_net ( sk ) , skb ,
LINUX_MIB_TCPACKSKIPPEDSYNRECV ,
& tcp_rsk ( req ) - > last_oow_ack_time ) & &
2015-04-09 01:34:04 +03:00
! inet_rtx_syn_ack ( sk , req ) ) {
unsigned long expires = jiffies ;
2022-01-28 22:26:21 +03:00
expires + = reqsk_timeout ( req , TCP_RTO_MAX ) ;
2015-04-09 01:34:04 +03:00
if ( ! fastopen )
mod_timer_pending ( & req - > rsk_timer , expires ) ;
else
req - > rsk_timer . expires = expires ;
}
2005-04-17 02:20:36 +04:00
return NULL ;
}
/* Further reproduces section "SEGMENT ARRIVES"
for state SYN - RECEIVED of RFC793 .
It is broken , however , it does not work only
when SYNs are crossed .
You would think that SYN crossing is impossible here , since
we should have a SYN_SENT socket ( from connect ( ) ) on our end ,
but this is not true if the crossed SYNs were sent to both
ends by a malicious third party . We must defend against this ,
and to do that we first verify the ACK ( as per RFC793 , page
36 ) and reset if it is invalid . Is this a true full defense ?
To convince ourselves , let us consider a way in which the ACK
test can still pass in this ' malicious crossed SYNs ' case .
Malicious sender sends identical SYNs ( and thus identical sequence
numbers ) to both A and B :
A : gets SYN , seq = 7
B : gets SYN , seq = 7
By our good fortune , both A and B select the same initial
send sequence number of seven : - )
A : sends SYN | ACK , seq = 7 , ack_seq = 8
B : sends SYN | ACK , seq = 7 , ack_seq = 8
So we are now A eating this SYN | ACK , ACK test passes . So
does sequence test , SYN is truncated , and thus we consider
it a bare ACK .
tcp: Revert 'process defer accept as established' changes.
This reverts two changesets, ec3c0982a2dd1e671bad8e9d26c28dcba0039d87
("[TCP]: TCP_DEFER_ACCEPT updates - process as established") and
the follow-on bug fix 9ae27e0adbf471c7a6b80102e38e1d5a346b3b38
("tcp: Fix slab corruption with ipv6 and tcp6fuzz").
This change causes several problems, first reported by Ingo Molnar
as a distcc-over-loopback regression where connections were getting
stuck.
Ilpo Järvinen first spotted the locking problems. The new function
added by this code, tcp_defer_accept_check(), only has the
child socket locked, yet it is modifying state of the parent
listening socket.
Fixing that is non-trivial at best, because we can't simply just grab
the parent listening socket lock at this point, because it would
create an ABBA deadlock. The normal ordering is parent listening
socket --> child socket, but this code path would require the
reverse lock ordering.
Next is a problem noticed by Vitaliy Gusev, he noted:
----------------------------------------
>--- a/net/ipv4/tcp_timer.c
>+++ b/net/ipv4/tcp_timer.c
>@@ -481,6 +481,11 @@ static void tcp_keepalive_timer (unsigned long data)
> goto death;
> }
>
>+ if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED) {
>+ tcp_send_active_reset(sk, GFP_ATOMIC);
>+ goto death;
Here socket sk is not attached to listening socket's request queue. tcp_done()
will not call inet_csk_destroy_sock() (and tcp_v4_destroy_sock() which should
release this sk) as socket is not DEAD. Therefore socket sk will be lost for
freeing.
----------------------------------------
Finally, Alexey Kuznetsov argues that there might not even be any
real value or advantage to these new semantics even if we fix all
of the bugs:
----------------------------------------
Hiding from accept() sockets with only out-of-order data only
is the only thing which is impossible with old approach. Is this really
so valuable? My opinion: no, this is nothing but a new loophole
to consume memory without control.
----------------------------------------
So revert this thing for now.
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-06-13 03:31:35 +04:00
If icsk - > icsk_accept_queue . rskq_defer_accept , we silently drop this
bare ACK . Otherwise , we create an established connection . Both
ends ( listening sockets ) accept the new incoming connection and try
to talk to each other . 8 - )
2005-04-17 02:20:36 +04:00
Note : This case is both harmless , and rare . Possibility is about the
same as us discovering intelligent life on another plant tomorrow .
But generally , we should ( RFC lies ! ) to accept ACK
from SYNACK both here and in tcp_rcv_state_process ( ) .
tcp_rcv_state_process ( ) does not , hence , we do not too .
Note that the case is absolutely generic :
we cannot optimize anything here without
violating protocol . All the checks must be made
before attempt to create socket .
*/
/* RFC793 page 36: "If the connection is in any non-synchronized state ...
* and the incoming segment acknowledges something not yet
2005-11-11 04:13:47 +03:00
* sent ( the segment carries an unacceptable ACK ) . . .
2005-04-17 02:20:36 +04:00
* a reset is sent . "
*
2012-08-31 16:29:12 +04:00
* Invalid ACK : reset will be sent by listening socket .
* Note that the ACK validity check for a Fast Open socket is done
* elsewhere and is checked directly against the child socket rather
* than req because user data may have been sent out .
2005-04-17 02:20:36 +04:00
*/
2012-08-31 16:29:12 +04:00
if ( ( flg & TCP_FLAG_ACK ) & & ! fastopen & &
TCPCT part 1d: define TCP cookie option, extend existing struct's
Data structures are carefully composed to require minimal additions.
For example, the struct tcp_options_received cookie_plus variable fits
between existing 16-bit and 8-bit variables, requiring no additional
space (taking alignment into consideration). There are no additions to
tcp_request_sock, and only 1 pointer in tcp_sock.
This is a significantly revised implementation of an earlier (year-old)
patch that no longer applies cleanly, with permission of the original
author (Adam Langley):
http://thread.gmane.org/gmane.linux.network/102586
The principle difference is using a TCP option to carry the cookie nonce,
instead of a user configured offset in the data. This is more flexible and
less subject to user configuration error. Such a cookie option has been
suggested for many years, and is also useful without SYN data, allowing
several related concepts to use the same extension option.
"Re: SYN floods (was: does history repeat itself?)", September 9, 1996.
http://www.merit.net/mail.archives/nanog/1996-09/msg00235.html
"Re: what a new TCP header might look like", May 12, 1998.
ftp://ftp.isi.edu/end2end/end2end-interest-1998.mail
These functions will also be used in subsequent patches that implement
additional features.
Requires:
TCPCT part 1a: add request_values parameter for sending SYNACK
TCPCT part 1b: generate Responder Cookie secret
TCPCT part 1c: sysctl_tcp_cookie_size, socket option TCP_COOKIE_TRANSACTIONS
Signed-off-by: William.Allen.Simpson@gmail.com
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-12-02 21:17:05 +03:00
( TCP_SKB_CB ( skb ) - > ack_seq ! =
2013-03-17 12:23:34 +04:00
tcp_rsk ( req ) - > snt_isn + 1 ) )
2005-04-17 02:20:36 +04:00
return sk ;
/* Also, it would be not so bad idea to check rcv_tsecr, which
* is essentially ACK extension and too early or too late values
* should cause reset in unsynchronized states .
*/
/* RFC793: "first check sequence number". */
if ( paws_reject | | ! tcp_in_window ( TCP_SKB_CB ( skb ) - > seq , TCP_SKB_CB ( skb ) - > end_seq ,
2015-10-09 05:33:23 +03:00
tcp_rsk ( req ) - > rcv_nxt , tcp_rsk ( req ) - > rcv_nxt + req - > rsk_rcv_wnd ) ) {
2005-04-17 02:20:36 +04:00
/* Out of window: send ACK and drop. */
2016-04-01 18:52:22 +03:00
if ( ! ( flg & TCP_FLAG_RST ) & &
! tcp_oow_rate_limited ( sock_net ( sk ) , skb ,
LINUX_MIB_TCPACKSKIPPEDSYNRECV ,
& tcp_rsk ( req ) - > last_oow_ack_time ) )
2008-08-07 10:50:04 +04:00
req - > rsk_ops - > send_ack ( sk , skb , req ) ;
2005-04-17 02:20:36 +04:00
if ( paws_reject )
2023-02-27 11:33:36 +03:00
NET_INC_STATS ( sock_net ( sk ) , LINUX_MIB_PAWSESTABREJECTED ) ;
2005-04-17 02:20:36 +04:00
return NULL ;
}
/* In sequence, PAWS is OK. */
2012-08-31 16:29:12 +04:00
if ( tmp_opt . saw_tstamp & & ! after ( TCP_SKB_CB ( skb ) - > seq , tcp_rsk ( req ) - > rcv_nxt ) )
2008-08-08 07:27:45 +04:00
req - > ts_recent = tmp_opt . rcv_tsval ;
2005-04-17 02:20:36 +04:00
2008-08-08 07:27:45 +04:00
if ( TCP_SKB_CB ( skb ) - > seq = = tcp_rsk ( req ) - > rcv_isn ) {
/* Truncate SYN, it is out of window starting
at tcp_rsk ( req ) - > rcv_isn + 1. */
flg & = ~ TCP_FLAG_SYN ;
}
2005-04-17 02:20:36 +04:00
2008-08-08 07:27:45 +04:00
/* RFC793: "second check the RST bit" and
* " fourth, check the SYN bit "
*/
if ( flg & ( TCP_FLAG_RST | TCP_FLAG_SYN ) ) {
2023-02-27 11:33:36 +03:00
TCP_INC_STATS ( sock_net ( sk ) , TCP_MIB_ATTEMPTFAILS ) ;
2008-08-08 07:27:45 +04:00
goto embryonic_reset ;
}
2005-04-17 02:20:36 +04:00
2008-08-08 07:27:45 +04:00
/* ACK sequence verified above, just make sure ACK is
* set . If ACK not set , just silently drop the packet .
2012-08-31 16:29:12 +04:00
*
* XXX ( TFO ) - if we ever allow " data after SYN " , the
* following check needs to be removed .
2008-08-08 07:27:45 +04:00
*/
if ( ! ( flg & TCP_FLAG_ACK ) )
return NULL ;
tcp: Revert 'process defer accept as established' changes.
This reverts two changesets, ec3c0982a2dd1e671bad8e9d26c28dcba0039d87
("[TCP]: TCP_DEFER_ACCEPT updates - process as established") and
the follow-on bug fix 9ae27e0adbf471c7a6b80102e38e1d5a346b3b38
("tcp: Fix slab corruption with ipv6 and tcp6fuzz").
This change causes several problems, first reported by Ingo Molnar
as a distcc-over-loopback regression where connections were getting
stuck.
Ilpo Järvinen first spotted the locking problems. The new function
added by this code, tcp_defer_accept_check(), only has the
child socket locked, yet it is modifying state of the parent
listening socket.
Fixing that is non-trivial at best, because we can't simply just grab
the parent listening socket lock at this point, because it would
create an ABBA deadlock. The normal ordering is parent listening
socket --> child socket, but this code path would require the
reverse lock ordering.
Next is a problem noticed by Vitaliy Gusev, he noted:
----------------------------------------
>--- a/net/ipv4/tcp_timer.c
>+++ b/net/ipv4/tcp_timer.c
>@@ -481,6 +481,11 @@ static void tcp_keepalive_timer (unsigned long data)
> goto death;
> }
>
>+ if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED) {
>+ tcp_send_active_reset(sk, GFP_ATOMIC);
>+ goto death;
Here socket sk is not attached to listening socket's request queue. tcp_done()
will not call inet_csk_destroy_sock() (and tcp_v4_destroy_sock() which should
release this sk) as socket is not DEAD. Therefore socket sk will be lost for
freeing.
----------------------------------------
Finally, Alexey Kuznetsov argues that there might not even be any
real value or advantage to these new semantics even if we fix all
of the bugs:
----------------------------------------
Hiding from accept() sockets with only out-of-order data only
is the only thing which is impossible with old approach. Is this really
so valuable? My opinion: no, this is nothing but a new loophole
to consume memory without control.
----------------------------------------
So revert this thing for now.
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-06-13 03:31:35 +04:00
2012-08-31 16:29:12 +04:00
/* For Fast Open no more processing is needed (sk is the
* child socket ) .
*/
if ( fastopen )
return sk ;
2009-10-19 14:01:56 +04:00
/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
2012-10-28 03:16:46 +04:00
if ( req - > num_timeout < inet_csk ( sk ) - > icsk_accept_queue . rskq_defer_accept & &
2008-08-08 07:27:45 +04:00
TCP_SKB_CB ( skb ) - > end_seq = = tcp_rsk ( req ) - > rcv_isn + 1 ) {
inet_rsk ( req ) - > acked = 1 ;
2016-04-28 02:44:39 +03:00
__NET_INC_STATS ( sock_net ( sk ) , LINUX_MIB_TCPDEFERACCEPTDROP ) ;
2008-08-08 07:27:45 +04:00
return NULL ;
}
/* OK, ACK is valid, create big socket and
* feed this segment to it . It will repeat all
* the tests . THIS SEGMENT MUST MOVE SOCKET TO
* ESTABLISHED STATE . If it will be dropped after
* socket is created , wait for troubles .
*/
2015-10-22 18:20:46 +03:00
child = inet_csk ( sk ) - > icsk_af_ops - > syn_recv_sock ( sk , skb , req , NULL ,
req , & own_req ) ;
2015-04-03 11:17:26 +03:00
if ( ! child )
2008-08-08 07:27:45 +04:00
goto listen_overflow ;
2005-04-17 02:20:36 +04:00
2020-05-15 20:22:15 +03:00
if ( own_req & & rsk_drop_req ( req ) ) {
tcp: Migrate TCP_NEW_SYN_RECV requests at receiving the final ACK.
This patch also changes the code to call reuseport_migrate_sock() and
inet_reqsk_clone(), but unlike the other cases, we do not call
inet_reqsk_clone() right after reuseport_migrate_sock().
Currently, in the receive path for TCP_NEW_SYN_RECV sockets, its listener
has three kinds of refcnt:
(A) for listener itself
(B) carried by reuqest_sock
(C) sock_hold() in tcp_v[46]_rcv()
While processing the req, (A) may disappear by close(listener). Also, (B)
can disappear by accept(listener) once we put the req into the accept
queue. So, we have to hold another refcnt (C) for the listener to prevent
use-after-free.
For socket migration, we call reuseport_migrate_sock() to select a listener
with (A) and to increment the new listener's refcnt in tcp_v[46]_rcv().
This refcnt corresponds to (C) and is cleaned up later in tcp_v[46]_rcv().
Thus we have to take another refcnt (B) for the newly cloned request_sock.
In inet_csk_complete_hashdance(), we hold the count (B), clone the req, and
try to put the new req into the accept queue. By migrating req after
winning the "own_req" race, we can avoid such a worst situation:
CPU 1 looks up req1
CPU 2 looks up req1, unhashes it, then CPU 1 loses the race
CPU 3 looks up req2, unhashes it, then CPU 2 loses the race
...
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210612123224.12525-8-kuniyu@amazon.co.jp
2021-06-12 15:32:20 +03:00
reqsk_queue_removed ( & inet_csk ( req - > rsk_listener ) - > icsk_accept_queue , req ) ;
inet_csk_reqsk_queue_drop_and_put ( req - > rsk_listener , req ) ;
2020-03-28 00:48:39 +03:00
return child ;
}
2015-10-08 21:16:48 +03:00
sock_rps_save_rxhash ( child , skb ) ;
2015-09-18 21:36:14 +03:00
tcp_synack_rtt_meas ( child , req ) ;
2018-02-13 17:14:12 +03:00
* req_stolen = ! own_req ;
2015-10-22 18:20:46 +03:00
return inet_csk_complete_hashdance ( sk , child , req , own_req ) ;
2005-04-17 02:20:36 +04:00
2008-08-08 07:27:45 +04:00
listen_overflow :
2021-06-23 02:35:29 +03:00
if ( sk ! = req - > rsk_listener )
__NET_INC_STATS ( sock_net ( sk ) , LINUX_MIB_TCPMIGRATEREQFAILURE ) ;
2022-07-18 20:26:52 +03:00
if ( ! READ_ONCE ( sock_net ( sk ) - > ipv4 . sysctl_tcp_abort_on_overflow ) ) {
2008-08-08 07:27:45 +04:00
inet_rsk ( req ) - > acked = 1 ;
return NULL ;
}
2005-04-17 02:20:36 +04:00
2008-08-08 07:27:45 +04:00
embryonic_reset :
2012-08-31 16:29:12 +04:00
if ( ! ( flg & TCP_FLAG_RST ) ) {
/* Received a bad SYN pkt - for TFO We try not to reset
* the local connection unless it ' s really necessary to
* avoid becoming vulnerable to outside attack aiming at
* resetting legit local connections .
*/
2008-08-08 07:27:45 +04:00
req - > rsk_ops - > send_reset ( sk , skb ) ;
2012-08-31 16:29:12 +04:00
} else if ( fastopen ) { /* received a valid RST pkt */
reqsk_fastopen_remove ( sk , req , true ) ;
2020-12-11 01:25:03 +03:00
tcp_reset ( sk , skb ) ;
2012-08-31 16:29:12 +04:00
}
if ( ! fastopen ) {
2021-03-15 14:05:45 +03:00
bool unlinked = inet_csk_reqsk_queue_drop ( sk , req ) ;
if ( unlinked )
__NET_INC_STATS ( sock_net ( sk ) , LINUX_MIB_EMBRYONICRSTS ) ;
* req_stolen = ! unlinked ;
2012-08-31 16:29:12 +04:00
}
2008-08-08 07:27:45 +04:00
return NULL ;
2005-04-17 02:20:36 +04:00
}
2010-07-10 01:22:10 +04:00
EXPORT_SYMBOL ( tcp_check_req ) ;
2005-04-17 02:20:36 +04:00
/*
* Queue segment on the new socket if the new socket is active ,
* otherwise we just shortcircuit this and continue with
* the new socket .
2012-08-31 16:29:12 +04:00
*
* For the vast majority of cases child - > sk_state will be TCP_SYN_RECV
* when entering . But other states are possible due to a race condition
* where after __inet_lookup_established ( ) fails but before the listener
* locked is obtained , other packets cause the same connection to
* be created .
2005-04-17 02:20:36 +04:00
*/
int tcp_child_process ( struct sock * parent , struct sock * child ,
struct sk_buff * skb )
2020-03-11 04:09:03 +03:00
__releases ( & ( ( child ) - > sk_lock . slock ) )
2005-04-17 02:20:36 +04:00
{
int ret = 0 ;
int state = child - > sk_state ;
2021-12-03 02:37:24 +03:00
/* record sk_napi_id and sk_rx_queue_mapping of child. */
sk_mark_napi_id_set ( child , skb ) ;
2017-03-24 20:08:00 +03:00
2016-03-14 20:52:15 +03:00
tcp_segs_in ( tcp_sk ( child ) , skb ) ;
2005-04-17 02:20:36 +04:00
if ( ! sock_owned_by_user ( child ) ) {
2015-09-29 17:42:41 +03:00
ret = tcp_rcv_state_process ( child , skb ) ;
2005-04-17 02:20:36 +04:00
/* Wakeup parent, send SIGIO */
if ( state = = TCP_SYN_RECV & & child - > sk_state ! = state )
2014-04-12 00:15:36 +04:00
parent - > sk_data_ready ( parent ) ;
2005-04-17 02:20:36 +04:00
} else {
/* Alas, it is possible again, because we do lookup
* in main socket hash table and lock on listening
* socket does not protect us more .
*/
2010-03-04 21:01:47 +03:00
__sk_add_backlog ( child , skb ) ;
2005-04-17 02:20:36 +04:00
}
bh_unlock_sock ( child ) ;
sock_put ( child ) ;
return ret ;
}
EXPORT_SYMBOL ( tcp_child_process ) ;