2015-10-17 07:57:46 +03:00
# include <linux/tcp.h>
# include <net/tcp.h>
2017-01-13 09:11:36 +03:00
/* Loss-recovery configuration word, initialised to TCP_RACK_LOSS_DETECTION.
 * NOTE(review): the name suggests this is exported as a sysctl and treated
 * as a flag mask - confirm against the sysctl table and net/tcp.h.
 */
int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION ;
2015-10-17 07:57:47 +03:00
2017-01-13 09:11:30 +03:00
static void tcp_rack_mark_skb_lost ( struct sock * sk , struct sk_buff * skb )
{
struct tcp_sock * tp = tcp_sk ( sk ) ;
tcp_skb_mark_lost_uncond_verify ( tp , skb ) ;
if ( TCP_SKB_CB ( skb ) - > sacked & TCPCB_SACKED_RETRANS ) {
/* Account for retransmits that are lost again */
TCP_SKB_CB ( skb ) - > sacked & = ~ TCPCB_SACKED_RETRANS ;
tp - > retrans_out - = tcp_skb_pcount ( skb ) ;
2017-04-05 00:15:39 +03:00
NET_ADD_STATS ( sock_net ( sk ) , LINUX_MIB_TCPLOSTRETRANSMIT ,
tcp_skb_pcount ( skb ) ) ;
2017-01-13 09:11:30 +03:00
}
}
2017-05-17 00:00:14 +03:00
/* Return true if packet 1 (sent at @t1, ending at @seq1) was sent after
 * packet 2 (sent at @t2, ending at @seq2). Send timestamps are compared
 * first; equal timestamps are ordered by sequence number via after().
 */
static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
{
	if (t1 != t2)
		return t1 > t2;

	/* Same send time: fall back to sequence order */
	return after(seq1, seq2);
}
2017-01-13 09:11:36 +03:00
/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
*
* Marks a packet lost , if some packet sent later has been ( s ) acked .
2015-10-17 07:57:47 +03:00
* The underlying idea is similar to the traditional dupthresh and FACK
* but they look at different metrics :
*
* dupthresh : 3 OOO packets delivered ( packet count )
* FACK : sequence delta to highest sacked sequence ( sequence space )
* RACK : sent time delta to the latest delivered packet ( time domain )
*
* The advantage of RACK is it applies to both original and retransmitted
* packet and therefore is robust against tail losses . Another advantage
* is being more resilient to reordering by simply allowing some
* " settling delay " , instead of tweaking the dupthresh .
*
2017-01-13 09:11:36 +03:00
* When tcp_rack_detect_loss ( ) detects some packets are lost and we
* are not already in the CA_Recovery state , either tcp_rack_reo_timeout ( )
* or tcp_time_to_recover ( ) ' s " Trick#1: the loss is proven " code path will
* make us enter the CA_Recovery state .
2015-10-17 07:57:47 +03:00
*/
2017-04-25 20:15:33 +03:00
static void tcp_rack_detect_loss ( struct sock * sk , u32 * reo_timeout )
2015-10-17 07:57:47 +03:00
{
struct tcp_sock * tp = tcp_sk ( sk ) ;
struct sk_buff * skb ;
2017-01-13 09:11:31 +03:00
u32 reo_wnd ;
2015-10-17 07:57:47 +03:00
2017-01-13 09:11:33 +03:00
* reo_timeout = 0 ;
2015-10-17 07:57:47 +03:00
/* To be more reordering resilient, allow min_rtt/4 settling delay
* ( lower - bounded to 1000u S ) . We use min_rtt instead of the smoothed
* RTT because reordering is often a path property and less related
* to queuing or delayed ACKs .
*/
reo_wnd = 1000 ;
2017-01-13 09:11:36 +03:00
if ( ( tp - > rack . reord | | ! tp - > lost_out ) & & tcp_min_rtt ( tp ) ! = ~ 0U )
2015-10-17 07:57:47 +03:00
reo_wnd = max ( tcp_min_rtt ( tp ) > > 2 , reo_wnd ) ;
tcp_for_write_queue ( skb , sk ) {
struct tcp_skb_cb * scb = TCP_SKB_CB ( skb ) ;
if ( skb = = tcp_send_head ( sk ) )
break ;
/* Skip ones already (s)acked */
if ( ! after ( scb - > end_seq , tp - > snd_una ) | |
scb - > sacked & TCPCB_SACKED_ACKED )
continue ;
2017-05-17 00:00:14 +03:00
if ( tcp_rack_sent_after ( tp - > rack . mstamp , skb - > skb_mstamp ,
2017-01-13 09:11:34 +03:00
tp - > rack . end_seq , scb - > end_seq ) ) {
2017-01-13 09:11:32 +03:00
/* Step 3 in draft-cheng-tcpm-rack-00.txt:
* A packet is lost if its elapsed time is beyond
* the recent RTT plus the reordering window .
*/
2017-05-17 00:00:14 +03:00
u32 elapsed = tcp_stamp_us_delta ( tp - > tcp_mstamp ,
skb - > skb_mstamp ) ;
2017-01-13 09:11:33 +03:00
s32 remaining = tp - > rack . rtt_us + reo_wnd - elapsed ;
if ( remaining < 0 ) {
2017-01-13 09:11:32 +03:00
tcp_rack_mark_skb_lost ( sk , skb ) ;
2017-01-13 09:11:33 +03:00
continue ;
2017-01-13 09:11:32 +03:00
}
2017-01-13 09:11:33 +03:00
/* Skip ones marked lost but not yet retransmitted */
if ( ( scb - > sacked & TCPCB_LOST ) & &
! ( scb - > sacked & TCPCB_SACKED_RETRANS ) )
continue ;
/* Record maximum wait time (+1 to avoid 0) */
* reo_timeout = max_t ( u32 , * reo_timeout , 1 + remaining ) ;
2015-10-17 07:57:47 +03:00
} else if ( ! ( scb - > sacked & TCPCB_RETRANS ) ) {
/* Original data are sent sequentially so stop early
* b / c the rest are all sent after rack_sent
*/
break ;
}
}
2017-01-13 09:11:31 +03:00
}
2017-04-25 20:15:34 +03:00
void tcp_rack_mark_lost ( struct sock * sk )
2017-01-13 09:11:31 +03:00
{
struct tcp_sock * tp = tcp_sk ( sk ) ;
2017-01-13 09:11:33 +03:00
u32 timeout ;
2017-01-13 09:11:31 +03:00
2017-01-13 09:11:36 +03:00
if ( ! tp - > rack . advanced )
2017-01-13 09:11:31 +03:00
return ;
2017-01-13 09:11:33 +03:00
2017-01-13 09:11:31 +03:00
/* Reset the advanced flag to avoid unnecessary queue scanning */
tp - > rack . advanced = 0 ;
2017-04-25 20:15:33 +03:00
tcp_rack_detect_loss ( sk , & timeout ) ;
2017-01-13 09:11:33 +03:00
if ( timeout ) {
timeout = usecs_to_jiffies ( timeout + TCP_REO_TIMEOUT_MIN ) ;
inet_csk_reset_xmit_timer ( sk , ICSK_TIME_REO_TIMEOUT ,
timeout , inet_csk ( sk ) - > icsk_rto ) ;
}
2015-10-17 07:57:47 +03:00
}
2017-01-13 09:11:32 +03:00
/* Record the most recently (re)sent time among the (s)acked packets
* This is " Step 3: Advance RACK.xmit_time and update RACK.RTT " from
* draft - cheng - tcpm - rack - 00. txt
*/
2017-01-13 09:11:34 +03:00
void tcp_rack_advance ( struct tcp_sock * tp , u8 sacked , u32 end_seq ,
2017-05-17 00:00:14 +03:00
u64 xmit_time )
2015-10-17 07:57:46 +03:00
{
2017-01-13 09:11:32 +03:00
u32 rtt_us ;
2017-05-17 00:00:14 +03:00
if ( tp - > rack . mstamp & &
! tcp_rack_sent_after ( xmit_time , tp - > rack . mstamp ,
2017-01-13 09:11:34 +03:00
end_seq , tp - > rack . end_seq ) )
2015-10-17 07:57:46 +03:00
return ;
2017-05-17 00:00:14 +03:00
rtt_us = tcp_stamp_us_delta ( tp - > tcp_mstamp , xmit_time ) ;
2015-10-17 07:57:46 +03:00
if ( sacked & TCPCB_RETRANS ) {
/* If the sacked packet was retransmitted, it's ambiguous
* whether the retransmission or the original ( or the prior
* retransmission ) was sacked .
*
* If the original is lost , there is no ambiguity . Otherwise
* we assume the original can be delayed up to aRTT + min_rtt .
* the aRTT term is bounded by the fast recovery or timeout ,
* so it ' s at least one RTT ( i . e . , retransmission is at least
* an RTT later ) .
*/
2017-01-13 09:11:32 +03:00
if ( rtt_us < tcp_min_rtt ( tp ) )
2015-10-17 07:57:46 +03:00
return ;
}
2017-01-13 09:11:32 +03:00
tp - > rack . rtt_us = rtt_us ;
2017-05-17 00:00:14 +03:00
tp - > rack . mstamp = xmit_time ;
2017-01-13 09:11:34 +03:00
tp - > rack . end_seq = end_seq ;
2015-10-17 07:57:46 +03:00
tp - > rack . advanced = 1 ;
}
2017-01-13 09:11:33 +03:00
/* We have waited long enough to accommodate reordering. Mark the expired
* packets lost and retransmit them .
*/
void tcp_rack_reo_timeout ( struct sock * sk )
{
struct tcp_sock * tp = tcp_sk ( sk ) ;
u32 timeout , prior_inflight ;
prior_inflight = tcp_packets_in_flight ( tp ) ;
2017-04-25 20:15:33 +03:00
tcp_rack_detect_loss ( sk , & timeout ) ;
2017-01-13 09:11:33 +03:00
if ( prior_inflight ! = tcp_packets_in_flight ( tp ) ) {
if ( inet_csk ( sk ) - > icsk_ca_state ! = TCP_CA_Recovery ) {
tcp_enter_recovery ( sk , false ) ;
if ( ! inet_csk ( sk ) - > icsk_ca_ops - > cong_control )
tcp_cwnd_reduction ( sk , 1 , 0 ) ;
}
tcp_xmit_retransmit_queue ( sk ) ;
}
if ( inet_csk ( sk ) - > icsk_pending ! = ICSK_TIME_RETRANS )
tcp_rearm_rto ( sk ) ;
}