From 02db55718d53f9d426cee504c27fb768e9ed4ffe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 10 Dec 2017 17:55:02 -0800 Subject: [PATCH 1/3] tcp: do not overshoot window_clamp in tcp_rcv_space_adjust() While rcvbuf is properly clamped by tcp_rmem[2], rcvwin is left to a potentially too big value. It has no serious effect, since : 1) tcp_grow_window() has very strict checks. 2) window_clamp can be mangled by user space to any value anyway. tcp_init_buffer_space() and companions use tcp_full_space(), we use tcp_win_from_space() to avoid reloading sk->sk_rcvbuf Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Wei Wang Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9550cc42de2d..746a6773c482 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -631,7 +631,7 @@ void tcp_rcv_space_adjust(struct sock *sk) sk->sk_rcvbuf = rcvbuf; /* Make the window clamp follow along. */ - tp->window_clamp = rcvwin; + tp->window_clamp = tcp_win_from_space(sk, rcvbuf); } } tp->rcvq_space.space = copied; From 607065bad9931e72207b0cac365d7d4abc06bd99 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 10 Dec 2017 17:55:03 -0800 Subject: [PATCH 2/3] tcp: avoid integer overflows in tcp_rcv_space_adjust() When using large tcp_rmem[2] values (I did tests with 500 MB), I noticed overflows while computing rcvwin. Lets fix this before the following patch. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Wei Wang Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 +- net/ipv4/tcp_input.c | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index ca4a6361389b..4f93f0953c41 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -344,7 +344,7 @@ struct tcp_sock { /* Receiver queue space */ struct { - int space; + u32 space; u32 seq; u64 time; } rcvq_space; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 746a6773c482..2900e58738cd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -576,8 +576,8 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, void tcp_rcv_space_adjust(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + u32 copied; int time; - int copied; tcp_mstamp_refresh(tp); time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); @@ -600,12 +600,13 @@ void tcp_rcv_space_adjust(struct sock *sk) if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int rcvwin, rcvmem, rcvbuf; + int rcvmem, rcvbuf; + u64 rcvwin; /* minimal window to cope with packet losses, assuming * steady state. Add some cushion because of small variations. */ - rcvwin = (copied << 1) + 16 * tp->advmss; + rcvwin = ((u64)copied << 1) + 16 * tp->advmss; /* If rate increased by 25%, * assume slow start, rcvwin = 3 * copied @@ -625,8 +626,9 @@ void tcp_rcv_space_adjust(struct sock *sk) while (tcp_win_from_space(sk, rcvmem) < tp->advmss) rcvmem += 128; - rcvbuf = min(rcvwin / tp->advmss * rcvmem, - sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + do_div(rcvwin, tp->advmss); + rcvbuf = min_t(u64, rcvwin * rcvmem, + sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); if (rcvbuf > sk->sk_rcvbuf) { sk->sk_rcvbuf = rcvbuf; From c3916ad9320eed8eacd7c0b2cf7f881efceda892 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 10 Dec 2017 17:55:04 -0800 Subject: [PATCH 3/3] tcp: smoother receiver autotuning Back in linux-3.13 (commit b0983d3c9b13 ("tcp: fix dynamic right sizing")) I addressed the pressing issues we had with receiver autotuning. But DRS suffers from extra latencies caused by rcv_rtt_est.rtt_us drifts. One common problem happens during slow start, since the apparent RTT measured by the receiver can be inflated by ~50%, at the end of one packet train. Also, a single drop can delay read() calls by one RTT, meaning tcp_rcv_space_adjust() can be called one RTT too late. By replacing the tri-modal heuristic with a continuous function, we can offset the effects of not growing 'at the optimal time'. The curve of the function matches prior behavior if the space increased by 25% and 50% exactly. Cost of added multiply/divide is small, considering a TCP flow typically would run this part of the code few times in its life. I tested this patch with 100 ms RTT / 1% loss link, 100 runs of (netperf -l 5), and got an average throughput of 4600 Mbit instead of 1700 Mbit. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Wei Wang Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2900e58738cd..fefb46c16de7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -601,26 +601,17 @@ void tcp_rcv_space_adjust(struct sock *sk) if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { int rcvmem, rcvbuf; - u64 rcvwin; + u64 rcvwin, grow; /* minimal window to cope with packet losses, assuming * steady state. Add some cushion because of small variations. */ rcvwin = ((u64)copied << 1) + 16 * tp->advmss; - /* If rate increased by 25%, - * assume slow start, rcvwin = 3 * copied - * If rate increased by 50%, - * assume sender can use 2x growth, rcvwin = 4 * copied - */ - if (copied >= - tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) { - if (copied >= - tp->rcvq_space.space + (tp->rcvq_space.space >> 1)) - rcvwin <<= 1; - else - rcvwin += (rcvwin >> 1); - } + /* Accommodate for sender rate increase (eg. slow start) */ + grow = rcvwin * (copied - tp->rcvq_space.space); + do_div(grow, tp->rcvq_space.space); + rcvwin += (grow << 1); rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); while (tcp_win_from_space(sk, rcvmem) < tp->advmss)