From 0d199e4363b482badcedba764e2aceab53a4a10a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 24 Sep 2021 14:12:34 -0700 Subject: [PATCH 1/5] mptcp: do not shrink snd_nxt when recovering When recovering after a link failure, snd_nxt should not be set to a lower value. Else, update of snd_nxt is broken because: msk->snd_nxt += ret; (where ret is number of bytes sent) assumes that snd_nxt always moves forward. After reduction, its possible that snd_nxt update gets out of sync: dfrag we just sent might have had a data sequence number even past recovery_snd_nxt. This change factors the common msk state update to a helper and updates snd_nxt based on the current dfrag data sequence number. The conditional is required for the recovery phase where we may re-transmit old dfrags that are before current snd_nxt. After this change, snd_nxt only moves forward and covers all in-sequence data that was transmitted. recovery_snd_nxt is retained to detect when recovery has completed. Fixes: 1e1d9d6f119c5 ("mptcp: handle pending data on closed subflow") Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/options.c | 8 +++----- net/mptcp/protocol.c | 43 +++++++++++++++++++++++++++++++------------ 2 files changed, 34 insertions(+), 17 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index c41273cefc51..1ec6529c4326 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1019,11 +1019,9 @@ static void ack_update_msk(struct mptcp_sock *msk, old_snd_una = msk->snd_una; new_snd_una = mptcp_expand_seq(old_snd_una, mp_opt->data_ack, mp_opt->ack64); - /* ACK for data not even sent yet and even above recovery bound? Ignore.*/ - if (unlikely(after64(new_snd_una, snd_nxt))) { - if (!msk->recovery || after64(new_snd_una, msk->recovery_snd_nxt)) - new_snd_una = old_snd_una; - } + /* ACK for data not even sent yet? Ignore.*/ + if (unlikely(after64(new_snd_una, snd_nxt))) + new_snd_una = old_snd_una; new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 7e5f76092b64..3d1757b8ef69 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1525,6 +1525,32 @@ static void mptcp_push_release(struct sock *sk, struct sock *ssk, release_sock(ssk); } +static void mptcp_update_post_push(struct mptcp_sock *msk, + struct mptcp_data_frag *dfrag, + u32 sent) +{ + u64 snd_nxt_new = dfrag->data_seq; + + dfrag->already_sent += sent; + + msk->snd_burst -= sent; + msk->tx_pending_data -= sent; + + snd_nxt_new += dfrag->already_sent; + + /* snd_nxt_new can be smaller than snd_nxt in case mptcp + * is recovering after a failover. In that event, this re-sends + * old segments. + * + * Thus compute snd_nxt_new candidate based on + * the dfrag->data_seq that was sent and the data + * that has been handed to the subflow for transmission + * and skip update in case it was old dfrag. + */ + if (likely(after64(snd_nxt_new, msk->snd_nxt))) + msk->snd_nxt = snd_nxt_new; +} + void __mptcp_push_pending(struct sock *sk, unsigned int flags) { struct sock *prev_ssk = NULL, *ssk = NULL; @@ -1568,12 +1594,10 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) } info.sent += ret; - dfrag->already_sent += ret; - msk->snd_nxt += ret; - msk->snd_burst -= ret; - msk->tx_pending_data -= ret; copied += ret; len -= ret; + + mptcp_update_post_push(msk, dfrag, ret); } WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); } @@ -1626,13 +1650,11 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk) goto out; info.sent += ret; - dfrag->already_sent += ret; - msk->snd_nxt += ret; - msk->snd_burst -= ret; - msk->tx_pending_data -= ret; copied += ret; len -= ret; first = false; + + mptcp_update_post_push(msk, dfrag, ret); } WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); } @@ -2230,15 +2252,12 @@ bool __mptcp_retransmit_pending_data(struct sock *sk) return false; } - /* will accept ack for reijected data before re-sending them */ - if (!msk->recovery || after64(msk->snd_nxt, msk->recovery_snd_nxt)) - msk->recovery_snd_nxt = msk->snd_nxt; + msk->recovery_snd_nxt = msk->snd_nxt; msk->recovery = true; mptcp_data_unlock(sk); msk->first_pending = rtx_head; msk->tx_pending_data += msk->snd_nxt - rtx_head->data_seq; - msk->snd_nxt = rtx_head->data_seq; msk->snd_burst = 0; /* be sure to clear the "sent status" on all re-injected fragments */ From 13ac17a32bf1fefbc0bd412545a907979fda26b6 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 24 Sep 2021 14:12:35 -0700 Subject: [PATCH 2/5] mptcp: use OPTIONS_MPTCP_MPC Since OPTIONS_MPTCP_MPC has been defined, use it instead of open-coding. Acked-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/options.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 1ec6529c4326..422f4acfb3e6 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -748,9 +748,7 @@ static bool mptcp_established_options_mp_prio(struct sock *sk, /* can't send MP_PRIO with MPC, as they share the same option space: * 'backup'. Also it makes no sense at all */ - if (!subflow->send_mp_prio || - ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | - OPTION_MPTCP_MPC_ACK) & opts->suboptions)) + if (!subflow->send_mp_prio || (opts->suboptions & OPTIONS_MPTCP_MPC)) return false; /* account for the trailing 'nop' option */ @@ -1327,8 +1325,7 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); } } - } else if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | - OPTION_MPTCP_MPC_ACK) & opts->suboptions) { + } else if (OPTIONS_MPTCP_MPC & opts->suboptions) { u8 len, flag = MPTCP_CAP_HMAC_SHA256; if (OPTION_MPTCP_MPC_SYN & opts->suboptions) { From 765ff425528f309b166978c4b295eb47ebefd47a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 24 Sep 2021 14:12:36 -0700 Subject: [PATCH 3/5] mptcp: use lockdep_assert_held_once() instead of open-coding it We have a few more places where the mptcp code duplicates lockdep_assert_held_once(). Let's use the existing macro and avoid a bunch of compiler's conditional. Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3d1757b8ef69..87ee409d68ab 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -956,9 +956,7 @@ static void __mptcp_update_wmem(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); -#ifdef CONFIG_LOCKDEP - WARN_ON_ONCE(!lockdep_is_held(&sk->sk_lock.slock)); -#endif + lockdep_assert_held_once(&sk->sk_lock.slock); if (!msk->wmem_reserved) return; @@ -1117,9 +1115,8 @@ out: static void __mptcp_clean_una_wakeup(struct sock *sk) { -#ifdef CONFIG_LOCKDEP - WARN_ON_ONCE(!lockdep_is_held(&sk->sk_lock.slock)); -#endif + lockdep_assert_held_once(&sk->sk_lock.slock); + __mptcp_clean_una(sk); mptcp_write_space(sk); } From 9e65b6a5aaa3236488b4f4e3e8b914d73124a5a5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 24 Sep 2021 14:12:37 -0700 Subject: [PATCH 4/5] mptcp: remove tx_pending_data The update on recovery is not correct. msk->tx_pending_data += msk->snd_nxt - rtx_head->data_seq; will update tx_pending_data multiple times when a subflow is declared stale while earlier recovery is still in progress. This means that tx_pending_data will still be positive even after all data as has been transmitted. Rather than fix it, remove this field: there are no consumers. The outstanding data byte count can be computed either via "msk->write_seq - rtx_head->data_seq" or "msk->write_seq - msk->snd_una". The latter is more recent/accurate estimate as rtx_head adjustment is deferred until mptcp lock can be acquired. Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 4 ---- net/mptcp/protocol.h | 1 - 2 files changed, 5 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 87ee409d68ab..5b0ed64c5cd2 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1531,7 +1531,6 @@ static void mptcp_update_post_push(struct mptcp_sock *msk, dfrag->already_sent += sent; msk->snd_burst -= sent; - msk->tx_pending_data -= sent; snd_nxt_new += dfrag->already_sent; @@ -1761,7 +1760,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) frag_truesize += psize; pfrag->offset += frag_truesize; WRITE_ONCE(msk->write_seq, msk->write_seq + psize); - msk->tx_pending_data += psize; /* charge data on mptcp pending queue to the msk socket * Note: we charge such data both to sk and ssk @@ -2254,7 +2252,6 @@ bool __mptcp_retransmit_pending_data(struct sock *sk) mptcp_data_unlock(sk); msk->first_pending = rtx_head; - msk->tx_pending_data += msk->snd_nxt - rtx_head->data_seq; msk->snd_burst = 0; /* be sure to clear the "sent status" on all re-injected fragments */ @@ -2525,7 +2522,6 @@ static int __mptcp_init_sock(struct sock *sk) msk->first_pending = NULL; msk->wmem_reserved = 0; WRITE_ONCE(msk->rmem_released, 0); - msk->tx_pending_data = 0; msk->timer_ival = TCP_RTO_MIN; msk->first = NULL; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index d3e6fd1615f1..d516fb6578cc 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -254,7 +254,6 @@ struct mptcp_sock { struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; struct sk_buff_head receive_queue; - int tx_pending_data; struct list_head conn_list; struct list_head rtx_queue; struct mptcp_data_frag *first_pending; From 3241a9c029344ca4aa0ef1fa9f0f010d5bbc2a85 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 24 Sep 2021 14:12:38 -0700 Subject: [PATCH 5/5] mptcp: re-arm retransmit timer if data is pending The retransmit head will be NULL in case there is no in-flight data (meaning all data injected into network has been acked). In that case the retransmit timer is stopped. This is only correct if there is no more pending, not-yet-sent data. If there is, the retransmit timer needs to set the PENDING bit again so that mptcp tries to send the remaining (new) data once a subflow can accept more data. Also, mptcp_subflow_get_retrans() has to be called unconditionally. This function checks for subflows that have become unresponsive and marks them as stale, so in the case where the rtx queue is empty, subflows will never be marked stale which prevents available backup subflows from becoming eligible for transmit. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/226 Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5b0ed64c5cd2..8029bbbe1c9e 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1105,7 +1105,8 @@ out: if (cleaned && tcp_under_memory_pressure(sk)) __mptcp_mem_reclaim_partial(sk); - if (snd_una == READ_ONCE(msk->snd_nxt) && !msk->recovery) { + if (snd_una == READ_ONCE(msk->snd_nxt) && + snd_una == READ_ONCE(msk->write_seq)) { if (mptcp_timer_pending(sk) && !mptcp_data_fin_enabled(msk)) mptcp_stop_timer(sk); } else { @@ -1547,6 +1548,13 @@ static void mptcp_update_post_push(struct mptcp_sock *msk, msk->snd_nxt = snd_nxt_new; } +static void mptcp_check_and_set_pending(struct sock *sk) +{ + if (mptcp_send_head(sk) && + !test_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) + set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); +} + void __mptcp_push_pending(struct sock *sk, unsigned int flags) { struct sock *prev_ssk = NULL, *ssk = NULL; @@ -2414,6 +2422,9 @@ static void __mptcp_retrans(struct sock *sk) int ret; mptcp_clean_una_wakeup(sk); + + /* first check ssk: need to kick "stale" logic */ + ssk = mptcp_subflow_get_retrans(msk); dfrag = mptcp_rtx_head(sk); if (!dfrag) { if (mptcp_data_fin_enabled(msk)) { @@ -2426,10 +2437,12 @@ static void __mptcp_retrans(struct sock *sk) goto reset_timer; } - return; + if (!mptcp_send_head(sk)) + return; + + goto reset_timer; } - ssk = mptcp_subflow_get_retrans(msk); if (!ssk) goto reset_timer; @@ -2456,6 +2469,8 @@ static void __mptcp_retrans(struct sock *sk) release_sock(ssk); reset_timer: + mptcp_check_and_set_pending(sk); + if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); }