Merge branch 'mptcp-avoid-workqueue-usage-for-data'

Paolo Abeni says:

====================
mptcp: avoid workqueue usage for data

The current locking schema used to protect the MPTCP data-path
requires the usage of the MPTCP workqueue to process the incoming
data, depending on trylock result.

The above poses scalability limits and introduces random delays
in MPTCP-level acks.

With this series we use a single spinlock to protect the MPTCP
data-path, removing the need for workqueue and delayed ack usage.

This additionally reduces the number of atomic operations required
per packet and cleans-up considerably the poll/wake-up code.
====================

Link: https://lore.kernel.org/r/cover.1606413118.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Jakub Kicinski 2020-11-30 17:55:26 -08:00
commit 5f3e915c36
7 changed files with 598 additions and 235 deletions

View File

@ -1590,6 +1590,7 @@ static inline void lock_sock(struct sock *sk)
lock_sock_nested(sk, 0); lock_sock_nested(sk, 0);
} }
void __lock_sock(struct sock *sk);
void __release_sock(struct sock *sk); void __release_sock(struct sock *sk);
void release_sock(struct sock *sk); void release_sock(struct sock *sk);

View File

@ -2486,7 +2486,7 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
} }
EXPORT_SYMBOL(sk_page_frag_refill); EXPORT_SYMBOL(sk_page_frag_refill);
static void __lock_sock(struct sock *sk) void __lock_sock(struct sock *sk)
__releases(&sk->sk_lock.slock) __releases(&sk->sk_lock.slock)
__acquires(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock)
{ {

View File

@ -140,7 +140,7 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
info->mptcpi_flags = flags; info->mptcpi_flags = flags;
info->mptcpi_token = READ_ONCE(msk->token); info->mptcpi_token = READ_ONCE(msk->token);
info->mptcpi_write_seq = READ_ONCE(msk->write_seq); info->mptcpi_write_seq = READ_ONCE(msk->write_seq);
info->mptcpi_snd_una = atomic64_read(&msk->snd_una); info->mptcpi_snd_una = READ_ONCE(msk->snd_una);
info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq);
unlock_sock_fast(sk, slow); unlock_sock_fast(sk, slow);
} }

View File

@ -830,18 +830,20 @@ static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit)
} }
static void ack_update_msk(struct mptcp_sock *msk, static void ack_update_msk(struct mptcp_sock *msk,
const struct sock *ssk, struct sock *ssk,
struct mptcp_options_received *mp_opt) struct mptcp_options_received *mp_opt)
{ {
u64 new_snd_una, snd_una, old_snd_una = atomic64_read(&msk->snd_una); u64 new_wnd_end, new_snd_una, snd_nxt = READ_ONCE(msk->snd_nxt);
u64 new_wnd_end, wnd_end, old_wnd_end = atomic64_read(&msk->wnd_end);
u64 snd_nxt = READ_ONCE(msk->snd_nxt);
struct sock *sk = (struct sock *)msk; struct sock *sk = (struct sock *)msk;
u64 old_snd_una;
mptcp_data_lock(sk);
/* avoid ack expansion on update conflict, to reduce the risk of /* avoid ack expansion on update conflict, to reduce the risk of
* wrongly expanding to a future ack sequence number, which is way * wrongly expanding to a future ack sequence number, which is way
* more dangerous than missing an ack * more dangerous than missing an ack
*/ */
old_snd_una = msk->snd_una;
new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64); new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64);
/* ACK for data not even sent yet? Ignore. */ /* ACK for data not even sent yet? Ignore. */
@ -850,26 +852,16 @@ static void ack_update_msk(struct mptcp_sock *msk,
new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd; new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd;
while (after64(new_wnd_end, old_wnd_end)) { if (after64(new_wnd_end, msk->wnd_end)) {
wnd_end = old_wnd_end; msk->wnd_end = new_wnd_end;
old_wnd_end = atomic64_cmpxchg(&msk->wnd_end, wnd_end, __mptcp_wnd_updated(sk, ssk);
new_wnd_end);
if (old_wnd_end == wnd_end) {
if (mptcp_send_head(sk))
mptcp_schedule_work(sk);
break;
}
} }
while (after64(new_snd_una, old_snd_una)) { if (after64(new_snd_una, old_snd_una)) {
snd_una = old_snd_una; msk->snd_una = new_snd_una;
old_snd_una = atomic64_cmpxchg(&msk->snd_una, snd_una, __mptcp_data_acked(sk);
new_snd_una);
if (old_snd_una == snd_una) {
mptcp_data_acked(sk);
break;
}
} }
mptcp_data_unlock(sk);
} }
bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit) bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit)
@ -922,8 +914,19 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
struct mptcp_options_received mp_opt; struct mptcp_options_received mp_opt;
struct mptcp_ext *mpext; struct mptcp_ext *mpext;
if (__mptcp_check_fallback(msk)) if (__mptcp_check_fallback(msk)) {
/* Keep it simple and unconditionally trigger send data cleanup and
* pending queue spooling. We will need to acquire the data lock
* for more accurate checks, and once the lock is acquired, such
* helpers are cheap.
*/
mptcp_data_lock(subflow->conn);
if (mptcp_send_head(subflow->conn))
__mptcp_wnd_updated(subflow->conn, sk);
__mptcp_data_acked(subflow->conn);
mptcp_data_unlock(subflow->conn);
return; return;
}
mptcp_get_options(skb, &mp_opt); mptcp_get_options(skb, &mp_opt);
if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))

File diff suppressed because it is too large Load Diff

View File

@ -91,6 +91,8 @@
#define MPTCP_WORK_EOF 3 #define MPTCP_WORK_EOF 3
#define MPTCP_FALLBACK_DONE 4 #define MPTCP_FALLBACK_DONE 4
#define MPTCP_WORK_CLOSE_SUBFLOW 5 #define MPTCP_WORK_CLOSE_SUBFLOW 5
#define MPTCP_PUSH_PENDING 6
#define MPTCP_CLEAN_UNA 7
static inline bool before64(__u64 seq1, __u64 seq2) static inline bool before64(__u64 seq1, __u64 seq2)
{ {
@ -218,14 +220,16 @@ struct mptcp_sock {
u64 ack_seq; u64 ack_seq;
u64 rcv_wnd_sent; u64 rcv_wnd_sent;
u64 rcv_data_fin_seq; u64 rcv_data_fin_seq;
int wmem_reserved;
struct sock *last_snd; struct sock *last_snd;
int snd_burst; int snd_burst;
int old_wspace; int old_wspace;
atomic64_t snd_una; u64 snd_una;
atomic64_t wnd_end; u64 wnd_end;
unsigned long timer_ival; unsigned long timer_ival;
u32 token; u32 token;
int rmem_pending; int rmem_pending;
int rmem_released;
unsigned long flags; unsigned long flags;
bool can_ack; bool can_ack;
bool fully_established; bool fully_established;
@ -237,11 +241,14 @@ struct mptcp_sock {
struct work_struct work; struct work_struct work;
struct sk_buff *ooo_last_skb; struct sk_buff *ooo_last_skb;
struct rb_root out_of_order_queue; struct rb_root out_of_order_queue;
struct sk_buff_head receive_queue;
struct sk_buff_head skb_tx_cache; /* this is wmem accounted */
int tx_pending_data;
int size_goal_cache;
struct list_head conn_list; struct list_head conn_list;
struct list_head rtx_queue; struct list_head rtx_queue;
struct mptcp_data_frag *first_pending; struct mptcp_data_frag *first_pending;
struct list_head join_list; struct list_head join_list;
struct skb_ext *cached_ext; /* for the next sendmsg */
struct socket *subflow; /* outgoing connect/listener/!mp_capable */ struct socket *subflow; /* outgoing connect/listener/!mp_capable */
struct sock *first; struct sock *first;
struct mptcp_pm_data pm; struct mptcp_pm_data pm;
@ -253,6 +260,22 @@ struct mptcp_sock {
} rcvq_space; } rcvq_space;
}; };
#define mptcp_lock_sock(___sk, cb) do { \
struct sock *__sk = (___sk); /* silence macro reuse warning */ \
might_sleep(); \
spin_lock_bh(&__sk->sk_lock.slock); \
if (__sk->sk_lock.owned) \
__lock_sock(__sk); \
cb; \
__sk->sk_lock.owned = 1; \
spin_unlock(&__sk->sk_lock.slock); \
mutex_acquire(&__sk->sk_lock.dep_map, 0, 0, _RET_IP_); \
local_bh_enable(); \
} while (0)
#define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
#define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock)
#define mptcp_for_each_subflow(__msk, __subflow) \ #define mptcp_for_each_subflow(__msk, __subflow) \
list_for_each_entry(__subflow, &((__msk)->conn_list), node) list_for_each_entry(__subflow, &((__msk)->conn_list), node)
@ -300,7 +323,7 @@ static inline struct mptcp_data_frag *mptcp_rtx_tail(const struct sock *sk)
{ {
struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sock *msk = mptcp_sk(sk);
if (!before64(msk->snd_nxt, atomic64_read(&msk->snd_una))) if (!before64(msk->snd_nxt, READ_ONCE(msk->snd_una)))
return NULL; return NULL;
return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list);
@ -474,7 +497,8 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk);
void mptcp_data_ready(struct sock *sk, struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk);
bool mptcp_finish_join(struct sock *sk); bool mptcp_finish_join(struct sock *sk);
bool mptcp_schedule_work(struct sock *sk); bool mptcp_schedule_work(struct sock *sk);
void mptcp_data_acked(struct sock *sk); void __mptcp_wnd_updated(struct sock *sk, struct sock *ssk);
void __mptcp_data_acked(struct sock *sk);
void mptcp_subflow_eof(struct sock *sk); void mptcp_subflow_eof(struct sock *sk);
bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit);
void __mptcp_flush_join_list(struct mptcp_sock *msk); void __mptcp_flush_join_list(struct mptcp_sock *msk);

View File

@ -995,19 +995,9 @@ static void subflow_data_ready(struct sock *sk)
mptcp_data_ready(parent, sk); mptcp_data_ready(parent, sk);
} }
static void subflow_write_space(struct sock *sk) static void subflow_write_space(struct sock *ssk)
{ {
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); /* we take action in __mptcp_clean_una() */
struct socket *sock = READ_ONCE(sk->sk_socket);
struct sock *parent = subflow->conn;
if (!sk_stream_is_writeable(sk))
return;
if (sock && sk_stream_is_writeable(parent))
clear_bit(SOCK_NOSPACE, &sock->flags);
sk_stream_write_space(parent);
} }
static struct inet_connection_sock_af_ops * static struct inet_connection_sock_af_ops *