Merge branch 'mptcp-avoid-workqueue-usage-for-data'
Paolo Abeni says: ==================== mptcp: avoid workqueue usage for data The current locking schema used to protect the MPTCP data-path requires the usage of the MPTCP workqueue to process the incoming data, depending on trylock result. The above poses scalability limits and introduces random delays in MPTCP-level acks. With this series we use a single spinlock to protect the MPTCP data-path, removing the need for workqueue and delayed ack usage. This additionally reduces the number of atomic operations required per packet and cleans-up considerably the poll/wake-up code. ==================== Link: https://lore.kernel.org/r/cover.1606413118.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
commit
5f3e915c36
@ -1590,6 +1590,7 @@ static inline void lock_sock(struct sock *sk)
|
|||||||
lock_sock_nested(sk, 0);
|
lock_sock_nested(sk, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void __lock_sock(struct sock *sk);
|
||||||
void __release_sock(struct sock *sk);
|
void __release_sock(struct sock *sk);
|
||||||
void release_sock(struct sock *sk);
|
void release_sock(struct sock *sk);
|
||||||
|
|
||||||
|
@ -2486,7 +2486,7 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
|
|||||||
}
|
}
|
||||||
EXPORT_SYMBOL(sk_page_frag_refill);
|
EXPORT_SYMBOL(sk_page_frag_refill);
|
||||||
|
|
||||||
static void __lock_sock(struct sock *sk)
|
void __lock_sock(struct sock *sk)
|
||||||
__releases(&sk->sk_lock.slock)
|
__releases(&sk->sk_lock.slock)
|
||||||
__acquires(&sk->sk_lock.slock)
|
__acquires(&sk->sk_lock.slock)
|
||||||
{
|
{
|
||||||
|
@ -140,7 +140,7 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
|
|||||||
info->mptcpi_flags = flags;
|
info->mptcpi_flags = flags;
|
||||||
info->mptcpi_token = READ_ONCE(msk->token);
|
info->mptcpi_token = READ_ONCE(msk->token);
|
||||||
info->mptcpi_write_seq = READ_ONCE(msk->write_seq);
|
info->mptcpi_write_seq = READ_ONCE(msk->write_seq);
|
||||||
info->mptcpi_snd_una = atomic64_read(&msk->snd_una);
|
info->mptcpi_snd_una = READ_ONCE(msk->snd_una);
|
||||||
info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq);
|
info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq);
|
||||||
unlock_sock_fast(sk, slow);
|
unlock_sock_fast(sk, slow);
|
||||||
}
|
}
|
||||||
|
@ -830,18 +830,20 @@ static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit)
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void ack_update_msk(struct mptcp_sock *msk,
|
static void ack_update_msk(struct mptcp_sock *msk,
|
||||||
const struct sock *ssk,
|
struct sock *ssk,
|
||||||
struct mptcp_options_received *mp_opt)
|
struct mptcp_options_received *mp_opt)
|
||||||
{
|
{
|
||||||
u64 new_snd_una, snd_una, old_snd_una = atomic64_read(&msk->snd_una);
|
u64 new_wnd_end, new_snd_una, snd_nxt = READ_ONCE(msk->snd_nxt);
|
||||||
u64 new_wnd_end, wnd_end, old_wnd_end = atomic64_read(&msk->wnd_end);
|
|
||||||
u64 snd_nxt = READ_ONCE(msk->snd_nxt);
|
|
||||||
struct sock *sk = (struct sock *)msk;
|
struct sock *sk = (struct sock *)msk;
|
||||||
|
u64 old_snd_una;
|
||||||
|
|
||||||
|
mptcp_data_lock(sk);
|
||||||
|
|
||||||
/* avoid ack expansion on update conflict, to reduce the risk of
|
/* avoid ack expansion on update conflict, to reduce the risk of
|
||||||
* wrongly expanding to a future ack sequence number, which is way
|
* wrongly expanding to a future ack sequence number, which is way
|
||||||
* more dangerous than missing an ack
|
* more dangerous than missing an ack
|
||||||
*/
|
*/
|
||||||
|
old_snd_una = msk->snd_una;
|
||||||
new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64);
|
new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64);
|
||||||
|
|
||||||
/* ACK for data not even sent yet? Ignore. */
|
/* ACK for data not even sent yet? Ignore. */
|
||||||
@ -850,26 +852,16 @@ static void ack_update_msk(struct mptcp_sock *msk,
|
|||||||
|
|
||||||
new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd;
|
new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd;
|
||||||
|
|
||||||
while (after64(new_wnd_end, old_wnd_end)) {
|
if (after64(new_wnd_end, msk->wnd_end)) {
|
||||||
wnd_end = old_wnd_end;
|
msk->wnd_end = new_wnd_end;
|
||||||
old_wnd_end = atomic64_cmpxchg(&msk->wnd_end, wnd_end,
|
__mptcp_wnd_updated(sk, ssk);
|
||||||
new_wnd_end);
|
|
||||||
if (old_wnd_end == wnd_end) {
|
|
||||||
if (mptcp_send_head(sk))
|
|
||||||
mptcp_schedule_work(sk);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
while (after64(new_snd_una, old_snd_una)) {
|
if (after64(new_snd_una, old_snd_una)) {
|
||||||
snd_una = old_snd_una;
|
msk->snd_una = new_snd_una;
|
||||||
old_snd_una = atomic64_cmpxchg(&msk->snd_una, snd_una,
|
__mptcp_data_acked(sk);
|
||||||
new_snd_una);
|
|
||||||
if (old_snd_una == snd_una) {
|
|
||||||
mptcp_data_acked(sk);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
mptcp_data_unlock(sk);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit)
|
bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit)
|
||||||
@ -922,8 +914,19 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
|
|||||||
struct mptcp_options_received mp_opt;
|
struct mptcp_options_received mp_opt;
|
||||||
struct mptcp_ext *mpext;
|
struct mptcp_ext *mpext;
|
||||||
|
|
||||||
if (__mptcp_check_fallback(msk))
|
if (__mptcp_check_fallback(msk)) {
|
||||||
|
/* Keep it simple and unconditionally trigger send data cleanup and
|
||||||
|
* pending queue spooling. We will need to acquire the data lock
|
||||||
|
* for more accurate checks, and once the lock is acquired, such
|
||||||
|
* helpers are cheap.
|
||||||
|
*/
|
||||||
|
mptcp_data_lock(subflow->conn);
|
||||||
|
if (mptcp_send_head(subflow->conn))
|
||||||
|
__mptcp_wnd_updated(subflow->conn, sk);
|
||||||
|
__mptcp_data_acked(subflow->conn);
|
||||||
|
mptcp_data_unlock(subflow->conn);
|
||||||
return;
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
mptcp_get_options(skb, &mp_opt);
|
mptcp_get_options(skb, &mp_opt);
|
||||||
if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
|
if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -91,6 +91,8 @@
|
|||||||
#define MPTCP_WORK_EOF 3
|
#define MPTCP_WORK_EOF 3
|
||||||
#define MPTCP_FALLBACK_DONE 4
|
#define MPTCP_FALLBACK_DONE 4
|
||||||
#define MPTCP_WORK_CLOSE_SUBFLOW 5
|
#define MPTCP_WORK_CLOSE_SUBFLOW 5
|
||||||
|
#define MPTCP_PUSH_PENDING 6
|
||||||
|
#define MPTCP_CLEAN_UNA 7
|
||||||
|
|
||||||
static inline bool before64(__u64 seq1, __u64 seq2)
|
static inline bool before64(__u64 seq1, __u64 seq2)
|
||||||
{
|
{
|
||||||
@ -218,14 +220,16 @@ struct mptcp_sock {
|
|||||||
u64 ack_seq;
|
u64 ack_seq;
|
||||||
u64 rcv_wnd_sent;
|
u64 rcv_wnd_sent;
|
||||||
u64 rcv_data_fin_seq;
|
u64 rcv_data_fin_seq;
|
||||||
|
int wmem_reserved;
|
||||||
struct sock *last_snd;
|
struct sock *last_snd;
|
||||||
int snd_burst;
|
int snd_burst;
|
||||||
int old_wspace;
|
int old_wspace;
|
||||||
atomic64_t snd_una;
|
u64 snd_una;
|
||||||
atomic64_t wnd_end;
|
u64 wnd_end;
|
||||||
unsigned long timer_ival;
|
unsigned long timer_ival;
|
||||||
u32 token;
|
u32 token;
|
||||||
int rmem_pending;
|
int rmem_pending;
|
||||||
|
int rmem_released;
|
||||||
unsigned long flags;
|
unsigned long flags;
|
||||||
bool can_ack;
|
bool can_ack;
|
||||||
bool fully_established;
|
bool fully_established;
|
||||||
@ -237,11 +241,14 @@ struct mptcp_sock {
|
|||||||
struct work_struct work;
|
struct work_struct work;
|
||||||
struct sk_buff *ooo_last_skb;
|
struct sk_buff *ooo_last_skb;
|
||||||
struct rb_root out_of_order_queue;
|
struct rb_root out_of_order_queue;
|
||||||
|
struct sk_buff_head receive_queue;
|
||||||
|
struct sk_buff_head skb_tx_cache; /* this is wmem accounted */
|
||||||
|
int tx_pending_data;
|
||||||
|
int size_goal_cache;
|
||||||
struct list_head conn_list;
|
struct list_head conn_list;
|
||||||
struct list_head rtx_queue;
|
struct list_head rtx_queue;
|
||||||
struct mptcp_data_frag *first_pending;
|
struct mptcp_data_frag *first_pending;
|
||||||
struct list_head join_list;
|
struct list_head join_list;
|
||||||
struct skb_ext *cached_ext; /* for the next sendmsg */
|
|
||||||
struct socket *subflow; /* outgoing connect/listener/!mp_capable */
|
struct socket *subflow; /* outgoing connect/listener/!mp_capable */
|
||||||
struct sock *first;
|
struct sock *first;
|
||||||
struct mptcp_pm_data pm;
|
struct mptcp_pm_data pm;
|
||||||
@ -253,6 +260,22 @@ struct mptcp_sock {
|
|||||||
} rcvq_space;
|
} rcvq_space;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#define mptcp_lock_sock(___sk, cb) do { \
|
||||||
|
struct sock *__sk = (___sk); /* silence macro reuse warning */ \
|
||||||
|
might_sleep(); \
|
||||||
|
spin_lock_bh(&__sk->sk_lock.slock); \
|
||||||
|
if (__sk->sk_lock.owned) \
|
||||||
|
__lock_sock(__sk); \
|
||||||
|
cb; \
|
||||||
|
__sk->sk_lock.owned = 1; \
|
||||||
|
spin_unlock(&__sk->sk_lock.slock); \
|
||||||
|
mutex_acquire(&__sk->sk_lock.dep_map, 0, 0, _RET_IP_); \
|
||||||
|
local_bh_enable(); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
|
||||||
|
#define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock)
|
||||||
|
|
||||||
#define mptcp_for_each_subflow(__msk, __subflow) \
|
#define mptcp_for_each_subflow(__msk, __subflow) \
|
||||||
list_for_each_entry(__subflow, &((__msk)->conn_list), node)
|
list_for_each_entry(__subflow, &((__msk)->conn_list), node)
|
||||||
|
|
||||||
@ -300,7 +323,7 @@ static inline struct mptcp_data_frag *mptcp_rtx_tail(const struct sock *sk)
|
|||||||
{
|
{
|
||||||
struct mptcp_sock *msk = mptcp_sk(sk);
|
struct mptcp_sock *msk = mptcp_sk(sk);
|
||||||
|
|
||||||
if (!before64(msk->snd_nxt, atomic64_read(&msk->snd_una)))
|
if (!before64(msk->snd_nxt, READ_ONCE(msk->snd_una)))
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list);
|
return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list);
|
||||||
@ -474,7 +497,8 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk);
|
|||||||
void mptcp_data_ready(struct sock *sk, struct sock *ssk);
|
void mptcp_data_ready(struct sock *sk, struct sock *ssk);
|
||||||
bool mptcp_finish_join(struct sock *sk);
|
bool mptcp_finish_join(struct sock *sk);
|
||||||
bool mptcp_schedule_work(struct sock *sk);
|
bool mptcp_schedule_work(struct sock *sk);
|
||||||
void mptcp_data_acked(struct sock *sk);
|
void __mptcp_wnd_updated(struct sock *sk, struct sock *ssk);
|
||||||
|
void __mptcp_data_acked(struct sock *sk);
|
||||||
void mptcp_subflow_eof(struct sock *sk);
|
void mptcp_subflow_eof(struct sock *sk);
|
||||||
bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit);
|
bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit);
|
||||||
void __mptcp_flush_join_list(struct mptcp_sock *msk);
|
void __mptcp_flush_join_list(struct mptcp_sock *msk);
|
||||||
|
@ -995,19 +995,9 @@ static void subflow_data_ready(struct sock *sk)
|
|||||||
mptcp_data_ready(parent, sk);
|
mptcp_data_ready(parent, sk);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void subflow_write_space(struct sock *sk)
|
static void subflow_write_space(struct sock *ssk)
|
||||||
{
|
{
|
||||||
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
|
/* we take action in __mptcp_clean_una() */
|
||||||
struct socket *sock = READ_ONCE(sk->sk_socket);
|
|
||||||
struct sock *parent = subflow->conn;
|
|
||||||
|
|
||||||
if (!sk_stream_is_writeable(sk))
|
|
||||||
return;
|
|
||||||
|
|
||||||
if (sock && sk_stream_is_writeable(parent))
|
|
||||||
clear_bit(SOCK_NOSPACE, &sock->flags);
|
|
||||||
|
|
||||||
sk_stream_write_space(parent);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct inet_connection_sock_af_ops *
|
static struct inet_connection_sock_af_ops *
|
||||||
|
Loading…
x
Reference in New Issue
Block a user