Merge branch 'tcp-rx-tx-cache'
Eric Dumazet says:

====================
tcp: add rx/tx cache to reduce lock contention

On hosts with many cpus we can observe serious contention on the
spinlocks used in the mm slab layer.

The following happens quite often:

1) TX path
   sendmsg() allocates one (fclone) skb on CPU A and sends a clone.
   The ACK is received on CPU B, and consumes the skb that was sitting
   in the retransmit queue.

2) RX path
   The network driver allocates the skb on CPU C.
   recvmsg() happens on CPU D, freeing the skb after it has been
   delivered to user space.

In both cases we hit the asymmetric alloc/free pattern for which slab
has to drain alien caches. At 8 Mpps, this represents 16 M alloc/free
operations per second and carries a huge penalty.

In an interesting experiment, I tried to use a single kmem_cache for all
skbs, by doing in skb_init():

	skbuff_fclone_cache = skbuff_head_cache =
		kmem_cache_create("skbuff_fclone_cache",
				  sizeof(struct sk_buff_fclones),);

and most of the contention disappeared, since cpus could better use
their local slab per-cpu cache.

But we can actually do better, in the following patches.

TX: at ACK time, no longer free the skb but put it back in a TCP socket
    cache, so that the next sendmsg() can reuse it immediately.

RX: at recvmsg() time, do not free the skb but put it in a TCP socket
    cache, so that it can be freed by the cpu feeding the incoming
    packets in BH.

This increased the performance of a small RPC benchmark by about 10%
on a host with 112 hyperthreads.

v2: - Solved a race condition in sk_stream_alloc_skb(): make sure the
      prior clone has been freed.
    - Really test rps_needed in sk_eat_skb() as claimed.
    - Fixed rps_needed use in drivers/net/tun.c

v3: Added an #ifdef CONFIG_RPS, to avoid a compile error (kbuild robot)
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
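The core trick in both directions is a one-slot, per-socket buffer cache: the CPU that would otherwise return a freed skb to the slab allocator parks it on the socket instead, and a later operation running on the cache-friendly CPU reuses it. The stand-alone C sketch below illustrates just that pattern in user space; `struct conn`, `conn_get_buf()` and `conn_put_buf()` are invented names for illustration and are not the kernel API.

/* One-slot per-connection buffer cache: park the last freed buffer on the
 * connection instead of returning it to the allocator, so the next caller
 * can reuse it without an alloc/free round trip.
 */
#include <stdlib.h>
#include <string.h>

struct buf {
	size_t len;
	char data[2048];
};

struct conn {
	struct buf *cached;	/* one-slot cache, NULL when empty */
};

static struct buf *conn_get_buf(struct conn *c)
{
	struct buf *b = c->cached;

	if (b) {			/* fast path: reuse the parked buffer */
		c->cached = NULL;
		b->len = 0;
		return b;
	}
	return calloc(1, sizeof(*b));	/* slow path: hit the allocator */
}

static void conn_put_buf(struct conn *c, struct buf *b)
{
	if (!c->cached) {		/* park it for the next conn_get_buf() */
		c->cached = b;
		return;
	}
	free(b);			/* cache already full, really free it */
}

int main(void)
{
	struct conn c = { .cached = NULL };
	struct buf *b = conn_get_buf(&c);

	if (!b)
		return 1;
	memcpy(b->data, "payload", 8);
	b->len = 8;
	conn_put_buf(&c, b);		/* parked, not freed */
	b = conn_get_buf(&c);		/* reused without calloc() */
	conn_put_buf(&c, b);
	free(c.cached);			/* drop the cache on teardown */
	return 0;
}

In the kernel patches below, sk_wmem_free_skb() and sk_stream_alloc_skb() play these two roles on the TX side, while sk_eat_skb() and tcp_v4_rcv()/tcp_v6_rcv() play them on the RX side.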
@@ -1042,7 +1042,7 @@ static int tun_net_close(struct net_device *dev)
 static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
 {
 #ifdef CONFIG_RPS
-	if (tun->numqueues == 1 && static_key_false(&rps_needed)) {
+	if (tun->numqueues == 1 && static_branch_unlikely(&rps_needed)) {
 		/* Select queue was not called for the skbuff, so we extract the
 		 * RPS hash and save it into the flow_table here.
 		 */
@@ -194,8 +194,8 @@ struct net_device_stats {

 #ifdef CONFIG_RPS
 #include <linux/static_key.h>
-extern struct static_key rps_needed;
-extern struct static_key rfs_needed;
+extern struct static_key_false rps_needed;
+extern struct static_key_false rfs_needed;
 #endif

 struct neighbour;
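Several hunks in this merge are a mechanical conversion of the RPS/RFS keys from the old `struct static_key` API (`static_key_false()`, `static_key_slow_inc()`/`_dec()`) to the newer `struct static_key_false` API (`static_branch_unlikely()`, `static_branch_inc()`/`_dec()`) from <linux/jump_label.h>. As a quick reference, here is a kernel-style sketch of that newer API; the `example_*` identifiers are hypothetical and only illustrate the calling pattern.

#include <linux/jump_label.h>

/* Off by default: the fast path costs a patched-out jump until enabled. */
static DEFINE_STATIC_KEY_FALSE(example_feature_on);

static void example_fast_path(void)
{
	if (static_branch_unlikely(&example_feature_on)) {
		/* rarely-enabled slow-path work goes here */
	}
}

/* Reference-counted enable/disable, typically driven by sysfs or sysctl. */
static void example_enable(void)  { static_branch_inc(&example_feature_on); }
static void example_disable(void) { static_branch_dec(&example_feature_on); }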
@@ -368,6 +368,7 @@ struct sock {
 	atomic_t		sk_drops;
 	int			sk_rcvlowat;
 	struct sk_buff_head	sk_error_queue;
+	struct sk_buff		*sk_rx_skb_cache;
 	struct sk_buff_head	sk_receive_queue;
 	/*
 	 * The backlog queue is special, it is always used with
@@ -414,6 +415,7 @@ struct sock {
 		struct sk_buff	*sk_send_head;
 		struct rb_root	tcp_rtx_queue;
 	};
+	struct sk_buff		*sk_tx_skb_cache;
 	struct sk_buff_head	sk_write_queue;
 	__s32			sk_peek_off;
 	int			sk_write_pending;
@@ -966,7 +968,7 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
 static inline void sock_rps_record_flow(const struct sock *sk)
 {
 #ifdef CONFIG_RPS
-	if (static_key_false(&rfs_needed)) {
+	if (static_branch_unlikely(&rfs_needed)) {
 		/* Reading sk->sk_rxhash might incur an expensive cache line
 		 * miss.
 		 *
@@ -1463,6 +1465,10 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)

 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
 {
+	if (!sk->sk_tx_skb_cache) {
+		sk->sk_tx_skb_cache = skb;
+		return;
+	}
 	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
 	sk->sk_wmem_queued -= skb->truesize;
 	sk_mem_uncharge(sk, skb->truesize);
@@ -2433,6 +2439,15 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
 static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
 {
 	__skb_unlink(skb, &sk->sk_receive_queue);
+	if (
+#ifdef CONFIG_RPS
+	    !static_branch_unlikely(&rps_needed) &&
+#endif
+	    !sk->sk_rx_skb_cache) {
+		sk->sk_rx_skb_cache = skb;
+		skb_orphan(skb);
+		return;
+	}
 	__kfree_skb(skb);
 }

@@ -3982,9 +3982,9 @@ EXPORT_SYMBOL(rps_sock_flow_table);
 u32 rps_cpu_mask __read_mostly;
 EXPORT_SYMBOL(rps_cpu_mask);

-struct static_key rps_needed __read_mostly;
+struct static_key_false rps_needed __read_mostly;
 EXPORT_SYMBOL(rps_needed);
-struct static_key rfs_needed __read_mostly;
+struct static_key_false rfs_needed __read_mostly;
 EXPORT_SYMBOL(rfs_needed);

 static struct rps_dev_flow *
@@ -4510,7 +4510,7 @@ static int netif_rx_internal(struct sk_buff *skb)
 	}

 #ifdef CONFIG_RPS
-	if (static_key_false(&rps_needed)) {
+	if (static_branch_unlikely(&rps_needed)) {
 		struct rps_dev_flow voidflow, *rflow = &voidflow;
 		int cpu;

@@ -5179,7 +5179,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb)

 	rcu_read_lock();
 #ifdef CONFIG_RPS
-	if (static_key_false(&rps_needed)) {
+	if (static_branch_unlikely(&rps_needed)) {
 		struct rps_dev_flow voidflow, *rflow = &voidflow;
 		int cpu = get_rps_cpu(skb->dev, skb, &rflow);

@@ -5227,7 +5227,7 @@ static void netif_receive_skb_list_internal(struct list_head *head)

 	rcu_read_lock();
 #ifdef CONFIG_RPS
-	if (static_key_false(&rps_needed)) {
+	if (static_branch_unlikely(&rps_needed)) {
 		list_for_each_entry_safe(skb, next, head, list) {
 			struct rps_dev_flow voidflow, *rflow = &voidflow;
 			int cpu = get_rps_cpu(skb->dev, skb, &rflow);
@@ -754,9 +754,9 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
 	rcu_assign_pointer(queue->rps_map, map);

 	if (map)
-		static_key_slow_inc(&rps_needed);
+		static_branch_inc(&rps_needed);
 	if (old_map)
-		static_key_slow_dec(&rps_needed);
+		static_branch_dec(&rps_needed);

 	mutex_unlock(&rps_map_mutex);

@@ -95,12 +95,12 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
 		if (sock_table != orig_sock_table) {
 			rcu_assign_pointer(rps_sock_flow_table, sock_table);
 			if (sock_table) {
-				static_key_slow_inc(&rps_needed);
-				static_key_slow_inc(&rfs_needed);
+				static_branch_inc(&rps_needed);
+				static_branch_inc(&rfs_needed);
 			}
 			if (orig_sock_table) {
-				static_key_slow_dec(&rps_needed);
-				static_key_slow_dec(&rfs_needed);
+				static_branch_dec(&rps_needed);
+				static_branch_dec(&rfs_needed);
 				synchronize_rcu();
 				vfree(orig_sock_table);
 			}
@@ -136,6 +136,10 @@ void inet_sock_destruct(struct sock *sk)
 	struct inet_sock *inet = inet_sk(sk);

 	__skb_queue_purge(&sk->sk_receive_queue);
+	if (sk->sk_rx_skb_cache) {
+		__kfree_skb(sk->sk_rx_skb_cache);
+		sk->sk_rx_skb_cache = NULL;
+	}
 	__skb_queue_purge(&sk->sk_error_queue);

 	sk_mem_reclaim(sk);
@@ -865,6 +865,21 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
 {
 	struct sk_buff *skb;

+	skb = sk->sk_tx_skb_cache;
+	if (skb && !size) {
+		const struct sk_buff_fclones *fclones;
+
+		fclones = container_of(skb, struct sk_buff_fclones, skb1);
+		if (refcount_read(&fclones->fclone_ref) == 1) {
+			sk->sk_wmem_queued -= skb->truesize;
+			sk_mem_uncharge(sk, skb->truesize);
+			skb->truesize -= skb->data_len;
+			sk->sk_tx_skb_cache = NULL;
+			pskb_trim(skb, 0);
+			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
+			return skb;
+		}
+	}
 	/* The TCP header must be at least 32-bit aligned. */
 	size = ALIGN(size, 4);

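The `refcount_read(&fclones->fclone_ref) == 1` test above is the race fix mentioned in the v2 notes: a cached TX skb is the first half of an fclone pair, and it can only be recycled once the clone that was handed to the transmit path has been freed. For reference, `struct sk_buff_fclones` in <linux/skbuff.h> is laid out roughly as follows (the comments are added here for explanation):

/* An fclone allocation carves two skbs plus a shared refcount out of one
 * slab object.  The socket keeps skb1 on its write/rtx queue (and, with this
 * series, in sk->sk_tx_skb_cache); the stack transmits a clone built from skb2.
 */
struct sk_buff_fclones {
	struct sk_buff	skb1;		/* the skb the socket owns */
	struct sk_buff	skb2;		/* used when skb1 is cloned for transmit */
	refcount_t	fclone_ref;	/* 2 while both halves are in use */
};

/* fclone_ref back at 1 therefore means the transmit clone is gone, so the
 * cached skb1 can safely be trimmed and reused by the next sendmsg().
 */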
@@ -1098,30 +1113,6 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
 }
 EXPORT_SYMBOL(tcp_sendpage);

-/* Do not bother using a page frag for very small frames.
- * But use this heuristic only for the first skb in write queue.
- *
- * Having no payload in skb->head allows better SACK shifting
- * in tcp_shift_skb_data(), reducing sack/rack overhead, because
- * write queue has less skbs.
- * Each skb can hold up to MAX_SKB_FRAGS * 32Kbytes, or ~0.5 MB.
- * This also speeds up tso_fragment(), since it wont fallback
- * to tcp_fragment().
- */
-static int linear_payload_sz(bool first_skb)
-{
-	if (first_skb)
-		return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
-	return 0;
-}
-
-static int select_size(bool first_skb, bool zc)
-{
-	if (zc)
-		return 0;
-	return linear_payload_sz(first_skb);
-}
-
 void tcp_free_fastopen_req(struct tcp_sock *tp)
 {
 	if (tp->fastopen_req) {
@@ -1272,7 +1263,6 @@ restart:

 		if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
 			bool first_skb;
-			int linear;

 new_segment:
 			if (!sk_stream_memory_free(sk))
@@ -1283,8 +1273,7 @@ new_segment:
 				goto restart;
 			}
 			first_skb = tcp_rtx_and_write_queues_empty(sk);
-			linear = select_size(first_skb, zc);
-			skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation,
+			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
 						  first_skb);
 			if (!skb)
 				goto wait_for_memory;
@@ -2552,6 +2541,13 @@ void tcp_write_queue_purge(struct sock *sk)
 		sk_wmem_free_skb(sk, skb);
 	}
 	tcp_rtx_queue_purge(sk);
+	skb = sk->sk_tx_skb_cache;
+	if (skb) {
+		sk->sk_wmem_queued -= skb->truesize;
+		sk_mem_uncharge(sk, skb->truesize);
+		__kfree_skb(skb);
+		sk->sk_tx_skb_cache = NULL;
+	}
 	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
 	sk_mem_reclaim(sk);
 	tcp_clear_all_retrans_hints(tcp_sk(sk));
@@ -2587,6 +2583,10 @@ int tcp_disconnect(struct sock *sk, int flags)

 	tcp_clear_xmit_timers(sk);
 	__skb_queue_purge(&sk->sk_receive_queue);
+	if (sk->sk_rx_skb_cache) {
+		__kfree_skb(sk->sk_rx_skb_cache);
+		sk->sk_rx_skb_cache = NULL;
+	}
 	tp->copied_seq = tp->rcv_nxt;
 	tp->urg_data = 0;
 	tcp_write_queue_purge(sk);
@@ -1774,6 +1774,7 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
 int tcp_v4_rcv(struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
+	struct sk_buff *skb_to_free;
 	int sdif = inet_sdif(skb);
 	const struct iphdr *iph;
 	const struct tcphdr *th;
@@ -1905,11 +1906,17 @@ process:
 	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
+		skb_to_free = sk->sk_rx_skb_cache;
+		sk->sk_rx_skb_cache = NULL;
 		ret = tcp_v4_do_rcv(sk, skb);
-	} else if (tcp_add_backlog(sk, skb)) {
-		goto discard_and_relse;
+	} else {
+		if (tcp_add_backlog(sk, skb))
+			goto discard_and_relse;
+		skb_to_free = NULL;
 	}
 	bh_unlock_sock(sk);
+	if (skb_to_free)
+		__kfree_skb(skb_to_free);

 put_and_return:
 	if (refcounted)
@@ -1436,6 +1436,7 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,

 static int tcp_v6_rcv(struct sk_buff *skb)
 {
+	struct sk_buff *skb_to_free;
 	int sdif = inet6_sdif(skb);
 	const struct tcphdr *th;
 	const struct ipv6hdr *hdr;
@@ -1562,12 +1563,17 @@ process:
 	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
+		skb_to_free = sk->sk_rx_skb_cache;
+		sk->sk_rx_skb_cache = NULL;
 		ret = tcp_v6_do_rcv(sk, skb);
-	} else if (tcp_add_backlog(sk, skb)) {
-		goto discard_and_relse;
+	} else {
+		if (tcp_add_backlog(sk, skb))
+			goto discard_and_relse;
+		skb_to_free = NULL;
 	}
 	bh_unlock_sock(sk);
-
+	if (skb_to_free)
+		__kfree_skb(skb_to_free);
 put_and_return:
 	if (refcounted)
 		sock_put(sk);