From e70f3c701276a8825414dc650c4f76141a8704b4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Jun 2022 23:34:06 -0700 Subject: [PATCH 1/7] Revert "net: set SK_MEM_QUANTUM to 4096" This reverts commit bd68a2a854ad5a85f0c8d0a9c8048ca3f6391efb. This change broke memcg on arches with PAGE_SIZE != 4096 Later, commit 2bb2f5fb21b04 ("net: add new socket option SO_RESERVE_MEM") also assumed PAGE_SIZE==SK_MEM_QUANTUM Following patches in the series will greatly reduce the over allocations problem. Signed-off-by: Eric Dumazet Reviewed-by: Shakeel Butt Acked-by: Soheil Hassas Yeganeh Signed-off-by: Jakub Kicinski --- include/net/sock.h | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 657873e2d90f..5c5265269899 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1532,25 +1532,15 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind); void __sk_mem_reduce_allocated(struct sock *sk, int amount); void __sk_mem_reclaim(struct sock *sk, int amount); -/* We used to have PAGE_SIZE here, but systems with 64KB pages - * do not necessarily have 16x time more memory than 4KB ones. - */ -#define SK_MEM_QUANTUM 4096 +#define SK_MEM_QUANTUM ((int)PAGE_SIZE) #define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM) #define SK_MEM_SEND 0 #define SK_MEM_RECV 1 -/* sysctl_mem values are in pages, we convert them in SK_MEM_QUANTUM units */ +/* sysctl_mem values are in pages */ static inline long sk_prot_mem_limits(const struct sock *sk, int index) { - long val = sk->sk_prot->sysctl_mem[index]; - -#if PAGE_SIZE > SK_MEM_QUANTUM - val <<= PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT; -#elif PAGE_SIZE < SK_MEM_QUANTUM - val >>= SK_MEM_QUANTUM_SHIFT - PAGE_SHIFT; -#endif - return val; + return sk->sk_prot->sysctl_mem[index]; } static inline int sk_mem_pages(int amt) From 100fdd1faf50557558e2911af4be32e515cb8036 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Jun 2022 23:34:07 -0700 Subject: [PATCH 2/7] net: remove SK_MEM_QUANTUM and SK_MEM_QUANTUM_SHIFT Due to memcg interface, SK_MEM_QUANTUM is effectively PAGE_SIZE. This might change in the future, but it seems better to avoid the confusion. Signed-off-by: Eric Dumazet Reviewed-by: Shakeel Butt Acked-by: Soheil Hassas Yeganeh Signed-off-by: Jakub Kicinski --- include/net/sock.h | 8 +++----- net/core/sock.c | 16 ++++++++-------- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_output.c | 2 +- net/ipv4/udp.c | 10 +++++----- net/mptcp/protocol.c | 8 ++++---- net/sctp/protocol.c | 4 ++-- 8 files changed, 26 insertions(+), 28 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 5c5265269899..298897bbfb3a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1532,8 +1532,6 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind); void __sk_mem_reduce_allocated(struct sock *sk, int amount); void __sk_mem_reclaim(struct sock *sk, int amount); -#define SK_MEM_QUANTUM ((int)PAGE_SIZE) -#define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM) #define SK_MEM_SEND 0 #define SK_MEM_RECV 1 @@ -1545,7 +1543,7 @@ static inline long sk_prot_mem_limits(const struct sock *sk, int index) static inline int sk_mem_pages(int amt) { - return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT; + return (amt + PAGE_SIZE - 1) >> PAGE_SHIFT; } static inline bool sk_has_account(struct sock *sk) @@ -1594,7 +1592,7 @@ static inline void sk_mem_reclaim(struct sock *sk) reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk); - if (reclaimable >= SK_MEM_QUANTUM) + if (reclaimable >= (int)PAGE_SIZE) __sk_mem_reclaim(sk, reclaimable); } @@ -1613,7 +1611,7 @@ static inline void sk_mem_reclaim_partial(struct sock *sk) reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk); - if (reclaimable > SK_MEM_QUANTUM) + if (reclaimable > (int)PAGE_SIZE) __sk_mem_reclaim(sk, reclaimable - 1); } diff --git a/net/core/sock.c b/net/core/sock.c index f5062d9e1222..063fd7680a84 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -991,7 +991,7 @@ EXPORT_SYMBOL(sock_set_mark); static void sock_release_reserved_memory(struct sock *sk, int bytes) { /* Round down bytes to multiple of pages */ - bytes &= ~(SK_MEM_QUANTUM - 1); + bytes = round_down(bytes, PAGE_SIZE); WARN_ON(bytes > sk->sk_reserved_mem); sk->sk_reserved_mem -= bytes; @@ -1028,9 +1028,9 @@ static int sock_reserve_memory(struct sock *sk, int bytes) mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); return -ENOMEM; } - sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT; + sk->sk_forward_alloc += pages << PAGE_SHIFT; - sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT; + sk->sk_reserved_mem += pages << PAGE_SHIFT; return 0; } @@ -3003,10 +3003,10 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) { int ret, amt = sk_mem_pages(size); - sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT; + sk->sk_forward_alloc += amt << PAGE_SHIFT; ret = __sk_mem_raise_allocated(sk, size, amt, kind); if (!ret) - sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT; + sk->sk_forward_alloc -= amt << PAGE_SHIFT; return ret; } EXPORT_SYMBOL(__sk_mem_schedule); @@ -3034,12 +3034,12 @@ EXPORT_SYMBOL(__sk_mem_reduce_allocated); /** * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated * @sk: socket - * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) + * @amount: number of bytes (rounded down to a PAGE_SIZE multiple) */ void __sk_mem_reclaim(struct sock *sk, int amount) { - amount >>= SK_MEM_QUANTUM_SHIFT; - sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; + amount >>= PAGE_SHIFT; + sk->sk_forward_alloc -= amount << PAGE_SHIFT; __sk_mem_reduce_allocated(sk, amount); } EXPORT_SYMBOL(__sk_mem_reclaim); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9984d23a7f3e..9e696758a4c2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4661,11 +4661,11 @@ void __init tcp_init(void) max_wshare = min(4UL*1024*1024, limit); max_rshare = min(6UL*1024*1024, limit); - init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE; init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare); - init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_rmem[0] = PAGE_SIZE; init_net.ipv4.sysctl_tcp_rmem[1] = 131072; init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2e2a9ece9af2..3fb117022558 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5287,7 +5287,7 @@ new_range: before(TCP_SKB_CB(skb)->end_seq, start)) { /* Do not attempt collapsing tiny skbs */ if (range_truesize != head->truesize || - end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) { + end - start >= SKB_WITH_OVERHEAD(PAGE_SIZE)) { tcp_collapse(sk, NULL, &tp->out_of_order_queue, head, skb, start, end); } else { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1c054431e358..8ab98e1aca67 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3367,7 +3367,7 @@ void sk_forced_mem_schedule(struct sock *sk, int size) if (size <= sk->sk_forward_alloc) return; amt = sk_mem_pages(size); - sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; + sk->sk_forward_alloc += amt << PAGE_SHIFT; sk_memory_allocated_add(sk, amt); if (mem_cgroup_sockets_enabled && sk->sk_memcg) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index aa9f2ec3dc46..bbc9970fa2e9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1461,11 +1461,11 @@ static void udp_rmem_release(struct sock *sk, int size, int partial, sk->sk_forward_alloc += size; - amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1); + amt = (sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1); sk->sk_forward_alloc -= amt; if (amt) - __sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT); + __sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT); atomic_sub(size, &sk->sk_rmem_alloc); @@ -1558,7 +1558,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) spin_lock(&list->lock); if (size >= sk->sk_forward_alloc) { amt = sk_mem_pages(size); - delta = amt << SK_MEM_QUANTUM_SHIFT; + delta = amt << PAGE_SHIFT; if (!__sk_mem_raise_allocated(sk, delta, amt, SK_MEM_RECV)) { err = -ENOBUFS; spin_unlock(&list->lock); @@ -3263,8 +3263,8 @@ EXPORT_SYMBOL(udp_flow_hashrnd); static void __udp_sysctl_init(struct net *net) { - net->ipv4.sysctl_udp_rmem_min = SK_MEM_QUANTUM; - net->ipv4.sysctl_udp_wmem_min = SK_MEM_QUANTUM; + net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE; + net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE; #ifdef CONFIG_NET_L3_MASTER_DEV net->ipv4.sysctl_udp_l3mdev_accept = 0; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 17e13396024a..080a630d6902 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -167,8 +167,8 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to, static void __mptcp_rmem_reclaim(struct sock *sk, int amount) { - amount >>= SK_MEM_QUANTUM_SHIFT; - mptcp_sk(sk)->rmem_fwd_alloc -= amount << SK_MEM_QUANTUM_SHIFT; + amount >>= PAGE_SHIFT; + mptcp_sk(sk)->rmem_fwd_alloc -= amount << PAGE_SHIFT; __sk_mem_reduce_allocated(sk, amount); } @@ -327,7 +327,7 @@ static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size) return true; amt = sk_mem_pages(size); - amount = amt << SK_MEM_QUANTUM_SHIFT; + amount = amt << PAGE_SHIFT; msk->rmem_fwd_alloc += amount; if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) { if (ssk->sk_forward_alloc < amount) { @@ -972,7 +972,7 @@ static void __mptcp_mem_reclaim_partial(struct sock *sk) lockdep_assert_held_once(&sk->sk_lock.slock); - if (reclaimable > SK_MEM_QUANTUM) + if (reclaimable > (int)PAGE_SIZE) __mptcp_rmem_reclaim(sk, reclaimable - 1); sk_mem_reclaim_partial(sk); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 35928fefae33..fa500ea3a1f1 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1523,11 +1523,11 @@ static __init int sctp_init(void) limit = (sysctl_sctp_mem[1]) << (PAGE_SHIFT - 7); max_share = min(4UL*1024*1024, limit); - sysctl_sctp_rmem[0] = SK_MEM_QUANTUM; /* give each asoc 1 page min */ + sysctl_sctp_rmem[0] = PAGE_SIZE; /* give each asoc 1 page min */ sysctl_sctp_rmem[1] = 1500 * SKB_TRUESIZE(1); sysctl_sctp_rmem[2] = max(sysctl_sctp_rmem[1], max_share); - sysctl_sctp_wmem[0] = SK_MEM_QUANTUM; + sysctl_sctp_wmem[0] = PAGE_SIZE; sysctl_sctp_wmem[1] = 16*1024; sysctl_sctp_wmem[2] = max(64*1024, max_share); From 0defbb0af775ef037913786048d099bbe8b9a2c2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Jun 2022 23:34:08 -0700 Subject: [PATCH 3/7] net: add per_cpu_fw_alloc field to struct proto Each protocol having a ->memory_allocated pointer gets a corresponding per-cpu reserve, that following patches will use. Instead of having reserved bytes per socket, we want to have per-cpu reserves. Signed-off-by: Eric Dumazet Reviewed-by: Shakeel Butt Acked-by: Soheil Hassas Yeganeh Signed-off-by: Jakub Kicinski --- include/net/sock.h | 1 + include/net/tcp.h | 2 ++ include/net/udp.h | 1 + net/core/sock.c | 4 ++++ net/decnet/af_decnet.c | 4 ++++ net/ipv4/tcp.c | 2 ++ net/ipv4/tcp_ipv4.c | 3 +++ net/ipv4/udp.c | 4 ++++ net/ipv4/udplite.c | 3 +++ net/ipv6/tcp_ipv6.c | 3 +++ net/ipv6/udp.c | 3 +++ net/ipv6/udplite.c | 3 +++ net/mptcp/protocol.c | 3 +++ net/sctp/socket.c | 7 +++++++ 14 files changed, 43 insertions(+) diff --git a/include/net/sock.h b/include/net/sock.h index 298897bbfb3a..825f8cbf791f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1254,6 +1254,7 @@ struct proto { void (*enter_memory_pressure)(struct sock *sk); void (*leave_memory_pressure)(struct sock *sk); atomic_long_t *memory_allocated; /* Current allocated memory. */ + int __percpu *per_cpu_fw_alloc; struct percpu_counter *sockets_allocated; /* Current number of sockets. */ /* diff --git a/include/net/tcp.h b/include/net/tcp.h index 1e99f5c61f84..4794cae4577e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -253,6 +253,8 @@ extern long sysctl_tcp_mem[3]; #define TCP_RACK_NO_DUPTHRESH 0x4 /* Do not use DUPACK threshold in RACK */ extern atomic_long_t tcp_memory_allocated; +DECLARE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); + extern struct percpu_counter tcp_sockets_allocated; extern unsigned long tcp_memory_pressure; diff --git a/include/net/udp.h b/include/net/udp.h index b83a00330566..b60eea2e3fae 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -95,6 +95,7 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table, extern struct proto udp_prot; extern atomic_long_t udp_memory_allocated; +DECLARE_PER_CPU(int, udp_memory_per_cpu_fw_alloc); /* sysctl variables for udp */ extern long sysctl_udp_mem[3]; diff --git a/net/core/sock.c b/net/core/sock.c index 063fd7680a84..f96efc95e334 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3798,6 +3798,10 @@ int proto_register(struct proto *prot, int alloc_slab) pr_err("%s: missing sysctl_mem\n", prot->name); return -EINVAL; } + if (prot->memory_allocated && !prot->per_cpu_fw_alloc) { + pr_err("%s: missing per_cpu_fw_alloc\n", prot->name); + return -EINVAL; + } if (alloc_slab) { prot->slab = kmem_cache_create_usercopy(prot->name, prot->obj_size, 0, diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index dc92a67baea3..aa4f43f52499 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -149,6 +149,7 @@ static DEFINE_RWLOCK(dn_hash_lock); static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE]; static struct hlist_head dn_wild_sk; static atomic_long_t decnet_memory_allocated; +static DEFINE_PER_CPU(int, decnet_memory_per_cpu_fw_alloc); static int __dn_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen, int flags); @@ -454,7 +455,10 @@ static struct proto dn_proto = { .owner = THIS_MODULE, .enter_memory_pressure = dn_enter_memory_pressure, .memory_pressure = &dn_memory_pressure, + .memory_allocated = &decnet_memory_allocated, + .per_cpu_fw_alloc = &decnet_memory_per_cpu_fw_alloc, + .sysctl_mem = sysctl_decnet_mem, .sysctl_wmem = sysctl_decnet_wmem, .sysctl_rmem = sysctl_decnet_rmem, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9e696758a4c2..e6bdf8e2c09a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -294,6 +294,8 @@ EXPORT_SYMBOL(sysctl_tcp_mem); atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; /* Current allocated memory. */ EXPORT_SYMBOL(tcp_memory_allocated); +DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); +EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc); #if IS_ENABLED(CONFIG_SMC) DEFINE_STATIC_KEY_FALSE(tcp_have_smc); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fe8f23b95d32..fda811a5251f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3045,7 +3045,10 @@ struct proto tcp_prot = { .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .orphan_count = &tcp_orphan_count, + .memory_allocated = &tcp_memory_allocated, + .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, + .memory_pressure = &tcp_memory_pressure, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index bbc9970fa2e9..6172b4750a88 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -125,6 +125,8 @@ EXPORT_SYMBOL(sysctl_udp_mem); atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp; EXPORT_SYMBOL(udp_memory_allocated); +DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc); +EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc); #define MAX_UDP_PORTS 65536 #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) @@ -2946,6 +2948,8 @@ struct proto udp_prot = { .psock_update_sk_prot = udp_bpf_update_proto, #endif .memory_allocated = &udp_memory_allocated, + .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, + .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index cd1cd68adeec..6e08a76ae1e7 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -51,7 +51,10 @@ struct proto udplite_prot = { .unhash = udp_lib_unhash, .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, + .memory_allocated = &udp_memory_allocated, + .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, + .sysctl_mem = sysctl_udp_mem, .obj_size = sizeof(struct udp_sock), .h.udp_table = &udplite_table, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f37dd4aa91c6..c72448ba6dc9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2159,7 +2159,10 @@ struct proto tcpv6_prot = { .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, + .memory_allocated = &tcp_memory_allocated, + .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, + .memory_pressure = &tcp_memory_pressure, .orphan_count = &tcp_orphan_count, .sysctl_mem = sysctl_tcp_mem, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 55afd7f39c04..be074f07073a 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1740,7 +1740,10 @@ struct proto udpv6_prot = { #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = udp_bpf_update_proto, #endif + .memory_allocated = &udp_memory_allocated, + .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, + .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index fbb700d3f437..b70725856259 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -48,7 +48,10 @@ struct proto udplitev6_prot = { .unhash = udp_lib_unhash, .rehash = udp_v6_rehash, .get_port = udp_v6_get_port, + .memory_allocated = &udp_memory_allocated, + .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, + .sysctl_mem = sysctl_udp_mem, .obj_size = sizeof(struct udp6_sock), .h.udp_table = &udplite_table, diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 080a630d6902..9563124ac8af 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3437,7 +3437,10 @@ static struct proto mptcp_prot = { .get_port = mptcp_get_port, .forward_alloc_get = mptcp_forward_alloc_get, .sockets_allocated = &mptcp_sockets_allocated, + .memory_allocated = &tcp_memory_allocated, + .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, + .memory_pressure = &tcp_memory_pressure, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 6d37d2dfb3da..05174acd981a 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -93,6 +93,7 @@ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, static unsigned long sctp_memory_pressure; static atomic_long_t sctp_memory_allocated; +static DEFINE_PER_CPU(int, sctp_memory_per_cpu_fw_alloc); struct percpu_counter sctp_sockets_allocated; static void sctp_enter_memory_pressure(struct sock *sk) @@ -9657,7 +9658,10 @@ struct proto sctp_prot = { .sysctl_wmem = sysctl_sctp_wmem, .memory_pressure = &sctp_memory_pressure, .enter_memory_pressure = sctp_enter_memory_pressure, + .memory_allocated = &sctp_memory_allocated, + .per_cpu_fw_alloc = &sctp_memory_per_cpu_fw_alloc, + .sockets_allocated = &sctp_sockets_allocated, }; @@ -9700,7 +9704,10 @@ struct proto sctpv6_prot = { .sysctl_wmem = sysctl_sctp_wmem, .memory_pressure = &sctp_memory_pressure, .enter_memory_pressure = sctp_enter_memory_pressure, + .memory_allocated = &sctp_memory_allocated, + .per_cpu_fw_alloc = &sctp_memory_per_cpu_fw_alloc, + .sockets_allocated = &sctp_sockets_allocated, }; #endif /* IS_ENABLED(CONFIG_IPV6) */ From 3cd3399dd7a84ada85cb839989cdf7310e302c7d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Jun 2022 23:34:09 -0700 Subject: [PATCH 4/7] net: implement per-cpu reserves for memory_allocated We plan keeping sk->sk_forward_alloc as small as possible in future patches. This means we are going to call sk_memory_allocated_add() and sk_memory_allocated_sub() more often. Implement a per-cpu cache of +1/-1 MB, to reduce number of changes to sk->sk_prot->memory_allocated, which would otherwise be cause of false sharing. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: Shakeel Butt Signed-off-by: Jakub Kicinski --- include/net/sock.h | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 825f8cbf791f..59040fee74e7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1397,22 +1397,48 @@ static inline bool sk_under_memory_pressure(const struct sock *sk) return !!*sk->sk_prot->memory_pressure; } +static inline long +proto_memory_allocated(const struct proto *prot) +{ + return max(0L, atomic_long_read(prot->memory_allocated)); +} + static inline long sk_memory_allocated(const struct sock *sk) { - return atomic_long_read(sk->sk_prot->memory_allocated); + return proto_memory_allocated(sk->sk_prot); } +/* 1 MB per cpu, in page units */ +#define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT)) + static inline long sk_memory_allocated_add(struct sock *sk, int amt) { - return atomic_long_add_return(amt, sk->sk_prot->memory_allocated); + int local_reserve; + + preempt_disable(); + local_reserve = __this_cpu_add_return(*sk->sk_prot->per_cpu_fw_alloc, amt); + if (local_reserve >= SK_MEMORY_PCPU_RESERVE) { + __this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve); + atomic_long_add(local_reserve, sk->sk_prot->memory_allocated); + } + preempt_enable(); + return sk_memory_allocated(sk); } static inline void sk_memory_allocated_sub(struct sock *sk, int amt) { - atomic_long_sub(amt, sk->sk_prot->memory_allocated); + int local_reserve; + + preempt_disable(); + local_reserve = __this_cpu_sub_return(*sk->sk_prot->per_cpu_fw_alloc, amt); + if (local_reserve <= -SK_MEMORY_PCPU_RESERVE) { + __this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve); + atomic_long_add(local_reserve, sk->sk_prot->memory_allocated); + } + preempt_enable(); } #define SK_ALLOC_PERCPU_COUNTER_BATCH 16 @@ -1441,12 +1467,6 @@ proto_sockets_allocated_sum_positive(struct proto *prot) return percpu_counter_sum_positive(prot->sockets_allocated); } -static inline long -proto_memory_allocated(struct proto *prot) -{ - return atomic_long_read(prot->memory_allocated); -} - static inline bool proto_memory_pressure(struct proto *prot) { From 7c80b038d23e1f4c7fcc311f43f83b8c60e7fb80 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Jun 2022 23:34:10 -0700 Subject: [PATCH 5/7] net: fix sk_wmem_schedule() and sk_rmem_schedule() errors If sk->sk_forward_alloc is 150000, and we need to schedule 150001 bytes, we want to allocate 1 byte more (rounded up to one page), instead of 150001 :/ Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reviewed-by: Shakeel Butt Acked-by: Soheil Hassas Yeganeh Signed-off-by: Jakub Kicinski --- include/net/sock.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 59040fee74e7..cf288f7e9019 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1575,19 +1575,23 @@ static inline bool sk_has_account(struct sock *sk) static inline bool sk_wmem_schedule(struct sock *sk, int size) { + int delta; + if (!sk_has_account(sk)) return true; - return size <= sk->sk_forward_alloc || - __sk_mem_schedule(sk, size, SK_MEM_SEND); + delta = size - sk->sk_forward_alloc; + return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_SEND); } static inline bool sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) { + int delta; + if (!sk_has_account(sk)) return true; - return size <= sk->sk_forward_alloc || - __sk_mem_schedule(sk, size, SK_MEM_RECV) || + delta = size - sk->sk_forward_alloc; + return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) || skb_pfmemalloc(skb); } From 4890b686f4088c90432149bd6de567e621266fa2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Jun 2022 23:34:11 -0700 Subject: [PATCH 6/7] net: keep sk->sk_forward_alloc as small as possible Currently, tcp_memory_allocated can hit tcp_mem[] limits quite fast. Each TCP socket can forward allocate up to 2 MB of memory, even after flow became less active. 10,000 sockets can have reserved 20 GB of memory, and we have no shrinker in place to reclaim that. Instead of trying to reclaim the extra allocations in some places, just keep sk->sk_forward_alloc values as small as possible. This should not impact performance too much now we have per-cpu reserves: Changes to tcp_memory_allocated should not be too frequent. For sockets not using SO_RESERVE_MEM: - idle sockets (no packets in tx/rx queues) have zero forward alloc. - non idle sockets have a forward alloc smaller than one page. Note: - Removal of SK_RECLAIM_CHUNK and SK_RECLAIM_THRESHOLD is left to MPTCP maintainers as a follow up. Signed-off-by: Eric Dumazet Reviewed-by: Shakeel Butt Acked-by: Soheil Hassas Yeganeh Signed-off-by: Jakub Kicinski --- include/net/sock.h | 29 ++--------------------------- net/core/datagram.c | 3 --- net/ipv4/tcp.c | 7 ------- net/ipv4/tcp_input.c | 4 ---- net/ipv4/tcp_timer.c | 19 ++++--------------- net/iucv/af_iucv.c | 2 -- net/mptcp/protocol.c | 2 +- net/sctp/sm_statefuns.c | 2 -- net/sctp/socket.c | 5 ----- net/sctp/stream_interleave.c | 2 -- net/sctp/ulpqueue.c | 4 ---- 11 files changed, 7 insertions(+), 72 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index cf288f7e9019..0063e8410a4e 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1627,19 +1627,6 @@ static inline void sk_mem_reclaim_final(struct sock *sk) sk_mem_reclaim(sk); } -static inline void sk_mem_reclaim_partial(struct sock *sk) -{ - int reclaimable; - - if (!sk_has_account(sk)) - return; - - reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk); - - if (reclaimable > (int)PAGE_SIZE) - __sk_mem_reclaim(sk, reclaimable - 1); -} - static inline void sk_mem_charge(struct sock *sk, int size) { if (!sk_has_account(sk)) @@ -1647,29 +1634,17 @@ static inline void sk_mem_charge(struct sock *sk, int size) sk->sk_forward_alloc -= size; } -/* the following macros control memory reclaiming in sk_mem_uncharge() +/* the following macros control memory reclaiming in mptcp_rmem_uncharge() */ #define SK_RECLAIM_THRESHOLD (1 << 21) #define SK_RECLAIM_CHUNK (1 << 20) static inline void sk_mem_uncharge(struct sock *sk, int size) { - int reclaimable; - if (!sk_has_account(sk)) return; sk->sk_forward_alloc += size; - reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk); - - /* Avoid a possible overflow. - * TCP send queues can make this happen, if sk_mem_reclaim() - * is not called and more than 2 GBytes are released at once. - * - * If we reach 2 MBytes, reclaim 1 MBytes right now, there is - * no need to hold that much forward allocation anyway. - */ - if (unlikely(reclaimable >= SK_RECLAIM_THRESHOLD)) - __sk_mem_reclaim(sk, SK_RECLAIM_CHUNK); + sk_mem_reclaim(sk); } /* diff --git a/net/core/datagram.c b/net/core/datagram.c index 50f4faeea76c..35791f86bd1a 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -320,7 +320,6 @@ EXPORT_SYMBOL(skb_recv_datagram); void skb_free_datagram(struct sock *sk, struct sk_buff *skb) { consume_skb(skb); - sk_mem_reclaim_partial(sk); } EXPORT_SYMBOL(skb_free_datagram); @@ -336,7 +335,6 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len) slow = lock_sock_fast(sk); sk_peek_offset_bwd(sk, len); skb_orphan(skb); - sk_mem_reclaim_partial(sk); unlock_sock_fast(sk, slow); /* skb is now orphaned, can be freed outside of locked section */ @@ -396,7 +394,6 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) NULL); kfree_skb(skb); - sk_mem_reclaim_partial(sk); return err; } EXPORT_SYMBOL(skb_kill_datagram); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e6bdf8e2c09a..14ebb4ec4a51 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -858,9 +858,6 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, { struct sk_buff *skb; - if (unlikely(tcp_under_memory_pressure(sk))) - sk_mem_reclaim_partial(sk); - skb = alloc_skb_fclone(size + MAX_TCP_HEADER, gfp); if (likely(skb)) { bool mem_scheduled; @@ -2764,8 +2761,6 @@ void __tcp_close(struct sock *sk, long timeout) __kfree_skb(skb); } - sk_mem_reclaim(sk); - /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */ if (sk->sk_state == TCP_CLOSE) goto adjudge_to_death; @@ -2873,7 +2868,6 @@ adjudge_to_death: } } if (sk->sk_state != TCP_CLOSE) { - sk_mem_reclaim(sk); if (tcp_check_oom(sk, 0)) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); @@ -2951,7 +2945,6 @@ void tcp_write_queue_purge(struct sock *sk) } tcp_rtx_queue_purge(sk); INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); - sk_mem_reclaim(sk); tcp_clear_all_retrans_hints(tcp_sk(sk)); tcp_sk(sk)->packets_out = 0; inet_csk(sk)->icsk_backoff = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3fb117022558..fdc7beb81b68 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -805,7 +805,6 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) * restart window, so that we send ACKs quickly. */ tcp_incr_quickack(sk, TCP_MAX_QUICKACKS); - sk_mem_reclaim(sk); } } icsk->icsk_ack.lrcvtime = now; @@ -4390,7 +4389,6 @@ void tcp_fin(struct sock *sk) skb_rbtree_purge(&tp->out_of_order_queue); if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); - sk_mem_reclaim(sk); if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); @@ -5336,7 +5334,6 @@ static bool tcp_prune_ofo_queue(struct sock *sk) tcp_drop_reason(sk, rb_to_skb(node), SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE); if (!prev || goal <= 0) { - sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !tcp_under_memory_pressure(sk)) break; @@ -5383,7 +5380,6 @@ static int tcp_prune_queue(struct sock *sk) skb_peek(&sk->sk_receive_queue), NULL, tp->copied_seq, tp->rcv_nxt); - sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 20cf4a98c69d..2208755e8efc 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -290,15 +290,13 @@ void tcp_delack_timer_handler(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - sk_mem_reclaim_partial(sk); - if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) - goto out; + return; if (time_after(icsk->icsk_ack.timeout, jiffies)) { sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); - goto out; + return; } icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; @@ -317,10 +315,6 @@ void tcp_delack_timer_handler(struct sock *sk) tcp_send_ack(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); } - -out: - if (tcp_under_memory_pressure(sk)) - sk_mem_reclaim(sk); } @@ -600,11 +594,11 @@ void tcp_write_timer_handler(struct sock *sk) if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || !icsk->icsk_pending) - goto out; + return; if (time_after(icsk->icsk_timeout, jiffies)) { sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); - goto out; + return; } tcp_mstamp_refresh(tcp_sk(sk)); @@ -626,9 +620,6 @@ void tcp_write_timer_handler(struct sock *sk) tcp_probe_timer(sk); break; } - -out: - sk_mem_reclaim(sk); } static void tcp_write_timer(struct timer_list *t) @@ -743,8 +734,6 @@ static void tcp_keepalive_timer (struct timer_list *t) elapsed = keepalive_time_when(tp) - elapsed; } - sk_mem_reclaim(sk); - resched: inet_csk_reset_keepalive_timer (sk, elapsed); goto out; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index a0385ddbffcf..498a0c35b7bb 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -278,8 +278,6 @@ static void iucv_sock_destruct(struct sock *sk) skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_error_queue); - sk_mem_reclaim(sk); - if (!sock_flag(sk, SOCK_DEAD)) { pr_err("Attempt to release alive iucv socket %p\n", sk); return; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 9563124ac8af..e0fb9f96c45c 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -975,7 +975,7 @@ static void __mptcp_mem_reclaim_partial(struct sock *sk) if (reclaimable > (int)PAGE_SIZE) __mptcp_rmem_reclaim(sk, reclaimable - 1); - sk_mem_reclaim_partial(sk); + sk_mem_reclaim(sk); } static void mptcp_mem_reclaim_partial(struct sock *sk) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 52edee1322fc..f6ee7f4040c1 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -6590,8 +6590,6 @@ static int sctp_eat_data(const struct sctp_association *asoc, pr_debug("%s: under pressure, reneging for tsn:%u\n", __func__, tsn); deliver = SCTP_CMD_RENEGE; - } else { - sk_mem_reclaim(sk); } } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 05174acd981a..171f1a35d205 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1824,9 +1824,6 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, if (sctp_wspace(asoc) < (int)msg_len) sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc)); - if (sk_under_memory_pressure(sk)) - sk_mem_reclaim(sk); - if (sctp_wspace(asoc) <= 0 || !sk_wmem_schedule(sk, msg_len)) { timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); @@ -9195,8 +9192,6 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, goto do_error; if (signal_pending(current)) goto do_interrupted; - if (sk_under_memory_pressure(sk)) - sk_mem_reclaim(sk); if ((int)msg_len <= sctp_wspace(asoc) && sk_wmem_schedule(sk, msg_len)) break; diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c index 6b13f737ebf2..bb22b71df7a3 100644 --- a/net/sctp/stream_interleave.c +++ b/net/sctp/stream_interleave.c @@ -979,8 +979,6 @@ static void sctp_renege_events(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, if (freed >= needed && sctp_ulpevent_idata(ulpq, chunk, gfp) <= 0) sctp_intl_start_pd(ulpq, gfp); - - sk_mem_reclaim(asoc->base.sk); } static void sctp_intl_stream_abort_pd(struct sctp_ulpq *ulpq, __u16 sid, diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 407fed46931b..0a8510a0c5e6 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -1100,12 +1100,8 @@ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, else if (retval == 1) sctp_ulpq_reasm_drain(ulpq); } - - sk_mem_reclaim(asoc->base.sk); } - - /* Notify the application if an association is aborted and in * partial delivery mode. Send up any pending received messages. */ From 0f2c2693988aeeb4c83a581fe58a28d526eecd39 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Jun 2022 23:34:12 -0700 Subject: [PATCH 7/7] net: unexport __sk_mem_{raise|reduce}_allocated These two helpers are only used from core networking. Signed-off-by: Eric Dumazet Reviewed-by: Shakeel Butt Acked-by: Soheil Hassas Yeganeh Signed-off-by: Jakub Kicinski --- net/core/sock.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index f96efc95e334..697d5c8e2f0d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2987,7 +2987,6 @@ suppress_allocation: return 0; } -EXPORT_SYMBOL(__sk_mem_raise_allocated); /** * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated @@ -3029,7 +3028,6 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount) (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); } -EXPORT_SYMBOL(__sk_mem_reduce_allocated); /** * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated