From d507204d3c5cc57d9a8bdf0a477615bb59ea1611 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Jan 2022 12:24:52 -0800 Subject: [PATCH 1/6] tcp/dccp: add tw->tw_bslot We want to allow inet_twsk_kill() working even if netns has been dismantled/freed, to get rid of inet_twsk_purge(). This patch adds tw->tw_bslot to cache the bind bucket slot so that inet_twsk_kill() no longer needs to dereference twsk_net(tw) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_timewait_sock.h | 1 + net/ipv4/inet_timewait_sock.c | 11 +++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index dfd919b3119e..c221fe2b77dd 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -72,6 +72,7 @@ struct inet_timewait_sock { tw_tos : 8; u32 tw_txhash; u32 tw_priority; + u32 tw_bslot; /* bind bucket slot */ struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; }; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 437afe392e66..6e8f4a6cd222 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -52,8 +52,7 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw) spin_unlock(lock); /* Disassociate with bind bucket. */ - bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num, - hashinfo->bhash_size)]; + bhead = &hashinfo->bhash[tw->tw_bslot]; spin_lock(&bhead->lock); inet_twsk_bind_unhash(tw, hashinfo); @@ -110,8 +109,12 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, Note, that any socket with inet->num != 0 MUST be bound in binding cache, even if it is closed. */ - bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, - hashinfo->bhash_size)]; + /* Cache inet_bhashfn(), because 'struct net' might be no longer + * available later in inet_twsk_kill(). + */ + tw->tw_bslot = inet_bhashfn(twsk_net(tw), inet->inet_num, + hashinfo->bhash_size); + bhead = &hashinfo->bhash[tw->tw_bslot]; spin_lock(&bhead->lock); tw->tw_tb = icsk->icsk_bind_hash; WARN_ON(!icsk->icsk_bind_hash); From 27dd35e02235902933626469a1c9198612d72564 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Jan 2022 12:24:53 -0800 Subject: [PATCH 2/6] tcp/dccp: no longer use twsk_net(tw) from tw_timer_handler() We will soon get rid of inet_twsk_purge(). This means that tw_timer_handler() might fire after a netns has been dismantled/freed. Instead of adding a function (and data structure) to find a netns from tw->tw_net_cookie, just update the SNMP counters a bit earlier, when the netns is known to be alive. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_timewait_sock.h | 5 ++--- net/ipv4/inet_timewait_sock.c | 9 ++++----- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index c221fe2b77dd..b323db969b8b 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -65,10 +65,9 @@ struct inet_timewait_sock { /* these three are in inet_sock */ __be16 tw_sport; /* And these are ours. */ - unsigned int tw_kill : 1, - tw_transparent : 1, + unsigned int tw_transparent : 1, tw_flowlabel : 20, - tw_pad : 2, /* 2 bits hole */ + tw_pad : 3, /* 3 bits hole */ tw_tos : 8; u32 tw_txhash; u32 tw_priority; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 6e8f4a6cd222..e37e4852711c 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -148,10 +148,6 @@ static void tw_timer_handler(struct timer_list *t) { struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer); - if (tw->tw_kill) - __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); - else - __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITED); inet_twsk_kill(tw); } @@ -247,8 +243,11 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) * of PAWS. */ - tw->tw_kill = timeo <= 4*HZ; if (!rearm) { + bool kill = timeo <= 4*HZ; + + __NET_INC_STATS(twsk_net(tw), kill ? LINUX_MIB_TIMEWAITKILLED : + LINUX_MIB_TIMEWAITED); BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo)); atomic_inc(&tw->tw_dr->tw_count); } else { From 0dad4087a86a2cbe177404dc73f18ada26a2c390 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Jan 2022 12:24:54 -0800 Subject: [PATCH 3/6] tcp/dccp: get rid of inet_twsk_purge() Prior patches in the series made sure tw_timer_handler() can be fired after netns has been dismantled/freed. We no longer have to scan a potentially big TCP ehash table at netns dismantle. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_timewait_sock.h | 2 -- net/dccp/ipv4.c | 6 ---- net/dccp/ipv6.c | 6 ---- net/ipv4/inet_timewait_sock.c | 47 -------------------------------- net/ipv4/tcp_ipv4.c | 2 -- net/ipv6/tcp_ipv6.c | 6 ---- 6 files changed, 69 deletions(-) diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index b323db969b8b..463ae5d33eb0 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -110,8 +110,6 @@ static inline void inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo void inet_twsk_deschedule_put(struct inet_timewait_sock *tw); -void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family); - static inline struct net *twsk_net(const struct inet_timewait_sock *twsk) { diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 0ea29270d7e5..ae662567a6cb 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -1030,15 +1030,9 @@ static void __net_exit dccp_v4_exit_net(struct net *net) inet_ctl_sock_destroy(pn->v4_ctl_sk); } -static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list) -{ - inet_twsk_purge(&dccp_hashinfo, AF_INET); -} - static struct pernet_operations dccp_v4_ops = { .init = dccp_v4_init_net, .exit = dccp_v4_exit_net, - .exit_batch = dccp_v4_exit_batch, .id = &dccp_v4_pernet_id, .size = sizeof(struct dccp_v4_pernet), }; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index fa663518fa0e..eab3bd1ee9a0 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1115,15 +1115,9 @@ static void __net_exit dccp_v6_exit_net(struct net *net) inet_ctl_sock_destroy(pn->v6_ctl_sk); } -static void __net_exit dccp_v6_exit_batch(struct list_head *net_exit_list) -{ - inet_twsk_purge(&dccp_hashinfo, AF_INET6); -} - static struct pernet_operations dccp_v6_ops = { .init = dccp_v6_init_net, .exit = dccp_v6_exit_net, - .exit_batch = dccp_v6_exit_batch, .id = &dccp_v6_pernet_id, .size = sizeof(struct dccp_v6_pernet), }; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index e37e4852711c..71808c7a7025 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -255,50 +255,3 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) } } EXPORT_SYMBOL_GPL(__inet_twsk_schedule); - -void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family) -{ - struct inet_timewait_sock *tw; - struct sock *sk; - struct hlist_nulls_node *node; - unsigned int slot; - - for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { - struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; -restart_rcu: - cond_resched(); - rcu_read_lock(); -restart: - sk_nulls_for_each_rcu(sk, node, &head->chain) { - if (sk->sk_state != TCP_TIME_WAIT) - continue; - tw = inet_twsk(sk); - if ((tw->tw_family != family) || - refcount_read(&twsk_net(tw)->ns.count)) - continue; - - if (unlikely(!refcount_inc_not_zero(&tw->tw_refcnt))) - continue; - - if (unlikely((tw->tw_family != family) || - refcount_read(&twsk_net(tw)->ns.count))) { - inet_twsk_put(tw); - goto restart; - } - - rcu_read_unlock(); - local_bh_disable(); - inet_twsk_deschedule_put(tw); - local_bh_enable(); - goto restart_rcu; - } - /* If the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != slot) - goto restart; - rcu_read_unlock(); - } -} -EXPORT_SYMBOL_GPL(inet_twsk_purge); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b3f34e366b27..8e94b9988204 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3239,8 +3239,6 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { struct net *net; - inet_twsk_purge(&tcp_hashinfo, AF_INET); - list_for_each_entry(net, net_exit_list, exit_list) tcp_fastopen_ctx_destroy(net); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 075ee8a2df3b..1e55ee98dfed 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2237,15 +2237,9 @@ static void __net_exit tcpv6_net_exit(struct net *net) inet_ctl_sock_destroy(net->ipv6.tcp_sk); } -static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list) -{ - inet_twsk_purge(&tcp_hashinfo, AF_INET6); -} - static struct pernet_operations tcpv6_net_ops = { .init = tcpv6_net_init, .exit = tcpv6_net_exit, - .exit_batch = tcpv6_net_exit_batch, }; int __init tcpv6_init(void) From a15c89c703d43490ea68ea4516553d4ea4f6b1e1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Jan 2022 12:24:55 -0800 Subject: [PATCH 4/6] ipv4: do not use per netns icmp sockets Back in linux-2.6.25 (commit 4a6ad7a141cb "[NETNS]: Make icmp_sk per namespace."), we added private per-cpu/per-netns ipv4 icmp sockets. This adds memory and cpu costs, which do not seem needed. Now typical servers have 256 or more cores, this adds considerable tax to netns users. icmp sockets are used from BH context, are not receiving packets, and do not store any persistent state but the 'struct net' pointer. icmp_xmit_lock() already makes sure to lock the chosen per-cpu socket. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 - net/ipv4/icmp.c | 91 ++++++++++++++-------------------------- 2 files changed, 31 insertions(+), 61 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 78557643526e..639a31638159 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -70,7 +70,6 @@ struct netns_ipv4 { struct hlist_head *fib_table_hash; struct sock *fibnl; - struct sock * __percpu *icmp_sk; struct sock *mc_autojoin_sk; struct inet_peer_base *peers; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index b7e277d8a84d..72a375c7f417 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -192,24 +192,14 @@ struct icmp_control { static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; -/* - * The ICMP socket(s). This is the most convenient way to flow control - * our ICMP output as well as maintain a clean interface throughout - * all layers. All Socketless IP sends will soon be gone. - * - * On SMP we have one ICMP socket per-cpu. - */ -static struct sock *icmp_sk(struct net *net) -{ - return this_cpu_read(*net->ipv4.icmp_sk); -} +static DEFINE_PER_CPU(struct sock *, ipv4_icmp_sk); /* Called with BH disabled */ static inline struct sock *icmp_xmit_lock(struct net *net) { struct sock *sk; - sk = icmp_sk(net); + sk = this_cpu_read(ipv4_icmp_sk); if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { /* This can happen if the output path signals a @@ -217,11 +207,13 @@ static inline struct sock *icmp_xmit_lock(struct net *net) */ return NULL; } + sock_net_set(sk, net); return sk; } static inline void icmp_xmit_unlock(struct sock *sk) { + sock_net_set(sk, &init_net); spin_unlock(&sk->sk_lock.slock); } @@ -363,14 +355,13 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd, return 0; } -static void icmp_push_reply(struct icmp_bxm *icmp_param, +static void icmp_push_reply(struct sock *sk, + struct icmp_bxm *icmp_param, struct flowi4 *fl4, struct ipcm_cookie *ipc, struct rtable **rt) { - struct sock *sk; struct sk_buff *skb; - sk = icmp_sk(dev_net((*rt)->dst.dev)); if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param, icmp_param->data_len+icmp_param->head_len, icmp_param->head_len, @@ -452,7 +443,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) if (IS_ERR(rt)) goto out_unlock; if (icmpv4_xrlim_allow(net, rt, &fl4, type, code)) - icmp_push_reply(icmp_param, &fl4, &ipc, &rt); + icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt); ip_rt_put(rt); out_unlock: icmp_xmit_unlock(sk); @@ -766,7 +757,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, if (!fl4.saddr) fl4.saddr = htonl(INADDR_DUMMY); - icmp_push_reply(&icmp_param, &fl4, &ipc, &rt); + icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt); ende: ip_rt_put(rt); out_unlock: @@ -1434,46 +1425,8 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { }, }; -static void __net_exit icmp_sk_exit(struct net *net) -{ - int i; - - for_each_possible_cpu(i) - inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.icmp_sk, i)); - free_percpu(net->ipv4.icmp_sk); - net->ipv4.icmp_sk = NULL; -} - static int __net_init icmp_sk_init(struct net *net) { - int i, err; - - net->ipv4.icmp_sk = alloc_percpu(struct sock *); - if (!net->ipv4.icmp_sk) - return -ENOMEM; - - for_each_possible_cpu(i) { - struct sock *sk; - - err = inet_ctl_sock_create(&sk, PF_INET, - SOCK_RAW, IPPROTO_ICMP, net); - if (err < 0) - goto fail; - - *per_cpu_ptr(net->ipv4.icmp_sk, i) = sk; - - /* Enough space for 2 64K ICMP packets, including - * sk_buff/skb_shared_info struct overhead. - */ - sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024); - - /* - * Speedup sock_wfree() - */ - sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); - inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT; - } - /* Control parameters for ECHO replies. */ net->ipv4.sysctl_icmp_echo_ignore_all = 0; net->ipv4.sysctl_icmp_echo_enable_probe = 0; @@ -1499,18 +1452,36 @@ static int __net_init icmp_sk_init(struct net *net) net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0; return 0; - -fail: - icmp_sk_exit(net); - return err; } static struct pernet_operations __net_initdata icmp_sk_ops = { .init = icmp_sk_init, - .exit = icmp_sk_exit, }; int __init icmp_init(void) { + int err, i; + + for_each_possible_cpu(i) { + struct sock *sk; + + err = inet_ctl_sock_create(&sk, PF_INET, + SOCK_RAW, IPPROTO_ICMP, &init_net); + if (err < 0) + return err; + + per_cpu(ipv4_icmp_sk, i) = sk; + + /* Enough space for 2 64K ICMP packets, including + * sk_buff/skb_shared_info struct overhead. + */ + sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024); + + /* + * Speedup sock_wfree() + */ + sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT; + } return register_pernet_subsys(&icmp_sk_ops); } From 6a17b961ec19cd61ca646a6655ab93e8f6fe15c0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Jan 2022 12:24:56 -0800 Subject: [PATCH 5/6] ipv6: do not use per netns icmp sockets Back in linux-2.6.25 (commit 98c6d1b261e7 "[NETNS]: Make icmpv6_sk per namespace.", we added private per-cpu/per-netns ipv6 icmp sockets. This adds memory and cpu costs, which do not seem needed. Now typical servers have 256 or more cores, this adds considerable tax to netns users. icmp sockets are used from BH context, are not receiving packets, and do not store any persistent state but the 'struct net' pointer. icmpv6_xmit_lock() already makes sure to lock the chosen per-cpu socket. This patch has a considerable impact on the number of netns that the worker thread in cleanup_net() can dismantle per second, because ip6mr_sk_done() is no longer called, meaning we no longer acquire the rtnl mutex, competing with other threads adding new netns. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 1 - net/ipv6/icmp.c | 62 +++++++--------------------------------- 2 files changed, 10 insertions(+), 53 deletions(-) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index a4b550380316..30cdfc4e1615 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -88,7 +88,6 @@ struct netns_ipv6 { struct fib6_table *fib6_local_tbl; struct fib_rules_ops *fib6_rules_ops; #endif - struct sock * __percpu *icmp_sk; struct sock *ndisc_sk; struct sock *tcp_sk; struct sock *igmp_sk; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 96c5cc0f30ce..e6b978ea0e87 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -69,17 +69,7 @@ #include -/* - * The ICMP socket(s). This is the most convenient way to flow control - * our ICMP output as well as maintain a clean interface throughout - * all layers. All Socketless IP sends will soon be gone. - * - * On SMP we have one ICMP socket per-cpu. - */ -static struct sock *icmpv6_sk(struct net *net) -{ - return this_cpu_read(*net->ipv6.icmp_sk); -} +static DEFINE_PER_CPU(struct sock *, ipv6_icmp_sk); static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) @@ -110,11 +100,11 @@ static const struct inet6_protocol icmpv6_protocol = { }; /* Called with BH disabled */ -static __inline__ struct sock *icmpv6_xmit_lock(struct net *net) +static struct sock *icmpv6_xmit_lock(struct net *net) { struct sock *sk; - sk = icmpv6_sk(net); + sk = this_cpu_read(ipv6_icmp_sk); if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { /* This can happen if the output path (f.e. SIT or * ip6ip6 tunnel) signals dst_link_failure() for an @@ -122,11 +112,13 @@ static __inline__ struct sock *icmpv6_xmit_lock(struct net *net) */ return NULL; } + sock_net_set(sk, net); return sk; } -static __inline__ void icmpv6_xmit_unlock(struct sock *sk) +static void icmpv6_xmit_unlock(struct sock *sk) { + sock_net_set(sk, &init_net); spin_unlock(&sk->sk_lock.slock); } @@ -1034,59 +1026,27 @@ void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6, security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); } -static void __net_exit icmpv6_sk_exit(struct net *net) -{ - int i; - - for_each_possible_cpu(i) - inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv6.icmp_sk, i)); - free_percpu(net->ipv6.icmp_sk); -} - -static int __net_init icmpv6_sk_init(struct net *net) +int __init icmpv6_init(void) { struct sock *sk; int err, i; - net->ipv6.icmp_sk = alloc_percpu(struct sock *); - if (!net->ipv6.icmp_sk) - return -ENOMEM; - for_each_possible_cpu(i) { err = inet_ctl_sock_create(&sk, PF_INET6, - SOCK_RAW, IPPROTO_ICMPV6, net); + SOCK_RAW, IPPROTO_ICMPV6, &init_net); if (err < 0) { pr_err("Failed to initialize the ICMP6 control socket (err %d)\n", err); - goto fail; + return err; } - *per_cpu_ptr(net->ipv6.icmp_sk, i) = sk; + per_cpu(ipv6_icmp_sk, i) = sk; /* Enough space for 2 64K ICMP packets, including * sk_buff struct overhead. */ sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024); } - return 0; - - fail: - icmpv6_sk_exit(net); - return err; -} - -static struct pernet_operations icmpv6_sk_ops = { - .init = icmpv6_sk_init, - .exit = icmpv6_sk_exit, -}; - -int __init icmpv6_init(void) -{ - int err; - - err = register_pernet_subsys(&icmpv6_sk_ops); - if (err < 0) - return err; err = -EAGAIN; if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0) @@ -1101,14 +1061,12 @@ sender_reg_err: inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6); fail: pr_err("Failed to register ICMP6 protocol\n"); - unregister_pernet_subsys(&icmpv6_sk_ops); return err; } void icmpv6_cleanup(void) { inet6_unregister_icmp_sender(icmp6_send); - unregister_pernet_subsys(&icmpv6_sk_ops); inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6); } From 37ba017dcc3b1123206808979834655ddcf93251 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Jan 2022 12:24:57 -0800 Subject: [PATCH 6/6] ipv4/tcp: do not use per netns ctl sockets TCP ipv4 uses per-cpu/per-netns ctl sockets in order to send RST and some ACK packets (on behalf of TIMEWAIT sockets). This adds memory and cpu costs, which do not seem needed. Now typical servers have 256 or more cores, this adds considerable tax to netns users. tcp sockets are used from BH context, are not receiving packets, and do not store any persistent state but the 'struct net' pointer in order to be able to use IPv4 output functions. Note that I attempted a related change in the past, that had to be hot-fixed in commit bdbbb8527b6f ("ipv4: tcp: get rid of ugly unicast_sock") This patch could very well surface old bugs, on layers not taking care of sk->sk_kern_sock properly. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 - net/ipv4/tcp_ipv4.c | 61 ++++++++++++++++++---------------------- 2 files changed, 27 insertions(+), 35 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 639a31638159..22b4c6df1d2b 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -73,7 +73,6 @@ struct netns_ipv4 { struct sock *mc_autojoin_sk; struct inet_peer_base *peers; - struct sock * __percpu *tcp_sk; struct fqdir *fqdir; u8 sysctl_icmp_echo_ignore_all; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8e94b9988204..a7d83ceea420 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -91,6 +91,8 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, struct inet_hashinfo tcp_hashinfo; EXPORT_SYMBOL(tcp_hashinfo); +static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk); + static u32 tcp_v4_init_seq(const struct sk_buff *skb) { return secure_tcp_seq(ip_hdr(skb)->daddr, @@ -810,7 +812,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) arg.tos = ip_hdr(skb)->tos; arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); local_bh_disable(); - ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); + ctl_sk = this_cpu_read(ipv4_tcp_sk); + sock_net_set(ctl_sk, net); if (sk) { ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_mark : sk->sk_mark; @@ -825,6 +828,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) transmit_time); ctl_sk->sk_mark = 0; + sock_net_set(ctl_sk, &init_net); __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); local_bh_enable(); @@ -908,7 +912,8 @@ static void tcp_v4_send_ack(const struct sock *sk, arg.tos = tos; arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); local_bh_disable(); - ctl_sk = this_cpu_read(*net->ipv4.tcp_sk); + ctl_sk = this_cpu_read(ipv4_tcp_sk); + sock_net_set(ctl_sk, net); ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_mark : sk->sk_mark; ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? @@ -921,6 +926,7 @@ static void tcp_v4_send_ack(const struct sock *sk, transmit_time); ctl_sk->sk_mark = 0; + sock_net_set(ctl_sk, &init_net); __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); local_bh_enable(); } @@ -3111,41 +3117,14 @@ EXPORT_SYMBOL(tcp_prot); static void __net_exit tcp_sk_exit(struct net *net) { - int cpu; - if (net->ipv4.tcp_congestion_control) bpf_module_put(net->ipv4.tcp_congestion_control, net->ipv4.tcp_congestion_control->owner); - - for_each_possible_cpu(cpu) - inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); - free_percpu(net->ipv4.tcp_sk); } static int __net_init tcp_sk_init(struct net *net) { - int res, cpu, cnt; - - net->ipv4.tcp_sk = alloc_percpu(struct sock *); - if (!net->ipv4.tcp_sk) - return -ENOMEM; - - for_each_possible_cpu(cpu) { - struct sock *sk; - - res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, - IPPROTO_TCP, net); - if (res) - goto fail; - sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); - - /* Please enforce IP_DF and IPID==0 for RST and - * ACK sent in SYN-RECV and TIME-WAIT state. - */ - inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; - - *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; - } + int cnt; net->ipv4.sysctl_tcp_ecn = 2; net->ipv4.sysctl_tcp_ecn_fallback = 1; @@ -3229,10 +3208,6 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.tcp_congestion_control = &tcp_reno; return 0; -fail: - tcp_sk_exit(net); - - return res; } static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) @@ -3324,6 +3299,24 @@ static void __init bpf_iter_register(void) void __init tcp_v4_init(void) { + int cpu, res; + + for_each_possible_cpu(cpu) { + struct sock *sk; + + res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, + IPPROTO_TCP, &init_net); + if (res) + panic("Failed to create the TCP control socket.\n"); + sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + + /* Please enforce IP_DF and IPID==0 for RST and + * ACK sent in SYN-RECV and TIME-WAIT state. + */ + inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; + + per_cpu(ipv4_tcp_sk, cpu) = sk; + } if (register_pernet_subsys(&tcp_sk_ops)) panic("Failed to create the TCP control socket.\n");