Having two ring buffers per-peer means that every peer results in two massive ring allocations. On an 8-core x86_64 machine, this commit reduces the per-peer allocation from 18,688 bytes to 1,856 bytes, which is a 90% reduction. Ninety percent! With some single-machine deployments approaching 500,000 peers, we're talking about a reduction from 7 gigs of memory down to 700 megs of memory.

In order to get rid of these per-peer allocations, this commit switches to using a list-based queueing approach. Currently GSO fragments are chained together using the skb->next pointer (the skb_list_* singly linked list approach), so we form the per-peer queue around the unused skb->prev pointer (which sort of makes sense because the links are pointing backwards). Use of skb_queue_* is not possible here, because that is based on doubly linked lists and spinlocks. Multiple cores can write into the queue at any given time, because its writes occur in the start_xmit path or in the udp_recv path. But reads happen in a single workqueue item per-peer, amounting to a multi-producer, single-consumer paradigm.

The MPSC queue is implemented locklessly and never blocks. However, it is not linearizable (though it is serializable), with a very tight and unlikely race on writes, which, when hit (some tiny fraction of the 0.15% of partial adds on a fully loaded 16-core x86_64 system), causes the queue reader to terminate early. However, because every packet sent queues up the same workqueue item after it is fully added, the worker resumes again, and stopping early isn't actually a problem, since at that point the packet wouldn't have yet been added to the encryption queue. These properties allow us to avoid disabling interrupts or spinning. The design is based on Dmitry Vyukov's algorithm [1].

Performance-wise, ordinarily list-based queues aren't preferable to ring buffers, because of cache misses when following pointers around. However, we *already* have to follow the adjacent pointers when working through fragments, so there shouldn't actually be any change there. A potential downside is that dequeueing is a bit more complicated, but the ptr_ring structure used prior had a spinlock when dequeueing, so all in all the difference appears to be a wash.

Actually, from profiling, the biggest performance hit, by far, of this commit winds up being atomic_add_unless(count, 1, max) and atomic_dec(count), which account for the majority of CPU time, according to perf. In that sense, the previous ring buffer was superior in that it could check if it was full by head == tail, which the list-based approach cannot do. But all in all, this enables us to get massive memory savings, allowing WireGuard to scale for real world deployments, without taking much of a performance hit.

[1] http://www.1024cores.net/home/lock-free-algorithms/queues/intrusive-mpsc-node-based-queue

Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
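For readers unfamiliar with Vyukov's intrusive MPSC queue [1], here is a minimal userspace sketch of the algorithm using C11 atomics. It is not the code added by this commit: identifiers such as mpsc_queue, mpsc_node, __mpsc_enqueue, and MPSC_MAX_QUEUED are made up for illustration, the real queue threads its links through the otherwise-unused skb->prev pointer rather than a separate node struct, and the count field merely stands in for the atomic_add_unless(count, 1, max) / atomic_dec(count) pair discussed above.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

#define MPSC_MAX_QUEUED 1024	/* arbitrary bound for this sketch */

struct mpsc_node {
	_Atomic(struct mpsc_node *) next;
};

struct mpsc_queue {
	_Atomic(struct mpsc_node *) head;	/* producers swap this in */
	struct mpsc_node *tail;			/* consumer-private cursor */
	struct mpsc_node stub;			/* sentinel, never handed out */
	atomic_int count;			/* bounded-length bookkeeping */
};

static void mpsc_init(struct mpsc_queue *q)
{
	atomic_init(&q->stub.next, NULL);
	atomic_init(&q->head, &q->stub);
	q->tail = &q->stub;
	atomic_init(&q->count, 0);
}

/* Unbounded push, also used by the consumer to re-insert the stub. */
static void __mpsc_enqueue(struct mpsc_queue *q, struct mpsc_node *n)
{
	struct mpsc_node *prev;

	atomic_store(&n->next, NULL);
	prev = atomic_exchange(&q->head, n);
	/* Until this store lands, the chain from tail is momentarily cut;
	 * a concurrent dequeue simply reports "empty" and tries again the
	 * next time the consumer is scheduled.
	 */
	atomic_store(&prev->next, n);
}

/* Producer-side push with the atomic_add_unless()-style length bound. */
static bool mpsc_enqueue(struct mpsc_queue *q, struct mpsc_node *n)
{
	int c = atomic_load(&q->count);

	do {
		if (c >= MPSC_MAX_QUEUED)
			return false;
	} while (!atomic_compare_exchange_weak(&q->count, &c, c + 1));
	__mpsc_enqueue(q, n);
	return true;
}

/* Single-consumer pop. NULL means empty *or* a producer is mid-publish;
 * either way the consumer just runs again later.
 */
static struct mpsc_node *mpsc_dequeue(struct mpsc_queue *q)
{
	struct mpsc_node *tail = q->tail;
	struct mpsc_node *next = atomic_load(&tail->next);

	if (tail == &q->stub) {			/* skip the sentinel */
		if (!next)
			return NULL;
		q->tail = next;
		tail = next;
		next = atomic_load(&tail->next);
	}
	if (next)
		goto advance;
	if (tail != atomic_load(&q->head))
		return NULL;			/* enqueue in progress */
	__mpsc_enqueue(q, &q->stub);		/* park the stub behind the last node */
	next = atomic_load(&tail->next);
	if (!next)
		return NULL;
advance:
	q->tail = next;
	atomic_fetch_sub(&q->count, 1);		/* the atomic_dec() counterpart */
	return tail;
}

Note how mpsc_dequeue() returning NULL on a half-finished publish is harmless for exactly the reason given above: the producer that created the gap schedules the per-peer work item again once its add completes, so nothing is stranded in the queue.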
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#ifndef _WG_PEER_H
#define _WG_PEER_H

#include "device.h"
#include "noise.h"
#include "cookie.h"

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/spinlock.h>
#include <linux/kref.h>
#include <net/dst_cache.h>

struct wg_device;

struct endpoint {
	union {
		struct sockaddr addr;
		struct sockaddr_in addr4;
		struct sockaddr_in6 addr6;
	};
	union {
		struct {
			struct in_addr src4;
			/* Essentially the same as addr6->scope_id */
			int src_if4;
		};
		struct in6_addr src6;
	};
};

struct wg_peer {
	struct wg_device *device;
	struct prev_queue tx_queue, rx_queue;
	struct sk_buff_head staged_packet_queue;
	int serial_work_cpu;
	bool is_dead;
	struct noise_keypairs keypairs;
	struct endpoint endpoint;
	struct dst_cache endpoint_cache;
	rwlock_t endpoint_lock;
	struct noise_handshake handshake;
	atomic64_t last_sent_handshake;
	struct work_struct transmit_handshake_work, clear_peer_work, transmit_packet_work;
	struct cookie latest_cookie;
	struct hlist_node pubkey_hash;
	u64 rx_bytes, tx_bytes;
	struct timer_list timer_retransmit_handshake, timer_send_keepalive;
	struct timer_list timer_new_handshake, timer_zero_key_material;
	struct timer_list timer_persistent_keepalive;
	unsigned int timer_handshake_attempts;
	u16 persistent_keepalive_interval;
	bool timer_need_another_keepalive;
	bool sent_lastminute_handshake;
	struct timespec64 walltime_last_handshake;
	struct kref refcount;
	struct rcu_head rcu;
	struct list_head peer_list;
	struct list_head allowedips_list;
	struct napi_struct napi;
	u64 internal_id;
};

struct wg_peer *wg_peer_create(struct wg_device *wg,
			       const u8 public_key[NOISE_PUBLIC_KEY_LEN],
			       const u8 preshared_key[NOISE_SYMMETRIC_KEY_LEN]);

struct wg_peer *__must_check wg_peer_get_maybe_zero(struct wg_peer *peer);
static inline struct wg_peer *wg_peer_get(struct wg_peer *peer)
{
	kref_get(&peer->refcount);
	return peer;
}
void wg_peer_put(struct wg_peer *peer);
void wg_peer_remove(struct wg_peer *peer);
void wg_peer_remove_all(struct wg_device *wg);

#endif /* _WG_PEER_H */