8b5553ace8
Having two ring buffers per-peer means that every peer results in two massive ring allocations. On an 8-core x86_64 machine, this commit reduces the per-peer allocation from 18,688 bytes to 1,856 bytes, which is a 90% reduction. Ninety percent! With some single-machine deployments approaching 500,000 peers, we're talking about a reduction from 7 gigs of memory down to 700 megs of memory. In order to get rid of these per-peer allocations, this commit switches to using a list-based queueing approach. Currently GSO fragments are chained together using the skb->next pointer (the skb_list_* singly linked list approach), so we form the per-peer queue around the unused skb->prev pointer (which sort of makes sense because the links are pointing backwards). Use of skb_queue_* is not possible here, because that is based on doubly linked lists and spinlocks. Multiple cores can write into the queue at any given time, because its writes occur in the start_xmit path or in the udp_recv path. But reads happen in a single workqueue item per-peer, amounting to a multi-producer, single-consumer paradigm. The MPSC queue is implemented locklessly and never blocks. However, it is not linearizable (though it is serializable), with a very tight and unlikely race on writes, which, when hit (some tiny fraction of the 0.15% of partial adds on a fully loaded 16-core x86_64 system), causes the queue reader to terminate early. However, because every packet sent queues up the same workqueue item after it is fully added, the worker resumes again, and stopping early isn't actually a problem, since at that point the packet wouldn't have yet been added to the encryption queue. These properties allow us to avoid disabling interrupts or spinning. The design is based on Dmitry Vyukov's algorithm [1]. Performance-wise, ordinarily list-based queues aren't preferable to ringbuffers, because of cache misses when following pointers around.
However, we *already* have to follow the adjacent pointers when working through fragments, so there shouldn't actually be any change there. A potential downside is that dequeueing is a bit more complicated, but the ptr_ring structure used prior had a spinlock when dequeueing, so all in all the difference appears to be a wash. Actually, from profiling, the biggest performance hit, by far, of this commit winds up being atomic_add_unless(count, 1, max) and atomic_dec(count), which account for the majority of CPU time, according to perf. In that sense, the previous ring buffer was superior in that it could check if it was full by head==tail, which the list-based approach cannot do. But all in all, this enables us to get massive memory savings, allowing WireGuard to scale for real world deployments, without taking much of a performance hit. [1] http://www.1024cores.net/home/lock-free-algorithms/queues/intrusive-mpsc-node-based-queue Reviewed-by: Dmitry Vyukov <dvyukov@google.com> Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com> Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
215 lines
6.3 KiB
C
215 lines
6.3 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/*
|
|
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
|
*/
|
|
|
|
#ifndef _WG_QUEUEING_H
|
|
#define _WG_QUEUEING_H
|
|
|
|
#include "peer.h"
|
|
#include <linux/types.h>
|
|
#include <linux/skbuff.h>
|
|
#include <linux/ip.h>
|
|
#include <linux/ipv6.h>
|
|
#include <net/ip_tunnels.h>
|
|
|
|
/* Opaque types that this header only ever handles through pointers. */
struct sk_buff;
struct wg_device;
struct wg_peer;
struct crypt_queue;
struct prev_queue;
struct multicore_worker;
|
|
|
|
/* queueing.c APIs: */
|
|
int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function,
|
|
unsigned int len);
|
|
void wg_packet_queue_free(struct crypt_queue *queue);
|
|
struct multicore_worker __percpu *
|
|
wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr);
|
|
|
|
/* receive.c APIs: */

void wg_packet_receive(struct wg_device *wg, struct sk_buff *skb);

void wg_packet_handshake_receive_worker(struct work_struct *work);

/* NAPI poll function: */
int wg_packet_rx_poll(struct napi_struct *napi, int budget);

/* Workqueue worker: */
void wg_packet_decrypt_worker(struct work_struct *work);
|
|
|
|
/* send.c APIs: */
|
|
void wg_packet_send_queued_handshake_initiation(struct wg_peer *peer,
|
|
bool is_retry);
|
|
void wg_packet_send_handshake_response(struct wg_peer *peer);
|
|
void wg_packet_send_handshake_cookie(struct wg_device *wg,
|
|
struct sk_buff *initiating_skb,
|
|
__le32 sender_index);
|
|
void wg_packet_send_keepalive(struct wg_peer *peer);
|
|
void wg_packet_purge_staged_packets(struct wg_peer *peer);
|
|
void wg_packet_send_staged_packets(struct wg_peer *peer);
|
|
/* Workqueue workers: */
|
|
void wg_packet_handshake_send_worker(struct work_struct *work);
|
|
void wg_packet_tx_worker(struct work_struct *work);
|
|
void wg_packet_encrypt_worker(struct work_struct *work);
|
|
|
|
/*
 * Processing state of a queued packet, stored in packet_cb::state.
 * A packet is enqueued as UNCRYPTED and is later moved to CRYPTED or DEAD.
 */
enum packet_state {
	PACKET_STATE_UNCRYPTED = 0,
	PACKET_STATE_CRYPTED = 1,
	PACKET_STATE_DEAD = 2
};
|
|
|
|
struct packet_cb {
|
|
u64 nonce;
|
|
struct noise_keypair *keypair;
|
|
atomic_t state;
|
|
u32 mtu;
|
|
u8 ds;
|
|
};
|
|
|
|
/* View the skb's control buffer as our struct packet_cb. */
#define PACKET_CB(skb) \
	((struct packet_cb *)((skb)->cb))

/* The peer that owns the keypair attached to this skb. */
#define PACKET_PEER(skb) \
	(PACKET_CB(skb)->keypair->entry.peer)
|
|
|
|
static inline bool wg_check_packet_protocol(struct sk_buff *skb)
|
|
{
|
|
__be16 real_protocol = ip_tunnel_parse_protocol(skb);
|
|
return real_protocol && skb->protocol == real_protocol;
|
|
}
|
|
|
|
/*
 * Scrub @skb and return its header-related state to a clean baseline.
 *
 * The flow hash (hash, sw_hash, l4_hash) is wiped along with everything else,
 * but is put back afterwards when @encapsulating is true.
 */
static inline void wg_reset_packet(struct sk_buff *skb, bool encapsulating)
{
	/* Snapshot the hash state first; the memset below erases it. */
	const u8 saved_l4_hash = skb->l4_hash;
	const u8 saved_sw_hash = skb->sw_hash;
	const u32 saved_hash = skb->hash;
	const size_t headers_len = offsetof(struct sk_buff, headers_end) -
				   offsetof(struct sk_buff, headers_start);

	skb_scrub_packet(skb, true);
	/* Zero everything between headers_start and headers_end. */
	memset(&skb->headers_start, 0, headers_len);
	if (encapsulating) {
		skb->hash = saved_hash;
		skb->sw_hash = saved_sw_hash;
		skb->l4_hash = saved_l4_hash;
	}

	/* Fields that are cleared explicitly. */
	skb->dev = NULL;
	skb->mac_len = 0;
	skb->peeked = 0;
	skb->nohdr = 0;
	skb->queue_mapping = 0;
#ifdef CONFIG_NET_SCHED
	skb->tc_index = 0;
#endif
	skb_reset_redirect(skb);

	/* Re-derive the header offsets; the order of these calls is kept as-is. */
	skb->hdr_len = skb_headroom(skb);
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_probe_transport_header(skb);
	skb_reset_inner_headers(skb);
}
|
|
|
|
static inline int wg_cpumask_choose_online(int *stored_cpu, unsigned int id)
|
|
{
|
|
unsigned int cpu = *stored_cpu, cpu_index, i;
|
|
|
|
if (unlikely(cpu == nr_cpumask_bits ||
|
|
!cpumask_test_cpu(cpu, cpu_online_mask))) {
|
|
cpu_index = id % cpumask_weight(cpu_online_mask);
|
|
cpu = cpumask_first(cpu_online_mask);
|
|
for (i = 0; i < cpu_index; ++i)
|
|
cpu = cpumask_next(cpu, cpu_online_mask);
|
|
*stored_cpu = cpu;
|
|
}
|
|
return cpu;
|
|
}
|
|
|
|
/* This function is racy, in the sense that next is unlocked, so it could return
|
|
* the same CPU twice. A race-free version of this would be to instead store an
|
|
* atomic sequence number, do an increment-and-return, and then iterate through
|
|
* every possible CPU until we get to that index -- choose_cpu. However that's
|
|
* a bit slower, and it doesn't seem like this potential race actually
|
|
* introduces any performance loss, so we live with it.
|
|
*/
|
|
static inline int wg_cpumask_next_online(int *next)
|
|
{
|
|
int cpu = *next;
|
|
|
|
while (unlikely(!cpumask_test_cpu(cpu, cpu_online_mask)))
|
|
cpu = cpumask_next(cpu, cpu_online_mask) % nr_cpumask_bits;
|
|
*next = cpumask_next(cpu, cpu_online_mask) % nr_cpumask_bits;
|
|
return cpu;
|
|
}
|
|
|
|
void wg_prev_queue_init(struct prev_queue *queue);
|
|
|
|
/* Multi producer */
|
|
bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb);
|
|
|
|
/* Single consumer */
|
|
struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue);
|
|
|
|
/* Single consumer */
|
|
static inline struct sk_buff *wg_prev_queue_peek(struct prev_queue *queue)
|
|
{
|
|
if (queue->peeked)
|
|
return queue->peeked;
|
|
queue->peeked = wg_prev_queue_dequeue(queue);
|
|
return queue->peeked;
|
|
}
|
|
|
|
/* Single consumer */
|
|
static inline void wg_prev_queue_drop_peeked(struct prev_queue *queue)
|
|
{
|
|
queue->peeked = NULL;
|
|
}
|
|
|
|
static inline int wg_queue_enqueue_per_device_and_peer(
|
|
struct crypt_queue *device_queue, struct prev_queue *peer_queue,
|
|
struct sk_buff *skb, struct workqueue_struct *wq, int *next_cpu)
|
|
{
|
|
int cpu;
|
|
|
|
atomic_set_release(&PACKET_CB(skb)->state, PACKET_STATE_UNCRYPTED);
|
|
/* We first queue this up for the peer ingestion, but the consumer
|
|
* will wait for the state to change to CRYPTED or DEAD before.
|
|
*/
|
|
if (unlikely(!wg_prev_queue_enqueue(peer_queue, skb)))
|
|
return -ENOSPC;
|
|
|
|
/* Then we queue it up in the device queue, which consumes the
|
|
* packet as soon as it can.
|
|
*/
|
|
cpu = wg_cpumask_next_online(next_cpu);
|
|
if (unlikely(ptr_ring_produce_bh(&device_queue->ring, skb)))
|
|
return -EPIPE;
|
|
queue_work_on(cpu, wq, &per_cpu_ptr(device_queue->worker, cpu)->work);
|
|
return 0;
|
|
}
|
|
|
|
static inline void wg_queue_enqueue_per_peer_tx(struct sk_buff *skb, enum packet_state state)
|
|
{
|
|
/* We take a reference, because as soon as we call atomic_set, the
|
|
* peer can be freed from below us.
|
|
*/
|
|
struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb));
|
|
|
|
atomic_set_release(&PACKET_CB(skb)->state, state);
|
|
queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, peer->internal_id),
|
|
peer->device->packet_crypt_wq, &peer->transmit_packet_work);
|
|
wg_peer_put(peer);
|
|
}
|
|
|
|
static inline void wg_queue_enqueue_per_peer_rx(struct sk_buff *skb, enum packet_state state)
|
|
{
|
|
/* We take a reference, because as soon as we call atomic_set, the
|
|
* peer can be freed from below us.
|
|
*/
|
|
struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb));
|
|
|
|
atomic_set_release(&PACKET_CB(skb)->state, state);
|
|
napi_schedule(&peer->napi);
|
|
wg_peer_put(peer);
|
|
}
|
|
|
|
#ifdef DEBUG
/* Self-test entry point, only declared in DEBUG builds. */
bool wg_packet_counter_selftest(void);
#endif /* DEBUG */
|
|
|
|
#endif /* _WG_QUEUEING_H */
|