648700f76b
Some applications still rely on IP fragmentation, and to be fair linux reassembly unit is not working under any serious load. It uses static hash tables of 1024 buckets, and up to 128 items per bucket (!!!) A work queue is supposed to garbage collect items when host is under memory pressure, and doing a hash rebuild, changing seed used in hash computations. This work queue blocks softirqs for up to 25 ms when doing a hash rebuild, occurring every 5 seconds if host is under fire. Then there is the problem of sharing this hash table for all netns. It is time to switch to rhashtables, and allocate one of them per netns to speedup netns dismantle, since this is a critical metric these days. Lookup is now using RCU. A followup patch will even remove the refcount hold/release left from prior implementation and save a couple of atomic operations. Before this patch, 16 cpus (16 RX queue NIC) could not handle more than 1 Mpps frags DDOS. After the patch, I reach 9 Mpps without any tuning, and can use up to 2GB of storage for the fragments (exact number depends on frags being evicted after timeout) $ grep FRAG /proc/net/sockstat FRAG: inuse 1966916 memory 2140004608 A followup patch will change the limits for 64bit arches. Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Kirill Tkhai <ktkhai@virtuozzo.com> Cc: Herbert Xu <herbert@gondor.apana.org.au> Cc: Florian Westphal <fw@strlen.de> Cc: Jesper Dangaard Brouer <brouer@redhat.com> Cc: Alexander Aring <alex.aring@gmail.com> Cc: Stefan Schmidt <stefan@osg.samsung.com> Signed-off-by: David S. Miller <davem@davemloft.net>
161 lines
3.9 KiB
C
161 lines
3.9 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef __NET_FRAG_H__
|
|
#define __NET_FRAG_H__
|
|
|
|
#include <linux/rhashtable.h>
|
|
|
|
/* Per-netns IP fragment reassembly state: one rhashtable of in-flight
 * fragment queues, a memory accounting counter, and the sysctl knobs
 * controlling reassembly behavior.
 */
struct netns_frags {
	/* Hash table of this netns' inet_frag_queue entries; on its own
	 * cacheline as it is touched on every fragment lookup/insert.
	 */
	struct rhashtable       rhashtable ____cacheline_aligned_in_smp;

	/* Keep atomic mem on separate cachelines in structs that include it */
	atomic_t		mem ____cacheline_aligned_in_smp;
	/* sysctls */
	int			timeout;	/* queue lifetime -- presumably jiffies, confirm against timer users */
	int			high_thresh;	/* mem level at which new queues are refused -- NOTE(review): confirm */
	int			low_thresh;
	int			max_dist;
	struct inet_frags	*f;		/* protocol ops/params backing this netns (see struct inet_frags) */
};
|
|
|
|
/**
 * fragment queue flags
 *
 * @INET_FRAG_FIRST_IN: first fragment has arrived
 * @INET_FRAG_LAST_IN: final fragment has arrived
 * @INET_FRAG_COMPLETE: frag queue has been processed and is due for destruction
 */
enum {
	INET_FRAG_FIRST_IN	= BIT(0),
	INET_FRAG_LAST_IN	= BIT(1),
	INET_FRAG_COMPLETE	= BIT(2),
};
|
|
|
|
/* Lookup key for IPv4 reassembly queues.
 * NOTE(review): this layout appears to be hashed/compared as raw bytes by
 * the rhashtable (key handed to inet_frag_find() as void *, parameters in
 * inet_frags.rhash_params) -- do not reorder or repack fields without
 * checking the rhash_params definition on the .c side.
 */
struct frag_v4_compare_key {
	__be32		saddr;		/* IPv4 source address */
	__be32		daddr;		/* IPv4 destination address */
	u32		user;		/* reassembly context -- presumably IP_DEFRAG_* user, confirm */
	u32		vif;
	__be16		id;		/* IP header identification field */
	u16		protocol;	/* L4 protocol number */
};
|
|
|
|
/* Lookup key for IPv6 reassembly queues.
 * NOTE(review): same caveat as frag_v4_compare_key -- presumably compared
 * as raw bytes per inet_frags.rhash_params; verify before reordering.
 */
struct frag_v6_compare_key {
	struct in6_addr	saddr;		/* IPv6 source address */
	struct in6_addr	daddr;		/* IPv6 destination address */
	u32		user;		/* reassembly context -- confirm against callers */
	__be32		id;		/* fragment header identification */
	u32		iif;		/* incoming interface index -- TODO confirm */
};
|
|
|
|
/**
 * struct inet_frag_queue - fragment queue
 *
 * @node: rhash node
 * @key: keys identifying this frag.
 * @timer: queue expiration timer
 * @lock: spinlock protecting this frag
 * @refcnt: reference count of the queue
 * @fragments: received fragments head
 * @fragments_tail: received fragments tail
 * @stamp: timestamp of the last received fragment
 * @len: total length of the original datagram
 * @meat: length of received fragments so far
 * @flags: fragment queue flags
 * @max_size: maximum received fragment size
 * @net: namespace that this frag belongs to
 * @rcu: rcu head for freeing deferral
 */
struct inet_frag_queue {
	struct rhash_head	node;
	union {
		struct frag_v4_compare_key v4;
		struct frag_v6_compare_key v6;
	} key;
	struct timer_list	timer;
	spinlock_t		lock;
	refcount_t		refcnt;
	struct sk_buff		*fragments;
	struct sk_buff		*fragments_tail;
	ktime_t			stamp;
	int			len;
	int			meat;
	__u8			flags;
	u16			max_size;
	struct netns_frags	*net;
	struct rcu_head		rcu;
};
|
|
|
|
/* Per-protocol reassembly operations and allocation parameters, shared by
 * every netns using that protocol (each netns points at one of these via
 * netns_frags.f).
 */
struct inet_frags {
	unsigned int		qsize;		/* object size allocated per queue (inet_frag_queue + proto data -- confirm) */

	void			(*constructor)(struct inet_frag_queue *q,
					       const void *arg);	/* initialize a new queue from the lookup key */
	void			(*destructor)(struct inet_frag_queue *);	/* protocol-specific teardown before free */
	void			(*frag_expire)(struct timer_list *t);	/* queue-timeout timer callback */
	struct kmem_cache	*frags_cachep;		/* slab cache for queue objects */
	const char		*frags_cache_name;	/* name used when creating frags_cachep */
	struct rhashtable_params rhash_params;		/* key/hash layout shared by all per-netns tables */
};
|
|
|
|
/* Register/unregister a protocol's reassembly ops (creates/destroys the
 * slab cache named by frags_cache_name -- NOTE(review): confirm on .c side).
 */
int inet_frags_init(struct inet_frags *);
void inet_frags_fini(struct inet_frags *);
|
|
static inline int inet_frags_init_net(struct netns_frags *nf)
|
|
{
|
|
atomic_set(&nf->mem, 0);
|
|
return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
|
|
}
|
|
/* Tear down a netns' reassembly state (counterpart of inet_frags_init_net). */
void inet_frags_exit_net(struct netns_frags *nf);

/* Remove a queue from the table / stop its timer so it can die. */
void inet_frag_kill(struct inet_frag_queue *q);
/* Final destruction once the last reference is gone (see inet_frag_put). */
void inet_frag_destroy(struct inet_frag_queue *q);
/* Look up (or create -- confirm on .c side) the queue matching @key;
 * @key points at a frag_v4_compare_key or frag_v6_compare_key.
 */
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key);

void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
				   const char *prefix);
|
|
|
|
static inline void inet_frag_put(struct inet_frag_queue *q)
|
|
{
|
|
if (refcount_dec_and_test(&q->refcnt))
|
|
inet_frag_destroy(q);
|
|
}
|
|
|
|
/* Always false: with per-netns rhashtables there is no background eviction
 * pass anymore (see commit log), so a queue can never be "being evicted".
 * Kept so existing callers still compile; candidate for removal.
 */
static inline bool inet_frag_evicting(struct inet_frag_queue *q)
{
	return false;
}
|
|
|
|
/* Memory Tracking Functions. */
|
|
|
|
/* Current fragment memory accounting value for this netns.
 * Plain atomic_read: a racy snapshot, fine for threshold checks.
 */
static inline int frag_mem_limit(struct netns_frags *nf)
{
	return atomic_read(&nf->mem);
}
|
|
|
|
/* Release @i units of fragment memory accounting from this netns. */
static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
{
	atomic_sub(i, &nf->mem);
}
|
|
|
|
/* Charge @i units of fragment memory accounting to this netns. */
static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
{
	atomic_add(i, &nf->mem);
}
|
|
|
|
/* Identical to frag_mem_limit() now that accounting is a single atomic
 * per netns; the "sum" name is historical (presumably it once summed
 * distributed counters -- NOTE(review): confirm, and consider merging).
 */
static inline int sum_frag_mem_limit(struct netns_frags *nf)
{
	return atomic_read(&nf->mem);
}
|
|
|
|
/* RFC 3168 support :
 * We want to check ECN values of all fragments, to detect invalid combinations.
 * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
 */
#define	IPFRAG_ECN_NOT_ECT	0x01 /* one frag had ECN_NOT_ECT */
#define	IPFRAG_ECN_ECT_1	0x02 /* one frag had ECN_ECT_1 */
#define	IPFRAG_ECN_ECT_0	0x04 /* one frag had ECN_ECT_0 */
#define	IPFRAG_ECN_CE		0x08 /* one frag had ECN_CE */

/* Indexed by the OR of the IPFRAG_ECN_* bits seen so far; maps to the
 * resulting ECN codepoint or an invalid-combination marker (defined in
 * the reassembly .c file -- see the table's users for exact semantics).
 */
extern const u8 ip_frag_ecn_table[16];
|
|
|
|
#endif
|