Merge branch 'inet-add-drop-monitor-support'
Eric Dumazet says:

====================
inet: add drop monitor support

I recently tried to analyse flakes in ip_defrag selftest.
This failed miserably.

IPv4 and IPv6 reassembly units are causing false kfree_skb()
notifications. It is time to deal with this issue.

First two patches are changing core networking to better
deal with eventual skb frag_list chains, in respect
of kfree_skb/consume_skb status.

Last three patches are adding three new drop reasons,
and make sure skbs that have been reassembled into
a large datagram are no longer viewed as dropped ones.

After this, understanding why ip_defrag selftest is flaky
is possible using standard drop monitoring tools.
====================

Link: https://lore.kernel.org/r/20221029154520.2747444-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
commit
6f1a298b2e
@ -68,6 +68,9 @@
|
||||
FN(IP_INADDRERRORS) \
|
||||
FN(IP_INNOROUTES) \
|
||||
FN(PKT_TOO_BIG) \
|
||||
FN(DUP_FRAG) \
|
||||
FN(FRAG_REASM_TIMEOUT) \
|
||||
FN(FRAG_TOO_FAR) \
|
||||
FNe(MAX)
|
||||
|
||||
/**
|
||||
@ -80,6 +83,8 @@ enum skb_drop_reason {
|
||||
* @SKB_NOT_DROPPED_YET: skb is not dropped yet (used for no-drop case)
|
||||
*/
|
||||
SKB_NOT_DROPPED_YET = 0,
|
||||
/** @SKB_CONSUMED: packet has been consumed */
|
||||
SKB_CONSUMED,
|
||||
/** @SKB_DROP_REASON_NOT_SPECIFIED: drop reason is not specified */
|
||||
SKB_DROP_REASON_NOT_SPECIFIED,
|
||||
/** @SKB_DROP_REASON_NO_SOCKET: socket not found */
|
||||
@ -298,6 +303,15 @@ enum skb_drop_reason {
|
||||
* MTU)
|
||||
*/
|
||||
SKB_DROP_REASON_PKT_TOO_BIG,
|
||||
/** @SKB_DROP_REASON_DUP_FRAG: duplicate fragment */
|
||||
SKB_DROP_REASON_DUP_FRAG,
|
||||
/** @SKB_DROP_REASON_FRAG_REASM_TIMEOUT: fragment reassembly timeout */
|
||||
SKB_DROP_REASON_FRAG_REASM_TIMEOUT,
|
||||
/**
|
||||
* @SKB_DROP_REASON_FRAG_TOO_FAR: ipv4 fragment too far.
|
||||
* (/proc/sys/net/ipv4/ipfrag_max_dist)
|
||||
*/
|
||||
SKB_DROP_REASON_FRAG_TOO_FAR,
|
||||
/**
|
||||
* @SKB_DROP_REASON_MAX: the maximum of drop reason, which shouldn't be
|
||||
* used as a real 'reason'
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include <linux/in6.h>
|
||||
#include <linux/rbtree_types.h>
|
||||
#include <linux/refcount.h>
|
||||
#include <net/dropreason.h>
|
||||
|
||||
/* Per netns frag queues directory */
|
||||
struct fqdir {
|
||||
@ -34,12 +35,14 @@ struct fqdir {
|
||||
* @INET_FRAG_LAST_IN: final fragment has arrived
|
||||
* @INET_FRAG_COMPLETE: frag queue has been processed and is due for destruction
|
||||
* @INET_FRAG_HASH_DEAD: inet_frag_kill() has not removed fq from rhashtable
|
||||
* @INET_FRAG_DROP: if skbs must be dropped (instead of being consumed)
|
||||
*/
|
||||
enum {
|
||||
INET_FRAG_FIRST_IN = BIT(0),
|
||||
INET_FRAG_LAST_IN = BIT(1),
|
||||
INET_FRAG_COMPLETE = BIT(2),
|
||||
INET_FRAG_HASH_DEAD = BIT(3),
|
||||
INET_FRAG_DROP = BIT(4),
|
||||
};
|
||||
|
||||
struct frag_v4_compare_key {
|
||||
@ -139,7 +142,8 @@ void inet_frag_destroy(struct inet_frag_queue *q);
|
||||
struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key);
|
||||
|
||||
/* Free all skbs in the queue; return the sum of their truesizes. */
|
||||
unsigned int inet_frag_rbtree_purge(struct rb_root *root);
|
||||
unsigned int inet_frag_rbtree_purge(struct rb_root *root,
|
||||
enum skb_drop_reason reason);
|
||||
|
||||
static inline void inet_frag_put(struct inet_frag_queue *q)
|
||||
{
|
||||
|
@ -76,6 +76,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
|
||||
if (fq->q.flags & INET_FRAG_COMPLETE)
|
||||
goto out;
|
||||
|
||||
fq->q.flags |= INET_FRAG_DROP;
|
||||
inet_frag_kill(&fq->q);
|
||||
|
||||
dev = dev_get_by_index_rcu(net, fq->iif);
|
||||
@ -101,7 +102,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
|
||||
spin_unlock(&fq->q.lock);
|
||||
|
||||
icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
|
||||
kfree_skb(head);
|
||||
kfree_skb_reason(head, SKB_DROP_REASON_FRAG_REASM_TIMEOUT);
|
||||
goto out_rcu_unlock;
|
||||
|
||||
out:
|
||||
|
@ -94,6 +94,7 @@ EXPORT_SYMBOL(sysctl_max_skb_frags);
|
||||
#undef FN
|
||||
#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
|
||||
const char * const drop_reasons[] = {
|
||||
[SKB_CONSUMED] = "CONSUMED",
|
||||
DEFINE_DROP_REASON(FN, FN)
|
||||
};
|
||||
EXPORT_SYMBOL(drop_reasons);
|
||||
@ -768,7 +769,7 @@ static void skb_free_head(struct sk_buff *skb)
|
||||
}
|
||||
}
|
||||
|
||||
static void skb_release_data(struct sk_buff *skb)
|
||||
static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
|
||||
{
|
||||
struct skb_shared_info *shinfo = skb_shinfo(skb);
|
||||
int i;
|
||||
@ -791,7 +792,7 @@ static void skb_release_data(struct sk_buff *skb)
|
||||
|
||||
free_head:
|
||||
if (shinfo->frag_list)
|
||||
kfree_skb_list(shinfo->frag_list);
|
||||
kfree_skb_list_reason(shinfo->frag_list, reason);
|
||||
|
||||
skb_free_head(skb);
|
||||
exit:
|
||||
@ -854,11 +855,11 @@ void skb_release_head_state(struct sk_buff *skb)
|
||||
}
|
||||
|
||||
/* Free everything but the sk_buff shell. */
|
||||
static void skb_release_all(struct sk_buff *skb)
|
||||
static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
|
||||
{
|
||||
skb_release_head_state(skb);
|
||||
if (likely(skb->head))
|
||||
skb_release_data(skb);
|
||||
skb_release_data(skb, reason);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -872,7 +873,7 @@ static void skb_release_all(struct sk_buff *skb)
|
||||
|
||||
void __kfree_skb(struct sk_buff *skb)
|
||||
{
|
||||
skb_release_all(skb);
|
||||
skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
|
||||
kfree_skbmem(skb);
|
||||
}
|
||||
EXPORT_SYMBOL(__kfree_skb);
|
||||
@ -894,7 +895,10 @@ kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
|
||||
|
||||
DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX);
|
||||
|
||||
trace_kfree_skb(skb, __builtin_return_address(0), reason);
|
||||
if (reason == SKB_CONSUMED)
|
||||
trace_consume_skb(skb);
|
||||
else
|
||||
trace_kfree_skb(skb, __builtin_return_address(0), reason);
|
||||
__kfree_skb(skb);
|
||||
}
|
||||
EXPORT_SYMBOL(kfree_skb_reason);
|
||||
@ -1052,7 +1056,7 @@ EXPORT_SYMBOL(consume_skb);
|
||||
void __consume_stateless_skb(struct sk_buff *skb)
|
||||
{
|
||||
trace_consume_skb(skb);
|
||||
skb_release_data(skb);
|
||||
skb_release_data(skb, SKB_CONSUMED);
|
||||
kfree_skbmem(skb);
|
||||
}
|
||||
|
||||
@ -1077,7 +1081,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)
|
||||
|
||||
void __kfree_skb_defer(struct sk_buff *skb)
|
||||
{
|
||||
skb_release_all(skb);
|
||||
skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
|
||||
napi_skb_cache_put(skb);
|
||||
}
|
||||
|
||||
@ -1115,7 +1119,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
|
||||
return;
|
||||
}
|
||||
|
||||
skb_release_all(skb);
|
||||
skb_release_all(skb, SKB_CONSUMED);
|
||||
napi_skb_cache_put(skb);
|
||||
}
|
||||
EXPORT_SYMBOL(napi_consume_skb);
|
||||
@ -1246,7 +1250,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
|
||||
*/
|
||||
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
|
||||
{
|
||||
skb_release_all(dst);
|
||||
skb_release_all(dst, SKB_CONSUMED);
|
||||
return __skb_clone(dst, src);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(skb_morph);
|
||||
@ -1869,7 +1873,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
|
||||
if (skb_has_frag_list(skb))
|
||||
skb_clone_fraglist(skb);
|
||||
|
||||
skb_release_data(skb);
|
||||
skb_release_data(skb, SKB_CONSUMED);
|
||||
} else {
|
||||
skb_free_head(skb);
|
||||
}
|
||||
@ -6209,7 +6213,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
|
||||
skb_frag_ref(skb, i);
|
||||
if (skb_has_frag_list(skb))
|
||||
skb_clone_fraglist(skb);
|
||||
skb_release_data(skb);
|
||||
skb_release_data(skb, SKB_CONSUMED);
|
||||
} else {
|
||||
/* we can reuse existing recount- all we did was
|
||||
* relocate values
|
||||
@ -6352,7 +6356,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
|
||||
kfree(data);
|
||||
return -ENOMEM;
|
||||
}
|
||||
skb_release_data(skb);
|
||||
skb_release_data(skb, SKB_CONSUMED);
|
||||
|
||||
skb->head = data;
|
||||
skb->head_frag = 0;
|
||||
|
@ -133,6 +133,7 @@ static void inet_frags_free_cb(void *ptr, void *arg)
|
||||
count = del_timer_sync(&fq->timer) ? 1 : 0;
|
||||
|
||||
spin_lock_bh(&fq->lock);
|
||||
fq->flags |= INET_FRAG_DROP;
|
||||
if (!(fq->flags & INET_FRAG_COMPLETE)) {
|
||||
fq->flags |= INET_FRAG_COMPLETE;
|
||||
count++;
|
||||
@ -260,7 +261,8 @@ static void inet_frag_destroy_rcu(struct rcu_head *head)
|
||||
kmem_cache_free(f->frags_cachep, q);
|
||||
}
|
||||
|
||||
unsigned int inet_frag_rbtree_purge(struct rb_root *root)
|
||||
unsigned int inet_frag_rbtree_purge(struct rb_root *root,
|
||||
enum skb_drop_reason reason)
|
||||
{
|
||||
struct rb_node *p = rb_first(root);
|
||||
unsigned int sum = 0;
|
||||
@ -274,7 +276,7 @@ unsigned int inet_frag_rbtree_purge(struct rb_root *root)
|
||||
struct sk_buff *next = FRAG_CB(skb)->next_frag;
|
||||
|
||||
sum += skb->truesize;
|
||||
kfree_skb(skb);
|
||||
kfree_skb_reason(skb, reason);
|
||||
skb = next;
|
||||
}
|
||||
}
|
||||
@ -284,17 +286,21 @@ EXPORT_SYMBOL(inet_frag_rbtree_purge);
|
||||
|
||||
void inet_frag_destroy(struct inet_frag_queue *q)
|
||||
{
|
||||
struct fqdir *fqdir;
|
||||
unsigned int sum, sum_truesize = 0;
|
||||
enum skb_drop_reason reason;
|
||||
struct inet_frags *f;
|
||||
struct fqdir *fqdir;
|
||||
|
||||
WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
|
||||
reason = (q->flags & INET_FRAG_DROP) ?
|
||||
SKB_DROP_REASON_FRAG_REASM_TIMEOUT :
|
||||
SKB_CONSUMED;
|
||||
WARN_ON(del_timer(&q->timer) != 0);
|
||||
|
||||
/* Release all fragment data. */
|
||||
fqdir = q->fqdir;
|
||||
f = fqdir->f;
|
||||
sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
|
||||
sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments, reason);
|
||||
sum = sum_truesize + f->qsize;
|
||||
|
||||
call_rcu(&q->rcu, inet_frag_destroy_rcu);
|
||||
|
@ -153,6 +153,7 @@ static void ip_expire(struct timer_list *t)
|
||||
if (qp->q.flags & INET_FRAG_COMPLETE)
|
||||
goto out;
|
||||
|
||||
qp->q.flags |= INET_FRAG_DROP;
|
||||
ipq_kill(qp);
|
||||
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
|
||||
__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
|
||||
@ -194,7 +195,7 @@ out:
|
||||
spin_unlock(&qp->q.lock);
|
||||
out_rcu_unlock:
|
||||
rcu_read_unlock();
|
||||
kfree_skb(head);
|
||||
kfree_skb_reason(head, SKB_DROP_REASON_FRAG_REASM_TIMEOUT);
|
||||
ipq_put(qp);
|
||||
}
|
||||
|
||||
@ -254,7 +255,8 @@ static int ip_frag_reinit(struct ipq *qp)
|
||||
return -ETIMEDOUT;
|
||||
}
|
||||
|
||||
sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
|
||||
sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments,
|
||||
SKB_DROP_REASON_FRAG_TOO_FAR);
|
||||
sub_frag_mem_limit(qp->q.fqdir, sum_truesize);
|
||||
|
||||
qp->q.flags = 0;
|
||||
@ -278,10 +280,14 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
|
||||
struct net_device *dev;
|
||||
unsigned int fragsize;
|
||||
int err = -ENOENT;
|
||||
SKB_DR(reason);
|
||||
u8 ecn;
|
||||
|
||||
if (qp->q.flags & INET_FRAG_COMPLETE)
|
||||
/* If reassembly is already done, @skb must be a duplicate frag. */
|
||||
if (qp->q.flags & INET_FRAG_COMPLETE) {
|
||||
SKB_DR_SET(reason, DUP_FRAG);
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
|
||||
unlikely(ip_frag_too_far(qp)) &&
|
||||
@ -382,8 +388,9 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
|
||||
|
||||
insert_error:
|
||||
if (err == IPFRAG_DUP) {
|
||||
kfree_skb(skb);
|
||||
return -EINVAL;
|
||||
SKB_DR_SET(reason, DUP_FRAG);
|
||||
err = -EINVAL;
|
||||
goto err;
|
||||
}
|
||||
err = -EINVAL;
|
||||
__IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
|
||||
@ -391,7 +398,7 @@ discard_qp:
|
||||
inet_frag_kill(&qp->q);
|
||||
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
|
||||
err:
|
||||
kfree_skb(skb);
|
||||
kfree_skb_reason(skb, reason);
|
||||
return err;
|
||||
}
|
||||
|
||||
|
@ -253,7 +253,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
|
||||
if (err) {
|
||||
if (err == IPFRAG_DUP) {
|
||||
/* No error for duplicates, pretend they got queued. */
|
||||
kfree_skb(skb);
|
||||
kfree_skb_reason(skb, SKB_DROP_REASON_DUP_FRAG);
|
||||
return -EINPROGRESS;
|
||||
}
|
||||
goto insert_error;
|
||||
|
@ -112,10 +112,14 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
|
||||
struct sk_buff *prev_tail;
|
||||
struct net_device *dev;
|
||||
int err = -ENOENT;
|
||||
SKB_DR(reason);
|
||||
u8 ecn;
|
||||
|
||||
if (fq->q.flags & INET_FRAG_COMPLETE)
|
||||
/* If reassembly is already done, @skb must be a duplicate frag. */
|
||||
if (fq->q.flags & INET_FRAG_COMPLETE) {
|
||||
SKB_DR_SET(reason, DUP_FRAG);
|
||||
goto err;
|
||||
}
|
||||
|
||||
err = -EINVAL;
|
||||
offset = ntohs(fhdr->frag_off) & ~0x7;
|
||||
@ -226,8 +230,9 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
|
||||
|
||||
insert_error:
|
||||
if (err == IPFRAG_DUP) {
|
||||
kfree_skb(skb);
|
||||
return -EINVAL;
|
||||
SKB_DR_SET(reason, DUP_FRAG);
|
||||
err = -EINVAL;
|
||||
goto err;
|
||||
}
|
||||
err = -EINVAL;
|
||||
__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
|
||||
@ -237,7 +242,7 @@ discard_fq:
|
||||
__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
|
||||
IPSTATS_MIB_REASMFAILS);
|
||||
err:
|
||||
kfree_skb(skb);
|
||||
kfree_skb_reason(skb, reason);
|
||||
return err;
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user