/*
 * Summary: fragment-queue eviction is moved out of the lookup path and
 * into a work item.
 *
 * What the hunks below do:
 *  - Documentation/networking/ip-sysctl.txt: give ipfrag_low_thresh its
 *    own description instead of pointing at ipfrag_high_thresh.
 *  - include/net/inet_frag.h: add the INET_FRAG_EVICTED flag (8) next to
 *    the other last_in flags, and add frags_work / next_bucket to
 *    struct inet_frags.
 *  - net/ipv4/inet_fragment.c: remove inet_frag_evictor(), which walked
 *    nf->lru_list, and add inet_frag_worker(), which walks hash buckets:
 *      * each run visits at most INETFRAGS_EVICT_BUCKETS (128) buckets,
 *        starting at f->next_bucket, and stops early once more than
 *        INETFRAGS_EVICT_MAX (512) queues have been evicted; the bucket
 *        to resume from is written back to f->next_bucket.
 *      * inet_evict_bucket() selects a queue when its netns low_thresh
 *        is 0 or frag_mem_limit() >= low_thresh, clears
 *        INET_FRAG_FIRST_IN, sets INET_FRAG_EVICTED, moves the queue to
 *        a local list, and calls f->frag_expire() on every moved queue
 *        after the bucket lock has been dropped.
 *      * inet_frag_find() schedules the worker when memory use exceeds
 *        low_thresh; inet_frag_alloc() schedules it and returns NULL
 *        when memory use exceeds high_thresh.
 *      * inet_frags_exit_net() sets low_thresh to 0 and runs
 *        inet_evict_bucket() over every bucket; inet_frags_fini() calls
 *        cancel_work_sync() on the work item.
 *  - net/ipv4/ip_fragment.c, net/ipv6/reassembly.c: skip the
 *    REASMTIMEOUT counter for queues flagged INET_FRAG_EVICTED;
 *    REASMFAILS is still incremented.
 *
 * NOTE(review): in inet_evict_bucket(), the path taken when del_timer()
 * fails does WARN_ON(refcnt != 1) after del_timer_sync(). That presumably
 * assumes nobody else can hold a reference to the queue at that point;
 * confirm a concurrent lookup cannot.
 *
 * NOTE(review): line breaks and indentation were restored from a
 * whitespace-collapsed copy. Every hunk's line count matches its header,
 * but exact intra-line whitespace (tabs vs. spaces, column alignment,
 * and the presumably whitespace-only INETFRAGS_MAXDEPTH change) is a
 * reconstruction; verify against the tree before applying.
 *
 * NOTE(review): the two "#include" context lines in the first
 * inet_fragment.c hunk have lost their header names; presumably
 * <net/inet_frag.h> and <net/inet_ecn.h> - TODO confirm.
 */
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index f35bfe43bf7a..625c8dda4be7 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -104,7 +104,9 @@ ipfrag_high_thresh - INTEGER
 	is reached.
 
 ipfrag_low_thresh - INTEGER
-	See ipfrag_high_thresh
+	Maximum memory used to reassemble IP fragments before the kernel
+	begins to remove incomplete fragment queues to free up resources.
+	The kernel still accepts new fragments for defragmentation.
 
 ipfrag_time - INTEGER
 	Time in seconds to keep an IP fragment in memory.
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 9fe644d1a26e..e975032ea11b 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -32,6 +32,7 @@ struct inet_frag_queue {
 	int			meat;
 	__u8			last_in;    /* first/last segment arrived? */
 
+#define INET_FRAG_EVICTED	8
 #define INET_FRAG_COMPLETE	4
 #define INET_FRAG_FIRST_IN	2
 #define INET_FRAG_LAST_IN	1
@@ -48,7 +49,7 @@ struct inet_frag_queue {
  * rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or
  * struct frag_queue))
  */
-#define INETFRAGS_MAXDEPTH		128
+#define INETFRAGS_MAXDEPTH	128
 
 struct inet_frag_bucket {
 	struct hlist_head	chain;
@@ -65,6 +66,9 @@ struct inet_frags {
 	int			secret_interval;
 	struct timer_list	secret_timer;
 
+	struct work_struct	frags_work;
+	unsigned int next_bucket;
+
 	/* The first call to hashfn is responsible to initialize
 	 * rnd. This is best done with net_get_random_once.
 	 */
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 535636017534..43315ecb9400 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -25,6 +25,9 @@
 #include
 #include
 
+#define INETFRAGS_EVICT_BUCKETS	128
+#define INETFRAGS_EVICT_MAX	512
+
 /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
  * Value : 0xff if frame should be dropped.
  *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
@@ -46,8 +49,6 @@ const u8 ip_frag_ecn_table[16] = {
 };
 EXPORT_SYMBOL(ip_frag_ecn_table);
 
-static int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force);
-
 static unsigned int inet_frag_hashfn(const struct inet_frags *f,
 				     const struct inet_frag_queue *q)
 {
@@ -89,10 +90,92 @@ static void inet_frag_secret_rebuild(unsigned long dummy)
 	mod_timer(&f->secret_timer, now + f->secret_interval);
 }
 
+static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
+{
+	return q->net->low_thresh == 0 ||
+	       frag_mem_limit(q->net) >= q->net->low_thresh;
+}
+
+static unsigned int
+inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
+{
+	struct inet_frag_queue *fq;
+	struct hlist_node *n;
+	unsigned int evicted = 0;
+	HLIST_HEAD(expired);
+
+evict_again:
+	spin_lock(&hb->chain_lock);
+
+	hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
+		if (!inet_fragq_should_evict(fq))
+			continue;
+
+		if (!del_timer(&fq->timer)) {
+			/* q expiring right now thus increment its refcount so
+			 * it won't be freed under us and wait until the timer
+			 * has finished executing then destroy it
+			 */
+			atomic_inc(&fq->refcnt);
+			spin_unlock(&hb->chain_lock);
+			del_timer_sync(&fq->timer);
+			WARN_ON(atomic_read(&fq->refcnt) != 1);
+			inet_frag_put(fq, f);
+			goto evict_again;
+		}
+
+		/* suppress xmit of (icmp) error packet */
+		fq->last_in &= ~INET_FRAG_FIRST_IN;
+		fq->last_in |= INET_FRAG_EVICTED;
+		hlist_del(&fq->list);
+		hlist_add_head(&fq->list, &expired);
+		++evicted;
+	}
+
+	spin_unlock(&hb->chain_lock);
+
+	hlist_for_each_entry_safe(fq, n, &expired, list)
+		f->frag_expire((unsigned long) fq);
+
+	return evicted;
+}
+
+static void inet_frag_worker(struct work_struct *work)
+{
+	unsigned int budget = INETFRAGS_EVICT_BUCKETS;
+	unsigned int i, evicted = 0;
+	struct inet_frags *f;
+
+	f = container_of(work, struct inet_frags, frags_work);
+
+	BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);
+
+	read_lock_bh(&f->lock);
+
+	for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
+		evicted += inet_evict_bucket(f, &f->hash[i]);
+		i = (i + 1) & (INETFRAGS_HASHSZ - 1);
+		if (evicted > INETFRAGS_EVICT_MAX)
+			break;
+	}
+
+	f->next_bucket = i;
+
+	read_unlock_bh(&f->lock);
+}
+
+static void inet_frag_schedule_worker(struct inet_frags *f)
+{
+	if (unlikely(!work_pending(&f->frags_work)))
+		schedule_work(&f->frags_work);
+}
+
 void inet_frags_init(struct inet_frags *f)
 {
 	int i;
 
+	INIT_WORK(&f->frags_work, inet_frag_worker);
+
 	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
 		struct inet_frag_bucket *hb = &f->hash[i];
 
@@ -120,16 +203,22 @@ EXPORT_SYMBOL(inet_frags_init_net);
 void inet_frags_fini(struct inet_frags *f)
 {
 	del_timer(&f->secret_timer);
+	cancel_work_sync(&f->frags_work);
 }
 EXPORT_SYMBOL(inet_frags_fini);
 
 void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
 {
+	int i;
+
 	nf->low_thresh = 0;
 
-	local_bh_disable();
-	inet_frag_evictor(nf, f, true);
-	local_bh_enable();
+	read_lock_bh(&f->lock);
+
+	for (i = 0; i < INETFRAGS_HASHSZ ; i++)
+		inet_evict_bucket(f, &f->hash[i]);
+
+	read_unlock_bh(&f->lock);
 
 	percpu_counter_destroy(&nf->mem);
 }
@@ -205,41 +294,6 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
 }
 EXPORT_SYMBOL(inet_frag_destroy);
 
-static int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
-{
-	struct inet_frag_queue *q;
-	int work, evicted = 0;
-
-	work = frag_mem_limit(nf) - nf->low_thresh;
-	while (work > 0 || force) {
-		spin_lock(&nf->lru_lock);
-
-		if (list_empty(&nf->lru_list)) {
-			spin_unlock(&nf->lru_lock);
-			break;
-		}
-
-		q = list_first_entry(&nf->lru_list,
-				struct inet_frag_queue, lru_list);
-		atomic_inc(&q->refcnt);
-		/* Remove q from list to avoid several CPUs grabbing it */
-		list_del_init(&q->lru_list);
-
-		spin_unlock(&nf->lru_lock);
-
-		spin_lock(&q->lock);
-		if (!(q->last_in & INET_FRAG_COMPLETE))
-			inet_frag_kill(q, f);
-		spin_unlock(&q->lock);
-
-		if (atomic_dec_and_test(&q->refcnt))
-			inet_frag_destroy(q, f, &work);
-		evicted++;
-	}
-
-	return evicted;
-}
-
 static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 		struct inet_frag_queue *qp_in, struct inet_frags *f,
 		void *arg)
@@ -292,8 +346,10 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 {
 	struct inet_frag_queue *q;
 
-	if (frag_mem_limit(nf) > nf->high_thresh)
+	if (frag_mem_limit(nf) > nf->high_thresh) {
+		inet_frag_schedule_worker(f);
 		return NULL;
+	}
 
 	q = kzalloc(f->qsize, GFP_ATOMIC);
 	if (q == NULL)
@@ -331,8 +387,8 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
 	struct inet_frag_queue *q;
 	int depth = 0;
 
-	if (frag_mem_limit(nf) > nf->high_thresh)
-		inet_frag_evictor(nf, f, false);
+	if (frag_mem_limit(nf) > nf->low_thresh)
+		inet_frag_schedule_worker(f);
 
 	hash &= (INETFRAGS_HASHSZ - 1);
 	hb = &f->hash[hash];
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 54988672d00d..54bd170c5eb4 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -195,7 +195,8 @@ static void ip_expire(unsigned long arg)
 
 	ipq_kill(qp);
 
-	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
+	if (!(qp->q.last_in & INET_FRAG_EVICTED))
+		IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
 
 	if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 97acbc490d9e..b3924b10dff3 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -141,7 +141,9 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
 	if (!dev)
 		goto out_rcu_unlock;
 
-	IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
+	if (!(fq->q.last_in & INET_FRAG_EVICTED))
+		IP6_INC_STATS_BH(net, __in6_dev_get(dev),
+				 IPSTATS_MIB_REASMTIMEOUT);
 	IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
 
 	/* Don't send error if the first segment did not arrive. */