net: fix rtable leak in net/ipv4/route.c

Alexander V. Lukyanov found a regression in 2.6.29 and made a complete
analysis found in http://bugzilla.kernel.org/show_bug.cgi?id=13339
Quoted here because its a perfect one :

begin_of_quotation
 2.6.29 patch has introduced flexible route cache rebuilding. Unfortunately the
 patch has at least one critical flaw, and another problem.

 rt_intern_hash calculates rthi pointer, which is later used for new entry
 insertion. The same loop calculates cand pointer which is used to clean the
 list. If the pointers are the same, rtable leak occurs, as first the cand is
 removed then the new entry is appended to it.

 This leak leads to unregister_netdevice problem (usage count > 0).

 Another problem of the patch is that it tries to insert the entries in certain
 order, to facilitate counting of entries distinct by all but QoS parameters.
 Unfortunately, referencing an existing rtable entry moves it to list beginning,
 to speed up further lookups, so the carefully built order is destroyed.

 For the first problem the simplest patch it to set rthi=0 when rthi==cand, but
 it will also destroy the ordering.
end_of_quotation

Problematic commit is 1080d709fb9d8cd4392f93476ee46a9d6ea05a5b
(net: implement emergency route cache rebulds when gc_elasticity is exceeded)

Trying to keep dst_entries ordered is too complex and breaks the fact that
order should depend on the frequency of use for garbage collection.

A possible fix is to make rt_intern_hash() simpler, and only makes
rt_check_expire() a litle bit smarter, being able to cope with an arbitrary
entries order. The added loop is running on cache hot data, while cpu
is prefetching next object, so should be unnoticied.

Reported-and-analyzed-by: Alexander V. Lukyanov <lav@yar.ru>
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Eric Dumazet 2009-05-19 20:14:28 +00:00 committed by David S. Miller
parent cf8da764fc
commit 1ddbcb005c

View File

@ -784,7 +784,7 @@ static void rt_check_expire(void)
{ {
static unsigned int rover; static unsigned int rover;
unsigned int i = rover, goal; unsigned int i = rover, goal;
struct rtable *rth, **rthp; struct rtable *rth, *aux, **rthp;
unsigned long samples = 0; unsigned long samples = 0;
unsigned long sum = 0, sum2 = 0; unsigned long sum = 0, sum2 = 0;
u64 mult; u64 mult;
@ -812,6 +812,7 @@ static void rt_check_expire(void)
length = 0; length = 0;
spin_lock_bh(rt_hash_lock_addr(i)); spin_lock_bh(rt_hash_lock_addr(i));
while ((rth = *rthp) != NULL) { while ((rth = *rthp) != NULL) {
prefetch(rth->u.dst.rt_next);
if (rt_is_expired(rth)) { if (rt_is_expired(rth)) {
*rthp = rth->u.dst.rt_next; *rthp = rth->u.dst.rt_next;
rt_free(rth); rt_free(rth);
@ -820,33 +821,30 @@ static void rt_check_expire(void)
if (rth->u.dst.expires) { if (rth->u.dst.expires) {
/* Entry is expired even if it is in use */ /* Entry is expired even if it is in use */
if (time_before_eq(jiffies, rth->u.dst.expires)) { if (time_before_eq(jiffies, rth->u.dst.expires)) {
nofree:
tmo >>= 1; tmo >>= 1;
rthp = &rth->u.dst.rt_next; rthp = &rth->u.dst.rt_next;
/* /*
* Only bump our length if the hash * We only count entries on
* inputs on entries n and n+1 are not
* the same, we only count entries on
* a chain with equal hash inputs once * a chain with equal hash inputs once
* so that entries for different QOS * so that entries for different QOS
* levels, and other non-hash input * levels, and other non-hash input
* attributes don't unfairly skew * attributes don't unfairly skew
* the length computation * the length computation
*/ */
if ((*rthp == NULL) || for (aux = rt_hash_table[i].chain;;) {
!compare_hash_inputs(&(*rthp)->fl, if (aux == rth) {
&rth->fl)) length += ONE;
length += ONE; break;
}
if (compare_hash_inputs(&aux->fl, &rth->fl))
break;
aux = aux->u.dst.rt_next;
}
continue; continue;
} }
} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
tmo >>= 1; goto nofree;
rthp = &rth->u.dst.rt_next;
if ((*rthp == NULL) ||
!compare_hash_inputs(&(*rthp)->fl,
&rth->fl))
length += ONE;
continue;
}
/* Cleanup aged off entries. */ /* Cleanup aged off entries. */
*rthp = rth->u.dst.rt_next; *rthp = rth->u.dst.rt_next;
@ -1069,7 +1067,6 @@ out: return 0;
static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{ {
struct rtable *rth, **rthp; struct rtable *rth, **rthp;
struct rtable *rthi;
unsigned long now; unsigned long now;
struct rtable *cand, **candp; struct rtable *cand, **candp;
u32 min_score; u32 min_score;
@ -1089,7 +1086,6 @@ restart:
} }
rthp = &rt_hash_table[hash].chain; rthp = &rt_hash_table[hash].chain;
rthi = NULL;
spin_lock_bh(rt_hash_lock_addr(hash)); spin_lock_bh(rt_hash_lock_addr(hash));
while ((rth = *rthp) != NULL) { while ((rth = *rthp) != NULL) {
@ -1135,17 +1131,6 @@ restart:
chain_length++; chain_length++;
rthp = &rth->u.dst.rt_next; rthp = &rth->u.dst.rt_next;
/*
* check to see if the next entry in the chain
* contains the same hash input values as rt. If it does
* This is where we will insert into the list, instead of
* at the head. This groups entries that differ by aspects not
* relvant to the hash function together, which we use to adjust
* our chain length
*/
if (*rthp && compare_hash_inputs(&(*rthp)->fl, &rt->fl))
rthi = rth;
} }
if (cand) { if (cand) {
@ -1206,10 +1191,7 @@ restart:
} }
} }
if (rthi) rt->u.dst.rt_next = rt_hash_table[hash].chain;
rt->u.dst.rt_next = rthi->u.dst.rt_next;
else
rt->u.dst.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2 #if RT_CACHE_DEBUG >= 2
if (rt->u.dst.rt_next) { if (rt->u.dst.rt_next) {
@ -1225,10 +1207,7 @@ restart:
* previous writes to rt are comitted to memory * previous writes to rt are comitted to memory
* before making rt visible to other CPUS. * before making rt visible to other CPUS.
*/ */
if (rthi) rcu_assign_pointer(rt_hash_table[hash].chain, rt);
rcu_assign_pointer(rthi->u.dst.rt_next, rt);
else
rcu_assign_pointer(rt_hash_table[hash].chain, rt);
spin_unlock_bh(rt_hash_lock_addr(hash)); spin_unlock_bh(rt_hash_lock_addr(hash));
*rp = rt; *rp = rt;